Source code for tika.parser

#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from http import HTTPStatus
from pathlib import Path
from typing import Any, BinaryIO, cast

import orjson

from tika.core import SERVER_ENDPOINT, TikaError, TikaResponse, call_server, parse_1



[docs]
async def from_file(
    obj: str | Path | BinaryIO,
    *,
    server_endpoint: str = SERVER_ENDPOINT,
    service: str = "all",
    xml_content: bool = False,
    headers: dict[str, Any] | None = None,
    config_path: str | None = None,
    request_options: dict[str, Any] | None = None,
) -> TikaResponse:
    """Parse a file through a Tika server and return its content and metadata.

    The request is delegated to ``parse_1``; the raw ``(status, body)`` result it
    produces is then converted into a ``TikaResponse`` by ``_parse``.

    Args:
        obj: What to parse, forwarded to ``parse_1`` as ``url_or_path``:
            - str: a file path or URL
            - Path: a ``pathlib.Path`` pointing to a file
            - BinaryIO: a file-like object opened in binary mode
        server_endpoint: Base URL of the Tika server. Defaults to SERVER_ENDPOINT.
        service: Which Tika service to query, forwarded as ``option``:
            - "all": content and metadata (default)
            - "meta": metadata only
            - "text": text content only
        xml_content: When True, the endpoint table passed to ``parse_1`` is
            overridden so that "all" is served by ``/rmeta/xml`` ("meta" maps to
            ``/meta`` and "text" to ``/tika``). When False, ``parse_1`` uses its
            own default endpoint table.
        headers: Extra HTTP headers, forwarded unchanged.
        config_path: Path to a custom Tika configuration file, forwarded unchanged.
        request_options: Extra options for the HTTP request, forwarded unchanged.

    Returns:
        TikaResponse: the structure built by ``_parse`` with the keys
        ``content``, ``metadata``, ``status`` and ``attachments``.

    Raises:
        Any exception raised by ``parse_1`` or ``_parse`` propagates unchanged.
        NOTE(review): earlier documentation listed TikaError, FileNotFoundError
        and ValueError here - confirm against ``tika.core.parse_1``.

    Example:
        >>> response = await from_file("document.pdf", service="all")
        >>> print(response["content"])  # extracted text
        >>> print(response["metadata"])  # document metadata
    """
    # Requesting XML only changes which endpoints parse_1 should hit; every other
    # argument is identical, so the override is added as an optional keyword.
    endpoint_override: dict[str, Any] = {}
    if xml_content:
        endpoint_override["services"] = {"meta": "/meta", "text": "/tika", "all": "/rmeta/xml"}

    raw_result = await parse_1(
        option=service,
        url_or_path=obj,
        server_endpoint=server_endpoint,
        headers=headers,
        config_path=config_path,
        request_options=request_options,
        **endpoint_override,
    )
    return _parse(output=raw_result, service=service)




[docs]
async def from_buffer(
    buf: str | bytes | BinaryIO,
    *,
    server_endpoint: str = SERVER_ENDPOINT,
    xml_content: bool = False,
    headers: dict[str, Any] | None = None,
    config_path: str | None = None,
    request_options: dict[str, Any] | None = None,
) -> TikaResponse:
    """Parse in-memory content through a Tika server.

    The buffer is sent with an HTTP PUT to the server's ``/rmeta/text`` endpoint
    (or ``/rmeta/xml`` when ``xml_content`` is True) via ``call_server``, and the
    response body is converted into a ``TikaResponse`` by ``_parse``.

    Args:
        buf: The content to parse, passed to ``call_server`` as ``data``:
            - str: text content
            - bytes: binary content
            - BinaryIO: file-like object with binary content
        server_endpoint: Base URL of the Tika server. Defaults to SERVER_ENDPOINT.
        xml_content: If True, request XML output (``/rmeta/xml``) instead of
            plain text (``/rmeta/text``).
        headers: Additional HTTP headers to include in the request.
            ``Accept: application/json`` is always set, overriding any ``Accept``
            entry supplied here. The dictionary passed in is not modified.
        config_path: Path to a custom Tika configuration file.
        request_options: Additional options for the HTTP request (e.g., timeout).

    Returns:
        TikaResponse: the structure built by ``_parse`` with the keys
        ``content``, ``metadata``, ``status`` and ``attachments``.

    Raises:
        TikaError: If the server answers with a status other than 200 OK.
        Any exception raised by ``call_server`` or ``_parse`` propagates unchanged.

    Example:
        >>> with open("document.pdf", "rb") as f:
        ...     response = await from_buffer(f.read())
        >>> print(response["metadata"])  # Print all metadata
    """
    # Build a fresh dict rather than calling .update() on the argument: the
    # caller's headers object must not silently gain an Accept entry.
    request_headers: dict[str, Any] = {**(headers or {}), "Accept": "application/json"}

    # Both output flavours use the recursive-metadata endpoint; only the suffix differs.
    endpoint = "/rmeta/xml" if xml_content else "/rmeta/text"

    status, response = await call_server(
        verb="put",
        server_endpoint=server_endpoint,
        service=endpoint,
        data=buf,
        headers=request_headers,
        verbose=False,
        config_path=config_path,
        request_options=request_options,
    )

    if status != HTTPStatus.OK:
        msg = f"Unexpected response from Tika server ({status}): {response}"
        raise TikaError(msg)

    return _parse((status, response))



def _parse(output: tuple[int, str | bytes | BinaryIO | None], service: str = "all") -> TikaResponse:  # noqa: C901
    """Convert a raw ``(status, body)`` pair from the Tika server into a TikaResponse.

    Internal helper; not meant to be called by library users.

    Args:
        output: A tuple of:
            - the HTTP status code (int)
            - the raw response body (str, bytes, a readable binary stream, or None)
        service: The service that produced the body:
            - "all": body is JSON; content and metadata are extracted (default)
            - "meta": body is a JSON object holding metadata only
            - "text": body is returned as-is in ``content``

    Returns:
        TikaResponse with the keys ``content``, ``metadata``, ``status`` and
        ``attachments``. ``attachments`` is always None here. ``metadata`` and
        ``content`` stay None when the body is empty.

    Notes:
        - For "text", the body is stored in ``content`` without any decoding.
        - For "meta" with a JSON object body, only string and list values are kept.
        - For a JSON list body (one object per document), every string
          ``X-TIKA:content`` value is concatenated into ``content``; all other
          keys are merged into ``metadata``. A key that occurs more than once is
          collected into a list, and list values of later occurrences are
          appended element by element.
        - A JSON object body with a service other than "meta" yields empty
          metadata and no content.

    Raises:
        orjson.JSONDecodeError: If a non-"text" body is not valid JSON.
    """
    status, raw_content = output
    parsed = TikaResponse(metadata=None, content=None, status=status, attachments=None)

    if not raw_content:
        return parsed

    if service == "text":
        parsed["content"] = raw_content
        return parsed

    # typing.BinaryIO is a nominal class that real streams (io.BytesIO, open(..., "rb"))
    # do not inherit from, so an isinstance check against it never matches them.
    # Detect streams by the presence of .read() instead.
    if hasattr(raw_content, "read"):
        payload = cast(BinaryIO, raw_content).read()
    else:
        payload = raw_content

    # A stream object is truthy even when it yields nothing; treat an empty
    # payload like an empty body instead of failing inside the JSON decoder.
    if not payload:
        return parsed

    raw_json: dict[str, Any] | list[dict[str, Any]] = orjson.loads(payload)

    metadata: dict[str, str | list[str]] = {}
    parsed["metadata"] = metadata

    if service == "meta" and isinstance(raw_json, dict):
        metadata.update(
            {key: value for key, value in raw_json.items() if isinstance(value, str | list)}
        )
        return parsed

    if not isinstance(raw_json, list):
        return parsed

    fragments: list[str] = [
        js["X-TIKA:content"]
        for js in raw_json
        if "X-TIKA:content" in js and isinstance(js["X-TIKA:content"], str)
    ]
    parsed["content"] = "".join(fragments) if fragments else None

    for js in raw_json:
        for key, value in js.items():
            if key == "X-TIKA:content":
                continue

            existing = metadata.get(key)
            if existing is None:
                # First occurrence: keep strings as they are; copy lists so that
                # later merging never mutates the decoded JSON structure.
                if isinstance(value, str):
                    metadata[key] = value
                elif isinstance(value, list):
                    metadata[key] = list(value)
                continue

            # Repeated key: promote a single string to a list before merging.
            if isinstance(existing, str):
                existing = [existing]
                metadata[key] = existing

            if isinstance(value, str):
                existing.append(value)
            elif isinstance(value, list):
                # Previously a list value for an already-seen key was dropped.
                existing.extend(value)

    return parsed