Source code for tika.detector

#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from http import HTTPStatus
from pathlib import Path
from typing import Any, BinaryIO

from tika.core import SERVER_ENDPOINT, TikaError, call_server, detect_type_1


async def from_file(
    file_obj: str | Path | BinaryIO,
    *,
    config_path: str | None = None,
    request_options: dict[str, Any] | None = None,
) -> str | bytes | BinaryIO:
    """Detect the MIME type of a file via the Tika server.

    This is a coroutine: it must be awaited (or driven with
    ``asyncio.run``). It delegates to ``detect_type_1`` with
    ``option="type"`` and returns the server's response body on success.

    Args:
        file_obj: The file to analyze, passed to ``detect_type_1`` as
            ``url_or_path``. Per the annotation this is a ``str`` (path or
            URL), a ``pathlib.Path``, or a binary file-like object.
        config_path: Optional path to a custom Tika configuration file,
            forwarded unchanged to ``detect_type_1``.
        request_options: Optional dictionary of request options, forwarded
            unchanged to ``detect_type_1``.

    Returns:
        The response returned by ``detect_type_1`` when the status is
        ``HTTPStatus.OK`` -- expected to be the detected MIME type, e.g.
        ``'application/pdf'``.

    Raises:
        TikaError: If the returned status is anything other than
            ``HTTPStatus.OK``. The message includes the status and the
            response.

        Exceptions raised inside ``detect_type_1`` are not caught here and
        propagate to the caller.

    Example::

        import asyncio
        from pathlib import Path

        async def main() -> None:
            mime_type = await from_file(Path("document.pdf"))
            print(mime_type)  # e.g. 'application/pdf'

        asyncio.run(main())
    """
    status, response = await detect_type_1(
        option="type",
        url_or_path=file_obj,
        config_path=config_path,
        request_options=request_options,
    )

    # Anything other than 200 OK is treated as a failed detection.
    if status != HTTPStatus.OK:
        msg = f"Unexpected response from Tika server ({status}): {response}"
        raise TikaError(msg)

    return response
async def from_buffer(
    buf: str | bytes | BinaryIO,
    *,
    config_path: str | None = None,
    request_options: dict[str, Any] | None = None,
) -> str | bytes | BinaryIO:
    """Detect the MIME type of in-memory content via the Tika server.

    This is a coroutine: it must be awaited (or driven with
    ``asyncio.run``). It issues a ``PUT`` to the ``/detect/stream`` service
    on ``SERVER_ENDPOINT`` through ``call_server``, asking for a
    ``text/plain`` reply, and returns the response body on success.

    Args:
        buf: The content to analyze, sent as the request ``data``. Per the
            annotation this is a ``str``, ``bytes``, or a binary file-like
            object.
        config_path: Optional path to a custom Tika configuration file,
            forwarded unchanged to ``call_server``.
        request_options: Optional dictionary of request options, forwarded
            unchanged to ``call_server``.

    Returns:
        The response returned by ``call_server`` when the status is
        ``HTTPStatus.OK`` -- expected to be the detected MIME type, e.g.
        ``'text/plain'``.

    Raises:
        TikaError: If the returned status is anything other than
            ``HTTPStatus.OK``. The message includes the status and the
            response.

        Exceptions raised inside ``call_server`` are not caught here and
        propagate to the caller.

    Example::

        import asyncio

        async def main() -> None:
            with open("document.pdf", "rb") as f:
                mime_type = await from_buffer(f.read())
            print(mime_type)  # e.g. 'application/pdf'

        asyncio.run(main())
    """
    status, response = await call_server(
        verb="put",
        server_endpoint=SERVER_ENDPOINT,
        service="/detect/stream",
        data=buf,
        headers={"Accept": "text/plain"},
        verbose=False,
        config_path=config_path,
        request_options=request_options,
    )

    # Anything other than 200 OK is treated as a failed detection.
    if status != HTTPStatus.OK:
        msg = f"Unexpected response from Tika server ({status}): {response}"
        raise TikaError(msg)

    return response