# Source code for tika.language
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from http import HTTPStatus
from pathlib import Path
from typing import Any, BinaryIO
from tika.core import SERVER_ENDPOINT, TikaError, call_server, detect_lang_1
# [docs]
async def from_file(
    file_obj: str | Path | BinaryIO,
    request_options: dict[str, Any] | None = None,
) -> str | bytes | BinaryIO:
    """Detect the language of a file via the Apache Tika server.

    This is a coroutine: it awaits ``detect_lang_1`` in ``"file"`` mode and
    returns the server's response body when the status is ``HTTPStatus.OK``.

    Args:
        file_obj: The file to analyze. Can be:
            - str: A string path to the file
            - Path: A pathlib.Path object pointing to the file
            - BinaryIO: A file-like object in binary read mode
        request_options: Optional dictionary of request options, forwarded
            unchanged to ``detect_lang_1``.

    Returns:
        The response produced by the server, returned as-is. For a successful
        detection this is expected to be a language code (e.g. 'en', 'fr').

    Raises:
        TikaError: If the reported status is anything other than
            ``HTTPStatus.OK``.
        FileNotFoundError: Presumably raised by the underlying helper when the
            path does not exist -- not raised by this function itself; confirm
            against ``tika.core``.

    Example:
        >>> import asyncio
        >>> from pathlib import Path
        >>> language = asyncio.run(from_file(Path("document.txt")))
        >>> print(language)  # e.g. 'en' for English text
    """
    status_code, payload = await detect_lang_1(
        option="file",
        url_or_path=file_obj,
        request_options=request_options,
    )

    # Success path first: hand the server's answer back untouched.
    if status_code == HTTPStatus.OK:
        return payload

    # Any other status is surfaced to the caller with the raw response attached.
    raise TikaError(f"Unexpected response from Tika server ({status_code}): {payload}")
# [docs]
async def from_buffer(
    buf: str | bytes | BinaryIO,
    request_options: dict[str, Any] | None = None,
) -> str | bytes | BinaryIO:
    """Detect the language of in-memory content via the Apache Tika server.

    This is a coroutine: it PUTs ``buf`` to the ``/language/string`` service
    on ``SERVER_ENDPOINT`` (asking for a ``text/plain`` reply) and returns the
    response body when the status is ``HTTPStatus.OK``.

    Args:
        buf: The content to analyze. Can be:
            - str: Text content as a string
            - bytes: Binary content
            - BinaryIO: File-like object containing content
        request_options: Optional dictionary of request options, forwarded
            unchanged to ``call_server``.

    Returns:
        The response produced by the server, returned as-is. For a successful
        detection this is expected to be a language code (e.g. 'en', 'fr').

    Raises:
        TikaError: If the reported status is anything other than
            ``HTTPStatus.OK``.
        TypeError: Possibly raised further down the stack for unsupported
            buffer types -- this function performs no type validation itself;
            confirm against ``tika.core``.

    Example:
        >>> import asyncio
        >>> text = "Bonjour le monde!"
        >>> language = asyncio.run(from_buffer(text))
        >>> print(language)  # e.g. 'fr' for French text
    """
    # The language service answers with a bare language code as plain text.
    accept_plain_text = {"Accept": "text/plain"}

    status_code, payload = await call_server(
        verb="put",
        server_endpoint=SERVER_ENDPOINT,
        service="/language/string",
        data=buf,
        headers=accept_plain_text,
        verbose=False,
        request_options=request_options,
    )

    # Success path first: hand the server's answer back untouched.
    if status_code == HTTPStatus.OK:
        return payload

    # Any other status is surfaced to the caller with the raw response attached.
    raise TikaError(f"Unexpected response from Tika server ({status_code}): {payload}")