| | import requests |
| | import logging |
| | import ftfy |
| | import sys |
| | import json |
| |
|
| | from azure.identity import DefaultAzureCredential |
| | from langchain_community.document_loaders import ( |
| | AzureAIDocumentIntelligenceLoader, |
| | BSHTMLLoader, |
| | CSVLoader, |
| | Docx2txtLoader, |
| | OutlookMessageLoader, |
| | PyPDFLoader, |
| | TextLoader, |
| | UnstructuredEPubLoader, |
| | UnstructuredExcelLoader, |
| | UnstructuredODTLoader, |
| | UnstructuredPowerPointLoader, |
| | UnstructuredRSTLoader, |
| | UnstructuredXMLLoader, |
| | YoutubeLoader, |
| | ) |
| | from langchain_core.documents import Document |
| |
|
| | from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader |
| |
|
| | from open_webui.retrieval.loaders.mistral import MistralLoader |
| | from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader |
| | from open_webui.retrieval.loaders.mineru import MinerULoader |
| |
|
| |
|
| | from open_webui.env import GLOBAL_LOG_LEVEL, REQUESTS_VERIFY |
| |
|
# Route log records to stdout at the globally configured level, then create
# the logger used throughout this module.
logging.basicConfig(
    level=GLOBAL_LOG_LEVEL,
    stream=sys.stdout,
)
log = logging.getLogger(__name__)
| |
|
# Lowercase file extensions (without the leading dot) that are checked by
# membership to decide whether a file is read as plain text.
# Kept as a list so existing importers that rely on list behaviour keep
# working; every entry appears exactly once.
known_source_ext = [
    "go",
    "py",
    "java",
    "sh",
    "bat",
    "ps1",
    "cmd",
    "js",
    "ts",
    "css",
    "cpp",
    "hpp",
    "h",
    "c",
    "cs",
    "sql",
    "log",
    "ini",
    "pl",
    "pm",
    "r",
    "dart",
    "dockerfile",
    "env",
    "php",
    "hs",
    "hsc",
    "lua",
    "nginxconf",
    "conf",
    "m",
    "mm",
    "plsql",
    "perl",
    "rb",
    "rs",
    "db2",
    "scala",
    "bash",
    "swift",
    "vue",
    "svelte",
    "ex",
    "exs",
    "erl",
    "tsx",
    "jsx",
    "lhs",
    "json",
]
| |
|
| |
|
class TikaLoader:
    """Extract a file's text by uploading its bytes to a Tika server."""

    def __init__(self, url, file_path, mime_type=None, extract_images=None):
        self.url = url
        self.file_path = file_path
        self.mime_type = mime_type
        self.extract_images = extract_images

    def load(self) -> list[Document]:
        """PUT the file to ``<url>/tika/text`` and wrap the reply in a Document.

        Returns:
            A one-element list whose document holds the stripped
            ``X-TIKA:content`` value (or a placeholder when that key is
            absent) and, as metadata, the headers that were sent with the
            request, with ``Content-Type`` replaced by the server-reported
            value when the response contains one.

        Raises:
            Exception: If the server responds with a non-OK status.
        """
        with open(self.file_path, "rb") as source:
            payload = source.read()

        request_headers = (
            {} if self.mime_type is None else {"Content-Type": self.mime_type}
        )
        if self.extract_images == True:
            request_headers["X-Tika-PDFextractInlineImages"] = "true"

        # Make sure exactly one "/" separates the base URL from the path.
        base_url = self.url if self.url.endswith("/") else self.url + "/"

        response = requests.put(
            base_url + "tika/text",
            data=payload,
            headers=request_headers,
            verify=REQUESTS_VERIFY,
        )

        if not response.ok:
            raise Exception(f"Error calling Tika: {response.reason}")

        body = response.json()
        content = body.get("X-TIKA:content", "<No text content found>").strip()

        # The request headers double as the document metadata.
        if "Content-Type" in body:
            request_headers["Content-Type"] = body["Content-Type"]

        log.debug("Tika extracted text: %s", content)

        return [Document(page_content=content, metadata=request_headers)]
| |
|
| |
|
class DoclingLoader:
    """Convert a file to markdown text through a Docling server."""

    def __init__(self, url, api_key=None, file_path=None, mime_type=None, params=None):
        self.url = url.rstrip("/")
        self.api_key = api_key
        self.file_path = file_path
        self.mime_type = mime_type
        self.params = params or {}

    def load(self) -> list[Document]:
        """POST the file to ``<url>/v1/convert/file`` and return its markdown.

        Returns:
            A one-element list whose document holds the ``md_content`` field
            of the response's ``document`` object (or a placeholder when it
            is missing); the metadata carries ``Content-Type`` only when a
            mime type was supplied.

        Raises:
            Exception: If the server responds with a non-OK status. The
                message includes the response's ``detail`` field when the
                body is JSON containing one, or the raw body when it cannot
                be inspected as JSON.
        """
        request_headers = {"X-Api-Key": f"{self.api_key}"} if self.api_key else {}

        with open(self.file_path, "rb") as upload:
            upload_part = (
                self.file_path,
                upload,
                self.mime_type or "application/octet-stream",
            )
            # Caller-supplied params may override the default export mode.
            form_fields = {"image_export_mode": "placeholder", **self.params}
            response = requests.post(
                f"{self.url}/v1/convert/file",
                files={"files": upload_part},
                data=form_fields,
                headers=request_headers,
            )

        if not response.ok:
            failure = f"Error calling Docling API: {response.reason}"
            if response.text:
                try:
                    payload = response.json()
                    if "detail" in payload:
                        failure += f" - {payload['detail']}"
                except Exception:
                    failure += f" - {response.text}"
            raise Exception(f"Error calling Docling: {failure}")

        converted = response.json().get("document", {})
        markdown = converted.get("md_content", "<No text content found>")

        doc_metadata = {}
        if self.mime_type:
            doc_metadata = {"Content-Type": self.mime_type}

        log.debug("Docling extracted text: %s", markdown)
        return [Document(page_content=markdown, metadata=doc_metadata)]
| |
|
| |
|
class Loader:
    """Choose a document loader for a file and return its cleaned documents.

    ``engine`` names the preferred extraction backend and the remaining
    keyword arguments carry its configuration (URLs, API keys, options).
    An engine is only used when its required settings are present and, for
    some engines, when the file type is one it is routed for; otherwise a
    built-in loader is picked from the file extension and content type.
    """

    # Extensions that are routed to the Datalab Marker engine.
    _DATALAB_MARKER_EXTENSIONS = (
        "pdf",
        "xls",
        "xlsx",
        "ods",
        "doc",
        "docx",
        "odt",
        "ppt",
        "pptx",
        "odp",
        "html",
        "epub",
        "png",
        "jpeg",
        "jpg",
        "webp",
        "gif",
        "tiff",
    )
    _DOCX_MIME_TYPE = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    _EXCEL_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    _POWERPOINT_MIME_TYPES = (
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, engine: str = "", **kwargs):
        self.engine = engine
        self.user = kwargs.get("user", None)
        self.kwargs = kwargs

    def load(
        self, filename: str, file_content_type: str, file_path: str
    ) -> list[Document]:
        """Load ``file_path`` and return its documents with repaired text.

        Args:
            filename: Name of the file; its last dot-separated part is used
                as the extension.
            file_content_type: MIME type reported for the file (may be empty).
            file_path: Location of the file on disk.

        Returns:
            New ``Document`` objects whose ``page_content`` has been passed
            through ``ftfy.fix_text``; metadata is carried over unchanged.
        """
        loader = self._get_loader(filename, file_content_type, file_path)
        docs = loader.load()

        return [
            Document(
                page_content=ftfy.fix_text(doc.page_content), metadata=doc.metadata
            )
            for doc in docs
        ]

    def _is_text_file(self, file_ext: str, file_content_type: str) -> bool:
        """Return True when the file should be read as plain text.

        A file qualifies when its extension is in ``known_source_ext`` or
        when its content type contains ``text/`` but not ``html``.
        """
        if file_ext in known_source_ext:
            return True
        if not file_content_type:
            return False
        # HTML is served as text/html but must not be read as plain text.
        return "text/" in file_content_type and "html" not in file_content_type

    def _docling_params(self) -> dict:
        """Return DOCLING_PARAMS as a dict, decoding a JSON string if needed.

        Anything that is missing, cannot be decoded, or does not decode to a
        JSON object yields an empty dict, because the value is later
        unpacked as keyword-style form fields.
        """
        params = self.kwargs.get("DOCLING_PARAMS", {})
        if isinstance(params, dict):
            return params
        if params is None:
            return {}
        try:
            params = json.loads(params)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers values
            # that are not str/bytes at all.
            log.error("Invalid DOCLING_PARAMS format, expected JSON object")
            return {}
        if not isinstance(params, dict):
            log.error("Invalid DOCLING_PARAMS format, expected JSON object")
            return {}
        return params

    def _mineru_timeout(self):
        """Return MINERU_API_TIMEOUT as an int, defaulting to 300.

        Falsy values are passed through unchanged; values that cannot be
        converted to an integer fall back to 300.
        """
        timeout = self.kwargs.get("MINERU_API_TIMEOUT", 300)
        if not timeout:
            return timeout
        try:
            return int(timeout)
        except (TypeError, ValueError):
            return 300

    def _get_default_loader(
        self, file_ext: str, file_content_type: str, file_path: str
    ):
        """Pick a built-in loader from the extension and content type.

        Any file that matches no specific format is read with ``TextLoader``.
        """
        if file_ext == "pdf":
            return PyPDFLoader(
                file_path,
                extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"),
                mode=self.kwargs.get("PDF_LOADER_MODE", "page"),
            )
        if file_ext == "csv":
            return CSVLoader(file_path, autodetect_encoding=True)
        if file_ext == "rst":
            return UnstructuredRSTLoader(file_path, mode="elements")
        if file_ext == "xml":
            return UnstructuredXMLLoader(file_path)
        if file_ext in ["htm", "html"]:
            return BSHTMLLoader(file_path, open_encoding="unicode_escape")
        if file_ext == "md":
            return TextLoader(file_path, autodetect_encoding=True)
        if file_content_type == "application/epub+zip":
            return UnstructuredEPubLoader(file_path)
        if file_content_type == self._DOCX_MIME_TYPE or file_ext == "docx":
            return Docx2txtLoader(file_path)
        if file_content_type in self._EXCEL_MIME_TYPES or file_ext in ["xls", "xlsx"]:
            return UnstructuredExcelLoader(file_path)
        if file_content_type in self._POWERPOINT_MIME_TYPES or file_ext in [
            "ppt",
            "pptx",
        ]:
            return UnstructuredPowerPointLoader(file_path)
        if file_ext == "msg":
            return OutlookMessageLoader(file_path)
        if file_ext == "odt":
            return UnstructuredODTLoader(file_path)
        # Known text files and unrecognised files are both read as text.
        return TextLoader(file_path, autodetect_encoding=True)

    def _get_loader(self, filename: str, file_content_type: str, file_path: str):
        """Return the loader instance to use for the given file.

        The configured engine is used only when its required settings are
        non-empty (and, where applicable, the file type matches); otherwise
        the built-in loaders are used.
        """
        file_ext = filename.split(".")[-1].lower()
        engine = self.engine
        config = self.kwargs

        if (
            engine == "external"
            and config.get("EXTERNAL_DOCUMENT_LOADER_URL")
            and config.get("EXTERNAL_DOCUMENT_LOADER_API_KEY")
        ):
            return ExternalDocumentLoader(
                file_path=file_path,
                url=config.get("EXTERNAL_DOCUMENT_LOADER_URL"),
                api_key=config.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"),
                mime_type=file_content_type,
                user=self.user,
            )

        if engine == "tika" and config.get("TIKA_SERVER_URL"):
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return TikaLoader(
                url=config.get("TIKA_SERVER_URL"),
                file_path=file_path,
                extract_images=config.get("PDF_EXTRACT_IMAGES"),
            )

        if (
            engine == "datalab_marker"
            and config.get("DATALAB_MARKER_API_KEY")
            and file_ext in self._DATALAB_MARKER_EXTENSIONS
        ):
            api_base_url = config.get("DATALAB_MARKER_API_BASE_URL", "")
            if not api_base_url or not api_base_url.strip():
                api_base_url = "https://www.datalab.to/api/v1/marker"

            return DatalabMarkerLoader(
                file_path=file_path,
                api_key=config["DATALAB_MARKER_API_KEY"],
                api_base_url=api_base_url,
                additional_config=config.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
                use_llm=config.get("DATALAB_MARKER_USE_LLM", False),
                skip_cache=config.get("DATALAB_MARKER_SKIP_CACHE", False),
                force_ocr=config.get("DATALAB_MARKER_FORCE_OCR", False),
                paginate=config.get("DATALAB_MARKER_PAGINATE", False),
                strip_existing_ocr=config.get(
                    "DATALAB_MARKER_STRIP_EXISTING_OCR", False
                ),
                disable_image_extraction=config.get(
                    "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
                ),
                format_lines=config.get("DATALAB_MARKER_FORMAT_LINES", False),
                output_format=config.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"),
            )

        if engine == "docling" and config.get("DOCLING_SERVER_URL"):
            if self._is_text_file(file_ext, file_content_type):
                return TextLoader(file_path, autodetect_encoding=True)
            return DoclingLoader(
                url=config.get("DOCLING_SERVER_URL"),
                api_key=config.get("DOCLING_API_KEY", None),
                file_path=file_path,
                mime_type=file_content_type,
                params=self._docling_params(),
            )

        if (
            engine == "document_intelligence"
            # Truthiness rather than `!= ""` so a missing/None endpoint does
            # not select this engine.
            and config.get("DOCUMENT_INTELLIGENCE_ENDPOINT")
            and (
                file_ext in ["pdf", "docx", "ppt", "pptx"]
                or file_content_type == self._DOCX_MIME_TYPE
                or file_content_type in self._POWERPOINT_MIME_TYPES
            )
        ):
            if config.get("DOCUMENT_INTELLIGENCE_KEY"):
                return AzureAIDocumentIntelligenceLoader(
                    file_path=file_path,
                    api_endpoint=config.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
                    api_key=config.get("DOCUMENT_INTELLIGENCE_KEY"),
                    api_model=config.get("DOCUMENT_INTELLIGENCE_MODEL"),
                )
            # No key configured: authenticate with DefaultAzureCredential.
            return AzureAIDocumentIntelligenceLoader(
                file_path=file_path,
                api_endpoint=config.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
                azure_credential=DefaultAzureCredential(),
                api_model=config.get("DOCUMENT_INTELLIGENCE_MODEL"),
            )

        if engine == "mineru" and file_ext in ["pdf"]:
            return MinerULoader(
                file_path=file_path,
                api_mode=config.get("MINERU_API_MODE", "local"),
                api_url=config.get("MINERU_API_URL", "http://localhost:8000"),
                api_key=config.get("MINERU_API_KEY", ""),
                params=config.get("MINERU_PARAMS", {}),
                timeout=self._mineru_timeout(),
            )

        if (
            engine == "mistral_ocr"
            # Truthiness rather than `!= ""` so a missing/None key does not
            # select this engine.
            and config.get("MISTRAL_OCR_API_KEY")
            and file_ext in ["pdf"]
        ):
            return MistralLoader(
                base_url=config.get("MISTRAL_OCR_API_BASE_URL"),
                api_key=config.get("MISTRAL_OCR_API_KEY"),
                file_path=file_path,
            )

        return self._get_default_loader(file_ext, file_content_type, file_path)
| |
|