""" Multi-format document text extraction. Primary engine: Docling (layout-aware, handles tables, complex PDFs, DOCX). Fallback engine: pdfplumber (fast, text-native PDFs). Design principles: - Never raises exceptions — every public function returns None on failure. - Logs warnings on degraded paths so operators can track quality. - Docling is an optional dependency; if not installed, pdfplumber is used transparently for PDFs. - All file I/O is in-memory except Docling's temp-file requirement for PDFs (Docling's converter currently requires a path, not a stream). """ from __future__ import annotations import contextlib import logging import os import tempfile from dataclasses import dataclass logger = logging.getLogger(__name__) # Supported extensions → handler routing _PDF_EXTS = {".pdf"} _DOCX_EXTS = {".docx"} _TEXT_EXTS = {".txt", ".md", ".csv"} @dataclass class ExtractionResult: """Outcome of a document text extraction attempt.""" text: str method: str # "docling" | "pdfplumber" | "text" page_count: int | None = None has_tables: bool = False # ── Public API ──────────────────────────────────────────────────────────────── def extract_text(raw_bytes: bytes, filename: str) -> ExtractionResult | None: """Extract text from a document given its raw bytes and filename. Routes by file extension: .pdf → _extract_pdf (Docling → pdfplumber fallback) .docx → _extract_docx (Docling) .txt / .md / .csv → _extract_text_file (UTF-8 decode) Returns None for empty input, unsupported formats, or unrecoverable errors. """ if not raw_bytes: return None ext = os.path.splitext(filename.lower())[1] if ext in _PDF_EXTS: return _extract_pdf(raw_bytes, filename) if ext in _DOCX_EXTS: return _extract_docx(raw_bytes, filename) if ext in _TEXT_EXTS: return _extract_text_file(raw_bytes) logger.warning("document_extractor: unsupported file type '%s' for file '%s'", ext, filename) return None # ── Private helpers ─────────────────────────────────────────────────────────── def _extract_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None: """Try Docling first; fall back to pdfplumber on any failure.""" result = _docling_pdf(raw_bytes, filename) if result is not None: return result logger.info("document_extractor: Docling unavailable or failed for '%s', trying pdfplumber", filename) return _pdfplumber_pdf(raw_bytes, filename) def _docling_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None: """Attempt PDF extraction via Docling. Docling requires a file path, so we write bytes to a NamedTemporaryFile, convert, and clean up regardless of outcome. """ try: from docling.document_converter import DocumentConverter # type: ignore[import] except ImportError: logger.debug("document_extractor: Docling not installed, skipping") return None tmp_path: str | None = None try: with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp: tmp.write(raw_bytes) tmp_path = tmp.name converter = DocumentConverter() conversion = converter.convert(tmp_path) doc = conversion.document text = doc.export_to_markdown() if not text or not text.strip(): logger.warning("document_extractor: Docling returned empty text for '%s'", filename) return None # Detect table presence: Docling markdown uses pipe-table syntax has_tables = "|" in text page_count: int | None = None with contextlib.suppress(Exception): page_count = len(doc.pages) if hasattr(doc, "pages") else None return ExtractionResult( text=text.strip(), method="docling", page_count=page_count, has_tables=has_tables, ) except Exception as exc: logger.warning("document_extractor: Docling extraction failed for '%s': %s", filename, exc) return None finally: if tmp_path and os.path.exists(tmp_path): with contextlib.suppress(OSError): os.unlink(tmp_path) def _pdfplumber_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None: """Extract text from a PDF using pdfplumber.""" try: import io import pdfplumber # type: ignore[import] with pdfplumber.open(io.BytesIO(raw_bytes)) as pdf: pages_text: list[str] = [] has_tables = False for page in pdf.pages: text = (page.extract_text() or "").strip() if text: pages_text.append(text) if not has_tables and page.extract_tables(): has_tables = True if not pages_text: logger.warning("document_extractor: pdfplumber found no extractable text in '%s'", filename) return None return ExtractionResult( text="\n\n---\n\n".join(pages_text), method="pdfplumber", page_count=len(pdf.pages), has_tables=has_tables, ) except Exception as exc: logger.warning("document_extractor: pdfplumber extraction failed for '%s': %s", filename, exc) return None def _extract_docx(raw_bytes: bytes, filename: str) -> ExtractionResult | None: """Extract text from a DOCX file via Docling.""" try: from docling.document_converter import DocumentConverter # type: ignore[import] except ImportError: logger.warning("document_extractor: Docling not installed, cannot extract DOCX '%s'", filename) return None tmp_path: str | None = None try: with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp: tmp.write(raw_bytes) tmp_path = tmp.name converter = DocumentConverter() conversion = converter.convert(tmp_path) doc = conversion.document text = doc.export_to_markdown() if not text or not text.strip(): logger.warning("document_extractor: Docling returned empty text for DOCX '%s'", filename) return None has_tables = "|" in text page_count: int | None = None with contextlib.suppress(Exception): page_count = len(doc.pages) if hasattr(doc, "pages") else None return ExtractionResult( text=text.strip(), method="docling", page_count=page_count, has_tables=has_tables, ) except Exception as exc: logger.warning("document_extractor: Docling DOCX extraction failed for '%s': %s", filename, exc) return None finally: if tmp_path and os.path.exists(tmp_path): with contextlib.suppress(OSError): os.unlink(tmp_path) def _extract_text_file(raw_bytes: bytes) -> ExtractionResult | None: """Decode a plain-text file (TXT, MD, CSV) as UTF-8.""" try: text = raw_bytes.decode("utf-8", errors="replace") return ExtractionResult( text=text, method="text", page_count=None, has_tables=False, ) except Exception as exc: logger.warning("document_extractor: text file decode failed: %s", exc) return None