"""
Multi-format document text extraction.
Primary engine: Docling (layout-aware, handles tables, complex PDFs, DOCX).
Fallback engine: pdfplumber (fast, text-native PDFs).
Design principles:
- Never raises exceptions — every public function returns None on failure.
- Logs warnings on degraded paths so operators can track quality.
- Docling is an optional dependency; if not installed, pdfplumber is used
transparently for PDFs.
- All file I/O is in-memory except Docling's temp-file requirement for PDFs
(Docling's converter currently requires a path, not a stream).
"""
from __future__ import annotations
import contextlib
import logging
import os
import tempfile
from dataclasses import dataclass
# Module-level logger; every message in this file is emitted through it.
logger = logging.getLogger(__name__)
# Supported extensions → handler routing.
# Entries are lower-case and include the leading dot because extract_text
# compares them against the extension of the lower-cased filename.
_PDF_EXTS = {".pdf"}
_DOCX_EXTS = {".docx"}
_TEXT_EXTS = {".txt", ".md", ".csv"}
@dataclass
class ExtractionResult:
    """Result of a successful document text extraction.

    Attributes:
        text: The extracted document text.
        method: Engine that produced the text: "docling", "pdfplumber",
            or "text".
        page_count: Number of pages when the engine reports one, else None.
        has_tables: Whether the engine detected tables in the document.
    """

    text: str
    method: str  # "docling" | "pdfplumber" | "text"
    page_count: int | None = None
    has_tables: bool = False
# ── Public API ────────────────────────────────────────────────────────────────
def extract_text(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract text from a document given its raw bytes and filename.

    Routes by file extension (matched case-insensitively):
        .pdf              → _extract_pdf (Docling → pdfplumber fallback)
        .docx             → _extract_docx (Docling)
        .txt / .md / .csv → _extract_text_file (UTF-8 decode)

    Args:
        raw_bytes: Complete contents of the file.
        filename: Name of the file; its extension selects the handler and
            the name is included in log messages.

    Returns:
        An ExtractionResult on success, or None for empty input, a missing or
        non-string filename, unsupported formats, or unrecoverable errors.
    """
    # Bad input is reported as a failure (None) rather than an exception:
    # without the isinstance check, filename.lower() would raise
    # AttributeError for a None filename.
    if not raw_bytes or not isinstance(filename, str):
        return None

    _, ext = os.path.splitext(filename.lower())
    if ext in _PDF_EXTS:
        return _extract_pdf(raw_bytes, filename)
    if ext in _DOCX_EXTS:
        return _extract_docx(raw_bytes, filename)
    if ext in _TEXT_EXTS:
        return _extract_text_file(raw_bytes)

    logger.warning("document_extractor: unsupported file type '%s' for file '%s'", ext, filename)
    return None
# ── Private helpers ───────────────────────────────────────────────────────────
def _extract_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract a PDF with Docling, falling back to pdfplumber.

    pdfplumber is tried whenever the Docling path returns None; its result
    (which may itself be None) is returned unchanged.
    """
    docling_result = _docling_pdf(raw_bytes, filename)
    if docling_result is None:
        logger.info("document_extractor: Docling unavailable or failed for '%s', trying pdfplumber", filename)
        return _pdfplumber_pdf(raw_bytes, filename)
    return docling_result
def _docling_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Attempt PDF extraction via Docling.

    Docling is given a file path, so the bytes are written to a named
    temporary file that is removed again regardless of outcome.

    Args:
        raw_bytes: Complete contents of the PDF.
        filename: Original file name; used only in log messages.

    Returns:
        An ExtractionResult with method "docling", or None when Docling is
        not installed, produces empty text, or raises.
    """
    try:
        from docling.document_converter import DocumentConverter  # type: ignore[import]
    except ImportError:
        logger.debug("document_extractor: Docling not installed, skipping")
        return None

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            # Record the path BEFORE writing: if the write fails, the finally
            # clause can still remove the file instead of leaking it.
            tmp_path = tmp.name
            tmp.write(raw_bytes)

        # The file is closed (and therefore flushed) before Docling opens it.
        converter = DocumentConverter()
        conversion = converter.convert(tmp_path)
        doc = conversion.document
        text = doc.export_to_markdown()
        if not text or not text.strip():
            logger.warning("document_extractor: Docling returned empty text for '%s'", filename)
            return None

        # Heuristic table detection: any pipe character in the markdown counts,
        # so a literal "|" in ordinary prose also sets the flag.
        has_tables = "|" in text

        # The page count is best-effort; any failure leaves it as None.
        page_count: int | None = None
        with contextlib.suppress(Exception):
            page_count = len(doc.pages) if hasattr(doc, "pages") else None

        return ExtractionResult(
            text=text.strip(),
            method="docling",
            page_count=page_count,
            has_tables=has_tables,
        )
    except Exception as exc:
        logger.warning("document_extractor: Docling extraction failed for '%s': %s", filename, exc)
        return None
    finally:
        if tmp_path is not None:
            # A file that is already gone raises FileNotFoundError, which is
            # an OSError, so no separate existence check is needed.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)
def _pdfplumber_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract PDF text page by page with pdfplumber.

    Non-empty page texts are joined with a "---" separator line. Returns None
    when no page yields text or when anything raises.
    """
    try:
        # Imported inside the try so that a missing pdfplumber is logged and
        # reported as a failed extraction like any other error.
        import io

        import pdfplumber  # type: ignore[import]

        with pdfplumber.open(io.BytesIO(raw_bytes)) as pdf:
            chunks: list[str] = []
            found_tables = False
            for page in pdf.pages:
                page_text = (page.extract_text() or "").strip()
                if page_text:
                    chunks.append(page_text)
                # `or` short-circuits: once a table has been seen, later pages
                # are no longer scanned for tables.
                found_tables = found_tables or bool(page.extract_tables())

            if chunks:
                return ExtractionResult(
                    text="\n\n---\n\n".join(chunks),
                    method="pdfplumber",
                    page_count=len(pdf.pages),
                    has_tables=found_tables,
                )

            logger.warning("document_extractor: pdfplumber found no extractable text in '%s'", filename)
            return None
    except Exception as exc:
        logger.warning("document_extractor: pdfplumber extraction failed for '%s': %s", filename, exc)
        return None
def _extract_docx(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract text from a DOCX file via Docling.

    The bytes are written to a named temporary file for Docling to read; the
    file is removed again regardless of outcome. There is no fallback engine
    for DOCX.

    Args:
        raw_bytes: Complete contents of the DOCX file.
        filename: Original file name; used only in log messages.

    Returns:
        An ExtractionResult with method "docling", or None when Docling is
        not installed, produces empty text, or raises.
    """
    try:
        from docling.document_converter import DocumentConverter  # type: ignore[import]
    except ImportError:
        logger.warning("document_extractor: Docling not installed, cannot extract DOCX '%s'", filename)
        return None

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            # Record the path BEFORE writing: if the write fails, the finally
            # clause can still remove the file instead of leaking it.
            tmp_path = tmp.name
            tmp.write(raw_bytes)

        # The file is closed (and therefore flushed) before Docling opens it.
        converter = DocumentConverter()
        conversion = converter.convert(tmp_path)
        doc = conversion.document
        text = doc.export_to_markdown()
        if not text or not text.strip():
            logger.warning("document_extractor: Docling returned empty text for DOCX '%s'", filename)
            return None

        # Heuristic table detection: any pipe character in the markdown counts,
        # so a literal "|" in ordinary prose also sets the flag.
        has_tables = "|" in text

        # The page count is best-effort; any failure leaves it as None.
        page_count: int | None = None
        with contextlib.suppress(Exception):
            page_count = len(doc.pages) if hasattr(doc, "pages") else None

        return ExtractionResult(
            text=text.strip(),
            method="docling",
            page_count=page_count,
            has_tables=has_tables,
        )
    except Exception as exc:
        logger.warning("document_extractor: Docling DOCX extraction failed for '%s': %s", filename, exc)
        return None
    finally:
        if tmp_path is not None:
            # A file that is already gone raises FileNotFoundError, which is
            # an OSError, so no separate existence check is needed.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)
def _extract_text_file(raw_bytes: bytes) -> ExtractionResult | None:
    """Decode a plain-text file (TXT, MD, CSV) as UTF-8.

    A leading UTF-8 byte-order mark is dropped so it does not surface as a
    stray U+FEFF character at the start of the text. Undecodable bytes are
    replaced with U+FFFD instead of failing.

    Args:
        raw_bytes: Complete contents of the text file.

    Returns:
        An ExtractionResult with method "text", or None if decoding raises
        (for example when raw_bytes is not a bytes-like object).
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but skips a BOM at the very start.
        text = raw_bytes.decode("utf-8-sig", errors="replace")
    except Exception as exc:
        logger.warning("document_extractor: text file decode failed: %s", exc)
        return None

    return ExtractionResult(
        text=text,
        method="text",
        page_count=None,
        has_tables=False,
    )
|