File size: 7,727 Bytes
36b2bff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
Multi-format document text extraction.

Primary engine: Docling (layout-aware, handles tables, complex PDFs, DOCX).
Fallback engine: pdfplumber (fast, text-native PDFs).

Design principles:
- Never raises exceptions — every public function returns None on failure.
- Logs warnings on degraded paths so operators can track quality.
- Docling is an optional dependency; if not installed, pdfplumber is used
  transparently for PDFs.
- All file I/O is in-memory except Docling's temp-file requirement for PDFs
  (Docling's converter currently requires a path, not a stream).
"""

from __future__ import annotations

import contextlib
import logging
import os
import tempfile
from dataclasses import dataclass

# Module-level logger; every log message in this file goes through it.
logger = logging.getLogger(__name__)

# Supported extensions → handler routing.
# Entries are lower-case and include the leading dot, because extract_text
# compares them against the extension of the lower-cased filename.
_PDF_EXTS = {".pdf"}
_DOCX_EXTS = {".docx"}
_TEXT_EXTS = {".txt", ".md", ".csv"}


@dataclass
class ExtractionResult:
    """Outcome of a document text extraction attempt.

    Attributes:
        text: The extracted text.
        method: Name of the engine that produced ``text``.
        page_count: Number of pages when the engine reported one, else None.
        has_tables: True when the engine flagged tabular content.
    """

    text: str
    method: str  # "docling" | "pdfplumber" | "text"
    page_count: int | None = None  # None for plain-text files
    has_tables: bool = False

# ── Public API ────────────────────────────────────────────────────────────────


def extract_text(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Pull the text out of a document supplied as raw bytes.

    The handler is selected from the case-insensitive filename extension:
      .pdf              → _extract_pdf  (Docling → pdfplumber fallback)
      .docx             → _extract_docx (Docling)
      .txt / .md / .csv → _extract_text_file (UTF-8 decode)

    Args:
        raw_bytes: Full content of the document.
        filename: Name of the document. Its extension drives the routing; the
            full name is forwarded to the PDF/DOCX handlers and used in logs.

    Returns:
        The handler's ExtractionResult, or None when the input is empty, the
        extension is not supported, or the handler itself returned None.
    """
    # Empty input is rejected before the filename is even inspected.
    if not raw_bytes:
        return None

    _, suffix = os.path.splitext(filename.lower())

    outcome: ExtractionResult | None
    if suffix in _PDF_EXTS:
        outcome = _extract_pdf(raw_bytes, filename)
    elif suffix in _DOCX_EXTS:
        outcome = _extract_docx(raw_bytes, filename)
    elif suffix in _TEXT_EXTS:
        outcome = _extract_text_file(raw_bytes)
    else:
        logger.warning("document_extractor: unsupported file type '%s' for file '%s'", suffix, filename)
        outcome = None
    return outcome


# ── Private helpers ───────────────────────────────────────────────────────────


def _extract_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract a PDF with Docling, using pdfplumber when Docling yields nothing.

    Returns whatever the first successful engine produced, or None when both
    engines return None.
    """
    docling_result = _docling_pdf(raw_bytes, filename)
    if docling_result is None:
        # Docling is missing, failed, or produced blank output: degrade.
        logger.info("document_extractor: Docling unavailable or failed for '%s', trying pdfplumber", filename)
        return _pdfplumber_pdf(raw_bytes, filename)
    return docling_result


def _docling_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Attempt PDF extraction via Docling.

    Docling requires a file path, so the bytes are written to a temporary
    ``.pdf`` file which is removed again regardless of outcome.

    Args:
        raw_bytes: Content of the PDF.
        filename: Original document name; used only in log messages.

    Returns:
        An ExtractionResult with ``method="docling"``, or None when Docling is
        not installed, the conversion raises, or the exported markdown is
        blank.
    """
    try:
        from docling.document_converter import DocumentConverter  # type: ignore[import]
    except ImportError:
        logger.debug("document_extractor: Docling not installed, skipping")
        return None

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            # Record the path BEFORE writing: with delete=False the file is
            # already on disk, so a failed write (e.g. disk full) must still
            # reach the cleanup in ``finally`` instead of leaking the file.
            tmp_path = tmp.name
            tmp.write(raw_bytes)

        converter = DocumentConverter()
        conversion = converter.convert(tmp_path)
        doc = conversion.document

        text = doc.export_to_markdown()
        if not text or not text.strip():
            logger.warning("document_extractor: Docling returned empty text for '%s'", filename)
            return None

        # Heuristic table detection: markdown tables use pipe syntax. Any
        # literal "|" in the text also sets this flag.
        has_tables = "|" in text

        # Page count is best-effort; a failure here must not discard the text.
        page_count: int | None = None
        with contextlib.suppress(Exception):
            page_count = len(doc.pages) if hasattr(doc, "pages") else None

        return ExtractionResult(
            text=text.strip(),
            method="docling",
            page_count=page_count,
            has_tables=has_tables,
        )

    except Exception as exc:
        logger.warning("document_extractor: Docling extraction failed for '%s': %s", filename, exc)
        return None

    finally:
        if tmp_path is not None:
            # A missing file raises FileNotFoundError (an OSError), so no
            # separate existence check is needed.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)


def _pdfplumber_pdf(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract PDF text page by page with pdfplumber, reading from memory.

    Non-empty page texts are joined with a ``---`` separator line. Returns
    None when no page yields text or when anything raises, including
    pdfplumber not being importable.
    """
    try:
        import io

        import pdfplumber  # type: ignore[import]

        with pdfplumber.open(io.BytesIO(raw_bytes)) as pdf:
            chunks: list[str] = []
            tables_seen = False

            for page in pdf.pages:
                raw_page_text = page.extract_text()
                cleaned = raw_page_text.strip() if raw_page_text else ""
                if cleaned:
                    chunks.append(cleaned)
                # Short-circuits: once a table has been seen, later pages are
                # not probed for tables again.
                tables_seen = tables_seen or bool(page.extract_tables())

            if chunks:
                return ExtractionResult(
                    text="\n\n---\n\n".join(chunks),
                    method="pdfplumber",
                    page_count=len(pdf.pages),
                    has_tables=tables_seen,
                )

            logger.warning("document_extractor: pdfplumber found no extractable text in '%s'", filename)
            return None

    except Exception as exc:
        logger.warning("document_extractor: pdfplumber extraction failed for '%s': %s", filename, exc)
        return None


def _extract_docx(raw_bytes: bytes, filename: str) -> ExtractionResult | None:
    """Extract text from a DOCX file via Docling.

    The bytes are written to a temporary ``.docx`` file for the converter and
    the file is removed again regardless of outcome. There is no fallback
    engine for DOCX.

    Args:
        raw_bytes: Content of the DOCX document.
        filename: Original document name; used only in log messages.

    Returns:
        An ExtractionResult with ``method="docling"``, or None when Docling is
        not installed, the conversion raises, or the exported markdown is
        blank.
    """
    try:
        from docling.document_converter import DocumentConverter  # type: ignore[import]
    except ImportError:
        logger.warning("document_extractor: Docling not installed, cannot extract DOCX '%s'", filename)
        return None

    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
            # Record the path BEFORE writing: with delete=False the file is
            # already on disk, so a failed write (e.g. disk full) must still
            # reach the cleanup in ``finally`` instead of leaking the file.
            tmp_path = tmp.name
            tmp.write(raw_bytes)

        converter = DocumentConverter()
        conversion = converter.convert(tmp_path)
        doc = conversion.document

        text = doc.export_to_markdown()
        if not text or not text.strip():
            logger.warning("document_extractor: Docling returned empty text for DOCX '%s'", filename)
            return None

        # Heuristic table detection: markdown tables use pipe syntax. Any
        # literal "|" in the text also sets this flag.
        has_tables = "|" in text

        # Page count is best-effort; a failure here must not discard the text.
        page_count: int | None = None
        with contextlib.suppress(Exception):
            page_count = len(doc.pages) if hasattr(doc, "pages") else None

        return ExtractionResult(
            text=text.strip(),
            method="docling",
            page_count=page_count,
            has_tables=has_tables,
        )

    except Exception as exc:
        logger.warning("document_extractor: Docling DOCX extraction failed for '%s': %s", filename, exc)
        return None

    finally:
        if tmp_path is not None:
            # A missing file raises FileNotFoundError (an OSError), so no
            # separate existence check is needed.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)


def _extract_text_file(raw_bytes: bytes) -> ExtractionResult | None:
    """Decode a plain-text file (TXT, MD, CSV) as UTF-8.

    A leading UTF-8 byte-order mark is dropped so it does not end up as a
    stray U+FEFF at the start of the text. Undecodable bytes are replaced
    with U+FFFD rather than raising.

    Args:
        raw_bytes: Content of the text file.

    Returns:
        An ExtractionResult with ``method="text"``, no page count and
        ``has_tables=False``; None if decoding raises unexpectedly.
    """
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but skips an initial BOM.
        text = raw_bytes.decode("utf-8-sig", errors="replace")
        return ExtractionResult(
            text=text,
            method="text",
            page_count=None,
            has_tables=False,
        )
    except Exception as exc:
        logger.warning("document_extractor: text file decode failed: %s", exc)
        return None