File size: 2,790 Bytes
10ff0db
abf3ab0
 
 
 
 
10ff0db
 
 
 
abf3ab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10ff0db
 
 
 
abf3ab0
 
10ff0db
 
abf3ab0
10ff0db
abf3ab0
 
10ff0db
 
abf3ab0
10ff0db
 
 
 
abf3ab0
10ff0db
abf3ab0
 
 
10ff0db
abf3ab0
10ff0db
abf3ab0
 
10ff0db
 
abf3ab0
 
10ff0db
 
 
 
abf3ab0
 
10ff0db
 
 
abf3ab0
10ff0db
 
 
 
 
abf3ab0
10ff0db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
PDF → Markdown extraction using Docling.

Docling preserves complex financial table structures by converting
PDF content into clean Markdown, solving cell-merging and missing-row
issues that plague basic text extractors like PyPDF2.
"""

import os
from typing import Optional

from docling.document_converter import DocumentConverter


# Module-level singleton — avoids re-initializing the heavy converter on every call.
# Starts as None; _get_converter() assigns the instance the first time it is called.
_converter: Optional[DocumentConverter] = None


def _get_converter() -> DocumentConverter:
    """Return the shared DocumentConverter, constructing it on first use."""
    global _converter

    # Fast path: a previous call already built the converter.
    if _converter is not None:
        return _converter

    # First call: build the converter once and cache it in the module global.
    print("  [Docling] Initializing DocumentConverter...")
    _converter = DocumentConverter()
    print("  [Docling] Converter ready.")
    return _converter


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Convert a PDF file to a Markdown string using Docling.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The Markdown string returned by Docling's ``export_to_markdown()``
        for the converted document.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing regular file
            (the path is missing, or it points to a directory).
        ValueError: If Docling produces no content (empty or
            whitespace-only Markdown).
    """
    # isfile() rather than exists(): a directory "exists" but is not a PDF,
    # so reject it here instead of handing it to the converter.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    converter = _get_converter()

    print(f"  [Docling] Converting: {os.path.basename(pdf_path)} → Markdown")
    result = converter.convert(pdf_path)
    markdown_text = result.document.export_to_markdown()

    # Whitespace-only output is treated the same as no output at all.
    if not markdown_text.strip():
        raise ValueError(
            f"No content extracted from {pdf_path}. "
            "The PDF may be image-only or corrupted."
        )

    print(f"  [Docling] Extracted {len(markdown_text):,} characters of Markdown.")
    return markdown_text


def extract_text_from_directory(dir_path: str, max_files: Optional[int] = None) -> list:
    """
    Extract Markdown from all PDFs in a directory.

    Entries are selected by a case-insensitive ``.pdf`` suffix and processed
    in sorted filename order, so ``max_files`` picks a reproducible subset.
    A file whose extraction raises is reported with a ``[SKIP]`` line and
    left out of the result; it does not abort the rest of the batch.

    Args:
        dir_path: Path to directory containing PDFs.
        max_files: Maximum number of files to process. ``None`` (the
            default) means no limit; ``0`` processes no files.

    Returns:
        List of dicts with 'filename', 'text' and 'char_count' keys, one per
        successfully extracted PDF.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir_path`` cannot be
            listed (e.g. it does not exist or is not a directory).
    """
    # os.listdir() yields entries in arbitrary order; sort so that the
    # max_files cut-off and the result order are deterministic.
    pdf_files = sorted(f for f in os.listdir(dir_path) if f.lower().endswith('.pdf'))

    # Compare against None explicitly: a plain truthiness test would treat
    # max_files=0 as "no limit" and process every file.
    if max_files is not None:
        pdf_files = pdf_files[:max_files]

    results = []
    for filename in pdf_files:
        filepath = os.path.join(dir_path, filename)
        try:
            text = extract_text_from_pdf(filepath)
        except Exception as e:
            # Deliberate best-effort boundary: one bad PDF must not stop the batch.
            print(f"  [SKIP] {filename}: {e}")
            continue
        results.append({
            "filename": filename,
            "text": text,
            "char_count": len(text),
        })

    print(f"Successfully extracted text from {len(results)}/{len(pdf_files)} PDFs")
    return results