financial-intelligence-ai / src /pdf_reader.py
Vaibuzzz's picture
Upload folder using huggingface_hub
abf3ab0 verified
"""
PDF → Markdown extraction using Docling.
Docling preserves complex financial table structures by converting
PDF content into clean Markdown, solving cell-merging and missing-row
issues that plague basic text extractors like PyPDF2.
"""
import os
from typing import Optional
from docling.document_converter import DocumentConverter
# Module-level cache for the shared Docling converter. It stays None until
# _get_converter() builds the instance, so the heavy initialization is not
# repeated on every call.
_converter: Optional[DocumentConverter] = None
def _get_converter() -> DocumentConverter:
    """Return the shared Docling DocumentConverter, creating it on first use."""
    global _converter

    # Fast path: an earlier call already built the converter.
    if _converter is not None:
        return _converter

    # First call: pay the construction cost once and cache the instance
    # in the module-level variable for all later calls.
    print(" [Docling] Initializing DocumentConverter...")
    _converter = DocumentConverter()
    print(" [Docling] Converter ready.")
    return _converter
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Convert a PDF file to a Markdown string using Docling.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Markdown-formatted string produced by Docling's Markdown export.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist or is not a
            regular file (for example, it points at a directory).
        ValueError: If Docling produces no content.
    """
    # Use isfile() rather than exists(): a directory "exists" but can never
    # be converted, so reject it here before paying for converter start-up
    # and before Docling fails with a less helpful error.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    converter = _get_converter()
    print(f" [Docling] Converting: {os.path.basename(pdf_path)} → Markdown")
    result = converter.convert(pdf_path)
    markdown_text = result.document.export_to_markdown()

    # Whitespace-only output is treated the same as no output at all.
    if not markdown_text.strip():
        raise ValueError(
            f"No content extracted from {pdf_path}. "
            "The PDF may be image-only or corrupted."
        )

    print(f" [Docling] Extracted {len(markdown_text):,} characters of Markdown.")
    return markdown_text
def extract_text_from_directory(dir_path: str, max_files: Optional[int] = None) -> list:
    """
    Extract Markdown from all PDFs in a directory.

    Entries are selected by a case-insensitive ``.pdf`` suffix and processed
    in sorted filename order, so ``max_files`` picks the same files on every
    run. A file whose extraction raises is reported with a ``[SKIP]`` line
    and left out of the result; the remaining files are still processed.

    Args:
        dir_path: Path to directory containing PDFs.
        max_files: Maximum number of files to process. ``None`` (the
            default) means no limit; ``0`` processes no files.

    Returns:
        List of dicts with 'filename', 'text' and 'char_count' keys, one
        per successfully extracted PDF.
    """
    # os.listdir() returns entries in arbitrary order; sort so the subset
    # chosen by max_files (and the result order) is deterministic.
    pdf_files = sorted(
        f for f in os.listdir(dir_path) if f.lower().endswith('.pdf')
    )
    # Compare against None explicitly: a plain truthiness test would treat
    # max_files=0 as "no limit" and process every file.
    if max_files is not None:
        pdf_files = pdf_files[:max_files]

    results = []
    for filename in pdf_files:
        filepath = os.path.join(dir_path, filename)
        try:
            text = extract_text_from_pdf(filepath)
        except Exception as e:
            # Deliberate best-effort batch: one bad PDF must not abort the
            # rest, so report the failure and move on.
            print(f" [SKIP] {filename}: {e}")
        else:
            results.append({
                "filename": filename,
                "text": text,
                "char_count": len(text),
            })

    print(f"Successfully extracted text from {len(results)}/{len(pdf_files)} PDFs")
    return results