import hashlib
import io
import json
import os
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from llama_index.core import Document

# Advanced Docling imports for table extraction.
from llama_index.readers.docling import DoclingReader
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
    TableStructureOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


class PDFProcessor:
    """Converts PDFs into LlamaIndex documents and structured tables via Docling.

    A single Docling conversion pass produces both the RAG-ready document
    content and every table found in the PDF, avoiding the double conversion
    that running DoclingReader separately would incur.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Chunking parameters are stored for downstream consumers; the
        # conversion pipeline itself does not use them.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Configure the Docling PDF pipeline for accurate table discovery.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        # NOTE: OCR is disabled by default for a large speed boost on
        # text-based PDFs; scanned (image-only) PDFs will yield no text
        # with this setting.
        pipeline_options.do_ocr = False
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

    def get_pdf_hash(self, pdf_file) -> str:
        """Return an MD5 hex digest of the file object's full contents.

        MD5 is used purely as a fast cache key, not for anything
        security-sensitive. The file object's current read position is
        preserved across the call.
        """
        pos = pdf_file.tell()
        pdf_file.seek(0)
        file_hash = hashlib.md5(pdf_file.read()).hexdigest()
        pdf_file.seek(pos)  # Restore the caller's read position.
        return file_hash

    def load_docling_documents(self, pdf_file, cache_path: Optional[Path] = None) -> Dict:
        """Convert a PDF in a single Docling pass into documents and tables.

        Args:
            pdf_file: Binary file-like object supporting ``seek``/``read``
                and exposing a ``name`` attribute (e.g. an uploaded file).
            cache_path: Optional directory; when given, a Markdown export
                and a JSON serialization of the tables are persisted there
                on a best-effort basis.

        Returns:
            Dict with keys:
                ``documents``: one-element list holding a LlamaIndex
                    ``Document`` whose text is the full Docling JSON dump.
                ``tables``: list of dicts with ``id``, ``label`` and ``df``
                    (a pandas DataFrame) for each non-empty table found.
        """
        # Docling needs a real path on disk, so spill the stream to a
        # temporary file first (removed in the finally block below).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=os.getcwd()) as tmp:
            pdf_file.seek(0)
            tmp.write(pdf_file.read())
            tmp_path = Path(tmp.name)

        try:
            # 1. Single-pass conversion (core optimization — truly one pass).
            result = self.doc_converter.convert(tmp_path)
            doc = result.document  # DoclingDocument (v2)

            # 2. Build the LlamaIndex Document directly from the result;
            # this replaces DoclingReader and avoids a second conversion.
            json_content = doc.model_dump_json()
            # get_pdf_hash rewinds and restores the stream itself, so no
            # extra seek is needed before calling it.
            file_hash = self.get_pdf_hash(pdf_file)
            documents = [
                Document(
                    text=json_content,
                    metadata={
                        "filename": pdf_file.name,
                        "dl_doc_hash": file_hash,
                    },
                )
            ]

            # 3. Extract structured tables (Docling v2 high-speed export).
            tables = []
            for i, table in enumerate(doc.tables):
                try:
                    df = table.export_to_dataframe(doc=doc)
                    if df.empty:
                        continue
                    # Label the table with its page number when provenance
                    # information is available.
                    page_no = table.prov[0].page_no if table.prov else "?"
                    tables.append(
                        {
                            "id": i + 1,
                            "label": f"Table {i+1} (Page {page_no})",
                            "df": df,
                        }
                    )
                except Exception as e:
                    # Best-effort: one malformed table must not abort the
                    # extraction of the remaining tables.
                    print(f"Table Extraction Error [Table {i+1}]: {e}")

            # Persist Markdown and tables for later reuse (best-effort).
            if cache_path:
                self._persist_cache(cache_path, result, tables)

            return {"documents": documents, "tables": tables}
        finally:
            # Always remove the temporary copy, even if conversion failed.
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except Exception:
                pass

    @staticmethod
    def _persist_cache(cache_path: Path, result, tables: List[Dict]) -> None:
        """Write a Markdown export and a JSON table dump into *cache_path*.

        Failures are logged and swallowed: caching is an optimization and
        must never break the main conversion flow.
        """
        try:
            cache_path.mkdir(parents=True, exist_ok=True)

            # The Markdown export is cheap here because the conversion
            # result already exists in memory.
            md_content = result.document.export_to_markdown()
            with open(cache_path / "content.md", "w", encoding="utf-8") as f:
                f.write(md_content)

            # Store tables as JSON records so they survive restarts.
            if tables:
                serialized_tables = [
                    {
                        "id": t["id"],
                        "label": t["label"],
                        "data": t["df"].to_dict(orient="records"),
                    }
                    for t in tables
                ]
                with open(cache_path / "tables.json", "w", encoding="utf-8") as f:
                    json.dump(serialized_tables, f, indent=2)
        except Exception as e:
            print(f"Persistence Error: {e}")


if __name__ == "__main__":
    # Test
    pass