Spaces:
Running
Running
| import hashlib | |
| import json | |
| import tempfile | |
| import os | |
| import io | |
| from pathlib import Path | |
| from typing import List, Dict | |
| from llama_index.core import Document | |
| import pandas as pd | |
| # Advanced Docling Imports for Table Extraction | |
| from llama_index.readers.docling import DoclingReader | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode | |
| from docling.document_converter import DocumentConverter | |
| from docling.document_converter import PdfFormatOption | |
class PDFProcessor:
    """Convert PDFs with Docling into LlamaIndex documents and table DataFrames.

    One ``DocumentConverter`` is configured in ``__init__`` (table-structure
    recognition enabled, OCR disabled) and reused for every conversion.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the chunking settings and build the Docling converter.

        Args:
            chunk_size: Stored on the instance; not read by any method of
                this class.
            chunk_overlap: Stored on the instance; not read by any method of
                this class.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Pipeline options for table discovery.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        # NOTE: OCR is switched off; the original author cites a ~10x speed-up
        # on text-based PDFs. Scanned PDFs presumably need do_ocr = True.
        pipeline_options.do_ocr = False
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

    def get_pdf_hash(self, pdf_file) -> str:
        """Return the MD5 hex digest of the whole file, for use as a cache key.

        The stream is read from offset 0 and its previous position is
        restored afterwards, even if reading fails.

        Args:
            pdf_file: Seekable binary file-like object.

        Returns:
            The hexadecimal MD5 digest of the file content.
        """
        pos = pdf_file.tell()
        try:
            pdf_file.seek(0)
            return hashlib.md5(pdf_file.read()).hexdigest()
        finally:
            pdf_file.seek(pos)

    def load_docling_documents(self, pdf_file, cache_path: Path = None) -> Dict:
        """Convert a PDF once and derive both RAG documents and tables from it.

        The upload is copied to a temporary ``.pdf`` file in the current
        working directory, converted with Docling, and the temporary file is
        always removed afterwards.

        Args:
            pdf_file: Seekable binary file-like object holding the PDF. Its
                ``name`` attribute, when present, is recorded as metadata.
            cache_path: Optional directory. When given, ``content.md`` and
                (if any tables were found) ``tables.json`` are written there.
                Persistence failures are printed, not raised.

        Returns:
            A dict with two keys:
            ``"documents"``: a list holding one LlamaIndex ``Document`` whose
            text is the JSON dump of the Docling document.
            ``"tables"``: a list of dicts with keys ``"id"``, ``"label"`` and
            ``"df"`` (a DataFrame), one per non-empty table.
        """
        tmp_path = None
        try:
            # The temp file is created inside the try block so that a failure
            # while copying the upload cannot leave it behind on disk.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=".pdf", dir=os.getcwd()
            ) as tmp:
                tmp_path = Path(tmp.name)
                pdf_file.seek(0)
                tmp.write(pdf_file.read())

            # 1. Single conversion pass; everything below reuses this result.
            result = self.doc_converter.convert(tmp_path)
            doc = result.document

            # 2. One LlamaIndex Document carrying the full JSON content.
            pdf_file.seek(0)
            documents = [
                Document(
                    text=doc.model_dump_json(),
                    metadata={
                        # Plain in-memory streams (e.g. io.BytesIO) have no
                        # ``name``; fall back instead of raising.
                        "filename": getattr(pdf_file, "name", "unknown"),
                        "dl_doc_hash": self.get_pdf_hash(pdf_file),
                    },
                )
            ]

            # 3. Structured tables.
            tables = self._extract_tables(doc)

            # 4. Optional persistence of Markdown and tables.
            if cache_path:
                self._persist(doc, tables, cache_path)

            return {
                "documents": documents,
                "tables": tables,
            }
        finally:
            if tmp_path is not None:
                try:
                    tmp_path.unlink(missing_ok=True)
                except OSError as e:
                    # Best effort: a leftover temp file must not mask the
                    # real result or the real exception.
                    print(f"Temp File Cleanup Error: {e}")

    def _extract_tables(self, doc) -> List[Dict]:
        """Export every non-empty table of ``doc`` to a labelled DataFrame."""
        tables = []
        for number, table in enumerate(doc.tables, start=1):
            try:
                df = table.export_to_dataframe(doc=doc)
                if df.empty:
                    continue
                # Label with the page of the first provenance entry, if any.
                page_no = table.prov[0].page_no if table.prov else "?"
                tables.append({
                    "id": number,
                    "label": f"Table {number} (Page {page_no})",
                    "df": df,
                })
            except Exception as e:
                # A single broken table must not abort the whole document.
                print(f"Table Extraction Error [Table {number}]: {e}")
        return tables

    def _persist(self, doc, tables, cache_path) -> None:
        """Write ``content.md`` and ``tables.json`` under ``cache_path``."""
        try:
            cache_dir = Path(cache_path)  # also accepts a plain string path
            cache_dir.mkdir(parents=True, exist_ok=True)

            # Markdown export comes from the already-converted document.
            md_content = doc.export_to_markdown()
            with open(cache_dir / "content.md", "w", encoding="utf-8") as f:
                f.write(md_content)

            # Tables are stored as JSON records; skipped when there are none.
            if tables:
                serialized_tables = [
                    {
                        "id": t["id"],
                        "label": t["label"],
                        "data": t["df"].to_dict(orient="records"),
                    }
                    for t in tables
                ]
                with open(cache_dir / "tables.json", "w", encoding="utf-8") as f:
                    json.dump(serialized_tables, f, indent=2)
        except Exception as e:
            # Caching is optional; report and carry on.
            print(f"Persistence Error: {e}")
if __name__ == "__main__":
    # Placeholder for a manual test run; running the module directly is
    # currently a no-op.
    ...