import hashlib
import io
import json
import os
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from llama_index.core import Document

# Advanced Docling Imports for Table Extraction
from llama_index.readers.docling import DoclingReader
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
from docling.document_converter import DocumentConverter, PdfFormatOption
class PDFProcessor:
    """Converts PDF files into LlamaIndex Documents plus structured tables.

    Uses Docling's TableFormer (ACCURATE mode) in a single conversion pass:
    the same ``ConversionResult`` feeds both the RAG document text and the
    table DataFrames, avoiding a second parse of the PDF.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            chunk_size: Target chunk size for downstream splitting (stored only;
                not used inside this class — presumably consumed by the caller's
                indexing pipeline; TODO confirm).
            chunk_overlap: Overlap between chunks (stored only, see above).
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Initialize advanced pipeline options for accurate table discovery
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        # NOTE: OCR is disabled by default for 10x speed boost on text-based PDFs.
        pipeline_options.do_ocr = False
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

    def get_pdf_hash(self, pdf_file) -> str:
        """
        Generates an MD5 hash for the PDF file object to serve as a cache key.

        Restores the stream position afterwards, so callers may hash mid-read.
        (MD5 is fine here: it is a cache key, not a security boundary.)
        """
        pos = pdf_file.tell()
        pdf_file.seek(0)
        file_hash = hashlib.md5(pdf_file.read()).hexdigest()
        pdf_file.seek(pos)
        return file_hash

    def load_docling_documents(self, pdf_file, cache_path: Optional[Path] = None) -> Dict:
        """
        Uses Docling for unified RAG and Table Extraction in a single pass.

        Args:
            pdf_file: A binary file-like object with a ``name`` attribute
                (e.g. an uploaded file); read from the start, position not preserved.
            cache_path: If given, ``content.md`` and ``tables.json`` are written
                under this directory (created on demand; failures are logged,
                never raised).

        Returns:
            Dict with 'documents' (list of LlamaIndex Documents holding the
            full DoclingDocument JSON) and 'tables' (list of
            {'id', 'label', 'df'} entries, 'df' being a pandas DataFrame).
        """
        # Docling's converter wants a filesystem path, so spill the stream to
        # a temp file; delete=False lets us reopen it after the `with` closes
        # (required on Windows), and the `finally` below removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=os.getcwd()) as tmp:
            pdf_file.seek(0)
            tmp.write(pdf_file.read())
            tmp_path = Path(tmp.name)
        try:
            # 1. Single-Pass Conversion (Core Optimization - Truly One Pass)
            result = self.doc_converter.convert(tmp_path)
            doc = result.document  # This is a DoclingDocument (v2)

            # 2. Wrap the full document JSON as one LlamaIndex Document.
            # This replaces DoclingReader and avoids double-conversion.
            # get_pdf_hash seeks to 0 itself, so no extra seek is needed here.
            json_content = doc.model_dump_json()
            file_hash = self.get_pdf_hash(pdf_file)
            documents = [Document(
                text=json_content,
                metadata={
                    "filename": pdf_file.name,
                    "dl_doc_hash": file_hash,
                },
            )]

            # 3. Structured tables + optional persistence (helpers below).
            tables = self._extract_tables(doc)
            if cache_path:
                self._persist(result, tables, cache_path)

            return {
                "documents": documents,
                "tables": tables,
            }
        finally:
            # Best-effort cleanup of the spilled temp file.
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except Exception:
                pass

    def _extract_tables(self, doc) -> List[Dict]:
        """Export each detected table to a DataFrame entry; skips empty tables.

        Per-table failures are logged and do not abort the remaining tables.
        """
        tables = []
        for i, table in enumerate(doc.tables):
            try:
                df = table.export_to_dataframe(doc=doc)
                if not df.empty:
                    # Find page number for labeling; provenance may be absent.
                    page_no = "?"
                    if table.prov and len(table.prov) > 0:
                        page_no = table.prov[0].page_no
                    tables.append({
                        "id": i + 1,
                        "label": f"Table {i+1} (Page {page_no})",
                        "df": df,
                    })
            except Exception as e:
                print(f"Table Extraction Error [Table {i+1}]: {e}")
        return tables

    def _persist(self, result, tables: List[Dict], cache_path: Path) -> None:
        """Write Markdown and table JSON under cache_path; log-and-continue on error."""
        try:
            cache_path.mkdir(parents=True, exist_ok=True)
            # Markdown Export (Instant from existing result)
            md_content = result.document.export_to_markdown()
            with open(cache_path / "content.md", "w", encoding="utf-8") as f:
                f.write(md_content)
            # Store tables as JSON for persistence
            if tables:
                serialized_tables = [{
                    "id": t["id"],
                    "label": t["label"],
                    "data": t["df"].to_dict(orient="records"),
                } for t in tables]
                with open(cache_path / "tables.json", "w", encoding="utf-8") as f:
                    json.dump(serialized_tables, f, indent=2)
        except Exception as e:
            print(f"Persistence Error: {e}")
if __name__ == "__main__":
    # Placeholder entry point for manual smoke-testing; intentionally a no-op.
    pass