Spaces:

NEXAS
/

challenge-b

Running

App Files Community

NEXAS committed 5 days ago

Commit

109bdd3

verified ·

1 Parent(s): d4ba13e

Upload 23 files

Browse files

Files changed (11) hide show

.gitattributes +1 -0
.gitignore +41 -0
agent/__pycache__/__init__.cpython-311.pyc +0 -0
agent/__pycache__/agent.cpython-311.pyc +0 -0
agent/__pycache__/llm_client.cpython-311.pyc +0 -0
agent/agent.py +24 -4
app.py +2 -1
nvidia_q4_fy24.pdf +3 -0
processor/__pycache__/__init__.cpython-311.pyc +0 -0
processor/__pycache__/pdf_processor.cpython-311.pyc +0 -0
processor/pdf_processor.py +78 -19

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+nvidia_q4_fy24.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,41 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+ENV/
+# IDEs
+.vscode/
+.idea/
+# Project Specific
+.env
+.llama_cache/
+test_cache/
+*.pdf
+*.log
+# Docling/Models
+models/
+.cache/

agent/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (187 Bytes). View file

agent/__pycache__/agent.cpython-311.pyc ADDED Viewed

Binary file (17.7 kB). View file

agent/__pycache__/llm_client.cpython-311.pyc ADDED Viewed

Binary file (4.1 kB). View file

agent/agent.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import hashlib
 import json
 import faiss
 import re
 import time
 from typing import List, Dict, Any
@@ -73,6 +74,7 @@ class LlamaPDFAgent:
         """
         file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
         self.current_hash = file_hash
         doc_cache_path = os.path.join(self.cache_dir, file_hash)
         # 1. Check if already indexed
@@ -83,10 +85,27 @@ class LlamaPDFAgent:
             )
             self.vector_index = load_index_from_storage(storage_context)
             # Re-load metadata (Docling)
-            result = self.pdf_processor.load_docling_documents(pdf_file)
             documents = result["documents"]
-            self.tables = result["tables"]
             self.summary_index = SummaryIndex.from_documents(documents)
             # Rebuild Retriever/Engine
@@ -111,7 +130,7 @@ class LlamaPDFAgent:
         # 2. Fresh Ingest (Load and parse)
         # 1. Load Documents with rich metadata via Docling JSON
-        result = self.pdf_processor.load_docling_documents(pdf_file)
         documents = result["documents"]
         self.tables = result["tables"]
@@ -202,7 +221,8 @@ class LlamaPDFAgent:
         return {
-            "answer_gen": response.response_gen, # Generator for streaming
             "sources": sources
         }

 import hashlib
 import json
 import faiss
+from pathlib import Path
 import re
 import time
 from typing import List, Dict, Any
         """
         file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
         self.current_hash = file_hash
+        cache_path = Path(self.cache_dir) / file_hash
         doc_cache_path = os.path.join(self.cache_dir, file_hash)
         # 1. Check if already indexed
             )
             self.vector_index = load_index_from_storage(storage_context)
+            # Load previously persisted tables from tables.json when present (the Docling re-load below still runs to rebuild the documents)
+            tables_cache = cache_path / "tables.json"
+            if tables_cache.exists():
+                try:
+                    with open(tables_cache, "r", encoding="utf-8") as f:
+                        raw_tables = json.load(f)
+                    self.tables = []
+                    for rt in raw_tables:
+                        self.tables.append({
+                            "id": rt["id"],
+                            "label": rt["label"],
+                            "df": pd.DataFrame(rt["data"])
+                        })
+                except Exception as e:
+                    self.tables = []
             # Re-load metadata (Docling)
+            result = self.pdf_processor.load_docling_documents(pdf_file, cache_path=cache_path)
             documents = result["documents"]
+            if not self.tables: self.tables = result["tables"]
             self.summary_index = SummaryIndex.from_documents(documents)
             # Rebuild Retriever/Engine
         # 2. Fresh Ingest (Load and parse)
         # 1. Load Documents with rich metadata via Docling JSON
+        result = self.pdf_processor.load_docling_documents(pdf_file, cache_path=cache_path)
         documents = result["documents"]
         self.tables = result["tables"]
         return {
+            "answer": str(response), # Full text for batch processing (SWOT, Insights)
+            "answer_gen": response.response_gen, # Generator for streaming (Chat)
             "sources": sources
         }

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 import os
 import pandas as pd
 import json
 import time
@@ -448,7 +449,7 @@ if st.session_state.pdf_agent:
             selected_table = tables[selected_idx]
             st.markdown(f"#### {selected_table['label']}")
-            st.dataframe(selected_table['df'], use_container_width=True)
             # Download as CSV
             csv = selected_table['df'].to_csv(index=False).encode('utf-8')

 import streamlit as st
 import os
+import traceback
 import pandas as pd
 import json
 import time
             selected_table = tables[selected_idx]
             st.markdown(f"#### {selected_table['label']}")
+            st.dataframe(selected_table['df'], width="stretch")
             # Download as CSV
             csv = selected_table['df'].to_csv(index=False).encode('utf-8')

nvidia_q4_fy24.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b194a19b08a74d6eef744a11f31fa115b064e4dd712b9de54baef346cb3eae5e
+size 2783211

processor/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (191 Bytes). View file

processor/__pycache__/pdf_processor.cpython-311.pyc ADDED Viewed

Binary file (7.31 kB). View file

processor/pdf_processor.py CHANGED Viewed

@@ -1,18 +1,38 @@
 import hashlib
 import tempfile
 import os
 import io
 from pathlib import Path
 from typing import List, Dict
 import pandas as pd
 from llama_index.readers.docling import DoclingReader
 from docling.document_converter import DocumentConverter
 class PDFProcessor:
     def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
-        self.doc_converter = DocumentConverter()
     def get_pdf_hash(self, pdf_file) -> str:
         """
@@ -24,40 +44,79 @@ class PDFProcessor:
         pdf_file.seek(pos)
         return file_hash
-    def load_docling_documents(self, pdf_file) -> Dict:
         """
-        Uses DoclingReader for RAG and DocumentConverter for Table Extraction.
         Returns a dict with 'documents' (LlamaIndex) and 'tables' (List of DataFrames).
         """
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
             pdf_file.seek(0)
             tmp.write(pdf_file.read())
             tmp_path = Path(tmp.name)
         try:
-            # 1. Ingest for LlamaIndex RAG
-            reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
-            documents = reader.load_data(file_path=tmp_path)
-            # 2. Extract structured tables for DataFrame explorer
             result = self.doc_converter.convert(tmp_path)
-            doc = result.document
             tables = []
             for i, table in enumerate(doc.tables):
                 try:
-                    # Export table to HTML then read via pandas
-                    html_table = table.export_to_html()
-                    dfs = pd.read_html(io.StringIO(html_table))
-                    if dfs:
                         tables.append({
                             "id": i + 1,
-                            "label": f"Table {i+1}",
-                            "df": dfs[0]
                         })
-                except Exception:
-                    pass
             return {
                 "documents": documents,

 import hashlib
+import json
 import tempfile
 import os
 import io
 from pathlib import Path
 from typing import List, Dict
+from llama_index.core import Document
 import pandas as pd
+# Advanced Docling Imports for Table Extraction
 from llama_index.readers.docling import DoclingReader
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
 from docling.document_converter import DocumentConverter
+from docling.document_converter import PdfFormatOption
 class PDFProcessor:
     def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
+        # Initialize advanced pipeline options for accurate table discovery
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_table_structure = True
+        # NOTE: OCR is disabled by default for 10x speed boost on text-based PDFs.
+        pipeline_options.do_ocr = False
+        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+        self.doc_converter = DocumentConverter(
+            format_options={
+                InputFormat.PDF: PdfFormatOption(
+                    pipeline_options=pipeline_options
+                )
+            }
+        )
     def get_pdf_hash(self, pdf_file) -> str:
         """
         pdf_file.seek(pos)
         return file_hash
+    def load_docling_documents(self, pdf_file, cache_path: Path = None) -> Dict:
         """
+        Uses Docling for unified RAG and Table Extraction in a single pass.
         Returns a dict with 'documents' (LlamaIndex) and 'tables' (List of DataFrames).
         """
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=os.getcwd()) as tmp:
             pdf_file.seek(0)
             tmp.write(pdf_file.read())
             tmp_path = Path(tmp.name)
         try:
+            # 1. Single-Pass Conversion (Core Optimization - Truly One Pass)
             result = self.doc_converter.convert(tmp_path)
+            doc = result.document # This is a DoclingDocument (v2)
+            # 2. Extract LlamaIndex Documents from the result manually
+            # This replaces DoclingReader and avoids double-conversion
+            json_content = result.document.model_dump_json()
+            # We create a single LlamaIndex Document with the full JSON content.
+            # Use our existing hash generator for consistency.
+            pdf_file.seek(0)
+            file_hash = self.get_pdf_hash(pdf_file)
+            documents = [Document(
+                text=json_content,
+                metadata={
+                    "filename": pdf_file.name,
+                    "dl_doc_hash": file_hash
+                }
+            )]
+            # 3. Extract structured tables (Uses Docling v2 high-speed export)
             tables = []
             for i, table in enumerate(doc.tables):
                 try:
+                    df = table.export_to_dataframe(doc=doc)
+                    if not df.empty:
+                        # Find page number for labeling
+                        page_no = "?"
+                        if table.prov and len(table.prov) > 0:
+                            page_no = table.prov[0].page_no
                         tables.append({
                             "id": i + 1,
+                            "label": f"Table {i+1} (Page {page_no})",
+                            "df": df
                         })
+                except Exception as e:
+                    print(f"Table Extraction Error [Table {i+1}]: {e}")
+            # PERSIST: Save Markdown and Tables
+            if cache_path:
+                try:
+                    cache_path.mkdir(parents=True, exist_ok=True)
+                    # Markdown Export (Instant from existing result)
+                    md_content = result.document.export_to_markdown()
+                    with open(cache_path / "content.md", "w", encoding="utf-8") as f:
+                        f.write(md_content)
+                    # Store tables as JSON for persistence
+                    if tables:
+                        serialized_tables = [{
+                            "id": t["id"],
+                            "label": t["label"],
+                            "data": t["df"].to_dict(orient="records")
+                        } for t in tables]
+                        with open(cache_path / "tables.json", "w", encoding="utf-8") as f:
+                            json.dump(serialized_tables, f, indent=2)
+                except Exception as e:
+                    print(f"Persistence Error: {e}")
             return {
                 "documents": documents,