NEXAS committed on
Commit
109bdd3
·
verified ·
1 Parent(s): d4ba13e

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ nvidia_q4_fy24.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+
24
+ # Virtual Environment
25
+ venv/
26
+ ENV/
27
+
28
+ # IDEs
29
+ .vscode/
30
+ .idea/
31
+
32
+ # Project Specific
33
+ .env
34
+ .llama_cache/
35
+ test_cache/
36
+ *.pdf
37
+ *.log
38
+
39
+ # Docling/Models
40
+ models/
41
+ .cache/
agent/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes). View file
 
agent/__pycache__/agent.cpython-311.pyc ADDED
Binary file (17.7 kB). View file
 
agent/__pycache__/llm_client.cpython-311.pyc ADDED
Binary file (4.1 kB). View file
 
agent/agent.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import hashlib
3
  import json
4
  import faiss
 
5
  import re
6
  import time
7
  from typing import List, Dict, Any
@@ -73,6 +74,7 @@ class LlamaPDFAgent:
73
  """
74
  file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
75
  self.current_hash = file_hash
 
76
  doc_cache_path = os.path.join(self.cache_dir, file_hash)
77
 
78
  # 1. Check if already indexed
@@ -83,10 +85,27 @@ class LlamaPDFAgent:
83
  )
84
  self.vector_index = load_index_from_storage(storage_context)
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  # Re-load metadata (Docling)
87
- result = self.pdf_processor.load_docling_documents(pdf_file)
88
  documents = result["documents"]
89
- self.tables = result["tables"]
90
  self.summary_index = SummaryIndex.from_documents(documents)
91
 
92
  # Rebuild Retriever/Engine
@@ -111,7 +130,7 @@ class LlamaPDFAgent:
111
  # 2. Fresh Ingest (Load and parse)
112
 
113
  # 1. Load Documents with rich metadata via Docling JSON
114
- result = self.pdf_processor.load_docling_documents(pdf_file)
115
  documents = result["documents"]
116
  self.tables = result["tables"]
117
 
@@ -202,7 +221,8 @@ class LlamaPDFAgent:
202
 
203
 
204
  return {
205
- "answer_gen": response.response_gen, # Generator for streaming
 
206
  "sources": sources
207
  }
208
 
 
2
  import hashlib
3
  import json
4
  import faiss
5
+ from pathlib import Path
6
  import re
7
  import time
8
  from typing import List, Dict, Any
 
74
  """
75
  file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
76
  self.current_hash = file_hash
77
+ cache_path = Path(self.cache_dir) / file_hash
78
  doc_cache_path = os.path.join(self.cache_dir, file_hash)
79
 
80
  # 1. Check if already indexed
 
85
  )
86
  self.vector_index = load_index_from_storage(storage_context)
87
 
88
+ # NEW: Reuse persisted tables from tables.json when present (documents are still re-loaded via Docling below)
89
+ tables_cache = cache_path / "tables.json"
90
+
91
+ if tables_cache.exists():
92
+ try:
93
+ with open(tables_cache, "r", encoding="utf-8") as f:
94
+ raw_tables = json.load(f)
95
+ self.tables = []
96
+ for rt in raw_tables:
97
+ self.tables.append({
98
+ "id": rt["id"],
99
+ "label": rt["label"],
100
+ "df": pd.DataFrame(rt["data"])
101
+ })
102
+ except Exception as e:
103
+ self.tables = []
104
+
105
  # Re-load metadata (Docling)
106
+ result = self.pdf_processor.load_docling_documents(pdf_file, cache_path=cache_path)
107
  documents = result["documents"]
108
+ if not self.tables: self.tables = result["tables"]
109
  self.summary_index = SummaryIndex.from_documents(documents)
110
 
111
  # Rebuild Retriever/Engine
 
130
  # 2. Fresh Ingest (Load and parse)
131
 
132
  # 1. Load Documents with rich metadata via Docling JSON
133
+ result = self.pdf_processor.load_docling_documents(pdf_file, cache_path=cache_path)
134
  documents = result["documents"]
135
  self.tables = result["tables"]
136
 
 
221
 
222
 
223
  return {
224
+ "answer": str(response), # Full text for batch processing (SWOT, Insights)
225
+ "answer_gen": response.response_gen, # Generator for streaming (Chat)
226
  "sources": sources
227
  }
228
 
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
  import os
 
3
  import pandas as pd
4
  import json
5
  import time
@@ -448,7 +449,7 @@ if st.session_state.pdf_agent:
448
  selected_table = tables[selected_idx]
449
 
450
  st.markdown(f"#### {selected_table['label']}")
451
- st.dataframe(selected_table['df'], use_container_width=True)
452
 
453
  # Download as CSV
454
  csv = selected_table['df'].to_csv(index=False).encode('utf-8')
 
1
  import streamlit as st
2
  import os
3
+ import traceback
4
  import pandas as pd
5
  import json
6
  import time
 
449
  selected_table = tables[selected_idx]
450
 
451
  st.markdown(f"#### {selected_table['label']}")
452
+ st.dataframe(selected_table['df'], width="stretch")
453
 
454
  # Download as CSV
455
  csv = selected_table['df'].to_csv(index=False).encode('utf-8')
nvidia_q4_fy24.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b194a19b08a74d6eef744a11f31fa115b064e4dd712b9de54baef346cb3eae5e
3
+ size 2783211
processor/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
processor/__pycache__/pdf_processor.cpython-311.pyc ADDED
Binary file (7.31 kB). View file
 
processor/pdf_processor.py CHANGED
@@ -1,18 +1,38 @@
1
  import hashlib
 
2
  import tempfile
3
  import os
4
  import io
5
  from pathlib import Path
6
  from typing import List, Dict
 
7
  import pandas as pd
 
8
  from llama_index.readers.docling import DoclingReader
 
 
9
  from docling.document_converter import DocumentConverter
 
10
 
11
  class PDFProcessor:
12
  def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
13
  self.chunk_size = chunk_size
14
  self.chunk_overlap = chunk_overlap
15
- self.doc_converter = DocumentConverter()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_pdf_hash(self, pdf_file) -> str:
18
  """
@@ -24,40 +44,79 @@ class PDFProcessor:
24
  pdf_file.seek(pos)
25
  return file_hash
26
 
27
- def load_docling_documents(self, pdf_file) -> Dict:
28
  """
29
- Uses DoclingReader for RAG and DocumentConverter for Table Extraction.
30
  Returns a dict with 'documents' (LlamaIndex) and 'tables' (List of DataFrames).
31
  """
32
-
33
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
34
  pdf_file.seek(0)
35
  tmp.write(pdf_file.read())
36
  tmp_path = Path(tmp.name)
37
 
38
  try:
39
- # 1. Ingest for LlamaIndex RAG
40
- reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
41
- documents = reader.load_data(file_path=tmp_path)
42
-
43
- # 2. Extract structured tables for DataFrame explorer
44
  result = self.doc_converter.convert(tmp_path)
45
- doc = result.document
 
 
 
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  tables = []
48
  for i, table in enumerate(doc.tables):
49
  try:
50
- # Export table to HTML then read via pandas
51
- html_table = table.export_to_html()
52
- dfs = pd.read_html(io.StringIO(html_table))
53
- if dfs:
 
 
 
54
  tables.append({
55
  "id": i + 1,
56
- "label": f"Table {i+1}",
57
- "df": dfs[0]
58
  })
59
- except Exception:
60
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  return {
63
  "documents": documents,
 
1
  import hashlib
2
+ import json
3
  import tempfile
4
  import os
5
  import io
6
  from pathlib import Path
7
  from typing import List, Dict
8
+ from llama_index.core import Document
9
  import pandas as pd
10
+ # Advanced Docling Imports for Table Extraction
11
  from llama_index.readers.docling import DoclingReader
12
+ from docling.datamodel.base_models import InputFormat
13
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, TableFormerMode
14
  from docling.document_converter import DocumentConverter
15
+ from docling.document_converter import PdfFormatOption
16
 
17
  class PDFProcessor:
18
  def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
19
  self.chunk_size = chunk_size
20
  self.chunk_overlap = chunk_overlap
21
+
22
+ # Initialize advanced pipeline options for accurate table discovery
23
+ pipeline_options = PdfPipelineOptions()
24
+ pipeline_options.do_table_structure = True
25
+ # NOTE: OCR is disabled by default for 10x speed boost on text-based PDFs.
26
+ pipeline_options.do_ocr = False
27
+ pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
28
+
29
+ self.doc_converter = DocumentConverter(
30
+ format_options={
31
+ InputFormat.PDF: PdfFormatOption(
32
+ pipeline_options=pipeline_options
33
+ )
34
+ }
35
+ )
36
 
37
  def get_pdf_hash(self, pdf_file) -> str:
38
  """
 
44
  pdf_file.seek(pos)
45
  return file_hash
46
 
47
+ def load_docling_documents(self, pdf_file, cache_path: Path = None) -> Dict:
48
  """
49
+ Uses Docling for unified RAG and Table Extraction in a single pass.
50
  Returns a dict with 'documents' (LlamaIndex) and 'tables' (List of DataFrames).
51
  """
52
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=os.getcwd()) as tmp:
 
53
  pdf_file.seek(0)
54
  tmp.write(pdf_file.read())
55
  tmp_path = Path(tmp.name)
56
 
57
  try:
58
+ # 1. Single-Pass Conversion (Core Optimization - Truly One Pass)
 
 
 
 
59
  result = self.doc_converter.convert(tmp_path)
60
+ doc = result.document # This is a DoclingDocument (v2)
61
+
62
+ # 2. Extract LlamaIndex Documents from the result manually
63
+ # This replaces DoclingReader and avoids double-conversion
64
+ json_content = result.document.model_dump_json()
65
 
66
+ # We create a single LlamaIndex Document with the full JSON content.
67
+ # Use our existing hash generator for consistency.
68
+ pdf_file.seek(0)
69
+ file_hash = self.get_pdf_hash(pdf_file)
70
+
71
+ documents = [Document(
72
+ text=json_content,
73
+ metadata={
74
+ "filename": pdf_file.name,
75
+ "dl_doc_hash": file_hash
76
+ }
77
+ )]
78
+
79
+ # 3. Extract structured tables (Uses Docling v2 high-speed export)
80
  tables = []
81
  for i, table in enumerate(doc.tables):
82
  try:
83
+ df = table.export_to_dataframe(doc=doc)
84
+ if not df.empty:
85
+ # Find page number for labeling
86
+ page_no = "?"
87
+ if table.prov and len(table.prov) > 0:
88
+ page_no = table.prov[0].page_no
89
+
90
  tables.append({
91
  "id": i + 1,
92
+ "label": f"Table {i+1} (Page {page_no})",
93
+ "df": df
94
  })
95
+ except Exception as e:
96
+ print(f"Table Extraction Error [Table {i+1}]: {e}")
97
+
98
+ # PERSIST: Save Markdown and Tables
99
+ if cache_path:
100
+ try:
101
+ cache_path.mkdir(parents=True, exist_ok=True)
102
+
103
+ # Markdown Export (Instant from existing result)
104
+ md_content = result.document.export_to_markdown()
105
+ with open(cache_path / "content.md", "w", encoding="utf-8") as f:
106
+ f.write(md_content)
107
+
108
+ # Store tables as JSON for persistence
109
+ if tables:
110
+ serialized_tables = [{
111
+ "id": t["id"],
112
+ "label": t["label"],
113
+ "data": t["df"].to_dict(orient="records")
114
+ } for t in tables]
115
+ with open(cache_path / "tables.json", "w", encoding="utf-8") as f:
116
+ json.dump(serialized_tables, f, indent=2)
117
+
118
+ except Exception as e:
119
+ print(f"Persistence Error: {e}")
120
 
121
  return {
122
  "documents": documents,