NEXAS committed on
Commit
49cf970
·
verified ·
1 Parent(s): f97ec1e

Upload 16 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set environment variables
# PYTHONUNBUFFERED: stream stdout/stderr without buffering so container logs appear live
# PYTHONDONTWRITEBYTECODE: skip .pyc generation to keep the image slim
# PATH: include the non-root user's local bin so user-level pip installs resolve
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Create a non-root user (UID 1000 is the Hugging Face Spaces convention)
RUN useradd -m -u 1000 user
WORKDIR $HOME/app

# Install system dependencies
# build-essential: compile any native wheels; libgomp1: OpenMP runtime required by faiss/fastembed
RUN apt-get update && apt-get install -y \
    build-essential \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install requirements (runs as root, into system site-packages)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code and set ownership
COPY --chown=user:user . .

# Switch to the non-root user
USER user

# Hugging Face Spaces expect port 7860
EXPOSE 7860

# Run Streamlit with the correct port and address
CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
agent/__init__.py ADDED
File without changes
agent/agent.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import hashlib
3
+ import json
4
+ import faiss
5
+ import re
6
+ import time
7
+ from typing import List, Dict, Any
8
+
9
+ from llama_index.core import (
10
+ VectorStoreIndex,
11
+ SummaryIndex,
12
+ StorageContext,
13
+ Document,
14
+ Settings,
15
+ QueryBundle,
16
+ load_index_from_storage
17
+ )
18
+ from llama_index.node_parser.docling import DoclingNodeParser
19
+ from llama_index.core.retrievers import RecursiveRetriever
20
+ from llama_index.core.query_engine import RetrieverQueryEngine
21
+ from llama_index.core.postprocessor import LLMRerank
22
+ from llama_index.llms.groq import Groq
23
+ from llama_index.embeddings.fastembed import FastEmbedEmbedding
24
+ from llama_index.vector_stores.faiss import FaissVectorStore
25
+ from llama_index.retrievers.bm25 import BM25Retriever
26
+ import shutil
27
+
28
+
29
+ # NEW: Import the refactored PDFProcessor
30
+ from processor.pdf_processor import PDFProcessor
31
+
32
class AgentRateLimitError(Exception):
    """Raised when the backing API reports a rate limit.

    Attributes:
        wait_time: Seconds the caller should wait before retrying, parsed
            from the provider's error message.
    """

    def __init__(self, wait_time: float, message: str):
        super().__init__(message)
        self.wait_time = wait_time
37
+
38
+ class LlamaPDFAgent:
39
+
40
    def __init__(self, api_key: str = None, model: str = None):
        """Configure the global LLM/embedding settings, PDF processor, and cache.

        Args:
            api_key: Groq API key; falls back to the GROQ_API_KEY env var.
            model: Groq model id; falls back to GROQ_MODEL env var or a default.
        """
        # 1. Initialize Settings with Groq and FastEmbed
        # NOTE(review): these are process-global llama_index settings — all
        # agent instances share the last-configured LLM/embedder.
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        self.model = model or os.getenv("GROQ_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct")

        Settings.llm = Groq(
            model=self.model,
            api_key=self.api_key,
            streaming=True  # Global streaming support
        )
        Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

        # 2. Use the specialized PDFProcessor
        self.pdf_processor = PDFProcessor()

        # Index/engine state; populated by ingest_pdf()
        self.vector_index = None
        self.summary_index = None
        self.recursive_query_engine = None
        self.is_loaded = False
        # Per-document persistence cache (FAISS index + docstore) lives here
        self.cache_dir = "./.llama_cache"
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.tables = []  # Store extracted DataFrames
        # Registry maps document content hash -> original filename
        self.registry_path = os.path.join(self.cache_dir, "registry.json")
        self._init_registry()
67
+
68
+
69
+
70
    def ingest_pdf(self, pdf_file):
        """
        Ingests a PDF using Persistence: Loads from disk if already indexed.

        Args:
            pdf_file: Uploaded file object (must expose .name; content is
                hashed by the PDFProcessor to key the on-disk cache).

        Returns:
            Status string saying whether the document was loaded from the
            persisted library or freshly indexed.
        """
        file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
        self.current_hash = file_hash
        doc_cache_path = os.path.join(self.cache_dir, file_hash)

        # 1. Check if already indexed (marker file written by persist() below)
        if os.path.exists(os.path.join(doc_cache_path, "default_vector_store.json")):
            storage_context = StorageContext.from_defaults(
                persist_dir=doc_cache_path,
                vector_store=FaissVectorStore.from_persist_dir(doc_cache_path)
            )
            self.vector_index = load_index_from_storage(storage_context)

            # Re-load metadata (Docling) — tables and the summary index are
            # not persisted, so the PDF is re-parsed even on a cache hit
            result = self.pdf_processor.load_docling_documents(pdf_file)
            documents = result["documents"]
            self.tables = result["tables"]
            self.summary_index = SummaryIndex.from_documents(documents)

            # Rebuild Retriever/Engine from the persisted docstore nodes
            nodes = list(self.vector_index.docstore.docs.values())
            self.bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
            vector_retriever = self.vector_index.as_retriever(similarity_top_k=5)
            self.recursive_retriever = RecursiveRetriever(
                "vector",
                retriever_dict={"vector": vector_retriever},
                node_dict={node.node_id: node for node in nodes}
            )
            self.recursive_query_engine = RetrieverQueryEngine.from_args(
                self.recursive_retriever,
                node_postprocessors=[LLMRerank(top_n=3)],
                streaming=True
            )

            self.is_loaded = True
            self._save_to_registry(file_hash, pdf_file.name)
            return f"Loaded '{pdf_file.name}' from library storage."

        # 2. Fresh Ingest (Load and parse)

        # 1. Load Documents with rich metadata via Docling JSON
        result = self.pdf_processor.load_docling_documents(pdf_file)
        documents = result["documents"]
        self.tables = result["tables"]

        # 2. Advanced Node Parsing (Captures page numbers and layout)
        node_parser = DoclingNodeParser()
        nodes = node_parser.get_nodes_from_documents(documents)

        # 3. Vector Index with FAISS
        d = 384  # BGE-small-en-v1.5 embedding dimension — must match Settings.embed_model
        faiss_index = faiss.IndexFlatL2(d)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        storage_context.docstore.add_documents(nodes)

        self.vector_index = VectorStoreIndex(
            nodes,
            storage_context=storage_context
        )

        # Persist to disk so the next upload of this file is a cache hit
        self.vector_index.storage_context.persist(persist_dir=doc_cache_path)

        # 4. BM25 Retriever for Hybrid Search
        self.bm25_retriever = BM25Retriever.from_defaults(
            nodes=nodes,
            similarity_top_k=5
        )

        # 5. Recursive Retriever for Context Depth
        vector_retriever = self.vector_index.as_retriever(similarity_top_k=5)
        self.recursive_retriever = RecursiveRetriever(
            "vector",
            retriever_dict={"vector": vector_retriever},
            node_dict={node.node_id: node for node in list(nodes)},
            verbose=True
        )

        # 6. Summary Index for global overview
        self.summary_index = SummaryIndex.from_documents(documents)

        # Setup the main recursive query engine
        self.recursive_query_engine = RetrieverQueryEngine.from_args(
            self.recursive_retriever,
            node_postprocessors=[LLMRerank(top_n=3)],
            streaming=True  # Enable at engine level
        )

        self.is_loaded = True
        self._save_to_registry(file_hash, pdf_file.name)
        return f"Successfully indexed '{pdf_file.name}' and saved to library."
166
+
167
+
168
    def answer_question(self, question: str) -> Dict[str, Any]:
        """
        Returns answer and source citations including page numbers.

        Args:
            question: Natural-language query about the loaded document.

        Returns:
            Dict with 'answer_gen' (a streaming token generator) and
            'sources' (list of {'text', 'page'} snippets) — or, when no
            document is loaded, {'answer': str, 'sources': []}.

        Raises:
            AgentRateLimitError: When the underlying API error message
                contains a parseable "Please try again in X.XXXs" delay.
        """
        if not self.is_loaded: return {"answer": "No document loaded.", "sources": []}

        try:
            response = self.recursive_query_engine.query(question)
        except Exception as e:
            # Check for RateLimit (429) message: "Please try again in X.XXXs"
            # NOTE(review): the pattern requires a fractional part — a message
            # like "try again in 5s" would not match; confirm provider format.
            error_str = str(e)
            match = re.search(r"Please try again in (\d+\.\d+)s", error_str)
            if match:
                wait_time = float(match.group(1))
                raise AgentRateLimitError(wait_time, error_str)
            raise e

        sources = []
        for node in response.source_nodes:
            # metadata contains 'doc_items' which has 'prov' with 'page_no'
            page_no = node.metadata.get("page_label") or node.metadata.get("page_no")

            # Fall back to Docling provenance metadata when no direct page key
            if not page_no and "doc_items" in node.metadata:
                try:
                    doc_items = node.metadata["doc_items"]
                    if doc_items and "prov" in doc_items[0] and doc_items[0]["prov"]:
                        page_no = doc_items[0]["prov"][0].get("page_no")
                except (KeyError, IndexError, TypeError):
                    # Best-effort: leave page_no as None when provenance is malformed
                    pass

            sources.append({
                "text": node.get_content()[:250] + "...",  # Snippet for UI
                "page": page_no
            })

        return {
            "answer_gen": response.response_gen,  # Generator for streaming
            "sources": sources
        }
208
+
209
+
210
+
211
+ def get_kpi_viz_data(self):
212
+ """
213
+ Processes existing KPI text and extracts numerical pairs for charting.
214
+ """
215
+ kpi_text = self.get_deep_insights().get("key_metrics", "")
216
+ if not kpi_text:
217
+ return None
218
+
219
+ prompt = f"""
220
+ Extract key numerical metrics from the following text for visualization.
221
+ Format as a JSON list of objects with 'label' and 'value'.
222
+ Include only numerical values. If a value is a percentage, convert 10% to 10.
223
+
224
+ Text: {kpi_text}
225
+ """
226
+
227
+ try:
228
+ response = self.llm.complete(prompt)
229
+ raw_json = str(response)
230
+ if "```json" in raw_json:
231
+ raw_json = raw_json.split("```json")[1].split("```")[0].strip()
232
+ return json.loads(raw_json)
233
+ except Exception:
234
+ return None
235
+
236
+
237
+ def summarize_document(self):
238
+ if not self.is_loaded: return "No document loaded."
239
+ query_engine = self.summary_index.as_query_engine(
240
+ response_mode="tree_summarize",
241
+ streaming=True
242
+ )
243
+ response = query_engine.query("Provide a comprehensive executive summary of this document.")
244
+ return response
245
+
246
+
247
+ def get_deep_insights(self) -> Dict[str, str]:
248
+ """
249
+ Performs a multi-stage analysis to extract strategic depth.
250
+ """
251
+ if not self.is_loaded: return {}
252
+
253
+ prompts = {
254
+ "strategic_vision": "What is the primary strategic vision or long-term objective described in this document?",
255
+ "key_metrics": "Extract the top 5 most critical numerical KPIs or financial metrics mentioned. Format as a list.",
256
+ "risks_and_challenges": "Identify the most significant risks, headwinds, or challenges mentioned for the business.",
257
+ "swot_analysis": "Based on the content, provide a concise SWOT analysis (Strengths, Weaknesses, Opportunities, Threats) in valid JSON format with keys 'S', 'W', 'O', 'T'."
258
+ }
259
+
260
+ insights = {}
261
+ for key, query in prompts.items():
262
+ result = self.answer_question(query)
263
+ insights[key] = result.get("answer_text") or result.get("answer", "")
264
+
265
+ return insights
266
+
267
+
268
+ def _init_registry(self):
269
+ if not os.path.exists(self.registry_path):
270
+ with open(self.registry_path, "w") as f:
271
+ json.dump({}, f)
272
+
273
+ def _get_registry(self) -> Dict[str, str]:
274
+ try:
275
+ with open(self.registry_path, "r") as f:
276
+ return json.load(f)
277
+ except Exception:
278
+ return {}
279
+
280
+ def _save_to_registry(self, file_hash: str, filename: str):
281
+ registry = self._get_registry()
282
+ registry[file_hash] = filename
283
+ with open(self.registry_path, "w") as f:
284
+ json.dump(registry, f)
285
+
286
+ def get_library(self) -> List[Dict[str, str]]:
287
+ registry = self._get_registry()
288
+ return [{"hash": h, "filename": f} for h, f in registry.items()]
289
+
290
+ def delete_document(self, file_hash: str):
291
+ registry = self._get_registry()
292
+ if file_hash in registry:
293
+ doc_path = os.path.join(self.cache_dir, file_hash)
294
+ if os.path.exists(doc_path):
295
+ shutil.rmtree(doc_path)
296
+ del registry[file_hash]
297
+ with open(self.registry_path, "w") as f:
298
+ json.dump(registry, f)
299
+ if self.is_loaded and getattr(self, "current_hash", None) == file_hash:
300
+ self.is_loaded = False
301
+ return True
302
+ return False
303
+
304
+
305
+
agent/llm_client.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from groq import Groq
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables
6
+ load_dotenv()
7
+
8
class GroqClient:
    """Thin wrapper around the Groq chat-completions API.

    Reads GROQ_API_KEY / GROQ_MODEL from the environment when explicit
    values are not supplied.
    """

    def __init__(self, api_key=None, model=None):
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        self.model = model or os.getenv("GROQ_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct")

        if not self.api_key:
            raise ValueError("Groq API Key not found. Please set GROQ_API_KEY in your .env file.")

        self.client = Groq(api_key=self.api_key)

    def get_completion(self, prompt: str, system_message: str = "You are a helpful AI assistant."):
        """
        Calls the Groq API to get a completion for the given prompt.

        Returns:
            The completion text, or an error string on failure (best-effort
            contract preserved from the original: never raises here).
        """
        try:
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": system_message,
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            return f"Error calling Groq API: {e}"

    def get_json_completion(self, prompt: str, system_message: str = "You are a helpful AI assistant."):
        """
        Calls the Groq API with JSON mode enabled.

        Returns:
            A JSON string — either the model output or {"error": "..."}.
        """
        try:
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": system_message,
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
                response_format={"type": "json_object"},
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            # BUG FIX: the original f-string interpolation produced invalid
            # JSON whenever the error message contained quotes or backslashes;
            # json.dumps escapes the message correctly.
            import json
            return json.dumps({"error": str(e)})

    def list_models(self):
        """
        Lists available models from Groq; empty list on failure.
        """
        try:
            models = self.client.models.list()
            return [model.id for model in models.data]
        except Exception as e:
            print(f"Error listing models: {e}")
            return []
73
+
74
if __name__ == "__main__":
    # Smoke test for the LLM client (requires a valid GROQ_API_KEY in .env)
    try:
        client = GroqClient()
        response = client.get_completion("Hello, how are you?")
        print(f"Groq Response: {response}")
    except ValueError as e:
        # Raised by GroqClient.__init__ when no API key is configured
        print(e)
app.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pandas as pd
4
+ import json
5
+ import time
6
+
7
+ from dotenv import load_dotenv
8
+ from agent.llm_client import GroqClient
9
+ from agent.agent import LlamaPDFAgent as PDFAgent, AgentRateLimitError
10
+
11
+ # Load environment variables
12
+
13
+ load_dotenv()
14
+
15
+ # Page configuration
16
+ st.set_page_config(
17
+ page_title="Naresh AI - PDF Intelligence",
18
+ page_icon="📄",
19
+ layout="wide",
20
+ )
21
+
22
+ # Custom Styling for a Premium Dark Mode (Consistent with Challenge A)
23
+ st.markdown("""
24
+ <style>
25
+ /* Main container styling - Deep Dark Gradient */
26
+ .stApp {
27
+ background: radial-gradient(circle at top left, #1e293b 0%, #0f172a 100%) !important;
28
+ color: #f1f5f9 !important;
29
+ }
30
+
31
+ /* Header and Title styling - Neon Blue */
32
+ h1 {
33
+ color: #60a5fa !important;
34
+ font-family: 'Outfit', sans-serif;
35
+ font-weight: 800 !important;
36
+ letter-spacing: -0.05rem;
37
+ text-shadow: 0 0 20px rgba(96, 165, 250, 0.3);
38
+ }
39
+
40
+ h3 {
41
+ color: #94a3b8 !important;
42
+ font-weight: 400 !important;
43
+ }
44
+
45
+ /* Input styling - Darker Glass */
46
+ .stTextInput>div>div>input {
47
+ background-color: rgba(30, 41, 59, 0.7) !important;
48
+ color: white !important;
49
+ border: 1px solid rgba(96, 165, 250, 0.5) !important;
50
+ border-radius: 12px !important;
51
+ padding: 12px 20px !important;
52
+ font-size: 1.1rem !important;
53
+ }
54
+
55
+ /* Button styling - Glowing Blue */
56
+ .stButton>button {
57
+ background: linear-gradient(90deg, #2563eb 0%, #3b82f6 100%) !important;
58
+ color: white !important;
59
+ border: none !important;
60
+ border-radius: 12px !important;
61
+ padding: 15px 30px !important;
62
+ font-weight: 700 !important;
63
+ font-size: 1.1rem !important;
64
+ transition: all 0.3s ease !important;
65
+ box-shadow: 0 0 15px rgba(37, 99, 235, 0.4) !important;
66
+ width: 100% !important;
67
+ }
68
+
69
+ .stButton>button:hover {
70
+ transform: translateY(-2px) !important;
71
+ box-shadow: 0 0 30px rgba(59, 130, 246, 0.6) !important;
72
+ }
73
+
74
+ /* Result Card styling - Dark Inset */
75
+ .answer-container {
76
+ background-color: rgba(30, 41, 59, 0.5);
77
+ padding: 30px;
78
+ border-radius: 20px;
79
+ backdrop-filter: blur(20px);
80
+ border: 1px solid rgba(255, 255, 255, 0.1);
81
+ box-shadow: inset 0 0 20px rgba(0, 0, 0, 0.2);
82
+ border-left: 8px solid #2563eb;
83
+ margin-top: 25px;
84
+ }
85
+
86
+ /* Sidebar Dark Glass */
87
+ section[data-testid="stSidebar"] {
88
+ background-color: rgba(15, 23, 42, 0.95) !important;
89
+ backdrop-filter: blur(20px) !important;
90
+ border-right: 1px solid rgba(255, 255, 255, 0.1) !important;
91
+ }
92
+
93
+ .brand-text {
94
+ font-size: 1.5rem;
95
+ font-weight: 900;
96
+ background: linear-gradient(90deg, #60a5fa, #3b82f6);
97
+ -webkit-background-clip: text;
98
+ -webkit-text-fill-color: transparent;
99
+ margin-bottom: 20px;
100
+ }
101
+
102
+ /* Standard Text Color Fixes */
103
+ .stMarkdown, p, li {
104
+ color: #cbd5e1 !important;
105
+ }
106
+
107
+ strong {
108
+ color: #f1f5f9 !important;
109
+ }
110
+ </style>
111
+ """, unsafe_allow_html=True)
112
+
113
+ # Initialize Session State
114
+ if "pdf_agent" not in st.session_state:
115
+ st.session_state.pdf_agent = None
116
+ if "messages" not in st.session_state:
117
+ st.session_state.messages = []
118
+ if "deep_insights" not in st.session_state:
119
+ st.session_state.deep_insights = {}
120
+
121
+
122
+ # Sidebar
123
+ with st.sidebar:
124
+ st.markdown('<div class="brand-text">NARESH AI</div>', unsafe_allow_html=True)
125
+ st.title("Settings")
126
+
127
+ # API Key Input
128
+ groq_api_key = st.text_input("Groq API Key", type="password", value=os.getenv("GROQ_API_KEY", ""))
129
+
130
+ # Dynamic Model Fetching
131
+ available_models = ["meta-llama/llama-4-scout-17b-16e-instruct", "llama-3.3-70b-versatile", "mixtral-8x7b-32768"]
132
+ if groq_api_key:
133
+ try:
134
+ temp_client = GroqClient(api_key=groq_api_key)
135
+ fetched_models = temp_client.list_models()
136
+ if fetched_models:
137
+ available_models = fetched_models
138
+ except Exception:
139
+ pass
140
+
141
+ model_choice = st.selectbox(
142
+ "Model Architecture",
143
+ available_models,
144
+ index=0 if "meta-llama/llama-4-scout-17b-16e-instruct" not in available_models else available_models.index("meta-llama/llama-4-scout-17b-16e-instruct")
145
+ )
146
+
147
+
148
+ st.divider()
149
+ st.markdown("### 🗂️ Document Library")
150
+
151
+ # Initialize agent if not exist (for library access)
152
+ if "pdf_agent" in st.session_state and st.session_state.pdf_agent:
153
+ if not hasattr(st.session_state.pdf_agent, "get_library"):
154
+ st.session_state.pdf_agent = None # Clear stale object
155
+
156
+ if not st.session_state.pdf_agent:
157
+ from agent.agent import LlamaPDFAgent as PDFAgent
158
+ st.session_state.pdf_agent = PDFAgent(api_key=groq_api_key or os.getenv("GROQ_API_KEY"), model=model_choice)
159
+
160
+ library = st.session_state.pdf_agent.get_library()
161
+ if not library:
162
+ st.caption("No documents in library.")
163
+ else:
164
+ for doc in library:
165
+ col1, col2 = st.columns([0.8, 0.2])
166
+ with col1:
167
+ st.markdown(f"**{doc['filename']}**")
168
+ with col2:
169
+ if st.button("🗑️", key=f"del_{doc['hash']}", help="Delete vectors"):
170
+ if st.session_state.pdf_agent.delete_document(doc['hash']):
171
+ st.session_state.pdf_agent = None # Force re-init if active one deleted
172
+ st.rerun()
173
+ st.info("To switch document, simply upload it again. It will load instantly from the library.")
174
+
175
+ st.divider()
176
+ st.markdown("### Document Controls")
177
+ if st.button("Reset Session"):
178
+ st.session_state.pdf_agent = None
179
+ st.session_state.messages = []
180
+ st.session_state.deep_insights = {}
181
+ st.rerun()
182
+
183
+
184
+
185
+ st.divider()
186
+ st.markdown("### Profile")
187
+ st.write("**Built by:** Naresh Kumar Lahajal")
188
+ st.write("**Role:** GenAI Enthusiast")
189
+ st.info("High-speed PDF intelligence powered by Groq and FastEmbed.")
190
+
191
+ # Header
192
+ st.title("Naresh AI DocuPulse")
193
+ st.subheader("Challenge B: PDF RAG & Summarization")
194
+
195
+ # File Upload
196
+ uploaded_file = st.file_uploader("Upload a PDF document", type=["pdf"])
197
+
198
+ if uploaded_file and (st.session_state.pdf_agent is None or uploaded_file.name != st.session_state.get("last_uploaded_file")):
199
+ with st.status("Ingesting document and indexing knowledge...", expanded=True) as status:
200
+ try:
201
+ agent = PDFAgent(api_key=groq_api_key, model=model_choice)
202
+ status_msg = agent.ingest_pdf(uploaded_file)
203
+ st.session_state.pdf_agent = agent
204
+ st.session_state.last_uploaded_file = uploaded_file.name
205
+ # Sync tables for explorer
206
+ st.session_state.extracted_tables = agent.tables
207
+ # Auto-Clear History on New Upload
208
+ st.session_state.messages = []
209
+ st.session_state.deep_insights = {}
210
+ status.update(label=f"✅ {status_msg}", state="complete", expanded=False)
211
+ st.toast("Intelligence Engine Initialized", icon="🧠")
212
+
213
+ except Exception as e:
214
+ st.error(f"Error processing PDF: {e}")
215
+
216
+
217
+ # Helper for Exact Backoff
218
def run_with_exact_backoff(func, *args, **kwargs):
    """
    Runs a function and catches AgentRateLimitError to perform a precise UI countdown retry.

    Args:
        func: Callable to invoke (typically a PDFAgent method).
        *args, **kwargs: Forwarded to *func*.

    Returns:
        Whatever *func* returns on success.

    Raises:
        AgentRateLimitError: Re-raised after the final failed attempt.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            return func(*args, **kwargs)
        except AgentRateLimitError as e:
            # On the final attempt, surface the failure in the UI and re-raise
            if attempt == max_attempts - 1:
                st.error(f"Failed after {max_attempts} attempts due to Persistent Rate Limits. Please wait a few minutes.")
                raise e

            # Precise wait + 1s buffer
            wait_time = int(e.wait_time) + 1
            st.toast(f"Rate Limit Hit! Waiting {wait_time}s to retry...", icon="⏳")

            # Visual Countdown
            placeholder = st.empty()
            for remaining in range(wait_time, 0, -1):
                placeholder.warning(f"⚠️ API Cooldown: Retrying in {remaining} seconds...")
                time.sleep(1)
            placeholder.empty()
    # NOTE(review): unreachable in practice — the final attempt either returns
    # or re-raises above. The garbled source makes the original indentation of
    # this `return None` ambiguous (function level vs. inside the except);
    # function level is the only reading under which retries can happen —
    # TODO confirm against the original file.
    return None
242
+
243
+ if st.session_state.pdf_agent:
244
+
245
+ # Action Tabs
246
+ tab1, tab2, tab3, tab4 = st.tabs(["💬 Ask Questions", "📝 Auto-Summary", "🧠 Deep Intelligence", "📋 Table Explorer"])
247
+
248
+
249
+ with tab1:
250
+ st.markdown("### 💬 Document Conversation")
251
+ st.caption("Ask questions about the document and maintain a conversation thread.")
252
+
253
+ # Display Chat History
254
+ for message in st.session_state.messages:
255
+ with st.chat_message(message["role"]):
256
+ st.markdown(message["content"])
257
+ if "sources" in message and message["sources"]:
258
+ with st.expander("🔗 Sources & Citations", expanded=False):
259
+ for i, src in enumerate(message["sources"]):
260
+ page_text = f"Page {src['page']}" if src['page'] else "Unknown Page"
261
+ st.markdown(f"**[{i+1}] {page_text}**")
262
+ st.caption(f"_{src['text']}_")
263
+ st.divider()
264
+
265
+ # Chat Input
266
+ if prompt := st.chat_input("What would you like to know?"):
267
+ # Add user message to history
268
+ st.session_state.messages.append({"role": "user", "content": prompt})
269
+ with st.chat_message("user"):
270
+ st.markdown(prompt)
271
+
272
+ # Generate AI response
273
+ with st.chat_message("assistant"):
274
+ with st.spinner("Analyzing document context..."):
275
+ response_data = run_with_exact_backoff(st.session_state.pdf_agent.answer_question, prompt)
276
+ if response_data:
277
+ # Use st.write_stream for typing effect
278
+ answer = st.write_stream(response_data['answer_gen'])
279
+ sources = response_data.get("sources", [])
280
+
281
+ if sources:
282
+ with st.expander("🔗 Sources & Citations", expanded=False):
283
+ for i, src in enumerate(sources):
284
+ page_text = f"Page {src['page']}" if src['page'] else "Unknown Page"
285
+ st.markdown(f"**[{i+1}] {page_text}**")
286
+ st.caption(f"_{src['text']}_")
287
+ st.divider()
288
+
289
+ # Add assistant response to history
290
+ st.session_state.messages.append({
291
+ "role": "assistant",
292
+ "content": answer,
293
+ "sources": sources
294
+ })
295
+
296
+
297
+
298
+
299
+
300
+ with tab2:
301
+ if st.button("Generate Executive Summary"):
302
+ with st.spinner("Synthesizing document overview..."):
303
+ streaming_response = run_with_exact_backoff(st.session_state.pdf_agent.summarize_document)
304
+ if streaming_response:
305
+ st.markdown('<div class="answer-container" style="border-left: 8px solid #60a5fa;">', unsafe_allow_html=True)
306
+ st.markdown("### 📝 Document Summary")
307
+ st.write_stream(streaming_response.response_gen)
308
+ st.markdown('</div>', unsafe_allow_html=True)
309
+
310
+
311
+
312
+ with tab3:
313
+ st.markdown("### 🚀 Strategic Deep Analysis")
314
+ st.info("This mode uses multi-stage recursive retrieval to extract deep strategic insights and KPIs.")
315
+
316
+ if st.button("Run Deep Intelligence Scan"):
317
+ with st.status("Analyzing document layers...", expanded=True) as status:
318
+ st.write("🔍 Extracting Strategic Vision...")
319
+ insights = run_with_exact_backoff(st.session_state.pdf_agent.get_deep_insights)
320
+ if insights:
321
+ st.session_state.deep_insights = insights
322
+
323
+ # Fetch KPI visualization data
324
+ st.write("📊 Generating Visual Analytics...")
325
+ viz_data = run_with_exact_backoff(st.session_state.pdf_agent.get_kpi_viz_data)
326
+ st.session_state.kpi_viz_data = viz_data
327
+
328
+ status.update(label="✅ Deep Analysis Complete", state="complete", expanded=False)
329
+ else:
330
+ status.update(label="❌ Failed after retries", state="error", expanded=False)
331
+
332
+
333
+
334
+ if st.session_state.deep_insights:
335
+ insights = st.session_state.deep_insights
336
+
337
+ # 1. Strategic Vision
338
+ st.markdown('<div class="answer-container" style="border-left: 8px solid #8b5cf6;">', unsafe_allow_html=True)
339
+ st.markdown("#### 🎯 Strategic Vision")
340
+ st.write(insights.get("strategic_vision", "N/A"))
341
+ st.markdown('</div>', unsafe_allow_html=True)
342
+
343
+ col1, col2 = st.columns(2)
344
+
345
+ with col1:
346
+ # 2. Key Metrics
347
+ st.markdown("#### 📊 Key Performance Indicators")
348
+ metrics_text = insights.get("key_metrics", "")
349
+ st.markdown(metrics_text if metrics_text else "No metrics extracted.")
350
+
351
+ with col2:
352
+ # 3. Risks
353
+ st.markdown("#### ⚠️ Risks & Challenges")
354
+ risks_text = insights.get("risks_and_challenges", "")
355
+ st.markdown(risks_text if risks_text else "No risks identified.")
356
+
357
+ # Visual Dashboard Section
358
+ if st.session_state.get("kpi_viz_data"):
359
+ st.divider()
360
+ st.markdown("#### 📈 Key Trends & Metrics")
361
+ viz_df = pd.DataFrame(st.session_state.kpi_viz_data)
362
+
363
+ # Heuristic for chart type
364
+ if any("year" in str(l).lower() or "q1" in str(l).lower() or "q2" in str(l).lower() or "q3" in str(l).lower() or "q4" in str(l).lower() for l in viz_df['label']):
365
+ st.line_chart(viz_df.set_index('label'), color="#3b82f6")
366
+ st.caption("Auto-detected Time Series data.")
367
+ else:
368
+ st.bar_chart(viz_df.set_index('label'), color="#60a5fa")
369
+ st.caption("Bar chart representation of extracted KPIs.")
370
+
371
+ # 4. SWOT Analysis
372
+
373
+ st.divider()
374
+ st.markdown("#### 🛠️ Automated SWOT Analysis")
375
+ swot_raw = insights.get("swot_analysis", "{}")
376
+ try:
377
+ # Attempt to clean potential markdown artifacts around JSON
378
+ if "```json" in swot_raw:
379
+ swot_raw = swot_raw.split("```json")[1].split("```")[0].strip()
380
+ elif "{" in swot_raw:
381
+ swot_raw = "{" + swot_raw.split("{", 1)[1].rsplit("}", 1)[0] + "}"
382
+
383
+ swot_data = json.loads(swot_raw)
384
+
385
+ # Display SWOT in a grid
386
+ s_col1, s_col2 = st.columns(2)
387
+ with s_col1:
388
+ st.success(f"**Strengths**\n\n{swot_data.get('S', 'N/A')}")
389
+ st.info(f"**Opportunities**\n\n{swot_data.get('O', 'N/A')}")
390
+ with s_col2:
391
+ st.warning(f"**Weaknesses**\n\n{swot_data.get('W', 'N/A')}")
392
+ st.error(f"**Threats**\n\n{swot_data.get('T', 'N/A')}")
393
+ except Exception as e:
394
+ st.write("Raw SWOT Insight:")
395
+ st.write(swot_raw)
396
+
397
+ # Report Export
398
+ st.divider()
399
+ report_md = f"""# Executive Intelligence Report: {st.session_state.last_uploaded_file}
400
+
401
+ ## 🎯 Strategic Vision
402
+ {insights.get('strategic_vision', 'N/A')}
403
+
404
+ ## 📊 Key Performance Indicators
405
+ {insights.get('key_metrics', 'N/A')}
406
+
407
+ ## ⚠️ Risks & Challenges
408
+ {insights.get('risks_and_challenges', 'N/A')}
409
+
410
+ ## 🛠️ SWOT Analysis
411
+ ### Strengths
412
+ {swot_data.get('S', 'N/A') if 'swot_data' in locals() else 'N/A'}
413
+
414
+ ### Weaknesses
415
+ {swot_data.get('W', 'N/A') if 'swot_data' in locals() else 'N/A'}
416
+
417
+ ### Opportunities
418
+ {swot_data.get('O', 'N/A') if 'swot_data' in locals() else 'N/A'}
419
+
420
+ ### Threats
421
+ {swot_data.get('T', 'N/A') if 'swot_data' in locals() else 'N/A'}
422
+
423
+ ---
424
+ *Report generated by Naresh AI DocuPulse*
425
+ """
426
+ st.download_button(
427
+ label="📥 Download Executive Intelligence Report",
428
+ data=report_md,
429
+ file_name=f"Intelligence_Report_{st.session_state.last_uploaded_file.replace('.pdf', '')}.md",
430
+ mime="text/markdown"
431
+ )
432
+
433
+ with tab4:
434
+ st.markdown("### 📋 PDF Table Explorer")
435
+ st.info("Direct extraction of tabular data from the document. Select a table to explore.")
436
+
437
+ tables = st.session_state.pdf_agent.tables
438
+ if not tables:
439
+ st.warning("No structured tables were detected in the document.")
440
+ else:
441
+ table_labels = [f"{t['label']} (Page Grounded)" for t in tables]
442
+ selected_label = st.selectbox("Select Table", table_labels)
443
+
444
+ # Find the selected table
445
+ selected_idx = table_labels.index(selected_label)
446
+ selected_table = tables[selected_idx]
447
+
448
+ st.markdown(f"#### {selected_table['label']}")
449
+ st.dataframe(selected_table['df'], use_container_width=True)
450
+
451
+ # Download as CSV
452
+ csv = selected_table['df'].to_csv(index=False).encode('utf-8')
453
+ st.download_button(
454
+ label=f"📥 Download {selected_table['label']} as CSV",
455
+ data=csv,
456
+ file_name=f"{selected_table['label'].replace(' ', '_')}.csv",
457
+ mime="text/csv"
458
+ )
459
+
460
+
461
+
462
+
463
+ else:
464
+ st.info("Please upload a PDF document to begin analysis.")
465
+
466
+
467
+ # Footer
468
+ st.divider()
469
+ st.markdown(
470
+ """
471
+ <div style="text-align: center; color: #64748b; padding: 20px;">
472
+ © 2026 <b>Naresh Kumar Lahajal</b>. All Rights Reserved.<br>
473
+ <small>Powered by Groq and Retrieval-Augmented Generation</small>
474
+ </div>
475
+ """,
476
+ unsafe_allow_html=True
477
+ )
ingestion/__init__.py ADDED
File without changes
ingestion/vector_store.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import os
4
+ from fastembed import TextEmbedding
5
+ from typing import List, Tuple
6
+
7
class VectorStore:
    """FAISS-backed vector store with fastembed embeddings and on-disk caching."""

    def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5", cache_dir: str = ".cache"):
        """
        Args:
            model_name: fastembed model used to embed chunks and queries.
            cache_dir: Directory where indexes and chunk arrays are persisted.
        """
        self.encoder = TextEmbedding(model_name=model_name)
        self.index = None   # faiss.IndexFlatL2, built lazily by build_index/load_index
        self.chunks = []    # chunk texts aligned with index row ids
        self.cache_dir = cache_dir
        # exist_ok avoids the check-then-create race of the exists()/makedirs() pair
        os.makedirs(self.cache_dir, exist_ok=True)

    def build_index(self, chunks: List[str]):
        """
        Embeds chunks and builds an L2 FAISS index over them.

        Raises:
            ValueError: If ``chunks`` is empty (an empty embedding matrix has
                no dimension, which would otherwise surface as an IndexError).
        """
        if not chunks:
            raise ValueError("Cannot build an index from an empty chunk list.")
        self.chunks = chunks
        embeddings = list(self.encoder.embed(chunks))
        embeddings_np = np.array(embeddings).astype('float32')

        dimension = embeddings_np.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings_np)

    def save_index(self, key: str):
        """Persists the FAISS index and its chunks under the given cache key."""
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(self.cache_dir, f"{key}.index"))
            # Stored as an object array; load_index must pass allow_pickle=True.
            np.save(os.path.join(self.cache_dir, f"{key}_chunks.npy"), np.array(self.chunks))

    def load_index(self, key: str) -> bool:
        """
        Loads a cached index and its chunks if both files exist.

        Returns:
            True on a cache hit (index and chunks loaded), False otherwise.
        """
        index_path = os.path.join(self.cache_dir, f"{key}.index")
        chunks_path = os.path.join(self.cache_dir, f"{key}_chunks.npy")
        if os.path.exists(index_path) and os.path.exists(chunks_path):
            self.index = faiss.read_index(index_path)
            self.chunks = np.load(chunks_path, allow_pickle=True).tolist()
            return True
        return False

    def search(self, query: str, top_k: int = 4) -> List[Tuple[str, float]]:
        """
        Returns up to ``top_k`` (chunk, L2 distance) pairs most similar to ``query``.

        Returns an empty list when no index has been built or loaded yet.
        """
        if self.index is None:
            return []

        query_embedding = list(self.encoder.embed([query]))[0]
        query_embedding_np = np.array([query_embedding]).astype('float32')

        distances, indices = self.index.search(query_embedding_np, top_k)

        # FAISS pads with -1 when the index holds fewer than top_k vectors.
        results = []
        for rank, idx in enumerate(indices[0]):
            if idx != -1:
                results.append((self.chunks[idx], float(distances[0][rank])))
        return results
65
+
66
if __name__ == "__main__":
    # Quick smoke test: index two sentences and run a single query.
    store = VectorStore()
    sample_chunks = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
    ]
    store.build_index(sample_chunks)
    for chunk, distance in store.search("What animal jumps?"):
        print(f"Result: {chunk} (Distance: {distance})")
processor/__init__.py ADDED
File without changes
processor/pdf_processor.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import tempfile
3
+ import os
4
+ import io
5
+ from pathlib import Path
6
+ from typing import List, Dict
7
+ import pandas as pd
8
+ from llama_index.readers.docling import DoclingReader
9
+ from docling.document_converter import DocumentConverter
10
+
11
class PDFProcessor:
    """Converts uploaded PDF files into Docling documents and pandas tables."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Chunking parameters are kept for downstream node parsers.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.doc_converter = DocumentConverter()

    def get_pdf_hash(self, pdf_file) -> str:
        """
        Generates an MD5 hash of the full file contents to use as a cache key.

        The file's read position is restored before returning.
        """
        original_pos = pdf_file.tell()
        pdf_file.seek(0)
        digest = hashlib.md5(pdf_file.read()).hexdigest()
        pdf_file.seek(original_pos)
        return digest

    def load_docling_documents(self, pdf_file) -> Dict:
        """
        Runs DoclingReader (for RAG) and DocumentConverter (for table extraction).

        Returns:
            Dict with 'documents' (LlamaIndex documents) and 'tables'
            (list of {'id', 'label', 'df'} entries, one per extracted table).
        """
        # Docling works on filesystem paths, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            pdf_file.seek(0)
            tmp.write(pdf_file.read())
            tmp_path = Path(tmp.name)

        try:
            # 1. Ingest for LlamaIndex RAG
            reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
            documents = reader.load_data(file_path=tmp_path)

            # 2. Extract structured tables for the DataFrame explorer
            doc = self.doc_converter.convert(tmp_path).document

            tables = []
            for table_idx, table in enumerate(doc.tables):
                try:
                    # Round-trip through HTML so pandas handles cell parsing.
                    frames = pd.read_html(io.StringIO(table.export_to_html()))
                    if frames:
                        tables.append({
                            "id": table_idx + 1,
                            "label": f"Table {table_idx + 1}",
                            "df": frames[0],
                        })
                except Exception:
                    # Best effort: one malformed table must not abort ingestion.
                    pass

            return {
                "documents": documents,
                "tables": tables
            }
        finally:
            # Always remove the temp file, even when conversion fails.
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except Exception:
                pass
72
+
73
if __name__ == "__main__":
    # Placeholder entry point; processing is exercised via the scripts/ tests.
    pass
76
+
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groq
2
+ pypdf
3
+ python-dotenv
4
+ streamlit
5
+ fastembed
6
+ numpy
7
+ faiss-cpu
8
+ llama-index-core
9
+ llama-index-llms-groq
10
+ llama-index-embeddings-fastembed
11
+ llama-index-readers-file
12
+ llama-index-vector-stores-faiss
13
+ docling
14
+ llama-index-readers-docling
15
+ rank-bm25
16
+ llama-index-retrievers-bm25
17
+ llama-index-node-parser-docling
18
+ pandas
19
+ pydantic
20
+ lxml
21
+ html5lib
scripts/__init__.py ADDED
File without changes
scripts/check_meta.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ import os
3
+ from pathlib import Path
4
+
5
def check_metadata():
    """Print the metadata Docling attaches to the first documents of the sample PDF."""
    pdf_path = "nvidia_q4_fy24.pdf"
    if not os.path.exists(pdf_path):
        print("PDF not found.")
        return

    documents = DoclingReader().load_data(file_path=Path(pdf_path))

    print(f"Loaded {len(documents)} documents.")
    # The first two documents are enough to see the metadata schema.
    for idx, document in enumerate(documents[:2]):
        print(f"Doc {idx} Metadata: {document.metadata}")


if __name__ == "__main__":
    check_metadata()
scripts/inspect_nodes.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ from llama_index.node_parser.docling import DoclingNodeParser
3
+ import os
4
+ from pathlib import Path
5
+
6
def inspect_nodes():
    """Show which metadata keys DoclingNodeParser attaches to parsed nodes."""
    pdf_path = "nvidia_q4_fy24.pdf"
    reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
    documents = reader.load_data(file_path=Path(pdf_path))

    nodes = DoclingNodeParser().get_nodes_from_documents(documents)

    if not nodes:
        print("No nodes created.")
        return
    first = nodes[0]
    print(f"Node 0 Metadata: {first.metadata.keys()}")
    print(f"Node 0 Metadata Content: {first.metadata}")


if __name__ == "__main__":
    inspect_nodes()
scripts/inspect_nodes_clean.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ from llama_index.node_parser.docling import DoclingNodeParser
3
+ import os
4
+ from pathlib import Path
5
+ import json
6
+
7
def inspect_nodes():
    """Dump full metadata for a mid-document slice of Docling-parsed nodes."""
    pdf_path = "nvidia_q4_fy24.pdf"
    reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
    documents = reader.load_data(file_path=Path(pdf_path))

    nodes = DoclingNodeParser().get_nodes_from_documents(documents)

    if not nodes:
        print("No nodes created.")
        return
    # Skip the leading title/heading nodes; nodes 5-14 are more likely to
    # carry page-grounding metadata.
    for node in nodes[5:15]:
        print("--- METADATA START ---")
        print(json.dumps(node.metadata, indent=2))
        print("--- METADATA END ---")


if __name__ == "__main__":
    inspect_nodes()
scripts/test_agent.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add project root to sys.path
6
+ root_dir = Path(__file__).parent.parent
7
+ sys.path.append(str(root_dir))
8
+
9
+ from dotenv import load_dotenv
10
+ from agent.agent import LlamaPDFAgent
11
+ import io
12
+
13
+ load_dotenv()
14
+
15
def test_agent():
    """End-to-end smoke test: ingest the NVIDIA PDF, then run Q&A and insights."""
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        print("GROQ_API_KEY not found in environment.")
        return

    agent = LlamaPDFAgent(api_key=api_key)

    # Use the downloaded NVIDIA PDF - updated path
    pdf_path = os.path.join(root_dir, "nvidia_q4_fy24.pdf")
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    with open(pdf_path, "rb") as f:
        # Mocking a streamlit-like upload object
        class MockFile:
            def __init__(self, file, name):
                self.file = file
                self.name = name

            def read(self):
                return self.file.read()

            def seek(self, pos):
                self.file.seek(pos)

            def tell(self):
                return self.file.tell()

        print("Ingesting PDF...")
        print(agent.ingest_pdf(MockFile(f, pdf_path)))

        print("\n--- Testing Q&A ---")
        q = "What was the total revenue for FY24?"
        result = agent.answer_question(q)
        print(f"Q: {q}")
        print(f"A: {result['answer']}")
        print("\nSources:")
        for src in result['sources']:
            print(f"- [Page {src['page']}] {src['text'][:100]}...")

        print("\n--- Testing Deep Insights ---")
        insights = agent.get_deep_insights()

        for key, value in insights.items():
            print(f"\n[{key.upper()}]")
            print(value)


if __name__ == "__main__":
    test_agent()
scripts/verify_cite.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add project root to sys.path
6
+ root_dir = Path(__file__).parent.parent
7
+ sys.path.append(str(root_dir))
8
+
9
+ from dotenv import load_dotenv
10
+ from agent.agent import LlamaPDFAgent
11
+
12
+ load_dotenv()
13
+
14
def verify_citations():
    """Ingest the sample PDF and print the answer plus page-grounded sources."""
    agent = LlamaPDFAgent()
    # Updated path to root
    pdf_path = os.path.join(root_dir, "nvidia_q4_fy24.pdf")
    # Fail fast with a clear message instead of an unhandled FileNotFoundError,
    # matching the guard used by the sibling scripts (check_meta, test_agent).
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    with open(pdf_path, "rb") as f:
        class MockFile:
            """Minimal stand-in for a Streamlit UploadedFile."""
            def __init__(self, file, name):
                self.file = file
                self.name = name
            def read(self): return self.file.read()
            def seek(self, pos): self.file.seek(pos)
            def tell(self): return self.file.tell()

        mock_file = MockFile(f, pdf_path)
        agent.ingest_pdf(mock_file)

        q = "What was the revenue for Data Center in Q4?"
        result = agent.answer_question(q)
        print(f"\nQ: {q}")
        print(f"A: {result['answer']}")
        print("\nSOURCES:")
        for s in result['sources']:
            print(f"- Page {s['page']}: {s['text'][:50]}...")


if __name__ == "__main__":
    verify_citations()