umanggupta commited on
Commit
80ced10
·
1 Parent(s): 4c686cd

Initial deployment of Financial Chatbot

Browse files
Files changed (5) hide show
  1. app.py +7 -0
  2. main.py +1452 -0
  3. packages.txt +2 -0
  4. requirements.txt +15 -0
  5. test_figure_extraction.py +287 -0
app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Entry point for the Hugging Face Spaces deployment.

Imports the Gradio app defined in main.py and launches it. `import os`
was removed: it was never used in this file.
"""
from main import demo

# Launch without share link on Spaces
if __name__ == "__main__":
    demo.launch()
7
+
main.py ADDED
@@ -0,0 +1,1452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import openai
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client.models import PointStruct, VectorParams, Distance
6
+ from pypdf import PdfReader
7
+ import uuid
8
+ from pathlib import Path
9
+ from dotenv import load_dotenv
10
+ import json
11
+ import langextract as lx
12
+ import cv2
13
+ import numpy as np
14
+ from PIL import Image
15
+ from pdf2image import convert_from_path
16
+ from doclayout_yolo import YOLOv10
17
+ import torch
18
+ import requests
19
+ import base64
20
+ import urllib3
21
+ import re
22
+
23
# Load environment variables (OPENAI_API_KEY, QDRANT_URL, QDRANT_API_KEY) from .env.
load_dotenv()

# Base directory for resolving relative paths
BASE_DIR = Path(__file__).parent.resolve()

# Create uploads directory (use absolute path)
UPLOAD_DIR = BASE_DIR / "uploaded_pdfs"
UPLOAD_DIR.mkdir(exist_ok=True)

# Initialize OpenAI client
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Initialize Qdrant client with SSL verification disabled
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
    https=True,
    # NOTE(review): verify=False disables TLS certificate verification;
    # confirm this is acceptable for the target Qdrant deployment.
    verify=False,
    grpc_port=None,
    prefer_grpc=False
)

# Collection name
COLLECTION_NAME = "pdf_documents"

# Create collection if it doesn't exist
try:
    qdrant.get_collection(COLLECTION_NAME)
except Exception:
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        # 1536 dims matches text-embedding-3-small (see get_embedding).
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )

# Figure extraction configuration (use absolute path)
EXTRACTED_FIGURES_DIR = BASE_DIR / "extracted_figures"
EXTRACTED_FIGURES_DIR.mkdir(exist_ok=True)
MODEL_NAME = "doclayout_yolo_docstructbench_imgsz1024.pt"
CONFIDENCE_THRESHOLD = 0.25
IMAGE_SIZE = 1024
# Detection class names treated as "figures" (compared lowercase in detect_figures).
FIGURE_CLASSES = ['figure', 'picture', 'chart', 'diagram', 'graph', 'plot']

# Lazy load YOLO model (populated on first call to get_yolo_model)
_yolo_model = None
69
+
70
def extract_text_by_page(pdf_path):
    """Extract text from a PDF file page by page.

    Args:
        pdf_path: Path to a PDF file readable by pypdf.

    Returns:
        list[dict]: One entry per non-empty page with keys
        'page_number' (1-based), 'text' (stripped), 'text_length'.
    """
    reader = PdfReader(pdf_path)
    pages_data = []

    for page_num, page in enumerate(reader.pages, 1):
        # pypdf's extract_text() may return None for pages with no
        # extractable text; guard before .strip() to avoid AttributeError.
        page_text = (page.extract_text() or "").strip()
        if page_text:  # Only include pages with content
            pages_data.append({
                'page_number': page_num,
                'text': page_text,
                'text_length': len(page_text)
            })

    print(f"🔍 DEBUG: PDF extraction - {len(pages_data)} pages extracted")
    for page_data in pages_data[:3]:  # Show first 3 pages
        print(f"🔍 DEBUG: Page {page_data['page_number']}: {page_data['text_length']} chars")

    return pages_data
89
+
90
def download_model():
    """Download the DocLayout-YOLO weights if not already present.

    Returns:
        str | None: Path to the model file, or None when download fails.
    """
    model_path = Path(MODEL_NAME)
    if model_path.exists():
        print(f"Model already exists: {MODEL_NAME}")
        return str(model_path)

    print("Downloading DocLayout-YOLO model...")
    model_url = "https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt"

    try:
        # requests has no default timeout; without one a dead connection
        # hangs forever. (connect, read) timeouts in seconds.
        response = requests.get(model_url, stream=True, timeout=(10, 300))
        response.raise_for_status()

        with open(model_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print(f"Model downloaded successfully: {MODEL_NAME}")
        return str(model_path)
    except Exception as e:
        print(f"Error downloading model: {e}")
        print("Make sure you have internet connection for model download")
        # Remove any partially-written file so the exists() fast-path
        # above does not treat a truncated download as a valid model.
        model_path.unlink(missing_ok=True)
        return None
114
+
115
def get_yolo_model():
    """Get YOLO model (lazy loading).

    Loads the DocLayout-YOLO model into the module-level cache on first
    use (downloading the weights if needed). Returns the cached model,
    or None when download/loading fails.
    """
    global _yolo_model
    if _yolo_model is None:
        print("🖼️ DEBUG: Loading DocLayout-YOLO model...")
        try:
            model_path = download_model()
            if model_path:
                print(f"🖼️ DEBUG: Model path found: {model_path}")
                _yolo_model = YOLOv10(model_path)
                print(f"🖼️ DEBUG: Model loaded successfully: {MODEL_NAME}")
                print(f"🖼️ DEBUG: Model device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
            else:
                print("🖼️ DEBUG: Failed to download model")
                return None
        except Exception as e:
            print(f"🖼️ DEBUG: Error loading model: {e}")
            return None
    else:
        print("🖼️ DEBUG: Using cached YOLO model")
    return _yolo_model
136
+
137
def convert_pdf_to_images(pdf_path: Path) -> list:
    """Convert PDF pages to PIL Images.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        list: One PIL Image per page (rendered at 200 DPI), or an empty
        list when conversion fails.
    """
    print(f"🖼️ DEBUG: Converting PDF to images: {pdf_path.name}")
    print(f"🖼️ DEBUG: PDF path exists: {pdf_path.exists()}")
    print(f"🖼️ DEBUG: PDF size: {pdf_path.stat().st_size / 1024 / 1024:.2f} MB")
    try:
        # 200 DPI: enough resolution for figure detection without huge images.
        images = convert_from_path(pdf_path, dpi=200)
        print(f"🖼️ DEBUG: Converted {len(images)} pages to images")
        for i, img in enumerate(images[:3]):  # Show first 3 images info
            print(f"🖼️ DEBUG: Page {i+1} image size: {img.size}")
        return images
    except Exception as e:
        print(f"🖼️ DEBUG: Error converting PDF: {e}")
        return []
151
+
152
def detect_figures(model, image: Image.Image) -> list:
    """Detect figures in a single page image.

    Runs DocLayout-YOLO on the page and keeps only detections whose
    (lowercased) class name appears in FIGURE_CLASSES.

    Args:
        model: Loaded YOLOv10 model (see get_yolo_model).
        image: Page rendered as a PIL Image.

    Returns:
        list[dict]: One dict per kept detection with 'class_name',
        'confidence', 'bbox' ([x1, y1, x2, y2] in image pixels) and
        'detection_id' (index within this page's raw detections).
    """
    print(f"🖼️ DEBUG: Detecting figures in image size: {image.size}")
    # Convert PIL to numpy array for YOLO
    image_np = np.array(image)
    print(f"🖼️ DEBUG: Image array shape: {image_np.shape}")

    # Run detection
    print(f"🖼️ DEBUG: Running YOLO prediction with imgsz={IMAGE_SIZE}, conf={CONFIDENCE_THRESHOLD}")
    results = model.predict(
        image_np,
        imgsz=IMAGE_SIZE,
        conf=CONFIDENCE_THRESHOLD,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        verbose=False
    )

    print(f"🖼️ DEBUG: YOLO prediction completed, results: {len(results) if results else 0}")

    detections = []
    # Only the first result is inspected (single image was passed in).
    if results and len(results) > 0 and results[0].boxes is not None:
        boxes = results[0].boxes
        print(f"🖼️ DEBUG: Found {len(boxes)} total detections")

        for i, box in enumerate(boxes):
            # Get class name
            class_id = int(box.cls[0])
            class_name = model.names[class_id]
            confidence = float(box.conf[0])

            print(f"🖼️ DEBUG: Detection {i+1}: {class_name} (confidence: {confidence:.3f})")

            # Check if it's a figure-related class
            if class_name.lower() in FIGURE_CLASSES:
                # Get bounding box coordinates (move tensor to CPU first)
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()

                detections.append({
                    'class_name': class_name,
                    'confidence': confidence,
                    'bbox': [float(x1), float(y1), float(x2), float(y2)],
                    'detection_id': i
                })
                print(f"🖼️ DEBUG: ✓ Added figure detection: {class_name} at [{x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f}]")
            else:
                print(f"🖼️ DEBUG: ✗ Skipped non-figure detection: {class_name}")
    else:
        print("🖼️ DEBUG: No detections found")

    print(f"🖼️ DEBUG: Returning {len(detections)} figure detections")
    return detections
203
+
204
def extract_and_save_figures(image: Image.Image, detections: list,
                             page_num: int, pdf_name: str) -> list:
    """Extract and save detected figures.

    Crops each detection's bounding box out of the page image and writes
    it as a PNG into EXTRACTED_FIGURES_DIR.

    Args:
        image: Full page as a PIL Image.
        detections: Output of detect_figures for this page.
        page_num: Zero-based page index (filenames and metadata use
            page_num + 1 as the human-readable page number).
        pdf_name: Base name of the source PDF, used in filenames.

    Returns:
        list[dict]: Metadata for each saved figure: filename, page_number,
        class_name, confidence, bbox, image_size, pdf_name, figure_path.
    """
    print(f"🖼️ DEBUG: Extracting {len(detections)} figures from page {page_num + 1}")
    saved_figures = []

    for idx, detection in enumerate(detections):
        x1, y1, x2, y2 = detection['bbox']
        print(f"🖼️ DEBUG: Processing figure {idx + 1}: {detection['class_name']} at [{x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f}]")

        # Crop the figure from the image
        figure_crop = image.crop((x1, y1, x2, y2))
        print(f"🖼️ DEBUG: Cropped figure size: {figure_crop.size}")

        # Generate filename
        figure_filename = f"{pdf_name}_page{page_num+1}_figure{idx+1}_{detection['class_name']}.png"
        figure_path = EXTRACTED_FIGURES_DIR / figure_filename
        print(f"🖼️ DEBUG: Saving figure to: {figure_path}")

        # Save the figure
        figure_crop.save(figure_path, "PNG")
        print(f"🖼️ DEBUG: ✓ Figure saved successfully")

        # Store metadata
        figure_metadata = {
            'filename': figure_filename,
            'page_number': page_num + 1,
            'class_name': detection['class_name'],
            'confidence': detection['confidence'],
            'bbox': detection['bbox'],
            'image_size': figure_crop.size,
            'pdf_name': pdf_name,
            'figure_path': str(figure_path)
        }

        saved_figures.append(figure_metadata)
        print(f"🖼️ DEBUG: ✓ Saved figure: {figure_filename} (confidence: {detection['confidence']:.3f})")

    print(f"🖼️ DEBUG: Extracted {len(saved_figures)} figures from page {page_num + 1}")
    return saved_figures
244
+
245
def analyze_figure_with_vision_api(image_path):
    """Analyze figure using OpenAI Vision API.

    Sends the image (as a base64 data URL) to gpt-4o and returns the
    model's textual description; returns a fixed fallback string on any
    error.
    """
    print(f"🖼️ DEBUG: Analyzing figure with Vision API: {image_path}")
    print(f"🖼️ DEBUG: Image file exists: {Path(image_path).exists()}")
    print(f"🖼️ DEBUG: Image file size: {Path(image_path).stat().st_size / 1024:.2f} KB")

    try:
        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')

        print(f"🖼️ DEBUG: Base64 encoded image length: {len(base64_image)} chars")
        print(f"🖼️ DEBUG: Sending request to OpenAI Vision API...")

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Analyze this figure from a financial document. Describe what it shows, key data points, trends, and insights. Be specific about numbers, labels, and visual elements."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                ]
            }],
            max_tokens=500
        )

        description = response.choices[0].message.content
        print(f"🖼️ DEBUG: ✓ Vision API response received ({len(description)} chars)")
        print(f"🖼️ DEBUG: Description preview: {description[:100]}...")
        return description
    except Exception as e:
        print(f"🖼️ DEBUG: ✗ Error analyzing figure with Vision API: {e}")
        return "Unable to analyze this figure."
277
+
278
def is_figure_query(query):
    """Return True when the query appears to ask for a figure/chart/diagram."""
    visual_terms = ('figure', 'chart', 'diagram', 'graph', 'image',
                    'picture', 'visualization', 'plot', 'show me')
    lowered = query.lower()
    matched = False
    for term in visual_terms:
        if term in lowered:
            matched = True
            break
    print(f"🖼️ DEBUG: Query '{query}' -> is_figure_query: {matched}")
    return matched
285
+
286
def rank_figures_by_relevance(figures_found, query):
    """Rank figures by semantic similarity to the user's query using embeddings.

    Embeds the query and each figure's 'description', scores pairs with
    cosine similarity and returns the top 3 figures. Falls back to
    rank_figures_by_keywords when the query embedding fails or an
    unexpected error occurs; figures without a description (or whose
    embedding fails) are skipped.
    """
    print(f"🔍 DEBUG: Ranking {len(figures_found)} figures by semantic similarity to query: '{query}'")

    if not figures_found:
        return []

    try:
        # Get embedding for the user query
        print(f"🔍 DEBUG: Creating embedding for query: '{query}'")
        query_embedding = get_embedding(query)

        # get_embedding returns an all-zero 1536-dim vector as its failure sentinel.
        if query_embedding == [0.0] * 1536:
            print("🔍 DEBUG: ✗ Failed to create query embedding, falling back to keyword matching")
            return rank_figures_by_keywords(figures_found, query)

        print(f"🔍 DEBUG: ✓ Query embedding created successfully")

        # Calculate similarity scores for each figure
        scored_figures = []

        for fig in figures_found:
            description = fig.get('description', '')
            if not description:
                print(f"🔍 DEBUG: Figure '{fig['filename']}' has no description, skipping")
                continue

            # Get embedding for figure description
            print(f"🔍 DEBUG: Creating embedding for figure description: '{description[:100]}...'")
            fig_embedding = get_embedding(description)

            if fig_embedding == [0.0] * 1536:
                print(f"🔍 DEBUG: ✗ Failed to create embedding for figure '{fig['filename']}', skipping")
                continue

            # Calculate cosine similarity
            similarity_score = calculate_cosine_similarity(query_embedding, fig_embedding)
            print(f"🔍 DEBUG: Figure '{fig['filename']}' similarity score: {similarity_score:.4f}")

            scored_figures.append((similarity_score, fig))

        # Sort by similarity score (highest first) and take top 3
        scored_figures.sort(key=lambda x: x[0], reverse=True)
        top_figures = [fig for score, fig in scored_figures[:3]]

        print(f"🔍 DEBUG: Selected top {len(top_figures)} figures by semantic similarity:")
        for i, fig in enumerate(top_figures):
            print(f"🔍 DEBUG: {i+1}. {fig['filename']} (similarity: {scored_figures[i][0]:.4f})")

        return top_figures

    except Exception as e:
        print(f"🔍 DEBUG: ✗ Error in semantic ranking: {e}")
        print(f"🔍 DEBUG: Falling back to keyword-based ranking")
        return rank_figures_by_keywords(figures_found, query)
341
+
342
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1, vec2: Sequences of numbers (e.g. embedding vectors).

    Returns:
        float: Cosine similarity in [-1, 1]; 0.0 when either vector has
        zero magnitude (similarity is undefined in that case).
    """
    # numpy is already imported as np at module level; the previous
    # redundant function-local import was removed.
    a = np.array(vec1)
    b = np.array(vec2)

    # Calculate cosine similarity
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # Guard against division by zero for degenerate (all-zero) vectors.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    similarity = dot_product / (norm_a * norm_b)
    return float(similarity)
360
+
361
def rank_figures_by_keywords(figures_found, query):
    """Fallback ranking: score figures by word overlap with the query.

    Description-word overlap counts double, filename-segment overlap
    counts 1.5x, and an existing file path adds a small bonus. Returns
    the top 3 figures by score.
    """
    print(f"🔍 DEBUG: Using keyword-based ranking as fallback")

    words_in_query = set(query.lower().split())

    ranked = []
    for candidate in figures_found:
        # Overlap with the figure's description (weight 2 per word).
        desc_words = set(candidate.get('description', '').lower().split())
        points = len(words_in_query & desc_words) * 2

        # Overlap with underscore-separated filename segments (weight 1.5).
        name_segments = set(candidate.get('filename', '').lower().split('_'))
        points += len(words_in_query & name_segments) * 1.5

        # Bonus point when the figure file actually exists on disk.
        stored_path = candidate.get('path')
        if stored_path and Path(stored_path).exists():
            points += 1

        ranked.append((points, candidate))
        print(f"🔍 DEBUG: Figure '{candidate['filename']}' keyword score: {points}")

    # Highest score first; stable sort keeps input order for ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    best = [entry for _, entry in ranked[:3]]

    print(f"🔍 DEBUG: Selected top {len(best)} figures by keywords:")
    for position, entry in enumerate(best):
        print(f"🔍 DEBUG: {position+1}. {entry['filename']} (score: {ranked[position][0]})")

    return best
400
+
401
def smart_chunk_with_langextract_page_by_page(pages_data, pdf_name):
    """Use LangExtract to create intelligent chunks, processing each page individually.

    Args:
        pages_data: Output of extract_text_by_page — dicts with
            'page_number' and 'text'.
        pdf_name: Base name used to build chunk ids.

    Returns:
        list[dict]: Chunk dicts with 'text', 'topic', 'importance',
        'type', 'chunk_id', 'source_location', 'page_number' and
        'attributes'. A page whose extraction fails is skipped rather
        than aborting the whole run.
    """
    print(f"\n🔍 DEBUG: Starting LangExtract page-by-page processing for {pdf_name}")
    print(f"📄 DEBUG: Processing {len(pages_data)} pages individually")

    # The few-shot examples are identical for every page, so build them
    # once here instead of re-creating the objects on each loop iteration.
    examples = [
        lx.data.ExampleData(
            text="DoorDash reported Q3 revenue of $2.2 billion, up 27% year-over-year. The company's marketplace revenue grew to $1.8 billion, driven by increased order volume and higher average order values.",
            extractions=[
                lx.data.Extraction(
                    extraction_class="financial",
                    extraction_text="DoorDash reported Q3 revenue of $2.2 billion, up 27% year-over-year.",
                    attributes={"metric": "revenue", "period": "Q3", "growth": "27%"}
                ),
                lx.data.Extraction(
                    extraction_class="financial",
                    extraction_text="The company's marketplace revenue grew to $1.8 billion, driven by increased order volume and higher average order values.",
                    attributes={"metric": "marketplace_revenue", "amount": "$1.8 billion", "drivers": ["order_volume", "average_order_values"]}
                )
            ]
        )
    ]

    all_chunks = []

    for page_data in pages_data:
        page_number = page_data['page_number']
        page_text = page_data['text']

        print(f"\n📄 DEBUG: Processing Page {page_number} ({len(page_text)} chars)")
        print(f"📄 DEBUG: Page {page_number} preview: {page_text[:200]}...")

        try:
            # The prompt embeds the page number, so it is rebuilt per page.
            chunking_prompt = f"""
            Extract key information from this financial document page. Identify and extract meaningful chunks of information.
            Focus on extracting:
            1. Financial metrics (revenue, profit, costs, investments)
            2. Business operations and strategies
            3. Key announcements and developments
            4. Risk factors and forward-looking statements
            5. Company background and history

            Each extraction should be a coherent piece of information that can stand alone.
            This is page {page_number} of the document.
            """

            print(f"🚀 DEBUG: Calling lx.extract for Page {page_number}")

            result = lx.extract(
                text_or_documents=page_text,
                prompt_description=chunking_prompt,
                examples=examples,
                model_id="gpt-4o",
                api_key=os.getenv("OPENAI_API_KEY"),
                fence_output=True,
                use_schema_constraints=False
            )

            print(f"✅ DEBUG: LangExtract call completed for Page {page_number}")

            # Process the structured data into chunks
            page_chunks = []

            if hasattr(result, 'extractions') and result.extractions:
                print(f"📊 DEBUG: Found {len(result.extractions)} extractions from Page {page_number}")

                for i, extraction in enumerate(result.extractions):
                    chunk_text = getattr(extraction, 'extraction_text', None)
                    extraction_class = getattr(extraction, 'extraction_class', 'general')
                    attributes = getattr(extraction, 'attributes', {})

                    if chunk_text and len(chunk_text.strip()) > 50:  # Minimum chunk size
                        # Determine topic and importance from the extraction class.
                        topic = extraction_class.title() if extraction_class else 'General Content'
                        importance = 'High' if extraction_class in ['financial', 'revenue', 'profit'] else 'Medium'

                        page_chunks.append({
                            'text': chunk_text,
                            'topic': topic,
                            'importance': importance,
                            'type': extraction_class,
                            'chunk_id': f"{pdf_name}_page_{page_number}_chunk_{i}",
                            'source_location': f"Page {page_number}",
                            'page_number': page_number,
                            'attributes': attributes
                        })
                        print(f"✅ DEBUG: ✓ Page {page_number} chunk {i+1}: {extraction_class} - {len(chunk_text)} chars")
                    else:
                        print(f"❌ DEBUG: ✗ Skipped Page {page_number} chunk {i+1}: text too short")
            else:
                print(f"❌ DEBUG: No extractions found for Page {page_number}")

            all_chunks.extend(page_chunks)
            print(f"📊 DEBUG: Page {page_number} contributed {len(page_chunks)} chunks")

        except Exception as e:
            print(f"\n❌ DEBUG: LangExtract failed for Page {page_number}:")
            print(f"❌ DEBUG: Exception: {str(e)}")
            # Continue with next page instead of failing completely
            continue

    print(f"\n🔍 DEBUG: Final chunk count: {len(all_chunks)} from {len(pages_data)} pages")
    return all_chunks
506
+
507
def extract_page_number(text):
    """Return the page number from a '[PAGE X]' marker in text, or None."""
    marker = re.search(r'\[PAGE (\d+)\]', text)
    if marker is None:
        return None
    return int(marker.group(1))
511
+
512
def smart_chunk_with_langextract(text, pdf_name):
    """Use LangExtract to create intelligent chunks with semantic understanding.

    Sends only the first 4000 characters of `text` to the model (token
    safety limit). Returns a list of chunk dicts (text, topic,
    importance, type, chunk_id, source_location, page_number,
    attributes, provenance) or [] when extraction fails or yields
    nothing.
    """
    print(f"\n🔍 DEBUG: Starting LangExtract processing for {pdf_name}")
    print(f"📄 DEBUG: Input text length: {len(text)} characters")
    print(f"📄 DEBUG: Input text preview: {text[:200]}...")

    try:
        # Define chunking prompt for LangExtract - focused on financial document extraction
        chunking_prompt = """
        Extract key information from this financial document. Identify and extract meaningful chunks of information.
        Focus on extracting:
        1. Financial metrics (revenue, profit, costs, investments)
        2. Business operations and strategies
        3. Key announcements and developments
        4. Risk factors and forward-looking statements
        5. Company background and history

        Each extraction should be a coherent piece of information that can stand alone.
        """

        print(f"📝 DEBUG: Chunking prompt: {chunking_prompt[:100]}...")

        # Define examples for LangExtract using proper ExampleData objects
        examples = [
            lx.data.ExampleData(
                text="DoorDash reported Q3 revenue of $2.2 billion, up 27% year-over-year. The company's marketplace revenue grew to $1.8 billion, driven by increased order volume and higher average order values.",
                extractions=[
                    lx.data.Extraction(
                        extraction_class="financial",
                        extraction_text="DoorDash reported Q3 revenue of $2.2 billion, up 27% year-over-year.",
                        attributes={"metric": "revenue", "period": "Q3", "growth": "27%"}
                    ),
                    lx.data.Extraction(
                        extraction_class="financial",
                        extraction_text="The company's marketplace revenue grew to $1.8 billion, driven by increased order volume and higher average order values.",
                        attributes={"metric": "marketplace_revenue", "amount": "$1.8 billion", "drivers": ["order_volume", "average_order_values"]}
                    )
                ]
            )
        ]

        print(f"📚 DEBUG: Examples: {examples}")
        print(f"🔑 DEBUG: OpenAI API Key exists: {bool(os.getenv('OPENAI_API_KEY'))}")
        # NOTE(review): this logs the first 10 characters of the API key —
        # consider removing before production deployment.
        print(f"🔑 DEBUG: OpenAI API Key preview: {os.getenv('OPENAI_API_KEY')[:10]}..." if os.getenv('OPENAI_API_KEY') else "None")

        # Use LangExtract with OpenAI model
        print(f"🚀 DEBUG: Calling lx.extract with:")
        print(f" - text_or_documents: {len(text[:4000])} chars")
        print(f" - text preview: {text[:4000][:200]}...")
        print(f" - text contains PAGE markers: {'[PAGE' in text[:4000]}")
        print(f" - model_id: gpt-4o")
        print(f" - fence_output: True")
        print(f" - use_schema_constraints: False")

        result = lx.extract(
            text_or_documents=text[:4000],  # Limit to avoid token limits
            prompt_description=chunking_prompt,
            examples=examples,
            model_id="gpt-4o",
            api_key=os.getenv("OPENAI_API_KEY"),
            fence_output=True,
            use_schema_constraints=False
        )

        print(f"✅ DEBUG: LangExtract call completed successfully!")

        # Debug: Print the actual result structure
        print(f"\n🔍 DEBUG: LangExtract result analysis:")
        print(f"📊 DEBUG: Result type: {type(result)}")
        print(f"📊 DEBUG: Result dir: {dir(result)}")
        print(f"📊 DEBUG: Result str: {str(result)[:500]}...")

        # Check if result is a dict
        if isinstance(result, dict):
            print(f"📊 DEBUG: Result is a dict with keys: {list(result.keys())}")
            for key, value in result.items():
                print(f"📊 DEBUG: {key}: {type(value)} = {str(value)[:100]}...")

        # Process the structured data into chunks
        chunks = []

        print(f"\n🔍 DEBUG: Checking for extractions attribute...")
        print(f"📊 DEBUG: hasattr(result, 'extractions'): {hasattr(result, 'extractions')}")

        if hasattr(result, 'extractions'):
            print(f"📊 DEBUG: result.extractions type: {type(result.extractions)}")
            print(f"📊 DEBUG: result.extractions value: {result.extractions}")

            if result.extractions:
                print(f"📊 DEBUG: Found {len(result.extractions)} extractions from LangExtract")

                for i, extraction in enumerate(result.extractions):
                    print(f"\n🔍 DEBUG: Processing extraction {i+1}:")
                    print(f"📊 DEBUG: Extraction type: {type(extraction)}")
                    print(f"📊 DEBUG: Extraction dir: {dir(extraction)}")

                    # Access the extraction text and metadata
                    chunk_text = getattr(extraction, 'extraction_text', None)
                    extraction_class = getattr(extraction, 'extraction_class', 'general')
                    attributes = getattr(extraction, 'attributes', {})
                    provenance = getattr(extraction, 'provenance', [])

                    print(f"📊 DEBUG: chunk_text: {chunk_text[:100] if chunk_text else 'None'}...")
                    print(f"📊 DEBUG: chunk_text contains PAGE marker: {'[PAGE' in chunk_text if chunk_text else False}")
                    print(f"📊 DEBUG: extraction_class: {extraction_class}")
                    print(f"📊 DEBUG: attributes: {attributes}")
                    print(f"📊 DEBUG: provenance: {provenance}")

                    if chunk_text and len(chunk_text.strip()) > 50:  # Minimum chunk size
                        # Extract page number from chunk text
                        page_number = extract_page_number(chunk_text)

                        # Determine topic and importance based on extraction class and attributes
                        topic = extraction_class.title() if extraction_class else 'General Content'
                        importance = 'High' if extraction_class in ['financial', 'revenue', 'profit'] else 'Medium'

                        # Create user-friendly source location with page number
                        source_location = f"Page {page_number}" if page_number else "Unknown page"

                        chunks.append({
                            'text': chunk_text,
                            'topic': topic,
                            'importance': importance,
                            'type': extraction_class,
                            'chunk_id': f"{pdf_name}_chunk_{i}",
                            'source_location': source_location,
                            'page_number': page_number,
                            'attributes': attributes,
                            'provenance': provenance
                        })
                        print(f"✅ DEBUG: ✓ LangExtract chunk {i+1}: {extraction_class} - Page {page_number} - {len(chunk_text)} chars")
                    else:
                        print(f"❌ DEBUG: ✗ Skipped extraction {i+1}: text too short or None")
            else:
                print(f"❌ DEBUG: result.extractions is empty or None")
        else:
            print(f"❌ DEBUG: result has no 'extractions' attribute")

        print(f"\n🔍 DEBUG: Final chunk count: {len(chunks)}")

        if chunks:
            print(f"✅ DEBUG: LangExtract successfully created {len(chunks)} intelligent chunks")
            return chunks
        else:
            print(f"❌ DEBUG: LangExtract returned no chunks")
            return []

    except Exception as e:
        print(f"\n❌ DEBUG: LangExtract failed with exception:")
        print(f"❌ DEBUG: Exception type: {type(e)}")
        print(f"❌ DEBUG: Exception message: {str(e)}")
        print(f"❌ DEBUG: Exception args: {e.args}")
        import traceback
        print(f"❌ DEBUG: Full traceback:")
        traceback.print_exc()
        print(f"❌ DEBUG: LangExtract processing failed")
        return []
669
+
670
+
671
def generate_query_variations(original_query):
    """Generate multiple query variations using an LLM for comprehensive RAG coverage.

    Asks gpt-4o for 4 reformulations of the question and returns
    [original_query] + variations. On any API or JSON-parsing failure it
    falls back to [original_query] so retrieval still proceeds.
    """
    print(f"\n🔍 DEBUG: Generating query variations for: '{original_query}'")

    try:
        expansion_prompt = f"""
        Given the following user question, generate 4 different query variations that would help retrieve comprehensive information from financial documents. Each variation should approach the question from a different angle:

        Original question: "{original_query}"

        Generate variations that:
        1. Use different terminology/synonyms
        2. Ask for specific metrics/data points
        3. Focus on different aspects (financial, operational, strategic, etc.)
        4. Use broader or more specific phrasing

        Return ONLY a JSON array of 4 query strings, no other text.
        Example format: ["query 1", "query 2", "query 3", "query 4"]
        """

        print(f"📝 DEBUG: Query expansion prompt length: {len(expansion_prompt)}")

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert at generating diverse query variations for information retrieval. Return only valid JSON arrays."},
                {"role": "user", "content": expansion_prompt}
            ],
            temperature=0.7  # Higher temperature for more diverse variations
        )

        print(f"✅ DEBUG: Query expansion response received")
        print(f"📊 DEBUG: Response content: {response.choices[0].message.content}")

        # json is already imported at module level; the redundant
        # function-local import was removed.
        try:
            response_content = response.choices[0].message.content.strip()

            # Remove markdown code fences if the model wrapped its output.
            if response_content.startswith('```json'):
                response_content = response_content[7:]  # Remove ```json
            if response_content.startswith('```'):
                response_content = response_content[3:]  # Remove ```
            if response_content.endswith('```'):
                response_content = response_content[:-3]  # Remove trailing ```

            response_content = response_content.strip()
            print(f"📊 DEBUG: Cleaned response content: {response_content}")

            query_variations = json.loads(response_content)
            print(f"✅ DEBUG: Successfully parsed {len(query_variations)} query variations")

            # Add original query to the list
            all_queries = [original_query] + query_variations
            print(f"📊 DEBUG: Total queries (including original): {len(all_queries)}")

            for i, query in enumerate(all_queries):
                print(f"📊 DEBUG: Query {i+1}: {query}")

            return all_queries

        except json.JSONDecodeError as e:
            print(f"❌ DEBUG: Failed to parse query variations as JSON: {e}")
            print(f"❌ DEBUG: Raw response: {response.choices[0].message.content}")
            # Fallback to original query only
            return [original_query]

    except Exception as e:
        print(f"\n❌ DEBUG: Query expansion failed with exception:")
        print(f"❌ DEBUG: Exception: {str(e)}")
        # Fallback to original query only
        return [original_query]
744
+
745
def get_embedding(text):
    """Return the OpenAI embedding vector for *text*.

    Args:
        text: Arbitrary chunk text.  Truncated before the API call to keep
            the request under the model's input limit.

    Returns:
        list[float]: A 1536-dimensional vector from ``text-embedding-3-small``.
        On API failure a zero vector of the same length is returned as an
        error sentinel; callers skip chunks whose embedding equals
        ``[0.0] * 1536``.
    """
    # FIX: the original local was named ``max_tokens`` but the slice below
    # counts *characters*, not tokens.  The behavior (truncate to 8000 chars,
    # well under the model's 8191-token limit) is kept; only the misleading
    # name is corrected.
    max_chars = 8000  # conservative character cap; ~well below 8191 tokens
    if len(text) > max_chars:
        text = text[:max_chars]

    try:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error creating embedding: {e}")
        # Zero vector sentinel: matches the model's 1536-dim output size.
        return [0.0] * 1536
def multi_query_search(queries, collection_name, limit_per_query=2):
    """Run several Qdrant searches and merge the hits, dropping duplicates.

    Each query string is embedded and searched independently; hits are
    deduplicated by their ``chunk_id`` payload field (falling back to the
    point id) so a chunk retrieved by multiple variations appears once.

    Args:
        queries: List of query strings to search with.
        collection_name: Target Qdrant collection.
        limit_per_query: Max hits requested per individual query.

    Returns:
        list: Unique search hits, in first-seen order.
    """
    print(f"\n🔍 DEBUG: Multi-query search with {len(queries)} queries")

    merged_hits = []
    seen_ids = set()

    for idx, query in enumerate(queries, start=1):
        print(f"📊 DEBUG: Processing query {idx}/{len(queries)}: '{query}'")

        try:
            vector = get_embedding(query)

            hits = qdrant.search(
                collection_name=collection_name,
                query_vector=vector,
                limit=limit_per_query,
            )
            print(f"📊 DEBUG: Query {idx} returned {len(hits)} results")

            for hit in hits:
                key = hit.payload.get('chunk_id', str(hit.id))
                if key in seen_ids:
                    print(f"🔄 DEBUG: Skipped duplicate result from query {idx}: {key}")
                    continue
                seen_ids.add(key)
                merged_hits.append(hit)
                print(f"✅ DEBUG: Added unique result from query {idx}: {key}")

        except Exception as e:
            print(f"❌ DEBUG: Error processing query {idx}: {e}")
            continue

    print(f"📊 DEBUG: Multi-query search completed: {len(merged_hits)} unique results from {len(queries)} queries")
    return merged_hits
def upload_pdf(file):
    """Process and upload PDF to Qdrant.

    Pipeline: copy the file into UPLOAD_DIR, extract text page-by-page,
    detect/extract figures with DocLayout-YOLO (first 5 pages only),
    describe figures via the Vision API, chunk the text with LangExtract,
    embed every chunk, and upsert the points into the Qdrant collection.

    Args:
        file: Filesystem path string supplied by the Gradio File component
            (type="filepath"), or None when nothing was uploaded.

    Returns:
        str: A human-readable status message for the Gradio status textbox.
    """
    print(f"\n📤 DEBUG: ===== STARTING PDF UPLOAD =====")
    print(f"📤 DEBUG: File: {file}")

    if file is None:
        print("📤 DEBUG: No file provided")
        return "No file uploaded"

    # Gradio passes file path as string
    import shutil
    import os

    # Get filename from the file path
    filename = os.path.basename(file)
    pdf_path = UPLOAD_DIR / filename
    # NOTE(review): the "(unknown)" literal below looks like extraction
    # garbling of this debug line (probably f"...{filename}" originally) —
    # confirm against version control before relying on it.
    print(f"📤 DEBUG: Processing file: (unknown)")
    print(f"📤 DEBUG: PDF path: {pdf_path}")

    # Copy file to our uploads directory
    try:
        shutil.copy2(file, pdf_path)
        print(f"📤 DEBUG: ✓ File copied successfully")
    except shutil.SameFileError:
        # File is already in the right location, just use it
        print(f"📤 DEBUG: ✓ File already in correct location")
        pass

    print(f"📤 DEBUG: Starting text extraction...")
    # Extract text page by page and create smart chunks
    pages_data = extract_text_by_page(pdf_path)
    pdf_name = os.path.basename(file)
    print(f"📤 DEBUG: ✓ Text extraction completed: {len(pages_data)} pages")

    # Extract figures from PDF (with timeout protection)
    print(f"\n🖼️ DEBUG: ===== STARTING FIGURE EXTRACTION FOR {pdf_name} =====")
    model = get_yolo_model()
    if model is None:
        # Figure extraction is best-effort: a missing model never blocks upload.
        print("🖼️ DEBUG: ⚠️ Warning: Could not load YOLO model, skipping figure extraction")
        figure_chunks = []
    else:
        print("🖼️ DEBUG: ✓ YOLO model loaded successfully")
        # Convert PDF to images
        images = convert_pdf_to_images(pdf_path)
        if not images:
            print("🖼️ DEBUG: ⚠️ Warning: Could not convert PDF to images, skipping figure extraction")
            figure_chunks = []
        else:
            print(f"🖼️ DEBUG: ✓ PDF converted to {len(images)} images")
            figure_chunks = []
            total_figures = 0

            # Process each page for figures (limit to first 5 pages for faster processing)
            max_pages_to_process = min(5, len(images))  # Limit to first 5 pages
            print(f"🖼️ DEBUG: Processing first {max_pages_to_process} pages out of {len(images)} total pages")

            for page_num, image in enumerate(images[:max_pages_to_process]):
                print(f"\n🖼️ DEBUG: ===== PROCESSING PAGE {page_num + 1}/{max_pages_to_process} FOR FIGURES =====")

                # Detect figures on this page
                detections = detect_figures(model, image)

                if detections:
                    print(f"🖼️ DEBUG: ✓ Found {len(detections)} figures on page {page_num + 1}")
                    # Extract and save figures
                    saved_figures = extract_and_save_figures(image, detections, page_num, pdf_name)

                    # Analyze each figure with Vision API and create chunks
                    for fig_idx, figure_metadata in enumerate(saved_figures):
                        print(f"🖼️ DEBUG: ===== ANALYZING FIGURE {fig_idx + 1} WITH VISION API =====")
                        figure_description = analyze_figure_with_vision_api(figure_metadata['figure_path'])

                        # Create figure chunk: the Vision description is the
                        # searchable text; figure metadata rides in the payload.
                        figure_chunk = {
                            'text': figure_description,
                            'topic': "Figure/Chart/Diagram",
                            'importance': 'High',
                            'type': 'figure',
                            'chunk_id': f"{pdf_name}_figure_{page_num+1}_{fig_idx+1}",
                            'source_location': f"Page {page_num + 1}",
                            'page_number': page_num + 1,
                            'is_figure': True,
                            'figure_path': figure_metadata['figure_path'],
                            'figure_filename': figure_metadata['filename'],
                            'figure_class': figure_metadata['class_name'],
                            'figure_confidence': figure_metadata['confidence'],
                            'figure_bbox': figure_metadata['bbox']
                        }
                        figure_chunks.append(figure_chunk)
                        total_figures += 1
                        # NOTE(review): mojibake ("\ufffd\ufffd") in the next debug
                        # string is preserved from the original source.
                        print(f"🖼️ DEBUG: �� Created figure chunk: {figure_metadata['filename']}")
                        print(f"🖼️ DEBUG: ✓ Chunk description length: {len(figure_description)} chars")
                else:
                    print(f"🖼️ DEBUG: ✗ No figures detected on page {page_num + 1}")

            print(f"\n🖼️ DEBUG: ===== FIGURE EXTRACTION COMPLETED =====")
            print(f"🖼️ DEBUG: ✓ Total figures extracted: {total_figures}")
            print(f"🖼️ DEBUG: ✓ Total figure chunks created: {len(figure_chunks)}")

    print(f"📤 DEBUG: Starting text chunking...")
    chunks = smart_chunk_with_langextract_page_by_page(pages_data, pdf_name)
    print(f"📤 DEBUG: ✓ Text chunking completed: {len(chunks)} chunks")

    # Combine text and figure chunks
    all_chunks = chunks + figure_chunks
    print(f"📤 DEBUG: ✓ Combined chunks: {len(all_chunks)} total ({len(chunks)} text + {len(figure_chunks)} figures)")

    # If LangExtract fails, return error instead of fallback
    if not chunks:
        print("📤 DEBUG: ✗ LangExtract failed")
        return f"Failed to process {pdf_name} with LangExtract. Please check the document content."

    if not all_chunks:
        print("📤 DEBUG: ✗ No chunks created")
        return f"No readable text or figures found in {pdf_name}"

    # Create embeddings and store in Qdrant
    print(f"📤 DEBUG: Starting embedding creation and Qdrant storage...")
    points = []
    successful_chunks = 0

    print(f"📤 DEBUG: Processing {len(all_chunks)} total chunks from {pdf_name} ({len(chunks)} text + {len(figure_chunks)} figures)")

    for i, chunk_data in enumerate(all_chunks):
        try:
            chunk_text = chunk_data['text']
            print(f"📤 DEBUG: Processing chunk {i+1}/{len(all_chunks)} (topic: {chunk_data['topic']}, length: {len(chunk_text)})")
            embedding = get_embedding(chunk_text)
            # Skip if embedding is zero vector (error occurred)
            # (get_embedding returns [0.0] * 1536 on API failure)
            if embedding != [0.0] * 1536:
                point = PointStruct(
                    id=str(uuid.uuid4()),
                    vector=embedding,
                    payload={
                        "text": chunk_text,
                        "pdf_name": pdf_name,
                        "chunk_index": i,
                        "pdf_path": str(pdf_path),
                        "topic": chunk_data['topic'],
                        "importance": chunk_data['importance'],
                        "type": chunk_data['type'],
                        "chunk_id": chunk_data['chunk_id'],
                        "source_location": chunk_data['source_location'],
                        # Figure-only fields default to None/False for text chunks.
                        "page_number": chunk_data.get('page_number'),
                        "is_figure": chunk_data.get('is_figure', False),
                        "figure_path": chunk_data.get('figure_path'),
                        "figure_filename": chunk_data.get('figure_filename'),
                        "figure_class": chunk_data.get('figure_class'),
                        "figure_confidence": chunk_data.get('figure_confidence'),
                        "figure_bbox": chunk_data.get('figure_bbox')
                    }
                )
                points.append(point)
                successful_chunks += 1
                print(f"📤 DEBUG: ✓ Chunk {i+1} ({chunk_data['topic']}) processed successfully")
            else:
                print(f"📤 DEBUG: ✗ Chunk {i+1} failed - zero embedding")
        except Exception as e:
            print(f"📤 DEBUG: ✗ Error processing chunk {i+1}: {e}")
            continue

    if points:
        try:
            print(f"📤 DEBUG: Storing {successful_chunks} chunks in Qdrant...")
            qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
            print(f"📤 DEBUG: ✓ Successfully stored {successful_chunks} chunks in Qdrant")
            result_message = f"Successfully uploaded {os.path.basename(file)} with {successful_chunks} chunks ({len(chunks)} text + {len(figure_chunks)} figures)"
            print(f"📤 DEBUG: ===== UPLOAD COMPLETED =====")
            print(f"📤 DEBUG: Result: {result_message}")
            return result_message
        except Exception as e:
            print(f"📤 DEBUG: ✗ Error storing in Qdrant: {e}")
            return f"Error storing chunks in database: {e}"
    else:
        print("📤 DEBUG: ✗ No points to store")
        return f"Failed to process any chunks from {os.path.basename(file)}. Please check the PDF content."
def query_rag_with_figures(message, history):
    """Query the RAG system with streaming response and figure information.

    Non-generator variant used by the chat UI's ``bot`` callback: expands the
    query, searches Qdrant, builds a citation-annotated context, asks GPT-4o,
    and additionally returns image paths for the figure gallery.

    Note: despite the wording above (kept from the original), the OpenAI
    stream is fully consumed here before returning — nothing is streamed to
    the caller.

    Args:
        message: The user's question.
        history: Chat history (length is only logged; content is unused here).

    Returns:
        tuple[str, list[str]]: (answer text with Sources/figure sections
        appended, list of existing figure image paths for the gallery).
    """
    print(f"\n🔍 DEBUG: ===== STARTING QUERY RAG WITH FIGURES =====")
    print(f"🔍 DEBUG: Query: '{message}'")
    print(f"🔍 DEBUG: History length: {len(history) if history else 0}")

    if not message.strip():
        print("🔍 DEBUG: Empty query, returning empty response")
        return "", []

    # Debug: Check collection info
    try:
        collection_info = qdrant.get_collection(COLLECTION_NAME)
        print(f"🔍 DEBUG: Collection points count: {collection_info.points_count}")
    except Exception as e:
        print(f"🔍 DEBUG: Error getting collection info: {e}")
        return "Error accessing document collection. Please try uploading a PDF first.", []

    # Generate query variations for comprehensive coverage
    print(f"🔍 DEBUG: Generating query variations...")
    expanded_queries = generate_query_variations(message)
    print(f"🔍 DEBUG: Generated {len(expanded_queries)} query variations")

    # Search Qdrant with multiple queries
    print(f"🔍 DEBUG: Searching Qdrant with multiple queries...")
    search_results = multi_query_search(expanded_queries, COLLECTION_NAME, limit_per_query=2)

    print(f"🔍 DEBUG: Search results count: {len(search_results)}")

    if not search_results:
        print("🔍 DEBUG: No search results found")
        return "No relevant documents found. Please upload some PDFs first.", []

    # Check if this is a figure query and prioritize figure results
    is_figure_request = is_figure_query(message)
    print(f"🔍 DEBUG: Is figure query: {is_figure_request}")

    # Separate figure and text results
    figure_results = []
    text_results = []

    for hit in search_results:
        if hit.payload.get('is_figure', False):
            figure_results.append(hit)
        else:
            text_results.append(hit)

    print(f"🔍 DEBUG: Results breakdown: {len(figure_results)} figures, {len(text_results)} text")

    # Prioritize figure results if user is asking for figures; otherwise
    # put text first (reordering matters because context is length-capped).
    if is_figure_request and figure_results:
        search_results = figure_results + text_results
        print(f"🔍 DEBUG: ✓ Prioritizing {len(figure_results)} figure results")
    elif not is_figure_request and figure_results:
        search_results = text_results + figure_results
        print(f"🔍 DEBUG: ✓ Prioritizing {len(text_results)} text results")

    # Build context from search results with citations
    print(f"🔍 DEBUG: Building context from search results...")
    context_parts = []
    citations = []
    figures_found = []  # Store figure information
    total_length = 0
    max_context_length = 8000  # Limit context to ~8000 characters

    for i, hit in enumerate(search_results):
        print(f"🔍 DEBUG: Processing hit {i+1}/{len(search_results)}")

        # Try different ways to access the text (payload may be a dict or an
        # object with a .text attribute depending on the client version)
        chunk_text = None
        chunk_metadata = {}
        if hasattr(hit, 'payload') and hit.payload:
            if isinstance(hit.payload, dict) and "text" in hit.payload:
                chunk_text = hit.payload["text"]
                chunk_metadata = hit.payload
            elif hasattr(hit.payload, 'text'):
                chunk_text = hit.payload.text
                chunk_metadata = hit.payload

        if chunk_text:
            print(f"🔍 DEBUG: Found text chunk: {len(chunk_text)} characters")

            # Check if this is a figure result
            is_figure = chunk_metadata.get('is_figure', False)
            print(f"🔍 DEBUG: Is figure chunk: {is_figure}")

            if is_figure:
                # Store figure information for later use.  Note: this happens
                # before the context-length check, so a figure can appear in
                # the gallery even when its description is excluded from the
                # LLM context.
                figure_info = {
                    'path': chunk_metadata.get('figure_path'),
                    'description': chunk_text,
                    'page': chunk_metadata.get('page_number'),
                    'source': chunk_metadata.get('pdf_name'),
                    'filename': chunk_metadata.get('figure_filename'),
                    'class': chunk_metadata.get('figure_class'),
                    'confidence': chunk_metadata.get('figure_confidence')
                }
                figures_found.append(figure_info)
                print(f"🔍 DEBUG: ✓ Found figure: {figure_info['filename']} on page {figure_info['page']}")

            if total_length + len(chunk_text) > max_context_length:
                print(f"🔍 DEBUG: Chunk too large, skipping (would exceed {max_context_length} limit)")
                break

            # Create citation reference
            citation_ref = f"[{i+1}]"
            citations.append({
                'ref': citation_ref,
                'text': chunk_text[:200] + "..." if len(chunk_text) > 200 else chunk_text,
                'source': chunk_metadata.get('source_location', 'Unknown source'),
                'topic': chunk_metadata.get('topic', 'General'),
                'pdf_name': chunk_metadata.get('pdf_name', 'Unknown document'),
                'page_number': chunk_metadata.get('page_number')
            })

            # Add citation reference to the chunk
            cited_chunk = f"{chunk_text} {citation_ref}"
            context_parts.append(cited_chunk)
            total_length += len(cited_chunk)
            print(f"🔍 DEBUG: ✓ Added chunk to context with citation {citation_ref}. Total length now: {total_length}")
        else:
            print(f"🔍 DEBUG: ✗ No text found in hit {i+1}")

    context = "\n\n".join(context_parts)

    # Debug: Print context being sent to LLM
    print(f"🔍 DEBUG: Context length: {len(context)} characters")
    print(f"🔍 DEBUG: Context preview: {context[:200]}...")
    print(f"🔍 DEBUG: User question: {message}")

    # Create messages for OpenAI chat with citation instructions
    citation_instructions = """
IMPORTANT: The context includes citation references like [1], [2], etc.
When you reference information from the context, you MUST include the citation reference in your response.
For example: "DoorDash reported revenue of $2.2 billion [1]..."
"""

    messages = [
        {
            "role": "system",
            "content": f"You are a helpful assistant that answers questions based ONLY on the provided context from PDF documents. {citation_instructions} You MUST use the information from the context to answer questions and include appropriate citation references. If the context contains relevant information, provide a detailed answer based on that information with proper citations. If the context doesn't contain enough information to answer the question, say so clearly."
        },
        {
            "role": "user",
            "content": f"Here is the context from PDF documents with citation references:\n\n{context}\n\nBased on this context, please answer the following question: {message}\n\nRemember to include citation references [1], [2], etc. when referencing information from the context."
        }
    ]

    # Stream response from OpenAI (consumed in full below — no incremental
    # output reaches the caller)
    print(f"🔍 DEBUG: Sending request to OpenAI GPT-4o...")
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True
    )

    response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content

    print(f"🔍 DEBUG: ✓ OpenAI response received ({len(response)} chars)")

    # Rank and select top 3 most relevant figures
    print(f"🔍 DEBUG: Ranking {len(figures_found)} figures by relevance...")
    top_figures = rank_figures_by_relevance(figures_found, message)
    print(f"🔍 DEBUG: Selected top {len(top_figures)} figures")

    # Add citations section after streaming is complete
    if citations:
        print(f"🔍 DEBUG: Adding {len(citations)} citations to response")
        citations_text = "\n\n**Sources:**\n"
        for citation in citations:
            print(f"🔍 DEBUG: Citation {citation['ref']}: page_number={citation['page_number']}, pdf_name={citation['pdf_name']}")
            page_info = f" (Page {citation['page_number']})" if citation['page_number'] else " (Page unknown)"
            pdf_name = citation['pdf_name'].replace('.pdf', '') if citation['pdf_name'] else 'Unknown Document'
            citations_text += f"{citation['ref']} {pdf_name}{page_info} - {citation['topic']}\n"

        # Add figure information if figures were found (show top 3 only)
        if figures_found:
            print(f"🔍 DEBUG: Adding top {len(top_figures)} figures to response")
            citations_text += "\n**Top Related Figures:**\n"
            for i, fig in enumerate(top_figures):
                citations_text += f"📊 {fig['filename']} (Page {fig['page']}) - {fig['class']}\n"

        response += citations_text
    elif figures_found:
        # If only figures found, add figure information (show top 3 only)
        print(f"🔍 DEBUG: Adding top {len(top_figures)} figures to response (no citations)")
        figures_text = "\n\n**Top Related Figures:**\n"
        for fig in top_figures:
            figures_text += f"📊 {fig['filename']} (Page {fig['page']}) - {fig['class']}\n"
        response += figures_text

    # Prepare figure paths for gallery (top 3 only; skip missing files)
    figure_paths = [fig['path'] for fig in top_figures if fig['path'] and Path(fig['path']).exists()]
    print(f"🔍 DEBUG: Returning {len(figure_paths)} top figure paths for gallery")
    print(f"🔍 DEBUG: Top figure paths: {figure_paths}")

    print(f"🔍 DEBUG: ===== QUERY RAG WITH FIGURES COMPLETED =====")
    return response, figure_paths
def query_rag(message, history):
    """Query the RAG system with streaming response.

    Generator variant of :func:`query_rag_with_figures`: same retrieval and
    prompting pipeline, but yields the accumulating answer as tokens arrive
    so a Gradio chat interface can render it incrementally.  Figure images
    are not returned here — figures only appear as a text section.

    Args:
        message: The user's question.
        history: Chat history (unused in the pipeline).

    Yields:
        str: The progressively growing answer; the final yield includes the
        Sources and Related Figures sections when applicable.
    """
    if not message.strip():
        return ""

    # Debug: Check collection info
    try:
        collection_info = qdrant.get_collection(COLLECTION_NAME)
        print(f"Collection points count: {collection_info.points_count}")
    except Exception as e:
        print(f"Error getting collection info: {e}")
        return "Error accessing document collection. Please try uploading a PDF first."

    # Generate query variations for comprehensive coverage
    expanded_queries = generate_query_variations(message)

    # Search Qdrant with multiple queries
    search_results = multi_query_search(expanded_queries, COLLECTION_NAME, limit_per_query=2)

    print(f"Search results count: {len(search_results)}")

    if not search_results:
        return "No relevant documents found. Please upload some PDFs first."

    # Check if this is a figure query and prioritize figure results
    is_figure_request = is_figure_query(message)
    print(f"🖼️ Is figure query: {is_figure_request}")

    # Separate figure and text results
    figure_results = []
    text_results = []

    for hit in search_results:
        if hit.payload.get('is_figure', False):
            figure_results.append(hit)
        else:
            text_results.append(hit)

    print(f"📊 Results breakdown: {len(figure_results)} figures, {len(text_results)} text")

    # Prioritize figure results if user is asking for figures
    if is_figure_request and figure_results:
        search_results = figure_results + text_results
        print(f"🖼️ Prioritizing {len(figure_results)} figure results")
    elif not is_figure_request and figure_results:
        search_results = text_results + figure_results
        print(f"📝 Prioritizing {len(text_results)} text results")

    # Debug: Print search results structure
    print(f"First search result type: {type(search_results[0])}")
    print(f"First search result: {search_results[0]}")
    print(f"First search result payload: {search_results[0].payload}")
    print(f"First search result payload type: {type(search_results[0].payload)}")

    # Build context from search results with citations
    context_parts = []
    citations = []
    figures_found = []  # Store figure information
    total_length = 0
    max_context_length = 8000  # Limit context to ~8000 characters

    for i, hit in enumerate(search_results):
        print(f"Processing hit {i+1}: {hit}")
        print(f"Hit payload: {hit.payload}")

        # Try different ways to access the text (dict payload vs attribute)
        chunk_text = None
        chunk_metadata = {}
        if hasattr(hit, 'payload') and hit.payload:
            if isinstance(hit.payload, dict) and "text" in hit.payload:
                chunk_text = hit.payload["text"]
                chunk_metadata = hit.payload
            elif hasattr(hit.payload, 'text'):
                chunk_text = hit.payload.text
                chunk_metadata = hit.payload

        if chunk_text:
            print(f"Found text chunk: {len(chunk_text)} characters")
            print(f"🔍 DEBUG: Chunk metadata page_number: {chunk_metadata.get('page_number')}")
            print(f"🔍 DEBUG: Chunk metadata keys: {list(chunk_metadata.keys())}")

            # Check if this is a figure result
            is_figure = chunk_metadata.get('is_figure', False)

            if is_figure:
                # Store figure information for later use (recorded even when
                # the description is later excluded by the context cap)
                figure_info = {
                    'path': chunk_metadata.get('figure_path'),
                    'description': chunk_text,
                    'page': chunk_metadata.get('page_number'),
                    'source': chunk_metadata.get('pdf_name'),
                    'filename': chunk_metadata.get('figure_filename'),
                    'class': chunk_metadata.get('figure_class'),
                    'confidence': chunk_metadata.get('figure_confidence')
                }
                figures_found.append(figure_info)
                print(f"🖼️ Found figure: {figure_info['filename']} on page {figure_info['page']}")

            if total_length + len(chunk_text) > max_context_length:
                print(f"Chunk too large, skipping (would exceed {max_context_length} limit)")
                break

            # Create citation reference
            citation_ref = f"[{i+1}]"
            citations.append({
                'ref': citation_ref,
                'text': chunk_text[:200] + "..." if len(chunk_text) > 200 else chunk_text,
                'source': chunk_metadata.get('source_location', 'Unknown source'),
                'topic': chunk_metadata.get('topic', 'General'),
                'pdf_name': chunk_metadata.get('pdf_name', 'Unknown document'),
                'page_number': chunk_metadata.get('page_number')
            })

            # Add citation reference to the chunk
            cited_chunk = f"{chunk_text} {citation_ref}"
            context_parts.append(cited_chunk)
            total_length += len(cited_chunk)
            print(f"Added chunk to context with citation {citation_ref}. Total length now: {total_length}")
        else:
            print(f"No text found in hit {i+1}")

    context = "\n\n".join(context_parts)

    # Debug: Print context being sent to LLM
    print(f"Context length: {len(context)} characters")
    print(f"Context preview: {context[:200]}...")
    print(f"User question: {message}")

    # Create messages for OpenAI chat with citation instructions
    citation_instructions = """
IMPORTANT: The context includes citation references like [1], [2], etc.
When you reference information from the context, you MUST include the citation reference in your response.
For example: "DoorDash reported revenue of $2.2 billion [1]..."
"""

    messages = [
        {
            "role": "system",
            "content": f"You are a helpful assistant that answers questions based ONLY on the provided context from PDF documents. {citation_instructions} You MUST use the information from the context to answer questions and include appropriate citation references. If the context contains relevant information, provide a detailed answer based on that information with proper citations. If the context doesn't contain enough information to answer the question, say so clearly."
        },
        {
            "role": "user",
            "content": f"Here is the context from PDF documents with citation references:\n\n{context}\n\nBased on this context, please answer the following question: {message}\n\nRemember to include citation references [1], [2], etc. when referencing information from the context."
        }
    ]

    # Stream response from OpenAI
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True
    )

    response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content
            yield response

    # Add citations section after streaming is complete
    if citations:
        citations_text = "\n\n**Sources:**\n"
        for citation in citations:
            print(f"🔍 DEBUG: Citation {citation['ref']}: page_number={citation['page_number']}, pdf_name={citation['pdf_name']}")
            page_info = f" (Page {citation['page_number']})" if citation['page_number'] else " (Page unknown)"
            pdf_name = citation['pdf_name'].replace('.pdf', '') if citation['pdf_name'] else 'Unknown Document'
            citations_text += f"{citation['ref']} {pdf_name}{page_info} - {citation['topic']}\n"

        # Add figure information if figures were found
        if figures_found:
            citations_text += "\n**Related Figures:**\n"
            for fig in figures_found:
                citations_text += f"📊 {fig['filename']} (Page {fig['page']}) - {fig['class']}\n"

        yield response + citations_text
    elif figures_found:
        # If only figures found, add figure information
        figures_text = "\n\n**Related Figures:**\n"
        for fig in figures_found:
            figures_text += f"📊 {fig['filename']} (Page {fig['page']}) - {fig['class']}\n"
        yield response + figures_text
    else:
        yield response
# Create Gradio interface.
# `demo` is imported by app.py for the Hugging Face Spaces entry point, so
# the name must stay module-level.
with gr.Blocks(title="GIC Financial Docs Assistant") as demo:
    gr.Markdown("# GIC Financial Docs Assistant")
    gr.Markdown("Upload PDFs and ask questions about their content!")

    with gr.Tab("Upload PDFs"):
        file_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
            type="filepath"
        )
        upload_button = gr.Button("Upload PDF")
        upload_status = gr.Textbox(label="Status", interactive=False)

        def check_collection():
            """Return a one-line summary of the Qdrant collection size."""
            try:
                collection_info = qdrant.get_collection(COLLECTION_NAME)
                return f"Collection has {collection_info.points_count} documents"
            except Exception as e:
                return f"Error: {e}"

        check_button = gr.Button("Check Collection")
        collection_status = gr.Textbox(label="Collection Status", interactive=False)

        upload_button.click(
            upload_pdf,
            inputs=file_input,
            outputs=upload_status
        )
        check_button.click(
            check_collection,
            outputs=collection_status
        )

    with gr.Tab("Ask Questions"):
        chatbot = gr.Chatbot(
            label="Chat with your PDFs",
            height=500,
            type="messages"
        )
        figure_gallery = gr.Gallery(
            label="Related Figures",
            columns=2,
            rows=2,
            height=300,
            show_label=True
        )
        msg = gr.Textbox(
            label="Ask a question about your uploaded PDFs",
            placeholder="What is this document about? Try asking for charts or figures!"
        )
        clear = gr.Button("Clear")

        def user(user_message, history):
            """Append the user's turn and clear the input box."""
            # history is a list of message dicts when type="messages"
            history = history or []
            return "", history + [{"role": "user", "content": user_message}]

        def bot(history):
            """Produce the assistant turn plus gallery images for the last user message."""
            # Expect history as list[{"role":..., "content":...}]
            print(f"\n🤖 DEBUG: ===== BOT FUNCTION CALLED =====")
            history = history or []
            if not history or history[-1].get("role") != "user":
                print("🤖 DEBUG: No user message found, returning empty")
                yield history, []
                return

            user_message = history[-1]["content"]
            print(f"🤖 DEBUG: User message: '{user_message}'")
            messages = history + [{"role": "assistant", "content": ""}]

            # Use the new function that returns both response and figures
            # (single yield — the answer is not streamed to the UI)
            print(f"🤖 DEBUG: Calling query_rag_with_figures...")
            response, figure_paths = query_rag_with_figures(user_message, messages)
            print(f"🤖 DEBUG: Response length: {len(response)} chars")
            print(f"🤖 DEBUG: Figure paths count: {len(figure_paths)}")

            messages[-1]["content"] = response
            print(f"🤖 DEBUG: Returning messages and figure paths")
            yield messages, figure_paths

        # Submit: record the user turn synchronously, then run the bot.
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, [chatbot], [chatbot, figure_gallery]
        )
        # Clear resets chat history, the input box, and the gallery.
        clear.click(lambda: ([], "", []), None, [chatbot, msg, figure_gallery], queue=False)
if __name__ == "__main__":
    # Local development entry point; app.py launches `demo` on Spaces instead.
    demo.launch(share=False)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=5.49.1
2
+ qdrant-client
3
+ openai
4
+ pypdf
5
+ python-dotenv>=1.1.1
6
+ langextract[openai]
7
+ pdf2image
8
+ doclayout-yolo
9
+ opencv-python-headless
10
+ pillow
11
+ ultralytics
12
+ --extra-index-url https://download.pytorch.org/whl/cpu
13
+ torch
14
+ torchvision
15
+
test_figure_extraction.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Vision Model Figure Extraction Test Script
4
+
5
+ This script uses DocLayout-YOLO to detect and extract figures, tables, and charts
6
+ from PDF documents. It processes PDFs in the uploaded_pdfs/ directory and saves
7
+ extracted figures as separate image files with metadata.
8
+
9
+ Usage:
10
+ python test_figure_extraction.py # Process all PDFs in uploaded_pdfs/
11
+ python test_figure_extraction.py path/to/file.pdf # Process specific PDF
12
+
13
+ Integration Notes for main.py:
14
+ - This script demonstrates the figure extraction pipeline
15
+ - For integration: modify extract_text_by_page() to also extract figures
16
+ - Store figure embeddings in Qdrant alongside text embeddings
17
+ - Use multimodal retrieval (CLIP embeddings) for figure search
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import json
23
+ import argparse
24
+ from pathlib import Path
25
+ from typing import List, Dict, Any
26
+ import cv2
27
+ import numpy as np
28
+ from PIL import Image
29
+ from pdf2image import convert_from_path
30
+ from doclayout_yolo import YOLOv10
31
+ import torch
32
+ import requests
33
+ import os
34
+
35
# Configuration
EXTRACTED_FIGURES_DIR = Path("extracted_figures")  # where cropped figures + metadata JSON are written
UPLOADED_PDFS_DIR = Path("uploaded_pdfs")  # default input directory when no PDF is given on the CLI
MODEL_NAME = "doclayout_yolo_docstructbench_imgsz1024.pt"  # DocLayout-YOLO checkpoint filename
CONFIDENCE_THRESHOLD = 0.25  # minimum detection confidence to keep a box
IMAGE_SIZE = 1024  # inference image size matching the checkpoint's training size

# Figure-related class labels in DocLayout-YOLO
FIGURE_CLASSES = ['figure', 'picture', 'chart', 'diagram', 'graph', 'plot']
44
+
45
def setup_directories():
    """Ensure the figure output directory exists."""
    out_dir = EXTRACTED_FIGURES_DIR
    out_dir.mkdir(exist_ok=True)
    print(f"Created output directory: {out_dir}")
49
+
50
def download_model():
    """Download the DocLayout-YOLO checkpoint if it is not already present.

    Returns:
        str: Path to the local model file.

    Exits the process (status 1) on download failure, since the rest of the
    pipeline cannot run without the model.
    """
    model_path = Path(MODEL_NAME)
    if model_path.exists():
        print(f"Model already exists: {MODEL_NAME}")
        return str(model_path)

    print("Downloading DocLayout-YOLO model...")
    model_url = "https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/resolve/main/doclayout_yolo_docstructbench_imgsz1024.pt"

    # Download to a temporary name first: an interrupted transfer must never
    # leave a partial file at MODEL_NAME, or the exists() check above would
    # treat the corrupt file as a valid model on the next run.
    tmp_path = model_path.with_suffix(model_path.suffix + ".part")
    try:
        # timeout prevents hanging forever on a stalled connection.
        response = requests.get(model_url, stream=True, timeout=60)
        response.raise_for_status()

        with open(tmp_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

        tmp_path.replace(model_path)  # atomic rename on POSIX
        print(f"Model downloaded successfully: {MODEL_NAME}")
        return str(model_path)
    except Exception as e:
        tmp_path.unlink(missing_ok=True)  # clean up any partial download
        print(f"Error downloading model: {e}")
        print("Make sure you have internet connection for model download")
        sys.exit(1)
74
+
75
def load_model():
    """Load the DocLayout-YOLO model, downloading the checkpoint first if needed."""
    print("Loading DocLayout-YOLO model...")
    try:
        checkpoint = download_model()  # fetches the weights on first run
        detector = YOLOv10(checkpoint)
        print(f"Model loaded successfully: {MODEL_NAME}")
        return detector
    except Exception as exc:
        print(f"Error loading model: {exc}")
        print("Make sure you have internet connection for model download")
        sys.exit(1)
88
+
89
def convert_pdf_to_images(pdf_path: Path) -> List[Image.Image]:
    """Render every page of *pdf_path* as a PIL image (200 dpi); [] on failure."""
    print(f"Converting PDF to images: {pdf_path.name}")
    try:
        pages = convert_from_path(pdf_path, dpi=200)
    except Exception as exc:
        print(f"Error converting PDF: {exc}")
        return []
    print(f"Converted {len(pages)} pages to images")
    return pages
99
+
100
def detect_figures(model, image: Image.Image) -> List[Dict[str, Any]]:
    """Run DocLayout-YOLO on one page image and return figure-class detections.

    Each detection dict carries class_name, confidence, bbox [x1, y1, x2, y2]
    in page-pixel coordinates, and the detection's index among all boxes.
    """
    # The model expects a numpy array rather than a PIL image.
    page_array = np.array(image)

    results = model.predict(
        page_array,
        imgsz=IMAGE_SIZE,
        conf=CONFIDENCE_THRESHOLD,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        verbose=False
    )

    figure_hits = []
    if not results or results[0].boxes is None:
        return figure_hits

    for box_idx, box in enumerate(results[0].boxes):
        label = model.names[int(box.cls[0])]
        # Keep only layout classes that represent figures/charts/diagrams.
        if label.lower() not in FIGURE_CLASSES:
            continue

        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
        figure_hits.append({
            'class_name': label,
            'confidence': float(box.conf[0]),
            'bbox': [float(x1), float(y1), float(x2), float(y2)],
            # NOTE: index within *all* detected boxes on the page, not just
            # the figure-class subset — preserved from the original behavior.
            'detection_id': box_idx
        })

    return figure_hits
136
+
137
def extract_and_save_figures(image: Image.Image, detections: List[Dict[str, Any]],
                             page_num: int, pdf_name: str) -> List[Dict[str, Any]]:
    """Crop each detected region out of the page image and save it as a PNG.

    Returns one metadata dict per saved figure (filename, 1-based page number,
    class, confidence, bbox, crop size, and source PDF name).
    """
    saved = []

    for fig_idx, det in enumerate(detections):
        left, top, right, bottom = det['bbox']

        # Cut the detected region out of the full-page render.
        crop = image.crop((left, top, right, bottom))

        # e.g. "report_page3_figure1_chart.png"
        out_name = f"{pdf_name}_page{page_num+1}_figure{fig_idx+1}_{det['class_name']}.png"
        out_path = EXTRACTED_FIGURES_DIR / out_name
        crop.save(out_path, "PNG")

        saved.append({
            'filename': out_name,
            'page_number': page_num + 1,
            'class_name': det['class_name'],
            'confidence': det['confidence'],
            'bbox': det['bbox'],
            'image_size': crop.size,
            'pdf_name': pdf_name
        })
        print(f" Saved figure: {out_name} (confidence: {det['confidence']:.3f})")

    return saved
170
+
171
def process_pdf(pdf_path: Path, model) -> Dict[str, Any]:
    """Extract all figures from one PDF and write a metadata JSON next to them.

    Args:
        pdf_path: PDF file to process.
        model: Loaded DocLayout-YOLO model (see load_model()).

    Returns:
        The metadata dict that was written to disk, or {'error': ...} if the
        PDF could not be rendered to images.
    """
    # Local import keeps this fix self-contained in the function.
    from datetime import datetime, timezone

    print(f"\n{'='*60}")
    print(f"Processing PDF: {pdf_path.name}")
    print(f"{'='*60}")

    # Convert PDF to images
    images = convert_pdf_to_images(pdf_path)
    if not images:
        return {'error': 'Failed to convert PDF to images'}

    pdf_name = pdf_path.stem
    all_figures = []
    total_pages = len(images)

    # Process each page
    for page_num, image in enumerate(images):
        print(f"\nProcessing page {page_num + 1}/{total_pages}...")

        # Detect figures on this page
        detections = detect_figures(model, image)

        if detections:
            print(f" Found {len(detections)} figures on page {page_num + 1}")
            # Extract and save figures
            saved_figures = extract_and_save_figures(image, detections, page_num, pdf_name)
            all_figures.extend(saved_figures)
        else:
            print(f" No figures detected on page {page_num + 1}")

    # Save metadata.
    # Bug fix: the original stored str(Path().cwd()) — the working directory,
    # not a timestamp — under 'processing_timestamp'. Record a real time.
    metadata = {
        'pdf_name': pdf_name,
        'pdf_path': str(pdf_path),
        'total_pages': total_pages,
        'total_figures': len(all_figures),
        'figures': all_figures,
        'processing_timestamp': datetime.now(timezone.utc).isoformat(),
        'model_used': MODEL_NAME,
        'confidence_threshold': CONFIDENCE_THRESHOLD
    }

    metadata_filename = f"{pdf_name}_metadata.json"
    metadata_path = EXTRACTED_FIGURES_DIR / metadata_filename

    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"\nSummary for {pdf_name}:")
    print(f" Pages processed: {total_pages}")
    print(f" Figures extracted: {len(all_figures)}")
    print(f" Metadata saved: {metadata_filename}")

    return metadata
225
+
226
def main():
    """CLI entry point: extract figures from one PDF or every PDF in uploaded_pdfs/."""
    parser = argparse.ArgumentParser(description='Extract figures from PDFs using DocLayout-YOLO')
    parser.add_argument('pdf_path', nargs='?', help='Path to specific PDF file (optional)')
    args = parser.parse_args()

    print("Vision Model Figure Extraction Test Script")
    print("=" * 50)

    # Prepare output dir and detection model.
    setup_directories()
    model = load_model()

    # Build the work list: a single explicit PDF, or everything in the drop dir.
    if args.pdf_path:
        target = Path(args.pdf_path)
        if not target.exists():
            print(f"Error: PDF file not found: {target}")
            sys.exit(1)
        pdf_files = [target]
    else:
        if not UPLOADED_PDFS_DIR.exists():
            print(f"Error: Directory not found: {UPLOADED_PDFS_DIR}")
            sys.exit(1)

        pdf_files = list(UPLOADED_PDFS_DIR.glob("*.pdf"))
        if not pdf_files:
            print(f"No PDF files found in {UPLOADED_PDFS_DIR}")
            sys.exit(1)

        print(f"Found {len(pdf_files)} PDF files to process")

    # Process each PDF, keeping only the successful runs for the summary.
    successful = []
    total_figures = 0
    for pdf_file in pdf_files:
        outcome = process_pdf(pdf_file, model)
        if 'error' not in outcome:
            successful.append(outcome)
            total_figures += outcome['total_figures']

    # Final summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"PDFs processed: {len(successful)}")
    print(f"Total figures extracted: {total_figures}")
    print(f"Output directory: {EXTRACTED_FIGURES_DIR}")

    if total_figures > 0:
        print(f"\nExtracted figures are saved in: {EXTRACTED_FIGURES_DIR}")
        print("Each PDF has a corresponding metadata JSON file with detailed information.")

    print("\nIntegration Notes:")
    print("- Modify extract_text_by_page() in main.py to include figure extraction")
    print("- Store figure embeddings in Qdrant using CLIP or similar vision encoders")
    print("- Implement multimodal retrieval for combined text + figure search")

if __name__ == "__main__":
    main()