Ara Yeroyan committed on
Commit
06faccd
·
1 Parent(s): 6f5999e

fix gemini chunk extraction

Browse files
src/agents/gemini_chatbot.py CHANGED
@@ -145,11 +145,11 @@ class GeminiRAGChatbot:
145
  return state
146
 
147
  def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
148
- """Enhance Gemini response to include document references"""
149
  if not sources or not answer:
150
  return answer
151
 
152
- # Use LLM to intelligently add document references
153
  try:
154
  from src.llm.adapters import get_llm_client
155
  llm = get_llm_client()
@@ -163,8 +163,17 @@ class GeminiRAGChatbot:
163
  filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
164
  year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
165
  source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
 
166
 
167
- doc_summaries.append(f"[Doc {idx}] {filename} ({year}, {source}): {content[:300]}...")
 
 
 
 
 
 
 
 
168
 
169
  prompt = f"""You are enhancing a response from a document search system. The original response is:
170
 
@@ -175,34 +184,42 @@ The following documents were retrieved and used to generate this response:
175
  {chr(10).join(doc_summaries)}
176
 
177
  CRITICAL RULES:
178
- 1. The response should ONLY contain information from the retrieved documents listed above
179
- 2. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
180
- 3. Add document references [Doc i] at the end of sentences that use information from specific documents
181
- 4. Only reference documents that are actually used in the response
182
- 5. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
183
- 6. Keep the response natural and conversational
184
- 7. Don't change the core content that matches the documents, just add references where appropriate
185
- 8. If multiple documents support the same claim, use [Doc i, Doc j] format
186
- 9. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
 
 
187
 
188
- Return ONLY the enhanced response with references added and any corrections made. Do not include any explanation or meta-commentary."""
189
 
190
  enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
191
 
192
- # Fallback: if LLM fails, just return original
193
  if not enhanced or len(enhanced) < len(answer) * 0.5:
194
- logger.warning("LLM enhancement failed, using original response")
195
- return answer
 
 
 
 
 
196
 
197
  return enhanced
198
 
199
  except Exception as e:
200
  logger.warning(f"Failed to enhance response with references: {e}")
201
- # Fallback: add basic references at the end
 
202
  if sources:
203
  ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
204
- return f"{answer}\n\n*Based on documents: {ref_list}*"
205
- return answer
206
 
207
  def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
208
  """Extract UI filters from query if present"""
@@ -303,8 +320,10 @@ Return ONLY the enhanced response with references added and any corrections made
303
 
304
  # Format sources for display
305
  sources = []
306
- if final_state.get("gemini_result"):
307
- sources = self.gemini_client.format_sources_for_display(final_state["gemini_result"])
 
 
308
 
309
  return {
310
  'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
@@ -313,7 +332,8 @@ Return ONLY the enhanced response with references added and any corrections made
313
  'answer': final_state["final_response"]
314
  },
315
  'agent_logs': final_state["agent_logs"],
316
- 'actual_rag_query': final_state["current_query"]
 
317
  }
318
 
319
  def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
 
145
  return state
146
 
147
  def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
148
+ """Enhance Gemini response to include document references and format nicely"""
149
  if not sources or not answer:
150
  return answer
151
 
152
+ # Use LLM to intelligently add document references and format nicely
153
  try:
154
  from src.llm.adapters import get_llm_client
155
  llm = get_llm_client()
 
163
  filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
164
  year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
165
  source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
166
+ district = metadata.get('district', '') if isinstance(metadata, dict) else ''
167
 
168
+ doc_info = f"{filename}"
169
+ if year and year != 'Unknown':
170
+ doc_info += f" ({year})"
171
+ if source and source != 'Unknown':
172
+ doc_info += f" - {source}"
173
+ if district:
174
+ doc_info += f" - {district}"
175
+
176
+ doc_summaries.append(f"[Doc {idx}] {doc_info}: {content[:300]}...")
177
 
178
  prompt = f"""You are enhancing a response from a document search system. The original response is:
179
 
 
184
  {chr(10).join(doc_summaries)}
185
 
186
  CRITICAL RULES:
187
+ 1. Format the response nicely with proper paragraphs, bullet points, or structured sections where appropriate
188
+ 2. The response should ONLY contain information from the retrieved documents listed above
189
+ 3. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
190
+ 4. Add document references [Doc i] at the end of sentences that use information from specific documents
191
+ 5. Only reference documents that are actually used in the response
192
+ 6. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
193
+ 7. Keep the response natural, conversational, and well-formatted
194
+ 8. Use proper formatting: paragraphs, line breaks, and structure for readability
195
+ 9. Don't change the core content that matches the documents, just add references where appropriate and improve formatting
196
+ 10. If multiple documents support the same claim, use [Doc i, Doc j] format
197
+ 11. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
198
 
199
+ Return ONLY the enhanced, well-formatted response with references added and any corrections made. Do not include any explanation or meta-commentary."""
200
 
201
  enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
202
 
203
+ # Fallback: if LLM fails, just return original with basic formatting
204
  if not enhanced or len(enhanced) < len(answer) * 0.5:
205
+ logger.warning("LLM enhancement failed, using original response with basic formatting")
206
+ # Basic formatting: add line breaks after periods for readability
207
+ formatted = answer.replace('. ', '.\n\n')
208
+ if sources:
209
+ ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
210
+ formatted += f"\n\n*Based on documents: {ref_list}*"
211
+ return formatted
212
 
213
  return enhanced
214
 
215
  except Exception as e:
216
  logger.warning(f"Failed to enhance response with references: {e}")
217
+ # Fallback: add basic formatting and references at the end
218
+ formatted = answer.replace('. ', '.\n\n') # Basic paragraph formatting
219
  if sources:
220
  ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
221
+ formatted += f"\n\n*Based on documents: {ref_list}*"
222
+ return formatted
223
 
224
  def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
225
  """Extract UI filters from query if present"""
 
320
 
321
  # Format sources for display
322
  sources = []
323
+ gemini_result = final_state.get("gemini_result")
324
+ if gemini_result:
325
+ sources = self.gemini_client.format_sources_for_display(gemini_result)
326
+ logger.info(f"📋 GEMINI CHAT: Formatted {len(sources)} sources for display")
327
 
328
  return {
329
  'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
 
332
  'answer': final_state["final_response"]
333
  },
334
  'agent_logs': final_state["agent_logs"],
335
+ 'actual_rag_query': final_state["current_query"],
336
+ 'gemini_result': gemini_result # Include raw result for tracking
337
  }
338
 
339
  def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
src/agents/multi_agent_chatbot.py CHANGED
@@ -1222,7 +1222,7 @@ Generate a conversational response with proper document references:""")
1222
  doc_sources.add(str(metadata['source']))
1223
 
1224
  # Correct misspellings in response using correct names from documents
1225
- response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
1226
 
1227
  # Check if response mentions years not in documents
1228
  year_pattern = r'\b(20\d{2})\b'
@@ -1252,27 +1252,6 @@ Generate a conversational response with proper document references:""")
1252
 
1253
  return response
1254
 
1255
- def _correct_misspellings_in_response(self, response: str, correct_districts: set, correct_sources: set) -> str:
1256
- """Correct common misspellings in response using correct names from documents."""
1257
- # Common misspelling mappings (e.g., "Kalagala" -> "Kalangala")
1258
- # We'll use fuzzy matching if needed, but first try direct corrections
1259
-
1260
- corrected = response
1261
-
1262
- # Correct district names
1263
- for correct_district in correct_districts:
1264
- # Try common misspellings
1265
- if correct_district.lower() == "kalangala":
1266
- # Replace "Kalagala" (missing 'n') with "Kalangala"
1267
- corrected = re.sub(r'\bKalagala\b', 'Kalangala', corrected, flags=re.IGNORECASE)
1268
- # Add more common misspellings as needed
1269
- # For now, we rely on the LLM to use correct names from the prompt
1270
-
1271
- # Correct source names if needed
1272
- # Add source corrections as needed in the future
1273
-
1274
- return corrected
1275
-
1276
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1277
  """Generate conversational response using only LLM knowledge and conversation history"""
1278
  logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
 
1222
  doc_sources.add(str(metadata['source']))
1223
 
1224
  # Correct misspellings in response using correct names from documents
1225
+ # response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
1226
 
1227
  # Check if response mentions years not in documents
1228
  year_pattern = r'\b(20\d{2})\b'
 
1252
 
1253
  return response
1254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1256
  """Generate conversational response using only LLM knowledge and conversation history"""
1257
  logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
src/gemini/file_search.py CHANGED
@@ -46,10 +46,20 @@ class GeminiFileSearchClient:
46
  if not self.api_key:
47
  raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
48
 
49
- self.store_name = store_name or os.getenv("GEMINI_FILESTORE_NAME")
50
- if not self.store_name:
51
  raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
52
 
 
 
 
 
 
 
 
 
 
 
53
  self.client = genai.Client(api_key=self.api_key)
54
  self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
55
 
@@ -95,15 +105,32 @@ class GeminiFileSearchClient:
95
  filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
96
 
97
  # Combine query with filter context
98
- # Add explicit instruction to only use information from retrieved documents
99
- instruction = "\n\nIMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents. If the retrieved documents don't contain the requested information, clearly state that.\n\n"
100
- full_query = query + filter_context + instruction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  try:
103
  # Generate content with file search
104
  # Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
 
 
 
105
  try:
106
- # Try the documented format first
107
  response = self.client.models.generate_content(
108
  model=model,
109
  contents=full_query,
@@ -111,27 +138,53 @@ class GeminiFileSearchClient:
111
  tools=[
112
  types.Tool(
113
  file_search=types.FileSearch(
114
- file_search_store_names=[self.store_name]
115
  )
116
  )
117
  ]
118
  )
119
  )
120
- except (AttributeError, TypeError) as e:
121
- # Fallback: try alternative format
122
- logger.warning(f"Primary API format failed, trying alternative: {e}")
123
- try:
124
- response = self.client.models.generate_content(
125
- model=model,
126
- contents=full_query,
127
- tools=[{
128
- "file_search": {
129
- "file_search_store_names": [self.store_name]
130
- }
131
- }]
132
- )
133
- except Exception as e2:
134
- raise Exception(f"Failed to call Gemini API: {e2}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # Extract answer
137
  answer = ""
@@ -156,23 +209,36 @@ class GeminiFileSearchClient:
156
  sources = []
157
  grounding_metadata = None
158
 
 
 
159
  if hasattr(response, 'candidates') and response.candidates:
160
  candidate = response.candidates[0]
 
161
 
162
  # Get grounding metadata
163
  if hasattr(candidate, 'grounding_metadata'):
164
  grounding_metadata = candidate.grounding_metadata
 
165
 
166
  # Extract source documents from grounding metadata
167
  # Handle different response formats
168
  grounding_chunks = None
169
  if hasattr(grounding_metadata, 'grounding_chunks'):
170
  grounding_chunks = grounding_metadata.grounding_chunks
 
171
  elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
172
  grounding_chunks = grounding_metadata['grounding_chunks']
 
 
 
 
 
 
 
173
 
174
  if grounding_chunks:
175
- for chunk in grounding_chunks:
 
176
  # Handle both object and dict formats
177
  try:
178
  if isinstance(chunk, dict):
@@ -196,6 +262,46 @@ class GeminiFileSearchClient:
196
  text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
197
  file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  score_data = chunk_data.get('relevance_score', {})
200
  score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
201
 
@@ -204,11 +310,43 @@ class GeminiFileSearchClient:
204
  "content": text,
205
  "filename": file_name,
206
  "score": score,
 
207
  }
208
  sources.append(source_info)
 
209
  except Exception as e:
210
- logger.warning(f"Error extracting chunk info: {e}")
 
 
211
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  return GeminiFileSearchResult(
214
  answer=answer,
@@ -236,21 +374,54 @@ class GeminiFileSearchClient:
236
  formatted_sources = []
237
 
238
  for i, source in enumerate(result.sources):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # Create a Document object compatible with existing code
240
  doc = Document(
241
  page_content=source.get("content", ""),
242
  metadata={
243
- "filename": source.get("filename", "Unknown"),
244
- "source": "Gemini File Search",
245
  "score": source.get("score"),
246
  "chunk_index": i,
247
- # Add default fields that might be expected
248
- "page": None,
249
- "year": None,
250
- "district": None,
 
251
  }
252
  )
253
  formatted_sources.append(doc)
 
254
 
 
255
  return formatted_sources
256
 
 
46
  if not self.api_key:
47
  raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
48
 
49
+ store_name_raw = store_name or os.getenv("GEMINI_FILESTORE_NAME")
50
+ if not store_name_raw:
51
  raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
52
 
53
+ # Normalize store name: API expects the FULL path format (fileSearchStores/xxx)
54
+ # If just the ID is provided, construct the full path
55
+ if store_name_raw.startswith("fileSearchStores/"):
56
+ self.store_name = store_name_raw # Already full path
57
+ else:
58
+ # Just the ID provided, construct full path
59
+ self.store_name = f"fileSearchStores/{store_name_raw}"
60
+
61
+ logger.info(f"📦 Using file search store: {self.store_name}")
62
+
63
  self.client = genai.Client(api_key=self.api_key)
64
  self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
65
 
 
105
  filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
106
 
107
  # Combine query with filter context
108
+ # Add comprehensive system instructions similar to multi-agent system
109
+ system_instructions = """You are a helpful audit report assistant specialized in analyzing government audit reports from Uganda's Office of the Auditor General.
110
+
111
+ CRITICAL RULES:
112
+ 1. **NO HALLUCINATION**: Only use information that is explicitly stated in the retrieved documents. Do not make up facts, numbers, or details.
113
+ 2. **Document References**: Always cite which documents you're using with [Doc i] references at the end of sentences that use specific information.
114
+ 3. **Formatting**: Structure your response with clear paragraphs, bullet points, or sections for readability.
115
+ 4. **Accuracy**: If the retrieved documents don't contain the requested information, explicitly state "The retrieved documents do not contain information about [topic]."
116
+ 5. **Years and Data**: Pay careful attention to years mentioned in documents. If a user asks about a specific year but documents show different years, explicitly state this.
117
+ 6. **District/Source Names**: Use the exact district and source names as they appear in the document metadata (e.g., "Kalangala" not "Kalagala").
118
+ 7. **Financial Data**: When providing financial figures, include the currency (UGX) and be precise about amounts.
119
+ 8. **Conversational Tone**: Be helpful, clear, and conversational while maintaining accuracy.
120
+
121
+ IMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents."""
122
+
123
+ # Combine system instructions with query
124
+ full_query = f"{system_instructions}\n\nUser Question: {query}{filter_context}\n\nPlease provide a detailed, well-formatted response with proper document references."
125
 
126
  try:
127
  # Generate content with file search
128
  # Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
129
+ # Try with full path format first, then fallback to just ID if needed
130
+ store_name_to_try = self.store_name
131
+
132
  try:
133
+ # Try the documented format first with full path
134
  response = self.client.models.generate_content(
135
  model=model,
136
  contents=full_query,
 
138
  tools=[
139
  types.Tool(
140
  file_search=types.FileSearch(
141
+ file_search_store_names=[store_name_to_try]
142
  )
143
  )
144
  ]
145
  )
146
  )
147
+ except Exception as api_error:
148
+ error_str = str(api_error).lower()
149
+ # If format error, try with just the ID (without fileSearchStores/ prefix)
150
+ if 'format' in error_str or 'invalid' in error_str or 'too long' in error_str:
151
+ logger.warning(f"Full path format failed, trying with just store ID: {api_error}")
152
+ # Extract just the ID part
153
+ if store_name_to_try.startswith("fileSearchStores/"):
154
+ store_id = store_name_to_try.split("/", 1)[1]
155
+ store_name_to_try = store_id
156
+
157
+ try:
158
+ response = self.client.models.generate_content(
159
+ model=model,
160
+ contents=full_query,
161
+ config=types.GenerateContentConfig(
162
+ tools=[
163
+ types.Tool(
164
+ file_search=types.FileSearch(
165
+ file_search_store_names=[store_name_to_try]
166
+ )
167
+ )
168
+ ]
169
+ )
170
+ )
171
+ except Exception as e2:
172
+ raise Exception(f"Failed to call Gemini API with both formats. Full path error: {api_error}, ID-only error: {e2}")
173
+ else:
174
+ # Try alternative dict format
175
+ logger.warning(f"Primary API format failed, trying alternative: {api_error}")
176
+ try:
177
+ response = self.client.models.generate_content(
178
+ model=model,
179
+ contents=full_query,
180
+ tools=[{
181
+ "file_search": {
182
+ "file_search_store_names": [store_name_to_try]
183
+ }
184
+ }]
185
+ )
186
+ except Exception as e2:
187
+ raise Exception(f"Failed to call Gemini API: {e2}")
188
 
189
  # Extract answer
190
  answer = ""
 
209
  sources = []
210
  grounding_metadata = None
211
 
212
+ logger.info(f"πŸ” Extracting sources from Gemini response...")
213
+
214
  if hasattr(response, 'candidates') and response.candidates:
215
  candidate = response.candidates[0]
216
+ logger.info(f" Found candidate, checking for grounding_metadata...")
217
 
218
  # Get grounding metadata
219
  if hasattr(candidate, 'grounding_metadata'):
220
  grounding_metadata = candidate.grounding_metadata
221
+ logger.info(f" Found grounding_metadata: {type(grounding_metadata)}")
222
 
223
  # Extract source documents from grounding metadata
224
  # Handle different response formats
225
  grounding_chunks = None
226
  if hasattr(grounding_metadata, 'grounding_chunks'):
227
  grounding_chunks = grounding_metadata.grounding_chunks
228
+ logger.info(f" Found grounding_chunks (attr): {len(grounding_chunks) if grounding_chunks else 0}")
229
  elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
230
  grounding_chunks = grounding_metadata['grounding_chunks']
231
+ logger.info(f" Found grounding_chunks (dict): {len(grounding_chunks) if grounding_chunks else 0}")
232
+ elif hasattr(grounding_metadata, '__dict__'):
233
+ # Try to access as object attributes
234
+ metadata_dict = grounding_metadata.__dict__
235
+ if 'grounding_chunks' in metadata_dict:
236
+ grounding_chunks = metadata_dict['grounding_chunks']
237
+ logger.info(f" Found grounding_chunks (__dict__): {len(grounding_chunks) if grounding_chunks else 0}")
238
 
239
  if grounding_chunks:
240
+ logger.info(f" Processing {len(grounding_chunks)} grounding chunks...")
241
+ for idx, chunk in enumerate(grounding_chunks):
242
  # Handle both object and dict formats
243
  try:
244
  if isinstance(chunk, dict):
 
262
  text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
263
  file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
264
 
265
+ # Try to extract file URI and parse metadata from it
266
+ file_uri = chunk_info.get('file_uri', '') if isinstance(chunk_info, dict) else ''
267
+
268
+ # Also check for 'web' attribute (GroundingChunkData structure)
269
+ if hasattr(chunk, 'web') and chunk.web:
270
+ web_data = chunk.web
271
+ file_uri = getattr(web_data, 'file_uri', '') or file_uri
272
+ file_name = getattr(web_data, 'title', '') or getattr(web_data, 'filename', '') or file_name
273
+ text = getattr(web_data, 'text', '') or getattr(web_data, 'content', '') or text
274
+
275
+ # Check retrieved_context - this is where the actual data seems to be!
276
+ if hasattr(chunk, 'retrieved_context') and chunk.retrieved_context:
277
+ rc = chunk.retrieved_context
278
+ # Get text content
279
+ if hasattr(rc, 'text'):
280
+ text = getattr(rc, 'text', '') or text
281
+ # Get document name
282
+ if hasattr(rc, 'document_name'):
283
+ doc_name = getattr(rc, 'document_name', '')
284
+ if doc_name:
285
+ file_name = doc_name or file_name
286
+
287
+ # Fallback: Parse from string representation if we still don't have filename
288
+ if not file_name:
289
+ chunk_str = str(chunk)
290
+ import re
291
+ # Look for PDF filenames
292
+ pdf_match = re.search(r"([A-Za-z0-9\s_-]+\.pdf)", chunk_str)
293
+ if pdf_match:
294
+ file_name = pdf_match.group(1)
295
+ # Or look for title= pattern
296
+ if not file_name and 'title=' in chunk_str:
297
+ title_match = re.search(r"title=['\"]([^'\"]+)['\"]", chunk_str)
298
+ if title_match:
299
+ file_name = title_match.group(1)
300
+
301
+ if not file_name and file_uri:
302
+ # Extract filename from URI if available
303
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
304
+
305
  score_data = chunk_data.get('relevance_score', {})
306
  score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
307
 
 
310
  "content": text,
311
  "filename": file_name,
312
  "score": score,
313
+ "file_uri": file_uri,
314
  }
315
  sources.append(source_info)
316
+ logger.info(f"📄 Extracted source {idx+1}: {file_name} (score: {score:.3f}, content length: {len(text)})")
317
  except Exception as e:
318
+ logger.warning(f"Error extracting chunk {idx+1} info: {e}")
319
+ import traceback
320
+ logger.debug(traceback.format_exc())
321
  continue
322
+ else:
323
+ logger.warning(f" No grounding_chunks found in grounding_metadata")
324
+ else:
325
+ logger.warning(f" Candidate does not have grounding_metadata attribute")
326
+
327
+ # Also try to get file references from other parts of the response
328
+ # Sometimes Gemini includes file references in the response itself
329
+ if not sources or len(sources) == 0:
330
+ logger.info(f" No sources from grounding_metadata, trying alternative extraction...")
331
+ # Check if response has file references in other attributes
332
+ if hasattr(candidate, 'content') and candidate.content:
333
+ if hasattr(candidate.content, 'parts'):
334
+ for part in candidate.content.parts:
335
+ if hasattr(part, 'file_data'):
336
+ file_data = part.file_data
337
+ if hasattr(file_data, 'file_uri') or (isinstance(file_data, dict) and 'file_uri' in file_data):
338
+ file_uri = getattr(file_data, 'file_uri', None) or (file_data.get('file_uri') if isinstance(file_data, dict) else None)
339
+ if file_uri:
340
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
341
+ sources.append({
342
+ "content": "",
343
+ "filename": file_name,
344
+ "score": 0.0,
345
+ "file_uri": file_uri,
346
+ })
347
+ logger.info(f"📄 Extracted source from file_data: {file_name}")
348
+
349
+ logger.info(f"✅ Total sources extracted: {len(sources)}")
350
 
351
  return GeminiFileSearchResult(
352
  answer=answer,
 
374
  formatted_sources = []
375
 
376
  for i, source in enumerate(result.sources):
377
+ filename = source.get("filename", "Unknown")
378
+
379
+ # Try to extract metadata from filename (e.g., "Kalangala DLG Report of Auditor General 2021.pdf")
380
+ year = None
381
+ district = None
382
+ source_name = "Gemini File Search"
383
+
384
+ # Parse filename for year
385
+ import re
386
+ year_match = re.search(r'\b(20\d{2})\b', filename)
387
+ if year_match:
388
+ year = int(year_match.group(1))
389
+
390
+ # Parse filename for district/source
391
+ if "Kalangala" in filename:
392
+ district = "Kalangala"
393
+ source_name = "Kalangala DLG"
394
+ elif "Gulu" in filename:
395
+ district = "Gulu"
396
+ source_name = "Gulu DLG"
397
+ elif "KCCA" in filename:
398
+ district = "Kampala"
399
+ source_name = "KCCA"
400
+ elif "MAAIF" in filename:
401
+ source_name = "MAAIF"
402
+ elif "MWTS" in filename:
403
+ source_name = "MWTS"
404
+ elif "Consolidated" in filename:
405
+ source_name = "Consolidated"
406
+
407
  # Create a Document object compatible with existing code
408
  doc = Document(
409
  page_content=source.get("content", ""),
410
  metadata={
411
+ "filename": filename,
412
+ "source": source_name,
413
  "score": source.get("score"),
414
  "chunk_index": i,
415
+ "page": None, # Gemini doesn't provide page numbers
416
+ "year": year,
417
+ "district": district,
418
+ "chunk_id": f"gemini_{i}",
419
+ "_id": f"gemini_{i}",
420
  }
421
  )
422
  formatted_sources.append(doc)
423
+ logger.info(f"📋 Formatted source {i+1}: {filename} ({year}, {source_name})")
424
 
425
+ logger.info(f"✅ Formatted {len(formatted_sources)} sources for display")
426
  return formatted_sources
427
 
src/ui_components/components.py CHANGED
@@ -57,7 +57,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
57
  color_continuous_scale='viridis'
58
  )
59
  fig_source.update_layout(height=400, showlegend=False)
60
- st.plotly_chart(fig_source, use_container_width=True)
61
 
62
  with col2:
63
  # Year distribution chart
@@ -84,7 +84,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
84
  # Ensure years are treated as categorical (discrete) not continuous
85
  fig_year.update_xaxes(type='category')
86
  fig_year.update_layout(height=400, showlegend=False)
87
- st.plotly_chart(fig_year, use_container_width=True)
88
  else:
89
  st.info("No valid years found in the results")
90
 
@@ -109,7 +109,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
109
  color_continuous_scale='blues'
110
  )
111
  fig_district.update_layout(height=400, showlegend=False)
112
- st.plotly_chart(fig_district, use_container_width=True)
113
  else:
114
  st.info("No valid districts found in the results")
115
 
@@ -144,7 +144,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
144
  "Count": list(district_dist_filtered.values())
145
  }
146
  district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
147
- st.dataframe(district_df, hide_index=True, use_container_width=True)
148
  else:
149
  st.write("No district data")
150
  else:
@@ -158,7 +158,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
158
  "Count": list(stats['source_distribution'].values())
159
  }
160
  source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
161
- st.dataframe(source_df, hide_index=True, use_container_width=True)
162
  else:
163
  st.write("No source data")
164
 
@@ -175,7 +175,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
175
  # Sort by year as integer but display as string
176
  year_df['Year_Int'] = year_df['Year'].astype(int)
177
  year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
178
- st.dataframe(year_df, hide_index=True, use_container_width=True)
179
  else:
180
  st.write("No year data")
181
  else:
@@ -193,7 +193,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
193
  "Count": [c for f, c in filename_items[:5]]
194
  }
195
  file_df = pd.DataFrame(file_data)
196
- st.dataframe(file_df, hide_index=True, use_container_width=True)
197
  else:
198
  st.write("No file data")
199
 
 
57
  color_continuous_scale='viridis'
58
  )
59
  fig_source.update_layout(height=400, showlegend=False)
60
+ st.plotly_chart(fig_source, use_container_width=True) # Note: plotly_chart still uses use_container_width
61
 
62
  with col2:
63
  # Year distribution chart
 
84
  # Ensure years are treated as categorical (discrete) not continuous
85
  fig_year.update_xaxes(type='category')
86
  fig_year.update_layout(height=400, showlegend=False)
87
+ st.plotly_chart(fig_year, use_container_width=True) # Note: plotly_chart still uses use_container_width
88
  else:
89
  st.info("No valid years found in the results")
90
 
 
109
  color_continuous_scale='blues'
110
  )
111
  fig_district.update_layout(height=400, showlegend=False)
112
+ st.plotly_chart(fig_district, use_container_width=True) # Note: plotly_chart still uses use_container_width
113
  else:
114
  st.info("No valid districts found in the results")
115
 
 
144
  "Count": list(district_dist_filtered.values())
145
  }
146
  district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
147
+ st.dataframe(district_df, hide_index=True, width='stretch')
148
  else:
149
  st.write("No district data")
150
  else:
 
158
  "Count": list(stats['source_distribution'].values())
159
  }
160
  source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
161
+ st.dataframe(source_df, hide_index=True, width='stretch')
162
  else:
163
  st.write("No source data")
164
 
 
175
  # Sort by year as integer but display as string
176
  year_df['Year_Int'] = year_df['Year'].astype(int)
177
  year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
178
+ st.dataframe(year_df, hide_index=True, width='stretch')
179
  else:
180
  st.write("No year data")
181
  else:
 
193
  "Count": [c for f, c in filename_items[:5]]
194
  }
195
  file_df = pd.DataFrame(file_data)
196
+ st.dataframe(file_df, hide_index=True, width='stretch')
197
  else:
198
  st.write("No file data")
199