Spaces:
Sleeping
Sleeping
Ara Yeroyan
committed on
Commit
·
06faccd
1
Parent(s):
6f5999e
fix gemini chunk extraction
Browse files
- src/agents/gemini_chatbot.py +42 -22
- src/agents/multi_agent_chatbot.py +1 -22
- src/gemini/file_search.py +201 -30
- src/ui_components/components.py +7 -7
src/agents/gemini_chatbot.py
CHANGED
|
@@ -145,11 +145,11 @@ class GeminiRAGChatbot:
|
|
| 145 |
return state
|
| 146 |
|
| 147 |
def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
|
| 148 |
-
"""Enhance Gemini response to include document references"""
|
| 149 |
if not sources or not answer:
|
| 150 |
return answer
|
| 151 |
|
| 152 |
-
# Use LLM to intelligently add document references
|
| 153 |
try:
|
| 154 |
from src.llm.adapters import get_llm_client
|
| 155 |
llm = get_llm_client()
|
|
@@ -163,8 +163,17 @@ class GeminiRAGChatbot:
|
|
| 163 |
filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
| 164 |
year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
| 165 |
source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
|
|
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
prompt = f"""You are enhancing a response from a document search system. The original response is:
|
| 170 |
|
|
@@ -175,34 +184,42 @@ The following documents were retrieved and used to generate this response:
|
|
| 175 |
{chr(10).join(doc_summaries)}
|
| 176 |
|
| 177 |
CRITICAL RULES:
|
| 178 |
-
1.
|
| 179 |
-
2.
|
| 180 |
-
3.
|
| 181 |
-
4.
|
| 182 |
-
5.
|
| 183 |
-
6.
|
| 184 |
-
7.
|
| 185 |
-
8.
|
| 186 |
-
9.
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
Return ONLY the enhanced response with references added and any corrections made. Do not include any explanation or meta-commentary."""
|
| 189 |
|
| 190 |
enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
|
| 191 |
|
| 192 |
-
# Fallback: if LLM fails, just return original
|
| 193 |
if not enhanced or len(enhanced) < len(answer) * 0.5:
|
| 194 |
-
logger.warning("LLM enhancement failed, using original response")
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
return enhanced
|
| 198 |
|
| 199 |
except Exception as e:
|
| 200 |
logger.warning(f"Failed to enhance response with references: {e}")
|
| 201 |
-
# Fallback: add basic references at the end
|
|
|
|
| 202 |
if sources:
|
| 203 |
ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
|
| 204 |
-
|
| 205 |
-
return
|
| 206 |
|
| 207 |
def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
|
| 208 |
"""Extract UI filters from query if present"""
|
|
@@ -303,8 +320,10 @@ Return ONLY the enhanced response with references added and any corrections made
|
|
| 303 |
|
| 304 |
# Format sources for display
|
| 305 |
sources = []
|
| 306 |
-
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
|
| 309 |
return {
|
| 310 |
'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
|
|
@@ -313,7 +332,8 @@ Return ONLY the enhanced response with references added and any corrections made
|
|
| 313 |
'answer': final_state["final_response"]
|
| 314 |
},
|
| 315 |
'agent_logs': final_state["agent_logs"],
|
| 316 |
-
'actual_rag_query': final_state["current_query"]
|
|
|
|
| 317 |
}
|
| 318 |
|
| 319 |
def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
|
|
|
|
| 145 |
return state
|
| 146 |
|
| 147 |
def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
|
| 148 |
+
"""Enhance Gemini response to include document references and format nicely"""
|
| 149 |
if not sources or not answer:
|
| 150 |
return answer
|
| 151 |
|
| 152 |
+
# Use LLM to intelligently add document references and format nicely
|
| 153 |
try:
|
| 154 |
from src.llm.adapters import get_llm_client
|
| 155 |
llm = get_llm_client()
|
|
|
|
| 163 |
filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
| 164 |
year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
| 165 |
source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
|
| 166 |
+
district = metadata.get('district', '') if isinstance(metadata, dict) else ''
|
| 167 |
|
| 168 |
+
doc_info = f"{filename}"
|
| 169 |
+
if year and year != 'Unknown':
|
| 170 |
+
doc_info += f" ({year})"
|
| 171 |
+
if source and source != 'Unknown':
|
| 172 |
+
doc_info += f" - {source}"
|
| 173 |
+
if district:
|
| 174 |
+
doc_info += f" - {district}"
|
| 175 |
+
|
| 176 |
+
doc_summaries.append(f"[Doc {idx}] {doc_info}: {content[:300]}...")
|
| 177 |
|
| 178 |
prompt = f"""You are enhancing a response from a document search system. The original response is:
|
| 179 |
|
|
|
|
| 184 |
{chr(10).join(doc_summaries)}
|
| 185 |
|
| 186 |
CRITICAL RULES:
|
| 187 |
+
1. Format the response nicely with proper paragraphs, bullet points, or structured sections where appropriate
|
| 188 |
+
2. The response should ONLY contain information from the retrieved documents listed above
|
| 189 |
+
3. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
|
| 190 |
+
4. Add document references [Doc i] at the end of sentences that use information from specific documents
|
| 191 |
+
5. Only reference documents that are actually used in the response
|
| 192 |
+
6. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
|
| 193 |
+
7. Keep the response natural, conversational, and well-formatted
|
| 194 |
+
8. Use proper formatting: paragraphs, line breaks, and structure for readability
|
| 195 |
+
9. Don't change the core content that matches the documents, just add references where appropriate and improve formatting
|
| 196 |
+
10. If multiple documents support the same claim, use [Doc i, Doc j] format
|
| 197 |
+
11. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
|
| 198 |
|
| 199 |
+
Return ONLY the enhanced, well-formatted response with references added and any corrections made. Do not include any explanation or meta-commentary."""
|
| 200 |
|
| 201 |
enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
|
| 202 |
|
| 203 |
+
# Fallback: if LLM fails, just return original with basic formatting
|
| 204 |
if not enhanced or len(enhanced) < len(answer) * 0.5:
|
| 205 |
+
logger.warning("LLM enhancement failed, using original response with basic formatting")
|
| 206 |
+
# Basic formatting: add line breaks after periods for readability
|
| 207 |
+
formatted = answer.replace('. ', '.\n\n')
|
| 208 |
+
if sources:
|
| 209 |
+
ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
|
| 210 |
+
formatted += f"\n\n*Based on documents: {ref_list}*"
|
| 211 |
+
return formatted
|
| 212 |
|
| 213 |
return enhanced
|
| 214 |
|
| 215 |
except Exception as e:
|
| 216 |
logger.warning(f"Failed to enhance response with references: {e}")
|
| 217 |
+
# Fallback: add basic formatting and references at the end
|
| 218 |
+
formatted = answer.replace('. ', '.\n\n') # Basic paragraph formatting
|
| 219 |
if sources:
|
| 220 |
ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
|
| 221 |
+
formatted += f"\n\n*Based on documents: {ref_list}*"
|
| 222 |
+
return formatted
|
| 223 |
|
| 224 |
def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
|
| 225 |
"""Extract UI filters from query if present"""
|
|
|
|
| 320 |
|
| 321 |
# Format sources for display
|
| 322 |
sources = []
|
| 323 |
+
gemini_result = final_state.get("gemini_result")
|
| 324 |
+
if gemini_result:
|
| 325 |
+
sources = self.gemini_client.format_sources_for_display(gemini_result)
|
| 326 |
+
logger.info(f"π GEMINI CHAT: Formatted {len(sources)} sources for display")
|
| 327 |
|
| 328 |
return {
|
| 329 |
'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
|
|
|
|
| 332 |
'answer': final_state["final_response"]
|
| 333 |
},
|
| 334 |
'agent_logs': final_state["agent_logs"],
|
| 335 |
+
'actual_rag_query': final_state["current_query"],
|
| 336 |
+
'gemini_result': gemini_result # Include raw result for tracking
|
| 337 |
}
|
| 338 |
|
| 339 |
def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
|
src/agents/multi_agent_chatbot.py
CHANGED
|
@@ -1222,7 +1222,7 @@ Generate a conversational response with proper document references:""")
|
|
| 1222 |
doc_sources.add(str(metadata['source']))
|
| 1223 |
|
| 1224 |
# Correct misspellings in response using correct names from documents
|
| 1225 |
-
response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
|
| 1226 |
|
| 1227 |
# Check if response mentions years not in documents
|
| 1228 |
year_pattern = r'\b(20\d{2})\b'
|
|
@@ -1252,27 +1252,6 @@ Generate a conversational response with proper document references:""")
|
|
| 1252 |
|
| 1253 |
return response
|
| 1254 |
|
| 1255 |
-
def _correct_misspellings_in_response(self, response: str, correct_districts: set, correct_sources: set) -> str:
|
| 1256 |
-
"""Correct common misspellings in response using correct names from documents."""
|
| 1257 |
-
# Common misspelling mappings (e.g., "Kalagala" -> "Kalangala")
|
| 1258 |
-
# We'll use fuzzy matching if needed, but first try direct corrections
|
| 1259 |
-
|
| 1260 |
-
corrected = response
|
| 1261 |
-
|
| 1262 |
-
# Correct district names
|
| 1263 |
-
for correct_district in correct_districts:
|
| 1264 |
-
# Try common misspellings
|
| 1265 |
-
if correct_district.lower() == "kalangala":
|
| 1266 |
-
# Replace "Kalagala" (missing 'n') with "Kalangala"
|
| 1267 |
-
corrected = re.sub(r'\bKalagala\b', 'Kalangala', corrected, flags=re.IGNORECASE)
|
| 1268 |
-
# Add more common misspellings as needed
|
| 1269 |
-
# For now, we rely on the LLM to use correct names from the prompt
|
| 1270 |
-
|
| 1271 |
-
# Correct source names if needed
|
| 1272 |
-
# Add source corrections as needed in the future
|
| 1273 |
-
|
| 1274 |
-
return corrected
|
| 1275 |
-
|
| 1276 |
def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
|
| 1277 |
"""Generate conversational response using only LLM knowledge and conversation history"""
|
| 1278 |
logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
|
|
|
|
| 1222 |
doc_sources.add(str(metadata['source']))
|
| 1223 |
|
| 1224 |
# Correct misspellings in response using correct names from documents
|
| 1225 |
+
# response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
|
| 1226 |
|
| 1227 |
# Check if response mentions years not in documents
|
| 1228 |
year_pattern = r'\b(20\d{2})\b'
|
|
|
|
| 1252 |
|
| 1253 |
return response
|
| 1254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1255 |
def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
|
| 1256 |
"""Generate conversational response using only LLM knowledge and conversation history"""
|
| 1257 |
logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
|
src/gemini/file_search.py
CHANGED
|
@@ -46,10 +46,20 @@ class GeminiFileSearchClient:
|
|
| 46 |
if not self.api_key:
|
| 47 |
raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
|
| 48 |
|
| 49 |
-
|
| 50 |
-
if not
|
| 51 |
raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
self.client = genai.Client(api_key=self.api_key)
|
| 54 |
self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
|
| 55 |
|
|
@@ -95,15 +105,32 @@ class GeminiFileSearchClient:
|
|
| 95 |
filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
|
| 96 |
|
| 97 |
# Combine query with filter context
|
| 98 |
-
# Add
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
# Generate content with file search
|
| 104 |
# Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
|
|
|
|
|
|
|
|
|
|
| 105 |
try:
|
| 106 |
-
# Try the documented format first
|
| 107 |
response = self.client.models.generate_content(
|
| 108 |
model=model,
|
| 109 |
contents=full_query,
|
|
@@ -111,27 +138,53 @@ class GeminiFileSearchClient:
|
|
| 111 |
tools=[
|
| 112 |
types.Tool(
|
| 113 |
file_search=types.FileSearch(
|
| 114 |
-
file_search_store_names=[
|
| 115 |
)
|
| 116 |
)
|
| 117 |
]
|
| 118 |
)
|
| 119 |
)
|
| 120 |
-
except
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# Extract answer
|
| 137 |
answer = ""
|
|
@@ -156,23 +209,36 @@ class GeminiFileSearchClient:
|
|
| 156 |
sources = []
|
| 157 |
grounding_metadata = None
|
| 158 |
|
|
|
|
|
|
|
| 159 |
if hasattr(response, 'candidates') and response.candidates:
|
| 160 |
candidate = response.candidates[0]
|
|
|
|
| 161 |
|
| 162 |
# Get grounding metadata
|
| 163 |
if hasattr(candidate, 'grounding_metadata'):
|
| 164 |
grounding_metadata = candidate.grounding_metadata
|
|
|
|
| 165 |
|
| 166 |
# Extract source documents from grounding metadata
|
| 167 |
# Handle different response formats
|
| 168 |
grounding_chunks = None
|
| 169 |
if hasattr(grounding_metadata, 'grounding_chunks'):
|
| 170 |
grounding_chunks = grounding_metadata.grounding_chunks
|
|
|
|
| 171 |
elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
|
| 172 |
grounding_chunks = grounding_metadata['grounding_chunks']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
if grounding_chunks:
|
| 175 |
-
|
|
|
|
| 176 |
# Handle both object and dict formats
|
| 177 |
try:
|
| 178 |
if isinstance(chunk, dict):
|
|
@@ -196,6 +262,46 @@ class GeminiFileSearchClient:
|
|
| 196 |
text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
|
| 197 |
file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
score_data = chunk_data.get('relevance_score', {})
|
| 200 |
score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
|
| 201 |
|
|
@@ -204,11 +310,43 @@ class GeminiFileSearchClient:
|
|
| 204 |
"content": text,
|
| 205 |
"filename": file_name,
|
| 206 |
"score": score,
|
|
|
|
| 207 |
}
|
| 208 |
sources.append(source_info)
|
|
|
|
| 209 |
except Exception as e:
|
| 210 |
-
logger.warning(f"Error extracting chunk info: {e}")
|
|
|
|
|
|
|
| 211 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
return GeminiFileSearchResult(
|
| 214 |
answer=answer,
|
|
@@ -236,21 +374,54 @@ class GeminiFileSearchClient:
|
|
| 236 |
formatted_sources = []
|
| 237 |
|
| 238 |
for i, source in enumerate(result.sources):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
# Create a Document object compatible with existing code
|
| 240 |
doc = Document(
|
| 241 |
page_content=source.get("content", ""),
|
| 242 |
metadata={
|
| 243 |
-
"filename":
|
| 244 |
-
"source":
|
| 245 |
"score": source.get("score"),
|
| 246 |
"chunk_index": i,
|
| 247 |
-
#
|
| 248 |
-
"
|
| 249 |
-
"
|
| 250 |
-
"
|
|
|
|
| 251 |
}
|
| 252 |
)
|
| 253 |
formatted_sources.append(doc)
|
|
|
|
| 254 |
|
|
|
|
| 255 |
return formatted_sources
|
| 256 |
|
|
|
|
| 46 |
if not self.api_key:
|
| 47 |
raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
|
| 48 |
|
| 49 |
+
store_name_raw = store_name or os.getenv("GEMINI_FILESTORE_NAME")
|
| 50 |
+
if not store_name_raw:
|
| 51 |
raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
|
| 52 |
|
| 53 |
+
# Normalize store name: API expects the FULL path format (fileSearchStores/xxx)
|
| 54 |
+
# If just the ID is provided, construct the full path
|
| 55 |
+
if store_name_raw.startswith("fileSearchStores/"):
|
| 56 |
+
self.store_name = store_name_raw # Already full path
|
| 57 |
+
else:
|
| 58 |
+
# Just the ID provided, construct full path
|
| 59 |
+
self.store_name = f"fileSearchStores/{store_name_raw}"
|
| 60 |
+
|
| 61 |
+
logger.info(f"📦 Using file search store: {self.store_name}")
|
| 62 |
+
|
| 63 |
self.client = genai.Client(api_key=self.api_key)
|
| 64 |
self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
|
| 65 |
|
|
|
|
| 105 |
filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
|
| 106 |
|
| 107 |
# Combine query with filter context
|
| 108 |
+
# Add comprehensive system instructions similar to multi-agent system
|
| 109 |
+
system_instructions = """You are a helpful audit report assistant specialized in analyzing government audit reports from Uganda's Office of the Auditor General.
|
| 110 |
+
|
| 111 |
+
CRITICAL RULES:
|
| 112 |
+
1. **NO HALLUCINATION**: Only use information that is explicitly stated in the retrieved documents. Do not make up facts, numbers, or details.
|
| 113 |
+
2. **Document References**: Always cite which documents you're using with [Doc i] references at the end of sentences that use specific information.
|
| 114 |
+
3. **Formatting**: Structure your response with clear paragraphs, bullet points, or sections for readability.
|
| 115 |
+
4. **Accuracy**: If the retrieved documents don't contain the requested information, explicitly state "The retrieved documents do not contain information about [topic]."
|
| 116 |
+
5. **Years and Data**: Pay careful attention to years mentioned in documents. If a user asks about a specific year but documents show different years, explicitly state this.
|
| 117 |
+
6. **District/Source Names**: Use the exact district and source names as they appear in the document metadata (e.g., "Kalangala" not "Kalagala").
|
| 118 |
+
7. **Financial Data**: When providing financial figures, include the currency (UGX) and be precise about amounts.
|
| 119 |
+
8. **Conversational Tone**: Be helpful, clear, and conversational while maintaining accuracy.
|
| 120 |
+
|
| 121 |
+
IMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents."""
|
| 122 |
+
|
| 123 |
+
# Combine system instructions with query
|
| 124 |
+
full_query = f"{system_instructions}\n\nUser Question: {query}{filter_context}\n\nPlease provide a detailed, well-formatted response with proper document references."
|
| 125 |
|
| 126 |
try:
|
| 127 |
# Generate content with file search
|
| 128 |
# Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
|
| 129 |
+
# Try with full path format first, then fallback to just ID if needed
|
| 130 |
+
store_name_to_try = self.store_name
|
| 131 |
+
|
| 132 |
try:
|
| 133 |
+
# Try the documented format first with full path
|
| 134 |
response = self.client.models.generate_content(
|
| 135 |
model=model,
|
| 136 |
contents=full_query,
|
|
|
|
| 138 |
tools=[
|
| 139 |
types.Tool(
|
| 140 |
file_search=types.FileSearch(
|
| 141 |
+
file_search_store_names=[store_name_to_try]
|
| 142 |
)
|
| 143 |
)
|
| 144 |
]
|
| 145 |
)
|
| 146 |
)
|
| 147 |
+
except Exception as api_error:
|
| 148 |
+
error_str = str(api_error).lower()
|
| 149 |
+
# If format error, try with just the ID (without fileSearchStores/ prefix)
|
| 150 |
+
if 'format' in error_str or 'invalid' in error_str or 'too long' in error_str:
|
| 151 |
+
logger.warning(f"Full path format failed, trying with just store ID: {api_error}")
|
| 152 |
+
# Extract just the ID part
|
| 153 |
+
if store_name_to_try.startswith("fileSearchStores/"):
|
| 154 |
+
store_id = store_name_to_try.split("/", 1)[1]
|
| 155 |
+
store_name_to_try = store_id
|
| 156 |
+
|
| 157 |
+
try:
|
| 158 |
+
response = self.client.models.generate_content(
|
| 159 |
+
model=model,
|
| 160 |
+
contents=full_query,
|
| 161 |
+
config=types.GenerateContentConfig(
|
| 162 |
+
tools=[
|
| 163 |
+
types.Tool(
|
| 164 |
+
file_search=types.FileSearch(
|
| 165 |
+
file_search_store_names=[store_name_to_try]
|
| 166 |
+
)
|
| 167 |
+
)
|
| 168 |
+
]
|
| 169 |
+
)
|
| 170 |
+
)
|
| 171 |
+
except Exception as e2:
|
| 172 |
+
raise Exception(f"Failed to call Gemini API with both formats. Full path error: {api_error}, ID-only error: {e2}")
|
| 173 |
+
else:
|
| 174 |
+
# Try alternative dict format
|
| 175 |
+
logger.warning(f"Primary API format failed, trying alternative: {api_error}")
|
| 176 |
+
try:
|
| 177 |
+
response = self.client.models.generate_content(
|
| 178 |
+
model=model,
|
| 179 |
+
contents=full_query,
|
| 180 |
+
tools=[{
|
| 181 |
+
"file_search": {
|
| 182 |
+
"file_search_store_names": [store_name_to_try]
|
| 183 |
+
}
|
| 184 |
+
}]
|
| 185 |
+
)
|
| 186 |
+
except Exception as e2:
|
| 187 |
+
raise Exception(f"Failed to call Gemini API: {e2}")
|
| 188 |
|
| 189 |
# Extract answer
|
| 190 |
answer = ""
|
|
|
|
| 209 |
sources = []
|
| 210 |
grounding_metadata = None
|
| 211 |
|
| 212 |
+
logger.info(f"π Extracting sources from Gemini response...")
|
| 213 |
+
|
| 214 |
if hasattr(response, 'candidates') and response.candidates:
|
| 215 |
candidate = response.candidates[0]
|
| 216 |
+
logger.info(f" Found candidate, checking for grounding_metadata...")
|
| 217 |
|
| 218 |
# Get grounding metadata
|
| 219 |
if hasattr(candidate, 'grounding_metadata'):
|
| 220 |
grounding_metadata = candidate.grounding_metadata
|
| 221 |
+
logger.info(f" Found grounding_metadata: {type(grounding_metadata)}")
|
| 222 |
|
| 223 |
# Extract source documents from grounding metadata
|
| 224 |
# Handle different response formats
|
| 225 |
grounding_chunks = None
|
| 226 |
if hasattr(grounding_metadata, 'grounding_chunks'):
|
| 227 |
grounding_chunks = grounding_metadata.grounding_chunks
|
| 228 |
+
logger.info(f" Found grounding_chunks (attr): {len(grounding_chunks) if grounding_chunks else 0}")
|
| 229 |
elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
|
| 230 |
grounding_chunks = grounding_metadata['grounding_chunks']
|
| 231 |
+
logger.info(f" Found grounding_chunks (dict): {len(grounding_chunks) if grounding_chunks else 0}")
|
| 232 |
+
elif hasattr(grounding_metadata, '__dict__'):
|
| 233 |
+
# Try to access as object attributes
|
| 234 |
+
metadata_dict = grounding_metadata.__dict__
|
| 235 |
+
if 'grounding_chunks' in metadata_dict:
|
| 236 |
+
grounding_chunks = metadata_dict['grounding_chunks']
|
| 237 |
+
logger.info(f" Found grounding_chunks (__dict__): {len(grounding_chunks) if grounding_chunks else 0}")
|
| 238 |
|
| 239 |
if grounding_chunks:
|
| 240 |
+
logger.info(f" Processing {len(grounding_chunks)} grounding chunks...")
|
| 241 |
+
for idx, chunk in enumerate(grounding_chunks):
|
| 242 |
# Handle both object and dict formats
|
| 243 |
try:
|
| 244 |
if isinstance(chunk, dict):
|
|
|
|
| 262 |
text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
|
| 263 |
file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
|
| 264 |
|
| 265 |
+
# Try to extract file URI and parse metadata from it
|
| 266 |
+
file_uri = chunk_info.get('file_uri', '') if isinstance(chunk_info, dict) else ''
|
| 267 |
+
|
| 268 |
+
# Also check for 'web' attribute (GroundingChunkData structure)
|
| 269 |
+
if hasattr(chunk, 'web') and chunk.web:
|
| 270 |
+
web_data = chunk.web
|
| 271 |
+
file_uri = getattr(web_data, 'file_uri', '') or file_uri
|
| 272 |
+
file_name = getattr(web_data, 'title', '') or getattr(web_data, 'filename', '') or file_name
|
| 273 |
+
text = getattr(web_data, 'text', '') or getattr(web_data, 'content', '') or text
|
| 274 |
+
|
| 275 |
+
# Check retrieved_context - this is where the actual data seems to be!
|
| 276 |
+
if hasattr(chunk, 'retrieved_context') and chunk.retrieved_context:
|
| 277 |
+
rc = chunk.retrieved_context
|
| 278 |
+
# Get text content
|
| 279 |
+
if hasattr(rc, 'text'):
|
| 280 |
+
text = getattr(rc, 'text', '') or text
|
| 281 |
+
# Get document name
|
| 282 |
+
if hasattr(rc, 'document_name'):
|
| 283 |
+
doc_name = getattr(rc, 'document_name', '')
|
| 284 |
+
if doc_name:
|
| 285 |
+
file_name = doc_name or file_name
|
| 286 |
+
|
| 287 |
+
# Fallback: Parse from string representation if we still don't have filename
|
| 288 |
+
if not file_name:
|
| 289 |
+
chunk_str = str(chunk)
|
| 290 |
+
import re
|
| 291 |
+
# Look for PDF filenames
|
| 292 |
+
pdf_match = re.search(r"([A-Za-z0-9\s_-]+\.pdf)", chunk_str)
|
| 293 |
+
if pdf_match:
|
| 294 |
+
file_name = pdf_match.group(1)
|
| 295 |
+
# Or look for title= pattern
|
| 296 |
+
if not file_name and 'title=' in chunk_str:
|
| 297 |
+
title_match = re.search(r"title=['\"]([^'\"]+)['\"]", chunk_str)
|
| 298 |
+
if title_match:
|
| 299 |
+
file_name = title_match.group(1)
|
| 300 |
+
|
| 301 |
+
if not file_name and file_uri:
|
| 302 |
+
# Extract filename from URI if available
|
| 303 |
+
file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
|
| 304 |
+
|
| 305 |
score_data = chunk_data.get('relevance_score', {})
|
| 306 |
score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
|
| 307 |
|
|
|
|
| 310 |
"content": text,
|
| 311 |
"filename": file_name,
|
| 312 |
"score": score,
|
| 313 |
+
"file_uri": file_uri,
|
| 314 |
}
|
| 315 |
sources.append(source_info)
|
| 316 |
+
logger.info(f"π Extracted source {idx+1}: {file_name} (score: {score:.3f}, content length: {len(text)})")
|
| 317 |
except Exception as e:
|
| 318 |
+
logger.warning(f"Error extracting chunk {idx+1} info: {e}")
|
| 319 |
+
import traceback
|
| 320 |
+
logger.debug(traceback.format_exc())
|
| 321 |
continue
|
| 322 |
+
else:
|
| 323 |
+
logger.warning(f" No grounding_chunks found in grounding_metadata")
|
| 324 |
+
else:
|
| 325 |
+
logger.warning(f" Candidate does not have grounding_metadata attribute")
|
| 326 |
+
|
| 327 |
+
# Also try to get file references from other parts of the response
|
| 328 |
+
# Sometimes Gemini includes file references in the response itself
|
| 329 |
+
if not sources or len(sources) == 0:
|
| 330 |
+
logger.info(f" No sources from grounding_metadata, trying alternative extraction...")
|
| 331 |
+
# Check if response has file references in other attributes
|
| 332 |
+
if hasattr(candidate, 'content') and candidate.content:
|
| 333 |
+
if hasattr(candidate.content, 'parts'):
|
| 334 |
+
for part in candidate.content.parts:
|
| 335 |
+
if hasattr(part, 'file_data'):
|
| 336 |
+
file_data = part.file_data
|
| 337 |
+
if hasattr(file_data, 'file_uri') or (isinstance(file_data, dict) and 'file_uri' in file_data):
|
| 338 |
+
file_uri = getattr(file_data, 'file_uri', None) or (file_data.get('file_uri') if isinstance(file_data, dict) else None)
|
| 339 |
+
if file_uri:
|
| 340 |
+
file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
|
| 341 |
+
sources.append({
|
| 342 |
+
"content": "",
|
| 343 |
+
"filename": file_name,
|
| 344 |
+
"score": 0.0,
|
| 345 |
+
"file_uri": file_uri,
|
| 346 |
+
})
|
| 347 |
+
logger.info(f"π Extracted source from file_data: {file_name}")
|
| 348 |
+
|
| 349 |
+
logger.info(f"✅ Total sources extracted: {len(sources)}")
|
| 350 |
|
| 351 |
return GeminiFileSearchResult(
|
| 352 |
answer=answer,
|
|
|
|
| 374 |
formatted_sources = []
|
| 375 |
|
| 376 |
for i, source in enumerate(result.sources):
|
| 377 |
+
filename = source.get("filename", "Unknown")
|
| 378 |
+
|
| 379 |
+
# Try to extract metadata from filename (e.g., "Kalangala DLG Report of Auditor General 2021.pdf")
|
| 380 |
+
year = None
|
| 381 |
+
district = None
|
| 382 |
+
source_name = "Gemini File Search"
|
| 383 |
+
|
| 384 |
+
# Parse filename for year
|
| 385 |
+
import re
|
| 386 |
+
year_match = re.search(r'\b(20\d{2})\b', filename)
|
| 387 |
+
if year_match:
|
| 388 |
+
year = int(year_match.group(1))
|
| 389 |
+
|
| 390 |
+
# Parse filename for district/source
|
| 391 |
+
if "Kalangala" in filename:
|
| 392 |
+
district = "Kalangala"
|
| 393 |
+
source_name = "Kalangala DLG"
|
| 394 |
+
elif "Gulu" in filename:
|
| 395 |
+
district = "Gulu"
|
| 396 |
+
source_name = "Gulu DLG"
|
| 397 |
+
elif "KCCA" in filename:
|
| 398 |
+
district = "Kampala"
|
| 399 |
+
source_name = "KCCA"
|
| 400 |
+
elif "MAAIF" in filename:
|
| 401 |
+
source_name = "MAAIF"
|
| 402 |
+
elif "MWTS" in filename:
|
| 403 |
+
source_name = "MWTS"
|
| 404 |
+
elif "Consolidated" in filename:
|
| 405 |
+
source_name = "Consolidated"
|
| 406 |
+
|
| 407 |
# Create a Document object compatible with existing code
|
| 408 |
doc = Document(
|
| 409 |
page_content=source.get("content", ""),
|
| 410 |
metadata={
|
| 411 |
+
"filename": filename,
|
| 412 |
+
"source": source_name,
|
| 413 |
"score": source.get("score"),
|
| 414 |
"chunk_index": i,
|
| 415 |
+
"page": None, # Gemini doesn't provide page numbers
|
| 416 |
+
"year": year,
|
| 417 |
+
"district": district,
|
| 418 |
+
"chunk_id": f"gemini_{i}",
|
| 419 |
+
"_id": f"gemini_{i}",
|
| 420 |
}
|
| 421 |
)
|
| 422 |
formatted_sources.append(doc)
|
| 423 |
+
logger.info(f"π Formatted source {i+1}: {filename} ({year}, {source_name})")
|
| 424 |
|
| 425 |
+
logger.info(f"✅ Formatted {len(formatted_sources)} sources for display")
|
| 426 |
return formatted_sources
|
| 427 |
|
src/ui_components/components.py
CHANGED
|
@@ -57,7 +57,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
|
|
| 57 |
color_continuous_scale='viridis'
|
| 58 |
)
|
| 59 |
fig_source.update_layout(height=400, showlegend=False)
|
| 60 |
-
st.plotly_chart(fig_source, use_container_width=True)
|
| 61 |
|
| 62 |
with col2:
|
| 63 |
# Year distribution chart
|
|
@@ -84,7 +84,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
|
|
| 84 |
# Ensure years are treated as categorical (discrete) not continuous
|
| 85 |
fig_year.update_xaxes(type='category')
|
| 86 |
fig_year.update_layout(height=400, showlegend=False)
|
| 87 |
-
st.plotly_chart(fig_year, use_container_width=True)
|
| 88 |
else:
|
| 89 |
st.info("No valid years found in the results")
|
| 90 |
|
|
@@ -109,7 +109,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
|
|
| 109 |
color_continuous_scale='blues'
|
| 110 |
)
|
| 111 |
fig_district.update_layout(height=400, showlegend=False)
|
| 112 |
-
st.plotly_chart(fig_district, use_container_width=True)
|
| 113 |
else:
|
| 114 |
st.info("No valid districts found in the results")
|
| 115 |
|
|
@@ -144,7 +144,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
|
|
| 144 |
"Count": list(district_dist_filtered.values())
|
| 145 |
}
|
| 146 |
district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
|
| 147 |
-
st.dataframe(district_df, hide_index=True,
|
| 148 |
else:
|
| 149 |
st.write("No district data")
|
| 150 |
else:
|
|
@@ -158,7 +158,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
|
|
| 158 |
"Count": list(stats['source_distribution'].values())
|
| 159 |
}
|
| 160 |
source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
|
| 161 |
-
st.dataframe(source_df, hide_index=True,
|
| 162 |
else:
|
| 163 |
st.write("No source data")
|
| 164 |
|
|
@@ -175,7 +175,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
|
|
| 175 |
# Sort by year as integer but display as string
|
| 176 |
year_df['Year_Int'] = year_df['Year'].astype(int)
|
| 177 |
year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
|
| 178 |
-
st.dataframe(year_df, hide_index=True,
|
| 179 |
else:
|
| 180 |
st.write("No year data")
|
| 181 |
else:
|
|
@@ -193,7 +193,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
|
|
| 193 |
"Count": [c for f, c in filename_items[:5]]
|
| 194 |
}
|
| 195 |
file_df = pd.DataFrame(file_data)
|
| 196 |
-
st.dataframe(file_df, hide_index=True,
|
| 197 |
else:
|
| 198 |
st.write("No file data")
|
| 199 |
|
|
|
|
| 57 |
color_continuous_scale='viridis'
|
| 58 |
)
|
| 59 |
fig_source.update_layout(height=400, showlegend=False)
|
| 60 |
+
st.plotly_chart(fig_source, use_container_width=True) # Note: plotly_chart still uses use_container_width
|
| 61 |
|
| 62 |
with col2:
|
| 63 |
# Year distribution chart
|
|
|
|
| 84 |
# Ensure years are treated as categorical (discrete) not continuous
|
| 85 |
fig_year.update_xaxes(type='category')
|
| 86 |
fig_year.update_layout(height=400, showlegend=False)
|
| 87 |
+
st.plotly_chart(fig_year, use_container_width=True) # Note: plotly_chart still uses use_container_width
|
| 88 |
else:
|
| 89 |
st.info("No valid years found in the results")
|
| 90 |
|
|
|
|
| 109 |
color_continuous_scale='blues'
|
| 110 |
)
|
| 111 |
fig_district.update_layout(height=400, showlegend=False)
|
| 112 |
+
st.plotly_chart(fig_district, use_container_width=True) # Note: plotly_chart still uses use_container_width
|
| 113 |
else:
|
| 114 |
st.info("No valid districts found in the results")
|
| 115 |
|
|
|
|
| 144 |
"Count": list(district_dist_filtered.values())
|
| 145 |
}
|
| 146 |
district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
|
| 147 |
+
st.dataframe(district_df, hide_index=True, width='stretch')
|
| 148 |
else:
|
| 149 |
st.write("No district data")
|
| 150 |
else:
|
|
|
|
| 158 |
"Count": list(stats['source_distribution'].values())
|
| 159 |
}
|
| 160 |
source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
|
| 161 |
+
st.dataframe(source_df, hide_index=True, width='stretch')
|
| 162 |
else:
|
| 163 |
st.write("No source data")
|
| 164 |
|
|
|
|
| 175 |
# Sort by year as integer but display as string
|
| 176 |
year_df['Year_Int'] = year_df['Year'].astype(int)
|
| 177 |
year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
|
| 178 |
+
st.dataframe(year_df, hide_index=True, width='stretch')
|
| 179 |
else:
|
| 180 |
st.write("No year data")
|
| 181 |
else:
|
|
|
|
| 193 |
"Count": [c for f, c in filename_items[:5]]
|
| 194 |
}
|
| 195 |
file_df = pd.DataFrame(file_data)
|
| 196 |
+
st.dataframe(file_df, hide_index=True, width='stretch')
|
| 197 |
else:
|
| 198 |
st.write("No file data")
|
| 199 |
|