Zeggai Abdellah committed
Commit b12f17b · 1 Parent(s): a99d17a

add log to system

Files changed (2):
  1. prepare_env.py +81 -18
  2. rag_pipeline.py +91 -7
prepare_env.py CHANGED
@@ -70,10 +70,13 @@ def extract_source_ids(response_text):

def setup_models():
    """Initialize embedding model and LLM"""
+    print("🔧 Setting up embedding model and LLM...")
+
    # Initialize embedding model
    embedding_function = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-base"
    )
+    print("✅ Embedding model initialized: intfloat/multilingual-e5-base")

    # Initialize LLM
    genai_api_key = os.getenv('GOOGLE_API_KEY')
@@ -81,15 +84,20 @@ def setup_models():
        model="gemini-2.0-flash",
        google_api_key=genai_api_key
    )
+    print("✅ LLM initialized: gemini-2.0-flash")

    return embedding_function, llm

def create_vectorstore_from_json(json_path: str, collection_name: str, embedding_function):
    """Create vector store from JSON chunks"""
+    print(f"📚 Creating vector store from: {json_path}")
+
    # Load the chunks.json
    with open(json_path, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

+    print(f"📊 Loaded {len(chunks_data)} chunks from JSON")
+
    documents = []
    for element in chunks_data:
        text = element["text"]
@@ -113,31 +121,38 @@ def create_vectorstore_from_json(json_path: str, collection_name: str, embedding
        collection_name=collection_name,
        persist_directory="chroma_db_multilingual"
    )
+    print(f"✅ Vector store created with collection: {collection_name}")
    return vectorstore, documents

def create_retriever(vectorstore, docs, llm):
    """Create ensemble retriever with vector and BM25 search"""
+    print("🔍 Creating ensemble retriever...")
+
    # Vector retriever
    vector_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 6}
    )
+    print("✅ Vector retriever created (k=6)")

    # BM25 retriever
    bm25_retriever = BM25Retriever.from_documents(docs)
    bm25_retriever.k = 2
+    print("✅ BM25 retriever created (k=2)")

    # Ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, bm25_retriever],
        weights=[0.5, 0.5]
    )
+    print("✅ Ensemble retriever created (weights: 0.5, 0.5)")

    # Multi-query expanding retriever
    expanding_retriever = MultiQueryRetriever.from_llm(
        retriever=ensemble_retriever,
        llm=llm
    )
+    print("✅ Multi-query expanding retriever created")

    return expanding_retriever

@@ -164,18 +179,27 @@ def convert_chromadb_to_llamaindex_nodes(chromadb_documents: List) -> List[TextN

def section_tool_wrapper(retriever, section_path_chunks, query):
    """Generic section tool wrapper"""
+    print(f"🔍 TOOL CALL: Searching for query: '{query[:100]}...' in {section_path_chunks}")
+
    try:
        retrieved_docs = retriever.get_relevant_documents(query)
+        print(f"📄 Retrieved {len(retrieved_docs)} documents")
+
        nodes_from_retrieved_docs = convert_chromadb_to_llamaindex_nodes(retrieved_docs)

        if not nodes_from_retrieved_docs:
+            print("❌ No relevant documents found for the query")
            return "No relevant documents found for the query."

        chunk_ids = [node.metadata['element_id'] for node in retrieved_docs]
+        print(f"🆔 Found chunk IDs: {chunk_ids}")
+
        with open(section_path_chunks, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)

        chunks_unique = [node for node in chunks_data if node.get('element_id', 'Unknown') in chunk_ids]
+        print(f"✅ Matched {len(chunks_unique)} unique chunks")
+
        combined_text = []

        for chu in chunks_unique:
@@ -188,14 +212,15 @@ def section_tool_wrapper(retriever, section_path_chunks, query):
            combined_text.append(text)

        result = "\n---\n".join(combined_text)
-        print(f"Retrieved {len(nodes_from_retrieved_docs)} documents for query: {query[:50]}...")
+        print(f" TOOL RESPONSE: Generated response with {len(combined_text)} text sections")
        return result
    except Exception as e:
-        print(f"Error in section tool: {e}")
+        print(f" TOOL ERROR: {e}")
        return f"Error retrieving documents: {str(e)}"

def create_section_tools(embedding_function, llm):
    """Create all section-specific retrieval tools"""
+    print("🛠️ Creating section-specific retrieval tools...")

    # Define section paths - Fixed path structure
    section_paths = {
@@ -216,15 +241,15 @@ def create_section_tools(embedding_function, llm):
    for section, path in section_paths.items():
        try:
            if os.path.exists(path):
-                print(f"Creating retriever for section {section} from {path}")
+                print(f"📁 Creating retriever for section {section} from {path}")
                vstore, docs = create_vectorstore_from_json(path, f"Guide_2023_{section}", embedding_function)
                section_retrievers[section] = create_retriever(vstore, docs, llm)
-                print(f"Successfully created retriever for section {section}")
+                print(f"Successfully created retriever for section {section}")
            else:
-                print(f"Warning: File not found for section {section}: {path}")
+                print(f"⚠️ Warning: File not found for section {section}: {path}")
                section_retrievers[section] = None
        except Exception as e:
-            print(f"Error creating retriever for section {section}: {e}")
+            print(f"Error creating retriever for section {section}: {e}")
            section_retrievers[section] = None

    # Create main guide retriever
@@ -232,32 +257,32 @@
    guide_retriever = None
    try:
        if os.path.exists(guide_path):
-            print("Creating main guide retriever...")
+            print("📚 Creating main guide retriever...")
            guide_vstore, guide_docs = create_vectorstore_from_json(guide_path, "Guide_2023_multilingual", embedding_function)
            guide_retriever = create_retriever(guide_vstore, guide_docs, llm)
-            print("Successfully created main guide retriever")
+            print("Successfully created main guide retriever")
        else:
-            print(f"Warning: Main guide file not found: {guide_path}")
+            print(f"⚠️ Warning: Main guide file not found: {guide_path}")
    except Exception as e:
-        print(f"Error creating main guide retriever: {e}")
+        print(f"Error creating main guide retriever: {e}")

    # WHO Immunization in Practice Tool
    immunization_path = './data/Immunization in Practice_WHO_eng_2015.json'
    immunization_retriever = None
    try:
        if os.path.exists(immunization_path):
-            print("Creating immunization retriever...")
+            print("🌍 Creating immunization retriever...")
            immunization_vstore, immunization_docs = create_vectorstore_from_json(
                immunization_path,
                "Immunization_in_Practice_WHO_eng_2015",
                embedding_function
            )
            immunization_retriever = create_retriever(immunization_vstore, immunization_docs, llm)
-            print("Successfully created immunization retriever")
+            print("Successfully created immunization retriever")
        else:
-            print(f"Warning: Immunization file not found: {immunization_path}")
+            print(f"⚠️ Warning: Immunization file not found: {immunization_path}")
    except Exception as e:
-        print(f"Error creating immunization retriever: {e}")
+        print(f"Error creating immunization retriever: {e}")

    # General-purpose tool (entire Algerian guide)
    def guide_retrieval_tool(query: str) -> str:
@@ -279,11 +304,14 @@
        Returns:
            str: Synthesized answer from the entire national guide.
        """
+        print(f"🏥 GUIDE TOOL CALLED: {query[:50]}...")
        if not guide_retriever:
+            print("❌ Guide retriever not available - main guide file may be missing")
            return "Guide retriever not available - main guide file may be missing"
        try:
            return section_tool_wrapper(guide_retriever, guide_path, query)
        except Exception as e:
+            print(f"❌ Error accessing guide retriever: {str(e)}")
            return f"Error accessing guide retriever: {str(e)}"

    def immunization_tool(query: str) -> str:
@@ -302,11 +330,14 @@
        Returns:
            str: Content from the WHO Immunization in Practice guide.
        """
+        print(f"🌍 WHO TOOL CALLED: {query[:50]}...")
        if not immunization_retriever:
+            print("❌ Immunization in Practice retriever not available - WHO guide file may be missing")
            return "Immunization in Practice retriever not available - WHO guide file may be missing"
        try:
            return section_tool_wrapper(immunization_retriever, immunization_path, query)
        except Exception as e:
+            print(f"❌ Error accessing immunization retriever: {str(e)}")
            return f"Error accessing immunization retriever: {str(e)}"

    # Section-Specific Tools - Fixed implementation
@@ -325,11 +356,14 @@
        Returns:
            str: Response from Section 1.
        """
+        print(f"📋 SECTION 1 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('one'):
+            print("❌ Section 1 retriever not available - file may be missing")
            return "Section 1 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['one'], section_paths['one'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 1: {str(e)}")
            return f"Error accessing section 1: {str(e)}"

    def section_two_tool(query: str) -> str:
@@ -347,11 +381,14 @@
        Returns:
            str: Disease-specific content from Section 2.
        """
+        print(f"🦠 SECTION 2 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('two'):
+            print("❌ Section 2 retriever not available - file may be missing")
            return "Section 2 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['two'], section_paths['two'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 2: {str(e)}")
            return f"Error accessing section 2: {str(e)}"

    def section_three_tool(query: str) -> str:
@@ -369,11 +406,14 @@
        Returns:
            str: Vaccine info from Section 3.
        """
+        print(f"💉 SECTION 3 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('three'):
+            print("❌ Section 3 retriever not available - file may be missing")
            return "Section 3 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['three'], section_paths['three'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 3: {str(e)}")
            return f"Error accessing section 3: {str(e)}"

    def section_four_tool(query: str) -> str:
@@ -391,11 +431,14 @@
        Returns:
            str: Catch-up guidance from Section 4.
        """
+        print(f"🔄 SECTION 4 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('four'):
+            print("❌ Section 4 retriever not available - file may be missing")
            return "Section 4 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['four'], section_paths['four'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 4: {str(e)}")
            return f"Error accessing section 4: {str(e)}"

    def section_five_tool(query: str) -> str:
@@ -413,11 +456,14 @@
        Returns:
            str: Custom recommendations from Section 5.
        """
+        print(f"👥 SECTION 5 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('five'):
+            print("❌ Section 5 retriever not available - file may be missing")
            return "Section 5 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['five'], section_paths['five'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 5: {str(e)}")
            return f"Error accessing section 5: {str(e)}"

    def section_six_tool(query: str) -> str:
@@ -435,11 +481,14 @@
        Returns:
            str: Cold chain instructions from Section 6.
        """
+        print(f"❄️ SECTION 6 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('six'):
+            print("❌ Section 6 retriever not available - file may be missing")
            return "Section 6 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['six'], section_paths['six'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 6: {str(e)}")
            return f"Error accessing section 6: {str(e)}"

    def section_seven_tool(query: str) -> str:
@@ -457,11 +506,14 @@
        Returns:
            str: Best practices from Section 7.
        """
+        print(f"🛡️ SECTION 7 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('seven'):
+            print("❌ Section 7 retriever not available - file may be missing")
            return "Section 7 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['seven'], section_paths['seven'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 7: {str(e)}")
            return f"Error accessing section 7: {str(e)}"

    def section_eight_tool(query: str) -> str:
@@ -479,11 +531,14 @@
        Returns:
            str: Workflow and safety monitoring details from Section 8.
        """
+        print(f"📊 SECTION 8 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('eight'):
+            print("❌ Section 8 retriever not available - file may be missing")
            return "Section 8 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['eight'], section_paths['eight'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 8: {str(e)}")
            return f"Error accessing section 8: {str(e)}"

    def section_nine_tool(query: str) -> str:
@@ -501,11 +556,14 @@
        Returns:
            str: Planning and stock guidance from Section 9.
        """
+        print(f"📅 SECTION 9 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('nine'):
+            print("❌ Section 9 retriever not available - file may be missing")
            return "Section 9 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['nine'], section_paths['nine'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 9: {str(e)}")
            return f"Error accessing section 9: {str(e)}"

    def section_ten_tool(query: str) -> str:
@@ -523,11 +581,14 @@
        Returns:
            str: Public mobilization strategies from Section 10.
        """
+        print(f"📢 SECTION 10 TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('ten'):
+            print("❌ Section 10 retriever not available - file may be missing")
            return "Section 10 retriever not available - file may be missing"
        try:
            return section_tool_wrapper(section_retrievers['ten'], section_paths['ten'], query)
        except Exception as e:
+            print(f"❌ Error accessing section 10: {str(e)}")
            return f"Error accessing section 10: {str(e)}"

    # Create FunctionTool objects
@@ -547,17 +608,19 @@
        FunctionTool.from_defaults(name="section_ten_vector_query_tool", fn=section_ten_tool),
    ]

+    print(f"✅ Created {len(tools)} section tools")
    return tools

def prepare_environment():
    """Main function to prepare the environment and return tools"""
-    print("Setting up models...")
+    print("🚀 Starting environment preparation...")
+    print("🔧 Setting up models...")
    embedding_function, llm = setup_models()

-    print("Creating section tools...")
+    print("🛠️ Creating section tools...")
    tools = create_section_tools(embedding_function, llm)

-    print("Environment prepared successfully!")
-    print(f"Created {len(tools)} tools")
+    print("Environment prepared successfully!")
+    print(f"📋 Created {len(tools)} tools")

    return tools, llm
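Aside on the retrieval stack these new logs trace: each collection gets a hybrid retriever, i.e. dense vector search over the Chroma store fused with lexical BM25 by an EnsembleRetriever, then wrapped in a MultiQueryRetriever that has the LLM rephrase the query. The sketch below restates that pattern in isolation, assuming the same LangChain packages imported in prepare_env.py; build_hybrid_retriever and demo_docs are illustrative names, not part of this commit.

# Minimal sketch of the hybrid retrieval pattern logged above; assumes the
# LangChain packages already used in prepare_env.py. Names are illustrative.
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

demo_docs = [
    Document(page_content="BCG is given at birth.", metadata={"element_id": "id-1"}),
    Document(page_content="Measles vaccine is given at 11 months.", metadata={"element_id": "id-2"}),
]

def build_hybrid_retriever(vectorstore, docs, llm):
    # Dense retriever: top-6 nearest neighbours from the vector store.
    vector_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
    # Lexical retriever: top-2 BM25 matches over the raw documents.
    bm25_retriever = BM25Retriever.from_documents(docs)
    bm25_retriever.k = 2
    # Fuse both result lists with equal weight.
    ensemble = EnsembleRetriever(retrievers=[vector_retriever, bm25_retriever], weights=[0.5, 0.5])
    # Have the LLM rephrase the query several ways and union the results;
    # this is the "Multi-query expanding retriever" the new logs refer to.
    return MultiQueryRetriever.from_llm(retriever=ensemble, llm=llm)

Since MultiQueryRetriever spends extra LLM calls generating query variants, the per-question timings added in rag_pipeline.py below will include that overhead.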
rag_pipeline.py CHANGED
@@ -11,6 +11,7 @@ from llama_index.core.agent import ReActAgent
from llama_index.llms.google_genai import GoogleGenAI
from langdetect import detect
import os
+import time


def extract_source_ids(response_text):
@@ -28,6 +29,8 @@ def extract_source_ids(response_text):
    """
    import re

+    print(f"[LOG] Extracting source IDs from response text (length: {len(response_text)} chars)")
+
    # First, extract all source IDs from inline citations with adjacent brackets [ID1][ID2]
    # Replace them with single brackets with comma separation to standardize format
    consolidated_text = re.sub(r'\][\s]*\[', '][', response_text)
@@ -35,6 +38,7 @@

    # Now extract all source IDs from any format (single ID or comma-separated IDs)
    inline_citations = re.findall(r'\[([^\[\]]+)\]', consolidated_text)
+    print(f"[LOG] Found {len(inline_citations)} inline citations")

    if not inline_citations:
        print("Warning: No source IDs found in the response text.")
@@ -55,6 +59,8 @@
            seen.add(id_str)
            source_ids.append(id_str)

+    print(f"[LOG] Extracted {len(source_ids)} unique source IDs: {source_ids[:3]}{'...' if len(source_ids) > 3 else ''}")
+
    if not source_ids:
        print("Warning: No valid source IDs found after filtering.")
        return []
@@ -73,6 +79,8 @@ def convert_citations_to_sequential(response_text, source_id_to_number_map):
    Returns:
        str: Response text with sequential number citations
    """
+    print(f"[LOG] Converting {len(source_id_to_number_map)} source IDs to sequential numbers")
+
    def replace_citation(match):
        citation_content = match.group(1)
        # Handle multiple IDs in one citation (comma-separated)
@@ -94,6 +102,7 @@ def convert_citations_to_sequential(response_text, source_id_to_number_map):

    # Replace all citations in the text
    sequential_response = re.sub(r'\[([^\[\]]+)\]', replace_citation, response_text)
+    print("[LOG] Successfully converted citations to sequential format")
    return sequential_response


@@ -101,6 +110,8 @@ def convert_citations_to_sequential(response_text, source_id_to_number_map):
def create_safe_custom_prompt(tools, llm):
    """Create a safe version that won't have formatting conflicts"""

+    print(f"[LOG] Creating custom prompt with {len(tools)} tools")
+
    custom_instructions = """
## MEDICAL ASSISTANT ROLE
You are a helpful and knowledgeable AI-powered vaccine assistant designed to support doctors in clinical decision-making.
@@ -165,19 +176,24 @@ If you cannot find complete information to fully answer a question:
            template_vars=original_prompt.template_vars,
            metadata=original_prompt.metadata if hasattr(original_prompt, 'metadata') else None
        )
+        print("[LOG] ✅ Successfully created safe custom prompt")
        return new_prompt
    except:
        # Even safer fallback
+        print("[LOG] ⚠️ Using fallback prompt template")
        return PromptTemplate(template=safe_template)

def create_agent(tools, llm):
    """Create the ReAct agent with custom prompt"""

+    print(f"[LOG] Creating ReAct agent with {len(tools)} tools and max_iterations=8")
+
    # Create agent with increased max iterations and better handling
+    # Force verbose=True to see the Thought/Action/Observation cycle
    agent = ReActAgent.from_tools(
        tools,
        llm=llm,
-        verbose=True,
+        verbose=True,  # This should show the ReAct reasoning steps
        max_iterations=8,  # Reduced from default to prevent excessive looping
    )

@@ -190,12 +206,17 @@ def create_agent(tools, llm):
        print(f"❌ Safe prompt update failed: {e}")
        print("⚠️ Using original agent without modifications")

+    print("[LOG] Agent creation completed")
    return agent

def initialize_rag_pipeline(tools):
    """Initialize the RAG pipeline with tools"""

+    print("[LOG] Initializing RAG pipeline...")
+    print(f"[LOG] Available tools: {[tool.metadata.name if hasattr(tool, 'metadata') else str(tool) for tool in tools]}")
+
    # Initialize LlamaIndex LLM
+    print("[LOG] Initializing Google GenAI LLM (gemini-2.0-flash)")
    llama_index_llm = GoogleGenAI(
        model="models/gemini-2.0-flash",
        api_key=os.getenv('GOOGLE_API_KEY'),
@@ -204,15 +225,33 @@
    # Create agent
    agent = create_agent(tools, llama_index_llm)

+    print("[LOG] ✅ RAG pipeline initialization completed")
    return agent

def process_question(agent, question: str) -> str:
    """Process a question through the RAG pipeline"""
+    print(f"[LOG] Processing question: '{question[:100]}{'...' if len(question) > 100 else ''}'")
+    print("="*50)
+    print("AGENT REASONING PROCESS:")
+    print("="*50)
+    start_time = time.time()
+
    try:
+        # The agent.chat() call should now show the full ReAct process
        response = agent.chat(question)
+
+        print("="*50)
+        print("END OF AGENT REASONING")
+        print("="*50)
+
+        elapsed_time = time.time() - start_time
+        print(f"[LOG] ✅ Agent response received in {elapsed_time:.2f} seconds")
+        print(f"[LOG] Response length: {len(response.response)} characters")
+
        return response.response
    except Exception as e:
-        print(f"Error processing question: {e}")
+        elapsed_time = time.time() - start_time
+        print(f"[LOG] ❌ Error processing question after {elapsed_time:.2f} seconds: {e}")
        return f"Error processing your question: {str(e)}"

def aswer_language_detection(response_text: str) -> str:
@@ -225,15 +264,19 @@ def aswer_language_detection(response_text: str) -> str:
    Returns:
        str: Detected language code (e.g., 'en', 'fr', etc.)
    """
+    print("[LOG] Detecting response language...")

    try:
        # Detect the language of the first 5 words of the response
        first_line = " ".join(response_text.split()[:5])
        first_line = re.sub(r'\[.*?\]', '', first_line)  # Remove citations
        answer_language = detect(first_line)
+        print(f"[LOG] Detected language: {answer_language}")
        if answer_language not in ['en', 'ar', 'fr']:
+            print(f"[LOG] Language {answer_language} not in supported list, defaulting to 'en'")
            answer_language = 'en'
    except:
+        print("[LOG] Language detection failed, defaulting to 'en'")
        answer_language = 'en'

    finally:
@@ -257,17 +300,35 @@ def process_question_with_sequential_citations(agent, question: str, chunks_dire
        "citation_mapping": dict  # Mapping from source ID to citation number
    }
    """
+    print(f"\n[LOG] === STARTING QUESTION PROCESSING ===")
+    print(f"[LOG] Question: '{question[:150]}{'...' if len(question) > 150 else ''}'")
+    print(f"[LOG] Chunks directory: {chunks_directory}")
+    start_time = time.time()
+
    try:
        # Get the response from the agent
+        print("\n" + "="*60)
+        print("🤖 AGENT REASONING PROCESS STARTING...")
+        print("="*60)
+
        response = agent.chat(question)
+
+        print("="*60)
+        print("🤖 AGENT REASONING PROCESS COMPLETED")
+        print("="*60)
        response_text = response.response

+        agent_time = time.time() - start_time
+        print(f"[LOG] Agent processing completed in {agent_time:.2f} seconds")
+        print(f"[LOG] Raw response length: {len(response_text)} characters")
+
        # Enhanced handling for max iterations error
        if ("max iterations" in response_text.lower() or
            "reached max iterations" in response_text.lower() or
            len(response_text.strip()) == 0 or
            "agent stopped due to max iterations" in response_text.lower()):

+            print("[LOG] ⚠️ Detected max iterations error, providing fallback response")
            # Provide a more helpful fallback response
            response_text = ("I apologize, but I encountered difficulties processing your question within the available search iterations. "
                             "This may be due to the complexity of your query or limitations in finding specific information in the available documents. "
@@ -278,55 +339,77 @@ def process_question_with_sequential_citations(agent, question: str, chunks_dire

        # Create mapping from source ID to sequential number
        source_id_to_number = {source_id: i + 1 for i, source_id in enumerate(unique_ids)}
+        print(f"[LOG] Created citation mapping for {len(source_id_to_number)} sources")

        # Convert citations to sequential numbers
        sequential_response = convert_citations_to_sequential(response_text, source_id_to_number)

        # Load all chunks data to find cited elements
+        print("[LOG] Loading chunks data for citation lookup...")
        all_chunks_data = []
        min_chunks_files = ["Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.json",
                            "Immunization in Practice_WHO_eng_2015.json"]

        for json_file in min_chunks_files:
            json_path = os.path.join(chunks_directory, json_file)
+            print(f"[LOG] Loading {json_file}...")
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    chunks_data = json.load(f)
                all_chunks_data.extend(chunks_data)
+                print(f"[LOG] ✅ Loaded {len(chunks_data)} chunks from {json_file}")
            except Exception as e:
-                print(f"Warning: Could not load {json_file}: {e}")
+                print(f"[LOG] ❌ Warning: Could not load {json_file}: {e}")
+
+        print(f"[LOG] Total chunks loaded: {len(all_chunks_data)}")

        # Get cited elements in the same order as the sequential citations
+        print("[LOG] Finding cited elements...")
        cited_elements_ordered = []
-        for source_id in unique_ids:  # This preserves the order
+        for i, source_id in enumerate(unique_ids):  # This preserves the order
+            print(f"[LOG] Looking for source ID {i+1}/{len(unique_ids)}: {source_id}")
+            found = False
            for element in all_chunks_data:
                if element.get("type") == 'TableElement':
                    if element.get("elements",{}).get("element_id") == source_id:
                        cited_elements_ordered.append(element.get("elements",{}))
+                        found = True
                        break
                else:
                    if "elements" in element:
                        for nested_element in element["elements"]:
                            if nested_element.get("element_id") == source_id:
                                cited_elements_ordered.append(nested_element)
+                                found = True
                                break
                        else:
                            continue
                        break
+            if not found:
+                print(f"[LOG] ⚠️ Source ID {source_id} not found in chunks data")
+
+        print(f"[LOG] Found {len(cited_elements_ordered)} cited elements")

        # Convert to JSON
        cited_elements_json = json.dumps(cited_elements_ordered, ensure_ascii=False, indent=2)
-        aswer_language= aswer_language_detection(response_text)
+        aswer_language = aswer_language_detection(response_text)
+
+        total_time = time.time() - start_time
+        print(f"[LOG] ✅ Processing completed in {total_time:.2f} seconds total")
+        print(f"[LOG] Final response length: {len(sequential_response)} characters")
+        print(f"[LOG] === QUESTION PROCESSING COMPLETED ===\n")
+
        return {
            "response": sequential_response,
            "cited_elements_json": cited_elements_json,
            "unique_ids": unique_ids,
            "citation_mapping": source_id_to_number,
-            "answer_language":aswer_language
+            "answer_language": aswer_language
        }

    except Exception as e:
-        print(f"Error processing question: {e}")
+        elapsed_time = time.time() - start_time
+        print(f"[LOG] ❌ Error processing question after {elapsed_time:.2f} seconds: {e}")
        error_response = "I apologize, but I encountered an error while processing your question. Please try rephrasing your question or asking about a more specific topic."

        return {
@@ -342,4 +425,5 @@ def process_question_with_citations(agent, question: str, chunks_directory="./da
    Legacy function - maintained for backward compatibility.
    Now calls the new sequential citation function.
    """
+    print("[LOG] Using legacy function wrapper - redirecting to sequential citations")
    return process_question_with_sequential_citations(agent, question, chunks_directory)
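For readers following the citation logs above, here is a minimal standalone sketch of the two regex passes that extract_source_ids and convert_citations_to_sequential perform. The sample text and IDs are invented, and comma-separated IDs inside a single bracket (which replace_citation also handles) are omitted for brevity.

import re

# Example response text with adjacent inline citations (sample IDs are made up).
text = "BCG is given at birth [abc123] [def456], and measles at 11 months [ghi789]."

# Pass 1: collapse "] [" (with any whitespace) to "][", as extract_source_ids does,
# so adjacent citation groups take a uniform shape before extraction.
consolidated = re.sub(r'\][\s]*\[', '][', text)
ids = re.findall(r'\[([^\[\]]+)\]', consolidated)   # ['abc123', 'def456', 'ghi789']

# Pass 2: map each unique ID (in order of first appearance) to a sequential
# number and rewrite the citations, as convert_citations_to_sequential does.
id_to_num = {sid: i + 1 for i, sid in enumerate(dict.fromkeys(ids))}
sequential = re.sub(r'\[([^\[\]]+)\]',
                    lambda m: f"[{id_to_num.get(m.group(1), m.group(1))}]",
                    consolidated)
print(sequential)  # "BCG is given at birth [1][2], and measles at 11 months [3]."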