Zeggai Abdellah commited on
Commit
c0e5c04
·
1 Parent(s): 5a74e30

update the Immunization_in_Practice_tool tool

Browse files
Files changed (2) hide show
  1. prepare_env.py +217 -121
  2. rag_pipeline.py +110 -79
prepare_env.py CHANGED
@@ -1,7 +1,7 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- Environment preparation script for vaccine assistant - Improved version
4
- Creates vector stores and retrieval tools with better descriptions for efficient agent routing
5
  """
6
 
7
  import os
@@ -56,6 +56,11 @@ def extract_source_ids(response_text):
56
  # Get unique source IDs
57
  source_ids = list(set(all_ids))
58
 
 
 
 
 
 
59
  if not source_ids:
60
  print("Warning: No valid source IDs found after filtering.")
61
  return []
@@ -70,17 +75,15 @@ def setup_models():
70
  model_name="intfloat/multilingual-e5-base"
71
  )
72
 
73
- # Initialize LLM with better parameters for focused responses
74
  genai_api_key = os.getenv('GOOGLE_API_KEY')
75
  llm = ChatGoogleGenerativeAI(
76
  model="gemini-2.0-flash",
77
- google_api_key=genai_api_key,
78
- temperature=0.1 # Lower temperature for more focused responses
79
  )
80
 
81
  return embedding_function, llm
82
 
83
-
84
  def create_vectorstore_from_json(json_path: str, collection_name: str, embedding_function):
85
  """Create vector store from JSON chunks"""
86
  # Load the chunks.json
@@ -112,13 +115,12 @@ def create_vectorstore_from_json(json_path: str, collection_name: str, embedding
112
  )
113
  return vectorstore, documents
114
 
115
-
116
  def create_retriever(vectorstore, docs, llm):
117
  """Create ensemble retriever with vector and BM25 search"""
118
  # Vector retriever
119
  vector_retriever = vectorstore.as_retriever(
120
  search_type="similarity",
121
- search_kwargs={"k": 4} # Reduced from 6 to 4 for efficiency
122
  )
123
 
124
  # BM25 retriever
@@ -131,7 +133,7 @@ def create_retriever(vectorstore, docs, llm):
131
  weights=[0.5, 0.5]
132
  )
133
 
134
- # Multi-query expanding retriever (with reduced complexity for efficiency)
135
  expanding_retriever = MultiQueryRetriever.from_llm(
136
  retriever=ensemble_retriever,
137
  llm=llm
@@ -139,7 +141,6 @@ def create_retriever(vectorstore, docs, llm):
139
 
140
  return expanding_retriever
141
 
142
-
143
  def convert_chromadb_to_llamaindex_nodes(chromadb_documents: List) -> List[TextNode]:
144
  """Convert ChromaDB Document objects to LlamaIndex TextNode objects"""
145
  nodes = []
@@ -161,9 +162,8 @@ def convert_chromadb_to_llamaindex_nodes(chromadb_documents: List) -> List[TextN
161
  continue
162
  return nodes
163
 
164
-
165
  def section_tool_wrapper(retriever, section_path_chunks, query):
166
- """Generic section tool wrapper with improved efficiency"""
167
  try:
168
  retrieved_docs = retriever.get_relevant_documents(query)
169
  nodes_from_retrieved_docs = convert_chromadb_to_llamaindex_nodes(retrieved_docs)
@@ -178,15 +178,13 @@ def section_tool_wrapper(retriever, section_path_chunks, query):
178
  chunks_unique = [node for node in chunks_data if node.get('element_id', 'Unknown') in chunk_ids]
179
  combined_text = []
180
 
181
- # Limit the number of chunks to avoid overwhelming the context
182
- max_chunks = 8 # Reasonable limit
183
- for chu in chunks_unique[:max_chunks]:
184
  if "TableElement" == chu["type"]:
185
- text = f"[{chu['element_id']}]\n CONTENT: \n{chu['text']}\n HTML: \n {chu['table_text_as_html']} \n\n"
186
  combined_text.append(text)
187
  else:
188
  for element in chu["elements"]:
189
- text = f"[{element['element_id']}]\n CONTENT: \n{element['text']} \n\n"
190
  combined_text.append(text)
191
 
192
  result = "\n---\n".join(combined_text)
@@ -196,9 +194,8 @@ def section_tool_wrapper(retriever, section_path_chunks, query):
196
  print(f"Error in section tool: {e}")
197
  return f"Error retrieving documents: {str(e)}"
198
 
199
-
200
  def create_section_tools(embedding_function, llm):
201
- """Create all section-specific retrieval tools with improved descriptions"""
202
 
203
  # Define section paths
204
  section_paths = {
@@ -217,7 +214,7 @@ def create_section_tools(embedding_function, llm):
217
  # Create retrievers for each section
218
  section_retrievers = {}
219
  for section, path in section_paths.items():
220
- if os.path.exists(f'./data/{path}'):
221
  vstore, docs = create_vectorstore_from_json(f'./data/{path}', f"Guide_2023_{section}", embedding_function)
222
  section_retrievers[section] = create_retriever(vstore, docs, llm)
223
 
@@ -228,9 +225,29 @@ def create_section_tools(embedding_function, llm):
228
  guide_retriever = create_retriever(guide_vstore, guide_docs, llm)
229
  else:
230
  guide_retriever = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  # Primary + Secondary Document Paths
233
- immunization_path = './data/Immunization_in_Practice_WHO_eng_2015.json'
234
 
235
  # WHO Immunization in Practice Tool
236
  if os.path.exists(immunization_path):
@@ -243,134 +260,213 @@ def create_section_tools(embedding_function, llm):
243
  else:
244
  immunization_retriever = None
245
 
246
- # Tool Functions with Improved Efficiency Focus
247
-
248
- def guide_retrieval_tool(query: str) -> str:
249
  """
250
- **PRIMARY TOOL - USE FIRST FOR MOST QUESTIONS**
251
-
252
- Comprehensive search across the entire Algerian National Vaccination Guide (2023).
253
-
254
- **When to use this tool:**
255
- - General vaccination questions
256
- - Disease definitions and descriptions
257
- - Vaccine schedules and protocols
258
- - Comparative questions needing Algerian perspective
259
- - Any question about Algeria's vaccination program
260
-
261
- **Keywords that indicate this tool:** Algeria, Algerian, national, calendrier, vaccination, PEV, diseases (diphteria, polio, measles, etc.)
262
-
263
  Args:
264
- query (str): Any vaccination-related question about Algeria's national program
265
-
266
  Returns:
267
- str: Comprehensive information from the Algerian guide with citations
268
  """
269
- if not guide_retriever:
270
- return "Guide retriever not available"
271
- return section_tool_wrapper(guide_retriever, guide_path, query)
272
 
273
- def immunization_tool(query: str) -> str:
 
 
 
 
274
  """
275
- **SECONDARY TOOL - USE FOR WHO/INTERNATIONAL PERSPECTIVE**
276
-
277
- WHO Immunization in Practice 2015 - Global best practices and international standards.
278
-
279
- **When to use this tool:**
280
- - Questions specifically asking about WHO recommendations
281
- - International/global immunization practices
282
- - Comparative questions needing WHO perspective
283
- - Technical immunization procedures and best practices
284
-
285
- **Keywords that indicate this tool:** WHO, international, global, best practices, standards
286
-
287
  Args:
288
- query (str): Question about international immunization practices or WHO recommendations
289
-
290
  Returns:
291
- str: WHO guidance and international best practices with citations
292
  """
293
- if not immunization_retriever:
294
- return "Immunization in Practice retriever not available"
295
- return section_tool_wrapper(immunization_retriever, immunization_path, query)
296
 
297
- # Section-Specific Tools (USE ONLY IF QUESTION IS VERY SPECIFIC TO THE SECTION)
298
 
299
  def section_two_tool(query: str) -> str:
300
  """
301
- **DISEASE-SPECIFIC TOOL**
302
-
303
- Section 2: Vaccine-preventable diseases - definitions, symptoms, transmission, complications.
304
-
305
- **Use ONLY for specific disease definition questions like:**
306
- - "What is diphtheria?"
307
- - "Define measles according to Algerian protocol"
308
- - "Symptoms of polio"
309
-
310
- **Keywords:** definition, symptoms, transmission, complications, disease characteristics
311
-
312
  Args:
313
- query (str): Specific question about disease definitions or characteristics
314
-
315
  Returns:
316
- str: Disease-specific medical information with citations
317
  """
318
- if 'two' not in section_retrievers:
319
- return "Section 2 retriever not available"
320
- return section_tool_wrapper(section_retrievers['two'], f'./data/{section_paths["two"]}', query)
321
 
322
  def section_three_tool(query: str) -> str:
323
  """
324
- **VACCINE-SPECIFIC TOOL**
325
-
326
- Section 3: Vaccine details - types, composition, administration methods.
327
-
328
- **Use ONLY for specific vaccine technical questions like:**
329
- - "What type of vaccine is used for diphtheria?"
330
- - "How is the MMR vaccine administered?"
331
- - "Vaccine composition and dosage"
332
-
333
- **Keywords:** vaccine type, composition, administration, dosage, technical details
334
-
335
  Args:
336
- query (str): Technical question about specific vaccines
337
-
338
  Returns:
339
- str: Technical vaccine information with citations
340
  """
341
- if 'three' not in section_retrievers:
342
- return "Section 3 retriever not available"
343
- return section_tool_wrapper(section_retrievers['three'], f'./data/{section_paths["three"]}', query)
344
 
345
- # Create FunctionTool objects with focused selection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  tools = [
347
- # Primary tools - most commonly used
348
- FunctionTool.from_defaults(
349
- name="algerian_guide_search",
350
- fn=guide_retrieval_tool,
351
- description="PRIMARY TOOL: Search the complete Algerian National Vaccination Guide for any vaccination-related question"
352
- ),
353
- FunctionTool.from_defaults(
354
- name="who_immunization_search",
355
- fn=immunization_tool,
356
- description="SECONDARY TOOL: Search WHO Immunization in Practice for international standards and WHO recommendations"
357
- ),
358
- # Specialized tools - use only when very specific
359
- FunctionTool.from_defaults(
360
- name="disease_definitions_search",
361
- fn=section_two_tool,
362
- description="SPECIALIZED: Search for specific disease definitions, symptoms, and characteristics"
363
- ),
364
- FunctionTool.from_defaults(
365
- name="vaccine_technical_search",
366
- fn=section_three_tool,
367
- description="SPECIALIZED: Search for technical vaccine details, composition, and administration methods"
368
- ),
369
  ]
370
 
371
  return tools
372
 
373
-
374
  def prepare_environment():
375
  """Main function to prepare the environment and return tools"""
376
  print("Setting up models...")
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ Environment preparation script for vaccine assistant
4
+ Creates vector stores and retrieval tools
5
  """
6
 
7
  import os
 
56
  # Get unique source IDs
57
  source_ids = list(set(all_ids))
58
 
59
+ # Filter out any non-UUID-like IDs (if needed)
60
+ # This is now optional as we're handling various source ID formats
61
+ # uuid_pattern = r'^[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}$'
62
+ # source_ids = [source_id for source_id in source_ids if re.match(uuid_pattern, source_id, re.IGNORECASE)]
63
+
64
  if not source_ids:
65
  print("Warning: No valid source IDs found after filtering.")
66
  return []
 
75
  model_name="intfloat/multilingual-e5-base"
76
  )
77
 
78
+ # Initialize LLM
79
  genai_api_key = os.getenv('GOOGLE_API_KEY')
80
  llm = ChatGoogleGenerativeAI(
81
  model="gemini-2.0-flash",
82
+ google_api_key=genai_api_key
 
83
  )
84
 
85
  return embedding_function, llm
86
 
 
87
  def create_vectorstore_from_json(json_path: str, collection_name: str, embedding_function):
88
  """Create vector store from JSON chunks"""
89
  # Load the chunks.json
 
115
  )
116
  return vectorstore, documents
117
 
 
118
  def create_retriever(vectorstore, docs, llm):
119
  """Create ensemble retriever with vector and BM25 search"""
120
  # Vector retriever
121
  vector_retriever = vectorstore.as_retriever(
122
  search_type="similarity",
123
+ search_kwargs={"k": 6}
124
  )
125
 
126
  # BM25 retriever
 
133
  weights=[0.5, 0.5]
134
  )
135
 
136
+ # Multi-query expanding retriever
137
  expanding_retriever = MultiQueryRetriever.from_llm(
138
  retriever=ensemble_retriever,
139
  llm=llm
 
141
 
142
  return expanding_retriever
143
 
 
144
  def convert_chromadb_to_llamaindex_nodes(chromadb_documents: List) -> List[TextNode]:
145
  """Convert ChromaDB Document objects to LlamaIndex TextNode objects"""
146
  nodes = []
 
162
  continue
163
  return nodes
164
 
 
165
  def section_tool_wrapper(retriever, section_path_chunks, query):
166
+ """Generic section tool wrapper"""
167
  try:
168
  retrieved_docs = retriever.get_relevant_documents(query)
169
  nodes_from_retrieved_docs = convert_chromadb_to_llamaindex_nodes(retrieved_docs)
 
178
  chunks_unique = [node for node in chunks_data if node.get('element_id', 'Unknown') in chunk_ids]
179
  combined_text = []
180
 
181
+ for chu in chunks_unique:
 
 
182
  if "TableElement" == chu["type"]:
183
+ text = f"[Source: {chu['element_id']}]\n CONTENT: \n{chu['text']}\n HTML: \n {chu['table_text_as_html']} \n\n"
184
  combined_text.append(text)
185
  else:
186
  for element in chu["elements"]:
187
+ text = f"[Source: {element['element_id']}]\n CONTENT: \n{element['text']} \n\n"
188
  combined_text.append(text)
189
 
190
  result = "\n---\n".join(combined_text)
 
194
  print(f"Error in section tool: {e}")
195
  return f"Error retrieving documents: {str(e)}"
196
 
 
197
  def create_section_tools(embedding_function, llm):
198
+ """Create all section-specific retrieval tools"""
199
 
200
  # Define section paths
201
  section_paths = {
 
214
  # Create retrievers for each section
215
  section_retrievers = {}
216
  for section, path in section_paths.items():
217
+ if os.path.exists(path):
218
  vstore, docs = create_vectorstore_from_json(f'./data/{path}', f"Guide_2023_{section}", embedding_function)
219
  section_retrievers[section] = create_retriever(vstore, docs, llm)
220
 
 
225
  guide_retriever = create_retriever(guide_vstore, guide_docs, llm)
226
  else:
227
  guide_retriever = None
228
+ # General-purpose tool (entire Algerian guide)
229
+ def guide_retrieval_tool(query: str) -> str:
230
+ """
231
+ General-purpose retrieval tool for the entire Algerian National Vaccination Guide (2023).
232
+
233
+ Use this tool when a query spans multiple sections or cannot be routed confidently to a specific tool.
234
+ This is the fallback and all-encompassing tool to retrieve any vaccination-related information
235
+ from the national guide.
236
+
237
+ Secondary source: The WHO Immunization Guide can be queried separately via `Immunization_in_Practice_tool`.
238
+
239
+ Args:
240
+ query (str): A general or complex question related to vaccination policy, schedules, or practice.
241
+
242
+ Returns:
243
+ str: Synthesized response based on the full Algerian guide.
244
+ """
245
+ if not guide_retriever:
246
+ return "Guide retriever not available"
247
+ return section_tool_wrapper(guide_retriever, guide_path, query)
248
 
249
  # Primary + Secondary Document Paths
250
+ immunization_path = './data/Immunization in Practice_WHO_eng_2015.json'
251
 
252
  # WHO Immunization in Practice Tool
253
  if os.path.exists(immunization_path):
 
260
  else:
261
  immunization_retriever = None
262
 
263
+ def immunization_tool(query: str) -> str:
 
 
264
  """
265
+ WHO Immunization in Practice 2015 retrieval tool.
266
+
267
+ Use this tool to provide global best practices and operational guidance on immunization,
268
+ especially when context or clarification is needed beyond the Algerian national guide.
269
+ This can serve as a secondary source for training, logistics, and procedural reference.
270
+
 
 
 
 
 
 
 
271
  Args:
272
+ query (str): A question related to immunization practice in general.
273
+
274
  Returns:
275
+ str: Retrieved guidance from the WHO Immunization in Practice manual (2015).
276
  """
277
+ if not immunization_retriever:
278
+ return "Immunization in Practice retriever not available"
279
+ return section_tool_wrapper(immunization_retriever, immunization_path, query)
280
 
281
+
282
+
283
+ # Section-Specific Tools (Primary: Algerian National Vaccination Guide)
284
+
285
+ def section_one_tool(query: str) -> str:
286
  """
287
+ Section 1: Programme Élargi de Vaccination (PEV)
288
+
289
+ Use this tool to retrieve information about the Algerian immunization program:
290
+ its objectives, historical background, strengths and weaknesses, and justification
291
+ for calendar updates.
292
+
293
+ Primary source: Algerian National Vaccination Guide, Section 1.
294
+ Secondary source for operational benchmarks: WHO Immunization in Practice (optional).
295
+
 
 
 
296
  Args:
297
+ query (str): A question about Algeria’s national immunization strategy.
298
+
299
  Returns:
300
+ str: Relevant content from Section 1 of the guide.
301
  """
302
+ return section_tool_wrapper(section_retrievers['one'], section_paths['one'], query)
 
 
303
 
 
304
 
305
  def section_two_tool(query: str) -> str:
306
  """
307
+ Section 2: Maladies Ciblées par la Vaccination
308
+
309
+ Use this tool for questions about the diseases targeted by the national vaccination calendar:
310
+ symptoms, transmission, complications, and prevention strategies.
311
+
312
+ Primary source: Algerian National Guide, Section 2.
313
+ Secondary source: WHO guide may support contextual insights.
314
+
 
 
 
315
  Args:
316
+ query (str): A question about a vaccine-preventable disease (e.g. polio, rougeole).
317
+
318
  Returns:
319
+ str: Disease-specific guidance from Section 2.
320
  """
321
+ return section_tool_wrapper(section_retrievers['two'], section_paths['two'], query)
322
+
 
323
 
324
  def section_three_tool(query: str) -> str:
325
  """
326
+ Section 3: Vaccins du Calendrier
327
+
328
+ Use this tool to retrieve technical and procedural information about the vaccines used in the calendar:
329
+ names, contents, administration method, and dosing details.
330
+
 
 
 
 
 
 
331
  Args:
332
+ query (str): A question about a specific vaccine's type or method of use.
333
+
334
  Returns:
335
+ str: Vaccine information from Section 3.
336
  """
337
+ return section_tool_wrapper(section_retrievers['three'], section_paths['three'], query)
 
 
338
 
339
+
340
+ def section_four_tool(query: str) -> str:
341
+ """
342
+ Section 4: Rattrapage Vaccinal
343
+
344
+ Use this tool to determine catch-up strategies for children who missed or delayed one or more doses.
345
+ It provides age-adjusted rescheduling rules and justifications.
346
+
347
+ Args:
348
+ query (str): A question about how to manage missed vaccinations.
349
+
350
+ Returns:
351
+ str: Catch-up guidelines from Section 4.
352
+ """
353
+ return section_tool_wrapper(section_retrievers['four'], section_paths['four'], query)
354
+
355
+
356
+ def section_five_tool(query: str) -> str:
357
+ """
358
+ Section 5: Vaccination des Populations Particulières
359
+
360
+ Use this tool to retrieve recommendations for specific medical contexts:
361
+ preterm infants, immunocompromised children, allergies (e.g. eggs), and chronic diseases.
362
+
363
+ Args:
364
+ query (str): A question about vaccination adaptations for vulnerable groups.
365
+
366
+ Returns:
367
+ str: Guidelines from Section 5.
368
+ """
369
+ return section_tool_wrapper(section_retrievers['five'], section_paths['five'], query)
370
+
371
+
372
+ def section_six_tool(query: str) -> str:
373
+ """
374
+ Section 6: Chaîne du Froid
375
+
376
+ Use this tool for logistics, storage conditions, temperature monitoring,
377
+ and emergency procedures in case of cold chain failure.
378
+
379
+ Args:
380
+ query (str): A question about how vaccines should be stored and transported.
381
+
382
+ Returns:
383
+ str: Operational cold chain standards from Section 6.
384
+ """
385
+ return section_tool_wrapper(section_retrievers['six'], section_paths['six'], query)
386
+
387
+
388
+ def section_seven_tool(query: str) -> str:
389
+ """
390
+ Section 7: Sécurité des Injections
391
+
392
+ Use this tool to ensure injection safety: handling equipment, preventing needle-stick injuries,
393
+ and disposing of biomedical waste.
394
+
395
+ Args:
396
+ query (str): A question about safe injection practices.
397
+
398
+ Returns:
399
+ str: Procedures and guidelines from Section 7.
400
+ """
401
+ return section_tool_wrapper(section_retrievers['seven'], section_paths['seven'], query)
402
+
403
+
404
+ def section_eight_tool(query: str) -> str:
405
+ """
406
+ Section 8: Tenue d'une Séance de Vaccination & Vaccinovigilance
407
+
408
+ Use this tool to plan and monitor vaccination sessions, including material preparation,
409
+ injection recording, and handling of adverse events post-immunization (AEFI).
410
+
411
+ Args:
412
+ query (str): A question about session operations or vaccine side effect monitoring.
413
+
414
+ Returns:
415
+ str: Guidelines from Section 8.
416
+ """
417
+ return section_tool_wrapper(section_retrievers['eight'], section_paths['eight'], query)
418
+
419
+
420
+ def section_nine_tool(query: str) -> str:
421
+ """
422
+ Section 9: Planification des Séances de Vaccination
423
+
424
+ Use this tool to support logistical planning: mapping, resource estimation,
425
+ scheduling, and stock management.
426
+
427
+ Args:
428
+ query (str): A question about planning and organizing vaccination sessions.
429
+
430
+ Returns:
431
+ str: Recommendations from Section 9.
432
+ """
433
+ return section_tool_wrapper(section_retrievers['nine'], section_paths['nine'], query)
434
+
435
+
436
+ def section_ten_tool(query: str) -> str:
437
+ """
438
+ Section 10: Mobilisation Sociale
439
+
440
+ Use this tool for strategies to increase public engagement, combat vaccine hesitancy,
441
+ and manage misinformation.
442
+
443
+ Args:
444
+ query (str): A question about public communication and trust-building around vaccines.
445
+
446
+ Returns:
447
+ str: Social mobilization approaches from Section 10.
448
+ """
449
+ return section_tool_wrapper(section_retrievers['ten'], section_paths['ten'], query)
450
+
451
+ # Create FunctionTool objects
452
  tools = [
453
+ FunctionTool.from_defaults(name="Guide_vector_tool", fn=guide_retrieval_tool),
454
+ FunctionTool.from_defaults(name="Immunization_in_Practice_tool", fn=immunization_tool),
455
+ # Section-specific tools
456
+ FunctionTool.from_defaults(name="section_one_vector_query_tool", fn=section_one_tool),
457
+ FunctionTool.from_defaults(name="section_two_vector_query_tool", fn=section_two_tool),
458
+ FunctionTool.from_defaults(name="section_three_vector_query_tool", fn=section_three_tool),
459
+ FunctionTool.from_defaults(name="section_four_vector_query_tool", fn=section_four_tool),
460
+ FunctionTool.from_defaults(name="section_five_vector_query_tool", fn=section_five_tool),
461
+ FunctionTool.from_defaults(name="section_six_vector_query_tool", fn=section_six_tool),
462
+ FunctionTool.from_defaults(name="section_seven_vector_query_tool", fn=section_seven_tool),
463
+ FunctionTool.from_defaults(name="section_eight_vector_query_tool", fn=section_eight_tool),
464
+ FunctionTool.from_defaults(name="section_nine_vector_query_tool", fn=section_nine_tool),
465
+ FunctionTool.from_defaults(name="section_ten_vector_query_tool", fn=section_ten_tool),
 
 
 
 
 
 
 
 
 
466
  ]
467
 
468
  return tools
469
 
 
470
  def prepare_environment():
471
  """Main function to prepare the environment and return tools"""
472
  print("Setting up models...")
rag_pipeline.py CHANGED
@@ -1,6 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- Enhanced RAG Pipeline for vaccine assistant - Fixed version with max iterations control
4
  Handles agent creation and question answering with sequential citation numbering
5
  """
6
 
@@ -97,6 +97,89 @@ def convert_citations_to_sequential(response_text, source_id_to_number_map):
97
  return sequential_response
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def create_safe_custom_prompt(tools, llm):
101
  """Create a safe version that won't have formatting conflicts"""
102
 
@@ -106,21 +189,13 @@ You are a helpful and knowledgeable AI-powered vaccine assistant designed to sup
106
  You provide evidence-based guidance using only information from official vaccine medical documents.
107
  Answer the doctor's question accurately and concisely using only the provided information.
108
 
109
- ## CRITICAL RULES FOR EFFICIENCY
110
-
111
- ### Tool Usage Strategy
112
- 1. **MAXIMUM 3 TOOL CALLS**: You must provide a complete answer within 3 tool calls maximum.
113
- 2. **Smart Tool Selection**: Choose the most relevant tool first based on the question topic.
114
- 3. **Comparative Questions**: For questions comparing documents/protocols:
115
- - First tool call: Get information from primary source (e.g., Algerian guide)
116
- - Second tool call: Get information from secondary source (e.g., WHO document)
117
- - Third tool call: Only if absolutely necessary for missing details
118
- 4. **Stop Early**: If you have sufficient information after 1-2 tool calls, provide your answer immediately.
119
 
120
  ### Citation and Sourcing
121
  1. For each fact in your response, include an inline citation in the format [Source] immediately following the information, e.g., [e795ebd28318886c0b1a5395ac30ad90].
122
  2. Do NOT use 'Source:' in the citation format; use only the Source in square brackets.
123
- 3. If a fact is supported by multiple sources, use adjacent citations: [source1][source2]
 
124
  4. Use ONLY the provided information and never include facts from your general knowledge.
125
 
126
  ### Content Formatting
@@ -131,12 +206,6 @@ Answer the doctor's question accurately and concisely using only the provided in
131
  2. For lists, maintain the original bullet points/numbering and include citations.
132
  3. Present information concisely but ensure clinical accuracy is never compromised.
133
 
134
- ### Answer Completeness Guidelines
135
- - If you find relevant information from 1-2 sources, synthesize and provide a complete answer
136
- - Don't keep searching for more sources unless critical information is missing
137
- - For comparative questions, clearly structure your answer with sections for each source
138
- - If information is not available in the documents, clearly state this limitation
139
-
140
  ---
141
 
142
  """
@@ -163,38 +232,34 @@ Answer the doctor's question accurately and concisely using only the provided in
163
  # Even safer fallback
164
  return PromptTemplate(template=safe_template)
165
 
166
-
167
  def create_agent(tools, llm):
168
- """Create the ReAct agent with custom prompt and controlled max iterations"""
169
 
170
- # Create agent with controlled max iterations (reduced from default 10 to 5)
171
  agent = ReActAgent.from_tools(
172
  tools,
173
  llm=llm,
174
  verbose=True,
175
- max_iterations=5, # Reduced max iterations
176
  )
177
 
178
  # Create and apply safe custom prompt
179
  try:
180
  safe_custom_prompt = create_safe_custom_prompt(tools, llm)
181
  agent.update_prompts({"agent_worker:system_prompt": safe_custom_prompt})
182
- print("✅ Successfully updated with safe custom prompt and max_iterations=5")
183
  except Exception as e:
184
  print(f"❌ Safe prompt update failed: {e}")
185
  print("⚠️ Using original agent without modifications")
186
 
187
  return agent
188
 
189
-
190
  def initialize_rag_pipeline(tools):
191
  """Initialize the RAG pipeline with tools"""
192
 
193
- # Initialize LlamaIndex LLM with specific parameters to improve efficiency
194
  llama_index_llm = GoogleGenAI(
195
  model="models/gemini-2.0-flash",
196
  api_key=os.getenv('GOOGLE_API_KEY'),
197
- temperature=0.1, # Lower temperature for more focused responses
198
  )
199
 
200
  # Create agent
@@ -202,26 +267,14 @@ def initialize_rag_pipeline(tools):
202
 
203
  return agent
204
 
205
-
206
  def process_question(agent, question: str) -> str:
207
- """Process a question through the RAG pipeline with timeout handling"""
208
  try:
209
- # Add timeout/retry logic
210
  response = agent.chat(question)
211
  return response.response
212
  except Exception as e:
213
- error_msg = str(e)
214
- print(f"Error processing question: {error_msg}")
215
-
216
- # Handle specific "max iterations" error
217
- if "max iterations" in error_msg.lower() or "reached max" in error_msg.lower():
218
- return ("I apologize, but I was unable to find a complete answer within the allowed search attempts. "
219
- "This might be because the specific comparison you're asking about requires information "
220
- "that spans multiple sections of the documents. Could you please rephrase your question "
221
- "to be more specific about which aspect of the difference you're most interested in?")
222
-
223
- return f"Error processing your question: {error_msg}"
224
-
225
 
226
  def aswer_language_detection(response_text: str) -> str:
227
  """
@@ -233,23 +286,24 @@ def aswer_language_detection(response_text: str) -> str:
233
  Returns:
234
  str: Detected language code (e.g., 'en', 'fr', etc.)
235
  """
 
236
  try:
237
- # Detect the language of the first 5 words of the response
238
- first_line = " ".join(response_text.split()[:5])
239
- first_line = re.sub(r'\[.*?\]', '', first_line) # Remove citations
240
- answer_language = detect(first_line)
241
- if answer_language not in ['en', 'ar', 'fr']:
242
- answer_language = 'en'
243
  except:
244
- answer_language = 'en'
245
 
246
- return answer_language
 
247
 
248
 
249
  def process_question_with_sequential_citations(agent, question: str, chunks_directory="./data/") -> dict:
250
  """
251
  Process a question through the RAG pipeline and return response with sequential citation numbers.
252
- Enhanced with better error handling for max iterations.
253
 
254
  Args:
255
  agent: The initialized RAG agent
@@ -265,18 +319,10 @@ def process_question_with_sequential_citations(agent, question: str, chunks_dire
265
  }
266
  """
267
  try:
268
- # Get the response from the agent with improved error handling
269
  response = agent.chat(question)
270
  response_text = response.response
271
 
272
- # Check if the response indicates max iterations was reached
273
- if "max iterations" in response_text.lower() or len(response_text.strip()) == 0:
274
- # Provide a more helpful fallback response
275
- response_text = ("I apologize, but I encountered difficulties processing your comparative question "
276
- "within the allowed search attempts. For questions comparing different protocols "
277
- "or documents, please try asking about each aspect separately. For example, "
278
- "first ask about the Algerian definition of Diphtheria, then ask about the WHO definition.")
279
-
280
  # Extract source IDs from the response (preserving order)
281
  unique_ids = extract_source_ids(response_text)
282
 
@@ -320,40 +366,25 @@ def process_question_with_sequential_citations(agent, question: str, chunks_dire
320
 
321
  # Convert to JSON
322
  cited_elements_json = json.dumps(cited_elements_ordered, ensure_ascii=False, indent=2)
323
- answer_language = aswer_language_detection(response_text)
324
-
325
  return {
326
  "response": sequential_response,
327
  "cited_elements_json": cited_elements_json,
328
  "unique_ids": unique_ids,
329
  "citation_mapping": source_id_to_number,
330
- "answer_language": answer_language
331
  }
332
 
333
  except Exception as e:
334
- error_msg = str(e)
335
- print(f"Error processing question: {error_msg}")
336
-
337
- # Create appropriate fallback response based on error type
338
- if "max iterations" in error_msg.lower() or "reached max" in error_msg.lower():
339
- fallback_response = ("I apologize, but I was unable to complete the comparison within the allowed search attempts. "
340
- "For complex comparative questions like yours about the differences between Algerian and WHO "
341
- "definitions of Diphtheria, please try asking about each source separately: \n\n"
342
- "1. First ask: 'What is the definition of Diphtheria in the Algerian vaccination guide?'\n"
343
- "2. Then ask: 'What is the definition of Diphtheria in the WHO document?'\n\n"
344
- "This will help me provide you with more focused and complete information.")
345
- else:
346
- fallback_response = f"I encountered an error while processing your question: {error_msg}"
347
-
348
  return {
349
- "response": fallback_response,
350
  "cited_elements_json": "[]",
351
  "unique_ids": [],
352
  "citation_mapping": {},
353
- "answer_language": "en"
354
  }
355
 
356
-
357
  def process_question_with_citations(agent, question: str, chunks_directory="./data/") -> dict:
358
  """
359
  Legacy function - maintained for backward compatibility.
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ Enhanced RAG Pipeline for vaccine assistant
4
  Handles agent creation and question answering with sequential citation numbering
5
  """
6
 
 
97
  return sequential_response
98
 
99
 
100
+ def create_custom_prompt():
101
+ """Create custom prompt with medical assistant instructions"""
102
+
103
+ custom_instructions = """
104
+ ## MEDICAL ASSISTANT ROLE
105
+ You are a helpful and knowledgeable AI-powered vaccine assistant designed to support doctors in clinical decision-making.
106
+ You provide evidence-based guidance using only information from official vaccine medical documents.
107
+ Answer the doctor's question accurately and concisely using only the provided information.
108
+
109
+ ## IMPORTANT REQUIREMENTS
110
+
111
+ ### Citation and Sourcing
112
+ 1. For each fact in your response, include an inline citation in the format [Source] immediately following the information, e.g., [e795ebd28318886c0b1a5395ac30ad90].
113
+ 2. Do NOT use 'Source:' in the citation format; use only the Source in square brackets.
114
+ 3. If a fact is supported by multiple sources, use the following format:
115
+ - Use adjacent citations: [e795ebd28318886c0b1a5395ac30ad90][21a932b2340bb16707763f57f0ad2]
116
+ 4. Use ONLY the provided information and never include facts from your general knowledge.
117
+
118
+ ### Content Formatting
119
+ 1. When rendering tables:
120
+ - Convert HTML tables into clean Markdown format
121
+ - Preserve all original headers and data rows exactly
122
+ - Include the citation in the table caption, e.g., 'Table: Vaccination Schedule [Source]'
123
+ 2. For lists, maintain the original bullet points/numbering and include citations.
124
+ 3. Present information concisely but ensure clinical accuracy is never compromised.
125
+
126
+ ## Tools
127
+
128
+ You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
129
+ This may require breaking the task into subtasks and using different tools to complete each subtask.
130
+
131
+ You have access to the following tools:
132
+ {tool_desc}
133
+
134
+ ## Output Format
135
+
136
+ Please answer in the same language as the question and use the following format:
137
+
138
+ ```
139
+ Thought: The current language of the user is: (user's language). I need to use a tool to help me answer the question.
140
+ Action: tool name (one of {tool_names}) if using a tool.
141
+ Action Input: the input to the tool, in a JSON format representing the kwargs (e.g. {{"input": "hello world", "num_beams": 5}})
142
+ ```
143
+
144
+ Please ALWAYS start with a Thought.
145
+
146
+ NEVER surround your response with markdown code markers. You may use code markers within your response if you need to.
147
+
148
+ Please use a valid JSON format for the Action Input. Do NOT do this {{"input": "hello world", "num_beams": 5}}.
149
+
150
+ If this format is used, the tool will respond in the following format:
151
+
152
+ ```
153
+ Observation: tool response
154
+ ```
155
+
156
+ You should keep repeating the above format till you have enough information to answer the question without using any more tools. At that point, you MUST respond in one of the following two formats:
157
+
158
+ ```
159
+ Thought: I can answer without using any more tools. I'll use the user's language to answer. Remember to include proper citations
160
+ Answer: [your answer here with proper citations (In the same language as the user's question)]
161
+ ```
162
+
163
+ ```
164
+ Thought: I cannot answer the question with the provided tools.
165
+ Answer: [your answer here (In the same language as the user's question)]
166
+ ```
167
+
168
+ ## Current Conversation
169
+
170
+ Below is the current conversation consisting of interleaving human and assistant messages.
171
+ """
172
+
173
+ try:
174
+ custom_prompt = PromptTemplate(
175
+ template=custom_instructions,
176
+ template_vars=["tool_desc", "tool_names"]
177
+ )
178
+ return custom_prompt
179
+ except:
180
+ # Fallback to simple template
181
+ return PromptTemplate(template=custom_instructions)
182
+
183
  def create_safe_custom_prompt(tools, llm):
184
  """Create a safe version that won't have formatting conflicts"""
185
 
 
189
  You provide evidence-based guidance using only information from official vaccine medical documents.
190
  Answer the doctor's question accurately and concisely using only the provided information.
191
 
192
+ ## IMPORTANT REQUIREMENTS
 
 
 
 
 
 
 
 
 
193
 
194
  ### Citation and Sourcing
195
  1. For each fact in your response, include an inline citation in the format [Source] immediately following the information, e.g., [e795ebd28318886c0b1a5395ac30ad90].
196
  2. Do NOT use 'Source:' in the citation format; use only the Source in square brackets.
197
+ 3. If a fact is supported by multiple sources, use the following format:
198
+ - Use adjacent citations: [e795ebd28318886c0b1a5395ac30ad90][21a932b2340bb16707763f57f0ad2]
199
  4. Use ONLY the provided information and never include facts from your general knowledge.
200
 
201
  ### Content Formatting
 
206
  2. For lists, maintain the original bullet points/numbering and include citations.
207
  3. Present information concisely but ensure clinical accuracy is never compromised.
208
 
 
 
 
 
 
 
209
  ---
210
 
211
  """
 
232
  # Even safer fallback
233
  return PromptTemplate(template=safe_template)
234
 
 
235
  def create_agent(tools, llm):
236
+ """Create the ReAct agent with custom prompt"""
237
 
238
+ # Create agent
239
  agent = ReActAgent.from_tools(
240
  tools,
241
  llm=llm,
242
  verbose=True,
 
243
  )
244
 
245
  # Create and apply safe custom prompt
246
  try:
247
  safe_custom_prompt = create_safe_custom_prompt(tools, llm)
248
  agent.update_prompts({"agent_worker:system_prompt": safe_custom_prompt})
249
+ print("✅ Successfully updated with safe custom prompt")
250
  except Exception as e:
251
  print(f"❌ Safe prompt update failed: {e}")
252
  print("⚠️ Using original agent without modifications")
253
 
254
  return agent
255
 
 
256
  def initialize_rag_pipeline(tools):
257
  """Initialize the RAG pipeline with tools"""
258
 
259
+ # Initialize LlamaIndex LLM
260
  llama_index_llm = GoogleGenAI(
261
  model="models/gemini-2.0-flash",
262
  api_key=os.getenv('GOOGLE_API_KEY'),
 
263
  )
264
 
265
  # Create agent
 
267
 
268
  return agent
269
 
 
270
  def process_question(agent, question: str) -> str:
271
+ """Process a question through the RAG pipeline"""
272
  try:
 
273
  response = agent.chat(question)
274
  return response.response
275
  except Exception as e:
276
+ print(f"Error processing question: {e}")
277
+ return f"Error processing your question: {str(e)}"
 
 
 
 
 
 
 
 
 
 
278
 
279
  def aswer_language_detection(response_text: str) -> str:
280
  """
 
286
  Returns:
287
  str: Detected language code (e.g., 'en', 'fr', etc.)
288
  """
289
+
290
  try:
291
+ # Detect the language of the first 5 words of the response
292
+ first_line = " ".join(response_text.split()[:5])
293
+ first_line = re.sub(r'\[.*?\]', '', first_line) # Remove citations
294
+ answer_language = detect(first_line)
295
+ if answer_language not in ['en', 'ar', 'fr']:
296
+ answer_language ='en'
297
  except:
298
+ answer_language ='en'
299
 
300
+ finally:
301
+ return answer_language
302
 
303
 
304
  def process_question_with_sequential_citations(agent, question: str, chunks_directory="./data/") -> dict:
305
  """
306
  Process a question through the RAG pipeline and return response with sequential citation numbers.
 
307
 
308
  Args:
309
  agent: The initialized RAG agent
 
319
  }
320
  """
321
  try:
322
+ # Get the response from the agent
323
  response = agent.chat(question)
324
  response_text = response.response
325
 
 
 
 
 
 
 
 
 
326
  # Extract source IDs from the response (preserving order)
327
  unique_ids = extract_source_ids(response_text)
328
 
 
366
 
367
  # Convert to JSON
368
  cited_elements_json = json.dumps(cited_elements_ordered, ensure_ascii=False, indent=2)
369
+ aswer_language= aswer_language_detection(response_text)
 
370
  return {
371
  "response": sequential_response,
372
  "cited_elements_json": cited_elements_json,
373
  "unique_ids": unique_ids,
374
  "citation_mapping": source_id_to_number,
375
+ "answer_language":aswer_language
376
  }
377
 
378
  except Exception as e:
379
+ print(f"Error processing question: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  return {
381
+ "response": response_text if 'response_text' in locals() else "Error occurred",
382
  "cited_elements_json": "[]",
383
  "unique_ids": [],
384
  "citation_mapping": {},
385
+ "answer_language": "en" # Default to English if not specified
386
  }
387
 
 
388
  def process_question_with_citations(agent, question: str, chunks_directory="./data/") -> dict:
389
  """
390
  Legacy function - maintained for backward compatibility.