Chris commited on
Commit
5ec1e1b
·
1 Parent(s): 0b92da3

Final 7.0.3

Browse files
src/agents/__pycache__/router.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/router.cpython-310.pyc and b/src/agents/__pycache__/router.cpython-310.pyc differ
 
src/agents/__pycache__/synthesizer.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/synthesizer.cpython-310.pyc and b/src/agents/__pycache__/synthesizer.cpython-310.pyc differ
 
src/agents/reasoning_agent.py CHANGED
@@ -52,6 +52,8 @@ class ReasoningAgent:
52
  result = self._process_pattern_analysis(state)
53
  elif strategy == "step_by_step":
54
  result = self._process_step_by_step(state)
 
 
55
  else:
56
  result = self._process_general_reasoning(state)
57
 
@@ -286,17 +288,17 @@ class ReasoningAgent:
286
  Be systematic and show your reasoning.
287
  """
288
 
289
- model_tier = ModelTier.MAIN
290
  llm_result = self.llm_client.generate(pattern_prompt, tier=model_tier, max_tokens=500)
291
 
292
  if llm_result.success:
293
- confidence = 0.75 if numbers else 0.65 # Higher confidence with numerical data
294
  return AgentResult(
295
  agent_role=AgentRole.REASONING_AGENT,
296
  success=True,
297
  result=llm_result.response,
298
  confidence=confidence,
299
- reasoning="Analyzed patterns and sequences",
300
  model_used=llm_result.model_used,
301
  processing_time=llm_result.response_time,
302
  cost_estimate=llm_result.cost_estimate
@@ -323,7 +325,7 @@ class ReasoningAgent:
323
  Be thorough and explain each step.
324
  """
325
 
326
- model_tier = ModelTier.MAIN
327
  llm_result = self.llm_client.generate(step_prompt, tier=model_tier, max_tokens=600)
328
 
329
  if llm_result.success:
@@ -331,8 +333,8 @@ class ReasoningAgent:
331
  agent_role=AgentRole.REASONING_AGENT,
332
  success=True,
333
  result=llm_result.response,
334
- confidence=0.75,
335
- reasoning="Provided step-by-step solution",
336
  model_used=llm_result.model_used,
337
  processing_time=llm_result.response_time,
338
  cost_estimate=llm_result.cost_estimate
@@ -354,7 +356,7 @@ class ReasoningAgent:
354
  Consider all aspects of the question and explain your reasoning.
355
  """
356
 
357
- model_tier = ModelTier.MAIN
358
  llm_result = self.llm_client.generate(reasoning_prompt, tier=model_tier, max_tokens=500)
359
 
360
  if llm_result.success:
@@ -362,8 +364,8 @@ class ReasoningAgent:
362
  agent_role=AgentRole.REASONING_AGENT,
363
  success=True,
364
  result=llm_result.response,
365
- confidence=0.70,
366
- reasoning="Applied general reasoning and analysis",
367
  model_used=llm_result.model_used,
368
  processing_time=llm_result.response_time,
369
  cost_estimate=llm_result.cost_estimate
@@ -450,7 +452,7 @@ class ReasoningAgent:
450
  Please provide a direct answer incorporating these calculations.
451
  """
452
 
453
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=400)
454
 
455
  if llm_result.success:
456
  return AgentResult(
@@ -495,7 +497,7 @@ class ReasoningAgent:
495
  Please provide a direct answer based on this statistical analysis.
496
  """
497
 
498
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=400)
499
 
500
  if llm_result.success:
501
  return AgentResult(
@@ -534,7 +536,7 @@ class ReasoningAgent:
534
  Please provide a direct answer incorporating this conversion.
535
  """
536
 
537
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.ROUTER, max_tokens=300)
538
 
539
  if llm_result.success:
540
  return AgentResult(
@@ -568,7 +570,7 @@ class ReasoningAgent:
568
  Provide a clear numerical answer.
569
  """
570
 
571
- model_tier = ModelTier.MAIN
572
  llm_result = self.llm_client.generate(math_prompt, tier=model_tier, max_tokens=500)
573
 
574
  if llm_result.success:
@@ -598,7 +600,7 @@ class ReasoningAgent:
598
  Apply statistical reasoning and provide a clear answer.
599
  """
600
 
601
- model_tier = ModelTier.MAIN
602
  llm_result = self.llm_client.generate(stats_prompt, tier=model_tier, max_tokens=400)
603
 
604
  if llm_result.success:
@@ -628,7 +630,7 @@ class ReasoningAgent:
628
  Apply conversion reasoning and provide a clear answer.
629
  """
630
 
631
- model_tier = ModelTier.ROUTER
632
  llm_result = self.llm_client.generate(conversion_prompt, tier=model_tier, max_tokens=300)
633
 
634
  if llm_result.success:
@@ -677,7 +679,7 @@ class ReasoningAgent:
677
  """
678
 
679
  # Use main model for fallback
680
- llm_result = self.llm_client.generate(fallback_prompt, tier=ModelTier.MAIN, max_tokens=400)
681
 
682
  if llm_result.success:
683
  return AgentResult(
 
52
  result = self._process_pattern_analysis(state)
53
  elif strategy == "step_by_step":
54
  result = self._process_step_by_step(state)
55
+ elif strategy == "general_reasoning":
56
+ result = self._process_general_reasoning(state)
57
  else:
58
  result = self._process_general_reasoning(state)
59
 
 
288
  Be systematic and show your reasoning.
289
  """
290
 
291
+ model_tier = ModelTier.COMPLEX # Use 72B model for pattern analysis
292
  llm_result = self.llm_client.generate(pattern_prompt, tier=model_tier, max_tokens=500)
293
 
294
  if llm_result.success:
295
+ confidence = 0.85 if numbers else 0.75 # Higher confidence with numerical data
296
  return AgentResult(
297
  agent_role=AgentRole.REASONING_AGENT,
298
  success=True,
299
  result=llm_result.response,
300
  confidence=confidence,
301
+ reasoning="Analyzed patterns and sequences with 72B model",
302
  model_used=llm_result.model_used,
303
  processing_time=llm_result.response_time,
304
  cost_estimate=llm_result.cost_estimate
 
325
  Be thorough and explain each step.
326
  """
327
 
328
+ model_tier = ModelTier.COMPLEX # Use 72B model for step-by-step reasoning
329
  llm_result = self.llm_client.generate(step_prompt, tier=model_tier, max_tokens=600)
330
 
331
  if llm_result.success:
 
333
  agent_role=AgentRole.REASONING_AGENT,
334
  success=True,
335
  result=llm_result.response,
336
+ confidence=0.85, # Higher confidence with 72B model
337
+ reasoning="Provided step-by-step solution with 72B model",
338
  model_used=llm_result.model_used,
339
  processing_time=llm_result.response_time,
340
  cost_estimate=llm_result.cost_estimate
 
356
  Consider all aspects of the question and explain your reasoning.
357
  """
358
 
359
+ model_tier = ModelTier.COMPLEX # Use 72B model for general reasoning
360
  llm_result = self.llm_client.generate(reasoning_prompt, tier=model_tier, max_tokens=500)
361
 
362
  if llm_result.success:
 
364
  agent_role=AgentRole.REASONING_AGENT,
365
  success=True,
366
  result=llm_result.response,
367
+ confidence=0.80, # Higher confidence with 72B model
368
+ reasoning="Applied general reasoning and analysis with 72B model",
369
  model_used=llm_result.model_used,
370
  processing_time=llm_result.response_time,
371
  cost_estimate=llm_result.cost_estimate
 
452
  Please provide a direct answer incorporating these calculations.
453
  """
454
 
455
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
456
 
457
  if llm_result.success:
458
  return AgentResult(
 
497
  Please provide a direct answer based on this statistical analysis.
498
  """
499
 
500
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
501
 
502
  if llm_result.success:
503
  return AgentResult(
 
536
  Please provide a direct answer incorporating this conversion.
537
  """
538
 
539
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
540
 
541
  if llm_result.success:
542
  return AgentResult(
 
570
  Provide a clear numerical answer.
571
  """
572
 
573
+ model_tier = ModelTier.COMPLEX # Use 72B model for mathematical reasoning
574
  llm_result = self.llm_client.generate(math_prompt, tier=model_tier, max_tokens=500)
575
 
576
  if llm_result.success:
 
600
  Apply statistical reasoning and provide a clear answer.
601
  """
602
 
603
+ model_tier = ModelTier.COMPLEX
604
  llm_result = self.llm_client.generate(stats_prompt, tier=model_tier, max_tokens=400)
605
 
606
  if llm_result.success:
 
630
  Apply conversion reasoning and provide a clear answer.
631
  """
632
 
633
+ model_tier = ModelTier.COMPLEX
634
  llm_result = self.llm_client.generate(conversion_prompt, tier=model_tier, max_tokens=300)
635
 
636
  if llm_result.success:
 
679
  """
680
 
681
  # Use main model for fallback
682
+ llm_result = self.llm_client.generate(fallback_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
683
 
684
  if llm_result.success:
685
  return AgentResult(
src/agents/router.py CHANGED
@@ -24,47 +24,48 @@ class RouterAgent:
24
 
25
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
- Main routing function - analyzes question and updates state with routing decisions
28
  """
29
- logger.info(f"Routing question: {state.question[:100]}...")
30
- state.add_processing_step("Router: Starting question analysis")
31
-
32
- # Step 1: Enhanced question classification with multi-type detection
33
- question_types, primary_type = self._classify_question_types(state.question, state.file_name)
34
- state.question_type = primary_type
35
- state.add_processing_step(f"Router: Primary type: {primary_type.value}, All types: {[t.value for t in question_types]}")
36
-
37
- # Step 2: Complexity assessment
38
- complexity = self._assess_complexity(state.question)
39
- state.complexity_assessment = complexity
40
- state.add_processing_step(f"Router: Assessed complexity as {complexity}")
41
-
42
- # Step 3: Select appropriate agents with sequencing
43
- selected_agents = self._select_agents_enhanced(question_types, primary_type, state.file_name is not None, complexity)
44
- state.selected_agents = selected_agents
45
- state.add_processing_step(f"Router: Selected agents: {[a.value for a in selected_agents]}")
46
-
47
- # Step 4: Estimate cost
48
- estimated_cost = self._estimate_cost(complexity, selected_agents)
49
- state.estimated_cost = estimated_cost
50
- state.add_processing_step(f"Router: Estimated cost: ${estimated_cost:.4f}")
51
-
52
- # Step 5: Create routing decision summary
53
- state.routing_decision = {
54
- "primary_type": primary_type.value,
55
- "all_types": [t.value for t in question_types],
56
- "complexity": complexity,
57
- "agents": [agent.value for agent in selected_agents],
58
- "estimated_cost": estimated_cost,
59
- "reasoning": self._get_routing_reasoning(primary_type, complexity, selected_agents, question_types)
60
- }
61
-
62
- # Step 6: Use LLM for complex routing decisions if needed
63
- if complexity == "complex" or primary_type == QuestionType.UNKNOWN or len(question_types) > 2:
64
- state = self._llm_enhanced_routing(state)
65
 
66
- logger.info(f"✅ Routing complete: {primary_type.value} -> {[a.value for a in selected_agents]}")
67
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def _classify_question_types(self, question: str, file_name: str = None) -> Tuple[List[QuestionType], QuestionType]:
70
  """
@@ -458,4 +459,131 @@ class RouterAgent:
458
  state.add_error(f"LLM routing error: {str(e)}")
459
  logger.error(f"LLM routing failed: {e}")
460
 
461
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
+ Main routing function - analyzes question and determines processing strategy
28
  """
29
+ logger.info(f"🧭 Router: Analyzing question type and complexity")
30
+ state.add_processing_step("Router: Analyzing question and selecting agents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ try:
33
+ # Analyze question patterns for classification
34
+ question_types, primary_type = self._classify_question_types(state.question, state.file_name)
35
+ state.question_types = question_types
36
+ state.primary_question_type = primary_type
37
+
38
+ # Use 72B model for complex routing decisions
39
+ llm_classification = self._get_llm_classification(state.question)
40
+
41
+ # Combine pattern-based and LLM-based classification
42
+ final_types, final_primary = self._combine_classifications(
43
+ question_types, primary_type, llm_classification
44
+ )
45
+
46
+ # Update state with final classification
47
+ state.question_types = final_types
48
+ state.primary_question_type = final_primary
49
+
50
+ # Select agents based on question types
51
+ selected_agents = self._select_agents(final_types, final_primary, state.question)
52
+ state.selected_agents = selected_agents
53
+
54
+ logger.info(f"✅ Routing complete: {final_primary.value} -> {[a.value for a in selected_agents]}")
55
+ state.add_processing_step(f"Router: Selected agents - {[a.value for a in selected_agents]}")
56
+
57
+ return state
58
+
59
+ except Exception as e:
60
+ error_msg = f"Router failed: {str(e)}"
61
+ logger.error(error_msg)
62
+ state.add_error(error_msg)
63
+
64
+ # Fallback to web researcher for unknown questions
65
+ state.selected_agents = [AgentRole.WEB_RESEARCHER]
66
+ state.primary_question_type = QuestionType.WEB_RESEARCH
67
+
68
+ return state
69
 
70
  def _classify_question_types(self, question: str, file_name: str = None) -> Tuple[List[QuestionType], QuestionType]:
71
  """
 
459
  state.add_error(f"LLM routing error: {str(e)}")
460
  logger.error(f"LLM routing failed: {e}")
461
 
462
+ return state
463
+
464
+ def _get_llm_classification(self, question: str) -> Dict[str, Any]:
465
+ """Use 72B model for intelligent question classification"""
466
+
467
+ classification_prompt = f"""
468
+ Analyze this GAIA benchmark question and classify it for agent routing.
469
+
470
+ Question: {question}
471
+
472
+ Determine:
473
+ 1. Primary question type (mathematical, text_manipulation, web_research, file_processing, reasoning, factual_lookup)
474
+ 2. Required capabilities (research, calculation, file_analysis, text_processing, logical_reasoning)
475
+ 3. Complexity level (simple, moderate, complex)
476
+ 4. Expected answer type (number, text, yes_no, name, location, list)
477
+
478
+ Provide your analysis in this format:
479
+ PRIMARY_TYPE: [type]
480
+ CAPABILITIES: [cap1, cap2, cap3]
481
+ COMPLEXITY: [level]
482
+ ANSWER_TYPE: [type]
483
+ REASONING: [brief explanation]
484
+ """
485
+
486
+ # Use 72B model for classification
487
+ result = self.llm_client.generate(
488
+ classification_prompt,
489
+ tier=ModelTier.COMPLEX, # 72B model for better reasoning
490
+ max_tokens=200
491
+ )
492
+
493
+ if result.success:
494
+ return self._parse_llm_classification(result.response)
495
+ else:
496
+ logger.warning("LLM classification failed, using pattern-based only")
497
+ return {"primary_type": "unknown", "capabilities": [], "complexity": "moderate"}
498
+
499
+ def _parse_llm_classification(self, response: str) -> Dict[str, Any]:
500
+ """Parse LLM classification response"""
501
+
502
+ parsed = {
503
+ "primary_type": "unknown",
504
+ "capabilities": [],
505
+ "complexity": "moderate",
506
+ "answer_type": "text",
507
+ "reasoning": ""
508
+ }
509
+
510
+ lines = response.split('\n')
511
+ for line in lines:
512
+ line = line.strip()
513
+ if line.startswith("PRIMARY_TYPE:"):
514
+ parsed["primary_type"] = line.split(":", 1)[1].strip().lower()
515
+ elif line.startswith("CAPABILITIES:"):
516
+ caps_text = line.split(":", 1)[1].strip()
517
+ parsed["capabilities"] = [c.strip().lower() for c in caps_text.split(",")]
518
+ elif line.startswith("COMPLEXITY:"):
519
+ parsed["complexity"] = line.split(":", 1)[1].strip().lower()
520
+ elif line.startswith("ANSWER_TYPE:"):
521
+ parsed["answer_type"] = line.split(":", 1)[1].strip().lower()
522
+ elif line.startswith("REASONING:"):
523
+ parsed["reasoning"] = line.split(":", 1)[1].strip()
524
+
525
+ return parsed
526
+
527
+ def _combine_classifications(self, pattern_types: List[QuestionType], pattern_primary: QuestionType,
528
+ llm_classification: Dict[str, Any]) -> Tuple[List[QuestionType], QuestionType]:
529
+ """Combine pattern-based and LLM-based classifications"""
530
+
531
+ # Map LLM classification to our enum types
532
+ llm_type_mapping = {
533
+ "mathematical": QuestionType.MATHEMATICAL,
534
+ "text_manipulation": QuestionType.TEXT_MANIPULATION,
535
+ "web_research": QuestionType.WEB_RESEARCH,
536
+ "file_processing": QuestionType.FILE_PROCESSING,
537
+ "reasoning": QuestionType.REASONING,
538
+ "factual_lookup": QuestionType.WEB_RESEARCH,
539
+ "code_execution": QuestionType.CODE_EXECUTION
540
+ }
541
+
542
+ llm_primary = llm_type_mapping.get(llm_classification["primary_type"], QuestionType.WEB_RESEARCH)
543
+
544
+ # Combine types - prefer LLM classification for primary, merge for secondary types
545
+ combined_types = list(pattern_types)
546
+ if llm_primary not in combined_types:
547
+ combined_types.insert(0, llm_primary) # Add LLM primary to front
548
+
549
+ # Use LLM primary if it's confident, otherwise stick with pattern
550
+ if llm_classification["complexity"] in ["complex", "moderate"] and llm_primary != QuestionType.WEB_RESEARCH:
551
+ final_primary = llm_primary
552
+ else:
553
+ final_primary = pattern_primary
554
+
555
+ logger.info(f"🤖 Combined classification: Pattern={pattern_primary.value}, LLM={llm_primary.value}, Final={final_primary.value}")
556
+
557
+ return combined_types, final_primary
558
+
559
+ def _select_agents(self, question_types: List[QuestionType], primary_type: QuestionType, question: str) -> List[AgentRole]:
560
+ """Select agents based on combined classification"""
561
+
562
+ agents = []
563
+
564
+ # Primary agent based on primary type
565
+ primary_agent_map = {
566
+ QuestionType.MATHEMATICAL: AgentRole.REASONING_AGENT,
567
+ QuestionType.TEXT_MANIPULATION: AgentRole.REASONING_AGENT,
568
+ QuestionType.WEB_RESEARCH: AgentRole.WEB_RESEARCHER,
569
+ QuestionType.FILE_PROCESSING: AgentRole.FILE_PROCESSOR,
570
+ QuestionType.REASONING: AgentRole.REASONING_AGENT,
571
+ QuestionType.CODE_EXECUTION: AgentRole.CODE_EXECUTOR
572
+ }
573
+
574
+ primary_agent = primary_agent_map.get(primary_type, AgentRole.WEB_RESEARCHER)
575
+ if primary_agent not in agents:
576
+ agents.append(primary_agent)
577
+
578
+ # Add secondary agents based on all detected types
579
+ for qtype in question_types:
580
+ if qtype != primary_type: # Don't duplicate primary
581
+ secondary_agent = primary_agent_map.get(qtype)
582
+ if secondary_agent and secondary_agent not in agents:
583
+ agents.append(secondary_agent)
584
+
585
+ # Always add synthesizer at the end
586
+ if AgentRole.SYNTHESIZER not in agents:
587
+ agents.append(AgentRole.SYNTHESIZER)
588
+
589
+ return agents
src/agents/synthesizer.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  Synthesizer Agent for GAIA Agent System
4
- Combines results from multiple agents and produces final answers
5
  """
6
 
7
  import logging
@@ -10,338 +10,215 @@ from statistics import mean
10
 
11
  from agents.state import GAIAAgentState, AgentRole, AgentResult
12
  from models.qwen_client import QwenClient, ModelTier
 
13
 
14
  logger = logging.getLogger(__name__)
15
 
16
  class SynthesizerAgent:
17
  """
18
- Synthesizer agent that combines multiple agent results into a final answer
 
19
  """
20
 
21
  def __init__(self, llm_client: QwenClient):
22
  self.llm_client = llm_client
 
23
 
24
  def process(self, state: GAIAAgentState) -> GAIAAgentState:
25
  """
26
- Synthesize final answer from multiple agent results
27
  """
28
- logger.info("Synthesizer: Starting result synthesis")
29
- state.add_processing_step("Synthesizer: Analyzing agent results")
30
 
31
  try:
32
- # Check if we have any agent results to synthesize
33
  if not state.agent_results:
34
- error_msg = "No agent results available for synthesis"
35
- state.add_error(error_msg)
36
- state.final_answer = "Unable to process question - no agent results available"
37
  state.final_confidence = 0.0
38
- state.final_reasoning = error_msg
39
  state.is_complete = True
40
  return state
41
 
42
- # Determine synthesis strategy based on available results
43
- synthesis_strategy = self._determine_synthesis_strategy(state)
44
- state.add_processing_step(f"Synthesizer: Using {synthesis_strategy} strategy")
45
 
46
- # Execute synthesis based on strategy
47
- if synthesis_strategy == "single_agent":
48
- final_result = self._synthesize_single_agent(state)
49
- elif synthesis_strategy == "multi_agent_consensus":
50
- final_result = self._synthesize_multi_agent_consensus(state)
51
- elif synthesis_strategy == "confidence_weighted":
52
- final_result = self._synthesize_confidence_weighted(state)
53
- elif synthesis_strategy == "llm_synthesis":
54
- final_result = self._synthesize_with_llm(state)
55
- elif synthesis_strategy == "failure_analysis":
56
- final_result = self._synthesize_failure_analysis(state)
57
  else:
58
- final_result = self._synthesize_fallback(state)
 
 
 
 
 
 
 
59
 
60
  # Update state with final results
61
- state.final_answer = final_result["answer"]
62
- state.final_confidence = final_result["confidence"]
63
- state.final_reasoning = final_result["reasoning"]
64
- state.answer_source = final_result["source"]
65
  state.is_complete = True
66
 
67
- # Check if confidence threshold is met
68
- state.confidence_threshold_met = state.final_confidence >= 0.7
69
-
70
- # Determine if human review is needed
71
- state.requires_human_review = (
72
- state.final_confidence < 0.5 or
73
- len(state.error_messages) > 0 or
74
- state.difficulty_level >= 3
75
- )
76
 
77
- logger.info(f"✅ Synthesis complete: confidence={state.final_confidence:.2f}")
78
- state.add_processing_step(f"Synthesizer: Final answer generated (confidence: {state.final_confidence:.2f})")
79
 
80
  return state
81
 
82
  except Exception as e:
83
- error_msg = f"Synthesis failed: {str(e)}"
84
  state.add_error(error_msg)
85
  logger.error(error_msg)
86
 
87
- # Provide fallback answer
88
- state.final_answer = "Processing failed due to synthesis error"
89
  state.final_confidence = 0.0
90
  state.final_reasoning = error_msg
91
  state.answer_source = "error_fallback"
92
  state.is_complete = True
93
- state.requires_human_review = True
94
 
95
  return state
96
 
97
- def _determine_synthesis_strategy(self, state: GAIAAgentState) -> str:
98
- """Determine the best synthesis strategy based on available results"""
99
-
100
- successful_results = [r for r in state.agent_results.values() if r.success]
101
- failed_results = [r for r in state.agent_results.values() if not r.success]
102
 
103
- # If we have some results but they're mostly failures, try to extract useful info
104
- if len(successful_results) == 0 and len(failed_results) > 0:
105
- return "failure_analysis"
106
- elif len(successful_results) == 1:
107
- return "single_agent"
108
- elif len(successful_results) == 2:
109
- return "confidence_weighted"
110
- elif all(r.confidence > 0.6 for r in successful_results):
111
- return "multi_agent_consensus"
112
- else:
113
- return "llm_synthesis"
114
-
115
- def _synthesize_single_agent(self, state: GAIAAgentState) -> Dict[str, Any]:
116
- """Synthesize result from a single agent"""
117
-
118
- successful_results = [r for r in state.agent_results.values() if r.success]
119
- if not successful_results:
120
- return self._create_fallback_result("No successful agent results")
121
-
122
- best_result = max(successful_results, key=lambda r: r.confidence)
123
-
124
- return {
125
- "answer": best_result.result,
126
- "confidence": best_result.confidence,
127
- "reasoning": f"Single agent result from {best_result.agent_role.value}: {best_result.reasoning}",
128
- "source": best_result.agent_role.value
129
- }
130
-
131
- def _synthesize_multi_agent_consensus(self, state: GAIAAgentState) -> Dict[str, Any]:
132
- """Synthesize results when multiple agents agree (high confidence)"""
133
 
 
134
  successful_results = [r for r in state.agent_results.values() if r.success]
135
- high_confidence_results = [r for r in successful_results if r.confidence > 0.6]
136
-
137
- if not high_confidence_results:
138
- return self._synthesize_confidence_weighted(state)
139
-
140
- # Use the highest confidence result as primary
141
- primary_result = max(high_confidence_results, key=lambda r: r.confidence)
142
-
143
- # Calculate consensus confidence
144
- avg_confidence = mean([r.confidence for r in high_confidence_results])
145
- consensus_confidence = min(0.95, avg_confidence * 1.1) # Boost for consensus
146
-
147
- # Create reasoning summary
148
- agent_summaries = []
149
- for result in high_confidence_results:
150
- agent_summaries.append(f"{result.agent_role.value} (conf: {result.confidence:.2f})")
151
 
152
- reasoning = f"Consensus from {len(high_confidence_results)} agents: {', '.join(agent_summaries)}. Primary result: {primary_result.reasoning}"
 
 
 
 
 
 
 
 
 
153
 
154
- return {
155
- "answer": primary_result.result,
156
- "confidence": consensus_confidence,
157
- "reasoning": reasoning,
158
- "source": f"consensus_{len(high_confidence_results)}_agents"
159
- }
160
 
161
- def _synthesize_confidence_weighted(self, state: GAIAAgentState) -> Dict[str, Any]:
162
- """Synthesize results using confidence weighting"""
163
 
164
- successful_results = [r for r in state.agent_results.values() if r.success]
165
 
166
- if not successful_results:
167
- return self._create_fallback_result("No successful results for confidence weighting")
 
168
 
169
- # Weight by confidence
170
- total_weight = sum(r.confidence for r in successful_results)
171
- if total_weight == 0:
172
- return self._synthesize_single_agent(state)
173
 
174
- # Select primary result (highest confidence)
175
- primary_result = max(successful_results, key=lambda r: r.confidence)
 
176
 
177
- # Calculate weighted confidence
178
- weighted_confidence = sum(r.confidence ** 2 for r in successful_results) / total_weight
 
179
 
180
- # Create reasoning
181
- result_summaries = []
182
- for result in successful_results:
183
- weight = result.confidence / total_weight
184
- result_summaries.append(f"{result.agent_role.value} (weight: {weight:.2f})")
185
 
186
- reasoning = f"Confidence-weighted synthesis: {', '.join(result_summaries)}. Primary: {primary_result.reasoning}"
 
 
187
 
188
- return {
189
- "answer": primary_result.result,
190
- "confidence": min(0.9, weighted_confidence),
191
- "reasoning": reasoning,
192
- "source": f"weighted_{len(successful_results)}_agents"
193
- }
194
 
195
- def _synthesize_with_llm(self, state: GAIAAgentState) -> Dict[str, Any]:
196
- """Use LLM to synthesize conflicting or complex results"""
197
-
198
- successful_results = [r for r in state.agent_results.values() if r.success]
199
-
200
- # Prepare synthesis prompt
201
- agent_results_text = []
202
- for i, result in enumerate(successful_results, 1):
203
- agent_results_text.append(f"""
204
- Agent {i} ({result.agent_role.value}):
205
- - Answer: {result.result}
206
- - Confidence: {result.confidence:.2f}
207
- - Reasoning: {result.reasoning}
208
- """)
209
 
210
  synthesis_prompt = f"""
 
 
211
  Question: {state.question}
212
 
213
- Multiple agents have provided different answers/insights. Please synthesize these into a single, coherent final answer:
 
214
 
215
- {chr(10).join(agent_results_text)}
216
 
217
- Please provide:
218
- 1. A clear, direct final answer
219
- 2. Your confidence level (0.0 to 1.0)
220
- 3. Brief reasoning explaining how you synthesized the results
 
 
 
221
 
222
- Focus on accuracy and be direct in your response.
223
- """
 
 
224
 
225
- # Use complex model for synthesis
226
- model_tier = ModelTier.COMPLEX if state.should_use_complex_model() else ModelTier.MAIN
227
- llm_result = self.llm_client.generate(synthesis_prompt, tier=model_tier, max_tokens=400)
 
 
 
228
 
229
- if llm_result.success:
230
- # Parse LLM response for structured output
231
- llm_answer = llm_result.response
232
-
233
- # Extract confidence if mentioned in response
234
- confidence_match = re.search(r'confidence[:\s]*([0-9.]+)', llm_answer.lower())
235
- llm_confidence = float(confidence_match.group(1)) if confidence_match else 0.7
236
-
237
- # Adjust confidence based on input quality
238
- avg_input_confidence = mean([r.confidence for r in successful_results])
239
- final_confidence = min(0.85, (llm_confidence + avg_input_confidence) / 2)
240
-
241
  return {
242
- "answer": llm_answer,
243
- "confidence": final_confidence,
244
- "reasoning": f"LLM synthesis of {len(successful_results)} agent results using {llm_result.model_used}",
245
- "source": "llm_synthesis"
246
  }
247
  else:
248
- # Fallback to confidence weighted if LLM fails
249
- return self._synthesize_confidence_weighted(state)
250
 
251
- def _synthesize_fallback(self, state: GAIAAgentState) -> Dict[str, Any]:
252
- """Enhanced fallback synthesis when other strategies fail"""
253
 
254
- # Try to get any result, even if not successful
255
- all_results = list(state.agent_results.values())
256
 
257
- if all_results:
258
- # First try successful results
259
- successful_results = [r for r in all_results if r.success]
260
- if successful_results:
261
- best_attempt = max(successful_results, key=lambda r: r.confidence)
 
 
 
 
 
 
262
  return {
263
- "answer": best_attempt.result,
264
- "confidence": max(0.3, best_attempt.confidence * 0.8), # Reduce confidence for fallback
265
- "reasoning": f"Fallback result from {best_attempt.agent_role.value}: {best_attempt.reasoning}",
266
- "source": f"fallback_{best_attempt.agent_role.value}"
267
  }
268
-
269
- # If no successful results, try to extract useful info from failures
270
- return self._synthesize_failure_analysis(state)
271
- else:
272
- return self._create_fallback_result("No agent results available")
273
-
274
- def _synthesize_failure_analysis(self, state: GAIAAgentState) -> Dict[str, Any]:
275
- """Analyze failed results to provide some useful response"""
276
-
277
- failed_results = [r for r in state.agent_results.values() if not r.success]
278
-
279
- if not failed_results:
280
- return self._create_fallback_result("No results to analyze")
281
-
282
- # Look for patterns in failures
283
- error_patterns = []
284
- attempted_agents = []
285
-
286
- for result in failed_results:
287
- attempted_agents.append(result.agent_role.value)
288
-
289
- # Extract meaningful error information
290
- result_text = result.result.lower()
291
- if "research sources failed" in result_text:
292
- error_patterns.append("external_research_unavailable")
293
- elif "reasoning failed" in result_text:
294
- error_patterns.append("complex_reasoning_required")
295
- elif "conversion" in result_text:
296
- error_patterns.append("conversion_difficulty")
297
- elif "mathematical" in result_text:
298
- error_patterns.append("mathematical_complexity")
299
-
300
- # Try to provide a helpful response based on the question type and failures
301
- try:
302
- analysis_prompt = f"""
303
- Question: {state.question}
304
-
305
- Multiple specialized agents attempted to answer this question but encountered difficulties:
306
- - Agents tried: {', '.join(attempted_agents)}
307
- - Common issues: {', '.join(set(error_patterns)) if error_patterns else 'processing difficulties'}
308
-
309
- Based on the question itself, please provide the best answer you can using basic reasoning and knowledge.
310
- Even if external resources failed, try to answer based on general knowledge.
311
-
312
- Be honest about limitations but try to be helpful.
313
- """
314
-
315
- # Use main model for analysis
316
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=300)
317
-
318
- if llm_result.success:
319
  return {
320
- "answer": llm_result.response,
321
- "confidence": 0.25, # Low confidence but still attempting
322
- "reasoning": f"Generated from failure analysis. Agents tried: {', '.join(attempted_agents)}",
323
- "source": "failure_analysis"
324
  }
325
-
326
- except Exception as analysis_error:
327
- logger.warning(f"Failure analysis also failed: {analysis_error}")
328
-
329
- # Final fallback - provide structured error message
330
- return {
331
- "answer": f"Processing encountered difficulties: All research sources failed",
332
- "confidence": 0.1,
333
- "reasoning": f"Multiple agents failed: {', '.join(attempted_agents)}. {', '.join(set(error_patterns)) if error_patterns else 'Various processing issues encountered'}",
334
- "source": "structured_failure"
335
- }
336
-
337
- def _create_fallback_result(self, reason: str) -> Dict[str, Any]:
338
- """Create a fallback result when synthesis is impossible"""
339
- return {
340
- "answer": f"Unable to process question: {reason}",
341
- "confidence": 0.0,
342
- "reasoning": f"Synthesis failed: {reason}",
343
- "source": "synthesis_failure"
344
- }
345
 
346
  # Import regex for LLM response parsing
347
  import re
 
1
  #!/usr/bin/env python3
2
  """
3
  Synthesizer Agent for GAIA Agent System
4
+ GAIA-Compliant Final Answer Generation for Exact Match Evaluation
5
  """
6
 
7
  import logging
 
10
 
11
  from agents.state import GAIAAgentState, AgentRole, AgentResult
12
  from models.qwen_client import QwenClient, ModelTier
13
+ from tools.final_answer_tool import FinalAnswerTool
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
class SynthesizerAgent:
    """
    GAIA-compliant synthesizer that produces EXACT MATCH answers.

    Combines the results of all specialist agents, optionally re-synthesizes
    them with the complex (72B) model tier, and runs the final-answer tool to
    extract a short, exact-match-friendly answer string.
    """

    def __init__(self, llm_client: QwenClient):
        # Shared LLM client; also powers the final-answer extraction tool.
        self.llm_client = llm_client
        self.final_answer_tool = FinalAnswerTool(llm_client)

    def process(self, state: GAIAAgentState) -> GAIAAgentState:
        """Synthesize a GAIA-compliant final answer from the agent results.

        Mutates and returns `state`: sets final_answer, final_confidence,
        final_reasoning, answer_source, and marks the state complete.
        """
        logger.info("🎯 Synthesizer: Starting GAIA-compliant synthesis")
        state.add_processing_step("Synthesizer: Generating GAIA-compliant final answer")

        try:
            # Nothing to synthesize - finish with an empty-result marker.
            if not state.agent_results:
                logger.warning("No agent results available for synthesis")
                state.final_answer = "No results available"
                state.final_confidence = 0.0
                state.final_reasoning = "No agent results to synthesize"
                state.is_complete = True
                return state

            # Merge every agent result into one text dossier and classify the
            # question so extraction can apply type-specific rules.
            dossier = self._combine_agent_results(state)
            question_type = self._determine_question_type(state.question)

            # Multiple results (or a question flagged complex) justify the 72B
            # model; otherwise a cheap single-result pass is enough.
            if len(state.agent_results) > 1 or state.should_use_complex_model():
                synthesis = self._synthesize_with_72b(state, dossier, question_type)
            else:
                synthesis = self._synthesize_simple(state, dossier, question_type)

            # Reduce the synthesized analysis to an exact-match answer string.
            extraction = self.final_answer_tool.extract_final_answer(
                question=state.question,
                agent_results=synthesis["analysis"],
                question_type=question_type
            )

            state.final_answer = extraction["answer"]
            state.final_confidence = extraction["confidence"]
            state.final_reasoning = f"Synthesis: {synthesis['reasoning']} | Extraction: {extraction['reasoning']}"
            state.answer_source = "gaia_compliant_synthesis"
            state.is_complete = True

            # GAIA answers are short; penalize confidence on long answers.
            if len(state.final_answer) > 100:
                logger.warning(f"Answer may be too long for GAIA: {len(state.final_answer)} chars")
                state.final_confidence *= 0.7

            logger.info(f"✅ GAIA synthesis complete: '{state.final_answer}' (conf: {state.final_confidence:.2f})")
            state.add_processing_step(f"Synthesizer: GAIA answer generated - '{state.final_answer}'")

            return state

        except Exception as e:
            error_msg = f"GAIA synthesis failed: {str(e)}"
            state.add_error(error_msg)
            logger.error(error_msg)

            # Fallback: mark the state complete with a zero-confidence error.
            state.final_answer = "Processing error"
            state.final_confidence = 0.0
            state.final_reasoning = error_msg
            state.answer_source = "error_fallback"
            state.is_complete = True
            return state

    def _combine_agent_results(self, state: GAIAAgentState) -> str:
        """Render all agent results (successes first) as one analysis text."""
        sections = []

        # Successful results carry the real signal - list them first.
        winners = [r for r in state.agent_results.values() if r.success]
        if winners:
            sections.append("=== SUCCESSFUL AGENT RESULTS ===")
            for res in winners:
                sections.append(f"""
{res.agent_role.value.upper()} (Confidence: {res.confidence:.2f}):
Result: {res.result}
Reasoning: {res.reasoning}
""")

        # Failed results may still hold partial context worth surfacing.
        losers = [r for r in state.agent_results.values() if not r.success]
        if losers:
            sections.append("\n=== ADDITIONAL CONTEXT ===")
            for res in losers:
                if len(res.result) > 10:  # skip empty/near-empty failures
                    sections.append(f"""
{res.agent_role.value.upper()} (Failed):
Attempted: {res.result[:200]}...
""")

        return "\n".join(sections)

    def _determine_question_type(self, question: str) -> str:
        """Classify the question so answer extraction can specialize.

        The first matching category wins; defaults to "general".
        """
        q = question.lower()

        # Ordered (keywords, label) table - mirrors the original if/elif chain.
        categories = (
            (("how many", "count", "number of", "calculate", "sum", "total"), "mathematical"),
            (("opposite", "reverse", "backwards", "decode"), "text_manipulation"),
            (("yes or no", "true or false", "is it", "does it", "can it"), "yes_no"),
            (("who", "name", "first name", "last name", "surname"), "name"),
            (("where", "city", "country", "location", "place"), "location"),
            (("file", "image", "code", "python", "attached", "excel"), "file_processing"),
        )
        for keywords, label in categories:
            if any(word in q for word in keywords):
                return label

        return "general"

    def _synthesize_with_72b(self, state: GAIAAgentState, combined_analysis: str, question_type: str) -> Dict[str, Any]:
        """Ask the complex (72B) model tier to reconcile all agent results."""
        synthesis_prompt = f"""
CRITICAL: This is GAIA benchmark evaluation requiring EXACT MATCH answers.

Question: {state.question}

Agent Analysis Results:
{combined_analysis}

Your task: Analyze all agent results and provide the most accurate answer.

GAIA COMPLIANCE RULES:
- Your answer must be concise and precise for exact match comparison
- No explanations, no "FINAL ANSWER:" prefix, no extra text
- For numbers: just the number (e.g., "5")
- For yes/no: just "yes" or "no"
- For names: just the name requested
- For locations: just the location name

Question Type: {question_type}

Based on all the agent results above, what is the precise answer to the original question?
Think carefully but respond with ONLY the answer:"""

        # Complex tier with a tight token budget: we want an answer, not an essay.
        result = self.llm_client.generate(
            synthesis_prompt,
            tier=ModelTier.COMPLEX,  # 72B model
            max_tokens=100
        )

        if result.success:
            return {
                "analysis": result.response,
                "reasoning": f"72B synthesis of {len(state.agent_results)} agent results"
            }

        # Model call failed - degrade gracefully to the simple path.
        return self._synthesize_simple(state, combined_analysis, question_type)

    def _synthesize_simple(self, state: GAIAAgentState, combined_analysis: str, question_type: str) -> Dict[str, Any]:
        """Pick the best single result without an extra model call."""
        winners = [r for r in state.agent_results.values() if r.success]

        if winners:
            # Highest-confidence successful result wins.
            best = max(winners, key=lambda r: r.confidence)
            return {
                "analysis": f"Primary result from {best.agent_role.value}: {best.result}",
                "reasoning": f"Single agent result from {best.agent_role.value}"
            }

        # No successes: salvage whatever the first agent produced.
        everything = list(state.agent_results.values())
        if everything:
            fallback = everything[0]
            return {
                "analysis": f"Fallback from {fallback.agent_role.value}: {fallback.result}",
                "reasoning": f"Fallback synthesis from {fallback.agent_role.value}"
            }

        return {
            "analysis": "No agent results available",
            "reasoning": "No synthesis possible - no results"
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  # Import regex for LLM response parsing
224
  import re
src/tools/__pycache__/final_answer_tool.cpython-310.pyc ADDED
Binary file (6.55 kB). View file
 
src/tools/final_answer_tool.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Final Answer Tool for GAIA Agent System
4
+ Extracts precise, EXACT MATCH compliant answers from agent results
5
+ """
6
+
7
+ import re
8
+ import logging
9
+ from typing import Dict, Any, Optional
10
+
11
+ from models.qwen_client import QwenClient, ModelTier
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers.

    Ensures EXACT MATCH compatibility for Unit 4 API submission: a
    type-specialized prompt drives the complex model tier, and the raw
    output is cleaned and validated before being returned.
    """

    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client

    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "") -> Dict[str, Any]:
        """
        Extract the precise final answer in GAIA-compliant format.

        Args:
            question: The original GAIA question.
            agent_results: Combined results from multiple agents.
            question_type: Type of question (for specialized extraction).

        Returns:
            Dict with extracted "answer", "confidence", and "reasoning".
        """
        try:
            logger.info("🎯 Extracting GAIA-compliant final answer")

            # Build a prompt whose rules depend on the question type.
            prompt = self._create_extraction_prompt(question, agent_results, question_type)

            # Complex (72B) tier with a tiny token budget forces terse output.
            result = self.llm_client.generate(
                prompt,
                tier=ModelTier.COMPLEX,  # 72B model
                max_tokens=50  # Force concise answers
            )

            if not result.success:
                logger.error("Final answer extraction failed")
                return {
                    "answer": "Processing failed",
                    "confidence": 0.0,
                    "reasoning": f"Extraction failed: {result.response}"
                }

            # Normalize the raw model output, then score its format.
            cleaned = self._clean_answer(result.response, question_type)
            verdict = self._validate_answer(cleaned, question_type)

            logger.info(f"✅ Final answer extracted: '{cleaned}'")

            return {
                "answer": cleaned,
                "confidence": verdict["confidence"],
                "reasoning": f"Extracted using 72B model. Validation: {verdict['status']}"
            }

        except Exception as e:
            error_msg = f"Final answer extraction error: {str(e)}"
            logger.error(error_msg)
            return {
                "answer": "Extraction error",
                "confidence": 0.0,
                "reasoning": error_msg
            }

    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create a specialized extraction prompt based on the question type."""
        q = question.lower()
        qt = question_type.lower()

        base_prompt = f"""
CRITICAL: This is for GAIA benchmark evaluation using EXACT MATCH comparison.
Your response must be ONLY the precise answer - no explanations, no "FINAL ANSWER:", no extra text.

Question: {question}

Agent Analysis Results:
{agent_results}

EXTRACTION RULES:
"""

        # Append type-specific rules; the first matching branch wins.
        if "mathematical" in qt or any(word in q for word in ["how many", "count", "number", "calculate"]):
            base_prompt += """
- If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
- If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
- No units unless specifically requested in the question
"""
        elif "text_manipulation" in qt or "reverse" in q:
            base_prompt += """
- If text is reversed: provide the corrected text
- If asking for opposite: provide ONLY the opposite word (e.g., "right" for opposite of "left")
- If asking to decode: provide ONLY the decoded answer
"""
        elif "yes" in q or "true" in q or "false" in q:
            base_prompt += """
- If yes/no question: respond with ONLY "yes" or "no" (lowercase)
- If true/false question: respond with ONLY "true" or "false" (lowercase)
"""
        elif any(word in q for word in ["name", "who", "which person"]):
            base_prompt += """
- If asking for a name: provide ONLY the name (e.g., "John Smith", "Einstein")
- If asking for first name only: provide ONLY first name (e.g., "John")
- If asking for last name only: provide ONLY last name (e.g., "Smith")
"""
        elif any(word in q for word in ["where", "location", "city", "country"]):
            base_prompt += """
- If asking for location: provide ONLY the location name (e.g., "Paris", "USA", "New York")
- No additional descriptors unless specifically requested
"""
        else:
            base_prompt += """
- Provide ONLY the direct answer to the question
- No explanations, context, or additional information
- Be as concise as possible while being accurate
"""

        base_prompt += """

EXAMPLES OF CORRECT FORMAT:
- Question: "How many albums?" → Answer: "5"
- Question: "What is the opposite of left?" → Answer: "right"
- Question: "True or false?" → Answer: "true"
- Question: "Who discovered X?" → Answer: "Einstein"
- Question: "Which city?" → Answer: "London"

Extract the precise answer NOW:"""

        return base_prompt

    def _clean_answer(self, raw_answer: str, question_type: str) -> str:
        """Normalize raw model output into a bare answer string."""
        answer = raw_answer.strip()

        # Strip boilerplate prefixes the model tends to emit.
        prefixes_to_remove = [
            "the answer is",
            "answer:",
            "final answer:",
            "result:",
            "response:",
            "conclusion:",
        ]
        for prefix in prefixes_to_remove:
            if answer.lower().startswith(prefix):
                answer = answer[len(prefix):].strip()

        # Unwrap quotes that enclose the entire answer.
        if answer.startswith('"') and answer.endswith('"'):
            answer = answer[1:-1]
        if answer.startswith("'") and answer.endswith("'"):
            answer = answer[1:-1]

        # Type-specific normalization.
        if "mathematical" in question_type.lower():
            # Keep just the first number for mathematical questions.
            number_match = re.search(r'-?\d+(?:\.\d+)?', answer)
            if number_match:
                answer = number_match.group()
        elif "text_manipulation" in question_type.lower():
            # Single-word manipulations are compared lowercase.
            if len(answer.split()) == 1:
                answer = answer.lower()

        # Drop trailing punctuation that's not part of the answer.
        answer = answer.rstrip('.,!?;:')

        return answer.strip()

    def _validate_answer(self, answer: str, question_type: str) -> Dict[str, Any]:
        """Score the extracted answer's format; returns status + confidence."""
        if not answer:
            return {"status": "empty_answer", "confidence": 0.0}

        # GAIA answers should be concise - hard cap before type checks.
        if len(answer) > 100:
            return {"status": "too_long", "confidence": 0.3}

        qt = question_type.lower()

        # Type-specific validation.
        if "mathematical" in qt:
            if re.match(r'^-?\d+(?:\.\d+)?$', answer):
                return {"status": "valid_number", "confidence": 0.9}
            return {"status": "invalid_number_format", "confidence": 0.5}

        if "yes_no" in qt:
            if answer.lower() in ["yes", "no", "true", "false"]:
                return {"status": "valid_boolean", "confidence": 0.9}
            return {"status": "invalid_boolean_format", "confidence": 0.4}

        # General validation - prefer short, direct answers.
        if len(answer) <= 20:
            return {"status": "concise_answer", "confidence": 0.8}
        if len(answer) <= 50:
            return {"status": "moderate_length", "confidence": 0.6}
        return {"status": "long_answer", "confidence": 0.4}