Chris commited on
Commit
5ec1e1b
·
1 Parent(s): 0b92da3

Final 7.0.3

Browse files
src/agents/__pycache__/router.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/router.cpython-310.pyc and b/src/agents/__pycache__/router.cpython-310.pyc differ
 
src/agents/__pycache__/synthesizer.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/synthesizer.cpython-310.pyc and b/src/agents/__pycache__/synthesizer.cpython-310.pyc differ
 
src/agents/reasoning_agent.py CHANGED
@@ -52,6 +52,8 @@ class ReasoningAgent:
52
  result = self._process_pattern_analysis(state)
53
  elif strategy == "step_by_step":
54
  result = self._process_step_by_step(state)
 
 
55
  else:
56
  result = self._process_general_reasoning(state)
57
 
@@ -286,17 +288,17 @@ class ReasoningAgent:
286
  Be systematic and show your reasoning.
287
  """
288
 
289
- model_tier = ModelTier.MAIN
290
  llm_result = self.llm_client.generate(pattern_prompt, tier=model_tier, max_tokens=500)
291
 
292
  if llm_result.success:
293
- confidence = 0.75 if numbers else 0.65 # Higher confidence with numerical data
294
  return AgentResult(
295
  agent_role=AgentRole.REASONING_AGENT,
296
  success=True,
297
  result=llm_result.response,
298
  confidence=confidence,
299
- reasoning="Analyzed patterns and sequences",
300
  model_used=llm_result.model_used,
301
  processing_time=llm_result.response_time,
302
  cost_estimate=llm_result.cost_estimate
@@ -323,7 +325,7 @@ class ReasoningAgent:
323
  Be thorough and explain each step.
324
  """
325
 
326
- model_tier = ModelTier.MAIN
327
  llm_result = self.llm_client.generate(step_prompt, tier=model_tier, max_tokens=600)
328
 
329
  if llm_result.success:
@@ -331,8 +333,8 @@ class ReasoningAgent:
331
  agent_role=AgentRole.REASONING_AGENT,
332
  success=True,
333
  result=llm_result.response,
334
- confidence=0.75,
335
- reasoning="Provided step-by-step solution",
336
  model_used=llm_result.model_used,
337
  processing_time=llm_result.response_time,
338
  cost_estimate=llm_result.cost_estimate
@@ -354,7 +356,7 @@ class ReasoningAgent:
354
  Consider all aspects of the question and explain your reasoning.
355
  """
356
 
357
- model_tier = ModelTier.MAIN
358
  llm_result = self.llm_client.generate(reasoning_prompt, tier=model_tier, max_tokens=500)
359
 
360
  if llm_result.success:
@@ -362,8 +364,8 @@ class ReasoningAgent:
362
  agent_role=AgentRole.REASONING_AGENT,
363
  success=True,
364
  result=llm_result.response,
365
- confidence=0.70,
366
- reasoning="Applied general reasoning and analysis",
367
  model_used=llm_result.model_used,
368
  processing_time=llm_result.response_time,
369
  cost_estimate=llm_result.cost_estimate
@@ -450,7 +452,7 @@ class ReasoningAgent:
450
  Please provide a direct answer incorporating these calculations.
451
  """
452
 
453
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=400)
454
 
455
  if llm_result.success:
456
  return AgentResult(
@@ -495,7 +497,7 @@ class ReasoningAgent:
495
  Please provide a direct answer based on this statistical analysis.
496
  """
497
 
498
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=400)
499
 
500
  if llm_result.success:
501
  return AgentResult(
@@ -534,7 +536,7 @@ class ReasoningAgent:
534
  Please provide a direct answer incorporating this conversion.
535
  """
536
 
537
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.ROUTER, max_tokens=300)
538
 
539
  if llm_result.success:
540
  return AgentResult(
@@ -568,7 +570,7 @@ class ReasoningAgent:
568
  Provide a clear numerical answer.
569
  """
570
 
571
- model_tier = ModelTier.MAIN
572
  llm_result = self.llm_client.generate(math_prompt, tier=model_tier, max_tokens=500)
573
 
574
  if llm_result.success:
@@ -598,7 +600,7 @@ class ReasoningAgent:
598
  Apply statistical reasoning and provide a clear answer.
599
  """
600
 
601
- model_tier = ModelTier.MAIN
602
  llm_result = self.llm_client.generate(stats_prompt, tier=model_tier, max_tokens=400)
603
 
604
  if llm_result.success:
@@ -628,7 +630,7 @@ class ReasoningAgent:
628
  Apply conversion reasoning and provide a clear answer.
629
  """
630
 
631
- model_tier = ModelTier.ROUTER
632
  llm_result = self.llm_client.generate(conversion_prompt, tier=model_tier, max_tokens=300)
633
 
634
  if llm_result.success:
@@ -677,7 +679,7 @@ class ReasoningAgent:
677
  """
678
 
679
  # Use main model for fallback
680
- llm_result = self.llm_client.generate(fallback_prompt, tier=ModelTier.MAIN, max_tokens=400)
681
 
682
  if llm_result.success:
683
  return AgentResult(
 
52
  result = self._process_pattern_analysis(state)
53
  elif strategy == "step_by_step":
54
  result = self._process_step_by_step(state)
55
+ elif strategy == "general_reasoning":
56
+ result = self._process_general_reasoning(state)
57
  else:
58
  result = self._process_general_reasoning(state)
59
 
 
288
  Be systematic and show your reasoning.
289
  """
290
 
291
+ model_tier = ModelTier.COMPLEX # Use 72B model for pattern analysis
292
  llm_result = self.llm_client.generate(pattern_prompt, tier=model_tier, max_tokens=500)
293
 
294
  if llm_result.success:
295
+ confidence = 0.85 if numbers else 0.75 # Higher confidence with numerical data
296
  return AgentResult(
297
  agent_role=AgentRole.REASONING_AGENT,
298
  success=True,
299
  result=llm_result.response,
300
  confidence=confidence,
301
+ reasoning="Analyzed patterns and sequences with 72B model",
302
  model_used=llm_result.model_used,
303
  processing_time=llm_result.response_time,
304
  cost_estimate=llm_result.cost_estimate
 
325
  Be thorough and explain each step.
326
  """
327
 
328
+ model_tier = ModelTier.COMPLEX # Use 72B model for step-by-step reasoning
329
  llm_result = self.llm_client.generate(step_prompt, tier=model_tier, max_tokens=600)
330
 
331
  if llm_result.success:
 
333
  agent_role=AgentRole.REASONING_AGENT,
334
  success=True,
335
  result=llm_result.response,
336
+ confidence=0.85, # Higher confidence with 72B model
337
+ reasoning="Provided step-by-step solution with 72B model",
338
  model_used=llm_result.model_used,
339
  processing_time=llm_result.response_time,
340
  cost_estimate=llm_result.cost_estimate
 
356
  Consider all aspects of the question and explain your reasoning.
357
  """
358
 
359
+ model_tier = ModelTier.COMPLEX # Use 72B model for general reasoning
360
  llm_result = self.llm_client.generate(reasoning_prompt, tier=model_tier, max_tokens=500)
361
 
362
  if llm_result.success:
 
364
  agent_role=AgentRole.REASONING_AGENT,
365
  success=True,
366
  result=llm_result.response,
367
+ confidence=0.80, # Higher confidence with 72B model
368
+ reasoning="Applied general reasoning and analysis with 72B model",
369
  model_used=llm_result.model_used,
370
  processing_time=llm_result.response_time,
371
  cost_estimate=llm_result.cost_estimate
 
452
  Please provide a direct answer incorporating these calculations.
453
  """
454
 
455
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
456
 
457
  if llm_result.success:
458
  return AgentResult(
 
497
  Please provide a direct answer based on this statistical analysis.
498
  """
499
 
500
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
501
 
502
  if llm_result.success:
503
  return AgentResult(
 
536
  Please provide a direct answer incorporating this conversion.
537
  """
538
 
539
+ llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
540
 
541
  if llm_result.success:
542
  return AgentResult(
 
570
  Provide a clear numerical answer.
571
  """
572
 
573
+ model_tier = ModelTier.COMPLEX # Use 72B model for mathematical reasoning
574
  llm_result = self.llm_client.generate(math_prompt, tier=model_tier, max_tokens=500)
575
 
576
  if llm_result.success:
 
600
  Apply statistical reasoning and provide a clear answer.
601
  """
602
 
603
+ model_tier = ModelTier.COMPLEX
604
  llm_result = self.llm_client.generate(stats_prompt, tier=model_tier, max_tokens=400)
605
 
606
  if llm_result.success:
 
630
  Apply conversion reasoning and provide a clear answer.
631
  """
632
 
633
+ model_tier = ModelTier.COMPLEX
634
  llm_result = self.llm_client.generate(conversion_prompt, tier=model_tier, max_tokens=300)
635
 
636
  if llm_result.success:
 
679
  """
680
 
681
  # Use main model for fallback
682
+ llm_result = self.llm_client.generate(fallback_prompt, tier=ModelTier.COMPLEX, max_tokens=400)
683
 
684
  if llm_result.success:
685
  return AgentResult(
src/agents/router.py CHANGED
@@ -24,47 +24,48 @@ class RouterAgent:
24
 
25
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
- Main routing function - analyzes question and updates state with routing decisions
28
  """
29
- logger.info(f"Routing question: {state.question[:100]}...")
30
- state.add_processing_step("Router: Starting question analysis")
31
-
32
- # Step 1: Enhanced question classification with multi-type detection
33
- question_types, primary_type = self._classify_question_types(state.question, state.file_name)
34
- state.question_type = primary_type
35
- state.add_processing_step(f"Router: Primary type: {primary_type.value}, All types: {[t.value for t in question_types]}")
36
-
37
- # Step 2: Complexity assessment
38
- complexity = self._assess_complexity(state.question)
39
- state.complexity_assessment = complexity
40
- state.add_processing_step(f"Router: Assessed complexity as {complexity}")
41
-
42
- # Step 3: Select appropriate agents with sequencing
43
- selected_agents = self._select_agents_enhanced(question_types, primary_type, state.file_name is not None, complexity)
44
- state.selected_agents = selected_agents
45
- state.add_processing_step(f"Router: Selected agents: {[a.value for a in selected_agents]}")
46
-
47
- # Step 4: Estimate cost
48
- estimated_cost = self._estimate_cost(complexity, selected_agents)
49
- state.estimated_cost = estimated_cost
50
- state.add_processing_step(f"Router: Estimated cost: ${estimated_cost:.4f}")
51
-
52
- # Step 5: Create routing decision summary
53
- state.routing_decision = {
54
- "primary_type": primary_type.value,
55
- "all_types": [t.value for t in question_types],
56
- "complexity": complexity,
57
- "agents": [agent.value for agent in selected_agents],
58
- "estimated_cost": estimated_cost,
59
- "reasoning": self._get_routing_reasoning(primary_type, complexity, selected_agents, question_types)
60
- }
61
-
62
- # Step 6: Use LLM for complex routing decisions if needed
63
- if complexity == "complex" or primary_type == QuestionType.UNKNOWN or len(question_types) > 2:
64
- state = self._llm_enhanced_routing(state)
65
 
66
- logger.info(f"✅ Routing complete: {primary_type.value} -> {[a.value for a in selected_agents]}")
67
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def _classify_question_types(self, question: str, file_name: str = None) -> Tuple[List[QuestionType], QuestionType]:
70
  """
@@ -458,4 +459,131 @@ class RouterAgent:
458
  state.add_error(f"LLM routing error: {str(e)}")
459
  logger.error(f"LLM routing failed: {e}")
460
 
461
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
+ Main routing function - analyzes question and determines processing strategy
28
  """
29
+ logger.info(f"🧭 Router: Analyzing question type and complexity")
30
+ state.add_processing_step("Router: Analyzing question and selecting agents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ try:
33
+ # Analyze question patterns for classification
34
+ question_types, primary_type = self._classify_question_types(state.question, state.file_name)
35
+ state.question_types = question_types
36
+ state.primary_question_type = primary_type
37
+
38
+ # Use 72B model for complex routing decisions
39
+ llm_classification = self._get_llm_classification(state.question)
40
+
41
+ # Combine pattern-based and LLM-based classification
42
+ final_types, final_primary = self._combine_classifications(
43
+ question_types, primary_type, llm_classification
44
+ )
45
+
46
+ # Update state with final classification
47
+ state.question_types = final_types
48
+ state.primary_question_type = final_primary
49
+
50
+ # Select agents based on question types
51
+ selected_agents = self._select_agents(final_types, final_primary, state.question)
52
+ state.selected_agents = selected_agents
53
+
54
+ logger.info(f"✅ Routing complete: {final_primary.value} -> {[a.value for a in selected_agents]}")
55
+ state.add_processing_step(f"Router: Selected agents - {[a.value for a in selected_agents]}")
56
+
57
+ return state
58
+
59
+ except Exception as e:
60
+ error_msg = f"Router failed: {str(e)}"
61
+ logger.error(error_msg)
62
+ state.add_error(error_msg)
63
+
64
+ # Fallback to web researcher for unknown questions
65
+ state.selected_agents = [AgentRole.WEB_RESEARCHER]
66
+ state.primary_question_type = QuestionType.WEB_RESEARCH
67
+
68
+ return state
69
 
70
  def _classify_question_types(self, question: str, file_name: str = None) -> Tuple[List[QuestionType], QuestionType]:
71
  """
 
459
  state.add_error(f"LLM routing error: {str(e)}")
460
  logger.error(f"LLM routing failed: {e}")
461
 
462
+ return state
463
+
464
+ def _get_llm_classification(self, question: str) -> Dict[str, Any]:
465
+ """Use 72B model for intelligent question classification"""
466
+
467
+ classification_prompt = f"""
468
+ Analyze this GAIA benchmark question and classify it for agent routing.
469
+
470
+ Question: {question}
471
+
472
+ Determine:
473
+ 1. Primary question type (mathematical, text_manipulation, web_research, file_processing, reasoning, factual_lookup)
474
+ 2. Required capabilities (research, calculation, file_analysis, text_processing, logical_reasoning)
475
+ 3. Complexity level (simple, moderate, complex)
476
+ 4. Expected answer type (number, text, yes_no, name, location, list)
477
+
478
+ Provide your analysis in this format:
479
+ PRIMARY_TYPE: [type]
480
+ CAPABILITIES: [cap1, cap2, cap3]
481
+ COMPLEXITY: [level]
482
+ ANSWER_TYPE: [type]
483
+ REASONING: [brief explanation]
484
+ """
485
+
486
+ # Use 72B model for classification
487
+ result = self.llm_client.generate(
488
+ classification_prompt,
489
+ tier=ModelTier.COMPLEX, # 72B model for better reasoning
490
+ max_tokens=200
491
+ )
492
+
493
+ if result.success:
494
+ return self._parse_llm_classification(result.response)
495
+ else:
496
+ logger.warning("LLM classification failed, using pattern-based only")
497
+ return {"primary_type": "unknown", "capabilities": [], "complexity": "moderate"}
498
+
499
+ def _parse_llm_classification(self, response: str) -> Dict[str, Any]:
500
+ """Parse LLM classification response"""
501
+
502
+ parsed = {
503
+ "primary_type": "unknown",
504
+ "capabilities": [],
505
+ "complexity": "moderate",
506
+ "answer_type": "text",
507
+ "reasoning": ""
508
+ }
509
+
510
+ lines = response.split('\n')
511
+ for line in lines:
512
+ line = line.strip()
513
+ if line.startswith("PRIMARY_TYPE:"):
514
+ parsed["primary_type"] = line.split(":", 1)[1].strip().lower()
515
+ elif line.startswith("CAPABILITIES:"):
516
+ caps_text = line.split(":", 1)[1].strip()
517
+ parsed["capabilities"] = [c.strip().lower() for c in caps_text.split(",")]
518
+ elif line.startswith("COMPLEXITY:"):
519
+ parsed["complexity"] = line.split(":", 1)[1].strip().lower()
520
+ elif line.startswith("ANSWER_TYPE:"):
521
+ parsed["answer_type"] = line.split(":", 1)[1].strip().lower()
522
+ elif line.startswith("REASONING:"):
523
+ parsed["reasoning"] = line.split(":", 1)[1].strip()
524
+
525
+ return parsed
526
+
527
+ def _combine_classifications(self, pattern_types: List[QuestionType], pattern_primary: QuestionType,
528
+ llm_classification: Dict[str, Any]) -> Tuple[List[QuestionType], QuestionType]:
529
+ """Combine pattern-based and LLM-based classifications"""
530
+
531
+ # Map LLM classification to our enum types
532
+ llm_type_mapping = {
533
+ "mathematical": QuestionType.MATHEMATICAL,
534
+ "text_manipulation": QuestionType.TEXT_MANIPULATION,
535
+ "web_research": QuestionType.WEB_RESEARCH,
536
+ "file_processing": QuestionType.FILE_PROCESSING,
537
+ "reasoning": QuestionType.REASONING,
538
+ "factual_lookup": QuestionType.WEB_RESEARCH,
539
+ "code_execution": QuestionType.CODE_EXECUTION
540
+ }
541
+
542
+ llm_primary = llm_type_mapping.get(llm_classification["primary_type"], QuestionType.WEB_RESEARCH)
543
+
544
+ # Combine types - prefer LLM classification for primary, merge for secondary types
545
+ combined_types = list(pattern_types)
546
+ if llm_primary not in combined_types:
547
+ combined_types.insert(0, llm_primary) # Add LLM primary to front
548
+
549
+ # Use LLM primary if it's confident, otherwise stick with pattern
550
+ if llm_classification["complexity"] in ["complex", "moderate"] and llm_primary != QuestionType.WEB_RESEARCH:
551
+ final_primary = llm_primary
552
+ else:
553
+ final_primary = pattern_primary
554
+
555
+ logger.info(f"🤖 Combined classification: Pattern={pattern_primary.value}, LLM={llm_primary.value}, Final={final_primary.value}")
556
+
557
+ return combined_types, final_primary
558
+
559
+ def _select_agents(self, question_types: List[QuestionType], primary_type: QuestionType, question: str) -> List[AgentRole]:
560
+ """Select agents based on combined classification"""
561
+
562
+ agents = []
563
+
564
+ # Primary agent based on primary type
565
+ primary_agent_map = {
566
+ QuestionType.MATHEMATICAL: AgentRole.REASONING_AGENT,
567
+ QuestionType.TEXT_MANIPULATION: AgentRole.REASONING_AGENT,
568
+ QuestionType.WEB_RESEARCH: AgentRole.WEB_RESEARCHER,
569
+ QuestionType.FILE_PROCESSING: AgentRole.FILE_PROCESSOR,
570
+ QuestionType.REASONING: AgentRole.REASONING_AGENT,
571
+ QuestionType.CODE_EXECUTION: AgentRole.CODE_EXECUTOR
572
+ }
573
+
574
+ primary_agent = primary_agent_map.get(primary_type, AgentRole.WEB_RESEARCHER)
575
+ if primary_agent not in agents:
576
+ agents.append(primary_agent)
577
+
578
+ # Add secondary agents based on all detected types
579
+ for qtype in question_types:
580
+ if qtype != primary_type: # Don't duplicate primary
581
+ secondary_agent = primary_agent_map.get(qtype)
582
+ if secondary_agent and secondary_agent not in agents:
583
+ agents.append(secondary_agent)
584
+
585
+ # Always add synthesizer at the end
586
+ if AgentRole.SYNTHESIZER not in agents:
587
+ agents.append(AgentRole.SYNTHESIZER)
588
+
589
+ return agents
src/agents/synthesizer.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  Synthesizer Agent for GAIA Agent System
4
- Combines results from multiple agents and produces final answers
5
  """
6
 
7
  import logging
@@ -10,338 +10,215 @@ from statistics import mean
10
 
11
  from agents.state import GAIAAgentState, AgentRole, AgentResult
12
  from models.qwen_client import QwenClient, ModelTier
 
13
 
14
  logger = logging.getLogger(__name__)
15
 
16
  class SynthesizerAgent:
17
  """
18
- Synthesizer agent that combines multiple agent results into a final answer
 
19
  """
20
 
21
  def __init__(self, llm_client: QwenClient):
22
  self.llm_client = llm_client
 
23
 
24
  def process(self, state: GAIAAgentState) -> GAIAAgentState:
25
  """
26
- Synthesize final answer from multiple agent results
27
  """
28
- logger.info("Synthesizer: Starting result synthesis")
29
- state.add_processing_step("Synthesizer: Analyzing agent results")
30
 
31
  try:
32
- # Check if we have any agent results to synthesize
33
  if not state.agent_results:
34
- error_msg = "No agent results available for synthesis"
35
- state.add_error(error_msg)
36
- state.final_answer = "Unable to process question - no agent results available"
37
  state.final_confidence = 0.0
38
- state.final_reasoning = error_msg
39
  state.is_complete = True
40
  return state
41
 
42
- # Determine synthesis strategy based on available results
43
- synthesis_strategy = self._determine_synthesis_strategy(state)
44
- state.add_processing_step(f"Synthesizer: Using {synthesis_strategy} strategy")
45
 
46
- # Execute synthesis based on strategy
47
- if synthesis_strategy == "single_agent":
48
- final_result = self._synthesize_single_agent(state)
49
- elif synthesis_strategy == "multi_agent_consensus":
50
- final_result = self._synthesize_multi_agent_consensus(state)
51
- elif synthesis_strategy == "confidence_weighted":
52
- final_result = self._synthesize_confidence_weighted(state)
53
- elif synthesis_strategy == "llm_synthesis":
54
- final_result = self._synthesize_with_llm(state)
55
- elif synthesis_strategy == "failure_analysis":
56
- final_result = self._synthesize_failure_analysis(state)
57
  else:
58
- final_result = self._synthesize_fallback(state)
 
 
 
 
 
 
 
59
 
60
  # Update state with final results
61
- state.final_answer = final_result["answer"]
62
- state.final_confidence = final_result["confidence"]
63
- state.final_reasoning = final_result["reasoning"]
64
- state.answer_source = final_result["source"]
65
  state.is_complete = True
66
 
67
- # Check if confidence threshold is met
68
- state.confidence_threshold_met = state.final_confidence >= 0.7
69
-
70
- # Determine if human review is needed
71
- state.requires_human_review = (
72
- state.final_confidence < 0.5 or
73
- len(state.error_messages) > 0 or
74
- state.difficulty_level >= 3
75
- )
76
 
77
- logger.info(f"✅ Synthesis complete: confidence={state.final_confidence:.2f}")
78
- state.add_processing_step(f"Synthesizer: Final answer generated (confidence: {state.final_confidence:.2f})")
79
 
80
  return state
81
 
82
  except Exception as e:
83
- error_msg = f"Synthesis failed: {str(e)}"
84
  state.add_error(error_msg)
85
  logger.error(error_msg)
86
 
87
- # Provide fallback answer
88
- state.final_answer = "Processing failed due to synthesis error"
89
  state.final_confidence = 0.0
90
  state.final_reasoning = error_msg
91
  state.answer_source = "error_fallback"
92
  state.is_complete = True
93
- state.requires_human_review = True
94
 
95
  return state
96
 
97
- def _determine_synthesis_strategy(self, state: GAIAAgentState) -> str:
98
- """Determine the best synthesis strategy based on available results"""
99
-
100
- successful_results = [r for r in state.agent_results.values() if r.success]
101
- failed_results = [r for r in state.agent_results.values() if not r.success]
102
 
103
- # If we have some results but they're mostly failures, try to extract useful info
104
- if len(successful_results) == 0 and len(failed_results) > 0:
105
- return "failure_analysis"
106
- elif len(successful_results) == 1:
107
- return "single_agent"
108
- elif len(successful_results) == 2:
109
- return "confidence_weighted"
110
- elif all(r.confidence > 0.6 for r in successful_results):
111
- return "multi_agent_consensus"
112
- else:
113
- return "llm_synthesis"
114
-
115
- def _synthesize_single_agent(self, state: GAIAAgentState) -> Dict[str, Any]:
116
- """Synthesize result from a single agent"""
117
-
118
- successful_results = [r for r in state.agent_results.values() if r.success]
119
- if not successful_results:
120
- return self._create_fallback_result("No successful agent results")
121
-
122
- best_result = max(successful_results, key=lambda r: r.confidence)
123
-
124
- return {
125
- "answer": best_result.result,
126
- "confidence": best_result.confidence,
127
- "reasoning": f"Single agent result from {best_result.agent_role.value}: {best_result.reasoning}",
128
- "source": best_result.agent_role.value
129
- }
130
-
131
- def _synthesize_multi_agent_consensus(self, state: GAIAAgentState) -> Dict[str, Any]:
132
- """Synthesize results when multiple agents agree (high confidence)"""
133
 
 
134
  successful_results = [r for r in state.agent_results.values() if r.success]
135
- high_confidence_results = [r for r in successful_results if r.confidence > 0.6]
136
-
137
- if not high_confidence_results:
138
- return self._synthesize_confidence_weighted(state)
139
-
140
- # Use the highest confidence result as primary
141
- primary_result = max(high_confidence_results, key=lambda r: r.confidence)
142
-
143
- # Calculate consensus confidence
144
- avg_confidence = mean([r.confidence for r in high_confidence_results])
145
- consensus_confidence = min(0.95, avg_confidence * 1.1) # Boost for consensus
146
-
147
- # Create reasoning summary
148
- agent_summaries = []
149
- for result in high_confidence_results:
150
- agent_summaries.append(f"{result.agent_role.value} (conf: {result.confidence:.2f})")
151
 
152
- reasoning = f"Consensus from {len(high_confidence_results)} agents: {', '.join(agent_summaries)}. Primary result: {primary_result.reasoning}"
 
 
 
 
 
 
 
 
 
153
 
154
- return {
155
- "answer": primary_result.result,
156
- "confidence": consensus_confidence,
157
- "reasoning": reasoning,
158
- "source": f"consensus_{len(high_confidence_results)}_agents"
159
- }
160
 
161
- def _synthesize_confidence_weighted(self, state: GAIAAgentState) -> Dict[str, Any]:
162
- """Synthesize results using confidence weighting"""
163
 
164
- successful_results = [r for r in state.agent_results.values() if r.success]
165
 
166
- if not successful_results:
167
- return self._create_fallback_result("No successful results for confidence weighting")
 
168
 
169
- # Weight by confidence
170
- total_weight = sum(r.confidence for r in successful_results)
171
- if total_weight == 0:
172
- return self._synthesize_single_agent(state)
173
 
174
- # Select primary result (highest confidence)
175
- primary_result = max(successful_results, key=lambda r: r.confidence)
 
176
 
177
- # Calculate weighted confidence
178
- weighted_confidence = sum(r.confidence ** 2 for r in successful_results) / total_weight
 
179
 
180
- # Create reasoning
181
- result_summaries = []
182
- for result in successful_results:
183
- weight = result.confidence / total_weight
184
- result_summaries.append(f"{result.agent_role.value} (weight: {weight:.2f})")
185
 
186
- reasoning = f"Confidence-weighted synthesis: {', '.join(result_summaries)}. Primary: {primary_result.reasoning}"
 
 
187
 
188
- return {
189
- "answer": primary_result.result,
190
- "confidence": min(0.9, weighted_confidence),
191
- "reasoning": reasoning,
192
- "source": f"weighted_{len(successful_results)}_agents"
193
- }
194
 
195
- def _synthesize_with_llm(self, state: GAIAAgentState) -> Dict[str, Any]:
196
- """Use LLM to synthesize conflicting or complex results"""
197
-
198
- successful_results = [r for r in state.agent_results.values() if r.success]
199
-
200
- # Prepare synthesis prompt
201
- agent_results_text = []
202
- for i, result in enumerate(successful_results, 1):
203
- agent_results_text.append(f"""
204
- Agent {i} ({result.agent_role.value}):
205
- - Answer: {result.result}
206
- - Confidence: {result.confidence:.2f}
207
- - Reasoning: {result.reasoning}
208
- """)
209
 
210
  synthesis_prompt = f"""
 
 
211
  Question: {state.question}
212
 
213
- Multiple agents have provided different answers/insights. Please synthesize these into a single, coherent final answer:
 
214
 
215
- {chr(10).join(agent_results_text)}
216
 
217
- Please provide:
218
- 1. A clear, direct final answer
219
- 2. Your confidence level (0.0 to 1.0)
220
- 3. Brief reasoning explaining how you synthesized the results
 
 
 
221
 
222
- Focus on accuracy and be direct in your response.
223
- """
 
 
224
 
225
- # Use complex model for synthesis
226
- model_tier = ModelTier.COMPLEX if state.should_use_complex_model() else ModelTier.MAIN
227
- llm_result = self.llm_client.generate(synthesis_prompt, tier=model_tier, max_tokens=400)
 
 
 
228
 
229
- if llm_result.success:
230
- # Parse LLM response for structured output
231
- llm_answer = llm_result.response
232
-
233
- # Extract confidence if mentioned in response
234
- confidence_match = re.search(r'confidence[:\s]*([0-9.]+)', llm_answer.lower())
235
- llm_confidence = float(confidence_match.group(1)) if confidence_match else 0.7
236
-
237
- # Adjust confidence based on input quality
238
- avg_input_confidence = mean([r.confidence for r in successful_results])
239
- final_confidence = min(0.85, (llm_confidence + avg_input_confidence) / 2)
240
-
241
  return {
242
- "answer": llm_answer,
243
- "confidence": final_confidence,
244
- "reasoning": f"LLM synthesis of {len(successful_results)} agent results using {llm_result.model_used}",
245
- "source": "llm_synthesis"
246
  }
247
  else:
248
- # Fallback to confidence weighted if LLM fails
249
- return self._synthesize_confidence_weighted(state)
250
 
251
- def _synthesize_fallback(self, state: GAIAAgentState) -> Dict[str, Any]:
252
- """Enhanced fallback synthesis when other strategies fail"""
253
 
254
- # Try to get any result, even if not successful
255
- all_results = list(state.agent_results.values())
256
 
257
- if all_results:
258
- # First try successful results
259
- successful_results = [r for r in all_results if r.success]
260
- if successful_results:
261
- best_attempt = max(successful_results, key=lambda r: r.confidence)
 
 
 
 
 
 
262
  return {
263
- "answer": best_attempt.result,
264
- "confidence": max(0.3, best_attempt.confidence * 0.8), # Reduce confidence for fallback
265
- "reasoning": f"Fallback result from {best_attempt.agent_role.value}: {best_attempt.reasoning}",
266
- "source": f"fallback_{best_attempt.agent_role.value}"
267
  }
268
-
269
- # If no successful results, try to extract useful info from failures
270
- return self._synthesize_failure_analysis(state)
271
- else:
272
- return self._create_fallback_result("No agent results available")
273
-
274
- def _synthesize_failure_analysis(self, state: GAIAAgentState) -> Dict[str, Any]:
275
- """Analyze failed results to provide some useful response"""
276
-
277
- failed_results = [r for r in state.agent_results.values() if not r.success]
278
-
279
- if not failed_results:
280
- return self._create_fallback_result("No results to analyze")
281
-
282
- # Look for patterns in failures
283
- error_patterns = []
284
- attempted_agents = []
285
-
286
- for result in failed_results:
287
- attempted_agents.append(result.agent_role.value)
288
-
289
- # Extract meaningful error information
290
- result_text = result.result.lower()
291
- if "research sources failed" in result_text:
292
- error_patterns.append("external_research_unavailable")
293
- elif "reasoning failed" in result_text:
294
- error_patterns.append("complex_reasoning_required")
295
- elif "conversion" in result_text:
296
- error_patterns.append("conversion_difficulty")
297
- elif "mathematical" in result_text:
298
- error_patterns.append("mathematical_complexity")
299
-
300
- # Try to provide a helpful response based on the question type and failures
301
- try:
302
- analysis_prompt = f"""
303
- Question: {state.question}
304
-
305
- Multiple specialized agents attempted to answer this question but encountered difficulties:
306
- - Agents tried: {', '.join(attempted_agents)}
307
- - Common issues: {', '.join(set(error_patterns)) if error_patterns else 'processing difficulties'}
308
-
309
- Based on the question itself, please provide the best answer you can using basic reasoning and knowledge.
310
- Even if external resources failed, try to answer based on general knowledge.
311
-
312
- Be honest about limitations but try to be helpful.
313
- """
314
-
315
- # Use main model for analysis
316
- llm_result = self.llm_client.generate(analysis_prompt, tier=ModelTier.MAIN, max_tokens=300)
317
-
318
- if llm_result.success:
319
  return {
320
- "answer": llm_result.response,
321
- "confidence": 0.25, # Low confidence but still attempting
322
- "reasoning": f"Generated from failure analysis. Agents tried: {', '.join(attempted_agents)}",
323
- "source": "failure_analysis"
324
  }
325
-
326
- except Exception as analysis_error:
327
- logger.warning(f"Failure analysis also failed: {analysis_error}")
328
-
329
- # Final fallback - provide structured error message
330
- return {
331
- "answer": f"Processing encountered difficulties: All research sources failed",
332
- "confidence": 0.1,
333
- "reasoning": f"Multiple agents failed: {', '.join(attempted_agents)}. {', '.join(set(error_patterns)) if error_patterns else 'Various processing issues encountered'}",
334
- "source": "structured_failure"
335
- }
336
-
337
- def _create_fallback_result(self, reason: str) -> Dict[str, Any]:
338
- """Create a fallback result when synthesis is impossible"""
339
- return {
340
- "answer": f"Unable to process question: {reason}",
341
- "confidence": 0.0,
342
- "reasoning": f"Synthesis failed: {reason}",
343
- "source": "synthesis_failure"
344
- }
345
 
346
  # Import regex for LLM response parsing
347
  import re
 
1
  #!/usr/bin/env python3
2
  """
3
  Synthesizer Agent for GAIA Agent System
4
+ GAIA-Compliant Final Answer Generation for Exact Match Evaluation
5
  """
6
 
7
  import logging
 
10
 
11
  from agents.state import GAIAAgentState, AgentRole, AgentResult
12
  from models.qwen_client import QwenClient, ModelTier
13
+ from tools.final_answer_tool import FinalAnswerTool
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
class SynthesizerAgent:
    """
    GAIA-compliant synthesizer that produces EXACT MATCH answers.

    Combines the results of all specialist agents, optionally re-synthesizes
    them with the complex (72B) model tier, and runs the final-answer tool to
    extract a short, exact-match-friendly answer string.
    """

    def __init__(self, llm_client: QwenClient):
        # Shared LLM client; also powers the final-answer extraction tool.
        self.llm_client = llm_client
        self.final_answer_tool = FinalAnswerTool(llm_client)

    def process(self, state: GAIAAgentState) -> GAIAAgentState:
        """Synthesize a GAIA-compliant final answer from the agent results.

        Mutates and returns `state`: sets final_answer, final_confidence,
        final_reasoning, answer_source, and marks the state complete.
        """
        logger.info("🎯 Synthesizer: Starting GAIA-compliant synthesis")
        state.add_processing_step("Synthesizer: Generating GAIA-compliant final answer")

        try:
            # Nothing to synthesize - finish with an empty-result marker.
            if not state.agent_results:
                logger.warning("No agent results available for synthesis")
                state.final_answer = "No results available"
                state.final_confidence = 0.0
                state.final_reasoning = "No agent results to synthesize"
                state.is_complete = True
                return state

            # Merge every agent result into one text dossier and classify the
            # question so extraction can apply type-specific rules.
            dossier = self._combine_agent_results(state)
            question_type = self._determine_question_type(state.question)

            # Multiple results (or a question flagged complex) justify the 72B
            # model; otherwise a cheap single-result pass is enough.
            if len(state.agent_results) > 1 or state.should_use_complex_model():
                synthesis = self._synthesize_with_72b(state, dossier, question_type)
            else:
                synthesis = self._synthesize_simple(state, dossier, question_type)

            # Reduce the synthesized analysis to an exact-match answer string.
            extraction = self.final_answer_tool.extract_final_answer(
                question=state.question,
                agent_results=synthesis["analysis"],
                question_type=question_type
            )

            state.final_answer = extraction["answer"]
            state.final_confidence = extraction["confidence"]
            state.final_reasoning = f"Synthesis: {synthesis['reasoning']} | Extraction: {extraction['reasoning']}"
            state.answer_source = "gaia_compliant_synthesis"
            state.is_complete = True

            # GAIA answers are short; penalize confidence on long answers.
            if len(state.final_answer) > 100:
                logger.warning(f"Answer may be too long for GAIA: {len(state.final_answer)} chars")
                state.final_confidence *= 0.7

            logger.info(f"✅ GAIA synthesis complete: '{state.final_answer}' (conf: {state.final_confidence:.2f})")
            state.add_processing_step(f"Synthesizer: GAIA answer generated - '{state.final_answer}'")

            return state

        except Exception as e:
            error_msg = f"GAIA synthesis failed: {str(e)}"
            state.add_error(error_msg)
            logger.error(error_msg)

            # Fallback: mark the state complete with a zero-confidence error.
            state.final_answer = "Processing error"
            state.final_confidence = 0.0
            state.final_reasoning = error_msg
            state.answer_source = "error_fallback"
            state.is_complete = True
            return state

    def _combine_agent_results(self, state: GAIAAgentState) -> str:
        """Render all agent results (successes first) as one analysis text."""
        sections = []

        # Successful results carry the real signal - list them first.
        winners = [r for r in state.agent_results.values() if r.success]
        if winners:
            sections.append("=== SUCCESSFUL AGENT RESULTS ===")
            for res in winners:
                sections.append(f"""
{res.agent_role.value.upper()} (Confidence: {res.confidence:.2f}):
Result: {res.result}
Reasoning: {res.reasoning}
""")

        # Failed results may still hold partial context worth surfacing.
        losers = [r for r in state.agent_results.values() if not r.success]
        if losers:
            sections.append("\n=== ADDITIONAL CONTEXT ===")
            for res in losers:
                if len(res.result) > 10:  # skip empty/near-empty failures
                    sections.append(f"""
{res.agent_role.value.upper()} (Failed):
Attempted: {res.result[:200]}...
""")

        return "\n".join(sections)

    def _determine_question_type(self, question: str) -> str:
        """Classify the question so answer extraction can specialize.

        The first matching category wins; defaults to "general".
        """
        q = question.lower()

        # Ordered (keywords, label) table - mirrors the original if/elif chain.
        categories = (
            (("how many", "count", "number of", "calculate", "sum", "total"), "mathematical"),
            (("opposite", "reverse", "backwards", "decode"), "text_manipulation"),
            (("yes or no", "true or false", "is it", "does it", "can it"), "yes_no"),
            (("who", "name", "first name", "last name", "surname"), "name"),
            (("where", "city", "country", "location", "place"), "location"),
            (("file", "image", "code", "python", "attached", "excel"), "file_processing"),
        )
        for keywords, label in categories:
            if any(word in q for word in keywords):
                return label

        return "general"

    def _synthesize_with_72b(self, state: GAIAAgentState, combined_analysis: str, question_type: str) -> Dict[str, Any]:
        """Ask the complex (72B) model tier to reconcile all agent results."""
        synthesis_prompt = f"""
CRITICAL: This is GAIA benchmark evaluation requiring EXACT MATCH answers.

Question: {state.question}

Agent Analysis Results:
{combined_analysis}

Your task: Analyze all agent results and provide the most accurate answer.

GAIA COMPLIANCE RULES:
- Your answer must be concise and precise for exact match comparison
- No explanations, no "FINAL ANSWER:" prefix, no extra text
- For numbers: just the number (e.g., "5")
- For yes/no: just "yes" or "no"
- For names: just the name requested
- For locations: just the location name

Question Type: {question_type}

Based on all the agent results above, what is the precise answer to the original question?
Think carefully but respond with ONLY the answer:"""

        # Complex tier with a tight token budget: we want an answer, not an essay.
        result = self.llm_client.generate(
            synthesis_prompt,
            tier=ModelTier.COMPLEX,  # 72B model
            max_tokens=100
        )

        if result.success:
            return {
                "analysis": result.response,
                "reasoning": f"72B synthesis of {len(state.agent_results)} agent results"
            }

        # Model call failed - degrade gracefully to the simple path.
        return self._synthesize_simple(state, combined_analysis, question_type)

    def _synthesize_simple(self, state: GAIAAgentState, combined_analysis: str, question_type: str) -> Dict[str, Any]:
        """Pick the best single result without an extra model call."""
        winners = [r for r in state.agent_results.values() if r.success]

        if winners:
            # Highest-confidence successful result wins.
            best = max(winners, key=lambda r: r.confidence)
            return {
                "analysis": f"Primary result from {best.agent_role.value}: {best.result}",
                "reasoning": f"Single agent result from {best.agent_role.value}"
            }

        # No successes: salvage whatever the first agent produced.
        everything = list(state.agent_results.values())
        if everything:
            fallback = everything[0]
            return {
                "analysis": f"Fallback from {fallback.agent_role.value}: {fallback.result}",
                "reasoning": f"Fallback synthesis from {fallback.agent_role.value}"
            }

        return {
            "analysis": "No agent results available",
            "reasoning": "No synthesis possible - no results"
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  # Import regex for LLM response parsing
224
  import re
src/tools/__pycache__/final_answer_tool.cpython-310.pyc ADDED
Binary file (6.55 kB). View file
 
src/tools/final_answer_tool.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Final Answer Tool for GAIA Agent System
4
+ Extracts precise, EXACT MATCH compliant answers from agent results
5
+ """
6
+
7
+ import re
8
+ import logging
9
+ from typing import Dict, Any, Optional
10
+
11
+ from models.qwen_client import QwenClient, ModelTier
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers.

    Ensures EXACT MATCH compatibility for Unit 4 API submission: a
    type-specialized prompt drives the complex model tier, and the raw
    output is cleaned and validated before being returned.
    """

    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client

    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "") -> Dict[str, Any]:
        """
        Extract the precise final answer in GAIA-compliant format.

        Args:
            question: The original GAIA question.
            agent_results: Combined results from multiple agents.
            question_type: Type of question (for specialized extraction).

        Returns:
            Dict with extracted "answer", "confidence", and "reasoning".
        """
        try:
            logger.info("🎯 Extracting GAIA-compliant final answer")

            # Build a prompt whose rules depend on the question type.
            prompt = self._create_extraction_prompt(question, agent_results, question_type)

            # Complex (72B) tier with a tiny token budget forces terse output.
            result = self.llm_client.generate(
                prompt,
                tier=ModelTier.COMPLEX,  # 72B model
                max_tokens=50  # Force concise answers
            )

            if not result.success:
                logger.error("Final answer extraction failed")
                return {
                    "answer": "Processing failed",
                    "confidence": 0.0,
                    "reasoning": f"Extraction failed: {result.response}"
                }

            # Normalize the raw model output, then score its format.
            cleaned = self._clean_answer(result.response, question_type)
            verdict = self._validate_answer(cleaned, question_type)

            logger.info(f"✅ Final answer extracted: '{cleaned}'")

            return {
                "answer": cleaned,
                "confidence": verdict["confidence"],
                "reasoning": f"Extracted using 72B model. Validation: {verdict['status']}"
            }

        except Exception as e:
            error_msg = f"Final answer extraction error: {str(e)}"
            logger.error(error_msg)
            return {
                "answer": "Extraction error",
                "confidence": 0.0,
                "reasoning": error_msg
            }

    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create a specialized extraction prompt based on the question type."""
        q = question.lower()
        qt = question_type.lower()

        base_prompt = f"""
CRITICAL: This is for GAIA benchmark evaluation using EXACT MATCH comparison.
Your response must be ONLY the precise answer - no explanations, no "FINAL ANSWER:", no extra text.

Question: {question}

Agent Analysis Results:
{agent_results}

EXTRACTION RULES:
"""

        # Append type-specific rules; the first matching branch wins.
        if "mathematical" in qt or any(word in q for word in ["how many", "count", "number", "calculate"]):
            base_prompt += """
- If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
- If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
- No units unless specifically requested in the question
"""
        elif "text_manipulation" in qt or "reverse" in q:
            base_prompt += """
- If text is reversed: provide the corrected text
- If asking for opposite: provide ONLY the opposite word (e.g., "right" for opposite of "left")
- If asking to decode: provide ONLY the decoded answer
"""
        elif "yes" in q or "true" in q or "false" in q:
            base_prompt += """
- If yes/no question: respond with ONLY "yes" or "no" (lowercase)
- If true/false question: respond with ONLY "true" or "false" (lowercase)
"""
        elif any(word in q for word in ["name", "who", "which person"]):
            base_prompt += """
- If asking for a name: provide ONLY the name (e.g., "John Smith", "Einstein")
- If asking for first name only: provide ONLY first name (e.g., "John")
- If asking for last name only: provide ONLY last name (e.g., "Smith")
"""
        elif any(word in q for word in ["where", "location", "city", "country"]):
            base_prompt += """
- If asking for location: provide ONLY the location name (e.g., "Paris", "USA", "New York")
- No additional descriptors unless specifically requested
"""
        else:
            base_prompt += """
- Provide ONLY the direct answer to the question
- No explanations, context, or additional information
- Be as concise as possible while being accurate
"""

        base_prompt += """

EXAMPLES OF CORRECT FORMAT:
- Question: "How many albums?" → Answer: "5"
- Question: "What is the opposite of left?" → Answer: "right"
- Question: "True or false?" → Answer: "true"
- Question: "Who discovered X?" → Answer: "Einstein"
- Question: "Which city?" → Answer: "London"

Extract the precise answer NOW:"""

        return base_prompt

    def _clean_answer(self, raw_answer: str, question_type: str) -> str:
        """Normalize raw model output into a bare answer string."""
        answer = raw_answer.strip()

        # Strip boilerplate prefixes the model tends to emit.
        prefixes_to_remove = [
            "the answer is",
            "answer:",
            "final answer:",
            "result:",
            "response:",
            "conclusion:",
        ]
        for prefix in prefixes_to_remove:
            if answer.lower().startswith(prefix):
                answer = answer[len(prefix):].strip()

        # Unwrap quotes that enclose the entire answer.
        if answer.startswith('"') and answer.endswith('"'):
            answer = answer[1:-1]
        if answer.startswith("'") and answer.endswith("'"):
            answer = answer[1:-1]

        # Type-specific normalization.
        if "mathematical" in question_type.lower():
            # Keep just the first number for mathematical questions.
            number_match = re.search(r'-?\d+(?:\.\d+)?', answer)
            if number_match:
                answer = number_match.group()
        elif "text_manipulation" in question_type.lower():
            # Single-word manipulations are compared lowercase.
            if len(answer.split()) == 1:
                answer = answer.lower()

        # Drop trailing punctuation that's not part of the answer.
        answer = answer.rstrip('.,!?;:')

        return answer.strip()

    def _validate_answer(self, answer: str, question_type: str) -> Dict[str, Any]:
        """Score the extracted answer's format; returns status + confidence."""
        if not answer:
            return {"status": "empty_answer", "confidence": 0.0}

        # GAIA answers should be concise - hard cap before type checks.
        if len(answer) > 100:
            return {"status": "too_long", "confidence": 0.3}

        qt = question_type.lower()

        # Type-specific validation.
        if "mathematical" in qt:
            if re.match(r'^-?\d+(?:\.\d+)?$', answer):
                return {"status": "valid_number", "confidence": 0.9}
            return {"status": "invalid_number_format", "confidence": 0.5}

        if "yes_no" in qt:
            if answer.lower() in ["yes", "no", "true", "false"]:
                return {"status": "valid_boolean", "confidence": 0.9}
            return {"status": "invalid_boolean_format", "confidence": 0.4}

        # General validation - prefer short, direct answers.
        if len(answer) <= 20:
            return {"status": "concise_answer", "confidence": 0.8}
        if len(answer) <= 50:
            return {"status": "moderate_length", "confidence": 0.6}
        return {"status": "long_answer", "confidence": 0.4}