Chris committed on
Commit
6afa67b
·
1 Parent(s): b55bafd

Final 7.6.3

Browse files
src/agents/router.py CHANGED
@@ -24,49 +24,43 @@ class RouterAgent:
24
 
25
  def process(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
- Enhanced routing with multi-phase problem decomposition
28
  """
29
- logger.info("🧭 Router: Starting multi-phase question analysis")
30
- state.add_processing_step("Router: Multi-phase analysis initiated")
31
 
32
  try:
33
- # Phase 1: Structural Analysis
34
- structural_analysis = self._analyze_question_structure(state.question)
35
- state.add_processing_step(f"Router: Structure = {structural_analysis['type']}")
36
 
37
- # Phase 2: Information Requirements Analysis
38
- info_requirements = self._analyze_information_needs(state.question, structural_analysis)
39
- state.add_processing_step(f"Router: Needs = {info_requirements['primary_need']}")
40
 
41
- # Phase 3: Strategy Planning
42
- execution_strategy = self._plan_execution_strategy(state.question, structural_analysis, info_requirements)
43
- state.add_processing_step(f"Router: Strategy = {execution_strategy['approach']}")
44
 
45
- # Phase 4: Agent Selection and Sequencing
46
- agent_sequence = self._select_agent_sequence(execution_strategy, info_requirements)
47
-
48
- # Store analysis in state for agents to use
49
  state.router_analysis = {
50
- 'structural': structural_analysis,
51
- 'requirements': info_requirements,
52
- 'strategy': execution_strategy,
53
- 'sequence': agent_sequence
54
  }
55
 
56
- logger.info(f"✅ Routing complete: {structural_analysis['type']} -> {agent_sequence}")
57
- state.add_processing_step(f"Router: Selected agents = {agent_sequence}")
58
 
59
- # Set agent sequence for workflow
60
- state.agent_sequence = agent_sequence
61
  return state
62
 
63
  except Exception as e:
64
- error_msg = f"Router analysis failed: {str(e)}"
65
  logger.error(error_msg)
66
  state.add_error(error_msg)
67
 
68
  # Fallback to basic routing
69
- state.agent_sequence = ['reasoning_agent', 'web_researcher', 'synthesizer']
 
 
 
70
  return state
71
 
72
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
@@ -826,4 +820,302 @@ REASONING: [brief explanation]
826
  sequence.remove('synthesizer')
827
  sequence.append('synthesizer')
828
 
829
- return sequence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def process(self, state: GAIAAgentState) -> GAIAAgentState:
26
  """
27
+ Enhanced router processing with improved classification and planning
28
  """
29
+ logger.info("🧭 Router: Starting enhanced multi-phase analysis")
30
+ state.add_processing_step("Router: Enhanced multi-phase question analysis")
31
 
32
  try:
33
+ # Enhanced classification
34
+ classification_result = self._classify_question_enhanced(state.question)
 
35
 
36
+ state.question_type = classification_result['question_type']
37
+ state.routing_decision = classification_result['reasoning']
 
38
 
39
+ # Select agents based on enhanced classification
40
+ agents = self._select_agents_for_type(classification_result)
41
+ state.selected_agents = agents
42
 
43
+ # Store enhanced analysis for downstream agents
 
 
 
44
  state.router_analysis = {
45
+ 'classification': classification_result,
46
+ 'selected_agents': [a.value for a in agents],
47
+ 'confidence': classification_result['confidence']
 
48
  }
49
 
50
+ logger.info(f"✅ Enhanced routing: {classification_result['type']} -> {[a.value for a in agents]}")
 
51
 
 
 
52
  return state
53
 
54
  except Exception as e:
55
+ error_msg = f"Enhanced router analysis failed: {str(e)}"
56
  logger.error(error_msg)
57
  state.add_error(error_msg)
58
 
59
  # Fallback to basic routing
60
+ state.question_type = QuestionType.GENERAL_INQUIRY
61
+ state.selected_agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT, AgentRole.SYNTHESIZER]
62
+ state.routing_decision = f"Enhanced routing failed, using fallback: {error_msg}"
63
+
64
  return state
65
 
66
  def route_question(self, state: GAIAAgentState) -> GAIAAgentState:
 
820
  sequence.remove('synthesizer')
821
  sequence.append('synthesizer')
822
 
823
+ return sequence
824
+
825
+ def _classify_question_enhanced(self, question: str) -> Dict[str, Any]:
826
+ """Enhanced question classification using better pattern matching and LLM analysis"""
827
+
828
+ question_lower = question.lower()
829
+
830
+ # Enhanced pattern classification
831
+ pattern_classification = self._classify_by_enhanced_patterns(question_lower, question)
832
+
833
+ # LLM-based classification for complex cases
834
+ llm_classification = self._classify_with_llm(question)
835
+
836
+ # Combine both approaches
837
+ final_classification = self._combine_classifications(pattern_classification, llm_classification, question)
838
+
839
+ logger.info(f"🤖 Enhanced classification: Pattern={pattern_classification['type']}, LLM={llm_classification['type']}, Final={final_classification['type']}")
840
+
841
+ return final_classification
842
+
843
+ def _classify_by_enhanced_patterns(self, question_lower: str, original_question: str) -> Dict[str, Any]:
844
+ """Enhanced pattern-based classification with better accuracy"""
845
+
846
+ # Mathematical/counting questions (high confidence patterns)
847
+ mathematical_patterns = [
848
+ r'\bhow many\b',
849
+ r'\bcount\b.*\b(of|the)\b',
850
+ r'\bnumber of\b',
851
+ r'\btotal\b.*\b(of|number)\b',
852
+ r'\bcalculate\b',
853
+ r'\bsum\b.*\bof\b',
854
+ r'\bhow much\b',
855
+ r'\bquantity\b'
856
+ ]
857
+
858
+ if any(re.search(pattern, question_lower) for pattern in mathematical_patterns):
859
+ # Check for temporal constraints
860
+ temporal_indicators = ['between', 'from', 'during', 'in', r'\b(19|20)\d{2}\b']
861
+ has_temporal = any(re.search(indicator, question_lower) for indicator in temporal_indicators)
862
+
863
+ return {
864
+ 'type': 'mathematical',
865
+ 'confidence': 0.9,
866
+ 'subtype': 'temporal_counting' if has_temporal else 'general_counting',
867
+ 'reasoning': 'Strong mathematical/counting indicators found'
868
+ }
869
+
870
+ # Text manipulation questions
871
+ text_manipulation_patterns = [
872
+ r'\bopposite\b',
873
+ r'\breverse\b',
874
+ r'\bbackwards\b',
875
+ r'\bdecode\b',
876
+ r'\btranslate\b',
877
+ r'\bconvert\b',
878
+ r'\.rewsna', # Common in reversed text questions
879
+ r'\bcipher\b',
880
+ r'\bencrypt\b'
881
+ ]
882
+
883
+ if any(re.search(pattern, question_lower) for pattern in text_manipulation_patterns):
884
+ return {
885
+ 'type': 'text_manipulation',
886
+ 'confidence': 0.85,
887
+ 'subtype': 'text_processing',
888
+ 'reasoning': 'Text manipulation patterns detected'
889
+ }
890
+
891
+ # File/code processing questions
892
+ file_patterns = [
893
+ r'\battached\b.*\b(file|image|document|excel|csv|python|code)\b',
894
+ r'\bfile\b.*\b(contains|attached|uploaded)\b',
895
+ r'\b(image|photo|picture)\b.*\b(shows|contains|attached)\b',
896
+ r'\bcode\b.*\b(attached|file|script)\b',
897
+ r'\bspreadsheet\b',
898
+ r'\b\.py\b|\b\.csv\b|\b\.xlsx\b|\b\.png\b|\b\.jpg\b'
899
+ ]
900
+
901
+ if any(re.search(pattern, question_lower) for pattern in file_patterns):
902
+ return {
903
+ 'type': 'file_processing',
904
+ 'confidence': 0.9,
905
+ 'subtype': 'file_analysis',
906
+ 'reasoning': 'File processing indicators found'
907
+ }
908
+
909
+ # Web research questions (specific indicators)
910
+ web_research_patterns = [
911
+ r'\bwikipedia\b.*\barticle\b',
912
+ r'\bfeatured article\b',
913
+ r'\bpromoted\b.*\b(in|during)\b.*\b(19|20)\d{2}\b',
914
+ r'\bnominated\b.*\bby\b',
915
+ r'\byoutube\b.*\bvideo\b',
916
+ r'\bwatch\?v=\b',
917
+ r'\bhttps?://\b',
918
+ r'\bwebsite\b|\burl\b'
919
+ ]
920
+
921
+ if any(re.search(pattern, question_lower) for pattern in web_research_patterns):
922
+ return {
923
+ 'type': 'web_research',
924
+ 'confidence': 0.8,
925
+ 'subtype': 'specific_lookup',
926
+ 'reasoning': 'Web-specific content indicators found'
927
+ }
928
+
929
+ # Reasoning/analysis questions
930
+ reasoning_patterns = [
931
+ r'\banalyze\b|\banalysis\b',
932
+ r'\bcompare\b|\bcomparison\b',
933
+ r'\bexplain\b|\bexplanation\b',
934
+ r'\bwhy\b.*\b(is|are|was|were|do|does|did)\b',
935
+ r'\bhow\b.*\b(does|do|did|can|could|would)\b',
936
+ r'\bwhat.*difference\b',
937
+ r'\bwhat.*relationship\b'
938
+ ]
939
+
940
+ if any(re.search(pattern, question_lower) for pattern in reasoning_patterns):
941
+ return {
942
+ 'type': 'reasoning',
943
+ 'confidence': 0.7,
944
+ 'subtype': 'analytical_reasoning',
945
+ 'reasoning': 'Reasoning/analysis patterns detected'
946
+ }
947
+
948
+ # General factual questions
949
+ factual_patterns = [
950
+ r'\bwho\b.*\b(is|was|are|were)\b',
951
+ r'\bwhat\b.*\b(is|was|are|were)\b',
952
+ r'\bwhen\b.*\b(did|was|were|is|are)\b',
953
+ r'\bwhere\b.*\b(is|was|are|were)\b',
954
+ r'\bwhich\b.*\b(is|was|are|were)\b'
955
+ ]
956
+
957
+ if any(re.search(pattern, question_lower) for pattern in factual_patterns):
958
+ return {
959
+ 'type': 'factual_lookup',
960
+ 'confidence': 0.6,
961
+ 'subtype': 'general_factual',
962
+ 'reasoning': 'General factual question patterns'
963
+ }
964
+
965
+ # Default classification
966
+ return {
967
+ 'type': 'general',
968
+ 'confidence': 0.4,
969
+ 'subtype': 'unclassified',
970
+ 'reasoning': 'No specific patterns matched'
971
+ }
972
+
973
+ def _classify_with_llm(self, question: str) -> Dict[str, Any]:
974
+ """LLM-based classification for complex questions"""
975
+
976
+ classification_prompt = f"""
977
+ Analyze this question and classify it into one of these categories:
978
+
979
+ Categories:
980
+ - mathematical: Questions asking for counts, calculations, quantities
981
+ - text_manipulation: Questions involving text reversal, encoding, word puzzles
982
+ - file_processing: Questions about attached files, images, code, data
983
+ - web_research: Questions requiring web search, Wikipedia lookup, current information
984
+ - reasoning: Questions requiring analysis, comparison, logical deduction
985
+ - factual_lookup: Simple fact-based questions about people, places, events
986
+
987
+ Question: {question}
988
+
989
+ Respond with just the category name and a brief reason (max 10 words).
990
+ Format: category_name: reason
991
+
992
+ Classification:"""
993
+
994
+ try:
995
+ llm_result = self.llm_client.generate(
996
+ classification_prompt,
997
+ tier=ModelTier.ROUTER, # Use fast model for classification
998
+ max_tokens=50
999
+ )
1000
+
1001
+ if llm_result.success:
1002
+ response = llm_result.response.strip().lower()
1003
+
1004
+ # Parse the response
1005
+ if ':' in response:
1006
+ category, reason = response.split(':', 1)
1007
+ category = category.strip()
1008
+ reason = reason.strip()
1009
+ else:
1010
+ category = response.split()[0] if response.split() else 'general'
1011
+ reason = 'llm classification'
1012
+
1013
+ # Validate category
1014
+ valid_categories = ['mathematical', 'text_manipulation', 'file_processing', 'web_research', 'reasoning', 'factual_lookup']
1015
+ if category not in valid_categories:
1016
+ category = 'general'
1017
+
1018
+ return {
1019
+ 'type': category,
1020
+ 'confidence': 0.7,
1021
+ 'reasoning': f'LLM: {reason}'
1022
+ }
1023
+ else:
1024
+ return {
1025
+ 'type': 'general',
1026
+ 'confidence': 0.3,
1027
+ 'reasoning': 'LLM classification failed'
1028
+ }
1029
+
1030
+ except Exception as e:
1031
+ logger.warning(f"LLM classification failed: {e}")
1032
+ return {
1033
+ 'type': 'general',
1034
+ 'confidence': 0.3,
1035
+ 'reasoning': 'LLM classification error'
1036
+ }
1037
+
1038
+ def _combine_classifications(self, pattern_result: Dict[str, Any], llm_result: Dict[str, Any], question: str) -> Dict[str, Any]:
1039
+ """Combine pattern and LLM classifications for final decision"""
1040
+
1041
+ pattern_type = pattern_result['type']
1042
+ pattern_confidence = pattern_result['confidence']
1043
+ llm_type = llm_result['type']
1044
+ llm_confidence = llm_result['confidence']
1045
+
1046
+ # If pattern matching has high confidence, trust it
1047
+ if pattern_confidence >= 0.8:
1048
+ final_type = pattern_type
1049
+ final_confidence = pattern_confidence
1050
+ reasoning = f"High confidence pattern match: {pattern_result['reasoning']}"
1051
+
1052
+ # If both agree, boost confidence
1053
+ elif pattern_type == llm_type:
1054
+ final_type = pattern_type
1055
+ final_confidence = min(0.95, (pattern_confidence + llm_confidence) / 2 + 0.1)
1056
+ reasoning = f"Pattern and LLM agree: {pattern_type}"
1057
+
1058
+ # If they disagree, use the one with higher confidence
1059
+ elif pattern_confidence > llm_confidence:
1060
+ final_type = pattern_type
1061
+ final_confidence = pattern_confidence * 0.9 # Slight penalty for disagreement
1062
+ reasoning = f"Pattern-based: {pattern_result['reasoning']}"
1063
+ else:
1064
+ final_type = llm_type
1065
+ final_confidence = llm_confidence * 0.9 # Slight penalty for disagreement
1066
+ reasoning = f"LLM-based: {llm_result['reasoning']}"
1067
+
1068
+ # Map to question types
1069
+ type_mapping = {
1070
+ 'mathematical': QuestionType.QUANTITATIVE_ANALYSIS,
1071
+ 'text_manipulation': QuestionType.TEXT_MANIPULATION,
1072
+ 'file_processing': QuestionType.FILE_PROCESSING,
1073
+ 'web_research': QuestionType.WEB_RESEARCH,
1074
+ 'reasoning': QuestionType.COMPLEX_REASONING,
1075
+ 'factual_lookup': QuestionType.FACTUAL_LOOKUP,
1076
+ 'general': QuestionType.GENERAL_INQUIRY
1077
+ }
1078
+
1079
+ question_type = type_mapping.get(final_type, QuestionType.GENERAL_INQUIRY)
1080
+
1081
+ return {
1082
+ 'type': final_type,
1083
+ 'question_type': question_type,
1084
+ 'confidence': final_confidence,
1085
+ 'reasoning': reasoning,
1086
+ 'pattern_result': pattern_result,
1087
+ 'llm_result': llm_result
1088
+ }
1089
+
1090
+ def _select_agents_for_type(self, classification_result: Dict[str, Any]) -> List[AgentRole]:
1091
+ """Select appropriate agents based on enhanced classification"""
1092
+
1093
+ question_type = classification_result['type']
1094
+ confidence = classification_result['confidence']
1095
+
1096
+ # Agent selection based on question type
1097
+ if question_type == 'mathematical':
1098
+ agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT]
1099
+ elif question_type == 'text_manipulation':
1100
+ agents = [AgentRole.REASONING_AGENT]
1101
+ elif question_type == 'file_processing':
1102
+ agents = [AgentRole.FILE_PROCESSOR, AgentRole.REASONING_AGENT]
1103
+ elif question_type == 'web_research':
1104
+ agents = [AgentRole.WEB_RESEARCHER]
1105
+ elif question_type == 'reasoning':
1106
+ agents = [AgentRole.REASONING_AGENT, AgentRole.WEB_RESEARCHER]
1107
+ elif question_type == 'factual_lookup':
1108
+ agents = [AgentRole.WEB_RESEARCHER]
1109
+ else:
1110
+ # General questions - try multiple approaches
1111
+ agents = [AgentRole.WEB_RESEARCHER, AgentRole.REASONING_AGENT]
1112
+
1113
+ # Always add synthesizer
1114
+ agents.append(AgentRole.SYNTHESIZER)
1115
+
1116
+ # If confidence is low, add more agents for better coverage
1117
+ if confidence < 0.6:
1118
+ if AgentRole.WEB_RESEARCHER not in agents:
1119
+ agents.insert(-1, AgentRole.WEB_RESEARCHER) # Insert before synthesizer
1120
+
1121
+ return agents
src/agents/web_researcher.py CHANGED
@@ -589,23 +589,165 @@ class WebResearchAgent:
589
  return self._create_failure_result("YouTube research failed")
590
 
591
  def _research_web_general(self, state: GAIAAgentState) -> AgentResult:
592
- """General web search research"""
593
 
 
594
  search_terms = self._extract_search_terms(state.question)
595
 
596
- logger.info(f"Web search for: {search_terms}")
597
 
598
- # Perform web search
599
- web_result = self.web_search_tool.execute({
600
- "query": search_terms,
601
- "action": "search",
602
- "limit": 5
603
- })
604
 
605
  if web_result.success and web_result.result.get('found'):
606
- return self._analyze_web_search_result(state, web_result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
  else:
608
- return self._create_failure_result("Web search failed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
 
610
  def _research_url_content(self, state: GAIAAgentState) -> AgentResult:
611
  """Extract and analyze content from specific URLs"""
@@ -760,128 +902,58 @@ class WebResearchAgent:
760
 
761
  return ' '.join(topic_words[:3]) if topic_words else "topic"
762
 
763
- def _extract_search_terms(self, question: str, max_length: int = 100) -> str:
764
  """
765
- Extract optimized search terms from question
766
- Prioritizes important terms while staying under length limits
767
  """
 
 
768
 
769
- # Clean the question first
770
- clean_question = re.sub(r'[^\w\s\-]', ' ', question.lower())
771
- words = clean_question.split()
772
-
773
- # Remove common stop words but keep question words
774
- stop_words = {
775
- 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
776
- 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
777
- 'should', 'may', 'might', 'must', 'shall', 'can', 'to', 'of', 'in',
778
- 'on', 'at', 'by', 'for', 'with', 'from', 'as', 'but', 'or', 'and',
779
- 'if', 'then', 'than', 'this', 'that', 'these', 'those', 'i', 'you',
780
- 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'
781
- }
782
 
783
- # Keep important question words
784
- question_words = {'who', 'what', 'when', 'where', 'why', 'how', 'which'}
 
785
 
786
- # Priority terms (always include if present)
787
- priority_terms = []
 
 
 
788
 
789
- # Extract quoted phrases first
790
- quoted_phrases = re.findall(r'"([^"]*)"', question)
791
- for phrase in quoted_phrases:
792
- if len(phrase.strip()) > 0:
793
- priority_terms.append(phrase.strip())
794
 
795
- # Extract years (4-digit numbers) - capture full years, not just prefixes
796
- years = re.findall(r'\b(?:19|20)\d{2}\b', question) # Changed from capturing group to full match
 
797
 
798
- # Extract proper nouns (capitalized words) - exclude numbers
799
- proper_nouns = []
800
- for word in question.split():
801
- clean_word = re.sub(r'[^\w]', '', word)
802
- if (clean_word and
803
- clean_word[0].isupper() and
804
- len(clean_word) > 1 and
805
- not clean_word.isdigit()): # Exclude pure numbers
806
- proper_nouns.append(clean_word)
 
807
 
808
- # Extract other meaningful numbers (but be very selective)
809
- # Only include numbers that are likely meaningful (dates, counts, etc.)
810
- meaningful_numbers = []
811
- number_matches = re.findall(r'\b\d{1,4}\b', question)
812
- for num in number_matches:
813
- # Skip very common/meaningless numbers and years already captured
814
- if (num not in ['1', '2', '3', '4', '5', '10', '20', '19', '21', '22', '23', '24', '25'] and
815
- num not in years and
816
- len(num) > 1): # Require at least 2 digits for meaningful numbers
817
- # Only include if it appears in a meaningful context
818
- if any(context in question.lower() for context in [
819
- f'{num} albums', f'{num} songs', f'{num} years', f'{num} people',
820
- f'{num} times', f'{num} days', f'{num} months', f'episode {num}',
821
- f'season {num}', f'volume {num}', f'part {num}'
822
- ]):
823
- meaningful_numbers.append(num)
824
-
825
- # Build search terms with priority
826
- search_terms = []
827
-
828
- # Add quoted phrases (highest priority)
829
- search_terms.extend(priority_terms)
830
-
831
- # Add proper nouns (high priority)
832
- search_terms.extend(proper_nouns[:5]) # Limit to avoid duplication
833
-
834
- # Add question words if present
835
- for word in words:
836
- if word in question_words and word not in search_terms:
837
- search_terms.append(word)
838
 
839
- # Add years
840
- search_terms.extend(years[:2]) # Limit to 2 years max
 
841
 
842
- # Add other important terms
843
- for word in words:
844
- if (word not in stop_words and
845
- word not in search_terms and
846
- len(word) > 2 and
847
- not word.isdigit()): # Avoid random numbers
848
- search_terms.append(word)
849
-
850
- # Stop if we have enough terms
851
- if len(' '.join(search_terms)) > max_length - 20:
852
- break
853
-
854
- # Add a few important numbers if space allows
855
- if len(' '.join(search_terms)) < max_length - 10:
856
- search_terms.extend(meaningful_numbers[:2])
857
-
858
- # Join and clean up
859
- search_query = ' '.join(search_terms)
860
-
861
- # Remove duplicates while preserving order
862
- seen = set()
863
- unique_terms = []
864
- for term in search_terms:
865
- if term.lower() not in seen:
866
- seen.add(term.lower())
867
- unique_terms.append(term)
868
-
869
- # Final cleanup and length check
870
- final_query = ' '.join(unique_terms)
871
- if len(final_query) > max_length:
872
- # Truncate to fit
873
- truncated_terms = []
874
- current_length = 0
875
- for term in unique_terms:
876
- if current_length + len(term) + 1 <= max_length:
877
- truncated_terms.append(term)
878
- current_length += len(term) + 1
879
- else:
880
- break
881
- final_query = ' '.join(truncated_terms)
882
-
883
- logger.info(f"📝 Optimized search terms: '{final_query}' from question: '{question[:50]}...'")
884
- return final_query
885
 
886
  def _extract_youtube_info(self, question: str) -> str:
887
  """Extract YouTube URL or search terms"""
 
589
  return self._create_failure_result("YouTube research failed")
590
 
591
  def _research_web_general(self, state: GAIAAgentState) -> AgentResult:
592
+ """General web research with enhanced result analysis"""
593
 
594
+ # Extract optimized search terms
595
  search_terms = self._extract_search_terms(state.question)
596
 
597
+ logger.info(f"Web research for: {search_terms}")
598
 
599
+ # Search the web
600
+ search_query = {"query": search_terms, "action": "search", "limit": 5}
601
+ web_result = self.web_search_tool.execute(search_query)
 
 
 
602
 
603
  if web_result.success and web_result.result.get('found'):
604
+ search_data = web_result.result
605
+
606
+ # Enhanced analysis with focused LLM processing
607
+ analysis_prompt = self._create_enhanced_analysis_prompt(state.question, search_data, search_terms)
608
+
609
+ # Use appropriate model tier based on complexity
610
+ model_tier = ModelTier.COMPLEX if state.complexity_assessment == "complex" else ModelTier.MAIN
611
+ llm_result = self.llm_client.generate(analysis_prompt, tier=model_tier, max_tokens=600)
612
+
613
+ if llm_result.success:
614
+ # Parse the LLM response for better confidence assessment
615
+ confidence = self._assess_answer_confidence(llm_result.response, state.question, search_data)
616
+
617
+ return AgentResult(
618
+ agent_role=AgentRole.WEB_RESEARCHER,
619
+ success=True,
620
+ result=llm_result.response,
621
+ confidence=confidence,
622
+ reasoning=f"Enhanced web search analysis of {len(search_data.get('results', []))} sources for '{search_terms}'",
623
+ tools_used=[ToolResult(
624
+ tool_name="web_search",
625
+ success=True,
626
+ result=search_data,
627
+ execution_time=web_result.execution_time
628
+ )],
629
+ model_used=llm_result.model_used,
630
+ processing_time=web_result.execution_time + llm_result.response_time,
631
+ cost_estimate=llm_result.cost_estimate
632
+ )
633
+ else:
634
+ # Fallback to best search result
635
+ results = search_data.get('results', [])
636
+ best_result = results[0] if results else {"title": "No results", "snippet": "No information found"}
637
+
638
+ return AgentResult(
639
+ agent_role=AgentRole.WEB_RESEARCHER,
640
+ success=True,
641
+ result=f"Found: {best_result.get('title', 'Unknown')} - {best_result.get('snippet', 'No description')}",
642
+ confidence=0.4,
643
+ reasoning="Web search completed but analysis failed",
644
+ tools_used=[ToolResult(
645
+ tool_name="web_search",
646
+ success=True,
647
+ result=search_data,
648
+ execution_time=web_result.execution_time
649
+ )],
650
+ model_used="fallback",
651
+ processing_time=web_result.execution_time,
652
+ cost_estimate=0.0
653
+ )
654
  else:
655
+ return self._create_failure_result(f"Web search failed for '{search_terms}': {web_result.result.get('message', 'Unknown error')}")
656
+
657
+ def _create_enhanced_analysis_prompt(self, question: str, search_data: Dict[str, Any], search_terms: str) -> str:
658
+ """Create enhanced analysis prompt for better result processing"""
659
+
660
+ results = search_data.get('results', [])
661
+ search_source = search_data.get('source', 'web')
662
+
663
+ # Format search results concisely
664
+ formatted_results = []
665
+ for i, result in enumerate(results[:4], 1): # Limit to top 4 results
666
+ title = result.get('title', 'No title')
667
+ snippet = result.get('snippet', 'No description')
668
+ url = result.get('url', '')
669
+ source = result.get('source', search_source)
670
+
671
+ formatted_results.append(f"""
672
+ Result {i} ({source}):
673
+ Title: {title}
674
+ Content: {snippet}
675
+ URL: {url}
676
+ """)
677
+
678
+ # Create focused analysis prompt
679
+ prompt = f"""
680
+ You are analyzing web search results to answer a specific question. Provide a direct, accurate answer based on the search findings.
681
+
682
+ Question: {question}
683
+
684
+ Search Terms Used: {search_terms}
685
+
686
+ Search Results:
687
+ {''.join(formatted_results)}
688
+
689
+ Instructions:
690
+ 1. Carefully read through all the search results
691
+ 2. Look for information that directly answers the question
692
+ 3. If you find a clear answer, state it concisely
693
+ 4. If the information is incomplete, state what you found and what's missing
694
+ 5. If you find no relevant information, clearly state that
695
+ 6. For questions asking for specific numbers, dates, or names, be precise
696
+ 7. Always base your answer on the search results provided
697
+
698
+ Provide your analysis and answer:"""
699
+
700
+ return prompt
701
+
702
+ def _assess_answer_confidence(self, answer: str, question: str, search_data: Dict[str, Any]) -> float:
703
+ """Assess confidence in the answer based on various factors"""
704
+
705
+ # Base confidence factors
706
+ confidence = 0.5 # Start with medium confidence
707
+
708
+ # Factor 1: Search result quality
709
+ results = search_data.get('results', [])
710
+ if len(results) >= 3:
711
+ confidence += 0.1 # More results = higher confidence
712
+
713
+ # Factor 2: Source quality
714
+ source = search_data.get('source', 'unknown')
715
+ if source == 'Wikipedia':
716
+ confidence += 0.15 # Wikipedia is generally reliable
717
+ elif source == 'DuckDuckGo':
718
+ confidence += 0.1 # General web search
719
+
720
+ # Factor 3: Answer specificity
721
+ answer_lower = answer.lower()
722
+ if any(indicator in answer_lower for indicator in [
723
+ 'no information', 'not found', 'unclear', 'unable to determine',
724
+ 'cannot find', 'no clear answer', 'insufficient information'
725
+ ]):
726
+ confidence -= 0.2 # Reduce confidence for uncertain answers
727
+
728
+ # Factor 4: Answer contains specific details
729
+ if any(pattern in answer for pattern in [
730
+ re.compile(r'\b\d{4}\b'), # Years
731
+ re.compile(r'\b\d+\b'), # Numbers
732
+ re.compile(r'\b[A-Z][a-z]+\b') # Proper nouns
733
+ ]):
734
+ confidence += 0.1 # Specific details increase confidence
735
+
736
+ # Factor 5: Answer length (very short answers might be incomplete)
737
+ if len(answer.split()) < 5:
738
+ confidence -= 0.1
739
+ elif len(answer.split()) > 50:
740
+ confidence += 0.05 # Detailed answers
741
+
742
+ # Factor 6: Question type matching
743
+ question_lower = question.lower()
744
+ if 'how many' in question_lower and re.search(r'\b\d+\b', answer):
745
+ confidence += 0.15 # Numerical answer to numerical question
746
+ elif any(q_word in question_lower for q_word in ['who', 'what', 'when', 'where']) and len(answer.split()) > 3:
747
+ confidence += 0.1 # Substantial answer to factual question
748
+
749
+ # Ensure confidence stays within bounds
750
+ return max(0.1, min(0.95, confidence))
751
 
752
  def _research_url_content(self, state: GAIAAgentState) -> AgentResult:
753
  """Extract and analyze content from specific URLs"""
 
902
 
903
  return ' '.join(topic_words[:3]) if topic_words else "topic"
904
 
905
+ def _extract_search_terms(self, question: str, max_length: int = 180) -> str:
906
  """
907
+ Improved search term extraction for better web search results
908
+ Prioritizes entities, dates, and specific terms
909
  """
910
+ # Remove common question words first
911
+ question_clean = re.sub(r'\b(what|who|when|where|why|how|is|are|was|were|did|do|does|can|could|should|would|please|tell|me|find|about)\b', '', question.lower())
912
 
913
+ # Extract key patterns first
914
+ entities = []
 
 
 
 
 
 
 
 
 
 
 
915
 
916
+ # Extract quoted phrases (highest priority)
917
+ quoted_phrases = re.findall(r'"([^"]+)"', question)
918
+ entities.extend(quoted_phrases)
919
 
920
+ # Extract proper nouns (names, places, organizations)
921
+ proper_nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
922
+ # Filter out common question words that might be capitalized
923
+ filtered_nouns = [noun for noun in proper_nouns if noun.lower() not in {'you', 'i', 'me', 'my', 'the', 'a', 'an'}]
924
+ entities.extend(filtered_nouns[:4]) # Limit to top 4
925
 
926
+ # Extract years and dates (high priority for temporal questions)
927
+ years = re.findall(r'\b(19|20)\d{2}\b', question)
928
+ entities.extend(years)
 
 
929
 
930
+ # Extract important numbers that might be quantities
931
+ numbers = re.findall(r'\b\d+\b', question)
932
+ entities.extend(numbers[:2]) # Limit to first 2 numbers
933
 
934
+ # If we have good entities, use them primarily
935
+ if entities:
936
+ search_terms = ' '.join(entities[:8]) # Use top 8 entities
937
+ else:
938
+ # Fallback: clean the question and extract key words
939
+ words = question_clean.split()
940
+ # Remove very common words
941
+ stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'many', 'some', 'all', 'any', 'most', 'other', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'now', 'here', 'there', 'then', 'them', 'they', 'their', 'would', 'could', 'should', 'will', 'can', 'may', 'might', 'must'}
942
+ filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]
943
+ search_terms = ' '.join(filtered_words[:10]) # Use top 10 content words
944
 
945
+ # Clean up the search terms
946
+ search_terms = re.sub(r'\s+', ' ', search_terms) # Remove multiple spaces
947
+ search_terms = search_terms.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948
 
949
+ # Ensure we don't exceed max length
950
+ if len(search_terms) > max_length:
951
+ search_terms = search_terms[:max_length].rsplit(' ', 1)[0] # Cut at word boundary
952
 
953
+ # Log the extraction for debugging
954
+ logger.info(f"📝 Optimized search terms: '{search_terms}' from question: '{question[:100]}...'")
955
+
956
+ return search_terms.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
957
 
958
  def _extract_youtube_info(self, question: str) -> str:
959
  """Extract YouTube URL or search terms"""
src/tools/final_answer_tool.py CHANGED
@@ -21,248 +21,226 @@ class FinalAnswerTool:
21
  def __init__(self, llm_client: QwenClient):
22
  self.llm_client = llm_client
23
 
24
- def extract_final_answer(self, question: str, agent_results: str, question_type: str = "") -> Dict[str, Any]:
25
  """
26
- Extract the precise final answer in GAIA-compliant format
27
-
28
- Args:
29
- question: The original GAIA question
30
- agent_results: Combined results from multiple agents
31
- question_type: Type of question (for specialized extraction)
32
-
33
- Returns:
34
- Dict with extracted answer, confidence, and reasoning
35
  """
 
 
36
  try:
37
- logger.info("🎯 Extracting GAIA-compliant final answer")
38
-
39
- # Create specialized extraction prompt
40
  extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)
41
 
42
  # Use 72B model for precise extraction
43
- result = self.llm_client.generate(
44
  extraction_prompt,
45
- tier=ModelTier.COMPLEX, # 72B model
46
- max_tokens=50 # Force concise answers
 
47
  )
48
 
49
- if not result.success:
50
- logger.error("Final answer extraction failed")
 
 
 
 
 
 
51
  return {
52
- "answer": "Processing failed",
53
- "confidence": 0.0,
54
- "reasoning": f"Extraction failed: {result.response}"
 
 
55
  }
56
-
57
- # Parse and clean the extracted answer
58
- extracted_answer = self._clean_answer(result.response, question, question_type)
59
-
60
- # Validate answer format
61
- validation_result = self._validate_answer(extracted_answer, question_type)
62
-
63
- logger.info(f"✅ Final answer extracted: '{extracted_answer}'")
64
-
65
- return {
66
- "answer": extracted_answer,
67
- "confidence": validation_result["confidence"],
68
- "reasoning": f"Extracted using 72B model. Validation: {validation_result['status']}"
69
- }
70
-
71
  except Exception as e:
72
- error_msg = f"Final answer extraction error: {str(e)}"
73
- logger.error(error_msg)
74
- return {
75
- "answer": "Extraction error",
76
- "confidence": 0.0,
77
- "reasoning": error_msg
78
- }
79
 
80
  def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
81
  """Create specialized extraction prompt based on question type"""
82
 
83
- base_prompt = f"""
84
- CRITICAL: This is for GAIA benchmark evaluation using EXACT MATCH comparison.
85
- Your response must be ONLY the precise answer - no explanations, no "FINAL ANSWER:", no extra text.
86
-
87
- Question: {question}
88
-
89
- Agent Analysis Results:
90
- {agent_results}
91
-
92
- EXTRACTION RULES:
93
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- # Add type-specific rules
96
- if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
97
- base_prompt += """
98
- - If asking for a count/number: respond with ONLY the number (e.g., "5", "23", "0")
99
- - If asking for calculation: respond with ONLY the result (e.g., "42", "3.14", "100")
100
- - No units unless specifically requested in the question
101
- """
102
- elif "text_manipulation" in question_type.lower() or "reverse" in question.lower():
103
- base_prompt += """
104
- - If text is reversed: provide the corrected text
105
- - If asking for opposite: provide ONLY the opposite word (e.g., "right" for opposite of "left")
106
- - If asking to decode: provide ONLY the decoded answer
107
- """
108
- elif "yes" in question.lower() or "true" in question.lower() or "false" in question.lower():
109
- base_prompt += """
110
- - If yes/no question: respond with ONLY "yes" or "no" (lowercase)
111
- - If true/false question: respond with ONLY "true" or "false" (lowercase)
112
- """
113
- elif any(word in question.lower() for word in ["name", "who", "which person"]):
114
- base_prompt += """
115
- - If asking for a name: provide ONLY the name (e.g., "John Smith", "Einstein")
116
- - If asking for first name only: provide ONLY first name (e.g., "John")
117
- - If asking for last name only: provide ONLY last name (e.g., "Smith")
118
- """
119
- elif any(word in question.lower() for word in ["where", "location", "city", "country"]):
120
- base_prompt += """
121
- - If asking for location: provide ONLY the location name (e.g., "Paris", "USA", "New York")
122
- - No additional descriptors unless specifically requested
123
- """
124
  else:
125
- base_prompt += """
126
- - Provide ONLY the direct answer to the question
127
- - No explanations, context, or additional information
128
- - Be as concise as possible while being accurate
129
- """
130
 
131
- base_prompt += """
132
-
133
- EXAMPLES OF CORRECT FORMAT:
134
- - Question: "How many albums?" → Answer: "5"
135
- - Question: "What is the opposite of left?" → Answer: "right"
136
- - Question: "True or false?" → Answer: "true"
137
- - Question: "Who discovered X?" → Answer: "Einstein"
138
- - Question: "Which city?" → Answer: "London"
139
-
140
- Extract the precise answer NOW:"""
141
 
142
- return base_prompt
 
 
 
143
 
144
- def _clean_answer(self, raw_answer: str, question: str, question_type: str) -> str:
145
- """Clean and format the extracted answer"""
146
 
147
- # Remove common unwanted prefixes/suffixes
148
  answer = raw_answer.strip()
149
 
150
- # Remove common prefixes
151
  prefixes_to_remove = [
152
- "the answer is",
153
- "answer:",
154
- "final answer:",
155
- "result:",
156
- "response:",
157
- "conclusion:",
158
- "based on",
159
- "according to",
160
- "from the",
161
  ]
162
 
 
163
  for prefix in prefixes_to_remove:
164
- if answer.lower().startswith(prefix):
165
  answer = answer[len(prefix):].strip()
 
166
 
167
  # Remove quotes if they wrap the entire answer
168
  if answer.startswith('"') and answer.endswith('"'):
169
  answer = answer[1:-1]
170
- if answer.startswith("'") and answer.endswith("'"):
171
  answer = answer[1:-1]
172
 
173
- # AGGRESSIVE LENGTH ENFORCEMENT FOR GAIA
174
- # If answer is too long, extract the core information
175
- if len(answer) > 50:
176
- # For different question types, extract differently
177
- if "mathematical" in question_type.lower() or any(word in question.lower() for word in ["how many", "count", "number", "albums"]):
178
- # Extract just the number for mathematical questions
179
- number_match = re.search(r'-?\d+(?:\.\d+)?', answer)
180
- if number_match:
181
- answer = number_match.group()
182
- elif "name" in question_type.lower() or any(word in question.lower() for word in ["who", "name"]):
183
- # Extract just the name (first few words)
184
- words = answer.split()
185
- if len(words) > 3:
186
- answer = ' '.join(words[:3]) # Keep only first 3 words for names
187
- elif "location" in question_type.lower() or any(word in question.lower() for word in ["where", "city", "country"]):
188
- # Extract just the location name
189
- words = answer.split()
190
- if len(words) > 2:
191
- answer = ' '.join(words[:2]) # Keep only first 2 words for locations
192
- elif "yes_no" in question_type.lower() or any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
193
- # Extract yes/no/true/false
194
- if any(word in answer.lower() for word in ["yes", "no", "true", "false"]):
195
- for word in answer.lower().split():
196
- if word in ["yes", "no", "true", "false"]:
197
- answer = word
198
- break
199
- else:
200
- # For other types, take first sentence or clause
201
- sentences = re.split(r'[.!?]', answer)
202
- if sentences:
203
- answer = sentences[0].strip()
204
- # If still too long, take first clause
205
- if len(answer) > 30:
206
- clauses = re.split(r'[,;:]', answer)
207
- if clauses:
208
- answer = clauses[0].strip()
209
-
210
- # Handle specific formatting based on question type
211
- if "text_manipulation" in question_type.lower():
212
- # For reversed text questions, ensure clean output
213
- if len(answer.split()) == 1: # Single word answer
214
- answer = answer.lower()
215
-
216
- # Final aggressive truncation if still too long
217
- if len(answer) > 40:
218
- # Split into words and take as many as fit
219
- words = answer.split()
220
- truncated_words = []
221
- current_length = 0
222
- for word in words:
223
- if current_length + len(word) + 1 <= 40:
224
- truncated_words.append(word)
225
- current_length += len(word) + 1
226
- else:
227
- break
228
- if truncated_words:
229
- answer = ' '.join(truncated_words)
230
- else:
231
- # Last resort - take first 40 characters
232
- answer = answer[:40].strip()
233
-
234
- # Remove any trailing punctuation that's not part of the answer
235
- answer = answer.rstrip('.,!?;:')
236
 
237
- return answer.strip()
238
-
239
- def _validate_answer(self, answer: str, question_type: str) -> Dict[str, Any]:
240
- """Validate the extracted answer format"""
241
-
242
- if not answer:
243
- return {"status": "empty_answer", "confidence": 0.0}
244
 
245
- # Check length - GAIA answers should be concise
246
- if len(answer) > 100:
247
- return {"status": "too_long", "confidence": 0.3}
 
 
 
248
 
249
- # Type-specific validation
250
- if "mathematical" in question_type.lower():
251
- if re.match(r'^-?\d+(?:\.\d+)?$', answer):
252
- return {"status": "valid_number", "confidence": 0.9}
253
- else:
254
- return {"status": "invalid_number_format", "confidence": 0.5}
255
 
256
- elif "yes_no" in question_type.lower():
257
- if answer.lower() in ["yes", "no", "true", "false"]:
258
- return {"status": "valid_boolean", "confidence": 0.9}
259
- else:
260
- return {"status": "invalid_boolean_format", "confidence": 0.4}
 
261
 
262
- # General validation - prefer short, direct answers
263
- if len(answer) <= 20:
264
- return {"status": "concise_answer", "confidence": 0.8}
265
- elif len(answer) <= 50:
266
- return {"status": "moderate_length", "confidence": 0.6}
267
- else:
268
- return {"status": "long_answer", "confidence": 0.4}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
def __init__(self, llm_client: QwenClient):
    """Keep a reference to the LLM client used for all answer extractions."""
    self.llm_client = llm_client
24
def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
    """
    Extract GAIA-compliant final answer with enhanced accuracy.

    Args:
        question: Original GAIA question text.
        agent_results: Combined analysis text produced by the agents.
        question_type: Classified question type; selects the prompt specialization.

    Returns:
        Dict with keys: answer, confidence, reasoning, raw_response,
        validation_passed. Falls back to heuristic extraction when the
        LLM call fails or raises.
    """
    logger.info("🎯 Extracting GAIA-compliant final answer")

    try:
        # Build a type-specialized prompt and ask the most capable model.
        prompt = self._create_extraction_prompt(question, agent_results, question_type)

        generation = self.llm_client.generate(
            prompt,
            tier=ModelTier.COMPLEX,  # Always use most capable model
            max_tokens=100,          # Keep answer concise
            temperature=0.1,         # Lower temperature for consistency
        )

        if not generation.success:
            # LLM call failed -> fall back to simple heuristic extraction.
            return self._fallback_extraction(question, agent_results)

        # Clean, validate, and score the model's raw output.
        raw = generation.response.strip()
        cleaned = self._clean_and_validate_answer(raw, question, question_type)
        score = self._assess_answer_quality(cleaned, question, agent_results, question_type)

        return {
            "answer": cleaned,
            "confidence": score,
            "reasoning": f"Extracted from {question_type} analysis using 72B model",
            "raw_response": raw,
            # GAIA answers must be non-empty and short.
            "validation_passed": 0 < len(cleaned) <= 100,
        }

    except Exception as e:
        logger.error(f"Final answer extraction failed: {e}")
        return self._fallback_extraction(question, agent_results)
64
 
65
  def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
66
  """Create specialized extraction prompt based on question type"""
67
 
68
+ base_instructions = """
69
+ CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
70
+ Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
71
+
72
+ Question: {question}
73
+
74
+ Analysis from agents:
75
+ {agent_results}
76
+
77
+ """
78
+
79
+ # Specialized instructions based on question type
80
+ if question_type == "mathematical" or "how many" in question.lower():
81
+ type_instructions = """
82
+ This is a counting/mathematical question. Respond with ONLY the number.
83
+ Examples of correct responses: "5", "42", "0"
84
+ Do NOT include words like "albums", "songs", "items", etc.
85
+ """
86
+
87
+ elif question_type == "yes_no":
88
+ type_instructions = """
89
+ This is a yes/no question. Respond with ONLY "yes" or "no".
90
+ """
91
+
92
+ elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
93
+ type_instructions = """
94
+ This is asking for a name. Respond with ONLY the name requested.
95
+ Examples: "John Smith", "Mike102", "Einstein"
96
+ """
97
+
98
+ elif question_type == "location":
99
+ type_instructions = """
100
+ This is asking for a location. Respond with ONLY the location name.
101
+ Examples: "Paris", "New York", "LIE", "Hanoi"
102
+ """
103
+
104
+ elif question_type == "text_manipulation":
105
+ type_instructions = """
106
+ This involves text manipulation. Respond with ONLY the processed text result.
107
+ Examples: "right", "hello", "12345"
108
+ """
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  else:
111
+ type_instructions = """
112
+ Respond with ONLY the direct answer requested.
113
+ Keep it concise and specific.
114
+ """
 
115
 
116
+ ending_instructions = """
117
+
118
+ EXTRACT ONLY THE ANSWER:"""
 
 
 
 
 
 
 
119
 
120
+ return base_instructions.format(
121
+ question=question,
122
+ agent_results=agent_results[:2000] # Limit input length
123
+ ) + type_instructions + ending_instructions
124
 
125
+ def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
126
+ """Clean and validate the extracted answer"""
127
 
128
+ # Remove common prefixes and suffixes
129
  answer = raw_answer.strip()
130
 
131
+ # Remove common answer prefixes
132
  prefixes_to_remove = [
133
+ "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
134
+ "based on", "according to", "therefore", "thus", "so", "hence",
135
+ "final answer is", "the result is", "it is", "this is"
 
 
 
 
 
 
136
  ]
137
 
138
+ answer_lower = answer.lower()
139
  for prefix in prefixes_to_remove:
140
+ if answer_lower.startswith(prefix):
141
  answer = answer[len(prefix):].strip()
142
+ answer_lower = answer.lower()
143
 
144
  # Remove quotes if they wrap the entire answer
145
  if answer.startswith('"') and answer.endswith('"'):
146
  answer = answer[1:-1]
147
+ elif answer.startswith("'") and answer.endswith("'"):
148
  answer = answer[1:-1]
149
 
150
+ # Remove trailing punctuation that's not part of the answer
151
+ while answer and answer[-1] in '.!?:;':
152
+ answer = answer[:-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ # Special handling for different question types
155
+ if question_type == "mathematical" or "how many" in question.lower():
156
+ # Extract just the number
157
+ numbers = re.findall(r'\b\d+\b', answer)
158
+ if numbers:
159
+ answer = numbers[0]
 
160
 
161
+ elif question_type == "yes_no":
162
+ # Normalize yes/no answers
163
+ if any(word in answer.lower() for word in ['yes', 'true', 'correct', 'right']):
164
+ answer = "yes"
165
+ elif any(word in answer.lower() for word in ['no', 'false', 'incorrect', 'wrong']):
166
+ answer = "no"
167
 
168
+ # Final cleanup
169
+ answer = answer.strip()
 
 
 
 
170
 
171
+ # Ensure answer is not empty
172
+ if not answer:
173
+ # Try to extract from the original raw answer
174
+ words = raw_answer.split()
175
+ if words:
176
+ answer = words[-1] # Take the last word as fallback
177
 
178
+ return answer
179
+
180
+ def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
181
+ """Assess the quality/confidence of the extracted answer"""
182
+
183
+ confidence = 0.7 # Base confidence
184
+
185
+ # Factor 1: Answer length appropriateness
186
+ if len(answer) == 0:
187
+ return 0.1 # Very low confidence for empty answers
188
+ elif len(answer) > 100:
189
+ confidence -= 0.2 # Too long for GAIA
190
+ elif 1 <= len(answer) <= 50:
191
+ confidence += 0.1 # Good length
192
+
193
+ # Factor 2: Question type matching
194
+ question_lower = question.lower()
195
+
196
+ if ("how many" in question_lower or question_type == "mathematical") and re.match(r'^\d+$', answer):
197
+ confidence += 0.15 # Numeric answer to counting question
198
+ elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
199
+ confidence += 0.1 # Name-like answer to who question
200
+ elif ("where" in question_lower) and len(answer.split()) <= 2:
201
+ confidence += 0.1 # Location-like answer
202
+ elif ("yes or no" in question_lower) and answer.lower() in ["yes", "no"]:
203
+ confidence += 0.15 # Perfect yes/no answer
204
+
205
+ # Factor 3: Answer appears in agent results (indicates it was found)
206
+ if answer.lower() in agent_results.lower():
207
+ confidence += 0.1
208
+
209
+ # Factor 4: Answer specificity
210
+ if re.search(r'\b\d{4}\b', answer): # Contains year
211
+ confidence += 0.05
212
+ if re.search(r'\b[A-Z][a-z]+\b', answer): # Contains proper noun
213
+ confidence += 0.05
214
+
215
+ # Factor 5: Common failure patterns
216
+ failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
217
+ if any(indicator in answer.lower() for indicator in failure_indicators):
218
+ confidence -= 0.3
219
+
220
+ return max(0.1, min(0.95, confidence))
221
+
222
+ def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
223
+ """Simple fallback when LLM extraction fails"""
224
+
225
+ # Try to extract a reasonable answer from agent results
226
+ lines = agent_results.split('\n')
227
+
228
+ # Look for lines that might contain answers
229
+ potential_answers = []
230
+ for line in lines:
231
+ line = line.strip()
232
+ if len(line) > 0 and len(line) < 100:
233
+ # Skip lines that are clearly explanatory
234
+ if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
235
+ potential_answers.append(line)
236
+
237
+ # Use the first reasonable answer or a fallback
238
+ answer = potential_answers[0] if potential_answers else "Unable to determine"
239
+
240
+ return {
241
+ "answer": answer,
242
+ "confidence": 0.3,
243
+ "reasoning": "Fallback extraction due to LLM failure",
244
+ "raw_response": agent_results[:100],
245
+ "validation_passed": False
246
+ }
src/tools/web_search_tool.py CHANGED
@@ -126,105 +126,119 @@ class WebSearchTool(BaseTool):
126
  """Check if text is a URL"""
127
  return bool(re.match(r'https?://', text))
128
 
129
- def _extract_search_terms(self, query: str, max_length: int = 250) -> str:
130
  """
131
- Extract key search terms from a potentially long query
 
132
  """
133
- # If query is short enough, use as-is
134
- if len(query) <= max_length:
135
- return query
136
-
137
- # Remove common stop words and extract key terms
138
- stop_words = {
139
- 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
140
- 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
141
- 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
142
- 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
143
- 'what', 'where', 'when', 'why', 'how', 'which', 'who', 'whose', 'whom',
144
- 'please', 'could', 'you', 'tell', 'me', 'find', 'search', 'for', 'about'
145
- }
146
-
147
- # Split into words and filter
148
- words = re.findall(r'\b\w+\b', query.lower())
149
- key_words = [word for word in words if word not in stop_words and len(word) > 2]
150
-
151
- # Keep important phrases and entities
152
- # Look for quoted phrases, proper nouns, numbers, dates
153
- important_patterns = [
154
- r'"[^"]*"', # Quoted phrases
155
- r'\b[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*\b', # Proper nouns
156
- r'\b\d{4}\b', # Years
157
- r'\b\d+\b', # Numbers
158
- ]
159
 
160
- important_terms = []
161
- for pattern in important_patterns:
162
- matches = re.findall(pattern, query)
163
- important_terms.extend(matches)
164
 
165
- # Combine key words and important terms
166
- search_terms = []
 
167
 
168
- # Add important terms first (they're usually more specific)
169
- for term in important_terms:
170
- if len(' '.join(search_terms + [term])) <= max_length:
171
- search_terms.append(term)
172
 
173
- # Add key words until we hit the limit
174
- for word in key_words:
175
- potential_query = ' '.join(search_terms + [word])
176
- if len(potential_query) <= max_length:
177
- search_terms.append(word)
178
- else:
179
- break
180
 
181
- result = ' '.join(search_terms)
 
 
182
 
183
- # If still too long, truncate
184
- if len(result) > max_length:
185
- result = result[:max_length].rsplit(' ', 1)[0]
 
 
 
 
 
 
 
186
 
187
- # If we ended up with nothing, use first part of original query
188
- if not result.strip():
189
- result = query[:max_length].rsplit(' ', 1)[0]
190
 
191
- if result != query:
192
- logger.info(f"📝 Extracted search terms: '{result}' from '{query[:100]}...'")
193
 
194
- return result
195
 
196
  def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
197
  """
198
- Search the web using available search engines in priority order
199
  """
200
 
201
- # Extract search terms to avoid length issues
202
- search_query = self._extract_search_terms(query, max_length=250)
203
 
204
  # Try DuckDuckGo first (most comprehensive for general web search)
205
  if self.use_duckduckgo:
206
  try:
207
- return self._search_with_duckduckgo(search_query, limit, extract_content)
 
 
 
 
 
 
 
 
 
208
  except Exception as e:
209
  logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")
210
 
211
  # Try Tavily if DuckDuckGo fails and API key is available
212
  if self.use_tavily:
213
  try:
214
- return self._search_with_tavily(search_query, limit, extract_content)
 
 
 
 
 
 
 
 
 
215
  except Exception as e:
216
  logger.warning(f"Tavily search failed, trying Wikipedia: {e}")
217
 
218
  # Fallback to Wikipedia search
219
  if self.use_wikipedia:
220
- return self._search_with_wikipedia(search_query, limit)
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
- # No search engines available
 
223
  return {
224
  "query": query,
225
  "found": False,
226
- "message": "❌ No search engines available. Please install required packages.",
227
- "results": []
 
 
 
228
  }
229
 
230
  def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
 
126
  """Check if text is a URL"""
127
  return bool(re.match(r'https?://', text))
128
 
129
def _extract_search_terms(self, question: str, max_length: int = 200) -> str:
    """
    Extract focused search terms from a question.
    Prioritizes key entities, dates, and specific information.

    Args:
        question: Natural-language question to condense.
        max_length: Maximum length of the returned search string.

    Returns:
        A space-separated string of key entities/terms, cut at a word
        boundary so it never exceeds max_length.
    """
    # Remove common question words first
    question_clean = re.sub(r'\b(what|who|when|where|why|how|is|are|was|were|did|do|does|can|could|should|would)\b', '', question.lower())

    # Extract key patterns first
    entities = []

    # Extract quoted phrases (highest priority)
    quoted_phrases = re.findall(r'"([^"]+)"', question)
    entities.extend(quoted_phrases)

    # Extract proper nouns (names, places, organizations)
    proper_nouns = re.findall(r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b', question)
    entities.extend(proper_nouns[:3])  # Limit to top 3

    # Extract years and dates.
    # BUGFIX: the century group must be non-capturing — with r'\b(19|20)\d{2}\b',
    # re.findall returns just the captured "19"/"20" instead of the full year.
    years = re.findall(r'\b(?:19|20)\d{2}\b', question)
    entities.extend(years)

    # Extract numbers that might be important
    numbers = re.findall(r'\b\d+\b', question)
    entities.extend(numbers[:2])  # Limit to first 2 numbers

    # If we have good entities, use them primarily
    if entities:
        search_terms = ' '.join(entities[:6])  # Use top 6 entities
    else:
        # Fallback: clean the question and extract key words
        words = question_clean.split()
        # Remove very common words
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among', 'this', 'that', 'these', 'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves'}
        filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]
        search_terms = ' '.join(filtered_words[:8])  # Use top 8 content words

    # Ensure we don't exceed max length
    if len(search_terms) > max_length:
        search_terms = search_terms[:max_length].rsplit(' ', 1)[0]  # Cut at word boundary

    # Log the extraction for debugging
    logger.info(f"📝 Extracted search terms: '{search_terms}' from question: '{question[:100]}...'")

    return search_terms.strip()
175
 
176
def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
    """
    Search the web using available search engines in priority order
    (DuckDuckGo -> Tavily -> Wikipedia) with improved search terms.

    Args:
        query: Raw query/question text.
        limit: Maximum number of results to request from each engine.
        extract_content: Whether to fetch page content (engine-dependent).

    Returns:
        A normalized dict with keys: success, found, results, query,
        source, total_found (plus a message on total failure).
    """

    # Extract clean search terms from the query
    search_query = self._extract_search_terms(query, max_length=200)

    def _package(raw: Dict[str, Any], source: str) -> Dict[str, Any]:
        # Normalize a backend result into the common response shape.
        # (Previously this dict literal was duplicated for every engine.)
        return {
            'success': True,
            'found': True,
            'results': [r.to_dict() if hasattr(r, 'to_dict') else r for r in raw['results']],
            'query': query,
            'source': source,
            'total_found': raw['count'],
        }

    # Try DuckDuckGo first (most comprehensive for general web search)
    if self.use_duckduckgo:
        try:
            ddg_result = self._search_with_duckduckgo(search_query, limit, extract_content)
            if ddg_result.get('success') and ddg_result.get('count', 0) > 0:
                return _package(ddg_result, 'DuckDuckGo')
        except Exception as e:
            logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")

    # Try Tavily if DuckDuckGo fails and API key is available
    if self.use_tavily:
        try:
            tavily_result = self._search_with_tavily(search_query, limit, extract_content)
            if tavily_result.get('success') and tavily_result.get('count', 0) > 0:
                return _package(tavily_result, 'Tavily')
        except Exception as e:
            logger.warning(f"Tavily search failed, trying Wikipedia: {e}")

    # Fallback to Wikipedia search
    if self.use_wikipedia:
        try:
            wiki_result = self._search_with_wikipedia(search_query, limit)
            if wiki_result.get('success') and wiki_result.get('count', 0) > 0:
                return _package(wiki_result, 'Wikipedia')
        except Exception as e:
            logger.warning(f"Wikipedia search failed: {e}")

    # No search engines available or all failed
    logger.warning("All search engines failed, returning empty results")
    return {
        "query": query,
        "found": False,
        "success": False,
        "message": "❌ All search engines failed or returned no results.",
        "results": [],
        "source": "none",
        "total_found": 0
    }
243
 
244
  def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]: