Chris committed on
Commit
e107ea2
·
1 Parent(s): a178cd6

Final 6.7.3

Browse files
.gitignore CHANGED
@@ -8,3 +8,4 @@ debug_*.py
8
  *_debug*.py
9
  tests/
10
  *.log
 
 
8
  *_debug*.py
9
  tests/
10
  *.log
11
+ gaia_evaluation_cjb97*
requirements.txt CHANGED
@@ -6,6 +6,7 @@ beautifulsoup4==4.13.0
6
  certifi==2025.4.26
7
  charset-normalizer==3.4.2
8
  click==8.2.1
 
9
  exceptiongroup==1.3.0
10
  fastapi==0.115.12
11
  ffmpy==0.5.0
 
6
  certifi==2025.4.26
7
  charset-normalizer==3.4.2
8
  click==8.2.1
9
+ duckduckgo-search==6.3.4
10
  exceptiongroup==1.3.0
11
  fastapi==0.115.12
12
  ffmpy==0.5.0
src/agents/__pycache__/router.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/router.cpython-310.pyc and b/src/agents/__pycache__/router.cpython-310.pyc differ
 
src/agents/__pycache__/web_researcher.cpython-310.pyc CHANGED
Binary files a/src/agents/__pycache__/web_researcher.cpython-310.pyc and b/src/agents/__pycache__/web_researcher.cpython-310.pyc differ
 
src/agents/router.py CHANGED
@@ -317,8 +317,8 @@ class RouterAgent:
317
  """
318
 
319
  try:
320
- # Use router model for this analysis
321
- tier = ModelTier.ROUTER if state.complexity_assessment != "complex" else ModelTier.MAIN
322
  result = self.llm_client.generate(prompt, tier=tier, max_tokens=200)
323
 
324
  if result.success:
 
317
  """
318
 
319
  try:
320
+ # Use main model (32B) for better routing decisions instead of 7B router model
321
+ tier = ModelTier.MAIN # Always use 32B model for routing to improve classification accuracy
322
  result = self.llm_client.generate(prompt, tier=tier, max_tokens=200)
323
 
324
  if result.success:
src/agents/web_researcher.py CHANGED
@@ -414,24 +414,89 @@ class WebResearchAgent:
414
  return ' '.join(topic_words[:3]) if topic_words else "topic"
415
 
416
  def _extract_search_terms(self, question: str) -> str:
417
- """Extract search terms from question"""
418
 
419
- # Remove question words and common phrases
420
- stop_phrases = [
421
- 'what is', 'what are', 'who is', 'who are', 'when is', 'when was',
422
- 'where is', 'where are', 'how is', 'how are', 'why is', 'why are',
423
- 'tell me about', 'find information about', 'search for'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  ]
425
 
426
- clean_question = question.lower()
427
- for phrase in stop_phrases:
428
- clean_question = clean_question.replace(phrase, '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
 
430
- # Remove punctuation and extra spaces
431
- clean_question = re.sub(r'[?.,!]', '', clean_question)
432
- clean_question = re.sub(r'\s+', ' ', clean_question).strip()
433
 
434
- return clean_question
 
435
 
436
  def _extract_youtube_info(self, question: str) -> str:
437
  """Extract YouTube URL or search terms"""
 
414
  return ' '.join(topic_words[:3]) if topic_words else "topic"
415
 
416
  def _extract_search_terms(self, question: str) -> str:
417
+ """Extract focused search terms from question to avoid length limits"""
418
 
419
+ # Handle different question types more intelligently
420
+ question_lower = question.lower()
421
+
422
+ # For questions about specific people, places, things - extract key entities
423
+ # Look for quoted phrases first (highest priority)
424
+ quoted_terms = re.findall(r'"([^"]+)"', question)
425
+ if quoted_terms:
426
+ # Use the first quoted phrase as it's usually the most important
427
+ main_term = quoted_terms[0]
428
+ # Add year if present
429
+ years = re.findall(r'\b(19|20)\d{2}\b', question)
430
+ if years:
431
+ return f"{main_term} {years[0]}"
432
+ return main_term
433
+
434
+ # Extract proper nouns and key entities
435
+ # Look for capitalized words (likely proper nouns)
436
+ proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
437
+
438
+ # Extract years and numbers (often important)
439
+ years = re.findall(r'\b(19|20)\d{2}\b', question)
440
+ numbers = re.findall(r'\b\d+\b', question)
441
+
442
+ # Remove very common stop words and question patterns
443
+ stop_patterns = [
444
+ r'\b(?:what|who|when|where|why|how|is|are|was|were|do|does|did|can|could|would|should|will)\b',
445
+ r'\b(?:the|a|an|and|or|but|in|on|at|to|for|of|with|by|from|about)\b',
446
+ r'\b(?:please|could|you|tell|me|find|search|for|give|provide|list|show)\b',
447
+ r'\b(?:information|details|data|facts|answer)\b',
448
+ r'[?.,!]+', # Punctuation
449
  ]
450
 
451
+ # Clean the question
452
+ clean_question = question
453
+ for pattern in stop_patterns:
454
+ clean_question = re.sub(pattern, ' ', clean_question, flags=re.IGNORECASE)
455
+
456
+ # Extract remaining meaningful words
457
+ words = clean_question.split()
458
+ meaningful_words = []
459
+
460
+ for word in words:
461
+ word = word.strip()
462
+ if len(word) > 2 and word.isalpha(): # Only alphabetic words longer than 2 chars
463
+ meaningful_words.append(word)
464
+
465
+ # Build search terms prioritizing important elements
466
+ search_terms = []
467
+
468
+ # Add proper nouns first (most specific)
469
+ for noun in proper_nouns[:2]: # Max 2 proper nouns
470
+ if len(' '.join(search_terms + [noun])) <= 100: # Conservative length limit
471
+ search_terms.append(noun)
472
+
473
+ # Add years/numbers
474
+ for year in years[:1]: # Max 1 year
475
+ if len(' '.join(search_terms + [year])) <= 100:
476
+ search_terms.append(year)
477
+
478
+ # Add meaningful words until we reach a reasonable length
479
+ for word in meaningful_words[:5]: # Max 5 additional words
480
+ potential_query = ' '.join(search_terms + [word])
481
+ if len(potential_query) <= 100: # Keep well under 250 char limit
482
+ search_terms.append(word)
483
+ else:
484
+ break
485
+
486
+ # Fallback if nothing found
487
+ if not search_terms:
488
+ # Take first few words of the original question
489
+ first_words = question.split()[:5] # First 5 words max
490
+ search_terms = [w for w in first_words if w.isalpha() and len(w) > 2]
491
+
492
+ result = ' '.join(search_terms)
493
 
494
+ # Final length check and truncation
495
+ if len(result) > 100:
496
+ result = result[:100].rsplit(' ', 1)[0]
497
 
498
+ logger.info(f"📝 Extracted search terms: '{result}' from question: '{question[:50]}...'")
499
+ return result
500
 
501
  def _extract_youtube_info(self, question: str) -> str:
502
  """Extract YouTube URL or search terms"""
src/app.py CHANGED
@@ -1755,15 +1755,15 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
1755
  ### 🔧 System Architecture
1756
 
1757
  **LangGraph Multi-Agent Workflow:**
1758
- - **Router Agent**: Classifies questions and selects appropriate specialized agents
1759
- - **Web Research Agent**: Handles Wikipedia searches and web research with Tavily API + Wikipedia fallback
1760
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1761
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1762
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1763
 
1764
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1765
 
1766
- **Tools Available**: Wikipedia API, Tavily web search (with Wikipedia fallback), mathematical calculator, multi-format file processor
1767
 
1768
  ### 📈 Performance Metrics
1769
  - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
@@ -1771,7 +1771,7 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
1771
  - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
1772
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
1773
  - **Reliability**: Robust error handling and graceful degradation within workflow
1774
- - **Web Search**: Reliable Tavily API with Wikipedia fallback (no rate limiting issues)
1775
 
1776
  ### 🎯 Authentication Requirements
1777
  - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
 
1755
  ### 🔧 System Architecture
1756
 
1757
  **LangGraph Multi-Agent Workflow:**
1758
+ - **Router Agent**: Classifies questions and selects appropriate specialized agents (using 32B model for better accuracy)
1759
+ - **Web Research Agent**: Multi-engine search with DuckDuckGo (primary), Tavily API (secondary), Wikipedia (fallback)
1760
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1761
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1762
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1763
 
1764
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1765
 
1766
+ **Tools Available**: Multi-engine web search (DuckDuckGo + Tavily + Wikipedia), mathematical calculator, multi-format file processor
1767
 
1768
  ### 📈 Performance Metrics
1769
  - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
 
1771
  - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
1772
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
1773
  - **Reliability**: Robust error handling and graceful degradation within workflow
1774
+ - **Web Search**: 3-tier search system (DuckDuckGo → Tavily → Wikipedia) with smart query optimization
1775
 
1776
  ### 🎯 Authentication Requirements
1777
  - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
src/requirements.txt CHANGED
@@ -10,6 +10,7 @@ huggingface-hub==0.32.2
10
  transformers==4.52.3
11
  wikipedia-api==0.7.1
12
  wikipedia==1.4.0
 
13
 
14
  # OAuth dependencies for Gradio
15
  itsdangerous>=2.0.0
 
10
  transformers==4.52.3
11
  wikipedia-api==0.7.1
12
  wikipedia==1.4.0
13
+ duckduckgo-search==6.3.4
14
 
15
  # OAuth dependencies for Gradio
16
  itsdangerous>=2.0.0
src/tools/__pycache__/web_search_tool.cpython-310.pyc CHANGED
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
 
src/tools/web_search_tool.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
  Web Search Tool for GAIA Agent System
4
- Handles web searches using Tavily API (primary) and Wikipedia (fallback)
5
  """
6
 
7
  import re
@@ -36,8 +36,8 @@ class WebSearchResult:
36
 
37
  class WebSearchTool(BaseTool):
38
  """
39
- Web search tool using Tavily API (primary) and Wikipedia (fallback)
40
- Much more reliable than DuckDuckGo with no rate limiting issues
41
  """
42
 
43
  def __init__(self):
@@ -50,14 +50,43 @@ class WebSearchTool(BaseTool):
50
  })
51
  self.session.timeout = 10
52
 
53
- # Initialize Tavily client if API key is available
54
  self.tavily_api_key = os.getenv("TAVILY_API_KEY")
55
  self.use_tavily = self.tavily_api_key is not None
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  if self.use_tavily:
58
- logger.info("✅ Tavily API key found - using Tavily for web search")
59
- else:
60
- logger.info("ℹ️ No Tavily API key found - will use Wikipedia fallback only")
 
 
 
 
 
 
 
 
 
61
 
62
  def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
63
  """
@@ -95,24 +124,208 @@ class WebSearchTool(BaseTool):
95
  """Check if text is a URL"""
96
  return bool(re.match(r'https?://', text))
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
99
  """
100
- Search the web using Tavily API (primary) or Wikipedia (fallback)
101
  """
102
 
103
- # Try Tavily first if API key is available
 
 
 
 
 
 
 
 
 
 
104
  if self.use_tavily:
105
  try:
106
- return self._search_with_tavily(query, limit, extract_content)
107
  except Exception as e:
108
- logger.warning(f"Tavily search failed, falling back to Wikipedia: {e}")
109
 
110
  # Fallback to Wikipedia search
111
- return self._search_with_wikipedia(query, limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
114
  """
115
- Search using Tavily Search API - much more reliable than DuckDuckGo
116
  """
117
  try:
118
  logger.info(f"🔍 Tavily search for: {query}")
@@ -129,7 +342,7 @@ class WebSearchTool(BaseTool):
129
  "include_answer": False,
130
  "include_images": False,
131
  "include_raw_content": extract_content,
132
- "max_results": min(limit, 10) # Tavily supports up to 10 results
133
  }
134
 
135
  # Make API request
@@ -167,43 +380,41 @@ class WebSearchTool(BaseTool):
167
  "search_engine": "tavily"
168
  }
169
  else:
170
- logger.warning("Tavily returned no results, trying Wikipedia fallback")
171
- return self._search_with_wikipedia(query, limit)
 
 
172
 
173
  except requests.exceptions.RequestException as e:
174
  logger.error(f"Tavily API request failed: {e}")
175
- # Fall back to Wikipedia
176
- return self._search_with_wikipedia(query, limit)
177
  except Exception as e:
178
  logger.error(f"Tavily search error: {e}")
179
- # Fall back to Wikipedia
 
 
180
  return self._search_with_wikipedia(query, limit)
 
 
 
 
 
 
 
181
 
182
  def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
183
  """
184
- Search using Wikipedia as fallback - very reliable and no rate limits
185
  """
186
  try:
187
  logger.info(f"📚 Wikipedia search for: {query}")
188
 
189
- # Try to import wikipedia library
190
- try:
191
- import wikipedia
192
- except ImportError:
193
- return {
194
- "query": query,
195
- "found": False,
196
- "message": "❌ No search engines available. Install 'wikipedia' package or configure Tavily API key.",
197
- "results": []
198
- }
199
-
200
- wikipedia.set_lang("en")
201
 
202
- # Clean up query for Wikipedia search
203
- search_terms = query.replace("site:", "").strip()
204
 
205
  # Search Wikipedia pages
206
- wiki_results = wikipedia.search(search_terms, results=min(limit * 2, 10))
207
 
208
  if not wiki_results:
209
  return {
@@ -222,7 +433,7 @@ class WebSearchTool(BaseTool):
222
  break
223
 
224
  try:
225
- page = wikipedia.page(page_title)
226
  summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
227
 
228
  web_result = WebSearchResult(
@@ -234,11 +445,11 @@ class WebSearchTool(BaseTool):
234
  results.append(web_result.to_dict())
235
  processed += 1
236
 
237
- except wikipedia.exceptions.DisambiguationError as e:
238
  # Try the first suggestion from disambiguation
239
  try:
240
  if e.options:
241
- page = wikipedia.page(e.options[0])
242
  summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
243
 
244
  web_result = WebSearchResult(
@@ -252,7 +463,7 @@ class WebSearchTool(BaseTool):
252
  except:
253
  continue
254
 
255
- except wikipedia.exceptions.PageError:
256
  # Page doesn't exist, skip
257
  continue
258
  except Exception as e:
@@ -284,7 +495,7 @@ class WebSearchTool(BaseTool):
284
  return {
285
  "query": query,
286
  "found": False,
287
- "message": f"Search failed: {str(e)}",
288
  "results": [],
289
  "error_type": "search_failure"
290
  }
@@ -397,57 +608,6 @@ class WebSearchTool(BaseTool):
397
  combined_content = re.sub(r' +', ' ', combined_content) # Multiple spaces
398
 
399
  return combined_content.strip()[:5000] # Limit to 5000 characters
400
-
401
- def search_youtube_metadata(self, query: str) -> Dict[str, Any]:
402
- """
403
- Specialized search for YouTube video information
404
- """
405
- try:
406
- # Search specifically for YouTube videos
407
- youtube_query = f"site:youtube.com {query}"
408
-
409
- # Use the same search logic but filter for YouTube results
410
- search_result = self._search_web(youtube_query, limit=3)
411
-
412
- if not search_result.get('found'):
413
- return search_result
414
-
415
- youtube_results = []
416
- for result in search_result.get('results', []):
417
- if 'youtube.com/watch' in result.get('url', ''):
418
- video_id = self._extract_youtube_id(result['url'])
419
-
420
- youtube_result = {
421
- "title": result.get('title', 'No title'),
422
- "url": result.get('url', ''),
423
- "description": result.get('snippet', 'No description'),
424
- "video_id": video_id
425
- }
426
- youtube_results.append(youtube_result)
427
-
428
- return {
429
- "query": query,
430
- "found": len(youtube_results) > 0,
431
- "results": youtube_results,
432
- "message": f"Found {len(youtube_results)} YouTube videos"
433
- }
434
-
435
- except Exception as e:
436
- raise Exception(f"YouTube search failed: {str(e)}")
437
-
438
- def _extract_youtube_id(self, url: str) -> str:
439
- """Extract YouTube video ID from URL"""
440
- patterns = [
441
- r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
442
- r'(?:embed\/)([0-9A-Za-z_-]{11})',
443
- r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'
444
- ]
445
-
446
- for pattern in patterns:
447
- match = re.search(pattern, url)
448
- if match:
449
- return match.group(1)
450
- return ""
451
 
452
  def test_web_search_tool():
453
  """Test the web search tool with various queries"""
@@ -456,10 +616,10 @@ def test_web_search_tool():
456
  # Test cases
457
  test_cases = [
458
  "Python programming tutorial",
459
- "https://en.wikipedia.org/wiki/Machine_learning",
460
- {"query": "artificial intelligence news", "action": "search", "limit": 3},
461
- {"query": "https://www.python.org", "action": "extract"},
462
- {"query": "OpenAI ChatGPT", "action": "search", "limit": 2, "extract_content": True}
463
  ]
464
 
465
  print("🧪 Testing Web Search Tool...")
 
1
  #!/usr/bin/env python3
2
  """
3
  Web Search Tool for GAIA Agent System
4
+ Handles web searches using DuckDuckGo (primary), Tavily API (secondary), and Wikipedia (fallback)
5
  """
6
 
7
  import re
 
36
 
37
  class WebSearchTool(BaseTool):
38
  """
39
+ Web search tool using DuckDuckGo (primary), Tavily API (secondary), and Wikipedia (fallback)
40
+ Provides multiple search engine options for reliability
41
  """
42
 
43
  def __init__(self):
 
50
  })
51
  self.session.timeout = 10
52
 
53
+ # Initialize search engines
54
  self.tavily_api_key = os.getenv("TAVILY_API_KEY")
55
  self.use_tavily = self.tavily_api_key is not None
56
 
57
+ # Try to import DuckDuckGo
58
+ try:
59
+ from duckduckgo_search import DDGS
60
+ self.ddgs = DDGS()
61
+ self.use_duckduckgo = True
62
+ logger.info("✅ DuckDuckGo search initialized")
63
+ except ImportError:
64
+ logger.warning("⚠️ DuckDuckGo search not available - install duckduckgo-search package")
65
+ self.use_duckduckgo = False
66
+
67
+ # Try to import Wikipedia
68
+ try:
69
+ import wikipedia
70
+ self.wikipedia = wikipedia
71
+ self.use_wikipedia = True
72
+ logger.info("✅ Wikipedia search initialized")
73
+ except ImportError:
74
+ logger.warning("⚠️ Wikipedia search not available - install wikipedia package")
75
+ self.use_wikipedia = False
76
+
77
  if self.use_tavily:
78
+ logger.info("✅ Tavily API key found - using as secondary search")
79
+
80
+ # Search engine priority: DuckDuckGo -> Tavily -> Wikipedia
81
+ search_engines = []
82
+ if self.use_duckduckgo:
83
+ search_engines.append("DuckDuckGo")
84
+ if self.use_tavily:
85
+ search_engines.append("Tavily")
86
+ if self.use_wikipedia:
87
+ search_engines.append("Wikipedia")
88
+
89
+ logger.info(f"🔍 Available search engines: {', '.join(search_engines)}")
90
 
91
  def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
92
  """
 
124
  """Check if text is a URL"""
125
  return bool(re.match(r'https?://', text))
126
 
127
+ def _extract_search_terms(self, query: str, max_length: int = 250) -> str:
128
+ """
129
+ Extract key search terms from a potentially long query
130
+ """
131
+ # If query is short enough, use as-is
132
+ if len(query) <= max_length:
133
+ return query
134
+
135
+ # Remove common stop words and extract key terms
136
+ stop_words = {
137
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
138
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
139
+ 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
140
+ 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
141
+ 'what', 'where', 'when', 'why', 'how', 'which', 'who', 'whose', 'whom',
142
+ 'please', 'could', 'you', 'tell', 'me', 'find', 'search', 'for', 'about'
143
+ }
144
+
145
+ # Split into words and filter
146
+ words = re.findall(r'\b\w+\b', query.lower())
147
+ key_words = [word for word in words if word not in stop_words and len(word) > 2]
148
+
149
+ # Keep important phrases and entities
150
+ # Look for quoted phrases, proper nouns, numbers, dates
151
+ important_patterns = [
152
+ r'"[^"]*"', # Quoted phrases
153
+ r'\b[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*\b', # Proper nouns
154
+ r'\b\d{4}\b', # Years
155
+ r'\b\d+\b', # Numbers
156
+ ]
157
+
158
+ important_terms = []
159
+ for pattern in important_patterns:
160
+ matches = re.findall(pattern, query)
161
+ important_terms.extend(matches)
162
+
163
+ # Combine key words and important terms
164
+ search_terms = []
165
+
166
+ # Add important terms first (they're usually more specific)
167
+ for term in important_terms:
168
+ if len(' '.join(search_terms + [term])) <= max_length:
169
+ search_terms.append(term)
170
+
171
+ # Add key words until we hit the limit
172
+ for word in key_words:
173
+ potential_query = ' '.join(search_terms + [word])
174
+ if len(potential_query) <= max_length:
175
+ search_terms.append(word)
176
+ else:
177
+ break
178
+
179
+ result = ' '.join(search_terms)
180
+
181
+ # If still too long, truncate
182
+ if len(result) > max_length:
183
+ result = result[:max_length].rsplit(' ', 1)[0]
184
+
185
+ # If we ended up with nothing, use first part of original query
186
+ if not result.strip():
187
+ result = query[:max_length].rsplit(' ', 1)[0]
188
+
189
+ if result != query:
190
+ logger.info(f"📝 Extracted search terms: '{result}' from '{query[:100]}...'")
191
+
192
+ return result
193
+
194
  def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
195
  """
196
+ Search the web using available search engines in priority order
197
  """
198
 
199
+ # Extract search terms to avoid length issues
200
+ search_query = self._extract_search_terms(query, max_length=250)
201
+
202
+ # Try DuckDuckGo first (most comprehensive for general web search)
203
+ if self.use_duckduckgo:
204
+ try:
205
+ return self._search_with_duckduckgo(search_query, limit, extract_content)
206
+ except Exception as e:
207
+ logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")
208
+
209
+ # Try Tavily if DuckDuckGo fails and API key is available
210
  if self.use_tavily:
211
  try:
212
+ return self._search_with_tavily(search_query, limit, extract_content)
213
  except Exception as e:
214
+ logger.warning(f"Tavily search failed, trying Wikipedia: {e}")
215
 
216
  # Fallback to Wikipedia search
217
+ if self.use_wikipedia:
218
+ return self._search_with_wikipedia(search_query, limit)
219
+
220
+ # No search engines available
221
+ return {
222
+ "query": query,
223
+ "found": False,
224
+ "message": "❌ No search engines available. Please install required packages.",
225
+ "results": []
226
+ }
227
+
228
+ def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
229
+ """
230
+ Search using DuckDuckGo - primary search engine
231
+ """
232
+ try:
233
+ logger.info(f"🦆 DuckDuckGo search for: {query}")
234
+
235
+ # Add retry logic for DuckDuckGo rate limiting
236
+ max_retries = 3
237
+ retry_delay = 2
238
+
239
+ for attempt in range(max_retries):
240
+ try:
241
+ # Use DuckDuckGo text search
242
+ ddg_results = list(self.ddgs.text(query, max_results=min(limit, 10)))
243
+
244
+ if not ddg_results:
245
+ if attempt < max_retries - 1:
246
+ logger.warning(f"DuckDuckGo returned no results, retrying in {retry_delay}s...")
247
+ time.sleep(retry_delay)
248
+ retry_delay *= 2
249
+ continue
250
+ else:
251
+ logger.warning("DuckDuckGo returned no results after retries")
252
+ # Fall back to other search engines
253
+ return self._search_with_fallback(query, limit)
254
+
255
+ break
256
+
257
+ except Exception as e:
258
+ if "rate limit" in str(e).lower() or "429" in str(e):
259
+ if attempt < max_retries - 1:
260
+ logger.warning(f"DuckDuckGo rate limited, retrying in {retry_delay}s...")
261
+ time.sleep(retry_delay)
262
+ retry_delay *= 2
263
+ continue
264
+ else:
265
+ logger.warning("DuckDuckGo rate limited after retries, using fallback")
266
+ return self._search_with_fallback(query, limit)
267
+ else:
268
+ raise
269
+
270
+ # Process DuckDuckGo results
271
+ results = []
272
+ for result in ddg_results:
273
+ web_result = WebSearchResult(
274
+ title=result.get('title', 'No title'),
275
+ url=result.get('href', ''),
276
+ snippet=result.get('body', 'No description'),
277
+ content='' # DuckDuckGo doesn't provide full content
278
+ )
279
+ results.append(web_result.to_dict())
280
+
281
+ # Extract content if requested
282
+ if extract_content and results:
283
+ for result in results[:2]: # Only extract from first 2 results to save time
284
+ try:
285
+ content_result = self._extract_content_from_url(result['url'])
286
+ if content_result.get('found'):
287
+ result['content'] = content_result.get('content', '')[:1000]
288
+ except:
289
+ pass # Skip content extraction errors
290
+
291
+ logger.info(f"✅ DuckDuckGo found {len(results)} results")
292
+ return {
293
+ "query": query,
294
+ "found": True,
295
+ "results": results,
296
+ "total_results": len(results),
297
+ "message": f"Found {len(results)} results via DuckDuckGo",
298
+ "search_engine": "duckduckgo"
299
+ }
300
+
301
+ except Exception as e:
302
+ logger.error(f"DuckDuckGo search error: {e}")
303
+ # Fall back to other search engines
304
+ return self._search_with_fallback(query, limit)
305
+
306
+ def _search_with_fallback(self, query: str, limit: int) -> Dict[str, Any]:
307
+ """Try fallback search engines"""
308
+ # Try Tavily if available
309
+ if self.use_tavily:
310
+ try:
311
+ return self._search_with_tavily(query, limit, False)
312
+ except Exception as e:
313
+ logger.warning(f"Tavily fallback failed: {e}")
314
+
315
+ # Try Wikipedia as last resort
316
+ if self.use_wikipedia:
317
+ return self._search_with_wikipedia(query, limit)
318
+
319
+ return {
320
+ "query": query,
321
+ "found": False,
322
+ "message": "All search engines failed",
323
+ "results": []
324
+ }
325
 
326
  def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
327
  """
328
+ Search using Tavily Search API - secondary search engine
329
  """
330
  try:
331
  logger.info(f"🔍 Tavily search for: {query}")
 
342
  "include_answer": False,
343
  "include_images": False,
344
  "include_raw_content": extract_content,
345
+ "max_results": min(limit, 10)
346
  }
347
 
348
  # Make API request
 
380
  "search_engine": "tavily"
381
  }
382
  else:
383
+ logger.warning("Tavily returned no results")
384
+ # Fall back to Wikipedia
385
+ if self.use_wikipedia:
386
+ return self._search_with_wikipedia(query, limit)
387
 
388
  except requests.exceptions.RequestException as e:
389
  logger.error(f"Tavily API request failed: {e}")
 
 
390
  except Exception as e:
391
  logger.error(f"Tavily search error: {e}")
392
+
393
+ # Fall back to Wikipedia if Tavily fails
394
+ if self.use_wikipedia:
395
  return self._search_with_wikipedia(query, limit)
396
+
397
+ return {
398
+ "query": query,
399
+ "found": False,
400
+ "message": "Tavily search failed and no fallback available",
401
+ "results": []
402
+ }
403
 
404
  def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
405
  """
406
+ Search using Wikipedia - fallback search engine for factual information
407
  """
408
  try:
409
  logger.info(f"📚 Wikipedia search for: {query}")
410
 
411
+ self.wikipedia.set_lang("en")
 
 
 
 
 
 
 
 
 
 
 
412
 
413
+ # Clean up query for Wikipedia search and ensure it's not too long
414
+ search_terms = self._extract_search_terms(query, max_length=100) # Wikipedia has stricter limits
415
 
416
  # Search Wikipedia pages
417
+ wiki_results = self.wikipedia.search(search_terms, results=min(limit * 2, 10))
418
 
419
  if not wiki_results:
420
  return {
 
433
  break
434
 
435
  try:
436
+ page = self.wikipedia.page(page_title)
437
  summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
438
 
439
  web_result = WebSearchResult(
 
445
  results.append(web_result.to_dict())
446
  processed += 1
447
 
448
+ except self.wikipedia.exceptions.DisambiguationError as e:
449
  # Try the first suggestion from disambiguation
450
  try:
451
  if e.options:
452
+ page = self.wikipedia.page(e.options[0])
453
  summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
454
 
455
  web_result = WebSearchResult(
 
463
  except:
464
  continue
465
 
466
+ except self.wikipedia.exceptions.PageError:
467
  # Page doesn't exist, skip
468
  continue
469
  except Exception as e:
 
495
  return {
496
  "query": query,
497
  "found": False,
498
+ "message": f"Wikipedia search failed: {str(e)}",
499
  "results": [],
500
  "error_type": "search_failure"
501
  }
 
608
  combined_content = re.sub(r' +', ' ', combined_content) # Multiple spaces
609
 
610
  return combined_content.strip()[:5000] # Limit to 5000 characters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
  def test_web_search_tool():
613
  """Test the web search tool with various queries"""
 
616
  # Test cases
617
  test_cases = [
618
  "Python programming tutorial",
619
+ "Mercedes Sosa studio albums 2000 2009",
620
+ "artificial intelligence recent developments",
621
+ "climate change latest research",
622
+ "https://en.wikipedia.org/wiki/Machine_learning"
623
  ]
624
 
625
  print("🧪 Testing Web Search Tool...")