Spaces:
Sleeping
Sleeping
Chris
committed on
Commit
·
e107ea2
1
Parent(s):
a178cd6
Final 6.7.3
Browse files- .gitignore +1 -0
- requirements.txt +1 -0
- src/agents/__pycache__/router.cpython-310.pyc +0 -0
- src/agents/__pycache__/web_researcher.cpython-310.pyc +0 -0
- src/agents/router.py +2 -2
- src/agents/web_researcher.py +78 -13
- src/app.py +4 -4
- src/requirements.txt +1 -0
- src/tools/__pycache__/web_search_tool.cpython-310.pyc +0 -0
- src/tools/web_search_tool.py +255 -95
.gitignore
CHANGED
|
@@ -8,3 +8,4 @@ debug_*.py
|
|
| 8 |
*_debug*.py
|
| 9 |
tests/
|
| 10 |
*.log
|
|
|
|
|
|
| 8 |
*_debug*.py
|
| 9 |
tests/
|
| 10 |
*.log
|
| 11 |
+
gaia_evaluation_cjb97*
|
requirements.txt
CHANGED
|
@@ -6,6 +6,7 @@ beautifulsoup4==4.13.0
|
|
| 6 |
certifi==2025.4.26
|
| 7 |
charset-normalizer==3.4.2
|
| 8 |
click==8.2.1
|
|
|
|
| 9 |
exceptiongroup==1.3.0
|
| 10 |
fastapi==0.115.12
|
| 11 |
ffmpy==0.5.0
|
|
|
|
| 6 |
certifi==2025.4.26
|
| 7 |
charset-normalizer==3.4.2
|
| 8 |
click==8.2.1
|
| 9 |
+
duckduckgo-search==6.3.4
|
| 10 |
exceptiongroup==1.3.0
|
| 11 |
fastapi==0.115.12
|
| 12 |
ffmpy==0.5.0
|
src/agents/__pycache__/router.cpython-310.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/router.cpython-310.pyc and b/src/agents/__pycache__/router.cpython-310.pyc differ
|
|
|
src/agents/__pycache__/web_researcher.cpython-310.pyc
CHANGED
|
Binary files a/src/agents/__pycache__/web_researcher.cpython-310.pyc and b/src/agents/__pycache__/web_researcher.cpython-310.pyc differ
|
|
|
src/agents/router.py
CHANGED
|
@@ -317,8 +317,8 @@ class RouterAgent:
|
|
| 317 |
"""
|
| 318 |
|
| 319 |
try:
|
| 320 |
-
# Use
|
| 321 |
-
tier = ModelTier.
|
| 322 |
result = self.llm_client.generate(prompt, tier=tier, max_tokens=200)
|
| 323 |
|
| 324 |
if result.success:
|
|
|
|
| 317 |
"""
|
| 318 |
|
| 319 |
try:
|
| 320 |
+
# Use main model (32B) for better routing decisions instead of 7B router model
|
| 321 |
+
tier = ModelTier.MAIN # Always use 32B model for routing to improve classification accuracy
|
| 322 |
result = self.llm_client.generate(prompt, tier=tier, max_tokens=200)
|
| 323 |
|
| 324 |
if result.success:
|
src/agents/web_researcher.py
CHANGED
|
@@ -414,24 +414,89 @@ class WebResearchAgent:
|
|
| 414 |
return ' '.join(topic_words[:3]) if topic_words else "topic"
|
| 415 |
|
| 416 |
def _extract_search_terms(self, question: str) -> str:
|
| 417 |
-
"""Extract search terms from question"""
|
| 418 |
|
| 419 |
-
#
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
]
|
| 425 |
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
-
#
|
| 431 |
-
|
| 432 |
-
|
| 433 |
|
| 434 |
-
|
|
|
|
| 435 |
|
| 436 |
def _extract_youtube_info(self, question: str) -> str:
|
| 437 |
"""Extract YouTube URL or search terms"""
|
|
|
|
| 414 |
return ' '.join(topic_words[:3]) if topic_words else "topic"
|
| 415 |
|
| 416 |
def _extract_search_terms(self, question: str) -> str:
|
| 417 |
+
"""Extract focused search terms from question to avoid length limits"""
|
| 418 |
|
| 419 |
+
# Handle different question types more intelligently
|
| 420 |
+
question_lower = question.lower()
|
| 421 |
+
|
| 422 |
+
# For questions about specific people, places, things - extract key entities
|
| 423 |
+
# Look for quoted phrases first (highest priority)
|
| 424 |
+
quoted_terms = re.findall(r'"([^"]+)"', question)
|
| 425 |
+
if quoted_terms:
|
| 426 |
+
# Use the first quoted phrase as it's usually the most important
|
| 427 |
+
main_term = quoted_terms[0]
|
| 428 |
+
# Add year if present
|
| 429 |
+
years = re.findall(r'\b(19|20)\d{2}\b', question)
|
| 430 |
+
if years:
|
| 431 |
+
return f"{main_term} {years[0]}"
|
| 432 |
+
return main_term
|
| 433 |
+
|
| 434 |
+
# Extract proper nouns and key entities
|
| 435 |
+
# Look for capitalized words (likely proper nouns)
|
| 436 |
+
proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', question)
|
| 437 |
+
|
| 438 |
+
# Extract years and numbers (often important)
|
| 439 |
+
years = re.findall(r'\b(19|20)\d{2}\b', question)
|
| 440 |
+
numbers = re.findall(r'\b\d+\b', question)
|
| 441 |
+
|
| 442 |
+
# Remove very common stop words and question patterns
|
| 443 |
+
stop_patterns = [
|
| 444 |
+
r'\b(?:what|who|when|where|why|how|is|are|was|were|do|does|did|can|could|would|should|will)\b',
|
| 445 |
+
r'\b(?:the|a|an|and|or|but|in|on|at|to|for|of|with|by|from|about)\b',
|
| 446 |
+
r'\b(?:please|could|you|tell|me|find|search|for|give|provide|list|show)\b',
|
| 447 |
+
r'\b(?:information|details|data|facts|answer)\b',
|
| 448 |
+
r'[?.,!]+', # Punctuation
|
| 449 |
]
|
| 450 |
|
| 451 |
+
# Clean the question
|
| 452 |
+
clean_question = question
|
| 453 |
+
for pattern in stop_patterns:
|
| 454 |
+
clean_question = re.sub(pattern, ' ', clean_question, flags=re.IGNORECASE)
|
| 455 |
+
|
| 456 |
+
# Extract remaining meaningful words
|
| 457 |
+
words = clean_question.split()
|
| 458 |
+
meaningful_words = []
|
| 459 |
+
|
| 460 |
+
for word in words:
|
| 461 |
+
word = word.strip()
|
| 462 |
+
if len(word) > 2 and word.isalpha(): # Only alphabetic words longer than 2 chars
|
| 463 |
+
meaningful_words.append(word)
|
| 464 |
+
|
| 465 |
+
# Build search terms prioritizing important elements
|
| 466 |
+
search_terms = []
|
| 467 |
+
|
| 468 |
+
# Add proper nouns first (most specific)
|
| 469 |
+
for noun in proper_nouns[:2]: # Max 2 proper nouns
|
| 470 |
+
if len(' '.join(search_terms + [noun])) <= 100: # Conservative length limit
|
| 471 |
+
search_terms.append(noun)
|
| 472 |
+
|
| 473 |
+
# Add years/numbers
|
| 474 |
+
for year in years[:1]: # Max 1 year
|
| 475 |
+
if len(' '.join(search_terms + [year])) <= 100:
|
| 476 |
+
search_terms.append(year)
|
| 477 |
+
|
| 478 |
+
# Add meaningful words until we reach a reasonable length
|
| 479 |
+
for word in meaningful_words[:5]: # Max 5 additional words
|
| 480 |
+
potential_query = ' '.join(search_terms + [word])
|
| 481 |
+
if len(potential_query) <= 100: # Keep well under 250 char limit
|
| 482 |
+
search_terms.append(word)
|
| 483 |
+
else:
|
| 484 |
+
break
|
| 485 |
+
|
| 486 |
+
# Fallback if nothing found
|
| 487 |
+
if not search_terms:
|
| 488 |
+
# Take first few words of the original question
|
| 489 |
+
first_words = question.split()[:5] # First 5 words max
|
| 490 |
+
search_terms = [w for w in first_words if w.isalpha() and len(w) > 2]
|
| 491 |
+
|
| 492 |
+
result = ' '.join(search_terms)
|
| 493 |
|
| 494 |
+
# Final length check and truncation
|
| 495 |
+
if len(result) > 100:
|
| 496 |
+
result = result[:100].rsplit(' ', 1)[0]
|
| 497 |
|
| 498 |
+
logger.info(f"📝 Extracted search terms: '{result}' from question: '{question[:50]}...'")
|
| 499 |
+
return result
|
| 500 |
|
| 501 |
def _extract_youtube_info(self, question: str) -> str:
|
| 502 |
"""Extract YouTube URL or search terms"""
|
src/app.py
CHANGED
|
@@ -1755,15 +1755,15 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
|
|
| 1755 |
### 🔧 System Architecture
|
| 1756 |
|
| 1757 |
**LangGraph Multi-Agent Workflow:**
|
| 1758 |
-
- **Router Agent**: Classifies questions and selects appropriate specialized agents
|
| 1759 |
-
- **Web Research Agent**:
|
| 1760 |
- **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
|
| 1761 |
- **Reasoning Agent**: Handles mathematical calculations and logical reasoning
|
| 1762 |
- **Synthesizer Agent**: Combines results from multiple agents into final answers
|
| 1763 |
|
| 1764 |
**Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
|
| 1765 |
|
| 1766 |
-
**Tools Available**:
|
| 1767 |
|
| 1768 |
### 📈 Performance Metrics
|
| 1769 |
- **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
|
|
@@ -1771,7 +1771,7 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
|
|
| 1771 |
- **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
|
| 1772 |
- **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
|
| 1773 |
- **Reliability**: Robust error handling and graceful degradation within workflow
|
| 1774 |
-
- **Web Search**:
|
| 1775 |
|
| 1776 |
### 🎯 Authentication Requirements
|
| 1777 |
- **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
|
|
|
|
| 1755 |
### 🔧 System Architecture
|
| 1756 |
|
| 1757 |
**LangGraph Multi-Agent Workflow:**
|
| 1758 |
+
- **Router Agent**: Classifies questions and selects appropriate specialized agents (using 32B model for better accuracy)
|
| 1759 |
+
- **Web Research Agent**: Multi-engine search with DuckDuckGo (primary), Tavily API (secondary), Wikipedia (fallback)
|
| 1760 |
- **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
|
| 1761 |
- **Reasoning Agent**: Handles mathematical calculations and logical reasoning
|
| 1762 |
- **Synthesizer Agent**: Combines results from multiple agents into final answers
|
| 1763 |
|
| 1764 |
**Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
|
| 1765 |
|
| 1766 |
+
**Tools Available**: Multi-engine web search (DuckDuckGo + Tavily + Wikipedia), mathematical calculator, multi-format file processor
|
| 1767 |
|
| 1768 |
### 📈 Performance Metrics
|
| 1769 |
- **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
|
|
|
|
| 1771 |
- **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
|
| 1772 |
- **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
|
| 1773 |
- **Reliability**: Robust error handling and graceful degradation within workflow
|
| 1774 |
+
- **Web Search**: 3-tier search system (DuckDuckGo → Tavily → Wikipedia) with smart query optimization
|
| 1775 |
|
| 1776 |
### 🎯 Authentication Requirements
|
| 1777 |
- **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
|
src/requirements.txt
CHANGED
|
@@ -10,6 +10,7 @@ huggingface-hub==0.32.2
|
|
| 10 |
transformers==4.52.3
|
| 11 |
wikipedia-api==0.7.1
|
| 12 |
wikipedia==1.4.0
|
|
|
|
| 13 |
|
| 14 |
# OAuth dependencies for Gradio
|
| 15 |
itsdangerous>=2.0.0
|
|
|
|
| 10 |
transformers==4.52.3
|
| 11 |
wikipedia-api==0.7.1
|
| 12 |
wikipedia==1.4.0
|
| 13 |
+
duckduckgo-search==6.3.4
|
| 14 |
|
| 15 |
# OAuth dependencies for Gradio
|
| 16 |
itsdangerous>=2.0.0
|
src/tools/__pycache__/web_search_tool.cpython-310.pyc
CHANGED
|
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
|
|
|
src/tools/web_search_tool.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Web Search Tool for GAIA Agent System
|
| 4 |
-
Handles web searches using Tavily API (
|
| 5 |
"""
|
| 6 |
|
| 7 |
import re
|
|
@@ -36,8 +36,8 @@ class WebSearchResult:
|
|
| 36 |
|
| 37 |
class WebSearchTool(BaseTool):
|
| 38 |
"""
|
| 39 |
-
Web search tool using Tavily API (
|
| 40 |
-
|
| 41 |
"""
|
| 42 |
|
| 43 |
def __init__(self):
|
|
@@ -50,14 +50,43 @@ class WebSearchTool(BaseTool):
|
|
| 50 |
})
|
| 51 |
self.session.timeout = 10
|
| 52 |
|
| 53 |
-
# Initialize
|
| 54 |
self.tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 55 |
self.use_tavily = self.tavily_api_key is not None
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
if self.use_tavily:
|
| 58 |
-
logger.info("✅ Tavily API key found - using
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
|
| 63 |
"""
|
|
@@ -95,24 +124,208 @@ class WebSearchTool(BaseTool):
|
|
| 95 |
"""Check if text is a URL"""
|
| 96 |
return bool(re.match(r'https?://', text))
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 99 |
"""
|
| 100 |
-
Search the web using
|
| 101 |
"""
|
| 102 |
|
| 103 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
if self.use_tavily:
|
| 105 |
try:
|
| 106 |
-
return self._search_with_tavily(
|
| 107 |
except Exception as e:
|
| 108 |
-
logger.warning(f"Tavily search failed,
|
| 109 |
|
| 110 |
# Fallback to Wikipedia search
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 114 |
"""
|
| 115 |
-
Search using Tavily Search API -
|
| 116 |
"""
|
| 117 |
try:
|
| 118 |
logger.info(f"🔍 Tavily search for: {query}")
|
|
@@ -129,7 +342,7 @@ class WebSearchTool(BaseTool):
|
|
| 129 |
"include_answer": False,
|
| 130 |
"include_images": False,
|
| 131 |
"include_raw_content": extract_content,
|
| 132 |
-
"max_results": min(limit, 10)
|
| 133 |
}
|
| 134 |
|
| 135 |
# Make API request
|
|
@@ -167,43 +380,41 @@ class WebSearchTool(BaseTool):
|
|
| 167 |
"search_engine": "tavily"
|
| 168 |
}
|
| 169 |
else:
|
| 170 |
-
logger.warning("Tavily returned no results
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
|
| 173 |
except requests.exceptions.RequestException as e:
|
| 174 |
logger.error(f"Tavily API request failed: {e}")
|
| 175 |
-
# Fall back to Wikipedia
|
| 176 |
-
return self._search_with_wikipedia(query, limit)
|
| 177 |
except Exception as e:
|
| 178 |
logger.error(f"Tavily search error: {e}")
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
return self._search_with_wikipedia(query, limit)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
| 183 |
"""
|
| 184 |
-
Search using Wikipedia
|
| 185 |
"""
|
| 186 |
try:
|
| 187 |
logger.info(f"📚 Wikipedia search for: {query}")
|
| 188 |
|
| 189 |
-
|
| 190 |
-
try:
|
| 191 |
-
import wikipedia
|
| 192 |
-
except ImportError:
|
| 193 |
-
return {
|
| 194 |
-
"query": query,
|
| 195 |
-
"found": False,
|
| 196 |
-
"message": "❌ No search engines available. Install 'wikipedia' package or configure Tavily API key.",
|
| 197 |
-
"results": []
|
| 198 |
-
}
|
| 199 |
-
|
| 200 |
-
wikipedia.set_lang("en")
|
| 201 |
|
| 202 |
-
# Clean up query for Wikipedia search
|
| 203 |
-
search_terms =
|
| 204 |
|
| 205 |
# Search Wikipedia pages
|
| 206 |
-
wiki_results = wikipedia.search(search_terms, results=min(limit * 2, 10))
|
| 207 |
|
| 208 |
if not wiki_results:
|
| 209 |
return {
|
|
@@ -222,7 +433,7 @@ class WebSearchTool(BaseTool):
|
|
| 222 |
break
|
| 223 |
|
| 224 |
try:
|
| 225 |
-
page = wikipedia.page(page_title)
|
| 226 |
summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
|
| 227 |
|
| 228 |
web_result = WebSearchResult(
|
|
@@ -234,11 +445,11 @@ class WebSearchTool(BaseTool):
|
|
| 234 |
results.append(web_result.to_dict())
|
| 235 |
processed += 1
|
| 236 |
|
| 237 |
-
except wikipedia.exceptions.DisambiguationError as e:
|
| 238 |
# Try the first suggestion from disambiguation
|
| 239 |
try:
|
| 240 |
if e.options:
|
| 241 |
-
page = wikipedia.page(e.options[0])
|
| 242 |
summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
|
| 243 |
|
| 244 |
web_result = WebSearchResult(
|
|
@@ -252,7 +463,7 @@ class WebSearchTool(BaseTool):
|
|
| 252 |
except:
|
| 253 |
continue
|
| 254 |
|
| 255 |
-
except wikipedia.exceptions.PageError:
|
| 256 |
# Page doesn't exist, skip
|
| 257 |
continue
|
| 258 |
except Exception as e:
|
|
@@ -284,7 +495,7 @@ class WebSearchTool(BaseTool):
|
|
| 284 |
return {
|
| 285 |
"query": query,
|
| 286 |
"found": False,
|
| 287 |
-
"message": f"
|
| 288 |
"results": [],
|
| 289 |
"error_type": "search_failure"
|
| 290 |
}
|
|
@@ -397,57 +608,6 @@ class WebSearchTool(BaseTool):
|
|
| 397 |
combined_content = re.sub(r' +', ' ', combined_content) # Multiple spaces
|
| 398 |
|
| 399 |
return combined_content.strip()[:5000] # Limit to 5000 characters
|
| 400 |
-
|
| 401 |
-
def search_youtube_metadata(self, query: str) -> Dict[str, Any]:
|
| 402 |
-
"""
|
| 403 |
-
Specialized search for YouTube video information
|
| 404 |
-
"""
|
| 405 |
-
try:
|
| 406 |
-
# Search specifically for YouTube videos
|
| 407 |
-
youtube_query = f"site:youtube.com {query}"
|
| 408 |
-
|
| 409 |
-
# Use the same search logic but filter for YouTube results
|
| 410 |
-
search_result = self._search_web(youtube_query, limit=3)
|
| 411 |
-
|
| 412 |
-
if not search_result.get('found'):
|
| 413 |
-
return search_result
|
| 414 |
-
|
| 415 |
-
youtube_results = []
|
| 416 |
-
for result in search_result.get('results', []):
|
| 417 |
-
if 'youtube.com/watch' in result.get('url', ''):
|
| 418 |
-
video_id = self._extract_youtube_id(result['url'])
|
| 419 |
-
|
| 420 |
-
youtube_result = {
|
| 421 |
-
"title": result.get('title', 'No title'),
|
| 422 |
-
"url": result.get('url', ''),
|
| 423 |
-
"description": result.get('snippet', 'No description'),
|
| 424 |
-
"video_id": video_id
|
| 425 |
-
}
|
| 426 |
-
youtube_results.append(youtube_result)
|
| 427 |
-
|
| 428 |
-
return {
|
| 429 |
-
"query": query,
|
| 430 |
-
"found": len(youtube_results) > 0,
|
| 431 |
-
"results": youtube_results,
|
| 432 |
-
"message": f"Found {len(youtube_results)} YouTube videos"
|
| 433 |
-
}
|
| 434 |
-
|
| 435 |
-
except Exception as e:
|
| 436 |
-
raise Exception(f"YouTube search failed: {str(e)}")
|
| 437 |
-
|
| 438 |
-
def _extract_youtube_id(self, url: str) -> str:
|
| 439 |
-
"""Extract YouTube video ID from URL"""
|
| 440 |
-
patterns = [
|
| 441 |
-
r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
|
| 442 |
-
r'(?:embed\/)([0-9A-Za-z_-]{11})',
|
| 443 |
-
r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'
|
| 444 |
-
]
|
| 445 |
-
|
| 446 |
-
for pattern in patterns:
|
| 447 |
-
match = re.search(pattern, url)
|
| 448 |
-
if match:
|
| 449 |
-
return match.group(1)
|
| 450 |
-
return ""
|
| 451 |
|
| 452 |
def test_web_search_tool():
|
| 453 |
"""Test the web search tool with various queries"""
|
|
@@ -456,10 +616,10 @@ def test_web_search_tool():
|
|
| 456 |
# Test cases
|
| 457 |
test_cases = [
|
| 458 |
"Python programming tutorial",
|
| 459 |
-
"
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
]
|
| 464 |
|
| 465 |
print("🧪 Testing Web Search Tool...")
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Web Search Tool for GAIA Agent System
|
| 4 |
+
Handles web searches using DuckDuckGo (primary), Tavily API (secondary), and Wikipedia (fallback)
|
| 5 |
"""
|
| 6 |
|
| 7 |
import re
|
|
|
|
| 36 |
|
| 37 |
class WebSearchTool(BaseTool):
|
| 38 |
"""
|
| 39 |
+
Web search tool using DuckDuckGo (primary), Tavily API (secondary), and Wikipedia (fallback)
|
| 40 |
+
Provides multiple search engine options for reliability
|
| 41 |
"""
|
| 42 |
|
| 43 |
def __init__(self):
|
|
|
|
| 50 |
})
|
| 51 |
self.session.timeout = 10
|
| 52 |
|
| 53 |
+
# Initialize search engines
|
| 54 |
self.tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 55 |
self.use_tavily = self.tavily_api_key is not None
|
| 56 |
|
| 57 |
+
# Try to import DuckDuckGo
|
| 58 |
+
try:
|
| 59 |
+
from duckduckgo_search import DDGS
|
| 60 |
+
self.ddgs = DDGS()
|
| 61 |
+
self.use_duckduckgo = True
|
| 62 |
+
logger.info("✅ DuckDuckGo search initialized")
|
| 63 |
+
except ImportError:
|
| 64 |
+
logger.warning("⚠️ DuckDuckGo search not available - install duckduckgo-search package")
|
| 65 |
+
self.use_duckduckgo = False
|
| 66 |
+
|
| 67 |
+
# Try to import Wikipedia
|
| 68 |
+
try:
|
| 69 |
+
import wikipedia
|
| 70 |
+
self.wikipedia = wikipedia
|
| 71 |
+
self.use_wikipedia = True
|
| 72 |
+
logger.info("✅ Wikipedia search initialized")
|
| 73 |
+
except ImportError:
|
| 74 |
+
logger.warning("⚠️ Wikipedia search not available - install wikipedia package")
|
| 75 |
+
self.use_wikipedia = False
|
| 76 |
+
|
| 77 |
if self.use_tavily:
|
| 78 |
+
logger.info("✅ Tavily API key found - using as secondary search")
|
| 79 |
+
|
| 80 |
+
# Search engine priority: DuckDuckGo -> Tavily -> Wikipedia
|
| 81 |
+
search_engines = []
|
| 82 |
+
if self.use_duckduckgo:
|
| 83 |
+
search_engines.append("DuckDuckGo")
|
| 84 |
+
if self.use_tavily:
|
| 85 |
+
search_engines.append("Tavily")
|
| 86 |
+
if self.use_wikipedia:
|
| 87 |
+
search_engines.append("Wikipedia")
|
| 88 |
+
|
| 89 |
+
logger.info(f"🔍 Available search engines: {', '.join(search_engines)}")
|
| 90 |
|
| 91 |
def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
|
| 92 |
"""
|
|
|
|
| 124 |
"""Check if text is a URL"""
|
| 125 |
return bool(re.match(r'https?://', text))
|
| 126 |
|
| 127 |
+
def _extract_search_terms(self, query: str, max_length: int = 250) -> str:
|
| 128 |
+
"""
|
| 129 |
+
Extract key search terms from a potentially long query
|
| 130 |
+
"""
|
| 131 |
+
# If query is short enough, use as-is
|
| 132 |
+
if len(query) <= max_length:
|
| 133 |
+
return query
|
| 134 |
+
|
| 135 |
+
# Remove common stop words and extract key terms
|
| 136 |
+
stop_words = {
|
| 137 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
|
| 138 |
+
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did',
|
| 139 |
+
'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
|
| 140 |
+
'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
|
| 141 |
+
'what', 'where', 'when', 'why', 'how', 'which', 'who', 'whose', 'whom',
|
| 142 |
+
'please', 'could', 'you', 'tell', 'me', 'find', 'search', 'for', 'about'
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
# Split into words and filter
|
| 146 |
+
words = re.findall(r'\b\w+\b', query.lower())
|
| 147 |
+
key_words = [word for word in words if word not in stop_words and len(word) > 2]
|
| 148 |
+
|
| 149 |
+
# Keep important phrases and entities
|
| 150 |
+
# Look for quoted phrases, proper nouns, numbers, dates
|
| 151 |
+
important_patterns = [
|
| 152 |
+
r'"[^"]*"', # Quoted phrases
|
| 153 |
+
r'\b[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*\b', # Proper nouns
|
| 154 |
+
r'\b\d{4}\b', # Years
|
| 155 |
+
r'\b\d+\b', # Numbers
|
| 156 |
+
]
|
| 157 |
+
|
| 158 |
+
important_terms = []
|
| 159 |
+
for pattern in important_patterns:
|
| 160 |
+
matches = re.findall(pattern, query)
|
| 161 |
+
important_terms.extend(matches)
|
| 162 |
+
|
| 163 |
+
# Combine key words and important terms
|
| 164 |
+
search_terms = []
|
| 165 |
+
|
| 166 |
+
# Add important terms first (they're usually more specific)
|
| 167 |
+
for term in important_terms:
|
| 168 |
+
if len(' '.join(search_terms + [term])) <= max_length:
|
| 169 |
+
search_terms.append(term)
|
| 170 |
+
|
| 171 |
+
# Add key words until we hit the limit
|
| 172 |
+
for word in key_words:
|
| 173 |
+
potential_query = ' '.join(search_terms + [word])
|
| 174 |
+
if len(potential_query) <= max_length:
|
| 175 |
+
search_terms.append(word)
|
| 176 |
+
else:
|
| 177 |
+
break
|
| 178 |
+
|
| 179 |
+
result = ' '.join(search_terms)
|
| 180 |
+
|
| 181 |
+
# If still too long, truncate
|
| 182 |
+
if len(result) > max_length:
|
| 183 |
+
result = result[:max_length].rsplit(' ', 1)[0]
|
| 184 |
+
|
| 185 |
+
# If we ended up with nothing, use first part of original query
|
| 186 |
+
if not result.strip():
|
| 187 |
+
result = query[:max_length].rsplit(' ', 1)[0]
|
| 188 |
+
|
| 189 |
+
if result != query:
|
| 190 |
+
logger.info(f"📝 Extracted search terms: '{result}' from '{query[:100]}...'")
|
| 191 |
+
|
| 192 |
+
return result
|
| 193 |
+
|
| 194 |
def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 195 |
"""
|
| 196 |
+
Search the web using available search engines in priority order
|
| 197 |
"""
|
| 198 |
|
| 199 |
+
# Extract search terms to avoid length issues
|
| 200 |
+
search_query = self._extract_search_terms(query, max_length=250)
|
| 201 |
+
|
| 202 |
+
# Try DuckDuckGo first (most comprehensive for general web search)
|
| 203 |
+
if self.use_duckduckgo:
|
| 204 |
+
try:
|
| 205 |
+
return self._search_with_duckduckgo(search_query, limit, extract_content)
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.warning(f"DuckDuckGo search failed, trying Tavily: {e}")
|
| 208 |
+
|
| 209 |
+
# Try Tavily if DuckDuckGo fails and API key is available
|
| 210 |
if self.use_tavily:
|
| 211 |
try:
|
| 212 |
+
return self._search_with_tavily(search_query, limit, extract_content)
|
| 213 |
except Exception as e:
|
| 214 |
+
logger.warning(f"Tavily search failed, trying Wikipedia: {e}")
|
| 215 |
|
| 216 |
# Fallback to Wikipedia search
|
| 217 |
+
if self.use_wikipedia:
|
| 218 |
+
return self._search_with_wikipedia(search_query, limit)
|
| 219 |
+
|
| 220 |
+
# No search engines available
|
| 221 |
+
return {
|
| 222 |
+
"query": query,
|
| 223 |
+
"found": False,
|
| 224 |
+
"message": "❌ No search engines available. Please install required packages.",
|
| 225 |
+
"results": []
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
def _search_with_duckduckgo(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 229 |
+
"""
|
| 230 |
+
Search using DuckDuckGo - primary search engine
|
| 231 |
+
"""
|
| 232 |
+
try:
|
| 233 |
+
logger.info(f"🦆 DuckDuckGo search for: {query}")
|
| 234 |
+
|
| 235 |
+
# Add retry logic for DuckDuckGo rate limiting
|
| 236 |
+
max_retries = 3
|
| 237 |
+
retry_delay = 2
|
| 238 |
+
|
| 239 |
+
for attempt in range(max_retries):
|
| 240 |
+
try:
|
| 241 |
+
# Use DuckDuckGo text search
|
| 242 |
+
ddg_results = list(self.ddgs.text(query, max_results=min(limit, 10)))
|
| 243 |
+
|
| 244 |
+
if not ddg_results:
|
| 245 |
+
if attempt < max_retries - 1:
|
| 246 |
+
logger.warning(f"DuckDuckGo returned no results, retrying in {retry_delay}s...")
|
| 247 |
+
time.sleep(retry_delay)
|
| 248 |
+
retry_delay *= 2
|
| 249 |
+
continue
|
| 250 |
+
else:
|
| 251 |
+
logger.warning("DuckDuckGo returned no results after retries")
|
| 252 |
+
# Fall back to other search engines
|
| 253 |
+
return self._search_with_fallback(query, limit)
|
| 254 |
+
|
| 255 |
+
break
|
| 256 |
+
|
| 257 |
+
except Exception as e:
|
| 258 |
+
if "rate limit" in str(e).lower() or "429" in str(e):
|
| 259 |
+
if attempt < max_retries - 1:
|
| 260 |
+
logger.warning(f"DuckDuckGo rate limited, retrying in {retry_delay}s...")
|
| 261 |
+
time.sleep(retry_delay)
|
| 262 |
+
retry_delay *= 2
|
| 263 |
+
continue
|
| 264 |
+
else:
|
| 265 |
+
logger.warning("DuckDuckGo rate limited after retries, using fallback")
|
| 266 |
+
return self._search_with_fallback(query, limit)
|
| 267 |
+
else:
|
| 268 |
+
raise
|
| 269 |
+
|
| 270 |
+
# Process DuckDuckGo results
|
| 271 |
+
results = []
|
| 272 |
+
for result in ddg_results:
|
| 273 |
+
web_result = WebSearchResult(
|
| 274 |
+
title=result.get('title', 'No title'),
|
| 275 |
+
url=result.get('href', ''),
|
| 276 |
+
snippet=result.get('body', 'No description'),
|
| 277 |
+
content='' # DuckDuckGo doesn't provide full content
|
| 278 |
+
)
|
| 279 |
+
results.append(web_result.to_dict())
|
| 280 |
+
|
| 281 |
+
# Extract content if requested
|
| 282 |
+
if extract_content and results:
|
| 283 |
+
for result in results[:2]: # Only extract from first 2 results to save time
|
| 284 |
+
try:
|
| 285 |
+
content_result = self._extract_content_from_url(result['url'])
|
| 286 |
+
if content_result.get('found'):
|
| 287 |
+
result['content'] = content_result.get('content', '')[:1000]
|
| 288 |
+
except:
|
| 289 |
+
pass # Skip content extraction errors
|
| 290 |
+
|
| 291 |
+
logger.info(f"✅ DuckDuckGo found {len(results)} results")
|
| 292 |
+
return {
|
| 293 |
+
"query": query,
|
| 294 |
+
"found": True,
|
| 295 |
+
"results": results,
|
| 296 |
+
"total_results": len(results),
|
| 297 |
+
"message": f"Found {len(results)} results via DuckDuckGo",
|
| 298 |
+
"search_engine": "duckduckgo"
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
except Exception as e:
|
| 302 |
+
logger.error(f"DuckDuckGo search error: {e}")
|
| 303 |
+
# Fall back to other search engines
|
| 304 |
+
return self._search_with_fallback(query, limit)
|
| 305 |
+
|
| 306 |
+
def _search_with_fallback(self, query: str, limit: int) -> Dict[str, Any]:
|
| 307 |
+
"""Try fallback search engines"""
|
| 308 |
+
# Try Tavily if available
|
| 309 |
+
if self.use_tavily:
|
| 310 |
+
try:
|
| 311 |
+
return self._search_with_tavily(query, limit, False)
|
| 312 |
+
except Exception as e:
|
| 313 |
+
logger.warning(f"Tavily fallback failed: {e}")
|
| 314 |
+
|
| 315 |
+
# Try Wikipedia as last resort
|
| 316 |
+
if self.use_wikipedia:
|
| 317 |
+
return self._search_with_wikipedia(query, limit)
|
| 318 |
+
|
| 319 |
+
return {
|
| 320 |
+
"query": query,
|
| 321 |
+
"found": False,
|
| 322 |
+
"message": "All search engines failed",
|
| 323 |
+
"results": []
|
| 324 |
+
}
|
| 325 |
|
| 326 |
def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
|
| 327 |
"""
|
| 328 |
+
Search using Tavily Search API - secondary search engine
|
| 329 |
"""
|
| 330 |
try:
|
| 331 |
logger.info(f"🔍 Tavily search for: {query}")
|
|
|
|
| 342 |
"include_answer": False,
|
| 343 |
"include_images": False,
|
| 344 |
"include_raw_content": extract_content,
|
| 345 |
+
"max_results": min(limit, 10)
|
| 346 |
}
|
| 347 |
|
| 348 |
# Make API request
|
|
|
|
| 380 |
"search_engine": "tavily"
|
| 381 |
}
|
| 382 |
else:
|
| 383 |
+
logger.warning("Tavily returned no results")
|
| 384 |
+
# Fall back to Wikipedia
|
| 385 |
+
if self.use_wikipedia:
|
| 386 |
+
return self._search_with_wikipedia(query, limit)
|
| 387 |
|
| 388 |
except requests.exceptions.RequestException as e:
|
| 389 |
logger.error(f"Tavily API request failed: {e}")
|
|
|
|
|
|
|
| 390 |
except Exception as e:
|
| 391 |
logger.error(f"Tavily search error: {e}")
|
| 392 |
+
|
| 393 |
+
# Fall back to Wikipedia if Tavily fails
|
| 394 |
+
if self.use_wikipedia:
|
| 395 |
return self._search_with_wikipedia(query, limit)
|
| 396 |
+
|
| 397 |
+
return {
|
| 398 |
+
"query": query,
|
| 399 |
+
"found": False,
|
| 400 |
+
"message": "Tavily search failed and no fallback available",
|
| 401 |
+
"results": []
|
| 402 |
+
}
|
| 403 |
|
| 404 |
def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
| 405 |
"""
|
| 406 |
+
Search using Wikipedia - fallback search engine for factual information
|
| 407 |
"""
|
| 408 |
try:
|
| 409 |
logger.info(f"📚 Wikipedia search for: {query}")
|
| 410 |
|
| 411 |
+
self.wikipedia.set_lang("en")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
+
# Clean up query for Wikipedia search and ensure it's not too long
|
| 414 |
+
search_terms = self._extract_search_terms(query, max_length=100) # Wikipedia has stricter limits
|
| 415 |
|
| 416 |
# Search Wikipedia pages
|
| 417 |
+
wiki_results = self.wikipedia.search(search_terms, results=min(limit * 2, 10))
|
| 418 |
|
| 419 |
if not wiki_results:
|
| 420 |
return {
|
|
|
|
| 433 |
break
|
| 434 |
|
| 435 |
try:
|
| 436 |
+
page = self.wikipedia.page(page_title)
|
| 437 |
summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
|
| 438 |
|
| 439 |
web_result = WebSearchResult(
|
|
|
|
| 445 |
results.append(web_result.to_dict())
|
| 446 |
processed += 1
|
| 447 |
|
| 448 |
+
except self.wikipedia.exceptions.DisambiguationError as e:
|
| 449 |
# Try the first suggestion from disambiguation
|
| 450 |
try:
|
| 451 |
if e.options:
|
| 452 |
+
page = self.wikipedia.page(e.options[0])
|
| 453 |
summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
|
| 454 |
|
| 455 |
web_result = WebSearchResult(
|
|
|
|
| 463 |
except:
|
| 464 |
continue
|
| 465 |
|
| 466 |
+
except self.wikipedia.exceptions.PageError:
|
| 467 |
# Page doesn't exist, skip
|
| 468 |
continue
|
| 469 |
except Exception as e:
|
|
|
|
| 495 |
return {
|
| 496 |
"query": query,
|
| 497 |
"found": False,
|
| 498 |
+
"message": f"Wikipedia search failed: {str(e)}",
|
| 499 |
"results": [],
|
| 500 |
"error_type": "search_failure"
|
| 501 |
}
|
|
|
|
| 608 |
combined_content = re.sub(r' +', ' ', combined_content) # Multiple spaces
|
| 609 |
|
| 610 |
return combined_content.strip()[:5000] # Limit to 5000 characters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
def test_web_search_tool():
|
| 613 |
"""Test the web search tool with various queries"""
|
|
|
|
| 616 |
# Test cases
|
| 617 |
test_cases = [
|
| 618 |
"Python programming tutorial",
|
| 619 |
+
"Mercedes Sosa studio albums 2000 2009",
|
| 620 |
+
"artificial intelligence recent developments",
|
| 621 |
+
"climate change latest research",
|
| 622 |
+
"https://en.wikipedia.org/wiki/Machine_learning"
|
| 623 |
]
|
| 624 |
|
| 625 |
print("🧪 Testing Web Search Tool...")
|