Chris committed on
Commit
a178cd6
·
1 Parent(s): 6f7648f

Final 6.6.3

Browse files
.gitignore CHANGED
@@ -7,3 +7,4 @@ test_*.py
7
  debug_*.py
8
  *_debug*.py
9
  tests/
 
 
7
  debug_*.py
8
  *_debug*.py
9
  tests/
10
+ *.log
requirements.txt CHANGED
@@ -6,7 +6,6 @@ beautifulsoup4==4.13.0
6
  certifi==2025.4.26
7
  charset-normalizer==3.4.2
8
  click==8.2.1
9
- duckduckgo-search==7.2.0
10
  exceptiongroup==1.3.0
11
  fastapi==0.115.12
12
  ffmpy==0.5.0
@@ -86,6 +85,7 @@ sniffio==1.3.1
86
  SQLAlchemy==2.0.41
87
  starlette==0.46.2
88
  sympy==1.14.0
 
89
  tenacity==9.1.2
90
  threadpoolctl==3.6.0
91
  tokenizers==0.21.1
@@ -101,6 +101,7 @@ tzdata==2025.2
101
  urllib3==2.4.0
102
  uvicorn==0.34.2
103
  websockets==15.0.1
 
104
  Wikipedia-API==0.7.1
105
  xxhash==3.5.0
106
  zstandard==0.23.0
 
6
  certifi==2025.4.26
7
  charset-normalizer==3.4.2
8
  click==8.2.1
 
9
  exceptiongroup==1.3.0
10
  fastapi==0.115.12
11
  ffmpy==0.5.0
 
85
  SQLAlchemy==2.0.41
86
  starlette==0.46.2
87
  sympy==1.14.0
88
+ tavily-python==0.5.0
89
  tenacity==9.1.2
90
  threadpoolctl==3.6.0
91
  tokenizers==0.21.1
 
101
  urllib3==2.4.0
102
  uvicorn==0.34.2
103
  websockets==15.0.1
104
+ wikipedia==1.4.0
105
  Wikipedia-API==0.7.1
106
  xxhash==3.5.0
107
  zstandard==0.23.0
src/app.py CHANGED
@@ -1756,14 +1756,14 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
1756
 
1757
  **LangGraph Multi-Agent Workflow:**
1758
  - **Router Agent**: Classifies questions and selects appropriate specialized agents
1759
- - **Web Research Agent**: Handles Wikipedia searches and web research with DuckDuckGo
1760
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1761
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1762
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1763
 
1764
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1765
 
1766
- **Tools Available**: Wikipedia API, DuckDuckGo web search, mathematical calculator, multi-format file processor
1767
 
1768
  ### 📈 Performance Metrics
1769
  - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
@@ -1771,10 +1771,12 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
1771
  - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
1772
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
1773
  - **Reliability**: Robust error handling and graceful degradation within workflow
 
1774
 
1775
  ### 🎯 Authentication Requirements
1776
  - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
1777
  - **OAuth with Inference Scope**: Full access to Qwen 2.5 models via HuggingFace Inference API
 
1778
  - **No Fallback Options**: System requires proper authentication for multi-agent functionality
1779
  """)
1780
 
 
1756
 
1757
  **LangGraph Multi-Agent Workflow:**
1758
  - **Router Agent**: Classifies questions and selects appropriate specialized agents
1759
+ - **Web Research Agent**: Handles Wikipedia searches and web research with Tavily API + Wikipedia fallback
1760
  - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
1761
  - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
1762
  - **Synthesizer Agent**: Combines results from multiple agents into final answers
1763
 
1764
  **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance
1765
 
1766
+ **Tools Available**: Wikipedia API, Tavily web search (with Wikipedia fallback), mathematical calculator, multi-format file processor
1767
 
1768
  ### 📈 Performance Metrics
1769
  - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
 
1771
  - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
1772
  - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
1773
  - **Reliability**: Robust error handling and graceful degradation within workflow
1774
+ - **Web Search**: Reliable Tavily API with Wikipedia fallback (no rate limiting issues)
1775
 
1776
  ### 🎯 Authentication Requirements
1777
  - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
1778
  - **OAuth with Inference Scope**: Full access to Qwen 2.5 models via HuggingFace Inference API
1779
+ - **Optional**: TAVILY_API_KEY for enhanced web search capabilities (1,000 free searches/month)
1780
  - **No Fallback Options**: System requires proper authentication for multi-agent functionality
1781
  """)
1782
 
src/models/__pycache__/qwen_client.cpython-310.pyc CHANGED
Binary files a/src/models/__pycache__/qwen_client.cpython-310.pyc and b/src/models/__pycache__/qwen_client.cpython-310.pyc differ
 
src/requirements.txt CHANGED
@@ -1,20 +1,115 @@
1
  # Core dependencies
2
- gradio==4.44.0
3
- langchain==0.3.9
4
  langchain-community==0.3.7
5
- langchain-core==0.3.18
6
- langgraph==0.2.45
7
  requests==2.32.3
8
  pandas==2.2.3
9
- huggingface-hub==0.26.2
10
- transformers==4.46.3
11
  wikipedia-api==0.7.1
12
- duckduckgo-search==6.3.4
13
- Pillow==10.4.0
14
- openpyxl==3.1.5
15
- pydub==0.25.1
16
- speechrecognition==3.11.0
17
 
18
  # OAuth dependencies for Gradio
19
  itsdangerous>=2.0.0
20
- gradio[oauth]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Core dependencies
2
+ gradio==5.31.0
3
+ langchain==0.3.25
4
  langchain-community==0.3.7
5
+ langchain-core==0.3.62
6
+ langgraph==0.4.7
7
  requests==2.32.3
8
  pandas==2.2.3
9
+ huggingface-hub==0.32.2
10
+ transformers==4.52.3
11
  wikipedia-api==0.7.1
12
+ wikipedia==1.4.0
 
 
 
 
13
 
14
  # OAuth dependencies for Gradio
15
  itsdangerous>=2.0.0
16
+ gradio[oauth]
17
+
18
+ # New dependencies
19
+ aiofiles==24.1.0
20
+ annotated-types==0.7.0
21
+ anyio==4.9.0
22
+ async-timeout==4.0.3
23
+ beautifulsoup4==4.13.0
24
+ certifi==2025.4.26
25
+ charset-normalizer==3.4.2
26
+ click==8.2.1
27
+ exceptiongroup==1.3.0
28
+ fastapi==0.115.12
29
+ ffmpy==0.5.0
30
+ filelock==3.18.0
31
+ fsspec==2025.5.1
32
+ gradio_client==1.10.1
33
+ greenlet==3.2.2
34
+ groovy==0.1.2
35
+ h11==0.16.0
36
+ hf-xet==1.1.2
37
+ httpcore==1.0.9
38
+ httpx==0.28.1
39
+ idna==3.10
40
+ Jinja2==3.1.6
41
+ joblib==1.5.1
42
+ jsonpatch==1.33
43
+ jsonpointer==3.0.0
44
+ langchain-huggingface==0.2.0
45
+ langchain-text-splitters==0.3.8
46
+ langgraph-checkpoint==2.0.26
47
+ langgraph-prebuilt==0.2.2
48
+ langgraph-sdk==0.1.70
49
+ langsmith==0.3.43
50
+ markdown-it-py==3.0.0
51
+ MarkupSafe==3.0.2
52
+ mdurl==0.1.2
53
+ mpmath==1.3.0
54
+ networkx==3.4.2
55
+ numpy==2.2.6
56
+ nvidia-cublas-cu12==12.6.4.1
57
+ nvidia-cuda-cupti-cu12==12.6.80
58
+ nvidia-cuda-nvrtc-cu12==12.6.77
59
+ nvidia-cuda-runtime-cu12==12.6.77
60
+ nvidia-cudnn-cu12==9.5.1.17
61
+ nvidia-cufft-cu12==11.3.0.4
62
+ nvidia-cufile-cu12==1.11.1.6
63
+ nvidia-curand-cu12==10.3.7.77
64
+ nvidia-cusolver-cu12==11.7.1.2
65
+ nvidia-cusparse-cu12==12.5.4.2
66
+ nvidia-cusparselt-cu12==0.6.3
67
+ nvidia-nccl-cu12==2.26.2
68
+ nvidia-nvjitlink-cu12==12.6.85
69
+ nvidia-nvtx-cu12==12.6.77
70
+ orjson==3.10.18
71
+ ormsgpack==1.10.0
72
+ packaging==24.2
73
+ pillow==11.2.1
74
+ pydantic==2.11.5
75
+ pydantic_core==2.33.2
76
+ pydub==0.25.1
77
+ Pygments==2.19.1
78
+ python-dateutil==2.9.0.post0
79
+ python-dotenv==1.1.0
80
+ python-multipart==0.0.20
81
+ pytz==2025.2
82
+ PyYAML==6.0.2
83
+ regex==2024.11.6
84
+ requests-toolbelt==1.0.0
85
+ rich==14.0.0
86
+ ruff==0.11.11
87
+ safehttpx==0.1.6
88
+ safetensors==0.5.3
89
+ scikit-learn==1.6.1
90
+ scipy==1.15.3
91
+ semantic-version==2.10.0
92
+ sentence-transformers==4.1.0
93
+ shellingham==1.5.4
94
+ six==1.17.0
95
+ sniffio==1.3.1
96
+ SQLAlchemy==2.0.41
97
+ starlette==0.46.2
98
+ sympy==1.14.0
99
+ tavily-python==0.5.0
100
+ tenacity==9.1.2
101
+ threadpoolctl==3.6.0
102
+ tokenizers==0.21.1
103
+ tomlkit==0.13.2
104
+ torch==2.7.0
105
+ tqdm==4.67.1
106
+ triton==3.3.0
107
+ typer==0.16.0
108
+ typing-inspection==0.4.1
109
+ typing_extensions==4.13.2
110
+ tzdata==2025.2
111
+ urllib3==2.4.0
112
+ uvicorn==0.34.2
113
+ websockets==15.0.1
114
+ xxhash==3.5.0
115
+ zstandard==0.23.0
src/tools/__pycache__/web_search_tool.cpython-310.pyc CHANGED
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
 
src/tools/web_search_tool.py CHANGED
@@ -1,17 +1,17 @@
1
  #!/usr/bin/env python3
2
  """
3
  Web Search Tool for GAIA Agent System
4
- Handles web searches using DuckDuckGo and content extraction from URLs
5
  """
6
 
7
  import re
8
  import logging
9
  import time
 
10
  from typing import Dict, List, Optional, Any
11
  from urllib.parse import urlparse, urljoin
12
  import requests
13
  from bs4 import BeautifulSoup
14
- from duckduckgo_search import DDGS
15
 
16
  from tools import BaseTool
17
 
@@ -36,8 +36,8 @@ class WebSearchResult:
36
 
37
  class WebSearchTool(BaseTool):
38
  """
39
- Web search tool using DuckDuckGo
40
- Handles searches, URL content extraction, and result filtering
41
  """
42
 
43
  def __init__(self):
@@ -50,6 +50,15 @@ class WebSearchTool(BaseTool):
50
  })
51
  self.session.timeout = 10
52
 
 
 
 
 
 
 
 
 
 
53
  def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
54
  """
55
  Execute web search operations based on input type
@@ -88,143 +97,197 @@ class WebSearchTool(BaseTool):
88
 
89
  def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
90
  """
91
- Search the web using DuckDuckGo with enhanced rate limiting handling
92
  """
93
 
94
- for attempt in range(3):
 
95
  try:
96
- logger.info(f"Searching web for: {query} (attempt {attempt + 1}/3)")
97
-
98
- # Progressive delays to handle rate limiting
99
- if attempt > 0:
100
- delay = 5 * (2 ** (attempt - 1)) # 5s, 10s delays
101
- logger.info(f"Waiting {delay}s before retry due to rate limiting...")
102
- time.sleep(delay)
103
-
104
- with DDGS() as ddgs:
105
- # Use DuckDuckGo search with proper parameters
106
- search_results = list(ddgs.text(
107
- keywords=query,
108
- max_results=limit,
109
- region='us-en',
110
- safesearch='moderate'
111
- ))
112
-
113
- if not search_results:
114
- if attempt < 2:
115
- logger.warning(f"No results on attempt {attempt + 1}, retrying...")
116
- continue
117
- else:
118
- return {
119
- "query": query,
120
- "found": False,
121
- "message": "No web search results found after retries",
122
- "results": []
123
- }
124
-
125
- results = []
126
- for result in search_results:
127
- try:
128
- web_result = WebSearchResult(
129
- title=result.get('title', 'No title'),
130
- url=result.get('href', ''),
131
- snippet=result.get('body', 'No description')
132
- )
133
-
134
- # Optionally extract full content from each URL
135
- if extract_content and web_result.url:
136
- try:
137
- content_result = self._extract_content_from_url(web_result.url)
138
- if content_result.get('found'):
139
- web_result.content = content_result['content'][:1000] # Limit content size
140
- except Exception as e:
141
- logger.warning(f"Failed to extract content from {web_result.url}: {e}")
142
- # Continue without content extraction rather than failing
143
-
144
- results.append(web_result.to_dict())
145
-
146
- except Exception as result_error:
147
- logger.warning(f"Error processing search result: {result_error}")
148
- # Continue with other results rather than failing entire search
149
- continue
150
-
151
- # Return successful results even if some individual results failed
152
  return {
153
  "query": query,
154
- "found": len(results) > 0,
155
  "results": results,
156
  "total_results": len(results),
157
- "message": f"Found {len(results)} web search results"
 
158
  }
159
-
160
- except Exception as e:
161
- error_msg = str(e)
162
- if "ratelimit" in error_msg.lower() or "rate limit" in error_msg.lower() or "403" in error_msg or "202" in error_msg or "429" in error_msg:
163
- logger.warning(f"Web search attempt {attempt + 1} failed: {error_msg}")
164
- if attempt < 2:
165
- continue
166
- else:
167
- logger.error(f"Web search attempt {attempt + 1} failed with non-rate-limit error: {error_msg}")
168
- if attempt < 2:
169
- continue
170
-
171
- # If all attempts failed, try fallback search strategy
172
- logger.warning("All DuckDuckGo attempts failed, trying fallback search strategy...")
173
- return self._fallback_search(query)
174
 
175
- def _fallback_search(self, query: str) -> Dict[str, Any]:
176
  """
177
- Fallback search strategy when DuckDuckGo is completely unavailable
178
  """
179
  try:
180
- # Try a simple Wikipedia search as fallback
181
- import wikipedia
 
 
 
 
 
 
 
 
 
 
 
182
  wikipedia.set_lang("en")
183
 
184
- # Extract key terms from query for Wikipedia search
185
  search_terms = query.replace("site:", "").strip()
186
 
187
- try:
188
- # Search Wikipedia pages
189
- wiki_results = wikipedia.search(search_terms, results=3)
190
- if wiki_results:
191
- fallback_results = []
192
- for i, page_title in enumerate(wiki_results[:2], 1):
193
- try:
194
- page = wikipedia.page(page_title)
195
- summary = page.summary[:200] + "..." if len(page.summary) > 200 else page.summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  web_result = WebSearchResult(
198
- title=f"{page_title} (Wikipedia)",
199
  url=page.url,
200
- snippet=summary
 
201
  )
202
- fallback_results.append(web_result.to_dict())
203
- except:
204
- continue
205
-
206
- if fallback_results:
207
- return {
208
- "query": query,
209
- "found": True,
210
- "results": fallback_results,
211
- "total_results": len(fallback_results),
212
- "message": f"Using Wikipedia fallback search. Found {len(fallback_results)} results"
213
- }
214
- except:
215
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- except ImportError:
218
- pass
219
-
220
- # Last resort: return a helpful message
221
- return {
222
- "query": query,
223
- "found": False,
224
- "message": "❌ Web search failed due to rate limiting. Please try again later or provide the information directly.",
225
- "results": [],
226
- "error_type": "search_failure"
227
- }
228
 
229
  def _extract_content_from_url(self, url: str) -> Dict[str, Any]:
230
  """
@@ -343,23 +406,21 @@ class WebSearchTool(BaseTool):
343
  # Search specifically for YouTube videos
344
  youtube_query = f"site:youtube.com {query}"
345
 
346
- with DDGS() as ddgs:
347
- search_results = list(ddgs.text(
348
- keywords=youtube_query,
349
- max_results=3,
350
- region='us-en',
351
- safesearch='moderate'
352
- ))
353
 
354
  youtube_results = []
355
- for result in search_results:
356
- if 'youtube.com/watch' in result.get('href', ''):
357
- video_id = self._extract_youtube_id(result['href'])
358
 
359
  youtube_result = {
360
  "title": result.get('title', 'No title'),
361
- "url": result.get('href', ''),
362
- "description": result.get('body', 'No description'),
363
  "video_id": video_id
364
  }
365
  youtube_results.append(youtube_result)
@@ -410,6 +471,9 @@ def test_web_search_tool():
410
 
411
  if result.success:
412
  print(f"✅ Success: {result.result.get('message', 'No message')}")
 
 
 
413
  if result.result.get('found'):
414
  if 'results' in result.result:
415
  print(f" Found {len(result.result['results'])} results")
 
1
  #!/usr/bin/env python3
2
  """
3
  Web Search Tool for GAIA Agent System
4
+ Handles web searches using Tavily API (primary) and Wikipedia (fallback)
5
  """
6
 
7
  import re
8
  import logging
9
  import time
10
+ import os
11
  from typing import Dict, List, Optional, Any
12
  from urllib.parse import urlparse, urljoin
13
  import requests
14
  from bs4 import BeautifulSoup
 
15
 
16
  from tools import BaseTool
17
 
 
36
 
37
  class WebSearchTool(BaseTool):
38
  """
39
+ Web search tool using Tavily API (primary) and Wikipedia (fallback)
40
+ Much more reliable than DuckDuckGo with no rate limiting issues
41
  """
42
 
43
  def __init__(self):
 
50
  })
51
  self.session.timeout = 10
52
 
53
+ # Initialize Tavily client if API key is available
54
+ self.tavily_api_key = os.getenv("TAVILY_API_KEY")
55
+ self.use_tavily = self.tavily_api_key is not None
56
+
57
+ if self.use_tavily:
58
+ logger.info("✅ Tavily API key found - using Tavily for web search")
59
+ else:
60
+ logger.info("ℹ️ No Tavily API key found - will use Wikipedia fallback only")
61
+
62
  def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
63
  """
64
  Execute web search operations based on input type
 
97
 
98
  def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
99
  """
100
+ Search the web using Tavily API (primary) or Wikipedia (fallback)
101
  """
102
 
103
+ # Try Tavily first if API key is available
104
+ if self.use_tavily:
105
  try:
106
+ return self._search_with_tavily(query, limit, extract_content)
107
+ except Exception as e:
108
+ logger.warning(f"Tavily search failed, falling back to Wikipedia: {e}")
109
+
110
+ # Fallback to Wikipedia search
111
+ return self._search_with_wikipedia(query, limit)
112
+
113
+ def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
114
+ """
115
+ Search using Tavily Search API - much more reliable than DuckDuckGo
116
+ """
117
+ try:
118
+ logger.info(f"🔍 Tavily search for: {query}")
119
+
120
+ # Prepare Tavily API request
121
+ headers = {
122
+ "Content-Type": "application/json"
123
+ }
124
+
125
+ payload = {
126
+ "api_key": self.tavily_api_key,
127
+ "query": query,
128
+ "search_depth": "basic",
129
+ "include_answer": False,
130
+ "include_images": False,
131
+ "include_raw_content": extract_content,
132
+ "max_results": min(limit, 10) # Tavily supports up to 10 results
133
+ }
134
+
135
+ # Make API request
136
+ response = self.session.post(
137
+ "https://api.tavily.com/search",
138
+ json=payload,
139
+ headers=headers,
140
+ timeout=15
141
+ )
142
+ response.raise_for_status()
143
+
144
+ tavily_data = response.json()
145
+
146
+ # Process Tavily results
147
+ results = []
148
+ tavily_results = tavily_data.get('results', [])
149
+
150
+ for result in tavily_results:
151
+ web_result = WebSearchResult(
152
+ title=result.get('title', 'No title'),
153
+ url=result.get('url', ''),
154
+ snippet=result.get('content', 'No description'),
155
+ content=result.get('raw_content', '') if extract_content else ''
156
+ )
157
+ results.append(web_result.to_dict())
158
+
159
+ if results:
160
+ logger.info(f"✅ Tavily found {len(results)} results")
 
161
  return {
162
  "query": query,
163
+ "found": True,
164
  "results": results,
165
  "total_results": len(results),
166
+ "message": f"Found {len(results)} results via Tavily Search API",
167
+ "search_engine": "tavily"
168
  }
169
+ else:
170
+ logger.warning("Tavily returned no results, trying Wikipedia fallback")
171
+ return self._search_with_wikipedia(query, limit)
172
+
173
+ except requests.exceptions.RequestException as e:
174
+ logger.error(f"Tavily API request failed: {e}")
175
+ # Fall back to Wikipedia
176
+ return self._search_with_wikipedia(query, limit)
177
+ except Exception as e:
178
+ logger.error(f"Tavily search error: {e}")
179
+ # Fall back to Wikipedia
180
+ return self._search_with_wikipedia(query, limit)
 
 
 
181
 
182
+ def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
183
  """
184
+ Search using Wikipedia as fallback - very reliable and no rate limits
185
  """
186
  try:
187
+ logger.info(f"📚 Wikipedia search for: {query}")
188
+
189
+ # Try to import wikipedia library
190
+ try:
191
+ import wikipedia
192
+ except ImportError:
193
+ return {
194
+ "query": query,
195
+ "found": False,
196
+ "message": "❌ No search engines available. Install 'wikipedia' package or configure Tavily API key.",
197
+ "results": []
198
+ }
199
+
200
  wikipedia.set_lang("en")
201
 
202
+ # Clean up query for Wikipedia search
203
  search_terms = query.replace("site:", "").strip()
204
 
205
+ # Search Wikipedia pages
206
+ wiki_results = wikipedia.search(search_terms, results=min(limit * 2, 10))
207
+
208
+ if not wiki_results:
209
+ return {
210
+ "query": query,
211
+ "found": False,
212
+ "message": "No Wikipedia articles found for this query",
213
+ "results": [],
214
+ "search_engine": "wikipedia"
215
+ }
216
+
217
+ results = []
218
+ processed = 0
219
+
220
+ for page_title in wiki_results:
221
+ if processed >= limit:
222
+ break
223
+
224
+ try:
225
+ page = wikipedia.page(page_title)
226
+ summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
227
+
228
+ web_result = WebSearchResult(
229
+ title=f"{page_title} (Wikipedia)",
230
+ url=page.url,
231
+ snippet=summary,
232
+ content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
233
+ )
234
+ results.append(web_result.to_dict())
235
+ processed += 1
236
+
237
+ except wikipedia.exceptions.DisambiguationError as e:
238
+ # Try the first suggestion from disambiguation
239
+ try:
240
+ if e.options:
241
+ page = wikipedia.page(e.options[0])
242
+ summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
243
 
244
  web_result = WebSearchResult(
245
+ title=f"{e.options[0]} (Wikipedia)",
246
  url=page.url,
247
+ snippet=summary,
248
+ content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
249
  )
250
+ results.append(web_result.to_dict())
251
+ processed += 1
252
+ except:
253
+ continue
254
+
255
+ except wikipedia.exceptions.PageError:
256
+ # Page doesn't exist, skip
257
+ continue
258
+ except Exception as e:
259
+ # Other Wikipedia errors, skip this page
260
+ logger.warning(f"Wikipedia page error for '{page_title}': {e}")
261
+ continue
262
+
263
+ if results:
264
+ logger.info(f"✅ Wikipedia found {len(results)} results")
265
+ return {
266
+ "query": query,
267
+ "found": True,
268
+ "results": results,
269
+ "total_results": len(results),
270
+ "message": f"Found {len(results)} Wikipedia articles",
271
+ "search_engine": "wikipedia"
272
+ }
273
+ else:
274
+ return {
275
+ "query": query,
276
+ "found": False,
277
+ "message": "No accessible Wikipedia articles found for this query",
278
+ "results": [],
279
+ "search_engine": "wikipedia"
280
+ }
281
 
282
+ except Exception as e:
283
+ logger.error(f"Wikipedia search failed: {e}")
284
+ return {
285
+ "query": query,
286
+ "found": False,
287
+ "message": f"Search failed: {str(e)}",
288
+ "results": [],
289
+ "error_type": "search_failure"
290
+ }
 
 
291
 
292
  def _extract_content_from_url(self, url: str) -> Dict[str, Any]:
293
  """
 
406
  # Search specifically for YouTube videos
407
  youtube_query = f"site:youtube.com {query}"
408
 
409
+ # Use the same search logic but filter for YouTube results
410
+ search_result = self._search_web(youtube_query, limit=3)
411
+
412
+ if not search_result.get('found'):
413
+ return search_result
 
 
414
 
415
  youtube_results = []
416
+ for result in search_result.get('results', []):
417
+ if 'youtube.com/watch' in result.get('url', ''):
418
+ video_id = self._extract_youtube_id(result['url'])
419
 
420
  youtube_result = {
421
  "title": result.get('title', 'No title'),
422
+ "url": result.get('url', ''),
423
+ "description": result.get('snippet', 'No description'),
424
  "video_id": video_id
425
  }
426
  youtube_results.append(youtube_result)
 
471
 
472
  if result.success:
473
  print(f"✅ Success: {result.result.get('message', 'No message')}")
474
+ search_engine = result.result.get('search_engine', 'unknown')
475
+ print(f" Search engine: {search_engine}")
476
+
477
  if result.result.get('found'):
478
  if 'results' in result.result:
479
  print(f" Found {len(result.result['results'])} results")