Chris committed · Commit a178cd6 · Parent(s): 6f7648f
Final 6.6.3

Files changed:
- .gitignore +1 -0
- requirements.txt +2 -1
- src/app.py +4 -2
- src/models/__pycache__/qwen_client.cpython-310.pyc +0 -0
- src/requirements.txt +107 -12
- src/tools/__pycache__/web_search_tool.cpython-310.pyc +0 -0
- src/tools/web_search_tool.py +196 -132
.gitignore
CHANGED
@@ -7,3 +7,4 @@ test_*.py
 debug_*.py
 *_debug*.py
 tests/
+*.log
requirements.txt
CHANGED
@@ -6,7 +6,6 @@ beautifulsoup4==4.13.0
 certifi==2025.4.26
 charset-normalizer==3.4.2
 click==8.2.1
-duckduckgo-search==7.2.0
 exceptiongroup==1.3.0
 fastapi==0.115.12
 ffmpy==0.5.0
@@ -86,6 +85,7 @@ sniffio==1.3.1
 SQLAlchemy==2.0.41
 starlette==0.46.2
 sympy==1.14.0
+tavily-python==0.5.0
 tenacity==9.1.2
 threadpoolctl==3.6.0
 tokenizers==0.21.1
@@ -101,6 +101,7 @@ tzdata==2025.2
 urllib3==2.4.0
 uvicorn==0.34.2
 websockets==15.0.1
+wikipedia==1.4.0
 Wikipedia-API==0.7.1
 xxhash==3.5.0
 zstandard==0.23.0
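The dependency change above replaces `duckduckgo-search` with `tavily-python` and adds the `wikipedia` package. For orientation, here is a minimal, hedged sketch of the client that the new pin provides; the tool in this commit actually calls the Tavily REST endpoint directly with `requests`, so the `TavilyClient` form below is an alternative based on the tavily-python package's documented interface, not code from this repository, and the query string is a made-up example.

```python
# Minimal sketch (not from this commit): using the tavily-python client pinned above.
# Assumes the documented TavilyClient API of tavily-python==0.5.0.
import os
from tavily import TavilyClient

client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
response = client.search(
    query="GAIA benchmark web research",  # hypothetical example query
    search_depth="basic",
    max_results=5,
)
for item in response.get("results", []):
    print(item["title"], "-", item["url"])
```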
src/app.py
CHANGED
@@ -1756,14 +1756,14 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat

 **LangGraph Multi-Agent Workflow:**
 - **Router Agent**: Classifies questions and selects appropriate specialized agents
-- **Web Research Agent**: Handles Wikipedia searches and web research with
+- **Web Research Agent**: Handles Wikipedia searches and web research with Tavily API + Wikipedia fallback
 - **File Processing Agent**: Processes uploaded files (CSV, images, code, audio)
 - **Reasoning Agent**: Handles mathematical calculations and logical reasoning
 - **Synthesizer Agent**: Combines results from multiple agents into final answers

 **Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance

-**Tools Available**: Wikipedia API,
+**Tools Available**: Wikipedia API, Tavily web search (with Wikipedia fallback), mathematical calculator, multi-format file processor

 ### 📈 Performance Metrics
 - **Success Rate**: 30%+ expected on GAIA benchmark with full authentication
@@ -1771,10 +1771,12 @@ Please click the "Sign in with Hugging Face" button above to access GAIA evaluat
 - **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection
 - **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis
 - **Reliability**: Robust error handling and graceful degradation within workflow
+- **Web Search**: Reliable Tavily API with Wikipedia fallback (no rate limiting issues)

 ### 🎯 Authentication Requirements
 - **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models
 - **OAuth with Inference Scope**: Full access to Qwen 2.5 models via HuggingFace Inference API
+- **Optional**: TAVILY_API_KEY for enhanced web search capabilities (1,000 free searches/month)
 - **No Fallback Options**: System requires proper authentication for multi-agent functionality
 """)

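The new documentation lines mention an optional TAVILY_API_KEY. Below is a short, hedged sketch of how that key is expected to reach the tool, based on the `os.getenv("TAVILY_API_KEY")` check added in `src/tools/web_search_tool.py` further down; on Hugging Face Spaces it would typically be a repository secret, while locally it can come from the environment or a `.env` file (python-dotenv is pinned in `src/requirements.txt`).

```python
# Hedged sketch: supplying the optional Tavily key for a local run. The tool only
# checks os.getenv("TAVILY_API_KEY"); loading a .env file is an assumption here.
import os
from dotenv import load_dotenv  # python-dotenv==1.1.0 is pinned in src/requirements.txt

load_dotenv()  # picks up TAVILY_API_KEY from a local .env file, if one exists

if os.getenv("TAVILY_API_KEY"):
    print("Tavily web search enabled")
else:
    print("No Tavily key found - Wikipedia fallback only")
```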
src/models/__pycache__/qwen_client.cpython-310.pyc
CHANGED
Binary files a/src/models/__pycache__/qwen_client.cpython-310.pyc and b/src/models/__pycache__/qwen_client.cpython-310.pyc differ
src/requirements.txt
CHANGED
@@ -1,20 +1,115 @@
 # Core dependencies
-gradio==
-langchain==0.3.
+gradio==5.31.0
+langchain==0.3.25
 langchain-community==0.3.7
-langchain-core==0.3.
-langgraph==0.
+langchain-core==0.3.62
+langgraph==0.4.7
 requests==2.32.3
 pandas==2.2.3
-huggingface-hub==0.
-transformers==4.
+huggingface-hub==0.32.2
+transformers==4.52.3
 wikipedia-api==0.7.1
-
-Pillow==10.4.0
-openpyxl==3.1.5
-pydub==0.25.1
-speechrecognition==3.11.0
+wikipedia==1.4.0

 # OAuth dependencies for Gradio
 itsdangerous>=2.0.0
-gradio[oauth]
+gradio[oauth]
+
+# New dependencies
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.9.0
+async-timeout==4.0.3
+beautifulsoup4==4.13.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.1
+exceptiongroup==1.3.0
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+fsspec==2025.5.1
+gradio_client==1.10.1
+greenlet==3.2.2
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.2
+httpcore==1.0.9
+httpx==0.28.1
+idna==3.10
+Jinja2==3.1.6
+joblib==1.5.1
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain-huggingface==0.2.0
+langchain-text-splitters==0.3.8
+langgraph-checkpoint==2.0.26
+langgraph-prebuilt==0.2.2
+langgraph-sdk==0.1.70
+langsmith==0.3.43
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.4.2
+numpy==2.2.6
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.5.1.17
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.6.3
+nvidia-nccl-cu12==2.26.2
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.6.77
+orjson==3.10.18
+ormsgpack==1.10.0
+packaging==24.2
+pillow==11.2.1
+pydantic==2.11.5
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests-toolbelt==1.0.0
+rich==14.0.0
+ruff==0.11.11
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.3
+semantic-version==2.10.0
+sentence-transformers==4.1.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+SQLAlchemy==2.0.41
+starlette==0.46.2
+sympy==1.14.0
+tavily-python==0.5.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.7.0
+tqdm==4.67.1
+triton==3.3.0
+typer==0.16.0
+typing-inspection==0.4.1
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.2
+websockets==15.0.1
+xxhash==3.5.0
+zstandard==0.23.0
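Since src/requirements.txt now pins the full environment (including torch and the CUDA wheels), a quick way to confirm an installed environment matches the new pins is to spot-check a few key versions. A small, hedged sketch:

```python
# Hedged sketch: spot-check a few of the pins introduced in this commit.
from importlib.metadata import version, PackageNotFoundError

pins = {
    "gradio": "5.31.0",
    "langgraph": "0.4.7",
    "tavily-python": "0.5.0",
    "wikipedia": "1.4.0",
}

for pkg, want in pins.items():
    try:
        have = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (pinned {want})")
        continue
    print(f"{pkg}: installed {have}, pinned {want}")
```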
src/tools/__pycache__/web_search_tool.cpython-310.pyc
CHANGED
Binary files a/src/tools/__pycache__/web_search_tool.cpython-310.pyc and b/src/tools/__pycache__/web_search_tool.cpython-310.pyc differ
src/tools/web_search_tool.py
CHANGED
@@ -1,17 +1,17 @@
 #!/usr/bin/env python3
 """
 Web Search Tool for GAIA Agent System
-Handles web searches using
+Handles web searches using Tavily API (primary) and Wikipedia (fallback)
 """

 import re
 import logging
 import time
+import os
 from typing import Dict, List, Optional, Any
 from urllib.parse import urlparse, urljoin
 import requests
 from bs4 import BeautifulSoup
-from duckduckgo_search import DDGS

 from tools import BaseTool

@@ -36,8 +36,8 @@ class WebSearchResult:

 class WebSearchTool(BaseTool):
     """
-    Web search tool using
-
+    Web search tool using Tavily API (primary) and Wikipedia (fallback)
+    Much more reliable than DuckDuckGo with no rate limiting issues
     """

     def __init__(self):
@@ -50,6 +50,15 @@ class WebSearchTool(BaseTool):
         })
         self.session.timeout = 10

+        # Initialize Tavily client if API key is available
+        self.tavily_api_key = os.getenv("TAVILY_API_KEY")
+        self.use_tavily = self.tavily_api_key is not None
+
+        if self.use_tavily:
+            logger.info("✅ Tavily API key found - using Tavily for web search")
+        else:
+            logger.info("ℹ️ No Tavily API key found - will use Wikipedia fallback only")
+
     def _execute_impl(self, input_data: Any, **kwargs) -> Dict[str, Any]:
         """
         Execute web search operations based on input type
@@ -88,143 +97,197 @@ class WebSearchTool(BaseTool):

     def _search_web(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
         """
-        Search the web using
+        Search the web using Tavily API (primary) or Wikipedia (fallback)
         """

+        # Try Tavily first if API key is available
+        if self.use_tavily:
             try:
+                return self._search_with_tavily(query, limit, extract_content)
+            except Exception as e:
+                logger.warning(f"Tavily search failed, falling back to Wikipedia: {e}")
+
+        # Fallback to Wikipedia search
+        return self._search_with_wikipedia(query, limit)
+
+    def _search_with_tavily(self, query: str, limit: int = 5, extract_content: bool = False) -> Dict[str, Any]:
+        """
+        Search using Tavily Search API - much more reliable than DuckDuckGo
+        """
+        try:
+            logger.info(f"🔍 Tavily search for: {query}")
+
+            # Prepare Tavily API request
+            headers = {
+                "Content-Type": "application/json"
+            }
+
+            payload = {
+                "api_key": self.tavily_api_key,
+                "query": query,
+                "search_depth": "basic",
+                "include_answer": False,
+                "include_images": False,
+                "include_raw_content": extract_content,
+                "max_results": min(limit, 10)  # Tavily supports up to 10 results
+            }
+
+            # Make API request
+            response = self.session.post(
+                "https://api.tavily.com/search",
+                json=payload,
+                headers=headers,
+                timeout=15
+            )
+            response.raise_for_status()
+
+            tavily_data = response.json()
+
+            # Process Tavily results
+            results = []
+            tavily_results = tavily_data.get('results', [])
+
+            for result in tavily_results:
+                web_result = WebSearchResult(
+                    title=result.get('title', 'No title'),
+                    url=result.get('url', ''),
+                    snippet=result.get('content', 'No description'),
+                    content=result.get('raw_content', '') if extract_content else ''
+                )
+                results.append(web_result.to_dict())
+
-            # Return successful results even if some individual results failed
+            if results:
+                logger.info(f"✅ Tavily found {len(results)} results")
                 return {
                     "query": query,
-                    "found":
+                    "found": True,
                     "results": results,
                     "total_results": len(results),
-                    "message": f"Found {len(results)}
+                    "message": f"Found {len(results)} results via Tavily Search API",
+                    "search_engine": "tavily"
                 }
-            # If all attempts failed, try fallback search strategy
-            logger.warning("All DuckDuckGo attempts failed, trying fallback search strategy...")
-            return self._fallback_search(query)
+            else:
+                logger.warning("Tavily returned no results, trying Wikipedia fallback")
+                return self._search_with_wikipedia(query, limit)
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Tavily API request failed: {e}")
+            # Fall back to Wikipedia
+            return self._search_with_wikipedia(query, limit)
+        except Exception as e:
+            logger.error(f"Tavily search error: {e}")
+            # Fall back to Wikipedia
+            return self._search_with_wikipedia(query, limit)

-    def
+    def _search_with_wikipedia(self, query: str, limit: int = 5) -> Dict[str, Any]:
         """
+        Search using Wikipedia as fallback - very reliable and no rate limits
         """
         try:
+            logger.info(f"📚 Wikipedia search for: {query}")
+
+            # Try to import wikipedia library
+            try:
+                import wikipedia
+            except ImportError:
+                return {
+                    "query": query,
+                    "found": False,
+                    "message": "❌ No search engines available. Install 'wikipedia' package or configure Tavily API key.",
+                    "results": []
+                }
+
             wikipedia.set_lang("en")

-            #
+            # Clean up query for Wikipedia search
             search_terms = query.replace("site:", "").strip()

+            # Search Wikipedia pages
+            wiki_results = wikipedia.search(search_terms, results=min(limit * 2, 10))
+
+            if not wiki_results:
+                return {
+                    "query": query,
+                    "found": False,
+                    "message": "No Wikipedia articles found for this query",
+                    "results": [],
+                    "search_engine": "wikipedia"
+                }
+
+            results = []
+            processed = 0
+
+            for page_title in wiki_results:
+                if processed >= limit:
+                    break
+
+                try:
+                    page = wikipedia.page(page_title)
+                    summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary
+
+                    web_result = WebSearchResult(
+                        title=f"{page_title} (Wikipedia)",
+                        url=page.url,
+                        snippet=summary,
+                        content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
+                    )
+                    results.append(web_result.to_dict())
+                    processed += 1
+
+                except wikipedia.exceptions.DisambiguationError as e:
+                    # Try the first suggestion from disambiguation
+                    try:
+                        if e.options:
+                            page = wikipedia.page(e.options[0])
+                            summary = page.summary[:300] + "..." if len(page.summary) > 300 else page.summary

                             web_result = WebSearchResult(
-                                title=f"{
+                                title=f"{e.options[0]} (Wikipedia)",
                                 url=page.url,
-                                snippet=summary
+                                snippet=summary,
+                                content=page.summary[:1000] + "..." if len(page.summary) > 1000 else page.summary
                             )
+                            results.append(web_result.to_dict())
+                            processed += 1
+                    except:
+                        continue
+
+                except wikipedia.exceptions.PageError:
+                    # Page doesn't exist, skip
+                    continue
+                except Exception as e:
+                    # Other Wikipedia errors, skip this page
+                    logger.warning(f"Wikipedia page error for '{page_title}': {e}")
+                    continue
+
+            if results:
+                logger.info(f"✅ Wikipedia found {len(results)} results")
+                return {
+                    "query": query,
+                    "found": True,
+                    "results": results,
+                    "total_results": len(results),
+                    "message": f"Found {len(results)} Wikipedia articles",
+                    "search_engine": "wikipedia"
+                }
+            else:
+                return {
+                    "query": query,
+                    "found": False,
+                    "message": "No accessible Wikipedia articles found for this query",
+                    "results": [],
+                    "search_engine": "wikipedia"
+                }

-        except
-                "error_type": "search_failure"
-            }
+        except Exception as e:
+            logger.error(f"Wikipedia search failed: {e}")
+            return {
+                "query": query,
+                "found": False,
+                "message": f"Search failed: {str(e)}",
+                "results": [],
+                "error_type": "search_failure"
+            }

     def _extract_content_from_url(self, url: str) -> Dict[str, Any]:
         """
@@ -343,23 +406,21 @@ class WebSearchTool(BaseTool):
         # Search specifically for YouTube videos
         youtube_query = f"site:youtube.com {query}"

-                safesearch='moderate'
-            ))
+        # Use the same search logic but filter for YouTube results
+        search_result = self._search_web(youtube_query, limit=3)
+
+        if not search_result.get('found'):
+            return search_result

         youtube_results = []
-        for result in
-            if 'youtube.com/watch' in result.get('
-                video_id = self._extract_youtube_id(result['
+        for result in search_result.get('results', []):
+            if 'youtube.com/watch' in result.get('url', ''):
+                video_id = self._extract_youtube_id(result['url'])

                 youtube_result = {
                     "title": result.get('title', 'No title'),
-                    "url": result.get('
-                    "description": result.get('
+                    "url": result.get('url', ''),
+                    "description": result.get('snippet', 'No description'),
                     "video_id": video_id
                 }
                 youtube_results.append(youtube_result)
@@ -410,6 +471,9 @@ def test_web_search_tool():

         if result.success:
             print(f"✅ Success: {result.result.get('message', 'No message')}")
+            search_engine = result.result.get('search_engine', 'unknown')
+            print(f"  Search engine: {search_engine}")
+
             if result.result.get('found'):
                 if 'results' in result.result:
                     print(f"  Found {len(result.result['results'])} results")
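For context, a minimal usage sketch of the reworked search path (Tavily first, Wikipedia fallback). This is not part of the commit: the `execute()` call, the import path, and the dict-shaped input are assumptions inferred from the `_execute_impl(input_data, **kwargs)` signature and the `result.success` / `result.result` fields used in the test function above; the query string is just an example.

```python
# Hedged usage sketch (not from this commit). Assumes BaseTool exposes execute()
# and that _execute_impl accepts a dict with "query"/"limit", as the test implies.
import os
from tools.web_search_tool import WebSearchTool  # import path assumed from the src/ layout

# With TAVILY_API_KEY set, the tool tries Tavily first; without it, Wikipedia only.
tool = WebSearchTool()
result = tool.execute({"query": "Mercedes Sosa studio albums", "limit": 3})

if result.success:
    print("engine:", result.result.get("search_engine", "unknown"))
    for item in result.result.get("results", []):
        print(item["title"], "-", item["url"])
```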