alisamak committed (verified)

Commit 25b44c6 · 1 Parent(s): d19b1bb

Update tools.py

Files changed (1): tools.py (+115 -237)
tools.py CHANGED
@@ -1,42 +1,84 @@
-from langchain_core.tools import tool
-from urllib.parse import urlparse
 from duckduckgo_search import DDGS
 import wikipedia
-import requests
 import chess
 import chess.engine
 import sympy
-import fitz  # PyMuPDF
+import fitz
 import pandas as pd
 from imdb import IMDb
 from youtube_transcript_api import YouTubeTranscriptApi
 import yt_dlp
 import whisper
-from bs4 import BeautifulSoup
-import re
-import time
-from typing import Optional, List, Dict, Any
-import re
-from datetime import datetime, timedelta
-from langchain_core.tools import tool
-import logging
-from tavily import TavilyClient
-import os
 
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 
 client = TavilyClient(api_key=TAVILY_API_KEY)
 
-# Dictionary of known GAIA-style entities → canonical Wikipedia URLs
-WIKIPEDIA_PAGES = {
-    "mercedes sosa": "https://en.wikipedia.org/wiki/Mercedes_Sosa",
-    "summer olympics": "https://en.wikipedia.org/wiki/Summer_Olympic_Games",
-    "united nations": "https://en.wikipedia.org/wiki/United_Nations",
-    "pink floyd": "https://en.wikipedia.org/wiki/Pink_Floyd",
-    "chess": "https://en.wikipedia.org/wiki/Chess",
-    "dinosaur": "https://en.wikipedia.org/wiki/Dinosaur",
-    # ➕ add more GAIA topics here
-}
+@tool
+def handle_question(question: str) -> str:
+    """
+    Simple router for question types. Uses web_lookup as the default.
+    """
+    if "table" in question and "*" in question:
+        return detect_non_commutative_subset.run(question)
+    if "reverse" in question or "backwards" in question:
+        return reverse_sentence.run(question)
+    if "vegetables" in question:
+        return ", ".join(filter_vegetables.run(question.split(", ")))
+    return web_lookup.run(question)
+
+
+@tool
+def web_lookup(query: str) -> str:
+    """
+    Unified web search tool that:
+    - Uses the Tavily API to retrieve relevant snippets.
+    - Extracts the most relevant numeric or short factual answer.
+    - Falls back to Wikipedia if Tavily fails.
+
+    Args:
+        query (str): The user query or question.
+
+    Returns:
+        str: A concise factual answer extracted from Tavily or Wikipedia.
+    """
+    try:
+        # Step 1: Tavily search
+        response = client.search(query=query, search_depth="advanced", max_results=5)
+        snippets = [r["content"] for r in response.get("results", [])]
+
+        for s in snippets:
+            # Try to extract a meaningful answer (year, name, short fact)
+            match = re.search(r"\b(18|19|20)\d{2}\b", s)
+            if match:
+                return match.group()
+            elif len(s.split()) <= 12:
+                return s.strip()
+
+        # Step 2: Wikipedia fallback
+        # Guess the page title from the query
+        wiki_title = query.split(" ")[-1].capitalize()
+        wiki_url = f"https://en.wikipedia.org/wiki/{wiki_title}"
+        res = requests.get(wiki_url, timeout=10)
+        if res.status_code != 200:
+            return "❌ Wikipedia page not found."
+
+        soup = BeautifulSoup(res.text, "html.parser")
+        text = soup.get_text()
+        match = re.search(r"\b(18|19|20)\d{2}\b", text)
+        if match:
+            return match.group()
+
+        # Fall back to the first non-empty paragraph
+        paras = soup.find_all("p")
+        if paras:
+            for p in paras:
+                if p.get_text(strip=True):
+                    return p.get_text(strip=True)
+
+        return "❌ No relevant data found."
+    except Exception as e:
+        return f"❌ Error during web lookup: {str(e)}"
 
 @tool
 def extract_number_from_snippets(snippets: list[str]) -> Optional[int]:
@@ -64,115 +106,6 @@ def extract_number_from_snippets(snippets: list[str]) -> Optional[int]:
     return None
 
 
-@tool
-def tavily_search(query: str, k: int = 5) -> list[str]:
-    """
-    Perform a web search using the Tavily API and return up to k relevant snippets.
-    """
-    try:
-        response = client.search(query=query, search_depth="advanced", max_results=k)
-        return [r["content"] for r in response.get("results", [])]
-    except Exception as e:
-        return [f"❌ Error during Tavily search: {str(e)}"]
-
-@tool
-def get_article_nominator_from_fac_page(title: str) -> str:
-    """
-    Get the nominator of a Featured Article by scanning the main FAC page (not just archives).
-    """
-    base = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates"
-    url = f"{base}/{title}"
-    res = requests.get(url)
-    if res.status_code != 200:
-        return "Nominator not found"
-
-    soup = BeautifulSoup(res.text, "html.parser")
-    text = soup.get_text()
-
-    # Try direct pattern first
-    match = re.search(r"nominated by \[\[User:(.*?)\]\]", text, re.IGNORECASE)
-    if match:
-        return match.group(1).strip()
-
-    # Fallback: try to find first signed comment (e.g. --[[User:XYZ]])
-
-
-@tool
-def count_sosa_studio_albums_2000s() -> int:
-    """
-    Returns the number of studio albums by Mercedes Sosa released between 2000 and 2009 (inclusive).
-    Scrapes the 'Studio albums' section of her Wikipedia page.
-    """
-    import requests
-    from bs4 import BeautifulSoup
-    import re
-
-    url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
-    res = requests.get(url)
-    soup = BeautifulSoup(res.text, "html.parser")
-
-    albums = []
-    start_header = None
-
-    # Find the "Studio albums" header
-    for tag in soup.find_all(["h2", "h3"]):
-        if 'Studio albums' in tag.get_text():
-            start_header = tag
-            break
-
-    if not start_header:
-        return 0
-
-    # Loop over the siblings until we hit the next major section
-    for sibling in start_header.find_next_siblings():
-        if sibling.name in ["h2", "h3"]:
-            break  # stop at next section
-
-        if sibling.name == "ul":
-            for li in sibling.find_all("li"):
-                text = li.get_text()
-                match = re.search(r"\b(19|20)\d{2}\b", text)
-                if match:
-                    year = int(match.group())
-                    if 2000 <= year <= 2009:
-                        albums.append(text.strip())
-
-    return len(set(albums))
-
-
-@tool
-def count_albums_by_year_range(title: str, start_year: int, end_year: int) -> int:
-    """
-    Count how many studio albums listed on the Wikipedia page were released between start_year and end_year.
-    This function targets the "Studio albums" section.
-    """
-    url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    studio_section = None
-    for header in soup.find_all(["h2", "h3"]):
-        if "Studio albums" in header.get_text():
-            studio_section = header
-            break
-
-    if not studio_section:
-        return 0
-
-    albums = []
-    for elem in studio_section.find_next_siblings():
-        if elem.name in ["h2", "h3"]:  # next section begins
-            break
-        for li in elem.find_all("li"):
-            text = li.get_text()
-            year_match = re.search(r"(19|20)\d{2}", text)
-            if year_match:
-                year = int(year_match.group())
-                if start_year <= year <= end_year:
-                    albums.append(text)
-
-    return len(albums)
-
 @tool
 def get_article_nominator_from_fac_page(title: str) -> str:
     """
@@ -199,57 +132,8 @@ def get_article_nominator_from_fac_page(title: str) -> str:
 
     return "Nominator not found"
 
-@tool
-def handle_question(question: str) -> str:
-    """
-    Dynamically handle a question by routing it to the appropriate tools and combining results.
-    """
-    strategy = route_question.run(question)
-
-    if strategy == "extract_structured_facts_from_url":
-        wiki_url = resolve_wikipedia_url.run(question)
-        if not wiki_url:
-            return "❌ Could not find Wikipedia page."
-        return extract_structured_facts_from_url.run(wiki_url)
-
-    if strategy == "search_featured_articles_by_date_range":
-        return search_featured_articles_by_date_range.run("2016-11-01", "2016-11-30")
-
-    return "🤔 I will use internal reasoning."
-
-
-@tool
-def resolve_wikipedia_url(question: str) -> Optional[str]:
-    """
-    Returns a known Wikipedia URL if the question contains a known entity.
-    """
-    q = question.lower()
-    for key, url in WIKIPEDIA_PAGES.items():
-        if key in q:
-            logging.info(f"[Router] Matched '{key}' → {url}")
-            return url
-    logging.info(f"[Router] No match for: {question}")
-    return None
-
-@tool
-def route_question(question: str) -> str:
-    """
-    Determines the best tool to answer a given question.
-    Returns one of: 'tavily_search', 'extract_structured_facts_from_url',
-    'search_featured_articles_by_date_range', or 'use_internal_logic'.
-    """
-    q = question.lower()
-
-    if "who" in q or "what" in q or "how many" in q or "when" in q:
-        return "tavily_search"
-
-    if "wikipedia" in q and any(k in q for k in ["how many", "list", "albums", "awards", "release"]):
-        return "extract_structured_facts_from_url"
-
-    if "featured article" in q and any(k in q for k in ["promoted", "in", "nominated"]):
-        return "search_featured_articles_by_date_range"
-
-    # Default to internal logic (math, logic puzzles, wordplay)
-    return "use_internal_logic"
 
 @tool
 def extract_structured_facts_from_url(url: str, selector: Optional[str] = None) -> str:
@@ -337,53 +221,53 @@ def categorize_grocery_items(items: list[str]) -> dict:
     return result
 
 
-@tool
-def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
-    """
-    Searches the English Wikipedia featured article archive and returns article titles
-    promoted between start_date and end_date.
-    Args:
-        start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
-        end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
-    Returns:
-        list[str]: A list of article titles promoted as Featured Articles during that period.
-    """
-    print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
-    try:
-        base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
-        archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
-
-        start = datetime.strptime(start_date, "%Y-%m-%d")
-        end = datetime.strptime(end_date, "%Y-%m-%d")
-
-        # We'll collect year-specific pages
-        result_titles = []
-
-        for year in range(start.year, end.year + 1):
-            url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
-            response = requests.get(url)
-            if response.status_code != 200:
-                continue
-
-            soup = BeautifulSoup(response.text, "html.parser")
-            for li in soup.select("li"):
-                text = li.get_text()
-                date_matches = re.findall(r"\b(?:19|20)\d{2}-\d{2}-\d{2}\b", text)
-                print("🔍 Date matches:", date_matches)
-
-                for match in date_matches:
-                    try:
-                        d = datetime.strptime(match, "%Y-%m-%d")
-                        if start <= d <= end:
-                            a_tag = li.find("a")
-                            if a_tag:
-                                result_titles.append(a_tag.get_text(strip=True))
-                    except ValueError:
-                        continue
-
-        return sorted(set(result_titles))
-    except Exception as e:
-        return [f"Error: {str(e)}"]
+# @tool
+# def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
+#     """
+#     Searches the English Wikipedia featured article archive and returns article titles
+#     promoted between start_date and end_date.
+#     Args:
+#         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
+#         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
+#     Returns:
+#         list[str]: A list of article titles promoted as Featured Articles during that period.
+#     """
+#     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
+#     try:
+#         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
+#         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
+
+#         start = datetime.strptime(start_date, "%Y-%m-%d")
+#         end = datetime.strptime(end_date, "%Y-%m-%d")
+
+#         # We'll collect year-specific pages
+#         result_titles = []
+
+#         for year in range(start.year, end.year + 1):
+#             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
+#             response = requests.get(url)
+#             if response.status_code != 200:
+#                 continue
+
+#             soup = BeautifulSoup(response.text, "html.parser")
+#             for li in soup.select("li"):
+#                 text = li.get_text()
+#                 date_matches = re.findall(r"\b(?:19|20)\d{2}-\d{2}-\d{2}\b", text)
+#                 print("🔍 Date matches:", date_matches)
+
+#                 for match in date_matches:
+#                     try:
+#                         d = datetime.strptime(match, "%Y-%m-%d")
+#                         if start <= d <= end:
+#                             a_tag = li.find("a")
+#                             if a_tag:
+#                                 result_titles.append(a_tag.get_text(strip=True))
+#                     except ValueError:
+#                         continue
+
+#         return sorted(set(result_titles))
+#     except Exception as e:
+#         return [f"Error: {str(e)}"]
 
 @tool
 def detect_non_commutative_subset(table_text: str) -> str:
@@ -463,19 +347,13 @@ def filter_vegetables(items: list[str]) -> list[str]:
 
 # List of all tools
 all_tools = [
+    web_lookup,
    extract_number_from_snippets,
-    tavily_search,
-    route_question,
-    resolve_wikipedia_url,
-    handle_question,
-    search_featured_articles_by_date_range,
-    get_article_nominator_from_fac_page,
-    count_sosa_studio_albums_2000s,
-    count_albums_by_year_range,
-    extract_structured_facts_from_url,
    detect_non_commutative_subset,
    reverse_sentence,
    filter_vegetables,
    categorize_grocery_items,
+    get_article_nominator_from_fac_page,
+    # Optional: handle_question (for fallback routing)
 ]
 
 
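Note on the resulting file: the hunks above delete the imports for tool (langchain_core.tools), requests, BeautifulSoup, re, Optional (typing), os, and TavilyClient (tavily), yet the surviving code — the @tool decorators, web_lookup, extract_number_from_snippets, get_article_nominator_from_fac_page, and the module-level TAVILY_API_KEY / TavilyClient setup — still references all of those names, so the module would likely fail with a NameError as soon as it is imported. A minimal import block the new tools.py would still seem to need (a sketch, not part of this commit):

import os
import re
from typing import Optional

import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool
from tavily import TavilyClient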
 
 
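For reference, LangChain tools created with @tool can be exercised directly before being wired into an agent. A quick smoke test of the new web_lookup entry point might look like the sketch below (the query string is hypothetical; TAVILY_API_KEY must be set in the environment, and the missing imports noted above are assumed restored):

from tools import web_lookup, all_tools

# A @tool-decorated function becomes a StructuredTool; .invoke takes a dict
# keyed by the tool's argument names.
print(web_lookup.invoke({"query": "year the first Summer Olympics were held"}))

# To hand the whole toolbox to a tool-calling chat model (model choice is an assumption):
# from langchain_openai import ChatOpenAI
# llm_with_tools = ChatOpenAI(model="gpt-4o-mini").bind_tools(all_tools)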
359