Spaces:

mugwaneza
/

agents-course-final

Sleeping

App Files Files Community

mugwaneza commited on Jan 18

Commit

5d1ae57

1 Parent(s): 0cdc222

finals

Browse files

Files changed (3) hide show

app.py +224 -67
requirements.txt +8 -6
tools.py +365 -255

app.py CHANGED Viewed

@@ -3,8 +3,25 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
-import re
-from tools import web_search, google_web_search, calculatorAndLogics, reverse_text, botany_vegetables_only, youtube_species_count
 # (Keep Constants as is)
 # --- Constants ---
@@ -12,58 +29,141 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
-class BasicAgent:
-    def __init__(self):
-        print("BasicAgent initialized.")
     def __call__(self, question: str) -> str:
-        q = (question or "").strip()
-        # Route 1: reversed text cue (contains many reversed words or obvious pattern)
-        if re.search(r"\brewsna\b|\btfel\b|\bs\w+t\b", q[::-1]):
-            return reverse_text(q)
-        # Route 2: strict math/logic expressions (only math chars and spaces)
-        if re.fullmatch(r"[0-9xX\s\+\-\*\/\^\.\(\)=]+", q):
-            return calculatorAndLogics(q)
-        # Route 3: info retrieval cues
-        if ("http://" in q or "https://" in q):
-            if "youtube.com/watch" in q.lower():
-                # Extract URL from question
-                url = None
-                for token in q.split():
-                    if token.startswith("http://") or token.startswith("https://"):
-                        url = token
-                        break
-                ans = youtube_species_count(url or q)
-                # For evaluation, return just the number if found
-                return ans if ans else web_search(q)
-            # Other URLs → Google web search (plain summary)
-            return google_web_search(q)
-        # Route 3b: info retrieval cues without explicit URLs
-        if any(kw in q.lower() for kw in ["wikipedia", "youtube", "find", "who", "what", "when", "where", "which", "how many", "surname", "city", "first name", "last name"]):
-            ans = google_web_search(q)
-            return ans if ans else web_search(q)
-        # Route 4: grocery list parsing (mentions list and botanically strict requirement)
-        if "grocery" in q.lower() and "vegetable" in q.lower():
-            # Try to extract the comma-separated segment after ':'
-            parts = q.split(":", 1)
-            list_text = parts[1] if len(parts) > 1 else q
-            return botany_vegetables_only(list_text)
-        # Default: Google search fallback to ddg
-        ans = google_web_search(q)
-        return ans if ans else web_search(q)
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
-    space_id = os.getenv("/mugwaneza/agents-course-final") # Get the SPACE_ID for sending link to the code
     if profile:
         username= f"{profile.username}"
@@ -78,13 +178,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     # 1. Instantiate Agent ( modify this part to create your agent)
     try:
-        agent = BasicAgent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
     # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
@@ -110,17 +211,34 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     # 3. Run your Agent
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
@@ -177,40 +295,79 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner")
     gr.Markdown(
         """
-        **Instructions:**
-        1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
-        2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
-        3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
         ---
-        **Disclaimers:**
         Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
-        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
         """
     )
     gr.LoginButton()
-    run_button = gr.Button("Run Evaluation & Submit All Answers")
-    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-    # Removed max_rows=10 from DataFrame constructor
-    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
-    )
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
     space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
@@ -230,5 +387,5 @@ if __name__ == "__main__":
     print("-"*(60 + len(" App Starting ")) + "\n")
-    print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)

 import requests
 import inspect
 import pandas as pd
+from dotenv import load_dotenv
+from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, HfApiModel
+from tools import (
+    ReverseTextTool,
+    ExtractTextFromImageTool,
+    AnalyzeCSVTool,
+    AnalyzeExcelTool,
+    DateCalculatorTool,
+    DownloadFileTool
+)
+# Load environment variables.
+# load_dotenv() reports through its boolean return value whether anything was
+# loaded; it does not raise just because no .env file exists, so the result is
+# checked instead of always announcing success.
+try:
+    if load_dotenv():
+        print("Environment variables are loaded from .env file")
+    else:
+        print("No variables were loaded from a .env file; using the existing environment")
+except Exception as e:
+    print(f"Could not load .env file - {e}")
 # (Keep Constants as is)
 # --- Constants ---
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
+class GAIAAgent:
+    """Question-answering agent that drives a smolagents ``CodeAgent``.
+
+    Each question is wrapped in a prompt describing the required answer
+    format, handed to the ``CodeAgent``, and the result is returned as text.
+    """
+
+    # Prompt fragments shared by the normal and the reversed-text prompt, kept
+    # in one place so the two variants cannot drift apart.
+    _ROLE = "You are a general AI Assistant. Your purpose is to answer question."
+    _REPORT = (
+        "Report your thoughts, and finish your answer with the following "
+        "template: FINAL ANSWER: [YOUR FINAL ANSWER]."
+    )
+    _ANSWER_RULES = "\n".join([
+        "YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.",
+        "- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.",
+        "- If you are asked for a string, don't use articles, neither abbreviations(e.g. for cites), and write the digits in plain text unless specified otherwise.",
+        "- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put on the list is a number or a string.",
+    ])
+    _COST_NOTES = "\n".join([
+        "IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:",
+        "- Use web search sparingly and only when absolutely necessary.",
+        "- Limit to 1-2 web searches per question.",
+        "- If the search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.",
+        "- Do not import libraries that aren't available - stick to basic Python and the tools provided.",
+        "- Focus on answering directly with what you already know when possible.",
+        "- If you have made more than 3 attempts to solve a problem, prioritize providing your best guess.",
+        "- Always add a delay of 2-3 seconds between web searches using time.sleep() to avoid rate limiting.",
+        "Remember to structure your response in Python code format using the final_answer() function.",
+    ])
+
+    def __init__(self, verbose=False):
+        """Create the model, the tool list and the underlying ``CodeAgent``.
+
+        Args:
+            verbose: When true, progress messages are printed and the
+                ``CodeAgent`` is created with ``verbosity_level=2`` (else 0).
+
+        Raises:
+            ValueError: If the ``HF_API_KEY`` environment variable is unset
+                or empty.
+        """
+        self.verbose = verbose
+        print("Initializing Agent...")
+        # Get API Key
+        api_key = os.environ.get("HF_API_KEY")
+        if not api_key:
+            raise ValueError("HF API key not found. Please set the HF_API_KEY variable.")
+        # The model id can be overridden through HF_MODEL_ID; the default is Qwen/Qwen3-32B.
+        model_id = os.environ.get("HF_MODEL_ID", "Qwen/Qwen3-32B")
+        print(f"Using HF model: {model_id}")
+        # NOTE(review): some smolagents releases name the credential parameter
+        # ``token`` rather than ``api_key`` - confirm against the installed version.
+        model = HfApiModel(
+            model_id=model_id,
+            api_key=api_key,
+            temperature=0.6
+        )
+        # Initializing tools
+        search_tool = DuckDuckGoSearchTool()
+        self.tools = [search_tool,
+                     ReverseTextTool(),
+                     ExtractTextFromImageTool(),
+                     AnalyzeCSVTool(),
+                     AnalyzeExcelTool(),
+                     DateCalculatorTool(),
+                     DownloadFileTool()]
+        # Authorised imports are module names as written in an ``import``
+        # statement, so Pillow is listed as "PIL" (there is no module "pillow").
+        authorised_imports = ["PyPDF2", "pdf2image", "PIL", "nltk", "sklearn",
+                             "networkx", "matplotlib", "seaborn", "scipy", "time"]
+        self.agent = CodeAgent(
+            tools=self.tools,
+            model=model,
+            add_base_tools=True,
+            planning_interval=3,
+            verbosity_level=2 if self.verbose else 0,
+            additional_authorized_imports=authorised_imports
+        )
+        print("Agent ready to Go!")
+
+    def _is_reversed_text(self, text):
+        """Heuristically decide whether *text* was written back to front.
+
+        True when the text starts with a full stop or contains one of a few
+        reversed English marker phrases.
+        """
+        return(text.startswith(".") or
+              ".rewsna eht sa" in text or
+              "esrever" in text or
+              "sdrawkcab" in text)
+
+    def _build_prompt(self, question, is_reversed):
+        """Assemble the prompt for *question*; reversed text is shown un-reversed."""
+        if is_reversed:
+            sections = [
+                self._ROLE,
+                "This question appears to be in reversed text. Here is the reversed version for clarity:",
+                question[::-1],
+                self._REPORT,
+                self._ANSWER_RULES,
+                self._COST_NOTES,
+            ]
+        else:
+            sections = [
+                f"{self._ROLE} {self._REPORT}",
+                self._ANSWER_RULES,
+                f"Question: {question}",
+                self._COST_NOTES,
+            ]
+        return "\n".join(sections)
+
     def __call__(self, question: str) -> str:
+        """Process a question and return the answer.
+
+        Args:
+            question: The question text.
+
+        Returns:
+            The agent's answer converted to ``str``. If running the agent
+            raises, the text ``"Error processing question: <exception>"`` is
+            returned instead of propagating the exception.
+        """
+        if self.verbose:
+            print(f"Processing question: {question[:100]}." if len(question) > 100 else f"Processing question: {question}")
+        is_reversed = self._is_reversed_text(question)
+        if is_reversed and self.verbose:
+            print("Detected reversed text, it will be hadle accordingly")
+        prompt = self._build_prompt(question, is_reversed)
+        try:
+            answer = self.agent.run(prompt)
+            if self.verbose:
+                print(f"Generated answer: {answer}")
+            # The agent may return a non-string object (for example a number);
+            # convert so the declared ``str`` return type always holds.
+            return str(answer)
+        except Exception as e:
+            error_msg = f"Error processing question: {e}"
+            if self.verbose:
+                print(error_msg)
+            return error_msg
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
+    Fetches all questions, runs the Agent on them, submits all answers,
     and displays the results.
+    Args:
+        sample_size: Number of questions to process (0 for all questions)
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
+    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
         username= f"{profile.username}"
     # 1. Instantiate Agent ( modify this part to create your agent)
     try:
+        agent = GAIAAgent(verbose=True)
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
     # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(f"Agent code URL: {agent_code}")
     # 2. Fetch Questions
     print(f"Fetching questions from: {questions_url}")
     # 3. Run your Agent
     results_log = []
     answers_payload = []
+    # Limit number of questions if sample_size is specified
+    # if sample_size > 0 and sample_size < len(questions_data):
+    #     import random
+    #     print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
+    #     questions_data = random.sample(questions_data, sample_size)
     print(f"Running agent on {len(questions_data)} questions...")
+    for i, item in enumerate(questions_data):
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
+            print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+            print(f"Successfully processed question {i+1}")
+            # Delays next question to avoid rate limiting
+            if i< len(questions_data) - 1:
+                import time
+                print("Waiting 5 seconds before next question:)")
+                time.sleep(5)
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
         results_df = pd.DataFrame(results_log)
         return status_message, results_df
+def test_single_question(question: str) -> str:
+    """Run a freshly constructed ``GAIAAgent`` (verbose mode) on one question.
+
+    Returns the agent's answer, or ``"Error: <exception>"`` when building or
+    calling the agent raises.
+    """
+    try:
+        return GAIAAgent(verbose=True)(question)
+    except Exception as exc:
+        return f"Error: {exc}"
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
+    gr.Markdown("# Agent Evaluation Runner")
     gr.Markdown(
         """
+        ## Instructions:
+        1.  Log in to your Hugging Face account using the button below.
+        2.  Test your agent on individual questions in the Testing Tab.
+        3.  Run the Evaluation on the GAIA benchmark in the Evaluation Tab.
+        This agent is designed to achieve a score of at least 30% on the GAIA Benchmark.
         ---
+        ## Disclaimers:
         Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
         """
     )
     gr.LoginButton()
+    # Tab 1: try the agent on one hand-typed question.
+    with gr.Tab("Test for a single question"):
+        test_input = gr.Textbox(label="Enter a question", lines=3)
+        test_output = gr.Textbox(label="Answer", lines=5)
+        test_button = gr.Button("Run Test")
+        test_button.click(
+            fn=test_single_question,
+            inputs = test_input,
+            outputs=test_output
+        )
+    # Tab 2: run the full evaluation and submit the answers.
+    with gr.Tab("Final Evaluation"):
+        with gr.Row():
+            # NOTE(review): this slider is not passed to run_button.click below,
+            # so its value currently has no effect on the evaluation run.
+            sample_size = gr.Slider(
+                minimum=0,
+                maximum=20,
+                value=0,
+                step=1,
+                label="Sample Size (0 for all questions)",
+                info="Set a number to limit how many questions to process (reduces costs)"
+            )
+        run_button = gr.Button("Run Evaluation & Submit All Answers")
+        status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+        # Removed max_rows=10 from DataFrame constructor
+        results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+        run_button.click(
+            fn=run_and_submit_all,
+            outputs=[status_output, results_table]
+        )
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
+    # Check for API key: report at start-up whether HF_API_KEY is set.
+    api_key = os.environ.get("HF_API_KEY")
+    if not api_key:
+        print("WARNING: HF API key is not found. Please set HF_API_KEY environment variable.")
+    else:
+        # The key checked above is the Hugging Face one, so the message names it.
+        print("HF API key was found.")
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
     space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
     print("-"*(60 + len(" App Starting ")) + "\n")
+    print("Launching Gradio Interface for Agent Evaluation...")
+    demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -1,9 +1,11 @@
 gradio
 requests
-fastapi
-datasets
-smolagents
-sympy
 pandas
-ddgs
-youtube-transcript-api

 gradio
+gradio[oauth]
+itsdangerous
 requests
 pandas
+numpy
+smolagents
+smolagents[openai]
+python-dotenv
+openai>=1.0.0
+litellm

tools.py CHANGED Viewed

@@ -1,266 +1,376 @@
-from smolagents import tool
-@tool
-def web_search(query: str) -> str:
-    """Search the web and return a plain-text summary.
-    Uses DuckDuckGo via `ddgs` and returns concise snippets (no links) prioritizing the top results.
-    Args:
-        query (str): The information need, e.g., a fact question or topic.
-    Returns:
-        str: A short summary synthesized from result snippets. If none available, returns the top result title.
-    """
-    try:
-        from ddgs import DDGS
-        bodies = []
-        titles = []
-        for r in DDGS().text(query, max_results=5):
-            b = r.get("body") or ""
-            t = r.get("title") or ""
-            if b:
-                bodies.append(b.strip())
-            if t:
-                titles.append(t.strip())
-        if bodies:
-            # Build a concise summary from top 2-3 snippets
-            summary = " ".join(bodies[:3])
-            # Trim overly long outputs
-            return summary[:600]
-        if titles:
-            return titles[0]
-        return "No relevant text found."
-    except Exception as e:
-        return f"ERROR: web_search failed: {e}"
-@tool
-def google_web_search(query: str) -> str:
-    """Search Google via Serper and return a concise plain-text summary.
-    Args:
-        query (str): The information need (fact question or topic).
-    Returns:
-        str: A short summary synthesized from top result snippets (no links). If none, returns the top title.
-    """
-    try:
-        import os
-        import requests
-        api_key = os.getenv("SERPER_API_KEY")
-        if not api_key:
-            return "ERROR: SERPER_API_KEY not set"
-        headers = {"X-API-KEY": api_key, "Content-Type": "application/json"}
-        payload = {"q": query, "gl": "us", "hl": "en"}
-        r = requests.post("https://google.serper.dev/search", json=payload, headers=headers, timeout=20)
-        r.raise_for_status()
-        data = r.json()
-        organic = data.get("organic") or []
-        bodies = []
-        titles = []
-        for item in organic[:5]:
-            snip = item.get("snippet") or ""
-            title = item.get("title") or ""
-            if snip:
-                bodies.append(snip.strip())
-            if title:
-                titles.append(title.strip())
-        if bodies:
-            return (" ".join(bodies[:3]))[:600]
-        if titles:
-            return titles[0]
-        return "No relevant text found."
-    except Exception as e:
-        return f"ERROR: google_web_search failed: {e}"
-@tool
-def calculatorAndLogics(expression: str) -> str:
-    """Perform calculations and basic logic evaluation.
-    Supports arithmetic (including parentheses and powers), boolean logic (and/or/not), and solving simple equations.
-    Args:
-        expression (str): A proposition or mathematical expression, e.g., "2*x+3=7" or "True and not False".
-    Returns:
-        str: The result of the calculation or logic operation.
-    """
-    try:
-        import re
-        from sympy import Eq, simplify, solve, sympify
-        expr = expression.strip()
-        lowered = expr.lower()
-        # Handle basic boolean logic like: True and False, not(True or False)
-        if any(k in lowered for k in [" and ", " or ", " not ", " true", " false"]):
-            safe_globals = {"__builtins__": {}}
-            safe_locals = {"True": True, "False": False}
-            safe_expr = re.sub(r"\btrue\b", "True", lowered)
-            safe_expr = re.sub(r"\bfalse\b", "False", safe_expr)
-            result = eval(safe_expr, safe_globals, safe_locals)
-            return str(result)
-        # Solve simple equations like: 2*x + 3 = 7
-        if "=" in expr:
-            left, right = expr.split("=", 1)
-            left_expr = sympify(left)
-            right_expr = sympify(right)
-            symbols_in_expr = list(left_expr.free_symbols.union(right_expr.free_symbols))
-            if symbols_in_expr:
-                sol = solve(Eq(left_expr, right_expr), symbols_in_expr)
-                return str(sol)
-        # Arithmetic evaluation and simplification
-        res = simplify(sympify(expr))
-        return str(res)
-    except Exception as e:
-        return f"ERROR: Unable to evaluate expression: {e}"
-@tool
-def guest_info_retriever(query: str) -> str:
-    """Retrieve detailed information about gala guests.
-    Args:
-        query (str): The name or relation of the guest you want information about.
-    Returns:
-        str: A concise set of search results describing the guest and their relation.
-    """
-    try:
-        from ddgs import DDGS
-        q = f"gala guest {query} relation biography"
-        bodies = []
-        for r in DDGS().text(q, max_results=5):
-            b = r.get("body") or ""
-            if b:
-                bodies.append(b.strip())
-        if bodies:
-            return (" ".join(bodies[:2]))[:600]
-        return "No guest info found."
-    except Exception:
-        return "No guest info found."
-# Note: LLM agent disabled to avoid runtime errors when a proper LLM adapter is not configured.
-@tool
-def reverse_text(input_text: str) -> str:
-    """Reverse the input text.
-    Useful for tasks that present reversed sentences and expect the opposite word or normal reading.
-    Args:
-        input_text (str): The text to reverse.
-    Returns:
-        str: The reversed text.
-    """
-    return input_text[::-1]
-@tool
-def botany_vegetables_only(list_text: str) -> str:
-    """Extract botanically correct vegetables from a grocery list.
-    Parses a comma-separated list in natural language and returns only items that are vegetables under botanical definitions.
-    Args:
-        list_text (str): The text containing the grocery list (comma-separated), possibly embedded within a sentence.
-    Returns:
-        str: Alphabetized, comma-separated list of vegetables with botanical fruits excluded.
-    """
-    import re
-    # Identify list items by splitting on commas, normalizing whitespace and case
-    items = [re.sub(r"\s+", " ", x.strip()).lower() for x in list_text.split(",")]
-    # Known botanical vegetables in the provided list
-    veg_set = {
-        "broccoli",
-        "celery",
-        "lettuce",
-        "sweet potatoes",
     }
-    # Items that are botanical fruits or non-vegetables to exclude
-    exclude = {
-        "bell pepper",
-        "plums",
-        "green beans",
-        "zucchini",
-        "corn",
-        "rice",
-        "peanuts",
-        "acorns",
-        "fresh basil",
-        "whole allspice",
-        "whole bean coffee",
-        "milk",
-        "eggs",
-        "flour",
-        "oreos",
     }
-    # Normalize certain variants
-    normalized = []
-    for it in items:
-        # Handle possible variants like singular forms
-        if it == "sweet potato":
-            it = "sweet potatoes"
-        normalized.append(it)
-    selected = sorted([x for x in normalized if x in veg_set and x not in exclude])
-    return ", ".join(selected) if selected else ""
-@tool
-def youtube_species_count(url: str) -> str:
-    """Extract the highest number of bird species on camera from a YouTube video.
-    Attempts to retrieve the transcript and searches for lines mentioning 'species' and numbers.
-    Args:
-        url (str): Full YouTube watch URL (e.g., https://www.youtube.com/watch?v=... ).
-    Returns:
-        str: The highest number found as a string, or an empty string if not determinable.
-    """
-    try:
-        import re
-        from urllib.parse import urlparse, parse_qs
-        from youtube_transcript_api import YouTubeTranscriptApi
-        parsed = urlparse(url)
-        vid = parse_qs(parsed.query).get('v', [''])[0]
-        if not vid:
-            return ""
-        max_num = None
         try:
-            # Try preferred English transcript
-            transcripts = YouTubeTranscriptApi.get_transcript(vid, languages=['en'])
-        except Exception:
-            # Fall back: iterate available transcripts
-            lister = YouTubeTranscriptApi.list_transcripts(vid)
-            transcripts = None
-            for tr in lister:
                 try:
-                    transcripts = tr.fetch()
                     break
-                except Exception:
                     continue
-            if transcripts is None:
-                return ""
-        for entry in transcripts:
-            text = entry.get('text', '')
-            if not text:
-                continue
-            if 'species' in text.lower():
-                for m in re.findall(r"\b(\d+)\b", text):
-                    n = int(m)
-                    max_num = n if (max_num is None or n > max_num) else max_num
-        return str(max_num) if max_num is not None else ""
-    except Exception:
-        return ""

+from smolagents import Tool
+import pandas as pd
+import os
+import tempfile
+import requests
+from urllib.parse import urlparse
+import json
+import re
+from datetime import datetime, timedelta
+class ReverseTextTool(Tool):
+    """Tool that returns its input string with the characters in reverse order."""
+    name = "reverse_text"
+    description = "Reverses the text in a string."
+    inputs = {
+        "text": {
+            "type": "string",
+            "description": "The text to reverse."
+        }
     }
+    output_type = "string"
+    def forward(self, text: str) -> str:
+        """Return *text* read back to front."""
+        return "".join(reversed(text))
+class ExtractTextFromImageTool(Tool):
+    """Tool that runs OCR (via ``pytesseract``) on an image file and returns the text."""
+    name = "extract_text_from_image"
+    description = "Extracts text from an image file using OCR."
+    inputs = {
+        "image_path": {
+            "type": "string",
+            "description": "Path to the image file."
+        }
     }
+    output_type = "string"
+    def forward(self, image_path: str) -> str:
+        """Extract text from the image at *image_path*.
+
+        Several page-segmentation modes are tried and the longest non-empty
+        result is returned. Failures are reported as a message string rather
+        than raised.
+
+        Args:
+            image_path: Path to the image file.
+
+        Returns:
+            The extracted text with a short header, a "no text" notice, or an
+            error description.
+        """
+        try:
+            # Imported here so a missing OCR dependency is reported as a
+            # message by the ImportError handler below.
+            import pytesseract
+            from PIL import Image
+            # Try different configurations for better results
+            configs = [
+                '--psm 6',  # Assume a single uniform block of text
+                '--psm 3',  # Automatic page segmentation, but no OSD
+                '--psm 1',  # Automatic page segmentation with OSD
+            ]
+            results = []
+            # The context manager closes the image file once OCR is done,
+            # including when an unexpected error escapes the loop.
+            with Image.open(image_path) as image:
+                for config in configs:
+                    try:
+                        text = pytesseract.image_to_string(image, config=config)
+                    except Exception:
+                        # A failing mode must not stop the remaining modes.
+                        continue
+                    if text.strip():
+                        results.append(text)
+            if results:
+                # Return the longest result, which is likely the most complete
+                return f"Extracted text from image:\n\n{max(results, key=len)}"
+            else:
+                return "No text could be extracted from the image."
+        except ImportError:
+            return "Error: pytesseract is not installed. Please install it with 'pip install pytesseract' and ensure Tesseract OCR is installed on your system."
+        except Exception as e:
+            return f"Error extracting text from image: {str(e)}"
class AnalyzeCSVTool(Tool):
    """Load a CSV file with pandas and return a plain-text summary of it.

    Without a query the summary contains ``describe()`` statistics for the
    numeric columns and unique-value counts for up to five non-numeric
    columns.  With a query, every column whose name occurs in the query
    (case-insensitive substring match) is reported individually, and the
    row count is repeated when the query mentions "count".
    """

    name = "analyze_csv_file"
    description = "Analyzes a CSV file and provides information about its contents."
    inputs = {
        "file_path": {
            "type": "string",
            "description": "Path to the CSV file."
        },
        "query": {
            "type": "string",
            "description": "Optional query about the data.",
            "default": "",
            "nullable": True
        }
    }
    output_type = "string"

    def forward(self, file_path: str, query: str = "") -> str:
        """Summarise the CSV at *file_path*, optionally focused by *query*.

        Args:
            file_path: Path of the CSV file to read.
            query: Optional free-text question; ``""`` or ``None`` selects
                the general summary.

        Returns:
            The textual summary, or a message starting with ``"Error"``
            when the file cannot be read or analysed.  Exceptions are never
            propagated to the caller.
        """
        try:
            # Order matters: 'latin1' maps every possible byte to a character
            # and therefore never raises, so it has to be the last resort.
            # 'cp1252' is tried before it because it rejects a few undefined
            # bytes and decodes Windows punctuation (curly quotes, euro sign)
            # correctly where latin1 would yield control characters.
            for encoding in ['utf-8', 'cp1252', 'latin1']:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue
            else:
                return "Error: Could not read the CSV file with any of the attempted encodings."

            # Labels are stringified so that joining and substring matching
            # cannot fail on non-string column labels.
            column_labels = [str(col) for col in df.columns]

            # Basic information
            result = f"CSV file has {len(df)} rows and {len(df.columns)} columns.\n"
            result += f"Columns: {', '.join(column_labels)}\n\n"

            if query:
                query_lower = query.lower()
                if "count" in query_lower:
                    result += f"Row count: {len(df)}\n"
                # Report every column that is mentioned by name in the query.
                for col, label in zip(df.columns, column_labels):
                    if label.lower() not in query_lower:
                        continue
                    result += f"\nColumn '{col}' information:\n"
                    if pd.api.types.is_numeric_dtype(df[col]):
                        result += f"Min: {df[col].min()}\n"
                        result += f"Max: {df[col].max()}\n"
                        result += f"Mean: {df[col].mean()}\n"
                        result += f"Median: {df[col].median()}\n"
                    else:
                        # Categorical data: cardinality plus the ten most
                        # frequent values.
                        value_counts = df[col].value_counts().head(10)
                        result += f"Unique values: {df[col].nunique()}\n"
                        result += f"Top values:\n{value_counts.to_string()}\n"
            else:
                # General statistics for all columns.
                numeric_cols = df.select_dtypes(include=['number']).columns
                if len(numeric_cols) > 0:
                    result += "Numeric columns statistics:\n"
                    result += df[numeric_cols].describe().to_string()
                    result += "\n\n"
                cat_cols = df.select_dtypes(exclude=['number']).columns
                if len(cat_cols) > 0:
                    result += "Categorical columns:\n"
                    for col in cat_cols[:5]:  # Limit to first 5 columns
                        result += f"- {col}: {df[col].nunique()} unique values\n"
            return result
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error analyzing CSV file: {str(e)}"
class AnalyzeExcelTool(Tool):
    """Load one sheet of an Excel workbook and return a plain-text summary.

    The summary always starts with the list of sheets in the workbook.
    Without a query it continues with ``describe()`` statistics for the
    numeric columns and unique-value counts for up to five non-numeric
    columns.  With a query, every column whose name occurs in the query
    (case-insensitive substring match) is reported individually, and the
    row count is repeated when the query mentions "count".
    """

    name = "analyze_excel_file"
    description = "Analyzes an Excel file and provides information about its contents."
    inputs = {
        "file_path": {
            "type": "string",
            "description": "Path to the Excel file."
        },
        "query": {
            "type": "string",
            "description": "Optional query about the data.",
            "default": "",
            "nullable": True
        },
        "sheet_name": {
            "type": "string",
            "description": "Name of the sheet to analyze (defaults to first sheet).",
            "default": None,
            "nullable": True
        }
    }
    output_type = "string"

    def forward(self, file_path: str, query: str = "", sheet_name: str = None) -> str:
        """Summarise a sheet of the workbook at *file_path*.

        Args:
            file_path: Path of the Excel file to read.
            query: Optional free-text question; ``""`` or ``None`` selects
                the general summary.
            sheet_name: Sheet to analyse; ``None`` or ``""`` selects the
                first sheet of the workbook.

        Returns:
            The textual summary, or a message starting with ``"Error"``
            when the sheet does not exist or the file cannot be analysed.
            Exceptions are never propagated to the caller.
        """
        try:
            # The context manager closes the workbook handle on every exit
            # path; the same handle serves both the sheet listing and the
            # data load so the file is opened only once.
            with pd.ExcelFile(file_path) as excel_file:
                sheet_names = excel_file.sheet_names
                sheet_list = ', '.join(str(sheet) for sheet in sheet_names)
                result = f"Excel file contains {len(sheet_names)} sheets: {sheet_list}\n\n"

                # An empty string is treated like None: use the first sheet.
                if not sheet_name:
                    sheet_name = sheet_names[0]
                elif sheet_name not in sheet_names:
                    return f"Error: Sheet '{sheet_name}' not found. Available sheets: {sheet_list}"

                df = excel_file.parse(sheet_name=sheet_name)

            # Excel headers are frequently numbers or dates; stringify the
            # labels so joining and substring matching cannot raise.
            column_labels = [str(col) for col in df.columns]

            # Basic information
            result += f"Sheet '{sheet_name}' has {len(df)} rows and {len(df.columns)} columns.\n"
            result += f"Columns: {', '.join(column_labels)}\n\n"

            if query:
                query_lower = query.lower()
                if "count" in query_lower:
                    result += f"Row count: {len(df)}\n"
                # Report every column that is mentioned by name in the query.
                for col, label in zip(df.columns, column_labels):
                    if label.lower() not in query_lower:
                        continue
                    result += f"\nColumn '{col}' information:\n"
                    if pd.api.types.is_numeric_dtype(df[col]):
                        result += f"Min: {df[col].min()}\n"
                        result += f"Max: {df[col].max()}\n"
                        result += f"Mean: {df[col].mean()}\n"
                        result += f"Median: {df[col].median()}\n"
                    else:
                        # Categorical data: cardinality plus the ten most
                        # frequent values.
                        value_counts = df[col].value_counts().head(10)
                        result += f"Unique values: {df[col].nunique()}\n"
                        result += f"Top values:\n{value_counts.to_string()}\n"
            else:
                # General statistics for all columns.
                numeric_cols = df.select_dtypes(include=['number']).columns
                if len(numeric_cols) > 0:
                    result += "Numeric columns statistics:\n"
                    result += df[numeric_cols].describe().to_string()
                    result += "\n\n"
                cat_cols = df.select_dtypes(exclude=['number']).columns
                if len(cat_cols) > 0:
                    result += "Categorical columns:\n"
                    for col in cat_cols[:5]:  # Limit to first 5 columns
                        result += f"- {col}: {df[col].nunique()} unique values\n"
            return result
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error analyzing Excel file: {str(e)}"
class DateCalculatorTool(Tool):
    """Answer simple natural-language date questions.

    Three kinds of query are recognised, checked in this order:

    1. Offsets: "What/When ... <N> day(s)|week(s)|month(s)|year(s)
       from|after <date>".
    2. Formatting: "format <date> as <spec>", where the spec may use the
       tokens YYYY, YY, MM, DD, HH, mm and ss (anything else, including raw
       strftime directives, is passed through unchanged).
    3. Current date/time: any query mentioning "today", "now",
       "current date" or "current time".

    A <date> is "today", "now", or a date written as YYYY-MM-DD,
    MM/DD/YYYY, DD/MM/YYYY or "Month D, YYYY".
    """

    name = "date_calculator"
    description = "Performs date calculations like adding days, formatting dates, etc."
    inputs = {
        "query": {
            "type": "string",
            "description": "The date calculation to perform (e.g., 'What day is 10 days from today?', 'Format 2023-05-15 as MM/DD/YYYY')"
        }
    }
    output_type = "string"

    @staticmethod
    def _parse_date(date_text):
        """Return a ``datetime`` for *date_text*, or ``None`` if unparseable."""
        # Questions usually end with punctuation ("... from 2023-05-15?").
        cleaned = date_text.strip().rstrip('?.!').strip()
        if cleaned.lower() in ('today', 'now'):
            return datetime.now()
        # Formats are tried in order, so an ambiguous "01/02/2023" is read
        # as month/day/year.
        for fmt in ('%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%B %d, %Y'):
            try:
                return datetime.strptime(cleaned, fmt)
            except ValueError:
                continue
        return None

    @staticmethod
    def _add_months(base_date, months):
        """Return *base_date* shifted by *months*, clamping the day of month."""
        import calendar

        month_index = base_date.month - 1 + months
        year = base_date.year + month_index // 12
        month = month_index % 12 + 1
        # Jan 31 + 1 month has no 31st to land on: fall back to the last
        # valid day of the target month instead of raising ValueError.
        day = min(base_date.day, calendar.monthrange(year, month)[1])
        return base_date.replace(year=year, month=month, day=day)

    def forward(self, query: str) -> str:
        """Evaluate *query* and return the answer as a sentence.

        Args:
            query: The natural-language date question.

        Returns:
            The answer, a "Could not parse date" / "couldn't understand"
            message, or an "Error performing date calculation" message.
            Exceptions are never propagated to the caller.
        """
        try:
            # Offsets are checked first: such queries usually contain
            # "today"/"now" as the base date and must not be mistaken for a
            # plain current-date request.
            add_match = re.search(
                r'(what|when).+?(\d+)\s+(day|days|week|weeks|month|months|year|years)\s+(from|after)\s+(.+)',
                query,
                re.IGNORECASE,
            )
            if add_match:
                amount = int(add_match.group(2))
                unit = add_match.group(3).lower()
                date_text = add_match.group(5).strip()
                base_date = self._parse_date(date_text)
                if base_date is None:
                    return f"Could not parse date: {date_text}"
                if 'day' in unit:
                    new_date = base_date + timedelta(days=amount)
                elif 'week' in unit:
                    new_date = base_date + timedelta(weeks=amount)
                elif 'month' in unit:
                    new_date = self._add_months(base_date, amount)
                else:
                    # Years are 12-month steps, so Feb 29 + 1 year clamps to
                    # Feb 28 instead of failing.
                    new_date = self._add_months(base_date, 12 * amount)
                return f"Date {amount} {unit} from {base_date.strftime('%Y-%m-%d')} is {new_date.strftime('%Y-%m-%d')}"

            # Format a date
            format_match = re.search(r'format\s+(.+?)\s+as\s+(.+)', query, re.IGNORECASE)
            if format_match:
                date_text = format_match.group(1).strip()
                format_spec = format_match.group(2).strip()
                date_obj = self._parse_date(date_text)
                if date_obj is None:
                    return f"Could not parse date: {date_text}"
                format_mapping = {
                    'YYYY': '%Y',
                    'YY': '%y',
                    'MM': '%m',
                    'DD': '%d',
                    'HH': '%H',
                    'mm': '%M',
                    'ss': '%S'
                }
                # One left-to-right pass (longest token first) so that text
                # produced by one substitution is never rewritten by a later
                # one, which chained str.replace calls could do.
                strftime_format = re.sub(
                    r'YYYY|YY|MM|DD|HH|mm|ss',
                    lambda token: format_mapping[token.group(0)],
                    format_spec,
                )
                return f"Formatted date: {date_obj.strftime(strftime_format)}"

            # Current date/time.  "today" and "now" need word boundaries so
            # that words such as "know" or "snow" do not trigger this branch.
            if re.search(r'\b(today|now)\b|current date|current time', query, re.IGNORECASE):
                now = datetime.now()
                if 'time' in query.lower():
                    return f"Current date and time: {now.strftime('%Y-%m-%d %H:%M:%S')}"
                return f"Today's date: {now.strftime('%Y-%m-%d')}"

            return "I couldn't understand the date calculation query."
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error performing date calculation: {str(e)}"
class DownloadFileTool(Tool):
    """Download a URL into the system temporary directory.

    The file is always written directly inside ``tempfile.gettempdir()``;
    any directory components in the requested filename are discarded.
    """

    name = "download_file"
    description = "Downloads a file from a URL and saves it locally."
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL to download from."
        },
        "filename": {
            "type": "string",
            "description": "Optional filename to save as (default: derived from URL).",
            "default": None,
            "nullable": True
        }
    }
    output_type = "string"

    def forward(self, url: str, filename: str = None) -> str:
        """Fetch *url* and store the response body on disk.

        Args:
            url: The URL to download from.
            filename: Optional name for the saved file.  Only its final path
                component is used.  When omitted, the last segment of the
                URL path is used, or a random ``downloaded_<hex>`` name if
                the URL path has none.

        Returns:
            A message containing the path of the saved file, or a message
            starting with ``"Error downloading file"`` on any failure.
            Exceptions are never propagated to the caller.
        """
        try:
            if filename:
                # The name is caller-controlled: keep only the last path
                # component so absolute paths or "../" sequences cannot place
                # the file outside the temporary directory.
                filename = os.path.basename(filename.replace("\\", "/"))
            if not filename or filename in (".", ".."):
                filename = os.path.basename(urlparse(url).path)
            if not filename or filename in (".", ".."):
                # Generate a random name if we couldn't extract one
                import uuid
                filename = f"downloaded_{uuid.uuid4().hex[:8]}"

            filepath = os.path.join(tempfile.gettempdir(), filename)

            # requests never times out by default; bound the wait so that an
            # unresponsive server cannot block the caller forever.  The
            # context manager releases the streamed connection on every
            # exit path.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            return f"File downloaded to {filepath}. You can now analyze this file."
        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error downloading file: {str(e)}"