Francesco-A committed on
Commit
15a3001
·
1 Parent(s): da5af70

Updated space

Browse files

Added:
- transcriber tools
- GeminiAgent
- LocalAgent (not used in space)

Files changed (4) hide show
  1. agent.py +245 -63
  2. app.py +23 -73
  3. requirements.txt +14 -0
  4. tools/audio_tools.py +78 -0
agent.py CHANGED
@@ -1,10 +1,21 @@
1
- from smolagents import tool
 
 
 
2
  import pandas as pd
 
 
 
 
3
 
 
4
  from smolagents import (
5
  CodeAgent,
6
- InferenceClientModel,
 
 
7
  Tool,
 
8
  DuckDuckGoSearchTool,
9
  VisitWebpageTool,
10
  WikipediaSearchTool,
@@ -15,66 +26,237 @@ from smolagents import (
15
  # Import your custom tools (to be used in app, not in local notebook)
16
  from tools.download_file import download_file_from_url
17
  from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def create_agent(
20
- model_path: str = "Qwen/Qwen3-Next-80B-A3B-Thinking"
21
- ):
22
- """
23
- Creates and configures a CodeAgent.
24
-
25
- This function initializes a smolagents CodeAgent equipped with the
26
- recommended default tools (web search, browser, and Python interpreter),
27
- together with any custom tools you may define.
28
-
29
- Args:
30
- model_path (str): The identifier or local path of the Hugging Face
31
- model to be loaded. By default, it uses `Qwen/Qwen3-Next-80B-A3B-Thinking`,
32
- but any compatible model can be substituted.
33
-
34
- Returns:
35
- CodeAgent: A fully initialized agent ready to run code, query tools,
36
- and perform multi-step reasoning using the selected model.
37
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Choose a lightweight but reasoning-capable model
40
- model = InferenceClientModel(
41
- model_id=model_path,
42
- temperature = 0.0,
43
- top_p = 1.0, # NEW
44
- )
45
-
46
- # Default smolagents tools (high-level)
47
- default_tools = [
48
- DuckDuckGoSearchTool(), # Internet search
49
- VisitWebpageTool(), # Retrieve webpage content
50
- PythonInterpreterTool(), # Executes agent-generated Python code
51
- FinalAnswerTool(), # Ends agent reasoning and returns final answer
52
- ]
53
-
54
- # Custom tools (critical for GAIA)
55
- custom_tools = [
56
- download_file_from_url, # file downloader
57
- text_file_to_string, # .txt, .md, .json, etc.
58
- pdf_to_text, # PyMuPDF-based safe PDF parser
59
- image_to_text, # OCR for images
60
- ]
61
-
62
- tools = default_tools + custom_tools
63
-
64
- # Create the CodeAgent (best for GAIA because it supports Python)
65
- agent = CodeAgent(
66
- model=model,
67
- tools=tools,
68
- add_base_tools=True, # probably redundant, but it does not hurt
69
- max_steps=7,
70
- additional_authorized_imports = ['numpy','subprocess', 're', 'pandas',
71
- 'json', 'os', 'pathlib', 'tempfile',
72
- # 'matplotlib.pyplot', 'seaborn'
73
- ],
74
- verbosity_level = 1,
75
- max_print_outputs_length=1_000_000
76
- )
77
-
78
- return agent
79
-
80
- # WIP: Agentic RAG Systems
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Generic agent
3
+ import os
4
+ from typing import Optional
5
  import pandas as pd
6
+ import torch
7
+
8
+ # Local agent specific
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
 
11
+ # Smolagents imports
12
  from smolagents import (
13
  CodeAgent,
14
+ InferenceClientModel,
15
+ TransformersModel,
16
+ LiteLLMModel,
17
  Tool,
18
+ tool,
19
  DuckDuckGoSearchTool,
20
  VisitWebpageTool,
21
  WikipediaSearchTool,
 
26
  # Import your custom tools (to be used in app, not in local notebook)
27
  from tools.download_file import download_file_from_url
28
  from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
29
+ from tools.audio_tools import youtube_to_text, transcribe_audio
30
+
31
+ # Define tools
32
+ AGENT_TOOLS = [
33
+ # Default Tools
34
+ DuckDuckGoSearchTool(), # Internet search
35
+ VisitWebpageTool(), # Retrieve webpage content
36
+ PythonInterpreterTool(), # Executes agent-generated Python code
37
+ FinalAnswerTool(), # Ends agent reasoning and returns final answer
38
+
39
+ # Custom Tools
40
+ download_file_from_url, # file downloader
41
+ text_file_to_string, # .txt, .md, .json, etc.
42
+ pdf_to_text, # PyMuPDF-based safe PDF parser
43
+ image_to_text, # OCR for images
44
+ youtube_to_text, # Youtube audio to text
45
+ transcribe_audio, # Audio file to text
46
+ ]
47
+
48
+ # System prompt
49
+ SYSTEM_PROMPT = """
50
+ You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
51
+
52
+ ### 1. Reason-Act-Observe
53
+ Follow a **PLAN → ACT → OBSERVE** loop:
54
+ - **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
55
+ - **ACT:** Write and run one self-contained Python block per step.
56
+ - **OBSERVE:** Examine outputs or errors before proceeding.
57
+
58
+ ### 2. File Handling
59
+ - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
60
+
61
+ You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
62
+
63
+ | File Type / Source | Tool / Method to Use |
64
+ | :--- | :--- |
65
+ | `.csv` | `pd.read_csv(filepath)` |
66
+ | `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
67
+ | `.pdf` | `pdf_to_text(filepath)` |
68
+ | `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
69
+ | `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
70
+ | **YouTube URL** | `youtube_to_text(url)` |
71
+ | `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
72
+
73
+ **Important rules:**
74
+ - When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
75
+ - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
76
+ - For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
77
 
78
+ ### 3. Data Analysis & Answer
79
+ - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
80
+ - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
81
+ - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
82
+
83
+ ### 4. Additional instructions for the following tasks provided by GAIA team
84
+ - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
85
+ - Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
86
+
87
+ ### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
88
+
89
+ - Example of how to end the task:
90
+
91
+ Thought: I have found the answer. I will now provide it.
92
+ <code>
93
+ final_answer("FINAL ANSWER: The capital of France is Paris")
94
+ </code>
95
+
96
+ \n\n
97
+ """
98
+
99
class BasicAgent:
    """CodeAgent backed by a hosted Qwen model via InferenceClientModel.

    Wraps a smolagents CodeAgent configured with the shared AGENT_TOOLS and
    SYSTEM_PROMPT. Call the instance with a question (and an optional file
    path) to run the agent and obtain its final answer string.
    """

    def __init__(self):
        self.system_prompt = SYSTEM_PROMPT
        # Deterministic decoding (temperature 0) for benchmark reproducibility.
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Basic agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.basic_agent.run(self._build_prompt(question, file_path))
143
 
144
class GeminiAgent:
    """CodeAgent backed by Gemini 2.0 Flash via LiteLLM.

    Requires the GOOGLE_API_KEY environment variable and raises RuntimeError
    at construction time when it is missing, so callers can fall back to
    another agent.
    """

    def __init__(self):
        self.system_prompt = SYSTEM_PROMPT
        google_api_key = os.environ.get("GOOGLE_API_KEY")
        if not google_api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )
        # Deterministic decoding (temperature 0) for benchmark reproducibility.
        self.model = LiteLLMModel(
            model_id="gemini/gemini-2.0-flash",
            api_key=google_api_key,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS
        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'datetime', 'tempfile',
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Gemini agent initialized")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.gemini_agent.run(self._build_prompt(question, file_path))
194
+
195
class LocalAgent:
    """CodeAgent running a locally 4-bit-quantized Qwen2.5-7B via TransformersModel.

    Not used in the Space; intended for local experimentation. Construction
    downloads the checkpoint, quantizes it (NF4), and re-saves it locally so
    TransformersModel can load it.
    """

    def __init__(self):
        checkpoint = "Qwen/Qwen2.5-7B-Instruct"
        quantized_model_dir = "./quantized_model"

        # 4-bit NF4 quantization with bf16 compute and double quantization.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        # Load quantized model and tokenizer
        temp_model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            quantization_config=bnb_config,
            device_map="auto",  # use multiple GPUs if available
        )
        temp_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        # Save the model in a local path (seems to be the only way to make it
        # work with TransformersModel).
        temp_model.save_pretrained(quantized_model_dir)
        temp_tokenizer.save_pretrained(quantized_model_dir)

        self.system_prompt = SYSTEM_PROMPT

        self.model = TransformersModel(
            # NOTE(review): recent smolagents versions take `model_id`, not
            # `model_path` — confirm against the pinned smolagents version.
            model_path=quantized_model_dir,
            temperature=0.1,
            top_p=0.95,
            device_map="auto",
            # https://github.com/huggingface/smolagents/issues/414
            max_new_tokens=8196,  # NOTE(review): 8196 looks like a typo for 8192 — confirm
        )
        self.tools = AGENT_TOOLS

        self.local_agent = CodeAgent(
            model=self.model,
            # Bug fix: original passed bare `tools`, an undefined name that
            # raised NameError at construction time.
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=[
                'numpy', 'subprocess', 're', 'pandas',
                'json', 'os', 'pathlib', 'tempfile',
                # 'matplotlib.pyplot', 'seaborn'
            ],
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Local (quantized) agent initialized.")

    def _build_prompt(self, question: str, file_path: Optional[str] = None) -> str:
        """Combine system prompt, question and (optionally) a file-path hint.

        Factored out of __call__: both branches previously duplicated the
        system-prompt/question preamble.
        """
        prompt = f"{self.system_prompt}\n\n" f"Question: {question}\n\n"
        if file_path:
            prompt += (
                f"There is an associated file at path: {file_path}.\n"
                f"Use the appropriate tool to download it (if necessary) and read it before answering"
            )
        return prompt

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.local_agent.run(self._build_prompt(question, file_path))
app.py CHANGED
@@ -1,80 +1,18 @@
1
  import os
 
2
  import gradio as gr
3
  import requests
4
  import inspect
5
  import pandas as pd
6
- from agent import create_agent
7
  from typing import Optional
8
 
 
 
9
  # (Keep Constants as is)
10
  # --- Constants ---
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
 
13
- # --- Basic Agent Definition ---
14
- class BasicAgent:
15
- def __init__(self):
16
- self.agent = create_agent()
17
- self.system_prompt = """
18
- You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
19
-
20
- ### 1. Reason-Act-Observe
21
- Follow a **PLAN → ACT → OBSERVE** loop:
22
- - **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
23
- - **ACT:** Write and run one self-contained Python block per step.
24
- - **OBSERVE:** Examine outputs or errors before proceeding.
25
-
26
- ### 2. File Handling
27
- - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
28
-
29
- You must select the reading method based strictly on the file extension:
30
- | File Extension | Tool / Method to Use |
31
- | :--- | :--- |
32
- | .csv | `pd.read_csv(filepath)` |
33
- | .xlsx, .xls | `pd.read_excel(filepath)` |
34
- | .pdf | `pdf_to_text(filepath)` |
35
- | .txt, .md, .json | `text_file_to_string(filepath)` |
36
- | .png, .jpg, .jpeg | `image_to_text(filepath)` |
37
-
38
- ### 3. Data Analysis & Answer
39
- - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
40
- - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
41
- - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
42
-
43
- ### 4. Additional instructions for the following tasks provided by GAIA team
44
- - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
45
- - Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
46
-
47
- ### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.
48
-
49
- - Example of how to end the task:
50
-
51
- Thought: I have found the answer. I will now provide it.
52
- <code>
53
- final_answer("FINAL ANSWER: The capital of France is Paris")
54
- </code>
55
-
56
- \n\n
57
- """
58
- # print("Agent initialized.")
59
-
60
- def __call__(self, question: str, file_path: Optional[str] = None) -> str:
61
-
62
- if file_path:
63
- # Inject system prompt + question and (optional) file path
64
- prompt = (
65
- f"{self.system_prompt}\n\n"
66
- f"Question: {question}\n\n"
67
- f"There is an associated file at path: {file_path}.\n"
68
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
69
- )
70
- else:
71
- prompt = (
72
- f"{self.system_prompt}\n\n"
73
- f"Question: {question}\n\n"
74
- )
75
-
76
- return self.agent.run(prompt)
77
-
78
  def run_and_submit_all( profile: gr.OAuthProfile | None):
79
  """
80
  Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -84,7 +22,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
84
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
85
 
86
  if profile:
87
- username= f"{profile.username}"
88
  print(f"User logged in: {username}")
89
  else:
90
  print("User not logged in.")
@@ -94,15 +32,25 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
94
  questions_url = f"{api_url}/questions"
95
  submit_url = f"{api_url}/submit"
96
 
97
- # 1. Instantiate Agent ( modify this part to create your agent)
98
  try:
99
- agent = BasicAgent()
100
- except Exception as e:
101
- print(f"Error instantiating agent: {e}")
102
- return f"Error initializing agent: {e}", None
 
 
 
 
 
 
 
 
 
103
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
104
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
105
- print(agent_code)
 
106
 
107
  # 2. Fetch Questions
108
  print(f"Fetching questions from: {questions_url}")
@@ -139,6 +87,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
139
  submitted_answer = agent(question_text)
140
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
141
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
142
  except Exception as e:
143
  print(f"Error running agent on task {task_id}: {e}")
144
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
1
  import os
2
+ import time
3
  import gradio as gr
4
  import requests
5
  import inspect
6
  import pandas as pd
7
+ from agent import BasicAgent, GeminiAgent
8
  from typing import Optional
9
 
10
+ # (ASK ABOUT ALIGNMENT BETWEEN TIMEOUT ARGUMENTS AND TIME.SLEEP)
11
+
12
  # (Keep Constants as is)
13
  # --- Constants ---
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def run_and_submit_all( profile: gr.OAuthProfile | None):
17
  """
18
  Fetches all questions, runs the BasicAgent on them, submits all answers,
 
22
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
23
 
24
  if profile:
25
+ username = f"{profile.username}"
26
  print(f"User logged in: {username}")
27
  else:
28
  print("User not logged in.")
 
32
  questions_url = f"{api_url}/questions"
33
  submit_url = f"{api_url}/submit"
34
 
35
+ # 1. Instantiate Agent (modify this part to create your agent)
36
  try:
37
+ agent = GeminiAgent()
38
+ agent_type = "GeminiAgent"
39
+ except Exception as main_agent_error:
40
+ print(f"{agent_type} failed to initialize: {main_agent_error}.")
41
+ try:
42
+ agent = BasicAgent()
43
+ agent_type = "BasicAgent"
44
+ print(f"Falling back to {agent_type}.")
45
+ except Exception as secondary_agent_error:
46
+ print(f"{agent_type} failed to initialize: {secondary_agent_error}.")
47
+ agent_type = "None"
48
+ return f"Error initializing agent: {e}", None
49
+
50
  # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
51
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
52
+ print(f"Agent code: {agent_code}")
53
+ print(f"Active agent: {agent_code}")
54
 
55
  # 2. Fetch Questions
56
  print(f"Fetching questions from: {questions_url}")
 
87
  submitted_answer = agent(question_text)
88
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
89
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
90
+
91
+ time.sleep(60) # to not exceed free limits
92
  except Exception as e:
93
  print(f"Error running agent on task {task_id}: {e}")
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
requirements.txt CHANGED
@@ -19,5 +19,19 @@ Pillow==11.3.0
19
  pdfplumber==0.11.8
20
  PyMuPDF==1.26.7
21
 
 
 
 
 
 
22
  # OCR (OPTIONAL, disabled)
23
  # pytesseract==0.3.13
 
 
 
 
 
 
 
 
 
 
19
  pdfplumber==0.11.8
20
  PyMuPDF==1.26.7
21
 
22
+ # Audio transcriber
23
+ youtube-transcript-api==1.2.3
24
+ pytubefix==10.3.6
25
+ openai-whisper==20250625
26
+
27
  # OCR (OPTIONAL, disabled)
28
  # pytesseract==0.3.13
29
+
30
# Additional for LocalAgent (optional)
# NOTE(review): requirements.txt takes bare requirement specifiers —
# "!pip install" is notebook shell syntax and makes pip fail on this file.
transformers==4.1.0  # TODO confirm — 4.1.0 (2020) predates Qwen2.5 support; likely meant 4.41.0
bitsandbytes==0.49.0  # TODO confirm this version exists on PyPI
--extra-index-url https://download.pytorch.org/whl/cu124
torch==2.6.0+cu124
torchvision
torchaudio
tools/audio_tools.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import tool
2
+ import tempfile
3
+ import os
4
+
5
@tool
def youtube_to_text(url: str) -> str:
    """
    Transcribe a YouTube video.
    First tries to retrieve official captions.
    Falls back to Whisper transcription if captions are unavailable.

    Args:
        url: Full YouTube video URL (watch?v=... or youtu.be/... form)

    Returns:
        Transcribed text, or an error message string on failure.
    """
    # ---- Step 1: Try official YouTube transcripts ----
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
        from urllib.parse import urlparse, parse_qs

        # Bug fix: `orlparse` was a typo (NameError) that silently sent every
        # call down the Whisper fallback path.
        parsed = urlparse(url)
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id is None and parsed.hostname == "youtu.be":
            # Also accept short links of the form https://youtu.be/<id>
            video_id = parsed.path.lstrip("/") or None

        if video_id:
            try:
                # youtube-transcript-api >= 1.0 (the pinned 1.2.3) removed the
                # static methods; use the instance API.
                fetched = YouTubeTranscriptApi().fetch(video_id)
                return " ".join(snippet.text for snippet in fetched)
            except AttributeError:
                # Older releases (< 1.0) only offer the static API.
                transcript = YouTubeTranscriptApi.get_transcript(video_id)
                return " ".join(chunk["text"] for chunk in transcript)
    except Exception:
        pass  # Silent fallback to Whisper

    # ---- Step 2: Fallback to Whisper transcription ----
    try:
        import whisper
        from pytubefix import YouTube

        yt = YouTube(url)
        audio_stream = yt.streams.get_audio_only()

        temp_dir = tempfile.gettempdir()
        audio_path = audio_stream.download(output_path=temp_dir)

        model = whisper.load_model("base")
        result = model.transcribe(audio_path)

        return result["text"]

    except Exception as e:
        return f"Error transcribing YouTube video: {str(e)}"
51
+
52
@tool
def transcribe_audio(file_path: str) -> str:
    """
    Convert a local audio file to text with OpenAI's Whisper ("base" model).
    Handles common formats such as .mp3, .wav, .m4a, .flac, and .ogg.

    Args:
        file_path: The local path to the audio file to be transcribed.

    Returns:
        The transcribed text, or an explanatory error message string.
    """
    try:
        import whisper

        # Load the (small, CPU-friendly) base model and run transcription.
        speech_model = whisper.load_model("base")
        transcription = speech_model.transcribe(file_path)
        return transcription["text"]
    except ImportError:
        return (
            "Whisper is not installed. "
            "Install it with `pip install openai-whisper` and ensure ffmpeg is available."
        )
    except Exception as e:
        return f"Error transcribing audio file: {str(e)}"