GAIA-Agent

Sleeping

App Files Files Community

Mikkel Skovdal committed on Apr 29, 2025

Commit

f0eb1da

1 Parent(s): 7d72891

audio and image support

Browse files

Files changed (13) hide show

.DS_Store +0 -0
.gitignore +1 -1
app.py +14 -29
requirements.txt +4 -1
src/.DS_Store +0 -0
src/__pycache__/agent.cpython-310.pyc +0 -0
src/__pycache__/api_client.cpython-310.pyc +0 -0
src/__pycache__/tools.cpython-310.pyc +0 -0
src/agent.py +78 -68
src/api_client.py +30 -9
src/temp/.DS_Store +0 -0
src/tools.py +43 -3
test.json +6 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.gitignore CHANGED Viewed

@@ -1,5 +1,5 @@
 secret
 test.ipynb
 .env
-error.txt
 log.txt

 secret
 test.ipynb
 .env
 log.txt
+src/temp/files/

app.py CHANGED Viewed

@@ -12,25 +12,6 @@ from src.api_client import ApiClient
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-def save_file(file_content, task_id):
-    """
-    Save a task file to a temporary location
-    """
-    if not file_content:
-        return None
-    # Create a temporary file
-    temp_dir = tempfile.gettempdir()
-    file_path = os.path.join(temp_dir, f"task_{task_id}.txt")
-    # Write content to the file
-    with open(file_path, "wb") as f:
-        f.write(file_content)
-    print(f"File saved to {file_path}")
-    return file_path
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the CustomAgent on them, submits all answers,
@@ -49,7 +30,6 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     # 1. Instantiate Agent ( modify this part to create your agent)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
-    # Initialize Agent with configuration
     try:
         agent_config = get_config()
         print(f"Using agent configuration: {agent_config}")
@@ -83,6 +63,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
@@ -95,13 +78,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             )
             # Check if the question has an associated file
-            file_path = None
-            try:
-                file_content = api_client.get_file(task_id)
-                print(f"Downloaded file for task {task_id}")
-                file_path = save_file(file_content, task_id)
-            except Exception as file_e:
-                print(f"No file for task {task_id}")
             # Run the agent to get the answer
             submitted_answer = agent.forward(question_text, file_path)
@@ -134,7 +115,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 }
             )
         finally:
-            time.sleep(agent_config.get("sleep", 60))  # Delay between requests
     # Print summary
     print(f"\nProcessing complete: {completed} questions processed, {failed} failures")
@@ -143,12 +125,15 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     # Store results in a log file
-    log_file_path = "logging/log.txt"
     os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
     timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
     with open(log_file_path, "a") as log_file:
         for entry in results_log:
             log_file.write(f"{timestamp} - {entry}\n")
     # 4. Prepare Submission
     print(f"Submitting {len(answers_payload)} answers for username '{username}'...")

 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the CustomAgent on them, submits all answers,
     # 1. Instantiate Agent ( modify this part to create your agent)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
     try:
         agent_config = get_config()
         print(f"Using agent configuration: {agent_config}")
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
+        file_name = item.get("file_name")
+        file_path = None
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
             )
             # Check if the question has an associated file
+            if file_name:
+                try:
+                    file_path = api_client.get_file(task_id=task_id, file_name=file_name)
+                except Exception as file_e:
+                    print(f"Failed to download the file for task {task_id} - {file_e}")
             # Run the agent to get the answer
             submitted_answer = agent.forward(question_text, file_path)
                 }
             )
         finally:
+            if completed+failed < total_questions:
+                time.sleep(55)
     # Print summary
     print(f"\nProcessing complete: {completed} questions processed, {failed} failures")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     # Store results in a log file
+    log_file_path = "src/temp/log.txt"
     os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
     timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
     with open(log_file_path, "a") as log_file:
         for entry in results_log:
             log_file.write(f"{timestamp} - {entry}\n")
+        log_file.write(
+            f"{timestamp} - Summary: {completed} questions processed, {failed} failures\n\n"
+        )
     # 4. Prepare Submission
     print(f"Submitting {len(answers_payload)} answers for username '{username}'...")

requirements.txt CHANGED Viewed

@@ -2,4 +2,7 @@ gradio
 requests
 smolagents
 smolagents[litellm]
-ratelimiter

 requests
 smolagents
 smolagents[litellm]
+ratelimiter
+youtube-transcript-api
+SpeechRecognition
+pydub

src/.DS_Store CHANGED Viewed

Binary files a/src/.DS_Store and b/src/.DS_Store differ

src/__pycache__/agent.cpython-310.pyc CHANGED Viewed

Binary files a/src/__pycache__/agent.cpython-310.pyc and b/src/__pycache__/agent.cpython-310.pyc differ

src/__pycache__/api_client.cpython-310.pyc CHANGED Viewed

Binary files a/src/__pycache__/api_client.cpython-310.pyc and b/src/__pycache__/api_client.cpython-310.pyc differ

src/__pycache__/tools.cpython-310.pyc ADDED Viewed

Binary file (1.1 kB). View file

src/agent.py CHANGED Viewed

@@ -1,15 +1,23 @@
-from smolagents import CodeAgent, LiteLLMModel, DuckDuckGoSearchTool, PythonInterpreterTool
-import datetime
 import os
-import time
-from typing import List, Optional
 from dotenv import load_dotenv
-load_dotenv()
 SYSTEM_PROMPT = """
-You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.\n
 """
@@ -30,28 +38,41 @@ class CustomAgent:
         """
         self.logging = logging
         self.verbose = verbose
-        self.imports = ["pandas", "numpy", "datetime", "json", "re", "math", "os", "requests", "csv", "urllib"]
         if additional_imports:
             self.imports.extend(additional_imports)
-                # Initialize tools
         self.tools = [
             DuckDuckGoSearchTool(),
             PythonInterpreterTool(),
-            # save_and_read_file,
-            # download_file_from_url,
-            # analyze_csv_file,
-            # analyze_excel_file
         ]
         # Initialize the model
         model = LiteLLMModel(
-            model_id=model_id,
             api_key=os.getenv("GEMINI_API_KEY"),
             timeout=timeout,
         )
-        # Initialize the CodeAgent
         self.agent = CodeAgent(
             model=model,
             tools=self.tools,
@@ -64,14 +85,10 @@ class CustomAgent:
             print("CustomAgent initialized.")
     def __call__(self, question: str) -> str:
-        if self.verbose:
-            print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             full_prompt = f"""{question}
-            When answering, provide ONLY the precise answer requested.
-            Do not include explanations, steps, reasoning, or additional text.
-            Be direct and specific. GAIA benchmark requires exact matching answers.
-            For example, if asked "What is the capital of France?", respond simply with "Paris".
             """
             answer = self.agent.run(full_prompt)
             answer = self._clean_answer(answer)
@@ -82,39 +99,34 @@ class CustomAgent:
             if self.verbose:
                 print(error_msg)
             return error_msg
     def forward(self, question: str, file_path) -> str:
-        if self.verbose:
-            print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
-            context = f"Question: {question}"
-            # If there's a file, read it and include its content in the context
             if file_path:
-                try:
-                    with open(file_path, 'r') as f:
-                        file_content = f.read()
-                    # Determine file type from extension
-                    import os
-                    file_ext = os.path.splitext(file_path)[1].lower()
-                    context = f"""
-                    Question: {question}
-                    This question has an associated file. Here is the file content:
-                    ```{file_ext}
-                    {file_content}
-                    ```
-                    Analyze the file content above to answer the question.
-                    """
-                except Exception as _:
-                    print(f"Error reading file {file_path}. Proceeding without file content.")
-            full_prompt = f"""{context}
-            When answering, provide ONLY the precise answer requested.
-            Do not include explanations, steps, reasoning, or additional text.
-            Be direct and specific. GAIA benchmark requires exact matching answers.
-            For example, if asked "What is the capital of France?", respond simply with "Paris".
-            """
-            answer = self.agent.run(full_prompt)
             answer = self._clean_answer(answer)
             return answer
@@ -123,27 +135,24 @@ class CustomAgent:
             if self.verbose:
                 print(error_msg)
             return error_msg
     def _clean_answer(self, answer: any) -> str:
         """
         Clean up the answer to remove common prefixes and formatting
         that models often add but that can cause exact match failures.
         Args:
             answer: The raw answer from the model
         Returns:
             The cleaned answer as a string
         """
         # Convert non-string types to strings
         if not isinstance(answer, str):
-            # Handle numeric types (float, int)
             if isinstance(answer, float):
-                # Format floating point numbers properly
                 if answer.is_integer():
                     formatted_answer = str(int(answer))
                 else:
-                    # For currency values that might need formatting
                     if abs(answer) >= 1000:
                         formatted_answer = f"${answer:,.2f}"
                     else:
@@ -153,13 +162,13 @@ class CustomAgent:
                 return str(answer)
             else:
                 return str(answer)
         # Normalize whitespace
         answer = answer.strip()
         # Remove common prefixes and formatting that models add
         prefixes_to_remove = [
-            "The answer is ",
             "Answer: ",
             "Final answer: ",
             "The result is ",
@@ -169,12 +178,14 @@ class CustomAgent:
         ]
         for prefix in prefixes_to_remove:
             if answer.startswith(prefix):
-                answer = answer[len(prefix):].strip()
         # Remove quotes if they wrap the entire answer
-        if (answer.startswith('"') and answer.endswith('"')) or (answer.startswith("'") and answer.endswith("'")):
             answer = answer[1:-1].strip()
         return answer
@@ -184,13 +195,12 @@ def get_config():
     """
     # Default configuration
     config = {
-        "model_id": "models/gemini-2.5-flash-preview-04-17",
         "logging": False,
         "max_steps": 5,
         "verbose": False,
         "executor_type": "local",
-        "timeout": 120 ,
-        "sleep" : 60,
     }
     return config

+from smolagents import (
+    CodeAgent,
+    LiteLLMModel,
+    DuckDuckGoSearchTool,
+    PythonInterpreterTool,
+    VisitWebpageTool,
+)
+from src.tools import extract_text_from_youtube
+from PIL import Image
 import os
+from typing import List
 from dotenv import load_dotenv
+load_dotenv()
 SYSTEM_PROMPT = """
+When answering, provide ONLY the precise answer requested.
+Do not include explanations, steps, reasoning, or additional text.
+Be direct and specific. GAIA benchmark requires exact matching answers.
+For example, if asked "What is the capital of France?", respond simply with "Paris".
 """
         """
         self.logging = logging
         self.verbose = verbose
+        self.imports = [
+            "pandas",
+            "numpy",
+            "io",
+            "datetime",
+            "json",
+            "re",
+            "math",
+            "os",
+            "requests",
+            "csv",
+            "urllib",
+            "youtube-transcript-api",
+            "SpeechRecognition",
+            "pydub",
+        ]
         if additional_imports:
             self.imports.extend(additional_imports)
+        # Initialize tools
         self.tools = [
             DuckDuckGoSearchTool(),
             PythonInterpreterTool(),
+            VisitWebpageTool(),
+            extract_text_from_youtube,
         ]
         # Initialize the model
         model = LiteLLMModel(
+            model_id=model_id,
             api_key=os.getenv("GEMINI_API_KEY"),
             timeout=timeout,
         )
+        # Initialize the CodeAgent
         self.agent = CodeAgent(
             model=model,
             tools=self.tools,
             print("CustomAgent initialized.")
     def __call__(self, question: str) -> str:
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             full_prompt = f"""{question}
+            {SYSTEM_PROMPT}
             """
             answer = self.agent.run(full_prompt)
             answer = self._clean_answer(answer)
             if self.verbose:
                 print(error_msg)
             return error_msg
     def forward(self, question: str, file_path) -> str:
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
+            full_prompt = f"""Question: {question}
+            {SYSTEM_PROMPT}"""
             if file_path:
+                file_path_ext = os.path.splitext(file_path)[1]
+                if file_path_ext.lower() in [".jpg", ".jpeg", ".png"]:
+                    image = Image.open(file_path).convert("RGB")
+                    answer = self.agent.run(full_prompt, images=[image])
+                elif file_path_ext.lower() in [".txt", ".py"]:
+                    with open(file_path, "r") as f:
+                        content = f.read()
+                    full_prompt = f"""Question: {question}
+                    File content: ```{content}```
+                    {SYSTEM_PROMPT}"""
+                    answer = self.agent.run(full_prompt)
+                else:
+                    full_prompt = f"""Question: {question}
+                    File path: {file_path}
+                    {SYSTEM_PROMPT}"""
+                    answer = self.agent.run(full_prompt)
+            else:
+                answer = self.agent.run(full_prompt)
             answer = self._clean_answer(answer)
             return answer
             if self.verbose:
                 print(error_msg)
             return error_msg
     def _clean_answer(self, answer: any) -> str:
         """
         Clean up the answer to remove common prefixes and formatting
         that models often add but that can cause exact match failures.
         Args:
             answer: The raw answer from the model
         Returns:
             The cleaned answer as a string
         """
         # Convert non-string types to strings
         if not isinstance(answer, str):
             if isinstance(answer, float):
                 if answer.is_integer():
                     formatted_answer = str(int(answer))
                 else:
                     if abs(answer) >= 1000:
                         formatted_answer = f"${answer:,.2f}"
                     else:
                 return str(answer)
             else:
                 return str(answer)
         # Normalize whitespace
         answer = answer.strip()
         # Remove common prefixes and formatting that models add
         prefixes_to_remove = [
+            "The answer is ",
             "Answer: ",
             "Final answer: ",
             "The result is ",
         ]
         for prefix in prefixes_to_remove:
             if answer.startswith(prefix):
+                answer = answer[len(prefix) :].strip()
         # Remove quotes if they wrap the entire answer
+        if (answer.startswith('"') and answer.endswith('"')) or (
+            answer.startswith("'") and answer.endswith("'")
+        ):
             answer = answer[1:-1].strip()
         return answer
     """
     # Default configuration
     config = {
+        "model_id": "gemini/gemini-2.5-flash-preview-04-17",
         "logging": False,
         "max_steps": 5,
         "verbose": False,
         "executor_type": "local",
+        "timeout": 120,
     }
     return config

src/api_client.py CHANGED Viewed

@@ -3,35 +3,56 @@ import os
 import datetime
 from typing import List, Dict, Any
 class ApiClient:
     def __init__(self, api_url="https://agents-course-unit4-scoring.hf.space"):
         self.api_url = api_url
         self.questions_url = f"{api_url}/questions"
         self.submit_url = f"{api_url}/submit"
         self.files_url = f"{api_url}/files"
     def get_questions(self, limit=20) -> List[Dict[str, Any]]:
         limit = min(limit, 20)
         limit = max(limit, 1)
         response = requests.get(self.questions_url)
         response.raise_for_status()
         return response.json()[:limit]
     def get_random_question(self) -> Dict[str, Any]:
         response = requests.get(f"{self.api_url}/random-question")
         response.raise_for_status()
         return response.json()
-    def get_file(self, task_id: str) -> bytes:
-        response = requests.get(f"{self.files_url}/{task_id}")
         response.raise_for_status()
-        return response.content
-    def submit_answers(self, username: str, agent_code: str, answers_payload: List[Dict[str, Any]], logging: bool) -> Dict[str, Any]:
         data = {
             "username": username,
             "agent_code": agent_code,
-            "answers": answers_payload
         }
         response = requests.post(self.submit_url, json=data)
         response.raise_for_status()

 import datetime
 from typing import List, Dict, Any
 class ApiClient:
     def __init__(self, api_url="https://agents-course-unit4-scoring.hf.space"):
         self.api_url = api_url
         self.questions_url = f"{api_url}/questions"
         self.submit_url = f"{api_url}/submit"
         self.files_url = f"{api_url}/files"
     def get_questions(self, limit=20) -> List[Dict[str, Any]]:
         limit = min(limit, 20)
         limit = max(limit, 1)
         response = requests.get(self.questions_url)
         response.raise_for_status()
         return response.json()[:limit]
     def get_random_question(self) -> Dict[str, Any]:
         response = requests.get(f"{self.api_url}/random-question")
         response.raise_for_status()
         return response.json()
+    def get_file(self, task_id, file_name: str) -> bytes:
+        # check if file already exists
+        file_path = os.path.join("src/temp/files", file_name)
+        if os.path.exists(file_path):
+            return file_path
+        # Download the file
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        response = requests.get(f"{self.files_url}/{task_id}", stream=True)
         response.raise_for_status()
+        # Save the file
+        with open(file_path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+            print(f"File saved to {file_path}")
+        return file_path
+    def submit_answers(
+        self,
+        username: str,
+        agent_code: str,
+        answers_payload: List[Dict[str, Any]],
+        logging: bool,
+    ) -> Dict[str, Any]:
         data = {
             "username": username,
             "agent_code": agent_code,
+            "answers": answers_payload,
         }
         response = requests.post(self.submit_url, json=data)
         response.raise_for_status()

src/temp/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/tools.py CHANGED Viewed

@@ -2,8 +2,48 @@ from smolagents import tool
 @tool
-def test():
     """
-    Test function that returns a string.
     """
-    return "This is a test function."

 @tool
+def extract_text_from_mp3(mp3_file_path: str) -> str:
     """
+    Extract text from an mp3 audio file.
+    Args:
+        mp3_file_path (str): Path to the mp3 file.
+    Returns:
+        str: The text extracted from the mp3 file.
+    """
+    try:
+        import speech_recognition as sr
+        from pydub import AudioSegment
+        audio = AudioSegment.from_mp3(mp3_file_path)
+        wav_file = f"{mp3_file_path}.wav"
+        audio.export(wav_file, format="wav")
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(wav_file) as source:
+            audio_data = recognizer.record(source)
+            text = recognizer.recognize_google(audio_data)
+        return text
+    except Exception as e:
+        return f"Could not extract text from mp3 file: {e}"
+@tool
+def extract_text_from_youtube(youtube_id: str) -> str:
     """
+    Extract text from a youtube video.
+    Args:
+        youtube_id (str): ID of the youtube video. Not the full URL. Example: "dQw4w9WgXcQ"
+    Returns:
+        str: The text extracted from the youtube video.
+    """
+    try:
+        from youtube_transcript_api import YouTubeTranscriptApi
+        ytt_api = YouTubeTranscriptApi()
+        fetched_transcript = ytt_api.fetch(youtube_id)
+        plaintext = " ".join(snippet.text for snippet in fetched_transcript)
+        return plaintext
+    except:
+        return "Could not fetch transcript from YouTube video."

test.json ADDED Viewed

	@@ -0,0 +1,6 @@

+[
+    {'task_id': '8e867cd7-cff9-4e6c-867a-ff5ddc2550be', 'question': 'How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.', 'Level': '1', 'file_name': ''},
+    {'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6', 'question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?', 'Level': '1', 'file_name': ''},
+    {'task_id': '2d83110e-a098-4ebb-9987-066c06fa42d0', 'question': '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI', 'Level': '1', 'file_name': ''},
+    {'task_id': 'cca530fc-4052-43b2-b130-b30968d8aa44', 'question': "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.", 'Level': '1', 'file_name': 'cca530fc-4052-43b2-b130-b30968d8aa44.png'},
+    {'task_id': '4fc2f1ae-8625-45b5-ab34-ad4433bc21f8', 'question': 'Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?', 'Level': '1', 'file_name': ''}]