Francesco-A committed on
Commit
5fbd0a0
·
1 Parent(s): 34a788d

added genai + double agent logic

Browse files

Multiple Gemini agents:
1 - heavy tasks (video/image analysis)
2 - lighter tasks

Files changed (3) hide show
  1. agent.py +73 -69
  2. requirements.txt +7 -0
  3. tools/gemini_native_tools.py +62 -0
agent.py CHANGED
@@ -1,28 +1,31 @@
 
1
 
2
  # Generic agent
3
  import os
4
  from typing import Optional
5
  import pandas as pd
6
 
 
 
 
 
7
  # Smolagents imports
8
  from smolagents import (
9
  CodeAgent,
10
  InferenceClientModel,
11
  TransformersModel,
12
  LiteLLMModel,
13
- Tool,
14
- tool,
15
  DuckDuckGoSearchTool,
16
  VisitWebpageTool,
17
- WikipediaSearchTool,
18
  PythonInterpreterTool,
19
  FinalAnswerTool,
20
  )
21
 
22
  # Import your custom tools (to be used in app, not in local notebook)
23
- from tools.download_file import download_file_from_url
24
- from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
25
- from tools.audio_tools import youtube_to_text, transcribe_audio
 
26
 
27
  # Define tools
28
  AGENT_TOOLS = [
@@ -39,10 +42,24 @@ AGENT_TOOLS = [
39
  image_to_text, # OCR for images
40
  youtube_to_text, # Youtube audio to text
41
  transcribe_audio, # Audio file to text
 
 
 
 
 
 
 
42
  ]
43
 
44
- # System prompt
45
- SYSTEM_PROMPT = """
 
 
 
 
 
 
 
46
  You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
47
 
48
  ### 1. Reason-Act-Observe
@@ -52,29 +69,15 @@ Follow a **PLAN → ACT → OBSERVE** loop:
52
  - **OBSERVE:** Examine outputs or errors before proceeding.
53
 
54
  ### 2. File Handling
55
- - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
56
-
57
- You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
58
-
59
- | File Type / Source | Tool / Method to Use |
60
- | :--- | :--- |
61
- | `.csv` | `pd.read_csv(filepath)` |
62
- | `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
63
- | `.pdf` | `pdf_to_text(filepath)` |
64
- | `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
65
- | `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
66
- | **YouTube URL** | `youtube_to_text(url)` |
67
- | `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
68
 
69
  **Important rules:**
70
- - When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
71
  - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
72
- - For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
73
 
74
  ### 3. Data Analysis & Answer
75
  - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
76
  - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
77
- - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
78
 
79
  ### 4. Additional instructions for the following tasks provided by GAIA team
80
  - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
@@ -92,28 +95,44 @@ final_answer("FINAL ANSWER: The capital of France is Paris")
92
  \n\n
93
  """
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  class BasicAgent:
96
  def __init__(self):
97
- self.system_prompt = SYSTEM_PROMPT
98
  self.model = InferenceClientModel(
99
  model_id = "Qwen/Qwen3-Next-80B-A3B-Thinking",
100
  temperature = 0.0,
101
  top_p = 1.0,
102
  max_tokens = 8196,
103
  )
104
- self.tools = AGENT_TOOLS
105
  self.basic_agent = CodeAgent(
106
  name = "basic_agent",
107
  description = "Basic smolagents CodeAgent",
108
  model = self.model,
109
- tools = self.tools,
110
  add_base_tools = True, # probably redundant, but it does not hurt
111
  max_steps = 5,
112
- additional_authorized_imports = [
113
- 'numpy','subprocess', 're', 'pandas',
114
- 'json', 'os', 'datetime', 'tempfile',
115
- 'requests', 'markdownify'
116
- ],
117
  verbosity_level = 1,
118
  max_print_outputs_length=1_000_000
119
  )
@@ -121,39 +140,39 @@ class BasicAgent:
121
  print("✅ Basic agent initialized")
122
 
123
  def __call__(self, question: str, file_path: Optional[str] = None) -> str:
124
-
125
  if file_path:
126
- # Inject system prompt + question and (optional) file path
127
- prompt = (
128
- f"{self.system_prompt}\n\n"
129
- f"Question: {question}\n\n"
130
- f"There is an associated file at path: {file_path}.\n"
131
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
132
- )
133
- else:
134
- prompt = (
135
- f"{self.system_prompt}\n\n"
136
- f"Question: {question}\n\n"
137
- )
138
-
139
  return self.basic_agent.run(prompt)
140
 
141
  class GeminiAgent:
142
- def __init__(self):
143
- self.system_prompt = SYSTEM_PROMPT
 
 
 
 
 
 
 
144
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
145
  if not GOOGLE_API_KEY:
146
  raise RuntimeError(
147
  "GOOGLE_API_KEY not found."
148
  )
149
  self.model = LiteLLMModel(
150
- model_id = "gemini/gemini-2.5-flash-lite",
151
  api_key = GOOGLE_API_KEY,
152
  temperature = 0.0,
153
  top_p = 1.0,
154
  max_tokens = 8196,
 
155
  )
156
- self.tools = AGENT_TOOLS
 
 
 
 
157
  self.gemini_agent = CodeAgent(
158
  name = "gemini_agent",
159
  description = "Gemini CodeAgent",
@@ -161,11 +180,7 @@ class GeminiAgent:
161
  tools = self.tools,
162
  add_base_tools = True, # probably redundant, but it does not hurt
163
  max_steps = 8,
164
- additional_authorized_imports = [
165
- 'numpy','subprocess', 're', 'pandas',
166
- 'json', 'os', 'datetime', 'tempfile',
167
- 'requests', 'markdownify',
168
- ],
169
  verbosity_level = 1,
170
  max_print_outputs_length=1_000_000
171
  )
@@ -173,19 +188,8 @@ class GeminiAgent:
173
  print("✅ Gemini agent initialized")
174
 
175
  def __call__(self, question: str, file_path: Optional[str] = None) -> str:
176
-
177
  if file_path:
178
- # Inject system prompt + question and (optional) file path
179
- prompt = (
180
- f"{self.system_prompt}\n\n"
181
- f"Question: {question}\n\n"
182
- f"There is an associated file at path: {file_path}.\n"
183
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
184
- )
185
- else:
186
- prompt = (
187
- f"{self.system_prompt}\n\n"
188
- f"Question: {question}\n\n"
189
- )
190
 
191
- return self.gemini_agent.run(prompt)
 
1
+ # pip install smolagents python-chess stockfish pandas numpy requests markdownify
2
 
3
  # Generic agent
4
  import os
5
  from typing import Optional
6
  import pandas as pd
7
 
8
+ # Genai imports
9
+ from google import genai
10
+ from google.genai import types
11
+
12
  # Smolagents imports
13
  from smolagents import (
14
  CodeAgent,
15
  InferenceClientModel,
16
  TransformersModel,
17
  LiteLLMModel,
 
 
18
  DuckDuckGoSearchTool,
19
  VisitWebpageTool,
 
20
  PythonInterpreterTool,
21
  FinalAnswerTool,
22
  )
23
 
24
  # Import your custom tools (to be used in app, not in local notebook)
25
+ from tools.gemini_native_tools import analyze_video, analyze_image, analyze_audio
26
+ from tools.download_file import download_file_from_url
27
+ from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
28
+ from tools.audio_tools import youtube_to_text, transcribe_audio
29
 
30
  # Define tools
31
  AGENT_TOOLS = [
 
42
  image_to_text, # OCR for images
43
  youtube_to_text, # Youtube audio to text
44
  transcribe_audio, # Audio file to text
45
+ ]
46
+
47
# Gemini-only tools (native multimodal helpers from tools/gemini_native_tools.py)
NATIVE_TOOLS = [
    analyze_video,
    analyze_image,
    analyze_audio,
]

# Imports the CodeAgent sandbox is allowed to execute.
# NOTE(review): 'os' and 'subprocess' are not in this list — presumably an
# intentional sandboxing choice; confirm before re-adding them.
AUTHORIZED_IMPORTS = [
    'numpy', 're', 'pandas', 'json', 'datetime',
    'tempfile', 'requests', 'markdownify', 'chess.*',
]
59
+
60
+ # --- SYSTEM PROMPT TEMPLATE ---
61
+ # The {} placeholder will be filled differently for Basic vs Gemini (Native)
62
+ SYSTEM_PROMPT_TEMPLATE = """
63
  You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
64
 
65
  ### 1. Reason-Act-Observe
 
69
  - **OBSERVE:** Examine outputs or errors before proceeding.
70
 
71
  ### 2. File Handling
72
+ {file_handling_instructions}
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  **Important rules:**
75
+ - Whenever you are given a file path (or URL), you **must ABSOLUTELY store it in a variable first** (e.g. `filepath`) and pass that variable directly to the next tool. **NEVER** try to write the path yourself in the function.
76
  - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
 
77
 
78
  ### 3. Data Analysis & Answer
79
  - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
80
  - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
 
81
 
82
  ### 4. Additional instructions for the following tasks provided by GAIA team
83
  - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
 
95
  \n\n
96
  """
97
 
98
# Instruction for Tool-Based Agents (BasicAgent and Gemini-Standard).
# Substituted into SYSTEM_PROMPT_TEMPLATE's {file_handling_instructions} slot.
TOOL_BASED_INSTRUCTIONS = """
You must select the reading or transcription method **strictly** based on the file type:
| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a` | `transcribe_audio(filepath)` |
"""

# Instruction for Native Gemini (no OCR/transcription tools for media).
# Alternative filler for the same {file_handling_instructions} slot, used
# when GeminiAgent runs with native_multimodal=True.
NATIVE_MEDIA_INSTRUCTIONS = """
You have **native vision and audio capabilities**.
- For **Images (.png, .jpg) and Audio/Video**: Do NOT use external tools like `image_to_text`. You can see and hear these files directly. Analyze them using your internal multimodal capabilities.
- For **Data/Text files**: Continue using tools like `pd.read_csv(filepath)` or `text_file_to_string(filepath)`.
"""
118
+
class BasicAgent:
    """smolagents CodeAgent backed by a hosted Qwen model.

    Uses the tool-based file-handling prompt (TOOL_BASED_INSTRUCTIONS):
    every file type is routed through an explicit reading tool.
    """

    def __init__(self):
        # Fill the shared prompt template with the tool-based file rules.
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            file_handling_instructions=TOOL_BASED_INSTRUCTIONS
        )
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,
        )
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=AGENT_TOOLS,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Basic agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Build the prompt (optionally noting an attached file) and run the agent."""
        parts = [f"{self.system_prompt}\n\nQuestion: {question}"]
        if file_path:
            parts.append(f"\nFile path: {file_path}")
        return self.basic_agent.run("".join(parts))
147
 
148
class GeminiAgent:
    """Gemini-backed smolagents CodeAgent.

    With ``native_multimodal=True`` (default) the agent uses Gemini's native
    media tools (NATIVE_TOOLS) and drops the local OCR / transcription tools;
    otherwise it uses the same toolset and file-handling rules as BasicAgent.
    """

    def __init__(self, native_multimodal: bool = True, model_id: str = "gemini/gemini-2.5-flash-lite"):
        self.native_multimodal = native_multimodal

        # Fail fast: validate the key before constructing anything that
        # depends on it.  (Fix: the previous revision created an unused local
        # genai.Client *before* this check, silently passing api_key=None.)
        GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
        if not GOOGLE_API_KEY:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )

        # Switch prompt based on the native_multimodal flag
        INSTRUCTIONS = NATIVE_MEDIA_INSTRUCTIONS if native_multimodal else TOOL_BASED_INSTRUCTIONS
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(file_handling_instructions=INSTRUCTIONS)

        self.model = LiteLLMModel(
            model_id=model_id,
            api_key=GOOGLE_API_KEY,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,
            timeout=120,  # prevent hanging requests
        )

        # If native, remove the local media tools so the agent is not tempted
        # to OCR/transcribe files Gemini can consume directly.
        if self.native_multimodal:
            self.tools = NATIVE_TOOLS + [
                t for t in AGENT_TOOLS
                if t not in [image_to_text, youtube_to_text, transcribe_audio]
            ]
        else:
            self.tools = AGENT_TOOLS

        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=8,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Gemini agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Build the prompt (optionally noting an attached file) and run the agent."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}"
        if file_path:
            prompt += f"\n\nThere is a file at: {file_path}. Use your tools to process it."
        return self.gemini_agent.run(prompt)
requirements.txt CHANGED
@@ -26,5 +26,12 @@ youtube-transcript-api==1.2.3
26
  pytubefix==10.3.6
27
  openai-whisper==20250625
28
 
 
 
 
 
 
 
 
29
  # OCR (OPTIONAL, disabled)
30
  # pytesseract==0.3.13
 
26
  pytubefix==10.3.6
27
  openai-whisper==20250625
28
 
29
+ # Chess
30
+ chess==1.11.2
31
+ stockfish==4.0.5
32
+
33
+ # Google genai
34
+ google-genai==1.57.0
35
+
36
  # OCR (OPTIONAL, disabled)
37
  # pytesseract==0.3.13
tools/gemini_native_tools.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Fix: this module used os.environ.get without importing os, which raised
# NameError the moment the module was imported.
import os

from smolagents import tool
from google import genai
from google.genai import types

# Initialize client once (module-level, shared by every tool below).
# NOTE(review): genai.Client accepts api_key=None here; if GOOGLE_API_KEY is
# unset, failures surface later on the first API call rather than at import.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
7
+
@tool
def analyze_video(video_source: str, question: str) -> str:
    """
    Analyzes a video (YouTube URL or local file path) to answer a specific question.
    Args:
        video_source: The YouTube URL or the local path to the video file.
        question: The question you want to ask about the video content.
    """
    # YouTube links can be referenced by URI directly; anything else is
    # treated as a local file and pushed through the File API (kept ~48h).
    if any(host in video_source for host in ("youtube.com", "youtu.be")):
        file_uri = video_source
    else:
        file_uri = client.files.upload(file=video_source).uri
    video_part = types.Part(file_data=types.FileData(file_uri=file_uri))

    # Ask Gemini about the video content.
    result = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[video_part, question],
    )
    return result.text
30
+
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Uses native vision to analyze an image file and answer questions about it.
    Args:
        image_path: Path to the image file (jpg, png, webp).
        question: What you want to know about the image.
    """
    # Upload via the File API, then pass the handle straight to the model.
    handle = client.files.upload(file=image_path)
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[handle, question],
    )
    return reply.text
45
+
@tool
def analyze_audio(audio_path: str, question: str) -> str:
    """
    Analyzes audio files (mp3, wav) to transcribe or answer questions about content and tone.
    Args:
        audio_path: Path to the audio file.
        question: The question or instruction (e.g., 'Summarize the mood' or 'Transcribe this').
    """
    # Same pattern as analyze_image: upload, then query the model directly.
    handle = client.files.upload(file=audio_path)
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[handle, question],
    )
    return reply.text
60
+
61
+
62
+ # approach inspired by: https://huggingface.co/spaces/DeekshithN05/Final_Assignment_Template/blob/main/agent.py