Francesco-A committed on
Commit
5fbd0a0
·
1 Parent(s): 34a788d

added genai + double agent logic

Browse files

Multiple Gemini agents:
1 - heavy tasks (video/image analysis)
2 - lighter tasks

Files changed (3) hide show
  1. agent.py +73 -69
  2. requirements.txt +7 -0
  3. tools/gemini_native_tools.py +62 -0
agent.py CHANGED
@@ -1,28 +1,31 @@
 
1
 
2
  # Generic agent
3
  import os
4
  from typing import Optional
5
  import pandas as pd
6
 
 
 
 
 
7
  # Smolagents imports
8
  from smolagents import (
9
  CodeAgent,
10
  InferenceClientModel,
11
  TransformersModel,
12
  LiteLLMModel,
13
- Tool,
14
- tool,
15
  DuckDuckGoSearchTool,
16
  VisitWebpageTool,
17
- WikipediaSearchTool,
18
  PythonInterpreterTool,
19
  FinalAnswerTool,
20
  )
21
 
22
  # Import your custom tools (to be used in app, not in local notebook)
23
- from tools.download_file import download_file_from_url
24
- from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
25
- from tools.audio_tools import youtube_to_text, transcribe_audio
 
26
 
27
  # Define tools
28
  AGENT_TOOLS = [
@@ -39,10 +42,24 @@ AGENT_TOOLS = [
39
  image_to_text, # OCR for images
40
  youtube_to_text, # Youtube audio to text
41
  transcribe_audio, # Audio file to text
 
 
 
 
 
 
 
42
  ]
43
 
44
- # System prompt
45
- SYSTEM_PROMPT = """
 
 
 
 
 
 
 
46
  You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
47
 
48
  ### 1. Reason-Act-Observe
@@ -52,29 +69,15 @@ Follow a **PLAN → ACT → OBSERVE** loop:
52
  - **OBSERVE:** Examine outputs or errors before proceeding.
53
 
54
  ### 2. File Handling
55
- - When a tool like `download_file_from_url` returns a local file path (e.g., `/tmp/data.csv`), you **MUST** save this path to a descriptive variable (e.g., `filepath`) and **immediately use that variable** as the argument for the next file-reading tool.
56
-
57
- You must select the reading or transcription method **strictly** based on the file type or source, following the rules below.
58
-
59
- | File Type / Source | Tool / Method to Use |
60
- | :--- | :--- |
61
- | `.csv` | `pd.read_csv(filepath)` |
62
- | `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
63
- | `.pdf` | `pdf_to_text(filepath)` |
64
- | `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
65
- | `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
66
- | **YouTube URL** | `youtube_to_text(url)` |
67
- | `.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg` | `transcribe_audio(filepath)` |
68
 
69
  **Important rules:**
70
- - When a tool returns a local file path, you **must** store it in a variable (e.g. `filepath`) and pass that variable directly to the next tool.
71
  - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
72
- - For YouTube links, always attempt `youtube_to_text` first; it will automatically fall back to Whisper if captions are unavailable.
73
 
74
  ### 3. Data Analysis & Answer
75
  - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
76
  - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
77
- - Use `FinalAnswerTool` **only once the problem is fully solved** to give a concise final answer.
78
 
79
  ### 4. Additional instructions for the following tasks provided by GAIA team
80
  - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
@@ -92,28 +95,44 @@ final_answer("FINAL ANSWER: The capital of France is Paris")
92
  \n\n
93
  """
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  class BasicAgent:
96
  def __init__(self):
97
- self.system_prompt = SYSTEM_PROMPT
98
  self.model = InferenceClientModel(
99
  model_id = "Qwen/Qwen3-Next-80B-A3B-Thinking",
100
  temperature = 0.0,
101
  top_p = 1.0,
102
  max_tokens = 8196,
103
  )
104
- self.tools = AGENT_TOOLS
105
  self.basic_agent = CodeAgent(
106
  name = "basic_agent",
107
  description = "Basic smolagents CodeAgent",
108
  model = self.model,
109
- tools = self.tools,
110
  add_base_tools = True, # probably redundant, but it does not hurt
111
  max_steps = 5,
112
- additional_authorized_imports = [
113
- 'numpy','subprocess', 're', 'pandas',
114
- 'json', 'os', 'datetime', 'tempfile',
115
- 'requests', 'markdownify'
116
- ],
117
  verbosity_level = 1,
118
  max_print_outputs_length=1_000_000
119
  )
@@ -121,39 +140,39 @@ class BasicAgent:
121
  print("✅ Basic agent initialized")
122
 
123
  def __call__(self, question: str, file_path: Optional[str] = None) -> str:
124
-
125
  if file_path:
126
- # Inject system prompt + question and (optional) file path
127
- prompt = (
128
- f"{self.system_prompt}\n\n"
129
- f"Question: {question}\n\n"
130
- f"There is an associated file at path: {file_path}.\n"
131
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
132
- )
133
- else:
134
- prompt = (
135
- f"{self.system_prompt}\n\n"
136
- f"Question: {question}\n\n"
137
- )
138
-
139
  return self.basic_agent.run(prompt)
140
 
141
  class GeminiAgent:
142
- def __init__(self):
143
- self.system_prompt = SYSTEM_PROMPT
 
 
 
 
 
 
 
144
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
145
  if not GOOGLE_API_KEY:
146
  raise RuntimeError(
147
  "GOOGLE_API_KEY not found."
148
  )
149
  self.model = LiteLLMModel(
150
- model_id = "gemini/gemini-2.5-flash-lite",
151
  api_key = GOOGLE_API_KEY,
152
  temperature = 0.0,
153
  top_p = 1.0,
154
  max_tokens = 8196,
 
155
  )
156
- self.tools = AGENT_TOOLS
 
 
 
 
157
  self.gemini_agent = CodeAgent(
158
  name = "gemini_agent",
159
  description = "Gemini CodeAgent",
@@ -161,11 +180,7 @@ class GeminiAgent:
161
  tools = self.tools,
162
  add_base_tools = True, # probably redundant, but it does not hurt
163
  max_steps = 8,
164
- additional_authorized_imports = [
165
- 'numpy','subprocess', 're', 'pandas',
166
- 'json', 'os', 'datetime', 'tempfile',
167
- 'requests', 'markdownify',
168
- ],
169
  verbosity_level = 1,
170
  max_print_outputs_length=1_000_000
171
  )
@@ -173,19 +188,8 @@ class GeminiAgent:
173
  print("✅ Gemini agent initialized")
174
 
175
  def __call__(self, question: str, file_path: Optional[str] = None) -> str:
176
-
177
  if file_path:
178
- # Inject system prompt + question and (optional) file path
179
- prompt = (
180
- f"{self.system_prompt}\n\n"
181
- f"Question: {question}\n\n"
182
- f"There is an associated file at path: {file_path}.\n"
183
- f"Use the appropriate tool to download it (if necessary) and read it before answering"
184
- )
185
- else:
186
- prompt = (
187
- f"{self.system_prompt}\n\n"
188
- f"Question: {question}\n\n"
189
- )
190
 
191
- return self.gemini_agent.run(prompt)
 
1
+ # pip install smolagents python-chess stockfish pandas numpy requests markdownify
2
 
3
  # Generic agent
4
  import os
5
  from typing import Optional
6
  import pandas as pd
7
 
8
+ # Genai imports
9
+ from google import genai
10
+ from google.genai import types
11
+
12
  # Smolagents imports
13
  from smolagents import (
14
  CodeAgent,
15
  InferenceClientModel,
16
  TransformersModel,
17
  LiteLLMModel,
 
 
18
  DuckDuckGoSearchTool,
19
  VisitWebpageTool,
 
20
  PythonInterpreterTool,
21
  FinalAnswerTool,
22
  )
23
 
24
  # Import your custom tools (to be used in app, not in local notebook)
25
+ from tools.gemini_native_tools import analyze_video, analyze_image, analyze_audio
26
+ from tools.download_file import download_file_from_url
27
+ from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
28
+ from tools.audio_tools import youtube_to_text, transcribe_audio
29
 
30
  # Define tools
31
  AGENT_TOOLS = [
 
42
  image_to_text, # OCR for images
43
  youtube_to_text, # Youtube audio to text
44
  transcribe_audio, # Audio file to text
45
+ ]
46
+
47
# Gemini-only tools (native multimodal helpers from tools/gemini_native_tools.py)
NATIVE_TOOLS = [
    analyze_video,
    analyze_image,
    analyze_audio,
]

# Imports the CodeAgent sandbox is allowed to execute.
# NOTE(review): 'os' and 'subprocess' are not in this list — presumably an
# intentional sandboxing choice; confirm before re-adding them.
AUTHORIZED_IMPORTS = [
    'numpy', 're', 'pandas', 'json', 'datetime',
    'tempfile', 'requests', 'markdownify', 'chess.*',
]
59
+
60
+ # --- SYSTEM PROMPT TEMPLATE ---
61
+ # The {} placeholder will be filled differently for Basic vs Gemini (Native)
62
+ SYSTEM_PROMPT_TEMPLATE = """
63
  You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.
64
 
65
  ### 1. Reason-Act-Observe
 
69
  - **OBSERVE:** Examine outputs or errors before proceeding.
70
 
71
  ### 2. File Handling
72
+ {file_handling_instructions}
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  **Important rules:**
75
+ - Whenever you are given a file path (or URL), you **must ABSOLUTELY store it in a variable first** (e.g. `filepath`) and pass that variable directly to the next tool. **NEVER** try to write the path yourself in the function.
76
  - You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).
 
77
 
78
  ### 3. Data Analysis & Answer
79
  - Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
80
  - Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.
 
81
 
82
  ### 4. Additional instructions for the following tasks provided by GAIA team
83
  - You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
 
95
  \n\n
96
  """
97
 
98
# Instruction for Tool-Based Agents (BasicAgent and Gemini-Standard).
# Substituted into SYSTEM_PROMPT_TEMPLATE's {file_handling_instructions} slot.
TOOL_BASED_INSTRUCTIONS = """
You must select the reading or transcription method **strictly** based on the file type:
| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a` | `transcribe_audio(filepath)` |
"""

# Instruction for Native Gemini (no OCR/transcription tools for media).
# Alternative filler for the same {file_handling_instructions} slot, used
# when GeminiAgent runs with native_multimodal=True.
NATIVE_MEDIA_INSTRUCTIONS = """
You have **native vision and audio capabilities**.
- For **Images (.png, .jpg) and Audio/Video**: Do NOT use external tools like `image_to_text`. You can see and hear these files directly. Analyze them using your internal multimodal capabilities.
- For **Data/Text files**: Continue using tools like `pd.read_csv(filepath)` or `text_file_to_string(filepath)`.
"""
118
+
class BasicAgent:
    """smolagents CodeAgent backed by a hosted Qwen model.

    Uses the tool-based file-handling prompt (TOOL_BASED_INSTRUCTIONS):
    every file type is routed through an explicit reading tool.
    """

    def __init__(self):
        # Fill the shared prompt template with the tool-based file rules.
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            file_handling_instructions=TOOL_BASED_INSTRUCTIONS
        )
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,
        )
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=AGENT_TOOLS,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Basic agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Build the prompt (optionally noting an attached file) and run the agent."""
        parts = [f"{self.system_prompt}\n\nQuestion: {question}"]
        if file_path:
            parts.append(f"\nFile path: {file_path}")
        return self.basic_agent.run("".join(parts))
147
 
148
class GeminiAgent:
    """Gemini-backed smolagents CodeAgent.

    With ``native_multimodal=True`` (default) the agent uses Gemini's native
    media tools (NATIVE_TOOLS) and drops the local OCR / transcription tools;
    otherwise it uses the same toolset and file-handling rules as BasicAgent.
    """

    def __init__(self, native_multimodal: bool = True, model_id: str = "gemini/gemini-2.5-flash-lite"):
        self.native_multimodal = native_multimodal

        # Fail fast: validate the key before constructing anything that
        # depends on it.  (Fix: the previous revision created an unused local
        # genai.Client *before* this check, silently passing api_key=None.)
        GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
        if not GOOGLE_API_KEY:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )

        # Switch prompt based on the native_multimodal flag
        INSTRUCTIONS = NATIVE_MEDIA_INSTRUCTIONS if native_multimodal else TOOL_BASED_INSTRUCTIONS
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(file_handling_instructions=INSTRUCTIONS)

        self.model = LiteLLMModel(
            model_id=model_id,
            api_key=GOOGLE_API_KEY,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,
            timeout=120,  # prevent hanging requests
        )

        # If native, remove the local media tools so the agent is not tempted
        # to OCR/transcribe files Gemini can consume directly.
        if self.native_multimodal:
            self.tools = NATIVE_TOOLS + [
                t for t in AGENT_TOOLS
                if t not in [image_to_text, youtube_to_text, transcribe_audio]
            ]
        else:
            self.tools = AGENT_TOOLS

        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=8,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Gemini agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Build the prompt (optionally noting an attached file) and run the agent."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}"
        if file_path:
            prompt += f"\n\nThere is a file at: {file_path}. Use your tools to process it."
        return self.gemini_agent.run(prompt)
requirements.txt CHANGED
@@ -26,5 +26,12 @@ youtube-transcript-api==1.2.3
26
  pytubefix==10.3.6
27
  openai-whisper==20250625
28
 
 
 
 
 
 
 
 
29
  # OCR (OPTIONAL, disabled)
30
  # pytesseract==0.3.13
 
26
  pytubefix==10.3.6
27
  openai-whisper==20250625
28
 
29
+ # Chess
30
+ chess==1.11.2
31
+ stockfish==4.0.5
32
+
33
+ # Google genai
34
+ google-genai==1.57.0
35
+
36
  # OCR (OPTIONAL, disabled)
37
  # pytesseract==0.3.13
tools/gemini_native_tools.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Fix: this module used os.environ.get without importing os, which raised
# NameError the moment the module was imported.
import os

from smolagents import tool
from google import genai
from google.genai import types

# Initialize client once (module-level, shared by every tool below).
# NOTE(review): genai.Client accepts api_key=None here; if GOOGLE_API_KEY is
# unset, failures surface later on the first API call rather than at import.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
7
+
@tool
def analyze_video(video_source: str, question: str) -> str:
    """
    Analyzes a video (YouTube URL or local file path) to answer a specific question.
    Args:
        video_source: The YouTube URL or the local path to the video file.
        question: The question you want to ask about the video content.
    """
    # YouTube links can be referenced by URI directly; anything else is
    # treated as a local file and pushed through the File API (kept ~48h).
    if any(host in video_source for host in ("youtube.com", "youtu.be")):
        file_uri = video_source
    else:
        file_uri = client.files.upload(file=video_source).uri
    video_part = types.Part(file_data=types.FileData(file_uri=file_uri))

    # Ask Gemini about the video content.
    result = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[video_part, question],
    )
    return result.text
30
+
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Uses native vision to analyze an image file and answer questions about it.
    Args:
        image_path: Path to the image file (jpg, png, webp).
        question: What you want to know about the image.
    """
    # Upload via the File API, then pass the handle straight to the model.
    handle = client.files.upload(file=image_path)
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[handle, question],
    )
    return reply.text
45
+
@tool
def analyze_audio(audio_path: str, question: str) -> str:
    """
    Analyzes audio files (mp3, wav) to transcribe or answer questions about content and tone.
    Args:
        audio_path: Path to the audio file.
        question: The question or instruction (e.g., 'Summarize the mood' or 'Transcribe this').
    """
    # Same pattern as analyze_image: upload, then query the model directly.
    handle = client.files.upload(file=audio_path)
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[handle, question],
    )
    return reply.text
60
+
61
+
62
+ # approach inspired by: https://huggingface.co/spaces/DeekshithN05/Final_Assignment_Template/blob/main/agent.py