David committed on
Commit
7da5655
·
1 Parent(s): 3f771a9

Included tools to understand audio, images and video. A sleep between questions is included to avoid exceeding the free-tier RPM limit

Browse files
Files changed (5) hide show
  1. agent.py +61 -34
  2. app.py +6 -0
  3. gaia_system_prompt.py +11 -14
  4. requirements.txt +1 -4
  5. tools.py +47 -57
agent.py CHANGED
@@ -1,7 +1,5 @@
1
  from llama_index.llms.google_genai import GoogleGenAI
2
  from llama_index.llms.gemini import Gemini
3
- from llama_index.llms.groq import Groq
4
- from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
5
  from llama_index.tools.arxiv import ArxivToolSpec
6
  from llama_index.tools.wikipedia import WikipediaToolSpec
7
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
@@ -15,17 +13,16 @@ from llama_index.core.agent.workflow import (
15
  from gradio import ChatMessage
16
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
17
 
18
- from tools import interpret_python_math_code
19
- from gaia_system_prompt import SYSTEM_PROMPT as GAIA_SYSTEM_PROMPT
20
 
21
  import os
22
  import asyncio
23
 
24
  TIMEOUT=180 # Timeout for agent execution in seconds
25
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
26
- GROQ_API_KEY = os.getenv("GROQ_TOKEN")
27
  GEMINI_OPENAI_API_DIR = "https://generativelanguage.googleapis.com/v1beta/openai/"
28
- GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"
29
  LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
30
  API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
31
 
@@ -33,10 +30,8 @@ class FinalAgent:
33
  def __init__(self):
34
  # LLM Initialization
35
  # self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
36
- # self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
37
- # self.llm = Groq(model="meta-llama/llama-4-maverick-17b-128e-instruct", api_key=GROQ_API_KEY)
38
  # self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
39
- self.llm = HuggingFaceInferenceAPI(model_name="meta-llama/Llama-3.3-70B-Instruct", timeout=TIMEOUT)
40
 
41
  # Tool Initialization
42
  self.tools = [
@@ -44,6 +39,31 @@ class FinalAgent:
44
  fn=interpret_python_math_code,
45
  name="InterpretPythonMathCode",
46
  description="Interprets Python code for mathematical expressions."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  )
48
  ]
49
  self.tools.extend(
@@ -56,22 +76,27 @@ class FinalAgent:
56
  DuckDuckGoSearchToolSpec().to_tool_list()
57
  )
58
 
59
- # Agent Workflow Initialization
60
- # self.agent = AgentWorkflow.from_tools_or_functions(
61
- # tools_or_functions=self.tools,
62
- # llm=self.llm,
63
- # system_prompt=GAIA_SYSTEM_PROMPT,
64
- # timeout=TIMEOUT
65
- # )
66
 
67
- self.agent = ReActAgent(
 
 
68
  llm=self.llm,
69
- verbose=True,
70
- max_iterations=5,
71
- system_prompt=GAIA_SYSTEM_PROMPT,
72
- tools=self.tools
73
  )
74
 
 
 
 
 
 
 
 
 
75
  print("FinalAgent initialized.")
76
  # async def __call__(self, question: str) -> str:
77
  # # Example
@@ -102,6 +127,7 @@ class FinalAgent:
102
  try:
103
  # Use arun for an async method.
104
  agent_chat_response = await self.agent.run(question)
 
105
 
106
  potential_response_obj = agent_chat_response.response
107
 
@@ -133,22 +159,23 @@ class FinalAgent:
133
  # Depending on requirements, you might want to return an error message or re-raise
134
  response_str = f"Agent error: {e}"
135
 
136
- # Get the agent's final response string from FINAL ANSWER:
137
- if "FINAL ANSWER: " in response_str:
138
- response_str = response_str.split("FINAL ANSWER: ")[-1].strip()
 
 
139
  else:
140
- print("Warning: 'FINAL ANSWER:' not found in response string. Returning full response.")
141
 
142
- print(f"Agent final response: {response_str}")
143
  return response_str
144
 
145
 
146
- async def main():
147
- # Example usage
148
- agent = FinalAgent()
149
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
150
- answer = await agent(question)
151
- print(f"Final answer: {answer}")
152
 
153
- if __name__ == "__main__":
154
- asyncio.run(main())
 
1
  from llama_index.llms.google_genai import GoogleGenAI
2
  from llama_index.llms.gemini import Gemini
 
 
3
  from llama_index.tools.arxiv import ArxivToolSpec
4
  from llama_index.tools.wikipedia import WikipediaToolSpec
5
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
 
13
  from gradio import ChatMessage
14
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
15
 
16
+ from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
17
+ from gaia_system_prompt import GAIA_SYSTEM_PROMPT, CUSTOM_SYSTEM_PROMPT
18
 
19
  import os
20
  import asyncio
21
 
22
  TIMEOUT=180 # Timeout for agent execution in seconds
23
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
 
24
  GEMINI_OPENAI_API_DIR = "https://generativelanguage.googleapis.com/v1beta/openai/"
25
+ GEMINI_MODEL_NAME = "gemini-2.0-flash"
26
  LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
27
  API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
28
 
 
30
  def __init__(self):
31
  # LLM Initialization
32
  # self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
33
+ self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
 
34
  # self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
 
35
 
36
  # Tool Initialization
37
  self.tools = [
 
39
  fn=interpret_python_math_code,
40
  name="InterpretPythonMathCode",
41
  description="Interprets Python code for mathematical expressions."
42
+ ),
43
+ FunctionTool.from_defaults(
44
+ fn=image_understanding,
45
+ name="ImageUnderstanding",
46
+ description="Analyzes an image and generates a response to a given question based on the image's content."
47
+ ),
48
+ FunctionTool.from_defaults(
49
+ fn=convert_audio_to_text,
50
+ name="ConvertAudioToText",
51
+ description="Converts audio files to text using a speech-to-text model."
52
+ ),
53
+ FunctionTool.from_defaults(
54
+ fn=video_understanding,
55
+ name="VideoUnderstanding",
56
+ description="Analyzes a video and generates a response to a given question based on the video's content."
57
+ ),
58
+ FunctionTool.from_defaults(
59
+ fn=read_csv_file,
60
+ name="ReadCSVFile",
61
+ description="Reads a CSV file and returns its content as a string."
62
+ ),
63
+ FunctionTool.from_defaults(
64
+ fn=read_xlsx_file,
65
+ name="ReadXLSXFile",
66
+ description="Reads an XLSX file and returns its content as a string."
67
  )
68
  ]
69
  self.tools.extend(
 
76
  DuckDuckGoSearchToolSpec().to_tool_list()
77
  )
78
 
79
+ # Print the tools for debugging
80
+ print("Tools initialized:")
81
+ for tool in self.tools:
82
+ print(f"- {tool._metadata}")
 
 
 
83
 
84
+ # Agent Workflow Initialization
85
+ self.agent = AgentWorkflow.from_tools_or_functions(
86
+ tools_or_functions=self.tools,
87
  llm=self.llm,
88
+ system_prompt=CUSTOM_SYSTEM_PROMPT,
89
+ timeout=TIMEOUT
 
 
90
  )
91
 
92
+ # self.agent = ReActAgent(
93
+ # llm=self.llm,
94
+ # verbose=True,
95
+ # max_iterations=5,
96
+ # system_prompt=CUSTOM_SYSTEM_PROMPT,
97
+ # tools=self.tools
98
+ # )
99
+
100
  print("FinalAgent initialized.")
101
  # async def __call__(self, question: str) -> str:
102
  # # Example
 
127
  try:
128
  # Use arun for an async method.
129
  agent_chat_response = await self.agent.run(question)
130
+ print(agent_chat_response)
131
 
132
  potential_response_obj = agent_chat_response.response
133
 
 
159
  # Depending on requirements, you might want to return an error message or re-raise
160
  response_str = f"Agent error: {e}"
161
 
162
+ # Get the agent's final response between <final_answer> and </final_answer> tags
163
+ if "<final_answer>" in response_str and "</final_answer>" in response_str:
164
+ start_index = response_str.index("<final_answer>") + len("<final_answer>")
165
+ end_index = response_str.index("</final_answer>")
166
+ response_str = response_str[start_index:end_index].strip()
167
  else:
168
+ print("Warning: No <final_answer> tags found in the response.")
169
 
 
170
  return response_str
171
 
172
 
173
+ # async def main():
174
+ # # Example usage
175
+ # agent = FinalAgent()
176
+ # question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
177
+ # answer = await agent(question)
178
+ # print(f"Final answer: {answer}")
179
 
180
+ # if __name__ == "__main__":
181
+ # asyncio.run(main())
app.py CHANGED
@@ -6,6 +6,9 @@ import pandas as pd
6
 
7
  from agent import FinalAgent
8
  import asyncio
 
 
 
9
 
10
  # (Keep Constants as is)
11
  # --- Constants ---
@@ -85,9 +88,12 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
85
  print(f"Skipping item with missing task_id or question: {item}")
86
  continue
87
  try:
 
88
  submitted_answer = await agent(question_text)
89
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
91
  except Exception as e:
92
  print(f"Error running agent on task {task_id}: {e}")
93
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
6
 
7
  from agent import FinalAgent
8
  import asyncio
9
+ import time
10
+
11
+ SLEEP_TIME_BETWEEN_QUESTIONS = 30 # Sleep time between questions to avoid rate limiting
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
 
88
  print(f"Skipping item with missing task_id or question: {item}")
89
  continue
90
  try:
91
+ # Run the agent on the question
92
  submitted_answer = await agent(question_text)
93
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
95
+
96
+ time.sleep(SLEEP_TIME_BETWEEN_QUESTIONS) # Sleep for SLEEP_TIME_BETWEEN_QUESTIONS seconds to avoid Gemini free-tier rate limiting issues
97
  except Exception as e:
98
  print(f"Error running agent on task {task_id}: {e}")
99
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
gaia_system_prompt.py CHANGED
@@ -5,19 +5,16 @@ If you are asked for a number, don't use comma to write your number neither use
5
  If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
6
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
7
 
8
- SYSTEM_PROMPT = """
9
- You are a general AI assistant. Answer my question directly, following these strict rules. Your entire output must be *only* the template below.
 
 
 
 
 
10
 
11
- **Rules:**
12
- * No thoughts, explanations, or extra text.
13
- * The *only* output is: FINAL ANSWER: [YOUR SHORT ANSWER]
14
- * [YOUR SHORT ANSWER] is a number, string, or comma-separated list.
15
- * Numbers: No commas, no units (unless specified).
16
- * Strings: No articles, no abbreviations, digits as words (unless specified).
17
- * Lists: Apply number/string rules to items.
18
-
19
- **Example:**
20
- User: What is the capital of France?
21
- Assistant:
22
- FINAL ANSWER: Paris
23
  """
 
5
  If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
6
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
7
 
8
+ CUSTOM_SYSTEM_PROMPT = """
9
+ You are a general AI assistant. I will ask you a question and you should use your tools to answer as better as you can. You must be concise and precise in your answers.
10
+ I provide you some guidelines to follow:
11
+ 1. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
12
+ 2. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
13
+ 3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
14
+ 4. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
15
 
16
+ The final answer should be written in the following format:
17
+ <final_answer>
18
+ YOUR FINAL ANSWER
19
+ </final_answer>
 
 
 
 
 
 
 
 
20
  """
requirements.txt CHANGED
@@ -3,11 +3,8 @@ requests
3
  numpy
4
  pandas
5
  scipy
6
- groq
7
  llama-index
8
- llama-index-llms-huggingface
9
- llama-index-llms-huggingface-api
10
- llama-index-llms-groq
11
  llama-index-utils-workflow
12
  llama-index-llms-lmstudio
13
  llama-index-llms-gemini
 
3
  numpy
4
  pandas
5
  scipy
6
+ google-genai
7
  llama-index
 
 
 
8
  llama-index-utils-workflow
9
  llama-index-llms-lmstudio
10
  llama-index-llms-gemini
tools.py CHANGED
@@ -5,14 +5,16 @@ import sys
5
  import numpy as np
6
  import pandas as pd
7
  import scipy
8
- from groq import Groq
9
 
10
  from pathlib import Path
11
- import pandas as pd
12
  import mimetypes
13
  import base64
14
 
 
 
15
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
 
 
16
 
17
  def interpret_python_math_code(python_code: str) -> str:
18
  """
@@ -119,7 +121,7 @@ def interpret_python_math_code(python_code: str) -> str:
119
  sys.stdout = old_stdout
120
 
121
 
122
- ## STT tool
123
  def convert_audio_to_text(path_to_audio: str) -> str:
124
  """
125
  Converts speech from an audio file into text.
@@ -129,33 +131,19 @@ def convert_audio_to_text(path_to_audio: str) -> str:
129
  str: The transcribed text content of the audio file.
130
  """
131
 
132
- # Validate audio file
133
- if not isinstance(path_to_audio, str):
134
- raise TypeError(
135
- "Parameter 'path_to_audio' must be a string containing the file path."
136
- )
137
- path = Path(path_to_audio).expanduser().resolve()
138
- if not path.is_file():
139
- raise FileNotFoundError(f"No such audio file: {path}")
140
-
141
- # Initialize the Groq client
142
- client = Groq()
143
 
144
- # Open the audio file
145
- with open(path_to_audio, "rb") as audio_file:
146
- # Create a transcription of the audio file
147
- transcription = client.audio.transcriptions.create(
148
- file=audio_file,
149
- model="whisper-large-v3-turbo",
150
- response_format="text", # Returns plain text instead of JSON
151
- language="en",
152
- temperature=0.1
153
- )
154
-
155
- return transcription
156
-
157
- ## Analyze image tool
158
- def analyze_image(path_to_image: str, question: str) -> str:
159
  """
160
  Analyzes an image and generates a response to a given question based on the image's content.
161
 
@@ -167,39 +155,41 @@ def analyze_image(path_to_image: str, question: str) -> str:
167
  str: The response from a VLM, typically a textual analysis or description based on the image.
168
  """
169
 
170
- def encode_image(image_path):
171
- with open(image_path, "rb") as image_file:
172
- return base64.b64encode(image_file.read()).decode('utf-8')
173
-
174
- # Get the MIME type (e.g., image/png, image/jpeg)
175
- mime_type, _ = mimetypes.guess_type(path_to_image)
176
- if mime_type is None:
177
- raise ValueError("Unsupported file type. Please provide a valid image.")
178
 
179
- base64_image = encode_image(path_to_image)
180
 
181
- # Initialize the Groq client
182
- client = GroqClient()
 
 
183
 
184
- chat_completion = client.chat.completions.create(
185
- messages=[
186
- {
187
- "role": "user",
188
- "content": [
189
- {"type": "text", "text": question},
190
- {
191
- "type": "image_url",
192
- "image_url": {
193
- "url": f"data:{mime_type};base64,{base64_image}",
194
- },
195
- },
196
- ],
197
- }
198
- ],
199
- model="meta-llama/llama-4-scout-17b-16e-instruct",
200
  )
 
 
201
 
202
- return chat_completion.choices[0].message.content
203
 
204
  ## Read .csv file tool
205
  def read_csv_file(path_to_csv: str) -> str:
 
5
  import numpy as np
6
  import pandas as pd
7
  import scipy
 
8
 
9
  from pathlib import Path
 
10
  import mimetypes
11
  import base64
12
 
13
+ from google import genai
14
+
15
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
16
+ GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
17
+ GEMINI_MODEL_NAME = "gemini-2.0-flash"
18
 
19
  def interpret_python_math_code(python_code: str) -> str:
20
  """
 
121
  sys.stdout = old_stdout
122
 
123
 
124
+ # STT tool
125
def convert_audio_to_text(path_to_audio: str) -> str:
    """
    Converts speech from an audio file into text.

    Args:
        path_to_audio (str): Path to the audio file to transcribe.

    Returns:
        str: The transcribed text content of the audio file.
    """
    # Bug fix: the client was previously constructed with the literal string
    # "GOOGLE_API_KEY" as the key instead of the actual key read from the
    # GEMINI_TOKEN environment variable (GEMINI_API_KEY, defined above) —
    # every call would fail authentication. Use the module constant, matching
    # image_understanding / video_understanding.
    client = genai.Client(api_key=GEMINI_API_KEY)

    # Upload the audio so Gemini can reference it in the request.
    myfile = client.files.upload(file=path_to_audio)

    transcription = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=["Provide a transcription of this audio file.", myfile],
    )

    return transcription.text
144
+
145
+ # Analyze image tool
146
+ def image_understanding(path_to_image: str, question: str) -> str:
 
 
 
 
 
 
 
 
 
 
147
  """
148
  Analyzes an image and generates a response to a given question based on the image's content.
149
 
 
155
  str: The response from a VLM, typically a textual analysis or description based on the image.
156
  """
157
 
158
+ client = genai.Client(api_key=GEMINI_API_KEY)
159
+
160
+ my_file = client.files.upload(file=path_to_image)
161
+
162
+ response = client.models.generate_content(
163
+ model=GEMINI_MODEL_NAME,
164
+ contents=[my_file, question],
165
+ )
166
 
167
+ return response.text
168
 
169
+ # Analyze video tool
170
def video_understanding(path_to_video: str, question: str) -> str:
    """
    Analyzes a video and generates a response to a given question based on the video's content.

    Args:
        path_to_video (str): The path to the video file to be analyzed.
        question (str): The question to be answered, based on the contents of the video.

    Returns:
        str: The response from a VLM, typically a textual analysis or description based on the video.

    Raises:
        RuntimeError: If the Files API fails to process the uploaded video.
    """
    import time  # local import: only needed for polling the Files API

    client = genai.Client(api_key=GEMINI_API_KEY)

    my_file = client.files.upload(file=path_to_video)

    # Videos are processed asynchronously by the Gemini Files API;
    # generate_content fails while the file is still in the PROCESSING
    # state, so poll until it becomes ACTIVE (or FAILED) before querying.
    while my_file.state.name == "PROCESSING":
        time.sleep(2)
        my_file = client.files.get(name=my_file.name)
    if my_file.state.name == "FAILED":
        raise RuntimeError(f"Video file processing failed: {path_to_video}")

    response = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=[my_file, question],
    )

    return response.text
192
 
 
193
 
194
  ## Read .csv file tool
195
  def read_csv_file(path_to_csv: str) -> str: