David committed on
Commit
b0c6c93
·
1 Parent(s): 7da5655

Agent passed.

Browse files
Files changed (4) hide show
  1. agent.py +21 -63
  2. app.py +2 -1
  3. gaia_system_prompt.py +11 -0
  4. tools.py +26 -14
agent.py CHANGED
@@ -1,69 +1,59 @@
1
- from llama_index.llms.google_genai import GoogleGenAI
2
  from llama_index.llms.gemini import Gemini
3
  from llama_index.tools.arxiv import ArxivToolSpec
4
  from llama_index.tools.wikipedia import WikipediaToolSpec
5
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
6
  from llama_index.core.tools import FunctionTool
7
- from llama_index.core.agent.workflow import AgentWorkflow, ReActAgent
8
- from llama_index.llms.lmstudio import LMStudio
9
- from llama_index.core.agent.workflow import (
10
- AgentStream,
11
- AgentOutput
12
- )
13
  from gradio import ChatMessage
14
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
15
 
16
  from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
17
- from gaia_system_prompt import GAIA_SYSTEM_PROMPT, CUSTOM_SYSTEM_PROMPT
18
 
19
  import os
20
  import asyncio
21
 
22
  TIMEOUT=180 # Timeout for agent execution in seconds
23
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
24
- GEMINI_OPENAI_API_DIR = "https://generativelanguage.googleapis.com/v1beta/openai/"
25
- GEMINI_MODEL_NAME = "gemini-2.0-flash"
26
- LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
27
- API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
28
 
29
  class FinalAgent:
30
  def __init__(self):
31
  # LLM Initialization
32
- # self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
33
  self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
34
- # self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
35
 
36
  # Tool Initialization
37
  self.tools = [
38
  FunctionTool.from_defaults(
39
  fn=interpret_python_math_code,
40
  name="InterpretPythonMathCode",
41
- description="Interprets Python code for mathematical expressions."
42
  ),
43
  FunctionTool.from_defaults(
44
  fn=image_understanding,
45
  name="ImageUnderstanding",
46
- description="Analyzes an image and generates a response to a given question based on the image's content."
47
  ),
48
  FunctionTool.from_defaults(
49
  fn=convert_audio_to_text,
50
  name="ConvertAudioToText",
51
- description="Converts audio files to text using a speech-to-text model."
52
  ),
53
  FunctionTool.from_defaults(
54
  fn=video_understanding,
55
  name="VideoUnderstanding",
56
- description="Analyzes a video and generates a response to a given question based on the video's content."
57
  ),
58
  FunctionTool.from_defaults(
59
  fn=read_csv_file,
60
  name="ReadCSVFile",
61
- description="Reads a CSV file and returns its content as a string."
62
  ),
63
  FunctionTool.from_defaults(
64
  fn=read_xlsx_file,
65
  name="ReadXLSXFile",
66
- description="Reads an XLSX file and returns its content as a string."
67
  )
68
  ]
69
  self.tools.extend(
@@ -75,11 +65,7 @@ class FinalAgent:
75
  self.tools.extend(
76
  DuckDuckGoSearchToolSpec().to_tool_list()
77
  )
78
-
79
- # Print the tools for debugging
80
- print("Tools initialized:")
81
- for tool in self.tools:
82
- print(f"- {tool._metadata}")
83
 
84
  # Agent Workflow Initialization
85
  self.agent = AgentWorkflow.from_tools_or_functions(
@@ -89,37 +75,8 @@ class FinalAgent:
89
  timeout=TIMEOUT
90
  )
91
 
92
- # self.agent = ReActAgent(
93
- # llm=self.llm,
94
- # verbose=True,
95
- # max_iterations=5,
96
- # system_prompt=CUSTOM_SYSTEM_PROMPT,
97
- # tools=self.tools
98
- # )
99
-
100
  print("FinalAgent initialized.")
101
- # async def __call__(self, question: str) -> str:
102
- # # Example
103
- # print(f"Agent received question: {question}")
104
- # # fixed_answer = "This is a default answer."
105
- # # print(f"Agent returning fixed answer: {fixed_answer}")
106
- # # response = fixed_answer
107
-
108
- # # Implement agent logic here
109
- # response = ""
110
- # # Run the agent with the question
111
- # stream = await self.agent.run(question)
112
- # response = stream.response.content
113
- # # async for event in stream.stream_events():
114
- # # if isinstance(event, AgentStream):
115
- # # # Check if delta is empty
116
- # # if event.raw["choices"][0]["delta"] != {}:
117
- # # response += event.raw["choices"][0]["delta"]["content"]
118
-
119
- # print(f"Agent response: {response}")
120
-
121
- # return response
122
-
123
  async def __call__(self, question: str) -> str:
124
  print(f"Agent received question: {question}")
125
 
@@ -170,12 +127,13 @@ class FinalAgent:
170
  return response_str
171
 
172
 
173
- # async def main():
174
- # # Example usage
175
- # agent = FinalAgent()
176
- # question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
177
- # answer = await agent(question)
178
- # print(f"Final answer: {answer}")
 
179
 
180
- # if __name__ == "__main__":
181
- # asyncio.run(main())
 
 
1
  from llama_index.llms.gemini import Gemini
2
  from llama_index.tools.arxiv import ArxivToolSpec
3
  from llama_index.tools.wikipedia import WikipediaToolSpec
4
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
5
  from llama_index.core.tools import FunctionTool
6
+ from llama_index.core.agent.workflow import AgentWorkflow
 
 
 
 
 
7
  from gradio import ChatMessage
8
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
9
 
10
  from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
11
+ from gaia_system_prompt import CUSTOM_SYSTEM_PROMPT
12
 
13
  import os
14
  import asyncio
15
 
16
  TIMEOUT=180 # Timeout for agent execution in seconds
17
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
18
+ GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"
19
+ # GEMINI_MODEL_NAME = "gemini-2.0-flash"
 
 
20
 
21
  class FinalAgent:
22
  def __init__(self):
23
  # LLM Initialization
 
24
  self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
 
25
 
26
  # Tool Initialization
27
  self.tools = [
28
  FunctionTool.from_defaults(
29
  fn=interpret_python_math_code,
30
  name="InterpretPythonMathCode",
31
+ description=interpret_python_math_code.__doc__
32
  ),
33
  FunctionTool.from_defaults(
34
  fn=image_understanding,
35
  name="ImageUnderstanding",
36
+ description=image_understanding.__doc__
37
  ),
38
  FunctionTool.from_defaults(
39
  fn=convert_audio_to_text,
40
  name="ConvertAudioToText",
41
+ description= convert_audio_to_text.__doc__
42
  ),
43
  FunctionTool.from_defaults(
44
  fn=video_understanding,
45
  name="VideoUnderstanding",
46
+ description= video_understanding.__doc__
47
  ),
48
  FunctionTool.from_defaults(
49
  fn=read_csv_file,
50
  name="ReadCSVFile",
51
+ description=read_csv_file.__doc__
52
  ),
53
  FunctionTool.from_defaults(
54
  fn=read_xlsx_file,
55
  name="ReadXLSXFile",
56
+ description= read_xlsx_file.__doc__
57
  )
58
  ]
59
  self.tools.extend(
 
65
  self.tools.extend(
66
  DuckDuckGoSearchToolSpec().to_tool_list()
67
  )
68
+
 
 
 
 
69
 
70
  # Agent Workflow Initialization
71
  self.agent = AgentWorkflow.from_tools_or_functions(
 
75
  timeout=TIMEOUT
76
  )
77
 
 
 
 
 
 
 
 
 
78
  print("FinalAgent initialized.")
79
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  async def __call__(self, question: str) -> str:
81
  print(f"Agent received question: {question}")
82
 
 
127
  return response_str
128
 
129
 
130
+ async def main():
131
+ # Example usage
132
+ agent = FinalAgent()
133
+ question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
134
+ question2 = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
135
+ answer = await agent(question)
136
+ print(f"Final answer: {answer}")
137
 
138
+ if __name__ == "__main__":
139
+ asyncio.run(main())
app.py CHANGED
@@ -8,7 +8,7 @@ from agent import FinalAgent
8
  import asyncio
9
  import time
10
 
11
- SLEEP_TIME_BETWEEN_QUESTIONS = 30 # Sleep time between questions to avoid rate limiting
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
@@ -89,6 +89,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
89
  continue
90
  try:
91
  # Run the agent on the question
 
92
  submitted_answer = await agent(question_text)
93
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
8
  import asyncio
9
  import time
10
 
11
+ SLEEP_TIME_BETWEEN_QUESTIONS = 60 # Sleep time between questions to avoid rate limiting
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
 
89
  continue
90
  try:
91
  # Run the agent on the question
92
+ print(item)
93
  submitted_answer = await agent(question_text)
94
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
95
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
gaia_system_prompt.py CHANGED
@@ -13,6 +13,17 @@ I provide you some guidelines to follow:
13
  3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
14
  4. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  The final answer should be written in the following format:
17
  <final_answer>
18
  YOUR FINAL ANSWER
 
13
  3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
14
  4. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
15
 
16
+ To answer the questions, you should use the following tools:
17
+ - DuckDuckGoSearchTool: Use this tool to search the web for information.
18
+ - ArxivTool: Use this tool to search for academic papers on arXiv.
19
+ - WikipediaTool: Use this tool to search for information on Wikipedia.
20
+ - InterpretPythonCodeTool: Use this tool to execute Python code to perform math calculations and return the result.
21
+ - ImageUnderstandingTool: Use this tool to analyze images and extract information.
22
+ - ConvertAudioToTextTool: Use this tool to convert audio files to text.
23
+ - VideoUnderstandingTool: Use this tool to analyze videos and extract information.
24
+ - ReadCSVFileTool: Use this tool to read CSV files and extract information.
25
+ - ReadXLSXFileTool: Use this tool to read XLSX files and extract information.
26
+
27
  The final answer should be written in the following format:
28
  <final_answer>
29
  YOUR FINAL ANSWER
tools.py CHANGED
@@ -11,6 +11,7 @@ import mimetypes
11
  import base64
12
 
13
  from google import genai
 
14
 
15
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
16
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
@@ -126,12 +127,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
126
  """
127
  Converts speech from an audio file into text.
128
  Args:
129
- path_to_audio (str): The path to the audio file to be transcribed.
130
  Returns:
131
  str: The transcribed text content of the audio file.
132
  """
133
 
134
- client = genai.Client(api_key="GOOGLE_API_KEY")
135
 
136
  myfile = client.files.upload(file=path_to_audio)
137
 
@@ -143,12 +144,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
143
  return transcription.text
144
 
145
  # Analyze image tool
146
- def image_understanding(path_to_image: str, question: str) -> str:
147
  """
148
- Analyzes an image and generates a response to a given question based on the image's content.
149
 
150
  Args:
151
- path_to_image (str): The path to the image file to be analyzed.
152
  question (str): The question to be answered, based on the contents of the image.
153
 
154
  Returns:
@@ -157,22 +158,23 @@ def image_understanding(path_to_image: str, question: str) -> str:
157
 
158
  client = genai.Client(api_key=GEMINI_API_KEY)
159
 
160
- my_file = client.files.upload(file=path_to_image)
 
161
 
162
  response = client.models.generate_content(
163
  model=GEMINI_MODEL_NAME,
164
- contents=[my_file, question],
165
  )
166
 
167
  return response.text
168
 
169
  # Analyze video tool
170
- def video_understanding(path_to_video: str, question: str) -> str:
171
  """
172
  Analyzes a video and generates a response to a given question based on the video's content.
173
 
174
  Args:
175
- path_to_video (str): The path to the video file to be analyzed.
176
  question (str): The question to be answered, based on the contents of the video.
177
 
178
  Returns:
@@ -181,16 +183,20 @@ def video_understanding(path_to_video: str, question: str) -> str:
181
 
182
  client = genai.Client(api_key=GEMINI_API_KEY)
183
 
184
- my_file = client.files.upload(file=path_to_video)
185
-
186
  response = client.models.generate_content(
187
  model=GEMINI_MODEL_NAME,
188
- contents=[my_file, question],
 
 
 
 
 
 
 
189
  )
190
 
191
  return response.text
192
 
193
-
194
  ## Read .csv file tool
195
  def read_csv_file(path_to_csv: str) -> str:
196
  """
@@ -229,4 +235,10 @@ def read_xlsx_file(path_to_xlsx: str) -> str:
229
  # Return df as plain text
230
  return df.to_string(index=False)
231
  except Exception as e:
232
- return f"Error reading the XLSX file: {e}"
 
 
 
 
 
 
 
11
  import base64
12
 
13
  from google import genai
14
+ import requests
15
 
16
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
17
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
 
127
  """
128
  Converts speech from an audio file into text.
129
  Args:
130
+ path_to_audio (str): The path to the audio file to be transcribed. A URL can also be used.
131
  Returns:
132
  str: The transcribed text content of the audio file.
133
  """
134
 
135
+ client = genai.Client(api_key=GEMINI_API_KEY)
136
 
137
  myfile = client.files.upload(file=path_to_audio)
138
 
 
144
  return transcription.text
145
 
146
  # Analyze image tool
147
+ def image_understanding(url_to_image: str, question: str) -> str:
148
  """
149
+ Analyzes an image and generates a response to a given question based on the image's content. A URL needs to be used.
150
 
151
  Args:
152
+ url_to_image (str): The URL to the image file to be analyzed.
153
  question (str): The question to be answered, based on the contents of the image.
154
 
155
  Returns:
 
158
 
159
  client = genai.Client(api_key=GEMINI_API_KEY)
160
 
161
+ image_bytes = requests.get(url_to_image).content
162
+ image = genai.types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
163
 
164
  response = client.models.generate_content(
165
  model=GEMINI_MODEL_NAME,
166
+ contents=[question, image],
167
  )
168
 
169
  return response.text
170
 
171
  # Analyze video tool
172
+ def video_understanding(url_to_video: str, question: str) -> str:
173
  """
174
  Analyzes a video and generates a response to a given question based on the video's content.
175
 
176
  Args:
177
+ url_to_video (str): The URL to the video file to be analyzed (example:YouTube).
178
  question (str): The question to be answered, based on the contents of the video.
179
 
180
  Returns:
 
183
 
184
  client = genai.Client(api_key=GEMINI_API_KEY)
185
 
 
 
186
  response = client.models.generate_content(
187
  model=GEMINI_MODEL_NAME,
188
+ contents=genai.types.Content(
189
+ parts=[
190
+ genai.types.Part(
191
+ file_data=genai.types.FileData(file_uri=url_to_video)
192
+ ),
193
+ genai.types.Part(text=question)
194
+ ]
195
+ )
196
  )
197
 
198
  return response.text
199
 
 
200
  ## Read .csv file tool
201
  def read_csv_file(path_to_csv: str) -> str:
202
  """
 
235
  # Return df as plain text
236
  return df.to_string(index=False)
237
  except Exception as e:
238
+ return f"Error reading the XLSX file: {e}"
239
+
240
+ # Example usage of the tools
241
+ if __name__ == "__main__":
242
+ # Example usage of the tools
243
+ # print(video_understanding("https://www.youtube.com/watch?v=L1vXCYZAYYM", "What is happening in this video?"))
244
+ print(image_understanding("https://i.etsystatic.com/28810262/r/il/2fc5e0/5785166966/il_1140xN.5785166966_nvy4.jpg", "What does this image represent?"))