David
committed on
Commit
·
b0c6c93
1
Parent(s):
7da5655
Agent passed.
Browse files- agent.py +21 -63
- app.py +2 -1
- gaia_system_prompt.py +11 -0
- tools.py +26 -14
agent.py
CHANGED
|
@@ -1,69 +1,59 @@
|
|
| 1 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
| 2 |
from llama_index.llms.gemini import Gemini
|
| 3 |
from llama_index.tools.arxiv import ArxivToolSpec
|
| 4 |
from llama_index.tools.wikipedia import WikipediaToolSpec
|
| 5 |
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
|
| 6 |
from llama_index.core.tools import FunctionTool
|
| 7 |
-
from llama_index.core.agent.workflow import AgentWorkflow
|
| 8 |
-
from llama_index.llms.lmstudio import LMStudio
|
| 9 |
-
from llama_index.core.agent.workflow import (
|
| 10 |
-
AgentStream,
|
| 11 |
-
AgentOutput
|
| 12 |
-
)
|
| 13 |
from gradio import ChatMessage
|
| 14 |
from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
|
| 15 |
|
| 16 |
from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
|
| 17 |
-
from gaia_system_prompt import
|
| 18 |
|
| 19 |
import os
|
| 20 |
import asyncio
|
| 21 |
|
| 22 |
TIMEOUT=180 # Timeout for agent execution in seconds
|
| 23 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
| 24 |
-
|
| 25 |
-
GEMINI_MODEL_NAME = "gemini-2.0-flash"
|
| 26 |
-
LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
|
| 27 |
-
API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
|
| 28 |
|
| 29 |
class FinalAgent:
|
| 30 |
def __init__(self):
|
| 31 |
# LLM Initialization
|
| 32 |
-
# self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
| 33 |
self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
| 34 |
-
# self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
|
| 35 |
|
| 36 |
# Tool Initialization
|
| 37 |
self.tools = [
|
| 38 |
FunctionTool.from_defaults(
|
| 39 |
fn=interpret_python_math_code,
|
| 40 |
name="InterpretPythonMathCode",
|
| 41 |
-
description=
|
| 42 |
),
|
| 43 |
FunctionTool.from_defaults(
|
| 44 |
fn=image_understanding,
|
| 45 |
name="ImageUnderstanding",
|
| 46 |
-
description=
|
| 47 |
),
|
| 48 |
FunctionTool.from_defaults(
|
| 49 |
fn=convert_audio_to_text,
|
| 50 |
name="ConvertAudioToText",
|
| 51 |
-
description=
|
| 52 |
),
|
| 53 |
FunctionTool.from_defaults(
|
| 54 |
fn=video_understanding,
|
| 55 |
name="VideoUnderstanding",
|
| 56 |
-
description=
|
| 57 |
),
|
| 58 |
FunctionTool.from_defaults(
|
| 59 |
fn=read_csv_file,
|
| 60 |
name="ReadCSVFile",
|
| 61 |
-
description=
|
| 62 |
),
|
| 63 |
FunctionTool.from_defaults(
|
| 64 |
fn=read_xlsx_file,
|
| 65 |
name="ReadXLSXFile",
|
| 66 |
-
description=
|
| 67 |
)
|
| 68 |
]
|
| 69 |
self.tools.extend(
|
|
@@ -75,11 +65,7 @@ class FinalAgent:
|
|
| 75 |
self.tools.extend(
|
| 76 |
DuckDuckGoSearchToolSpec().to_tool_list()
|
| 77 |
)
|
| 78 |
-
|
| 79 |
-
# Print the tools for debugging
|
| 80 |
-
print("Tools initialized:")
|
| 81 |
-
for tool in self.tools:
|
| 82 |
-
print(f"- {tool._metadata}")
|
| 83 |
|
| 84 |
# Agent Workflow Initialization
|
| 85 |
self.agent = AgentWorkflow.from_tools_or_functions(
|
|
@@ -89,37 +75,8 @@ class FinalAgent:
|
|
| 89 |
timeout=TIMEOUT
|
| 90 |
)
|
| 91 |
|
| 92 |
-
# self.agent = ReActAgent(
|
| 93 |
-
# llm=self.llm,
|
| 94 |
-
# verbose=True,
|
| 95 |
-
# max_iterations=5,
|
| 96 |
-
# system_prompt=CUSTOM_SYSTEM_PROMPT,
|
| 97 |
-
# tools=self.tools
|
| 98 |
-
# )
|
| 99 |
-
|
| 100 |
print("FinalAgent initialized.")
|
| 101 |
-
|
| 102 |
-
# # Example
|
| 103 |
-
# print(f"Agent received question: {question}")
|
| 104 |
-
# # fixed_answer = "This is a default answer."
|
| 105 |
-
# # print(f"Agent returning fixed answer: {fixed_answer}")
|
| 106 |
-
# # response = fixed_answer
|
| 107 |
-
|
| 108 |
-
# # Implement agent logic here
|
| 109 |
-
# response = ""
|
| 110 |
-
# # Run the agent with the question
|
| 111 |
-
# stream = await self.agent.run(question)
|
| 112 |
-
# response = stream.response.content
|
| 113 |
-
# # async for event in stream.stream_events():
|
| 114 |
-
# # if isinstance(event, AgentStream):
|
| 115 |
-
# # # Check if delta is empty
|
| 116 |
-
# # if event.raw["choices"][0]["delta"] != {}:
|
| 117 |
-
# # response += event.raw["choices"][0]["delta"]["content"]
|
| 118 |
-
|
| 119 |
-
# print(f"Agent response: {response}")
|
| 120 |
-
|
| 121 |
-
# return response
|
| 122 |
-
|
| 123 |
async def __call__(self, question: str) -> str:
|
| 124 |
print(f"Agent received question: {question}")
|
| 125 |
|
|
@@ -170,12 +127,13 @@ class FinalAgent:
|
|
| 170 |
return response_str
|
| 171 |
|
| 172 |
|
| 173 |
-
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
| 1 |
from llama_index.llms.gemini import Gemini
|
| 2 |
from llama_index.tools.arxiv import ArxivToolSpec
|
| 3 |
from llama_index.tools.wikipedia import WikipediaToolSpec
|
| 4 |
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
|
| 5 |
from llama_index.core.tools import FunctionTool
|
| 6 |
+
from llama_index.core.agent.workflow import AgentWorkflow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from gradio import ChatMessage
|
| 8 |
from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
|
| 9 |
|
| 10 |
from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
|
| 11 |
+
from gaia_system_prompt import CUSTOM_SYSTEM_PROMPT
|
| 12 |
|
| 13 |
import os
|
| 14 |
import asyncio
|
| 15 |
|
| 16 |
TIMEOUT=180 # Timeout for agent execution in seconds
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
| 18 |
+
GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"
|
| 19 |
+
# GEMINI_MODEL_NAME = "gemini-2.0-flash"
|
|
|
|
|
|
|
| 20 |
|
| 21 |
class FinalAgent:
|
| 22 |
def __init__(self):
|
| 23 |
# LLM Initialization
|
|
|
|
| 24 |
self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
|
|
|
| 25 |
|
| 26 |
# Tool Initialization
|
| 27 |
self.tools = [
|
| 28 |
FunctionTool.from_defaults(
|
| 29 |
fn=interpret_python_math_code,
|
| 30 |
name="InterpretPythonMathCode",
|
| 31 |
+
description=interpret_python_math_code.__doc__
|
| 32 |
),
|
| 33 |
FunctionTool.from_defaults(
|
| 34 |
fn=image_understanding,
|
| 35 |
name="ImageUnderstanding",
|
| 36 |
+
description=image_understanding.__doc__
|
| 37 |
),
|
| 38 |
FunctionTool.from_defaults(
|
| 39 |
fn=convert_audio_to_text,
|
| 40 |
name="ConvertAudioToText",
|
| 41 |
+
description= convert_audio_to_text.__doc__
|
| 42 |
),
|
| 43 |
FunctionTool.from_defaults(
|
| 44 |
fn=video_understanding,
|
| 45 |
name="VideoUnderstanding",
|
| 46 |
+
description= video_understanding.__doc__
|
| 47 |
),
|
| 48 |
FunctionTool.from_defaults(
|
| 49 |
fn=read_csv_file,
|
| 50 |
name="ReadCSVFile",
|
| 51 |
+
description=read_csv_file.__doc__
|
| 52 |
),
|
| 53 |
FunctionTool.from_defaults(
|
| 54 |
fn=read_xlsx_file,
|
| 55 |
name="ReadXLSXFile",
|
| 56 |
+
description= read_xlsx_file.__doc__
|
| 57 |
)
|
| 58 |
]
|
| 59 |
self.tools.extend(
|
|
|
|
| 65 |
self.tools.extend(
|
| 66 |
DuckDuckGoSearchToolSpec().to_tool_list()
|
| 67 |
)
|
| 68 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# Agent Workflow Initialization
|
| 71 |
self.agent = AgentWorkflow.from_tools_or_functions(
|
|
|
|
| 75 |
timeout=TIMEOUT
|
| 76 |
)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
print("FinalAgent initialized.")
|
| 79 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
async def __call__(self, question: str) -> str:
|
| 81 |
print(f"Agent received question: {question}")
|
| 82 |
|
|
|
|
| 127 |
return response_str
|
| 128 |
|
| 129 |
|
| 130 |
+
async def main():
|
| 131 |
+
# Example usage
|
| 132 |
+
agent = FinalAgent()
|
| 133 |
+
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
|
| 134 |
+
question2 = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 135 |
+
answer = await agent(question)
|
| 136 |
+
print(f"Final answer: {answer}")
|
| 137 |
|
| 138 |
+
if __name__ == "__main__":
|
| 139 |
+
asyncio.run(main())
|
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from agent import FinalAgent
|
|
| 8 |
import asyncio
|
| 9 |
import time
|
| 10 |
|
| 11 |
-
SLEEP_TIME_BETWEEN_QUESTIONS =
|
| 12 |
|
| 13 |
# (Keep Constants as is)
|
| 14 |
# --- Constants ---
|
|
@@ -89,6 +89,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 89 |
continue
|
| 90 |
try:
|
| 91 |
# Run the agent on the question
|
|
|
|
| 92 |
submitted_answer = await agent(question_text)
|
| 93 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 94 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
|
| 8 |
import asyncio
|
| 9 |
import time
|
| 10 |
|
| 11 |
+
SLEEP_TIME_BETWEEN_QUESTIONS = 60 # Sleep time between questions to avoid rate limiting
|
| 12 |
|
| 13 |
# (Keep Constants as is)
|
| 14 |
# --- Constants ---
|
|
|
|
| 89 |
continue
|
| 90 |
try:
|
| 91 |
# Run the agent on the question
|
| 92 |
+
print(item)
|
| 93 |
submitted_answer = await agent(question_text)
|
| 94 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 95 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
gaia_system_prompt.py
CHANGED
|
@@ -13,6 +13,17 @@ I provide you some guidelines to follow:
|
|
| 13 |
3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 14 |
4. If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
The final answer should be written in the following format:
|
| 17 |
<final_answer>
|
| 18 |
YOUR FINAL ANSWER
|
|
|
|
| 13 |
3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 14 |
4. If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
| 15 |
|
| 16 |
+
To answer the questions, you should use the following tools:
|
| 17 |
+
- DuckDuckGoSearchTool: Use this tool to search the web for information.
|
| 18 |
+
- ArxivTool: Use this tool to search for academic papers on arXiv.
|
| 19 |
+
- WikipediaTool: Use this tool to search for information on Wikipedia.
|
| 20 |
+
- InterpretPythonCodeTool: Use this tool to execute Python code to perform math calculations and return the result.
|
| 21 |
+
- ImageUnderstandingTool: Use this tool to analyze images and extract information.
|
| 22 |
+
- ConvertAudioToTextTool: Use this tool to convert audio files to text.
|
| 23 |
+
- VideoUnderstandingTool: Use this tool to analyze videos and extract information.
|
| 24 |
+
- ReadCSVFileTool: Use this tool to read CSV files and extract information.
|
| 25 |
+
- ReadXLSXFileTool: Use this tool to read XLSX files and extract information.
|
| 26 |
+
|
| 27 |
The final answer should be written in the following format:
|
| 28 |
<final_answer>
|
| 29 |
YOUR FINAL ANSWER
|
tools.py
CHANGED
|
@@ -11,6 +11,7 @@ import mimetypes
|
|
| 11 |
import base64
|
| 12 |
|
| 13 |
from google import genai
|
|
|
|
| 14 |
|
| 15 |
ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
|
| 16 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
|
@@ -126,12 +127,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
|
|
| 126 |
"""
|
| 127 |
Converts speech from an audio file into text.
|
| 128 |
Args:
|
| 129 |
-
path_to_audio (str): The path to the audio file to be transcribed.
|
| 130 |
Returns:
|
| 131 |
str: The transcribed text content of the audio file.
|
| 132 |
"""
|
| 133 |
|
| 134 |
-
client = genai.Client(api_key=
|
| 135 |
|
| 136 |
myfile = client.files.upload(file=path_to_audio)
|
| 137 |
|
|
@@ -143,12 +144,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
|
|
| 143 |
return transcription.text
|
| 144 |
|
| 145 |
# Analyze image tool
|
| 146 |
-
def image_understanding(
|
| 147 |
"""
|
| 148 |
-
Analyzes an image and generates a response to a given question based on the image's content.
|
| 149 |
|
| 150 |
Args:
|
| 151 |
-
path_to_image (str): The
|
| 152 |
question (str): The question to be answered, based on the contents of the image.
|
| 153 |
|
| 154 |
Returns:
|
|
@@ -157,22 +158,23 @@ def image_understanding(path_to_image: str, question: str) -> str:
|
|
| 157 |
|
| 158 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 159 |
|
| 160 |
-
|
|
|
|
| 161 |
|
| 162 |
response = client.models.generate_content(
|
| 163 |
model=GEMINI_MODEL_NAME,
|
| 164 |
-
contents=[
|
| 165 |
)
|
| 166 |
|
| 167 |
return response.text
|
| 168 |
|
| 169 |
# Analyze video tool
|
| 170 |
-
def video_understanding(
|
| 171 |
"""
|
| 172 |
Analyzes a video and generates a response to a given question based on the video's content.
|
| 173 |
|
| 174 |
Args:
|
| 175 |
-
|
| 176 |
question (str): The question to be answered, based on the contents of the video.
|
| 177 |
|
| 178 |
Returns:
|
|
@@ -181,16 +183,20 @@ def video_understanding(path_to_video: str, question: str) -> str:
|
|
| 181 |
|
| 182 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 183 |
|
| 184 |
-
my_file = client.files.upload(file=path_to_video)
|
| 185 |
-
|
| 186 |
response = client.models.generate_content(
|
| 187 |
model=GEMINI_MODEL_NAME,
|
| 188 |
-
contents=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
|
| 191 |
return response.text
|
| 192 |
|
| 193 |
-
|
| 194 |
## Read .csv file tool
|
| 195 |
def read_csv_file(path_to_csv: str) -> str:
|
| 196 |
"""
|
|
@@ -229,4 +235,10 @@ def read_xlsx_file(path_to_xlsx: str) -> str:
|
|
| 229 |
# Return df as plain text
|
| 230 |
return df.to_string(index=False)
|
| 231 |
except Exception as e:
|
| 232 |
-
return f"Error reading the XLSX file: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import base64
|
| 12 |
|
| 13 |
from google import genai
|
| 14 |
+
import requests
|
| 15 |
|
| 16 |
ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
|
|
|
| 127 |
"""
|
| 128 |
Converts speech from an audio file into text.
|
| 129 |
Args:
|
| 130 |
+
path_to_audio (str): The path to the audio file to be transcribed. A URL can also be used.
|
| 131 |
Returns:
|
| 132 |
str: The transcribed text content of the audio file.
|
| 133 |
"""
|
| 134 |
|
| 135 |
+
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 136 |
|
| 137 |
myfile = client.files.upload(file=path_to_audio)
|
| 138 |
|
|
|
|
| 144 |
return transcription.text
|
| 145 |
|
| 146 |
# Analyze image tool
|
| 147 |
+
def image_understanding(url_to_image: str, question: str) -> str:
|
| 148 |
"""
|
| 149 |
+
Analyzes an image and generates a response to a given question based on the image's content. A URL needs to be used.
|
| 150 |
|
| 151 |
Args:
|
| 152 |
+
path_to_image (str): The URL to the image file to be analyzed.
|
| 153 |
question (str): The question to be answered, based on the contents of the image.
|
| 154 |
|
| 155 |
Returns:
|
|
|
|
| 158 |
|
| 159 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 160 |
|
| 161 |
+
image_bytes = requests.get(url_to_image).content
|
| 162 |
+
image = genai.types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 163 |
|
| 164 |
response = client.models.generate_content(
|
| 165 |
model=GEMINI_MODEL_NAME,
|
| 166 |
+
contents=[question, image],
|
| 167 |
)
|
| 168 |
|
| 169 |
return response.text
|
| 170 |
|
| 171 |
# Analyze video tool
|
| 172 |
+
def video_understanding(url_to_video: str, question: str) -> str:
|
| 173 |
"""
|
| 174 |
Analyzes a video and generates a response to a given question based on the video's content.
|
| 175 |
|
| 176 |
Args:
|
| 177 |
+
url_to_video (str): The URL to the video file to be analyzed (example: YouTube).
|
| 178 |
question (str): The question to be answered, based on the contents of the video.
|
| 179 |
|
| 180 |
Returns:
|
|
|
|
| 183 |
|
| 184 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 185 |
|
|
|
|
|
|
|
| 186 |
response = client.models.generate_content(
|
| 187 |
model=GEMINI_MODEL_NAME,
|
| 188 |
+
contents=genai.types.Content(
|
| 189 |
+
parts=[
|
| 190 |
+
genai.types.Part(
|
| 191 |
+
file_data=genai.types.FileData(file_uri=url_to_video)
|
| 192 |
+
),
|
| 193 |
+
genai.types.Part(text=question)
|
| 194 |
+
]
|
| 195 |
+
)
|
| 196 |
)
|
| 197 |
|
| 198 |
return response.text
|
| 199 |
|
|
|
|
| 200 |
## Read .csv file tool
|
| 201 |
def read_csv_file(path_to_csv: str) -> str:
|
| 202 |
"""
|
|
|
|
| 235 |
# Return df as plain text
|
| 236 |
return df.to_string(index=False)
|
| 237 |
except Exception as e:
|
| 238 |
+
return f"Error reading the XLSX file: {e}"
|
| 239 |
+
|
| 240 |
+
# Example usage of the tools
|
| 241 |
+
if __name__ == "__main__":
|
| 242 |
+
# Example usage of the tools
|
| 243 |
+
# print(video_understanding("https://www.youtube.com/watch?v=L1vXCYZAYYM", "What is happening in this video?"))
|
| 244 |
+
print(image_understanding("https://i.etsystatic.com/28810262/r/il/2fc5e0/5785166966/il_1140xN.5785166966_nvy4.jpg", "What does this image represent?"))
|