David
committed on
Commit
·
b0c6c93
1
Parent(s):
7da5655
Agent passed.
Browse files- agent.py +21 -63
- app.py +2 -1
- gaia_system_prompt.py +11 -0
- tools.py +26 -14
agent.py
CHANGED
|
@@ -1,69 +1,59 @@
|
|
| 1 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
| 2 |
from llama_index.llms.gemini import Gemini
|
| 3 |
from llama_index.tools.arxiv import ArxivToolSpec
|
| 4 |
from llama_index.tools.wikipedia import WikipediaToolSpec
|
| 5 |
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
|
| 6 |
from llama_index.core.tools import FunctionTool
|
| 7 |
-
from llama_index.core.agent.workflow import AgentWorkflow
|
| 8 |
-
from llama_index.llms.lmstudio import LMStudio
|
| 9 |
-
from llama_index.core.agent.workflow import (
|
| 10 |
-
AgentStream,
|
| 11 |
-
AgentOutput
|
| 12 |
-
)
|
| 13 |
from gradio import ChatMessage
|
| 14 |
from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
|
| 15 |
|
| 16 |
from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
|
| 17 |
-
from gaia_system_prompt import
|
| 18 |
|
| 19 |
import os
|
| 20 |
import asyncio
|
| 21 |
|
| 22 |
TIMEOUT=180 # Timeout for agent execution in seconds
|
| 23 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
| 24 |
-
|
| 25 |
-
GEMINI_MODEL_NAME = "gemini-2.0-flash"
|
| 26 |
-
LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
|
| 27 |
-
API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
|
| 28 |
|
| 29 |
class FinalAgent:
|
| 30 |
def __init__(self):
|
| 31 |
# LLM Initialization
|
| 32 |
-
# self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
| 33 |
self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
| 34 |
-
# self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
|
| 35 |
|
| 36 |
# Tool Initialization
|
| 37 |
self.tools = [
|
| 38 |
FunctionTool.from_defaults(
|
| 39 |
fn=interpret_python_math_code,
|
| 40 |
name="InterpretPythonMathCode",
|
| 41 |
-
description=
|
| 42 |
),
|
| 43 |
FunctionTool.from_defaults(
|
| 44 |
fn=image_understanding,
|
| 45 |
name="ImageUnderstanding",
|
| 46 |
-
description=
|
| 47 |
),
|
| 48 |
FunctionTool.from_defaults(
|
| 49 |
fn=convert_audio_to_text,
|
| 50 |
name="ConvertAudioToText",
|
| 51 |
-
description=
|
| 52 |
),
|
| 53 |
FunctionTool.from_defaults(
|
| 54 |
fn=video_understanding,
|
| 55 |
name="VideoUnderstanding",
|
| 56 |
-
description=
|
| 57 |
),
|
| 58 |
FunctionTool.from_defaults(
|
| 59 |
fn=read_csv_file,
|
| 60 |
name="ReadCSVFile",
|
| 61 |
-
description=
|
| 62 |
),
|
| 63 |
FunctionTool.from_defaults(
|
| 64 |
fn=read_xlsx_file,
|
| 65 |
name="ReadXLSXFile",
|
| 66 |
-
description=
|
| 67 |
)
|
| 68 |
]
|
| 69 |
self.tools.extend(
|
|
@@ -75,11 +65,7 @@ class FinalAgent:
|
|
| 75 |
self.tools.extend(
|
| 76 |
DuckDuckGoSearchToolSpec().to_tool_list()
|
| 77 |
)
|
| 78 |
-
|
| 79 |
-
# Print the tools for debugging
|
| 80 |
-
print("Tools initialized:")
|
| 81 |
-
for tool in self.tools:
|
| 82 |
-
print(f"- {tool._metadata}")
|
| 83 |
|
| 84 |
# Agent Workflow Initialization
|
| 85 |
self.agent = AgentWorkflow.from_tools_or_functions(
|
|
@@ -89,37 +75,8 @@ class FinalAgent:
|
|
| 89 |
timeout=TIMEOUT
|
| 90 |
)
|
| 91 |
|
| 92 |
-
# self.agent = ReActAgent(
|
| 93 |
-
# llm=self.llm,
|
| 94 |
-
# verbose=True,
|
| 95 |
-
# max_iterations=5,
|
| 96 |
-
# system_prompt=CUSTOM_SYSTEM_PROMPT,
|
| 97 |
-
# tools=self.tools
|
| 98 |
-
# )
|
| 99 |
-
|
| 100 |
print("FinalAgent initialized.")
|
| 101 |
-
|
| 102 |
-
# # Example
|
| 103 |
-
# print(f"Agent received question: {question}")
|
| 104 |
-
# # fixed_answer = "This is a default answer."
|
| 105 |
-
# # print(f"Agent returning fixed answer: {fixed_answer}")
|
| 106 |
-
# # response = fixed_answer
|
| 107 |
-
|
| 108 |
-
# # Implement agent logic here
|
| 109 |
-
# response = ""
|
| 110 |
-
# # Run the agent with the question
|
| 111 |
-
# stream = await self.agent.run(question)
|
| 112 |
-
# response = stream.response.content
|
| 113 |
-
# # async for event in stream.stream_events():
|
| 114 |
-
# # if isinstance(event, AgentStream):
|
| 115 |
-
# # # Check if delta is empty
|
| 116 |
-
# # if event.raw["choices"][0]["delta"] != {}:
|
| 117 |
-
# # response += event.raw["choices"][0]["delta"]["content"]
|
| 118 |
-
|
| 119 |
-
# print(f"Agent response: {response}")
|
| 120 |
-
|
| 121 |
-
# return response
|
| 122 |
-
|
| 123 |
async def __call__(self, question: str) -> str:
|
| 124 |
print(f"Agent received question: {question}")
|
| 125 |
|
|
@@ -170,12 +127,13 @@ class FinalAgent:
|
|
| 170 |
return response_str
|
| 171 |
|
| 172 |
|
| 173 |
-
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
| 1 |
from llama_index.llms.gemini import Gemini
|
| 2 |
from llama_index.tools.arxiv import ArxivToolSpec
|
| 3 |
from llama_index.tools.wikipedia import WikipediaToolSpec
|
| 4 |
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
|
| 5 |
from llama_index.core.tools import FunctionTool
|
| 6 |
+
from llama_index.core.agent.workflow import AgentWorkflow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from gradio import ChatMessage
|
| 8 |
from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
|
| 9 |
|
| 10 |
from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
|
| 11 |
+
from gaia_system_prompt import CUSTOM_SYSTEM_PROMPT
|
| 12 |
|
| 13 |
import os
|
| 14 |
import asyncio
|
| 15 |
|
| 16 |
TIMEOUT=180 # Timeout for agent execution in seconds
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
| 18 |
+
GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"
|
| 19 |
+
# GEMINI_MODEL_NAME = "gemini-2.0-flash"
|
|
|
|
|
|
|
| 20 |
|
| 21 |
class FinalAgent:
|
| 22 |
def __init__(self):
|
| 23 |
# LLM Initialization
|
|
|
|
| 24 |
self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
|
|
|
|
| 25 |
|
| 26 |
# Tool Initialization
|
| 27 |
self.tools = [
|
| 28 |
FunctionTool.from_defaults(
|
| 29 |
fn=interpret_python_math_code,
|
| 30 |
name="InterpretPythonMathCode",
|
| 31 |
+
description=interpret_python_math_code.__doc__
|
| 32 |
),
|
| 33 |
FunctionTool.from_defaults(
|
| 34 |
fn=image_understanding,
|
| 35 |
name="ImageUnderstanding",
|
| 36 |
+
description=image_understanding.__doc__
|
| 37 |
),
|
| 38 |
FunctionTool.from_defaults(
|
| 39 |
fn=convert_audio_to_text,
|
| 40 |
name="ConvertAudioToText",
|
| 41 |
+
description= convert_audio_to_text.__doc__
|
| 42 |
),
|
| 43 |
FunctionTool.from_defaults(
|
| 44 |
fn=video_understanding,
|
| 45 |
name="VideoUnderstanding",
|
| 46 |
+
description= video_understanding.__doc__
|
| 47 |
),
|
| 48 |
FunctionTool.from_defaults(
|
| 49 |
fn=read_csv_file,
|
| 50 |
name="ReadCSVFile",
|
| 51 |
+
description=read_csv_file.__doc__
|
| 52 |
),
|
| 53 |
FunctionTool.from_defaults(
|
| 54 |
fn=read_xlsx_file,
|
| 55 |
name="ReadXLSXFile",
|
| 56 |
+
description= read_xlsx_file.__doc__
|
| 57 |
)
|
| 58 |
]
|
| 59 |
self.tools.extend(
|
|
|
|
| 65 |
self.tools.extend(
|
| 66 |
DuckDuckGoSearchToolSpec().to_tool_list()
|
| 67 |
)
|
| 68 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# Agent Workflow Initialization
|
| 71 |
self.agent = AgentWorkflow.from_tools_or_functions(
|
|
|
|
| 75 |
timeout=TIMEOUT
|
| 76 |
)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
print("FinalAgent initialized.")
|
| 79 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
async def __call__(self, question: str) -> str:
|
| 81 |
print(f"Agent received question: {question}")
|
| 82 |
|
|
|
|
| 127 |
return response_str
|
| 128 |
|
| 129 |
|
| 130 |
+
async def main():
|
| 131 |
+
# Example usage
|
| 132 |
+
agent = FinalAgent()
|
| 133 |
+
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
|
| 134 |
+
question2 = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
|
| 135 |
+
answer = await agent(question)
|
| 136 |
+
print(f"Final answer: {answer}")
|
| 137 |
|
| 138 |
+
if __name__ == "__main__":
|
| 139 |
+
asyncio.run(main())
|
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from agent import FinalAgent
|
|
| 8 |
import asyncio
|
| 9 |
import time
|
| 10 |
|
| 11 |
-
SLEEP_TIME_BETWEEN_QUESTIONS =
|
| 12 |
|
| 13 |
# (Keep Constants as is)
|
| 14 |
# --- Constants ---
|
|
@@ -89,6 +89,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 89 |
continue
|
| 90 |
try:
|
| 91 |
# Run the agent on the question
|
|
|
|
| 92 |
submitted_answer = await agent(question_text)
|
| 93 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 94 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
|
|
|
| 8 |
import asyncio
|
| 9 |
import time
|
| 10 |
|
| 11 |
+
SLEEP_TIME_BETWEEN_QUESTIONS = 60 # Sleep time between questions to avoid rate limiting
|
| 12 |
|
| 13 |
# (Keep Constants as is)
|
| 14 |
# --- Constants ---
|
|
|
|
| 89 |
continue
|
| 90 |
try:
|
| 91 |
# Run the agent on the question
|
| 92 |
+
print(item)
|
| 93 |
submitted_answer = await agent(question_text)
|
| 94 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 95 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
gaia_system_prompt.py
CHANGED
|
@@ -13,6 +13,17 @@ I provide you some guidelines to follow:
|
|
| 13 |
3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 14 |
4. If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
The final answer should be written in the following format:
|
| 17 |
<final_answer>
|
| 18 |
YOUR FINAL ANSWER
|
|
|
|
| 13 |
3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 14 |
4. If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
|
| 15 |
|
| 16 |
+
To answer the questions, you should use the following tools:
|
| 17 |
+
- DuckDuckGoSearchTool: Use this tool to search the web for information.
|
| 18 |
+
- ArxivTool: Use this tool to search for academic papers on arXiv.
|
| 19 |
+
- WikipediaTool: Use this tool to search for information on Wikipedia.
|
| 20 |
+
- InterpretPythonCodeTool: Use this tool to execute Python code to perform math calculations and return the result.
|
| 21 |
+
- ImageUnderstandingTool: Use this tool to analyze images and extract information.
|
| 22 |
+
- ConvertAudioToTextTool: Use this tool to convert audio files to text.
|
| 23 |
+
- VideoUnderstandingTool: Use this tool to analyze videos and extract information.
|
| 24 |
+
- ReadCSVFileTool: Use this tool to read CSV files and extract information.
|
| 25 |
+
- ReadXLSXFileTool: Use this tool to read XLSX files and extract information.
|
| 26 |
+
|
| 27 |
The final answer should be written in the following format:
|
| 28 |
<final_answer>
|
| 29 |
YOUR FINAL ANSWER
|
tools.py
CHANGED
|
@@ -11,6 +11,7 @@ import mimetypes
|
|
| 11 |
import base64
|
| 12 |
|
| 13 |
from google import genai
|
|
|
|
| 14 |
|
| 15 |
ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
|
| 16 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
|
@@ -126,12 +127,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
|
|
| 126 |
"""
|
| 127 |
Converts speech from an audio file into text.
|
| 128 |
Args:
|
| 129 |
-
path_to_audio (str): The path to the audio file to be transcribed.
|
| 130 |
Returns:
|
| 131 |
str: The transcribed text content of the audio file.
|
| 132 |
"""
|
| 133 |
|
| 134 |
-
client = genai.Client(api_key=
|
| 135 |
|
| 136 |
myfile = client.files.upload(file=path_to_audio)
|
| 137 |
|
|
@@ -143,12 +144,12 @@ def convert_audio_to_text(path_to_audio: str) -> str:
|
|
| 143 |
return transcription.text
|
| 144 |
|
| 145 |
# Analyze image tool
|
| 146 |
-
def image_understanding(
|
| 147 |
"""
|
| 148 |
-
Analyzes an image and generates a response to a given question based on the image's content.
|
| 149 |
|
| 150 |
Args:
|
| 151 |
-
path_to_image (str): The
|
| 152 |
question (str): The question to be answered, based on the contents of the image.
|
| 153 |
|
| 154 |
Returns:
|
|
@@ -157,22 +158,23 @@ def image_understanding(path_to_image: str, question: str) -> str:
|
|
| 157 |
|
| 158 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 159 |
|
| 160 |
-
|
|
|
|
| 161 |
|
| 162 |
response = client.models.generate_content(
|
| 163 |
model=GEMINI_MODEL_NAME,
|
| 164 |
-
contents=[
|
| 165 |
)
|
| 166 |
|
| 167 |
return response.text
|
| 168 |
|
| 169 |
# Analyze video tool
|
| 170 |
-
def video_understanding(
|
| 171 |
"""
|
| 172 |
Analyzes a video and generates a response to a given question based on the video's content.
|
| 173 |
|
| 174 |
Args:
|
| 175 |
-
|
| 176 |
question (str): The question to be answered, based on the contents of the video.
|
| 177 |
|
| 178 |
Returns:
|
|
@@ -181,16 +183,20 @@ def video_understanding(path_to_video: str, question: str) -> str:
|
|
| 181 |
|
| 182 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 183 |
|
| 184 |
-
my_file = client.files.upload(file=path_to_video)
|
| 185 |
-
|
| 186 |
response = client.models.generate_content(
|
| 187 |
model=GEMINI_MODEL_NAME,
|
| 188 |
-
contents=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
|
| 191 |
return response.text
|
| 192 |
|
| 193 |
-
|
| 194 |
## Read .csv file tool
|
| 195 |
def read_csv_file(path_to_csv: str) -> str:
|
| 196 |
"""
|
|
@@ -229,4 +235,10 @@ def read_xlsx_file(path_to_xlsx: str) -> str:
|
|
| 229 |
# Return df as plain text
|
| 230 |
return df.to_string(index=False)
|
| 231 |
except Exception as e:
|
| 232 |
-
return f"Error reading the XLSX file: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import base64
|
| 12 |
|
| 13 |
from google import genai
|
| 14 |
+
import requests
|
| 15 |
|
| 16 |
ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
|
| 17 |
GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
|
|
|
|
| 127 |
"""
|
| 128 |
Converts speech from an audio file into text.
|
| 129 |
Args:
|
| 130 |
+
path_to_audio (str): The path to the audio file to be transcribed. A URL can also be used.
|
| 131 |
Returns:
|
| 132 |
str: The transcribed text content of the audio file.
|
| 133 |
"""
|
| 134 |
|
| 135 |
+
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 136 |
|
| 137 |
myfile = client.files.upload(file=path_to_audio)
|
| 138 |
|
|
|
|
| 144 |
return transcription.text
|
| 145 |
|
| 146 |
# Analyze image tool
|
| 147 |
+
def image_understanding(url_to_image: str, question: str) -> str:
|
| 148 |
"""
|
| 149 |
+
Analyzes an image and generates a response to a given question based on the image's content. A URL needs to be used.
|
| 150 |
|
| 151 |
Args:
|
| 152 |
+
path_to_image (str): The URL to the image file to be analyzed.
|
| 153 |
question (str): The question to be answered, based on the contents of the image.
|
| 154 |
|
| 155 |
Returns:
|
|
|
|
| 158 |
|
| 159 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 160 |
|
| 161 |
+
image_bytes = requests.get(url_to_image).content
|
| 162 |
+
image = genai.types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 163 |
|
| 164 |
response = client.models.generate_content(
|
| 165 |
model=GEMINI_MODEL_NAME,
|
| 166 |
+
contents=[question, image],
|
| 167 |
)
|
| 168 |
|
| 169 |
return response.text
|
| 170 |
|
| 171 |
# Analyze video tool
|
| 172 |
+
def video_understanding(url_to_video: str, question: str) -> str:
|
| 173 |
"""
|
| 174 |
Analyzes a video and generates a response to a given question based on the video's content.
|
| 175 |
|
| 176 |
Args:
|
| 177 |
+
url_to_video (str): The URL to the video file to be analyzed (example: YouTube).
|
| 178 |
question (str): The question to be answered, based on the contents of the video.
|
| 179 |
|
| 180 |
Returns:
|
|
|
|
| 183 |
|
| 184 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 185 |
|
|
|
|
|
|
|
| 186 |
response = client.models.generate_content(
|
| 187 |
model=GEMINI_MODEL_NAME,
|
| 188 |
+
contents=genai.types.Content(
|
| 189 |
+
parts=[
|
| 190 |
+
genai.types.Part(
|
| 191 |
+
file_data=genai.types.FileData(file_uri=url_to_video)
|
| 192 |
+
),
|
| 193 |
+
genai.types.Part(text=question)
|
| 194 |
+
]
|
| 195 |
+
)
|
| 196 |
)
|
| 197 |
|
| 198 |
return response.text
|
| 199 |
|
|
|
|
| 200 |
## Read .csv file tool
|
| 201 |
def read_csv_file(path_to_csv: str) -> str:
|
| 202 |
"""
|
|
|
|
| 235 |
# Return df as plain text
|
| 236 |
return df.to_string(index=False)
|
| 237 |
except Exception as e:
|
| 238 |
+
return f"Error reading the XLSX file: {e}"
|
| 239 |
+
|
| 240 |
+
# Example usage of the tools
|
| 241 |
+
if __name__ == "__main__":
|
| 242 |
+
# Example usage of the tools
|
| 243 |
+
# print(video_understanding("https://www.youtube.com/watch?v=L1vXCYZAYYM", "What is happening in this video?"))
|
| 244 |
+
print(image_understanding("https://i.etsystatic.com/28810262/r/il/2fc5e0/5785166966/il_1140xN.5785166966_nvy4.jpg", "What does this image represent?"))
|