David committed on
Commit
7da5655
·
1 Parent(s): 3f771a9

Included tools to understand audio, images and video. A sleep between questions is included to avoid exceeding the free-tier RPM limit

Browse files
Files changed (5) hide show
  1. agent.py +61 -34
  2. app.py +6 -0
  3. gaia_system_prompt.py +11 -14
  4. requirements.txt +1 -4
  5. tools.py +47 -57
agent.py CHANGED
@@ -1,7 +1,5 @@
1
  from llama_index.llms.google_genai import GoogleGenAI
2
  from llama_index.llms.gemini import Gemini
3
- from llama_index.llms.groq import Groq
4
- from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
5
  from llama_index.tools.arxiv import ArxivToolSpec
6
  from llama_index.tools.wikipedia import WikipediaToolSpec
7
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
@@ -15,17 +13,16 @@ from llama_index.core.agent.workflow import (
15
  from gradio import ChatMessage
16
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
17
 
18
- from tools import interpret_python_math_code
19
- from gaia_system_prompt import SYSTEM_PROMPT as GAIA_SYSTEM_PROMPT
20
 
21
  import os
22
  import asyncio
23
 
24
  TIMEOUT=180 # Timeout for agent execution in seconds
25
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
26
- GROQ_API_KEY = os.getenv("GROQ_TOKEN")
27
  GEMINI_OPENAI_API_DIR = "https://generativelanguage.googleapis.com/v1beta/openai/"
28
- GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"
29
  LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
30
  API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
31
 
@@ -33,10 +30,8 @@ class FinalAgent:
33
  def __init__(self):
34
  # LLM Initialization
35
  # self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
36
- # self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
37
- # self.llm = Groq(model="meta-llama/llama-4-maverick-17b-128e-instruct", api_key=GROQ_API_KEY)
38
  # self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
39
- self.llm = HuggingFaceInferenceAPI(model_name="meta-llama/Llama-3.3-70B-Instruct", timeout=TIMEOUT)
40
 
41
  # Tool Initialization
42
  self.tools = [
@@ -44,6 +39,31 @@ class FinalAgent:
44
  fn=interpret_python_math_code,
45
  name="InterpretPythonMathCode",
46
  description="Interprets Python code for mathematical expressions."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  )
48
  ]
49
  self.tools.extend(
@@ -56,22 +76,27 @@ class FinalAgent:
56
  DuckDuckGoSearchToolSpec().to_tool_list()
57
  )
58
 
59
- # Agent Workflow Initialization
60
- # self.agent = AgentWorkflow.from_tools_or_functions(
61
- # tools_or_functions=self.tools,
62
- # llm=self.llm,
63
- # system_prompt=GAIA_SYSTEM_PROMPT,
64
- # timeout=TIMEOUT
65
- # )
66
 
67
- self.agent = ReActAgent(
 
 
68
  llm=self.llm,
69
- verbose=True,
70
- max_iterations=5,
71
- system_prompt=GAIA_SYSTEM_PROMPT,
72
- tools=self.tools
73
  )
74
 
 
 
 
 
 
 
 
 
75
  print("FinalAgent initialized.")
76
  # async def __call__(self, question: str) -> str:
77
  # # Example
@@ -102,6 +127,7 @@ class FinalAgent:
102
  try:
103
  # Use arun for an async method.
104
  agent_chat_response = await self.agent.run(question)
 
105
 
106
  potential_response_obj = agent_chat_response.response
107
 
@@ -133,22 +159,23 @@ class FinalAgent:
133
  # Depending on requirements, you might want to return an error message or re-raise
134
  response_str = f"Agent error: {e}"
135
 
136
- # Get the agent's final response string from FINAL ANSWER:
137
- if "FINAL ANSWER: " in response_str:
138
- response_str = response_str.split("FINAL ANSWER: ")[-1].strip()
 
 
139
  else:
140
- print("Warning: 'FINAL ANSWER:' not found in response string. Returning full response.")
141
 
142
- print(f"Agent final response: {response_str}")
143
  return response_str
144
 
145
 
146
- async def main():
147
- # Example usage
148
- agent = FinalAgent()
149
- question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
150
- answer = await agent(question)
151
- print(f"Final answer: {answer}")
152
 
153
- if __name__ == "__main__":
154
- asyncio.run(main())
 
1
  from llama_index.llms.google_genai import GoogleGenAI
2
  from llama_index.llms.gemini import Gemini
 
 
3
  from llama_index.tools.arxiv import ArxivToolSpec
4
  from llama_index.tools.wikipedia import WikipediaToolSpec
5
  from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
 
13
  from gradio import ChatMessage
14
  from llama_index.core.base.llms.types import ChatMessage as llama_index_chat_message
15
 
16
+ from tools import interpret_python_math_code, image_understanding, convert_audio_to_text, video_understanding, read_csv_file, read_xlsx_file
17
+ from gaia_system_prompt import GAIA_SYSTEM_PROMPT, CUSTOM_SYSTEM_PROMPT
18
 
19
  import os
20
  import asyncio
21
 
22
  TIMEOUT=180 # Timeout for agent execution in seconds
23
  GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
 
24
  GEMINI_OPENAI_API_DIR = "https://generativelanguage.googleapis.com/v1beta/openai/"
25
+ GEMINI_MODEL_NAME = "gemini-2.0-flash"
26
  LMSTUDIO_MODEL_NAME = "gemma-3-12B-it-qat-GGUF"
27
  API_DIR = "http://host.docker.internal:1234/v1" # LM Studio API URL
28
 
 
30
  def __init__(self):
31
  # LLM Initialization
32
  # self.llm = GoogleGenAI(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
33
+ self.llm = Gemini(model=GEMINI_MODEL_NAME, api_key=GEMINI_API_KEY)
 
34
  # self.llm = LMStudio(model_name=LMSTUDIO_MODEL_NAME, base_url=API_DIR, request_timeout=180, temperature=0.1)
 
35
 
36
  # Tool Initialization
37
  self.tools = [
 
39
  fn=interpret_python_math_code,
40
  name="InterpretPythonMathCode",
41
  description="Interprets Python code for mathematical expressions."
42
+ ),
43
+ FunctionTool.from_defaults(
44
+ fn=image_understanding,
45
+ name="ImageUnderstanding",
46
+ description="Analyzes an image and generates a response to a given question based on the image's content."
47
+ ),
48
+ FunctionTool.from_defaults(
49
+ fn=convert_audio_to_text,
50
+ name="ConvertAudioToText",
51
+ description="Converts audio files to text using a speech-to-text model."
52
+ ),
53
+ FunctionTool.from_defaults(
54
+ fn=video_understanding,
55
+ name="VideoUnderstanding",
56
+ description="Analyzes a video and generates a response to a given question based on the video's content."
57
+ ),
58
+ FunctionTool.from_defaults(
59
+ fn=read_csv_file,
60
+ name="ReadCSVFile",
61
+ description="Reads a CSV file and returns its content as a string."
62
+ ),
63
+ FunctionTool.from_defaults(
64
+ fn=read_xlsx_file,
65
+ name="ReadXLSXFile",
66
+ description="Reads an XLSX file and returns its content as a string."
67
  )
68
  ]
69
  self.tools.extend(
 
76
  DuckDuckGoSearchToolSpec().to_tool_list()
77
  )
78
 
79
+ # Print the tools for debugging
80
+ print("Tools initialized:")
81
+ for tool in self.tools:
82
+ print(f"- {tool._metadata}")
 
 
 
83
 
84
+ # Agent Workflow Initialization
85
+ self.agent = AgentWorkflow.from_tools_or_functions(
86
+ tools_or_functions=self.tools,
87
  llm=self.llm,
88
+ system_prompt=CUSTOM_SYSTEM_PROMPT,
89
+ timeout=TIMEOUT
 
 
90
  )
91
 
92
+ # self.agent = ReActAgent(
93
+ # llm=self.llm,
94
+ # verbose=True,
95
+ # max_iterations=5,
96
+ # system_prompt=CUSTOM_SYSTEM_PROMPT,
97
+ # tools=self.tools
98
+ # )
99
+
100
  print("FinalAgent initialized.")
101
  # async def __call__(self, question: str) -> str:
102
  # # Example
 
127
  try:
128
  # Use arun for an async method.
129
  agent_chat_response = await self.agent.run(question)
130
+ print(agent_chat_response)
131
 
132
  potential_response_obj = agent_chat_response.response
133
 
 
159
  # Depending on requirements, you might want to return an error message or re-raise
160
  response_str = f"Agent error: {e}"
161
 
162
+ # Get the agent's final response between <final_answer> and </final_answer> tags
163
+ if "<final_answer>" in response_str and "</final_answer>" in response_str:
164
+ start_index = response_str.index("<final_answer>") + len("<final_answer>")
165
+ end_index = response_str.index("</final_answer>")
166
+ response_str = response_str[start_index:end_index].strip()
167
  else:
168
+ print("Warning: No <final_answer> tags found in the response.")
169
 
 
170
  return response_str
171
 
172
 
173
+ # async def main():
174
+ # # Example usage
175
+ # agent = FinalAgent()
176
+ # question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
177
+ # answer = await agent(question)
178
+ # print(f"Final answer: {answer}")
179
 
180
+ # if __name__ == "__main__":
181
+ # asyncio.run(main())
app.py CHANGED
@@ -6,6 +6,9 @@ import pandas as pd
6
 
7
  from agent import FinalAgent
8
  import asyncio
 
 
 
9
 
10
  # (Keep Constants as is)
11
  # --- Constants ---
@@ -85,9 +88,12 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
85
  print(f"Skipping item with missing task_id or question: {item}")
86
  continue
87
  try:
 
88
  submitted_answer = await agent(question_text)
89
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
90
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
91
  except Exception as e:
92
  print(f"Error running agent on task {task_id}: {e}")
93
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
6
 
7
  from agent import FinalAgent
8
  import asyncio
9
+ import time
10
+
11
+ SLEEP_TIME_BETWEEN_QUESTIONS = 30 # Sleep time between questions to avoid rate limiting
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
 
88
  print(f"Skipping item with missing task_id or question: {item}")
89
  continue
90
  try:
91
+ # Run the agent on the question
92
  submitted_answer = await agent(question_text)
93
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
95
+
96
+ time.sleep(SLEEP_TIME_BETWEEN_QUESTIONS) # Sleep for SLEEP_TIME_BETWEEN_QUESTIONS seconds to avoid Gemini free-tier rate limiting issues
97
  except Exception as e:
98
  print(f"Error running agent on task {task_id}: {e}")
99
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
gaia_system_prompt.py CHANGED
@@ -5,19 +5,16 @@ If you are asked for a number, don't use comma to write your number neither use
5
  If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
6
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
7
 
8
- SYSTEM_PROMPT = """
9
- You are a general AI assistant. Answer my question directly, following these strict rules. Your entire output must be *only* the template below.
 
 
 
 
 
10
 
11
- **Rules:**
12
- * No thoughts, explanations, or extra text.
13
- * The *only* output is: FINAL ANSWER: [YOUR SHORT ANSWER]
14
- * [YOUR SHORT ANSWER] is a number, string, or comma-separated list.
15
- * Numbers: No commas, no units (unless specified).
16
- * Strings: No articles, no abbreviations, digits as words (unless specified).
17
- * Lists: Apply number/string rules to items.
18
-
19
- **Example:**
20
- User: What is the capital of France?
21
- Assistant:
22
- FINAL ANSWER: Paris
23
  """
 
5
  If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
6
  If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""
7
 
8
+ CUSTOM_SYSTEM_PROMPT = """
9
+ You are a general AI assistant. I will ask you a question and you should use your tools to answer as better as you can. You must be concise and precise in your answers.
10
+ I provide you some guidelines to follow:
11
+ 1. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
12
+ 2. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
13
+ 3. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
14
+ 4. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
15
 
16
+ The final answer should be written in the following format:
17
+ <final_answer>
18
+ YOUR FINAL ANSWER
19
+ </final_answer>
 
 
 
 
 
 
 
 
20
  """
requirements.txt CHANGED
@@ -3,11 +3,8 @@ requests
3
  numpy
4
  pandas
5
  scipy
6
- groq
7
  llama-index
8
- llama-index-llms-huggingface
9
- llama-index-llms-huggingface-api
10
- llama-index-llms-groq
11
  llama-index-utils-workflow
12
  llama-index-llms-lmstudio
13
  llama-index-llms-gemini
 
3
  numpy
4
  pandas
5
  scipy
6
+ google-genai
7
  llama-index
 
 
 
8
  llama-index-utils-workflow
9
  llama-index-llms-lmstudio
10
  llama-index-llms-gemini
tools.py CHANGED
@@ -5,14 +5,16 @@ import sys
5
  import numpy as np
6
  import pandas as pd
7
  import scipy
8
- from groq import Groq
9
 
10
  from pathlib import Path
11
- import pandas as pd
12
  import mimetypes
13
  import base64
14
 
 
 
15
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
 
 
16
 
17
  def interpret_python_math_code(python_code: str) -> str:
18
  """
@@ -119,7 +121,7 @@ def interpret_python_math_code(python_code: str) -> str:
119
  sys.stdout = old_stdout
120
 
121
 
122
- ## STT tool
123
  def convert_audio_to_text(path_to_audio: str) -> str:
124
  """
125
  Converts speech from an audio file into text.
@@ -129,33 +131,19 @@ def convert_audio_to_text(path_to_audio: str) -> str:
129
  str: The transcribed text content of the audio file.
130
  """
131
 
132
- # Validate audio file
133
- if not isinstance(path_to_audio, str):
134
- raise TypeError(
135
- "Parameter 'path_to_audio' must be a string containing the file path."
136
- )
137
- path = Path(path_to_audio).expanduser().resolve()
138
- if not path.is_file():
139
- raise FileNotFoundError(f"No such audio file: {path}")
140
-
141
- # Initialize the Groq client
142
- client = Groq()
143
 
144
- # Open the audio file
145
- with open(path_to_audio, "rb") as audio_file:
146
- # Create a transcription of the audio file
147
- transcription = client.audio.transcriptions.create(
148
- file=audio_file,
149
- model="whisper-large-v3-turbo",
150
- response_format="text", # Returns plain text instead of JSON
151
- language="en",
152
- temperature=0.1
153
- )
154
-
155
- return transcription
156
-
157
- ## Analyze image tool
158
- def analyze_image(path_to_image: str, question: str) -> str:
159
  """
160
  Analyzes an image and generates a response to a given question based on the image's content.
161
 
@@ -167,39 +155,41 @@ def analyze_image(path_to_image: str, question: str) -> str:
167
  str: The response from a VLM, typically a textual analysis or description based on the image.
168
  """
169
 
170
- def encode_image(image_path):
171
- with open(image_path, "rb") as image_file:
172
- return base64.b64encode(image_file.read()).decode('utf-8')
173
-
174
- # Get the MIME type (e.g., image/png, image/jpeg)
175
- mime_type, _ = mimetypes.guess_type(path_to_image)
176
- if mime_type is None:
177
- raise ValueError("Unsupported file type. Please provide a valid image.")
178
 
179
- base64_image = encode_image(path_to_image)
180
 
181
- # Initialize the Groq client
182
- client = GroqClient()
 
 
183
 
184
- chat_completion = client.chat.completions.create(
185
- messages=[
186
- {
187
- "role": "user",
188
- "content": [
189
- {"type": "text", "text": question},
190
- {
191
- "type": "image_url",
192
- "image_url": {
193
- "url": f"data:{mime_type};base64,{base64_image}",
194
- },
195
- },
196
- ],
197
- }
198
- ],
199
- model="meta-llama/llama-4-scout-17b-16e-instruct",
200
  )
 
 
201
 
202
- return chat_completion.choices[0].message.content
203
 
204
  ## Read .csv file tool
205
  def read_csv_file(path_to_csv: str) -> str:
 
5
  import numpy as np
6
  import pandas as pd
7
  import scipy
 
8
 
9
  from pathlib import Path
 
10
  import mimetypes
11
  import base64
12
 
13
+ from google import genai
14
+
15
  ALLOWED_MODULES = {"numpy", "pandas", "scipy"}
16
+ GEMINI_API_KEY = os.getenv("GEMINI_TOKEN")
17
+ GEMINI_MODEL_NAME = "gemini-2.0-flash"
18
 
19
  def interpret_python_math_code(python_code: str) -> str:
20
  """
 
121
  sys.stdout = old_stdout
122
 
123
 
124
+ # STT tool
125
def convert_audio_to_text(path_to_audio: str) -> str:
    """
    Converts speech from an audio file into text.

    Args:
        path_to_audio (str): Path to the audio file to transcribe.

    Returns:
        str: The transcribed text content of the audio file.
    """
    # Bug fix: the client was previously constructed with the literal string
    # "GOOGLE_API_KEY" as the key instead of the actual key read from the
    # GEMINI_TOKEN environment variable (GEMINI_API_KEY, defined above) —
    # every call would fail authentication. Use the module constant, matching
    # image_understanding / video_understanding.
    client = genai.Client(api_key=GEMINI_API_KEY)

    # Upload the audio so Gemini can reference it in the request.
    myfile = client.files.upload(file=path_to_audio)

    transcription = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=["Provide a transcription of this audio file.", myfile],
    )

    return transcription.text
144
+
145
+ # Analyze image tool
146
+ def image_understanding(path_to_image: str, question: str) -> str:
 
 
 
 
 
 
 
 
 
 
147
  """
148
  Analyzes an image and generates a response to a given question based on the image's content.
149
 
 
155
  str: The response from a VLM, typically a textual analysis or description based on the image.
156
  """
157
 
158
+ client = genai.Client(api_key=GEMINI_API_KEY)
159
+
160
+ my_file = client.files.upload(file=path_to_image)
161
+
162
+ response = client.models.generate_content(
163
+ model=GEMINI_MODEL_NAME,
164
+ contents=[my_file, question],
165
+ )
166
 
167
+ return response.text
168
 
169
+ # Analyze video tool
170
def video_understanding(path_to_video: str, question: str) -> str:
    """
    Analyzes a video and generates a response to a given question based on the video's content.

    Args:
        path_to_video (str): The path to the video file to be analyzed.
        question (str): The question to be answered, based on the contents of the video.

    Returns:
        str: The response from a VLM, typically a textual analysis or description based on the video.

    Raises:
        RuntimeError: If the Files API fails to process the uploaded video.
    """
    import time  # local import: only needed for polling the Files API

    client = genai.Client(api_key=GEMINI_API_KEY)

    my_file = client.files.upload(file=path_to_video)

    # Videos are processed asynchronously by the Gemini Files API;
    # generate_content fails while the file is still in the PROCESSING
    # state, so poll until it becomes ACTIVE (or FAILED) before querying.
    while my_file.state.name == "PROCESSING":
        time.sleep(2)
        my_file = client.files.get(name=my_file.name)
    if my_file.state.name == "FAILED":
        raise RuntimeError(f"Video file processing failed: {path_to_video}")

    response = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=[my_file, question],
    )

    return response.text
192
 
 
193
 
194
  ## Read .csv file tool
195
  def read_csv_file(path_to_csv: str) -> str: