Final_Assignment_Template

Running

App Files Community

Paperbag committed 14 days ago

Commit

39b1e37

1 Parent(s): 45386f2

feat: Update `analyze_image` and `analyze_video` tool descriptions and system prompt rules to enhance multimedia processing.

Browse files

Files changed (4) hide show

__pycache__/agent.cpython-39.pyc +0 -0
agent.py +79 -15
app.py +2 -1
requirements.txt +2 -0

__pycache__/agent.cpython-39.pyc CHANGED Viewed

Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ

agent.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import datetime
 import subprocess
 import tempfile
 from typing import TypedDict, List, Dict, Any, Optional, Union
 from langchain_core import tools
 from langgraph.graph import StateGraph, START, END
@@ -15,6 +16,8 @@ from groq import Groq
 from langchain_groq import ChatGroq
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders import WebBaseLoader
 import base64
 try:
@@ -22,6 +25,8 @@ try:
 except ImportError:
     cv2 = None
 whisper_model = None
 def get_whisper():
     global whisper_model
@@ -48,9 +53,59 @@ model = ChatGroq(
     max_tokens=None,
     timeout=None,
     max_retries=2,
-    # other params...
 )
 @tool
 def web_search(keywords: str) -> str:
     """
@@ -105,12 +160,13 @@ def wiki_search(query: str) -> str:
 @tool
 def analyze_image(image_path: str, question: str) -> str:
     """
-    Analyzes an image to answer a specific question.
-    Use this tool when you need to extract visual information from an image file.
     Args:
         image_path: The local path or URL to the image file.
-        question: The specific question to ask about the image.
     """
     try:
         # If it's a local file, we encode it to base64
@@ -154,12 +210,13 @@ def analyze_audio(audio_path: str, question: str) -> str:
 @tool
 def analyze_video(video_path: str, question: str) -> str:
     """
-    Analyzes a video file to answer questions about its content.
-    Extracts key frames and describes what is happening.
     Args:
         video_path: The local path to the video file.
-        question: The specific question to ask about the video.
     """
     if cv2 is None:
         return "Error: cv2 is not installed. Please install opencv-python."
@@ -321,6 +378,8 @@ def restart_required(state: AgentState) -> AgentState:
 tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
 tools_by_name = {tool.name: tool for tool in tools}
 model_with_tools = model.bind_tools(tools)
 def answer_message(state: AgentState) -> AgentState:
     messages = state["messages"]
@@ -334,11 +393,12 @@ def answer_message(state: AgentState) -> AgentState:
     TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
     CRITICAL RULES FOR SEARCH & TOOLS:
-    1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_audio, analyze_video) to answer the question based on the file content.
-    2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
-    3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
-    4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
-    5. Cross-reference facts if they seem ambiguous.
     Do not include any thought process before answering the question, and only response exactly what was being asked of you.
     If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
@@ -358,8 +418,12 @@ def answer_message(state: AgentState) -> AgentState:
     draft_response = None
     for step in range(max_steps):
         print(f"--- ReAct Step {step + 1} ---")
-        ai_msg = model_with_tools.invoke(messages)
         messages.append(ai_msg)
         # Check if the model requested tools
@@ -390,7 +454,7 @@ def answer_message(state: AgentState) -> AgentState:
         print("Max reasoning steps reached. Forcing answer extraction.")
         forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
         messages.append(forced_msg)
-        draft_response = model.invoke(messages)
     # Third pass: strict GAIA formatting extraction
     formatting_sys = SystemMessage(
@@ -403,7 +467,7 @@ def answer_message(state: AgentState) -> AgentState:
             "If it is a name or word, just return the exact string. If a list, return only the comma-separated list."
         )
     )
-    final_response = model.invoke([formatting_sys, HumanMessage(content=draft_response.content)])
     print(f"Draft response: {draft_response.content}")
     print(f"Strict Final response: {final_response.content}")

 import datetime
 import subprocess
 import tempfile
+import time
 from typing import TypedDict, List, Dict, Any, Optional, Union
 from langchain_core import tools
 from langgraph.graph import StateGraph, START, END
 from langchain_groq import ChatGroq
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders import WebBaseLoader
+from langchain_openai import ChatOpenAI
+from langchain_google_genai import ChatGoogleGenerativeAI
 import base64
 try:
 except ImportError:
     cv2 = None
+# os.environ["USER_AGENT"] = "gaia-agent/1.0"
 whisper_model = None
 def get_whisper():
     global whisper_model
     max_tokens=None,
     timeout=None,
     max_retries=2,
 )
+# OpenRouter Fallback Model (used when Groq hits rate limits)
+openrouter_model = ChatOpenAI(
+    model="meta-llama/llama-3.3-70b-instruct",
+    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
+    openai_api_base="https://openrouter.ai/api/v1",
+    temperature=0,
+)
+# Google AI Studio Fallback Model (Gemini)
+gemini_model = ChatGoogleGenerativeAI(
+    model="gemini-1.5-pro",
+    google_api_key=os.getenv("GOOGLE_API_KEY"),
+    temperature=0,
+)
+def smart_invoke(msgs, use_tools=False):
+    """
+    Tiered fallback: Groq -> OpenRouter -> Google AI Studio.
+    Retries next tier if a 429 (rate limit) or server-side error occurs.
+    """
+    primary = model_with_tools if use_tools else model
+    secondary = openrouter_with_tools if use_tools else openrouter_model
+    tertiary = gemini_with_tools if use_tools else gemini_model
+    tiers = [
+        {"name": "Groq", "model": primary, "key": "GROQ_API_KEY"},
+        {"name": "OpenRouter", "model": secondary, "key": "OPENROUTER_API_KEY"},
+        {"name": "Gemini", "model": tertiary, "key": "GOOGLE_API_KEY"},
+    ]
+    last_exception = None
+    for tier in tiers:
+        if not os.getenv(tier["key"]):
+            continue # Skip if no API key
+        try:
+            return tier["model"].invoke(msgs)
+        except Exception as e:
+            err_str = str(e).lower()
+            # Catch rate limits or generic temporary server failures
+            if any(x in err_str for x in ["rate_limit", "429", "500", "503", "overloaded"]):
+                print(f"--- {tier['name']} Error: {e}. Falling back... ---")
+                last_exception = e
+                continue
+            raise e
+    if last_exception:
+        print("CRITICAL: All fallback tiers failed.")
+        raise last_exception
+    return None
 @tool
 def web_search(keywords: str) -> str:
     """
 @tool
 def analyze_image(image_path: str, question: str) -> str:
     """
+    EXTERNAL SIGHT API: Sends an image path to a Vision Model to answer a specific question.
+    YOU MUST CALL THIS TOOL ANY TIME an image (.png, .jpg, .jpeg) is attached to the prompt.
+    NEVER claim you cannot see images. Use this tool instead.
     Args:
         image_path: The local path or URL to the image file.
+        question: Specific question describing what you want the vision model to look for.
     """
     try:
         # If it's a local file, we encode it to base64
 @tool
 def analyze_video(video_path: str, question: str) -> str:
     """
+    EXTERNAL SIGHT/HEARING API: Sends a video file to an external Vision/Audio model.
+    YOU MUST CALL THIS TOOL ANY TIME a video (.mp4, .avi) is attached to the prompt.
+    NEVER claim you cannot analyze videos. Use this tool instead.
     Args:
         video_path: The local path to the video file.
+        question: Specific question describing what you want to extract from the video.
     """
     if cv2 is None:
         return "Error: cv2 is not installed. Please install opencv-python."
 tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
 tools_by_name = {tool.name: tool for tool in tools}
 model_with_tools = model.bind_tools(tools)
+openrouter_with_tools = openrouter_model.bind_tools(tools)
+gemini_with_tools = gemini_model.bind_tools(tools)
 def answer_message(state: AgentState) -> AgentState:
     messages = state["messages"]
     TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
     CRITICAL RULES FOR SEARCH & TOOLS:
+    1. If an image, video, or audio file is attached, YOU MUST NOT SAY "I don't have access to analyze..." or "I cannot see". YOU ARE NOT BLIND. You have external APIs (analyze_image, analyze_video, analyze_audio) that will act as your eyes and ears! ALWAYS invoke these tools immediately to get descriptions!
+    2. If a text/data file is attached, use the appropriate tool (run_python_script, read_document) to analyze the file content.
+    3. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
+    4. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
+    5. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
+    6. Cross-reference facts if they seem ambiguous.
     Do not include any thought process before answering the question, and only response exactly what was being asked of you.
     If you are not able to provide an answer, use tools or state the limitation that you're facing instead.
     draft_response = None
     for step in range(max_steps):
+        if step > 0:
+            # Prevents Groq API Request/Tokens Per Minute exceptions when deep reasoning
+            time.sleep(4)
         print(f"--- ReAct Step {step + 1} ---")
+        ai_msg = smart_invoke(messages, use_tools=True)
         messages.append(ai_msg)
         # Check if the model requested tools
         print("Max reasoning steps reached. Forcing answer extraction.")
         forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
         messages.append(forced_msg)
+        draft_response = smart_invoke(messages, use_tools=False)
     # Third pass: strict GAIA formatting extraction
     formatting_sys = SystemMessage(
             "If it is a name or word, just return the exact string. If a list, return only the comma-separated list."
         )
     )
+    final_response = smart_invoke([formatting_sys, HumanMessage(content=draft_response.content)], use_tools=False)
     print(f"Draft response: {draft_response.content}")
     print(f"Strict Final response: {final_response.content}")

app.py CHANGED Viewed

@@ -53,7 +53,8 @@ def file_extract(local_file_path, task_id):
     logger.warning(f"Could not download file '{local_file_path}' for task_id {task_id}. Make sure you accepted GAIA terms on HF and set HF_TOKEN.")
     return None
-def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.

     logger.warning(f"Could not download file '{local_file_path}' for task_id {task_id}. Make sure you accepted GAIA terms on HF and set HF_TOKEN.")
     return None
+from typing import Optional
+def run_and_submit_all(profile: Optional[gr.OAuthProfile] = None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.

requirements.txt CHANGED Viewed

@@ -25,3 +25,5 @@ opencv-python
 beautifulsoup4
 PyPDF2
 openai-whisper

 beautifulsoup4
 PyPDF2
 openai-whisper
+langchain-openai
+langchain-google-genai