Gaia_test_ai_agent

Sleeping

App Files Files Community

kamorou committed on Jul 1

Commit

e8f0b12

verified ·

1 Parent(s): dbcbf7a

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -181

app.py CHANGED Viewed

@@ -218,6 +218,20 @@
 #
 # =================================================================================================
 import os
 import io
 import requests
@@ -228,12 +242,12 @@ from typing import TypedDict, Annotated, List
 import operator
 # --- LangChain & LangGraph Imports ---
-from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage
 from langchain_core.tools import tool
 from langchain_groq import ChatGroq
-# from langchain_openai import ChatOpenAI #<-- Alternative LLM
 from langgraph.graph import StateGraph, END
-from langgraph.prebuilt import ToolNode # <-- Corrected Import for modern LangGraph
 # (Keep Constants as is)
 # --- Constants ---
@@ -241,52 +255,70 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 FILES_DIR = "./files"
 os.makedirs(FILES_DIR, exist_ok=True)
 #
 # ================================================================================================
-#  ✅ 1. DEFINE THE AGENT'S TOOLS
 # ================================================================================================
-#  Each tool is a simple Python function decorated with `@tool`.
-#  The docstring of the function is CRUCIAL. The LLM uses it to decide which tool to use.
 #
 @tool
-def web_search(query: str) -> str:
     """
-    Searches the web using DuckDuckGo to find up-to-date information, facts, or answers to general questions.
-    Use this for any questions that require current event knowledge or broad-spectrum information.
     """
-    print(f"--- Calling Web Search Tool with query: {query} ---")
-    from duckduckgo_search import DDGS
     try:
-        with DDGS() as ddgs:
-            results = [r for r in ddgs.text(query, max_results=5)]
-            return str(results) if results else "No results found."
     except Exception as e:
-        return f"Error during web search: {e}"
 @tool
 def read_file(url: str) -> str:
     """
-    Downloads a file from a given URL, saves it locally, and returns its content.
-    Use this tool when the user provides a URL to a file that needs to be inspected.
-    The file is saved in the './files/' directory. The function returns the full text content.
     """
     print(f"--- Calling Read File Tool with URL: {url} ---")
     try:
         filename = os.path.join(FILES_DIR, os.path.basename(url))
         response = requests.get(url)
-        response.raise_for_status()  # Raise an exception for bad status codes
         with open(filename, 'wb') as f:
             f.write(response.content)
-        # Try to read as text, if it fails, it might be a binary file.
         try:
             with open(filename, 'r', encoding='utf-8') as f:
                 content = f.read()
             return f"Successfully read file '{filename}'. Content:\n\n{content}"
         except UnicodeDecodeError:
             return f"Successfully downloaded binary file '{filename}'. Cannot display content."
     except requests.exceptions.RequestException as e:
         return f"Error downloading or reading file: {e}"
@@ -295,7 +327,6 @@ def python_interpreter(code: str) -> str:
     """
     Executes a given string of Python code and returns the output from stdout.
     Use this for complex calculations, data manipulation, or any task that can be solved with code.
-    The code runs in a restricted environment. You can use libraries like pandas, requests etc.
     Make sure to use a print() statement to capture the output.
     """
     print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
@@ -309,180 +340,120 @@ def python_interpreter(code: str) -> str:
 #
 # ================================================================================================
-#  ✅ 2. CONFIGURE THE AGENT'S STATE, BRAIN (LLM)
 # ================================================================================================
 #
-# The AgentState is the "memory" of our agent. It keeps track of the conversation history.
 class AgentState(TypedDict):
     messages: Annotated[List[BaseMessage], operator.add]
-# List of all the tools our agent can use
-tools = [web_search, read_file, python_interpreter]
-# The "Brain" of our agent. We're using Groq for speed.
-# Make sure to set GROQ_API_KEY in your HF Space secrets
-llm = ChatGroq(model="llama3-70b-8192", temperature=0)
-# If you want to use OpenAI instead, uncomment the line below and set OPENAI_API_KEY
-# llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
-# We now bind the tools to the LLM. This tells the LLM what functions it can call.
-llm_with_tools = llm.bind_tools(tools)
-#
-# ================================================================================================
-#  ✅ 3. DEFINE THE LANGGRAPH NODES AND EDGES
-# ================================================================================================
-#  This is the core logic of our agent, defined as a graph.
-#
-# NODE 1: The Agent Node (call_model)
-# This node invokes the LLM to decide the next action or to give a final answer.
-def call_model(state: AgentState) -> dict:
-    print("--- Calling LLM ---")
-    messages = state['messages']
-    response = llm_with_tools.invoke(messages)
-    # We return a dict, because this node will always be part of a graph
-    return {"messages": [response]}
-# EDGE: The Conditional Router (should_continue)
-# This function decides which node to go to next.
-def should_continue(state: AgentState) -> str:
-    last_message = state['messages'][-1]
-    # If the LLM made a tool call, we route to the 'action' node to execute the tool
-    if last_message.tool_calls:
-        print("--- Decision: Call a tool ---")
-        return "action"
-    # Otherwise, we are done, and we route to the 'end' state
-    else:
-        print("--- Decision: End of process ---")
-        return "end"
-#
-# ================================================================================================
-#  ✅ 4. BUILD AND COMPILE THE GRAPH (Corrected Version)
-# ================================================================================================
-#
-# The ToolNode is a pre-built node that executes tools for us.
-# It's the modern way to handle tool execution in LangGraph.
-tool_node = ToolNode(tools)
-# 1. Initialize the graph and add our state object
-workflow = StateGraph(AgentState)
-# 2. Add the two nodes we need: the 'agent' and the 'action' (our tool_node)
-workflow.add_node("agent", call_model)
-workflow.add_node("action", tool_node)
-# 3. Set the entry point of the graph. The first thing to run is the 'agent' node.
-workflow.set_entry_point("agent")
-# 4. Add the conditional edge. This controls the flow of the graph.
-workflow.add_conditional_edges(
-    "agent",          # Start from the 'agent' node
-    should_continue,  # Use our function to decide the path
-    {
-        "action": "action", # If it returns "action", go to the 'action' node
-        "end": END          # If it returns "end", finish the graph
-    }
-)
-# 5. Add a normal edge. After 'action' runs, it should always go back to 'agent' to reflect.
-workflow.add_edge('action', 'agent')
-# 6. Compile the graph into a runnable app.
-app = workflow.compile()
 #
 # ================================================================================================
-#  ✅ 5. CREATE THE AGENT CLASS THAT THE TEMPLATE USES
 # ================================================================================================
-#  This class wraps our LangGraph agent in the format expected by the evaluation script.
 #
 class GaiaAgent:
     def __init__(self):
-        print("GaiaAgent initialized.")
-        self.agent_app = app
     def __call__(self, question: str) -> str:
-        print(f"\n{'='*60}\nAgent received question (first 100 chars): {question[:100]}...\n{'='*60}")
-        # The initial input for our graph is a list of messages.
-        initial_input = {"messages": [HumanMessage(content=question)]}
         final_state = None
-        # Let's add a loop limit to prevent infinite cycles
         for i, step in enumerate(self.agent_app.stream(initial_input, {"recursion_limit": 15})):
             if i == 0:
                 print("--- Starting Agentic Loop ---")
             final_state = step
-        # The final answer is in the last AIMessage of the 'messages' list
         final_answer_message = final_state['agent']['messages'][-1]
-        final_answer = final_answer_message.content
         print(f"\n--- Agent finished. Final Answer: {final_answer} ---\n")
         return final_answer
 #
 # ================================================================================================
-#  -- DO NOT MODIFY THE CODE BELOW THIS LINE --
-#  -- This is the Gradio App and Submission Logic from the course --
 # ================================================================================================
 def run_and_submit_all( profile: gr.OAuthProfile | None):
-    """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
-    and displays the results.
-    """
     space_id = os.getenv("SPACE_ID")
-    if profile:
-        username= f"{profile.username}"
-        print(f"User logged in: {username}")
-    else:
-        print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
-    try:
-        agent = GaiaAgent()
-    except Exception as e:
-        print(f"Error instantiating agent: {e}")
-        return f"Error initializing agent: {e}", None
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)
     print(f"Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
-        if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except Exception as e:
-        print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
-            print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
@@ -491,13 +462,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
-        print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-    print(status_update)
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
@@ -513,27 +480,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
         return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.Timeout:
-        status_message = "Submission Failed: The request timed out."
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
@@ -541,25 +487,21 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         return status_message, results_df
-# --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
-    gr.Markdown("# GAIA Agent Final Assessment")
     gr.Markdown(
         """
-        **Instructor's Note:** This space is now powered by a LangGraph agent.
-        1.  Ensure your `GROQ_API_KEY` is set in the Space secrets.
-        2.  Make sure you have a `requirements.txt` file with the specified versions.
-        3.  Log in below and click 'Run Evaluation'. Good luck!
         """
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
     run_button.click(
         fn=run_and_submit_all,
         outputs=[status_output, results_table]
@@ -567,12 +509,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
-    space_id_startup = os.getenv("SPACE_ID")
-    if space_id_startup:
-        print(f"✅ SPACE_ID found: {space_id_startup}")
-    else:
-        print("ℹ️  SPACE_ID environment variable not found (running locally?).")
-    print("-"*(60 + len(" App Starting ")) + "\n")
-    print("Launching Gradio Interface for GAIA Agent Evaluation...")
     demo.launch(debug=True, share=False)

 #
 # =================================================================================================
+#
+###########################
+# =================================================================================================
+#  ✅ --- ✅  FINAL ASSESSMENT AGENT - V4 (STATE-FIXED & TAVILY) ✅ --- ✅
+# =================================================================================================
+#
+#  Instructions:
+#  1. Add TAVILY_API_KEY and GROQ_API_KEY to your HF Space secrets.
+#  2. Update your requirements.txt to include `tavily-python`.
+#  3. This version fixes the critical state-leakage bug (a new agent is built for each question) and replaces the DuckDuckGo search tool with the Tavily Search API.
+#
+# =================================================================================================
 import os
 import io
 import requests
 import operator
 # --- LangChain & LangGraph Imports ---
+from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
 from langchain_core.tools import tool
 from langchain_groq import ChatGroq
 from langgraph.graph import StateGraph, END
+from langgraph.prebuilt import ToolNode
+from tavily import TavilyClient # <-- Import Tavily
 # (Keep Constants as is)
 # --- Constants ---
 FILES_DIR = "./files"
 os.makedirs(FILES_DIR, exist_ok=True)
+# --- The new, stricter System Prompt ---
+AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
+Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
+CRITICAL INSTRUCTIONS:
+1.  **Analyze the Goal:** First, understand what the user is asking for.
+2.  **Plan & Execute:** Formulate a plan and use the available tools (`tavily_search`, `read_file`, `python_interpreter`) to gather information.
+3.  **Final Answer Format:** Once you are absolutely certain of the answer, you MUST provide it directly and concisely.
+    - DO NOT include your reasoning, thoughts, or any conversational text like 'The answer is...', 'Here is the result:', or 'Based on my search...'.
+    - Your final response must ONLY be the answer itself.
+EXAMPLES OF CORRECT FINAL ANSWERS:
+- If the question asks for a year: `2023`
+- If it asks for a name: `John Doe`
+- If it asks for a number: `42`
+- If it asks for a comma-separated list: `item1, item2, item3`
+Think, use your tools, and then provide ONLY the final, precise answer.
+"""
 #
 # ================================================================================================
+#  ✅ 1. DEFINE THE AGENT'S TOOLS (NOW WITH TAVILY)
 # ================================================================================================
 #
+# Initialize the Tavily client with the TAVILY_API_KEY read from the environment (set it in the HF Space secrets).
+tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
 @tool
+def tavily_search(query: str) -> str:
     """
+    Uses the Tavily Search API to find information on the web.
+    Tavily is optimized for AI agents and provides clean, summarized results.
+    Use this for any questions that require current, factual, or web-based information.
     """
+    print(f"--- Calling Tavily Search Tool with query: {query} ---")
     try:
+        # Calling the search method with the query
+        result = tavily.search(query=query, search_depth="advanced")
+        # Return the 'content' field of each search result as a bulleted list under a header line.
+        return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
     except Exception as e:
+        return f"Error during Tavily search: {e}"
 @tool
 def read_file(url: str) -> str:
     """
+    Downloads a file from a given URL and returns its content.
+    Use this tool when a question provides a URL to a file that needs to be read.
     """
     print(f"--- Calling Read File Tool with URL: {url} ---")
     try:
         filename = os.path.join(FILES_DIR, os.path.basename(url))
         response = requests.get(url)
+        response.raise_for_status()
         with open(filename, 'wb') as f:
             f.write(response.content)
         try:
             with open(filename, 'r', encoding='utf-8') as f:
                 content = f.read()
             return f"Successfully read file '{filename}'. Content:\n\n{content}"
         except UnicodeDecodeError:
             return f"Successfully downloaded binary file '{filename}'. Cannot display content."
     except requests.exceptions.RequestException as e:
         return f"Error downloading or reading file: {e}"
     """
     Executes a given string of Python code and returns the output from stdout.
     Use this for complex calculations, data manipulation, or any task that can be solved with code.
     Make sure to use a print() statement to capture the output.
     """
     print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
 #
 # ================================================================================================
+#  ✅ 2. CONFIGURE AND BUILD THE AGENT GRAPH
 # ================================================================================================
 #
+# The graph is built inside build_agent_graph() so that each new GaiaAgent instance compiles its own fresh graph.
+#
 class AgentState(TypedDict):
     messages: Annotated[List[BaseMessage], operator.add]
+def build_agent_graph():
+    """Builds the LangGraph agent."""
+    tools = [tavily_search, read_file, python_interpreter]
+    llm = ChatGroq(model="llama3-70b-8192", temperature=0)
+    llm_with_tools = llm.bind_tools(tools)
+    def call_model(state: AgentState) -> dict:
+        print("--- Calling LLM ---")
+        messages = state['messages']
+        response = llm_with_tools.invoke(messages)
+        return {"messages": [response]}
+    def should_continue(state: AgentState) -> str:
+        last_message = state['messages'][-1]
+        if last_message.tool_calls:
+            return "action"
+        else:
+            return "end"
+    tool_node = ToolNode(tools)
+    workflow = StateGraph(AgentState)
+    workflow.add_node("agent", call_model)
+    workflow.add_node("action", tool_node)
+    workflow.set_entry_point("agent")
+    workflow.add_conditional_edges(
+        "agent",
+        should_continue,
+        {"action": "action", "end": END}
+    )
+    workflow.add_edge('action', 'agent')
+    return workflow.compile()
 #
 # ================================================================================================
+#  ✅ 3. CREATE THE AGENT CLASS THAT THE TEMPLATE USES
 # ================================================================================================
 #
 class GaiaAgent:
     def __init__(self):
+        print("GaiaAgent initialized. Building fresh graph...")
+        self.agent_app = build_agent_graph()
     def __call__(self, question: str) -> str:
+        print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
+        initial_input = {
+            "messages": [
+                SystemMessage(content=AGENT_SYSTEM_PROMPT),
+                HumanMessage(content=question)
+            ]
+        }
         final_state = None
         for i, step in enumerate(self.agent_app.stream(initial_input, {"recursion_limit": 15})):
             if i == 0:
                 print("--- Starting Agentic Loop ---")
             final_state = step
         final_answer_message = final_state['agent']['messages'][-1]
+        final_answer = str(final_answer_message.content).strip()
         print(f"\n--- Agent finished. Final Answer: {final_answer} ---\n")
         return final_answer
 #
 # ================================================================================================
+#  -- EVALUATION LOGIC - CRITICAL FIX APPLIED --
 # ================================================================================================
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
+    if not profile:
         return "Please Login to Hugging Face with the button.", None
+    username = f"{profile.username}"
+    print(f"User logged in: {username}")
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(f"Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
         print(f"Fetched {len(questions_data)} questions.")
     except Exception as e:
         return f"An unexpected error occurred fetching questions: {e}", None
     results_log = []
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
+    #
+    # --->>> CRITICAL FIX: Instantiate a NEW agent for EACH question <<<---
+    #
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             continue
         try:
+            # A new, clean agent is created here to prevent state leakage.
+            agent = GaiaAgent()
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
         response = requests.post(submit_url, json=submission_data, timeout=60)
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
         return final_status, results_df
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         return status_message, results_df
+# --- Gradio Interface (No Changes Needed) ---
 with gr.Blocks() as demo:
+    gr.Markdown("# GAIA Agent Final Assessment (V4 - State Fixed)")
     gr.Markdown(
         """
+        **Instructor's Note:** This version fixes the critical state-leakage bug and uses the Tavily Search API for better results.
+        1.  Ensure `GROQ_API_KEY` and `TAVILY_API_KEY` are set in secrets.
+        2.  Ensure `requirements.txt` includes `tavily-python`.
+        3.  Log in and run the evaluation. Let's see that score jump!
         """
     )
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
     run_button.click(
         fn=run_and_submit_all,
         outputs=[status_output, results_table]
 if __name__ == "__main__":
     print("\n" + "-"*30 + " App Starting " + "-"*30)
     demo.launch(debug=True, share=False)