Final_Assignment_D3MI4N

Sleeping

App Files Files Community

D3MI4N commited on Jun 27, 2025

Commit

7835c92

1 Parent(s): 0ee4998

new version with reflection

Browse files

Files changed (6) hide show

app.py +89 -140
app_prior.py +0 -116
gaia_graph.py +0 -116
gaia_graph_legacy.py +0 -188
langgraph_agents.py +155 -0
test_gaia_questions.py +1 -1

app.py CHANGED Viewed

@@ -1,167 +1,116 @@
-# app.py
 import os
-import json
 import requests
 import pandas as pd
 import asyncio
-import gradio as gr
-from openai import OpenAI
-from tavily import TavilyClient
-from dotenv import load_dotenv
-load_dotenv()
-# ─── 1) OpenAI client (v1 SDK) ───────────────────────────────────────────────────
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-assert OPENAI_API_KEY, "Set OPENAI_API_KEY in .env"
-openai_client = OpenAI(api_key=OPENAI_API_KEY)
-# ─── 2) Tavily search client ─────────────────────────────────────────────────────
-TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
-assert TAVILY_API_KEY, "Set TAVILY_API_KEY in .env"
-tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
-# ─── 3) Define our tools & JSON schemas ──────────────────────────────────────────
-def calculator(expr: str) -> str:
-    try:
-        # safe eval
-        return str(eval(expr, {}, {}))
-    except Exception as e:
-        return f"Error: {e}"
-def search(query: str) -> str:
     try:
-        resp = tavily_client.search(query=query, search_depth="basic")
-        results = resp.get("results", [])
-        if not results:
-            return "No results found."
-        # grab up to 3 titles/snippets
-        snippets = []
-        for r in results[:3]:
-            snippets.append(r.get("title") or r.get("snippet") or "")
-        return " | ".join(snippets)
     except Exception as e:
-        return f"Search error: {e}"
-functions = [
-    {
-        "name": "calculator",
-        "description": "Evaluate a math expression. Returns the result as a string.",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "expr": {"type": "string", "description": "Math expression to evaluate"}
-            },
-            "required": ["expr"],
-        },
-    },
-    {
-        "name": "search",
-        "description": "Look up facts on the web via Tavily; return up to three summaries separated by ' | '.",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "query": {"type": "string", "description": "The search query"}
-            },
-            "required": ["query"],
-        },
-    },
-]
-tool_map = {"calculator": calculator, "search": search}
-# ─── 4) The ReAct loop ───────────────────────────────────────────────────────────
-def run_react(question: str) -> str:
-    messages = [{"role": "user", "content": question}]
-    while True:
-        resp = openai_client.chat.completions.create(
-            model="gpt-4o-mini",      # free-tier “mini” model
-            messages=messages,
-            functions=functions,
-            function_call="auto",
-        )
-        msg = resp.choices[0].message
-        # if the model wants to call a tool:
-        if msg.function_call:
-            name = msg.function_call.name
-            args = json.loads(msg.function_call.arguments)
-            output = tool_map[name](**args)
-            # feed both the assistant's call and the tool's result back into the loop
-            messages.append({
-                "role": "assistant",
-                "content": None,
-                "function_call": msg.function_call.to_dict()
-            })
-            messages.append({
-                "role": "function",
-                "name": name,
-                "content": output
-            })
-        else:
-            # final answer
-            return msg.content.strip()
-# ─── 5) Gradio / GAIA integration ────────────────────────────────────────────────
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-_cache = {}
-class GaiaAgent:
-    def __call__(self, question: str) -> str:
-        return run_react(question)
-async def run_agent(profile: gr.OAuthProfile | None):
-    if not profile:
-        return "Please login.", None
-    user = profile.username
-    resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
-    data = resp.json()
-    agent = GaiaAgent()
-    async def proc(item):
-        ans = await asyncio.to_thread(agent, item["question"])
-        return {
-            "task_id": item["task_id"],
-            "question": item["question"],
-            "submitted_answer": ans
-        }
-    results = await asyncio.gather(*(proc(it) for it in data))
-    _cache[user] = results
-    return f"Answered {len(results)} questions.", pd.DataFrame(results)
 def submit_answers(profile: gr.OAuthProfile | None):
     if not profile:
-        return "Please login.", None
-    user = profile.username
-    if user not in _cache:
-        return "Run agent first.", None
-    payload = [
-        {"task_id": r["task_id"], "submitted_answer": r["submitted_answer"]}
-        for r in _cache[user]
     ]
     space_id = os.getenv("SPACE_ID", "")
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
-    body = {"username": user, "agent_code": agent_code, "answers": payload}
-    r = requests.post(f"{DEFAULT_API_URL}/submit", json=body, timeout=60)
-    r.raise_for_status()
-    res = r.json()
-    msg = (
-        f"Score: {res.get('score')}% "
-        f"({res.get('correct_count')}/{res.get('total_attempted')})"
-    )
-    return msg, pd.DataFrame(_cache[user])
 with gr.Blocks() as demo:
-    gr.Markdown("# 🧠 GAIA Benchmark Runner")
     gr.LoginButton()
-    run_btn = gr.Button("Run agent on questions")
-    sub_btn = gr.Button("Submit cached answers")
-    out_txt = gr.Textbox(lines=3, interactive=False)
-    out_tbl = gr.DataFrame()
-    run_btn.click(run_agent, outputs=[out_txt, out_tbl])
-    sub_btn.click(submit_answers, outputs=[out_txt, out_tbl])
 if __name__ == "__main__":
     demo.launch(debug=True, share=False)

 import os
+import gradio as gr
 import requests
 import pandas as pd
 import asyncio
+from gaia_new import graph  # Use your agent
+from typing import Optional
+# Constants
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+user_answers_cache = {}  # session-based cache
+class GaiaAgent:
+    def __init__(self):
+        print("Graph-based agent initialized.")
+    def __call__(self, question: str) -> str:
+        print("Received question:", question)
+        state = {"question": question, "answer": ""}
+        try:
+            result = graph.invoke(state)
+            print("Result type:", type(result))
+            print("Result value:", result)
+            if isinstance(result, dict):
+                return result.get("answer", "No answer generated.")
+            else:
+                return f"Unexpected output from graph: {result}"
+        except Exception as e:
+            return f"ERROR invoking graph: {e}"
+# Async runner
+async def run_agent(profile: gr.OAuthProfile | None):
+    if not profile:
+        return "Please login to Hugging Face.", None
+    username = profile.username
+    agent = GaiaAgent()
+    # 1. Load questions
     try:
+        response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
+        response.raise_for_status()
+        questions_data = response.json()
     except Exception as e:
+        return f"Error fetching questions: {e}", None
+    # 2. Process questions
+    async def process(item):
+        task_id = item.get("task_id")
+        question = item.get("question")
+        try:
+            answer = await asyncio.to_thread(agent, question)
+            return {"task_id": task_id, "question": question, "submitted_answer": answer}
+        except Exception as e:
+            return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
+    results = await asyncio.gather(*(process(item) for item in questions_data))
+    user_answers_cache[username] = results
+    df = pd.DataFrame(results)
+    return f"Answered {len(results)} questions. Ready to submit.", df
 def submit_answers(profile: gr.OAuthProfile | None):
     if not profile:
+        return "Please login to Hugging Face.", None
+    username = profile.username.strip()
+    if username not in user_answers_cache:
+        return "No cached answers. Please run the agent first.", None
+    answers_payload = [
+        {"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
+        for item in user_answers_cache[username]
     ]
     space_id = os.getenv("SPACE_ID", "")
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
+    submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
+    # 3. Submit to scoring API
+    try:
+        response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
+        response.raise_for_status()
+        result = response.json()
+        final_status = (
+            f"✅ Submission Successful!\n"
+            f"👤 User: {result.get('username')}\n"
+            f"🎯 Score: {result.get('score', 'N/A')}% "
+            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
+            f"📩 Message: {result.get('message', 'No message received.')}"
+        )
+        df = pd.DataFrame(user_answers_cache[username])
+        return final_status, df
+    except Exception as e:
+        return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
+# ────────── Gradio UI ──────────
 with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 GAIA Agent Evaluation")
     gr.LoginButton()
+    run_button = gr.Button("▶️ Run Agent on GAIA Questions")
+    submit_button = gr.Button("📤 Submit Cached Answers")
+    status = gr.Textbox(label="Status", lines=6, interactive=False)
+    results = gr.DataFrame(label="Answers", wrap=True)
+    run_button.click(run_agent, outputs=[status, results])
+    submit_button.click(submit_answers, outputs=[status, results])
 if __name__ == "__main__":
+    print("Launching Gradio app...")
     demo.launch(debug=True, share=False)

app_prior.py DELETED Viewed

@@ -1,116 +0,0 @@
-import os
-import gradio as gr
-import requests
-import pandas as pd
-import asyncio
-from gaia_new import graph  # Use your agent
-from typing import Optional
-# Constants
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-user_answers_cache = {}  # session-based cache
-class GaiaAgent:
-    def __init__(self):
-        print("Graph-based agent initialized.")
-    def __call__(self, question: str) -> str:
-        print("Received question:", question)
-        state = {"question": question, "answer": ""}
-        try:
-            result = graph.invoke(state)
-            print("Result type:", type(result))
-            print("Result value:", result)
-            if isinstance(result, dict):
-                return result.get("answer", "No answer generated.")
-            else:
-                return f"Unexpected output from graph: {result}"
-        except Exception as e:
-            return f"ERROR invoking graph: {e}"
-# Async runner
-async def run_agent(profile: gr.OAuthProfile | None):
-    if not profile:
-        return "Please login to Hugging Face.", None
-    username = profile.username
-    agent = GaiaAgent()
-    # 1. Load questions
-    try:
-        response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
-        response.raise_for_status()
-        questions_data = response.json()
-    except Exception as e:
-        return f"Error fetching questions: {e}", None
-    # 2. Process questions
-    async def process(item):
-        task_id = item.get("task_id")
-        question = item.get("question")
-        try:
-            answer = await asyncio.to_thread(agent, question)
-            return {"task_id": task_id, "question": question, "submitted_answer": answer}
-        except Exception as e:
-            return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
-    results = await asyncio.gather(*(process(item) for item in questions_data))
-    user_answers_cache[username] = results
-    df = pd.DataFrame(results)
-    return f"Answered {len(results)} questions. Ready to submit.", df
-def submit_answers(profile: gr.OAuthProfile | None):
-    if not profile:
-        return "Please login to Hugging Face.", None
-    username = profile.username.strip()
-    if username not in user_answers_cache:
-        return "No cached answers. Please run the agent first.", None
-    answers_payload = [
-        {"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
-        for item in user_answers_cache[username]
-    ]
-    space_id = os.getenv("SPACE_ID", "")
-    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
-    submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
-    # 3. Submit to scoring API
-    try:
-        response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
-        response.raise_for_status()
-        result = response.json()
-        final_status = (
-            f"✅ Submission Successful!\n"
-            f"👤 User: {result.get('username')}\n"
-            f"🎯 Score: {result.get('score', 'N/A')}% "
-            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
-            f"📩 Message: {result.get('message', 'No message received.')}"
-        )
-        df = pd.DataFrame(user_answers_cache[username])
-        return final_status, df
-    except Exception as e:
-        return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
-# ────────── Gradio UI ──────────
-with gr.Blocks() as demo:
-    gr.Markdown("# 🧠 GAIA Agent Evaluation")
-    gr.LoginButton()
-    run_button = gr.Button("▶️ Run Agent on GAIA Questions")
-    submit_button = gr.Button("📤 Submit Cached Answers")
-    status = gr.Textbox(label="Status", lines=6, interactive=False)
-    results = gr.DataFrame(label="Answers", wrap=True)
-    run_button.click(run_agent, outputs=[status, results])
-    submit_button.click(submit_answers, outputs=[status, results])
-if __name__ == "__main__":
-    print("Launching Gradio app...")
-    demo.launch(debug=True, share=False)

gaia_graph.py DELETED Viewed

@@ -1,116 +0,0 @@
-# gaia_graph.py
-import os
-import ast
-import operator
-from typing import TypedDict
-from dotenv import load_dotenv
-from langchain.tools import Tool
-from langchain.agents import initialize_agent, AgentType
-from langchain_openai import ChatOpenAI
-from langgraph.graph import StateGraph, END
-# ─── Load Environment Variables ──────────────────────────────────────────────
-load_dotenv()
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-assert OPENAI_API_KEY, "OPENAI_API_KEY is not set"
-# ─── Define Calculator Tool ──────────────────────────────────────────────────
-def safe_eval(expr: str) -> str:
-    ops = {
-        ast.Add: operator.add,
-        ast.Sub: operator.sub,
-        ast.Mult: operator.mul,
-        ast.Div: operator.truediv,
-        ast.Pow: operator.pow,
-        ast.USub: operator.neg,
-    }
-    def _eval(node):
-        if isinstance(node, ast.Constant):
-            return node.value
-        if isinstance(node, ast.BinOp):
-            return ops[type(node.op)](_eval(node.left), _eval(node.right))
-        if isinstance(node, ast.UnaryOp):
-            return ops[type(node.op)](_eval(node.operand))
-        raise TypeError(f"Unsupported AST node: {node!r}")
-    try:
-        node = ast.parse(expr, mode="eval").body
-        return str(_eval(node))
-    except Exception as e:
-        return f"Error: {e}"
-calculator_tool = Tool(
-    name="calculator",
-    func=safe_eval,
-    description="Evaluate basic math expressions. Input: a math string like '2 + 2'. Output: the result.",
-)
-# ─── Define Search Tool using Tavily ─────────────────────────────────────────
-from tavily import TavilyClient
-TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
-assert TAVILY_API_KEY, "TAVILY_API_KEY environment variable is not set"
-tavily = TavilyClient(api_key=TAVILY_API_KEY)
-def search_tool_fn(query: str) -> str:
-    try:
-        resp = tavily.search(query)
-        results = resp.get("results", [])
-        if not results:
-            return "No results found."
-        return results[0].get("title") or results[0].get("snippet") or "No snippet."
-    except Exception as e:
-        return f"Search error: {e}"
-search_tool = Tool(
-    name="search",
-    func=search_tool_fn,
-    description="Useful for answering factual questions using a search engine.",
-)
-# ─── Create LLM Agent ────────────────────────────────────────────────────────
-llm = ChatOpenAI(
-    temperature=0.0,
-    model="gpt-4o-mini",
-    openai_api_key=OPENAI_API_KEY
-)
-agent_executor = initialize_agent(
-    tools=[calculator_tool, search_tool],
-    llm=llm,
-    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-    verbose=False,
-    handle_parsing_errors=True,
-)
-# ─── Clean Output ────────────────────────────────────────────────────────────
-def clean_answer(ans: str) -> str:
-    if "```" in ans:
-        ans = ans.split("```")[-1]
-    if "Answer:" in ans:
-        return ans.split("Answer:")[-1].strip()
-    if "→" in ans:
-        return ans.split("→")[-1].strip()
-    return ans.strip()
-# ─── Define State ────────────────────────────────────────────────────────────
-class GaiaState(TypedDict):
-    question: str
-    answer: str
-# ─── Define Node Function ────────────────────────────────────────────────────
-def agent_node(state: GaiaState) -> GaiaState:
-    raw = agent_executor.run(state["question"])
-    return {"question": state["question"], "answer": clean_answer(raw)}
-# ─── Build LangGraph ─────────────────────────────────────────────────────────
-builder = StateGraph(GaiaState)
-builder.add_node("agent", agent_node)
-builder.set_entry_point("agent")
-builder.set_finish_point("agent")
-graph = builder.compile()

gaia_graph_legacy.py DELETED Viewed

@@ -1,188 +0,0 @@
-# gaia_graph.py
-import os
-import re
-import yaml
-from typing import TypedDict
-from dotenv import load_dotenv
-from transformers import pipeline
-from langchain_huggingface import HuggingFacePipeline
-from langchain_core.tools.structured import StructuredTool
-from langgraph.graph import StateGraph, START, END
-from langgraph.prebuilt.chat_agent_executor import create_react_agent
-#
-# ─── 1) LOAD ENVIRONMENT VARIABLES ──────────────────────────────────────────────
-#
-# Make sure you have a valid HF token in your shell or .env:
-#     export HUGGINGFACE_API_TOKEN="<your token>"
-load_dotenv()
-HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
-assert HF_TOKEN, "Please set HUGGINGFACE_API_TOKEN in your environment or .env."
-#
-# ─── 2) LOAD config.yaml ─────────────────────────────────────────────────────────
-#
-# Expect config.yaml with:
-#   tavily_api_key: "<your Tavily key>"
-#   huggingface_api_token: "<your HF token>"  (optional duplication)
-with open("config.yaml", "r") as f:
-    cfg = yaml.safe_load(f)
-TAVILY_API_KEY = cfg.get("tavily_api_key")
-assert TAVILY_API_KEY, "Put your Tavily key under 'tavily_api_key' in config.yaml."
-#
-# ─── 3) DEFINE “TOOL” WRAPPERS ────────────────────────────────────────────────────
-#
-# 3a) Calculator (a “safe eval” of simple expressions)
-def _safe_eval(expr: str) -> str:
-    import ast, operator
-    ops = {
-        ast.Add: operator.add,
-        ast.Sub: operator.sub,
-        ast.Mult: operator.mul,
-        ast.Div: operator.truediv,
-        ast.Pow: operator.pow,
-        ast.USub: operator.neg,
-    }
-    def _eval(node):
-        if isinstance(node, ast.Constant):
-            return node.n
-        elif isinstance(node, ast.BinOp):
-            return ops[type(node.op)](_eval(node.left), _eval(node.right))
-        elif isinstance(node, ast.UnaryOp):
-            return ops[type(node.op)](_eval(node.operand))
-        else:
-            raise TypeError(f"Unsupported AST node: {node}")
-    node = ast.parse(expr, mode="eval").body
-    return str(_eval(node))
-def _calculator_tool(text: str) -> str:
-    try:
-        return _safe_eval(text)
-    except Exception as e:
-        return f"Error evaluating expression: {e}"
-calculator_tool = StructuredTool.from_function(
-    func=_calculator_tool,
-    name="calculator",
-    description="Evaluate simple arithmetic expressions; return the numeric result as a string.",
-)
-# 3b) Tavily‐based search
-from tavily import TavilyClient
-class _TavilySearch:
-    def __init__(self, api_key: str):
-        self.client = TavilyClient(api_key=api_key)
-    def __call__(self, query: str) -> str:
-        resp = self.client.search(query)
-        results = resp.get("results", [])
-        if not results:
-            return "No results found."
-        snippets = []
-        for r in results[:3]:
-            title = r.get("title")
-            snippet = r.get("snippet")
-            if title:
-                snippets.append(title)
-            elif snippet:
-                snippets.append(snippet)
-        return " | ".join(snippets)
-_tavily_search = _TavilySearch(api_key=TAVILY_API_KEY)
-# Note: pass the instance’s __call__, not the instance itself.
-search_tool = StructuredTool.from_function(
-    func=_tavily_search.__call__,
-    name="search",
-    description="Look up facts via Tavily; return up to three summaries joined by ' | '.",
-)
-TOOLS = [calculator_tool, search_tool]
-#
-# ─── 4) PRELOAD A FREE HF MODEL & WRAP IT AS HuggingFacePipeline ───────────────────
-#
-# We choose “google/flan-t5-small” (free, CPU‐friendly). Load as a text2text pipeline:
-hf_gen = pipeline(
-    "text2text-generation",
-    model="google/flan-t5-small",
-    device=-1,          # CPU only
-    max_new_tokens=128,
-    do_sample=False,    # greedy
-)
-# Now wrap that pipeline into a HuggingFacePipeline LLM.
-# (No API token needed here for a local “google/flan-t5-small”)
-llm = HuggingFacePipeline(pipeline=hf_gen)
-#
-# ─── 5) CREATE A LANGGRAPH ReAct AGENT ─────────────────────────────────────────────
-#
-# This `create_react_agent` will add the Thought/Action/Observation framing
-# so that the LLM can call “calculator” or “search” as needed,
-# and then eventually emit “Final Answer: …”.
-#
-react_agent = create_react_agent(
-    llm=llm,
-    tools=TOOLS,
-    max_iterations=3,
-    verbose=False,
-)
-#
-# ─── 6) DEFINE STATE SCHEMA & SINGLE GRAPH NODE ─────────────────────────────────
-#
-class AgentState(TypedDict):
-    question:     str
-    tool_output:  str   # (ignored by ReAct, but must exist)
-    final_answer: str
-def AgentNode(state: AgentState) -> AgentState:
-    q = state["question"].strip()
-    # Invoke the internal ReAct loop:
-    answer = react_agent.invoke(q).strip()
-    state["final_answer"] = answer
-    return state
-#
-# ─── 7) WIRE UP THE LANGGRAPH ─────────────────────────────────────────────────────
-#
-builder = StateGraph(AgentState)
-builder.set_entry_point("AgentNode")
-builder.add_node("AgentNode", AgentNode)
-builder.add_edge(START, "AgentNode")
-builder.add_edge("AgentNode", END)
-graph = builder.compile()
-#
-# ─── 8) SMOKE TESTS ───────────────────────────────────────────────────────────────
-#
-if __name__ == "__main__":
-    print("Device set to use CPU\n")
-    tests = [
-        "How much is 2 + 2",
-        "What is the capital of France?",
-        "Which country had the fewest athletes at the 1928 Olympics? Give the IOC code."
-    ]
-    for q in tests:
-        state = {"question": q, "tool_output": "", "final_answer": ""}
-        out = graph.invoke(state)
-        print(f"Q: {q!r}\n→ A: {out['final_answer']!r}\n")

langgraph_agents.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import os
+from dotenv import load_dotenv
+from typing import TypedDict, Annotated, List, Dict, Any
+from langchain_openai import ChatOpenAI
+from langchain_core.tools import tool
+from langchain_community.tools.tavily_search import TavilySearchResults
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, AnyMessage
+from langchain_core.messages.ai import subtract_usage
+from langsmith.run_helpers import trace_run
+from langgraph.graph import StateGraph, MessagesState, START, END
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+from difflib import SequenceMatcher
+load_dotenv()
+# System prompt for assistant
+system_prompt = """
+You are a high-performance question-answering agent. Your job is to answer each question using the available tools (web search, Wikipedia, ArXiv, etc.) when necessary. Always return a clear and concise one-line answer. Never explain, disclaim, or use phrases like "I'm sorry", "I cannot", or "as an AI language model".
+Your responses must follow these strict rules:
+1. Only output the **final answer**, as a single line. No preamble, no reasoning, no markdown.
+2. If a tool can help, invoke it to retrieve relevant information. Use tools assertively when the answer is not already cached or obvious.
+3. If a question contains a known URL or document reference, try to infer or search its content based on what is available.
+4. When dealing with long or obscure questions (e.g., academic papers, dataset entries, etc.), extract just the answer — even if based on partial context.
+5. Never refuse to answer. Make your best informed guess based on the tools, data, and context available.
+6. Repeat answers for duplicate questions.
+7. If the question requires extracting a list or name, return the **bare** list or name, alphabetized if requested.
+### Examples of valid answers:
+LUX
+Paris
+28 September 1985
+bell pepper, broccoli, celery, corn, green beans, lettuce, sweet potatoes, zucchini
+4
+You must be accurate, efficient, and concise. Begin.
+"""
+# Reflection prompt for the LLM to self-assess its answer
+reflection_prompt = """
+You are a reflective quality control agent.
+Your task is to verify if the assistant's answer is a correct and complete response to the user question.
+You will think carefully before responding.
+Instructions:
+1. Analyze the user question: What is being asked? Are there specific formats or constraints? (e.g. one-line, IOC code, alphabetical order, names only, no explanations)
+2. Evaluate the assistant's answer: Does it answer the core question faithfully and clearly? Is it concise, accurate, and in the required format?
+3. Reflect: If the answer is already optimal, return it unchanged.
+4. If the answer has issues (wrong content, incomplete reasoning, extra text, wrong format, etc.), fix it. You may use reasoning, assumptions, or clarification based on context.
+Respond with ONLY the improved answer (if changed), or the original if it's already optimal.
+Begin.
+"""
+# Tools
+@tool
+def web_search(query: str) -> Dict[str, str]:
+    """Search the web for information."""
+    results = TavilySearchResults(max_results=3).run(query)
+    docs = "\n".join([doc["content"] for doc in results])
+    return {"web_results": docs}
+TOOLS = [web_search]
+# Agent state
+class AgentState(TypedDict):
+    messages: Annotated[List[AnyMessage], add_messages]
+# LLMs
+llm = ChatOpenAI(model="gpt-4", temperature=0)
+llm_with_tools = llm.bind_tools(TOOLS)
+# Assistant node
+def assistant(state: AgentState) -> Dict[str, Any]:
+    result = llm_with_tools.invoke(state["messages"])
+    if isinstance(result, AIMessage) and result.usage_metadata is None:
+        result.usage_metadata = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
+    return {"messages": [result]}
+# Reflection agent
+def reflect_answer(question: str, answer: str) -> str:
+    reflector = llm.with_config({"tags": ["reflection"]})
+    input_messages = [
+        SystemMessage(content=reflection_prompt),
+        HumanMessage(content=f"Q: {question}\nAssistant's Answer: {answer}")
+    ]
+    reflection_result = reflector.invoke(input_messages)
+    return reflection_result.content.strip()
+# Build LangGraph
+builder = StateGraph(AgentState)
+builder.add_node("assistant", assistant)
+builder.add_node("tools", ToolNode(TOOLS))
+builder.set_entry_point("assistant")
+builder.add_conditional_edges(
+    "assistant",
+    tools_condition,
+    {
+        "tools": "tools",
+        END: END
+    }
+)
+builder.add_edge("tools", "assistant")
+graph = builder.compile()
+# Evaluation helpers
+def similarity_score(a: str, b: str) -> float:
+    return round(SequenceMatcher(None, a.strip().lower(), b.strip().lower()).ratio(), 2)
+# Questions + Ground Truths
+qa_pairs = [
+    {
+        "q": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+        "gt": "Louvrier"
+    },
+    {
+        "q": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
+        "gt": "Wojciech"
+    },
+    {
+        "q": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
+        "gt": "LUX"
+    }
+]
+# Run evaluation
+print("\n📊 Evaluating QA Agent\n")
+for idx, qa in enumerate(qa_pairs, 1):
+    question = qa["q"]
+    ground_truth = qa["gt"]
+    print(f"🔹 Q{idx}: {question}")
+    with trace_run(name=f"GAIA-Q{idx}", tags=["gaia", "reflection", "evaluation"]):
+        try:
+            result = graph.invoke({"messages": [HumanMessage(content=question)]})
+            raw_answer = result["messages"][-1].content.strip()
+            reflected = reflect_answer(question, raw_answer)
+            score = similarity_score(reflected, ground_truth)
+            verdict = "✅" if score == 1.0 else "❌"
+            print(f"{verdict} A{idx}: {reflected}  |  GT: {ground_truth}  |  Similarity: {score}\n")
+        except Exception as e:
+            print(f"❌ A{idx} ERROR: {e}\n")

test_gaia_questions.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # test_gaia_questions.py
 import requests
-from gaia_new import graph
 def test_with_real_gaia_questions():
     # Fetch questions directly from the benchmark API

 # test_gaia_questions.py
 import requests
+from langgraph_agents import graph
 def test_with_real_gaia_questions():
     # Fetch questions directly from the benchmark API