D3MI4N commited on
Commit
7835c92
Β·
1 Parent(s): 0ee4998

new version with reflection

Browse files
Files changed (6) hide show
  1. app.py +89 -140
  2. app_prior.py +0 -116
  3. gaia_graph.py +0 -116
  4. gaia_graph_legacy.py +0 -188
  5. langgraph_agents.py +155 -0
  6. test_gaia_questions.py +1 -1
app.py CHANGED
@@ -1,167 +1,116 @@
1
- # app.py
2
-
3
  import os
4
- import json
5
  import requests
6
  import pandas as pd
7
  import asyncio
 
 
8
 
9
- import gradio as gr
10
- from openai import OpenAI
11
- from tavily import TavilyClient
12
- from dotenv import load_dotenv
13
-
14
- load_dotenv()
15
 
16
- # ─── 1) OpenAI client (v1 SDK) ───────────────────────────────────────────────────
17
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
18
- assert OPENAI_API_KEY, "Set OPENAI_API_KEY in .env"
19
- openai_client = OpenAI(api_key=OPENAI_API_KEY)
20
 
21
- # ─── 2) Tavily search client ─────────────────────────────────────────────────────
22
- TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
23
- assert TAVILY_API_KEY, "Set TAVILY_API_KEY in .env"
24
- tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- # ─── 3) Define our tools & JSON schemas ──────────────────────────────────────────
27
- def calculator(expr: str) -> str:
28
- try:
29
- # safe eval
30
- return str(eval(expr, {}, {}))
31
- except Exception as e:
32
- return f"Error: {e}"
33
 
34
- def search(query: str) -> str:
35
  try:
36
- resp = tavily_client.search(query=query, search_depth="basic")
37
- results = resp.get("results", [])
38
- if not results:
39
- return "No results found."
40
- # grab up to 3 titles/snippets
41
- snippets = []
42
- for r in results[:3]:
43
- snippets.append(r.get("title") or r.get("snippet") or "")
44
- return " | ".join(snippets)
45
  except Exception as e:
46
- return f"Search error: {e}"
47
-
48
- functions = [
49
- {
50
- "name": "calculator",
51
- "description": "Evaluate a math expression. Returns the result as a string.",
52
- "parameters": {
53
- "type": "object",
54
- "properties": {
55
- "expr": {"type": "string", "description": "Math expression to evaluate"}
56
- },
57
- "required": ["expr"],
58
- },
59
- },
60
- {
61
- "name": "search",
62
- "description": "Look up facts on the web via Tavily; return up to three summaries separated by ' | '.",
63
- "parameters": {
64
- "type": "object",
65
- "properties": {
66
- "query": {"type": "string", "description": "The search query"}
67
- },
68
- "required": ["query"],
69
- },
70
- },
71
- ]
72
- tool_map = {"calculator": calculator, "search": search}
73
-
74
- # ─── 4) The ReAct loop ───────────────────────────────────────────────────────────
75
- def run_react(question: str) -> str:
76
- messages = [{"role": "user", "content": question}]
77
- while True:
78
- resp = openai_client.chat.completions.create(
79
- model="gpt-4o-mini", # free-tier β€œmini” model
80
- messages=messages,
81
- functions=functions,
82
- function_call="auto",
83
- )
84
- msg = resp.choices[0].message
85
-
86
- # if the model wants to call a tool:
87
- if msg.function_call:
88
- name = msg.function_call.name
89
- args = json.loads(msg.function_call.arguments)
90
- output = tool_map[name](**args)
91
- # feed both the assistant's call and the tool's result back into the loop
92
- messages.append({
93
- "role": "assistant",
94
- "content": None,
95
- "function_call": msg.function_call.to_dict()
96
- })
97
- messages.append({
98
- "role": "function",
99
- "name": name,
100
- "content": output
101
- })
102
- else:
103
- # final answer
104
- return msg.content.strip()
105
-
106
- # ─── 5) Gradio / GAIA integration ────────────────────────────────────────────────
107
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
108
- _cache = {}
109
 
110
- class GaiaAgent:
111
- def __call__(self, question: str) -> str:
112
- return run_react(question)
 
 
 
 
 
 
113
 
114
- async def run_agent(profile: gr.OAuthProfile | None):
115
- if not profile:
116
- return "Please login.", None
117
- user = profile.username
118
- resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
119
- data = resp.json()
120
- agent = GaiaAgent()
121
 
122
- async def proc(item):
123
- ans = await asyncio.to_thread(agent, item["question"])
124
- return {
125
- "task_id": item["task_id"],
126
- "question": item["question"],
127
- "submitted_answer": ans
128
- }
129
 
130
- results = await asyncio.gather(*(proc(it) for it in data))
131
- _cache[user] = results
132
- return f"Answered {len(results)} questions.", pd.DataFrame(results)
133
 
134
  def submit_answers(profile: gr.OAuthProfile | None):
135
  if not profile:
136
- return "Please login.", None
137
- user = profile.username
138
- if user not in _cache:
139
- return "Run agent first.", None
140
- payload = [
141
- {"task_id": r["task_id"], "submitted_answer": r["submitted_answer"]}
142
- for r in _cache[user]
 
 
143
  ]
 
144
  space_id = os.getenv("SPACE_ID", "")
145
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
146
- body = {"username": user, "agent_code": agent_code, "answers": payload}
147
- r = requests.post(f"{DEFAULT_API_URL}/submit", json=body, timeout=60)
148
- r.raise_for_status()
149
- res = r.json()
150
- msg = (
151
- f"Score: {res.get('score')}% "
152
- f"({res.get('correct_count')}/{res.get('total_attempted')})"
153
- )
154
- return msg, pd.DataFrame(_cache[user])
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  with gr.Blocks() as demo:
157
- gr.Markdown("# 🧠 GAIA Benchmark Runner")
158
  gr.LoginButton()
159
- run_btn = gr.Button("Run agent on questions")
160
- sub_btn = gr.Button("Submit cached answers")
161
- out_txt = gr.Textbox(lines=3, interactive=False)
162
- out_tbl = gr.DataFrame()
163
- run_btn.click(run_agent, outputs=[out_txt, out_tbl])
164
- sub_btn.click(submit_answers, outputs=[out_txt, out_tbl])
 
 
 
165
 
166
  if __name__ == "__main__":
 
167
  demo.launch(debug=True, share=False)
 
 
 
1
  import os
2
+ import gradio as gr
3
  import requests
4
  import pandas as pd
5
  import asyncio
6
+ from gaia_new import graph # Use your agent
7
+ from typing import Optional
8
 
9
+ # Constants
10
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
+ user_answers_cache = {} # session-based cache
 
 
 
12
 
13
+ class GaiaAgent:
14
+ def __init__(self):
15
+ print("Graph-based agent initialized.")
 
16
 
17
+ def __call__(self, question: str) -> str:
18
+ print("Received question:", question)
19
+ state = {"question": question, "answer": ""}
20
+ try:
21
+ result = graph.invoke(state)
22
+ print("Result type:", type(result))
23
+ print("Result value:", result)
24
+ if isinstance(result, dict):
25
+ return result.get("answer", "No answer generated.")
26
+ else:
27
+ return f"Unexpected output from graph: {result}"
28
+ except Exception as e:
29
+ return f"ERROR invoking graph: {e}"
30
+
31
+
32
+ # Async runner
33
+ async def run_agent(profile: gr.OAuthProfile | None):
34
+ if not profile:
35
+ return "Please login to Hugging Face.", None
36
 
37
+ username = profile.username
38
+ agent = GaiaAgent()
 
 
 
 
 
39
 
40
+ # 1. Load questions
41
  try:
42
+ response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
43
+ response.raise_for_status()
44
+ questions_data = response.json()
 
 
 
 
 
 
45
  except Exception as e:
46
+ return f"Error fetching questions: {e}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ # 2. Process questions
49
+ async def process(item):
50
+ task_id = item.get("task_id")
51
+ question = item.get("question")
52
+ try:
53
+ answer = await asyncio.to_thread(agent, question)
54
+ return {"task_id": task_id, "question": question, "submitted_answer": answer}
55
+ except Exception as e:
56
+ return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
57
 
58
+ results = await asyncio.gather(*(process(item) for item in questions_data))
59
+ user_answers_cache[username] = results
 
 
 
 
 
60
 
61
+ df = pd.DataFrame(results)
62
+ return f"Answered {len(results)} questions. Ready to submit.", df
 
 
 
 
 
63
 
 
 
 
64
 
65
  def submit_answers(profile: gr.OAuthProfile | None):
66
  if not profile:
67
+ return "Please login to Hugging Face.", None
68
+
69
+ username = profile.username.strip()
70
+ if username not in user_answers_cache:
71
+ return "No cached answers. Please run the agent first.", None
72
+
73
+ answers_payload = [
74
+ {"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
75
+ for item in user_answers_cache[username]
76
  ]
77
+
78
  space_id = os.getenv("SPACE_ID", "")
79
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
80
+ submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
 
 
 
 
 
 
 
 
81
 
82
+ # 3. Submit to scoring API
83
+ try:
84
+ response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
85
+ response.raise_for_status()
86
+ result = response.json()
87
+ final_status = (
88
+ f"βœ… Submission Successful!\n"
89
+ f"πŸ‘€ User: {result.get('username')}\n"
90
+ f"🎯 Score: {result.get('score', 'N/A')}% "
91
+ f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
92
+ f"πŸ“© Message: {result.get('message', 'No message received.')}"
93
+ )
94
+ df = pd.DataFrame(user_answers_cache[username])
95
+ return final_status, df
96
+ except Exception as e:
97
+ return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
98
+
99
+
100
+ # ────────── Gradio UI ──────────
101
  with gr.Blocks() as demo:
102
+ gr.Markdown("# 🧠 GAIA Agent Evaluation")
103
  gr.LoginButton()
104
+
105
+ run_button = gr.Button("▢️ Run Agent on GAIA Questions")
106
+ submit_button = gr.Button("πŸ“€ Submit Cached Answers")
107
+
108
+ status = gr.Textbox(label="Status", lines=6, interactive=False)
109
+ results = gr.DataFrame(label="Answers", wrap=True)
110
+
111
+ run_button.click(run_agent, outputs=[status, results])
112
+ submit_button.click(submit_answers, outputs=[status, results])
113
 
114
  if __name__ == "__main__":
115
+ print("Launching Gradio app...")
116
  demo.launch(debug=True, share=False)
app_prior.py DELETED
@@ -1,116 +0,0 @@
1
- import os
2
- import gradio as gr
3
- import requests
4
- import pandas as pd
5
- import asyncio
6
- from gaia_new import graph # Use your agent
7
- from typing import Optional
8
-
9
- # Constants
10
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
- user_answers_cache = {} # session-based cache
12
-
13
- class GaiaAgent:
14
- def __init__(self):
15
- print("Graph-based agent initialized.")
16
-
17
- def __call__(self, question: str) -> str:
18
- print("Received question:", question)
19
- state = {"question": question, "answer": ""}
20
- try:
21
- result = graph.invoke(state)
22
- print("Result type:", type(result))
23
- print("Result value:", result)
24
- if isinstance(result, dict):
25
- return result.get("answer", "No answer generated.")
26
- else:
27
- return f"Unexpected output from graph: {result}"
28
- except Exception as e:
29
- return f"ERROR invoking graph: {e}"
30
-
31
-
32
- # Async runner
33
- async def run_agent(profile: gr.OAuthProfile | None):
34
- if not profile:
35
- return "Please login to Hugging Face.", None
36
-
37
- username = profile.username
38
- agent = GaiaAgent()
39
-
40
- # 1. Load questions
41
- try:
42
- response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
43
- response.raise_for_status()
44
- questions_data = response.json()
45
- except Exception as e:
46
- return f"Error fetching questions: {e}", None
47
-
48
- # 2. Process questions
49
- async def process(item):
50
- task_id = item.get("task_id")
51
- question = item.get("question")
52
- try:
53
- answer = await asyncio.to_thread(agent, question)
54
- return {"task_id": task_id, "question": question, "submitted_answer": answer}
55
- except Exception as e:
56
- return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
57
-
58
- results = await asyncio.gather(*(process(item) for item in questions_data))
59
- user_answers_cache[username] = results
60
-
61
- df = pd.DataFrame(results)
62
- return f"Answered {len(results)} questions. Ready to submit.", df
63
-
64
-
65
- def submit_answers(profile: gr.OAuthProfile | None):
66
- if not profile:
67
- return "Please login to Hugging Face.", None
68
-
69
- username = profile.username.strip()
70
- if username not in user_answers_cache:
71
- return "No cached answers. Please run the agent first.", None
72
-
73
- answers_payload = [
74
- {"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
75
- for item in user_answers_cache[username]
76
- ]
77
-
78
- space_id = os.getenv("SPACE_ID", "")
79
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
80
- submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
81
-
82
- # 3. Submit to scoring API
83
- try:
84
- response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
85
- response.raise_for_status()
86
- result = response.json()
87
- final_status = (
88
- f"βœ… Submission Successful!\n"
89
- f"πŸ‘€ User: {result.get('username')}\n"
90
- f"🎯 Score: {result.get('score', 'N/A')}% "
91
- f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
92
- f"πŸ“© Message: {result.get('message', 'No message received.')}"
93
- )
94
- df = pd.DataFrame(user_answers_cache[username])
95
- return final_status, df
96
- except Exception as e:
97
- return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
98
-
99
-
100
- # ────────── Gradio UI ──────────
101
- with gr.Blocks() as demo:
102
- gr.Markdown("# 🧠 GAIA Agent Evaluation")
103
- gr.LoginButton()
104
-
105
- run_button = gr.Button("▢️ Run Agent on GAIA Questions")
106
- submit_button = gr.Button("πŸ“€ Submit Cached Answers")
107
-
108
- status = gr.Textbox(label="Status", lines=6, interactive=False)
109
- results = gr.DataFrame(label="Answers", wrap=True)
110
-
111
- run_button.click(run_agent, outputs=[status, results])
112
- submit_button.click(submit_answers, outputs=[status, results])
113
-
114
- if __name__ == "__main__":
115
- print("Launching Gradio app...")
116
- demo.launch(debug=True, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_graph.py DELETED
@@ -1,116 +0,0 @@
1
- # gaia_graph.py
2
-
3
- import os
4
- import ast
5
- import operator
6
- from typing import TypedDict
7
-
8
- from dotenv import load_dotenv
9
- from langchain.tools import Tool
10
- from langchain.agents import initialize_agent, AgentType
11
- from langchain_openai import ChatOpenAI
12
- from langgraph.graph import StateGraph, END
13
-
14
- # ─── Load Environment Variables ──────────────────────────────────────────────
15
- load_dotenv()
16
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
- assert OPENAI_API_KEY, "OPENAI_API_KEY is not set"
18
-
19
- # ─── Define Calculator Tool ──────────────────────────────────────────────────
20
- def safe_eval(expr: str) -> str:
21
- ops = {
22
- ast.Add: operator.add,
23
- ast.Sub: operator.sub,
24
- ast.Mult: operator.mul,
25
- ast.Div: operator.truediv,
26
- ast.Pow: operator.pow,
27
- ast.USub: operator.neg,
28
- }
29
-
30
- def _eval(node):
31
- if isinstance(node, ast.Constant):
32
- return node.value
33
- if isinstance(node, ast.BinOp):
34
- return ops[type(node.op)](_eval(node.left), _eval(node.right))
35
- if isinstance(node, ast.UnaryOp):
36
- return ops[type(node.op)](_eval(node.operand))
37
- raise TypeError(f"Unsupported AST node: {node!r}")
38
-
39
- try:
40
- node = ast.parse(expr, mode="eval").body
41
- return str(_eval(node))
42
- except Exception as e:
43
- return f"Error: {e}"
44
-
45
- calculator_tool = Tool(
46
- name="calculator",
47
- func=safe_eval,
48
- description="Evaluate basic math expressions. Input: a math string like '2 + 2'. Output: the result.",
49
- )
50
-
51
- # ─── Define Search Tool using Tavily ─────────────────────────────────────────
52
- from tavily import TavilyClient
53
-
54
- TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
55
- assert TAVILY_API_KEY, "TAVILY_API_KEY environment variable is not set"
56
-
57
- tavily = TavilyClient(api_key=TAVILY_API_KEY)
58
-
59
- def search_tool_fn(query: str) -> str:
60
- try:
61
- resp = tavily.search(query)
62
- results = resp.get("results", [])
63
- if not results:
64
- return "No results found."
65
- return results[0].get("title") or results[0].get("snippet") or "No snippet."
66
- except Exception as e:
67
- return f"Search error: {e}"
68
-
69
- search_tool = Tool(
70
- name="search",
71
- func=search_tool_fn,
72
- description="Useful for answering factual questions using a search engine.",
73
- )
74
-
75
- # ─── Create LLM Agent ────────────────────────────────────────────────────────
76
- llm = ChatOpenAI(
77
- temperature=0.0,
78
- model="gpt-4o-mini",
79
- openai_api_key=OPENAI_API_KEY
80
- )
81
-
82
- agent_executor = initialize_agent(
83
- tools=[calculator_tool, search_tool],
84
- llm=llm,
85
- agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
86
- verbose=False,
87
- handle_parsing_errors=True,
88
- )
89
-
90
- # ─── Clean Output ────────────────────────────────────────────────────────────
91
- def clean_answer(ans: str) -> str:
92
- if "```" in ans:
93
- ans = ans.split("```")[-1]
94
- if "Answer:" in ans:
95
- return ans.split("Answer:")[-1].strip()
96
- if "β†’" in ans:
97
- return ans.split("β†’")[-1].strip()
98
- return ans.strip()
99
-
100
- # ─── Define State ────────────────────────────────────────────────────────────
101
- class GaiaState(TypedDict):
102
- question: str
103
- answer: str
104
-
105
- # ─── Define Node Function ────────────────────────────────────────────────────
106
- def agent_node(state: GaiaState) -> GaiaState:
107
- raw = agent_executor.run(state["question"])
108
- return {"question": state["question"], "answer": clean_answer(raw)}
109
-
110
- # ─── Build LangGraph ─────────────────────────────────────────────────────────
111
- builder = StateGraph(GaiaState)
112
- builder.add_node("agent", agent_node)
113
- builder.set_entry_point("agent")
114
- builder.set_finish_point("agent")
115
-
116
- graph = builder.compile()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gaia_graph_legacy.py DELETED
@@ -1,188 +0,0 @@
1
- # gaia_graph.py
2
-
3
- import os
4
- import re
5
- import yaml
6
- from typing import TypedDict
7
-
8
- from dotenv import load_dotenv
9
- from transformers import pipeline
10
- from langchain_huggingface import HuggingFacePipeline
11
- from langchain_core.tools.structured import StructuredTool
12
- from langgraph.graph import StateGraph, START, END
13
- from langgraph.prebuilt.chat_agent_executor import create_react_agent
14
-
15
- #
16
- # ─── 1) LOAD ENVIRONMENT VARIABLES ──────────────────────────────────────────────
17
- #
18
- # Make sure you have a valid HF token in your shell or .env:
19
- # export HUGGINGFACE_API_TOKEN="<your token>"
20
- load_dotenv()
21
- HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
22
- assert HF_TOKEN, "Please set HUGGINGFACE_API_TOKEN in your environment or .env."
23
-
24
- #
25
- # ─── 2) LOAD config.yaml ─────────────────────────────────────────────────────────
26
- #
27
- # Expect config.yaml with:
28
- # tavily_api_key: "<your Tavily key>"
29
- # huggingface_api_token: "<your HF token>" (optional duplication)
30
- with open("config.yaml", "r") as f:
31
- cfg = yaml.safe_load(f)
32
-
33
- TAVILY_API_KEY = cfg.get("tavily_api_key")
34
- assert TAVILY_API_KEY, "Put your Tavily key under 'tavily_api_key' in config.yaml."
35
-
36
- #
37
- # ─── 3) DEFINE β€œTOOL” WRAPPERS ────────────────────────────────────────────────────
38
- #
39
-
40
- # 3a) Calculator (a β€œsafe eval” of simple expressions)
41
- def _safe_eval(expr: str) -> str:
42
- import ast, operator
43
-
44
- ops = {
45
- ast.Add: operator.add,
46
- ast.Sub: operator.sub,
47
- ast.Mult: operator.mul,
48
- ast.Div: operator.truediv,
49
- ast.Pow: operator.pow,
50
- ast.USub: operator.neg,
51
- }
52
-
53
- def _eval(node):
54
- if isinstance(node, ast.Constant):
55
- return node.n
56
- elif isinstance(node, ast.BinOp):
57
- return ops[type(node.op)](_eval(node.left), _eval(node.right))
58
- elif isinstance(node, ast.UnaryOp):
59
- return ops[type(node.op)](_eval(node.operand))
60
- else:
61
- raise TypeError(f"Unsupported AST node: {node}")
62
-
63
- node = ast.parse(expr, mode="eval").body
64
- return str(_eval(node))
65
-
66
-
67
- def _calculator_tool(text: str) -> str:
68
- try:
69
- return _safe_eval(text)
70
- except Exception as e:
71
- return f"Error evaluating expression: {e}"
72
-
73
-
74
- calculator_tool = StructuredTool.from_function(
75
- func=_calculator_tool,
76
- name="calculator",
77
- description="Evaluate simple arithmetic expressions; return the numeric result as a string.",
78
- )
79
-
80
- # 3b) Tavily‐based search
81
- from tavily import TavilyClient
82
-
83
- class _TavilySearch:
84
- def __init__(self, api_key: str):
85
- self.client = TavilyClient(api_key=api_key)
86
-
87
- def __call__(self, query: str) -> str:
88
- resp = self.client.search(query)
89
- results = resp.get("results", [])
90
- if not results:
91
- return "No results found."
92
- snippets = []
93
- for r in results[:3]:
94
- title = r.get("title")
95
- snippet = r.get("snippet")
96
- if title:
97
- snippets.append(title)
98
- elif snippet:
99
- snippets.append(snippet)
100
- return " | ".join(snippets)
101
-
102
-
103
- _tavily_search = _TavilySearch(api_key=TAVILY_API_KEY)
104
-
105
- # Note: pass the instance’s __call__, not the instance itself.
106
- search_tool = StructuredTool.from_function(
107
- func=_tavily_search.__call__,
108
- name="search",
109
- description="Look up facts via Tavily; return up to three summaries joined by ' | '.",
110
- )
111
-
112
- TOOLS = [calculator_tool, search_tool]
113
-
114
-
115
- #
116
- # ─── 4) PRELOAD A FREE HF MODEL & WRAP IT AS HuggingFacePipeline ───────────────────
117
- #
118
- # We choose β€œgoogle/flan-t5-small” (free, CPU‐friendly). Load as a text2text pipeline:
119
- hf_gen = pipeline(
120
- "text2text-generation",
121
- model="google/flan-t5-small",
122
- device=-1, # CPU only
123
- max_new_tokens=128,
124
- do_sample=False, # greedy
125
- )
126
-
127
- # Now wrap that pipeline into a HuggingFacePipeline LLM.
128
- # (No API token needed here for a local β€œgoogle/flan-t5-small”)
129
- llm = HuggingFacePipeline(pipeline=hf_gen)
130
-
131
-
132
- #
133
- # ─── 5) CREATE A LANGGRAPH ReAct AGENT ─────────────────────────────────────────────
134
- #
135
- # This `create_react_agent` will add the Thought/Action/Observation framing
136
- # so that the LLM can call β€œcalculator” or β€œsearch” as needed,
137
- # and then eventually emit β€œFinal Answer: …”.
138
- #
139
- react_agent = create_react_agent(
140
- llm=llm,
141
- tools=TOOLS,
142
- max_iterations=3,
143
- verbose=False,
144
- )
145
-
146
-
147
- #
148
- # ─── 6) DEFINE STATE SCHEMA & SINGLE GRAPH NODE ─────────────────────────────────
149
- #
150
- class AgentState(TypedDict):
151
- question: str
152
- tool_output: str # (ignored by ReAct, but must exist)
153
- final_answer: str
154
-
155
-
156
- def AgentNode(state: AgentState) -> AgentState:
157
- q = state["question"].strip()
158
- # Invoke the internal ReAct loop:
159
- answer = react_agent.invoke(q).strip()
160
- state["final_answer"] = answer
161
- return state
162
-
163
-
164
- #
165
- # ─── 7) WIRE UP THE LANGGRAPH ─────────────────────────────────────────────────────
166
- #
167
- builder = StateGraph(AgentState)
168
- builder.set_entry_point("AgentNode")
169
- builder.add_node("AgentNode", AgentNode)
170
- builder.add_edge(START, "AgentNode")
171
- builder.add_edge("AgentNode", END)
172
-
173
- graph = builder.compile()
174
-
175
- #
176
- # ─── 8) SMOKE TESTS ───────────────────────────────────────────────────────────────
177
- #
178
- if __name__ == "__main__":
179
- print("Device set to use CPU\n")
180
- tests = [
181
- "How much is 2 + 2",
182
- "What is the capital of France?",
183
- "Which country had the fewest athletes at the 1928 Olympics? Give the IOC code."
184
- ]
185
- for q in tests:
186
- state = {"question": q, "tool_output": "", "final_answer": ""}
187
- out = graph.invoke(state)
188
- print(f"Q: {q!r}\n→ A: {out['final_answer']!r}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
langgraph_agents.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from typing import TypedDict, Annotated, List, Dict, Any
4
+
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.tools import tool
7
+ from langchain_community.tools.tavily_search import TavilySearchResults
8
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, AnyMessage
9
+ from langchain_core.messages.ai import subtract_usage
10
+ from langsmith.run_helpers import trace_run
11
+
12
+
13
+ from langgraph.graph import StateGraph, MessagesState, START, END
14
+ from langgraph.graph.message import add_messages
15
+ from langgraph.prebuilt import ToolNode, tools_condition
16
+
17
+ from difflib import SequenceMatcher
18
+
19
+ load_dotenv()
20
+
21
+ # System prompt for assistant
22
+ system_prompt = """
23
+ You are a high-performance question-answering agent. Your job is to answer each question using the available tools (web search, Wikipedia, ArXiv, etc.) when necessary. Always return a clear and concise one-line answer. Never explain, disclaim, or use phrases like "I'm sorry", "I cannot", or "as an AI language model".
24
+
25
+ Your responses must follow these strict rules:
26
+
27
+ 1. Only output the **final answer**, as a single line. No preamble, no reasoning, no markdown.
28
+ 2. If a tool can help, invoke it to retrieve relevant information. Use tools assertively when the answer is not already cached or obvious.
29
+ 3. If a question contains a known URL or document reference, try to infer or search its content based on what is available.
30
+ 4. When dealing with long or obscure questions (e.g., academic papers, dataset entries, etc.), extract just the answer β€” even if based on partial context.
31
+ 5. Never refuse to answer. Make your best informed guess based on the tools, data, and context available.
32
+ 6. Repeat answers for duplicate questions.
33
+ 7. If the question requires extracting a list or name, return the **bare** list or name, alphabetized if requested.
34
+
35
+ ### Examples of valid answers:
36
+ LUX
37
+ Paris
38
+ 28 September 1985
39
+ bell pepper, broccoli, celery, corn, green beans, lettuce, sweet potatoes, zucchini
40
+ 4
41
+
42
+ You must be accurate, efficient, and concise. Begin.
43
+ """
44
+
45
+ # Reflection prompt for the LLM to self-assess its answer
46
+ reflection_prompt = """
47
+ You are a reflective quality control agent.
48
+
49
+ Your task is to verify if the assistant's answer is a correct and complete response to the user question.
50
+ You will think carefully before responding.
51
+
52
+ Instructions:
53
+ 1. Analyze the user question: What is being asked? Are there specific formats or constraints? (e.g. one-line, IOC code, alphabetical order, names only, no explanations)
54
+ 2. Evaluate the assistant's answer: Does it answer the core question faithfully and clearly? Is it concise, accurate, and in the required format?
55
+ 3. Reflect: If the answer is already optimal, return it unchanged.
56
+ 4. If the answer has issues (wrong content, incomplete reasoning, extra text, wrong format, etc.), fix it. You may use reasoning, assumptions, or clarification based on context.
57
+
58
+ Respond with ONLY the improved answer (if changed), or the original if it's already optimal.
59
+
60
+ Begin.
61
+ """
62
+
63
+ # Tools
64
+ @tool
65
+ def web_search(query: str) -> Dict[str, str]:
66
+ """Search the web for information."""
67
+ results = TavilySearchResults(max_results=3).run(query)
68
+ docs = "\n".join([doc["content"] for doc in results])
69
+ return {"web_results": docs}
70
+
71
+ TOOLS = [web_search]
72
+
73
+ # Agent state
74
+ class AgentState(TypedDict):
75
+ messages: Annotated[List[AnyMessage], add_messages]
76
+
77
+ # LLMs
78
+ llm = ChatOpenAI(model="gpt-4", temperature=0)
79
+ llm_with_tools = llm.bind_tools(TOOLS)
80
+
81
+ # Assistant node
82
+ def assistant(state: AgentState) -> Dict[str, Any]:
83
+ result = llm_with_tools.invoke(state["messages"])
84
+ if isinstance(result, AIMessage) and result.usage_metadata is None:
85
+ result.usage_metadata = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
86
+ return {"messages": [result]}
87
+
88
+ # Reflection agent
89
+ def reflect_answer(question: str, answer: str) -> str:
90
+ reflector = llm.with_config({"tags": ["reflection"]})
91
+ input_messages = [
92
+ SystemMessage(content=reflection_prompt),
93
+ HumanMessage(content=f"Q: {question}\nAssistant's Answer: {answer}")
94
+ ]
95
+ reflection_result = reflector.invoke(input_messages)
96
+ return reflection_result.content.strip()
97
+
98
+ # Build LangGraph
99
+ builder = StateGraph(AgentState)
100
+ builder.add_node("assistant", assistant)
101
+ builder.add_node("tools", ToolNode(TOOLS))
102
+
103
+ builder.set_entry_point("assistant")
104
+
105
+ builder.add_conditional_edges(
106
+ "assistant",
107
+ tools_condition,
108
+ {
109
+ "tools": "tools",
110
+ END: END
111
+ }
112
+ )
113
+ builder.add_edge("tools", "assistant")
114
+ graph = builder.compile()
115
+
116
+ # Evaluation helpers
117
+ def similarity_score(a: str, b: str) -> float:
118
+ return round(SequenceMatcher(None, a.strip().lower(), b.strip().lower()).ratio(), 2)
119
+
120
+ # Questions + Ground Truths
121
+ qa_pairs = [
122
+ {
123
+ "q": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
124
+ "gt": "Louvrier"
125
+ },
126
+ {
127
+ "q": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
128
+ "gt": "Wojciech"
129
+ },
130
+ {
131
+ "q": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
132
+ "gt": "LUX"
133
+ }
134
+ ]
135
+
136
+ # Run evaluation
137
+ print("\nπŸ“Š Evaluating QA Agent\n")
138
+
139
+ for idx, qa in enumerate(qa_pairs, 1):
140
+ question = qa["q"]
141
+ ground_truth = qa["gt"]
142
+
143
+ print(f"πŸ”Ή Q{idx}: {question}")
144
+
145
+ with trace_run(name=f"GAIA-Q{idx}", tags=["gaia", "reflection", "evaluation"]):
146
+ try:
147
+ result = graph.invoke({"messages": [HumanMessage(content=question)]})
148
+ raw_answer = result["messages"][-1].content.strip()
149
+ reflected = reflect_answer(question, raw_answer)
150
+ score = similarity_score(reflected, ground_truth)
151
+ verdict = "βœ…" if score == 1.0 else "❌"
152
+ print(f"{verdict} A{idx}: {reflected} | GT: {ground_truth} | Similarity: {score}\n")
153
+ except Exception as e:
154
+ print(f"❌ A{idx} ERROR: {e}\n")
155
+
test_gaia_questions.py CHANGED
@@ -1,7 +1,7 @@
1
  # test_gaia_questions.py
2
 
3
  import requests
4
- from gaia_new import graph
5
 
6
  def test_with_real_gaia_questions():
7
  # Fetch questions directly from the benchmark API
 
1
  # test_gaia_questions.py
2
 
3
  import requests
4
+ from langgraph_agents import graph
5
 
6
  def test_with_real_gaia_questions():
7
  # Fetch questions directly from the benchmark API