Spaces:
Sleeping
Sleeping
new version with reflection
Browse files- app.py +89 -140
- app_prior.py +0 -116
- gaia_graph.py +0 -116
- gaia_graph_legacy.py +0 -188
- langgraph_agents.py +155 -0
- test_gaia_questions.py +1 -1
app.py
CHANGED
|
@@ -1,167 +1,116 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
-
import
|
| 5 |
import requests
|
| 6 |
import pandas as pd
|
| 7 |
import asyncio
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from dotenv import load_dotenv
|
| 13 |
-
|
| 14 |
-
load_dotenv()
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
openai_client = OpenAI(api_key=OPENAI_API_KEY)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
try:
|
| 29 |
-
# safe eval
|
| 30 |
-
return str(eval(expr, {}, {}))
|
| 31 |
-
except Exception as e:
|
| 32 |
-
return f"Error: {e}"
|
| 33 |
|
| 34 |
-
|
| 35 |
try:
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
return "No results found."
|
| 40 |
-
# grab up to 3 titles/snippets
|
| 41 |
-
snippets = []
|
| 42 |
-
for r in results[:3]:
|
| 43 |
-
snippets.append(r.get("title") or r.get("snippet") or "")
|
| 44 |
-
return " | ".join(snippets)
|
| 45 |
except Exception as e:
|
| 46 |
-
return f"
|
| 47 |
-
|
| 48 |
-
functions = [
|
| 49 |
-
{
|
| 50 |
-
"name": "calculator",
|
| 51 |
-
"description": "Evaluate a math expression. Returns the result as a string.",
|
| 52 |
-
"parameters": {
|
| 53 |
-
"type": "object",
|
| 54 |
-
"properties": {
|
| 55 |
-
"expr": {"type": "string", "description": "Math expression to evaluate"}
|
| 56 |
-
},
|
| 57 |
-
"required": ["expr"],
|
| 58 |
-
},
|
| 59 |
-
},
|
| 60 |
-
{
|
| 61 |
-
"name": "search",
|
| 62 |
-
"description": "Look up facts on the web via Tavily; return up to three summaries separated by ' | '.",
|
| 63 |
-
"parameters": {
|
| 64 |
-
"type": "object",
|
| 65 |
-
"properties": {
|
| 66 |
-
"query": {"type": "string", "description": "The search query"}
|
| 67 |
-
},
|
| 68 |
-
"required": ["query"],
|
| 69 |
-
},
|
| 70 |
-
},
|
| 71 |
-
]
|
| 72 |
-
tool_map = {"calculator": calculator, "search": search}
|
| 73 |
-
|
| 74 |
-
# ─── 4) The ReAct loop ───────────────────────────────────────────────────────────
|
| 75 |
-
def run_react(question: str) -> str:
|
| 76 |
-
messages = [{"role": "user", "content": question}]
|
| 77 |
-
while True:
|
| 78 |
-
resp = openai_client.chat.completions.create(
|
| 79 |
-
model="gpt-4o-mini", # free-tier “mini” model
|
| 80 |
-
messages=messages,
|
| 81 |
-
functions=functions,
|
| 82 |
-
function_call="auto",
|
| 83 |
-
)
|
| 84 |
-
msg = resp.choices[0].message
|
| 85 |
-
|
| 86 |
-
# if the model wants to call a tool:
|
| 87 |
-
if msg.function_call:
|
| 88 |
-
name = msg.function_call.name
|
| 89 |
-
args = json.loads(msg.function_call.arguments)
|
| 90 |
-
output = tool_map[name](**args)
|
| 91 |
-
# feed both the assistant's call and the tool's result back into the loop
|
| 92 |
-
messages.append({
|
| 93 |
-
"role": "assistant",
|
| 94 |
-
"content": None,
|
| 95 |
-
"function_call": msg.function_call.to_dict()
|
| 96 |
-
})
|
| 97 |
-
messages.append({
|
| 98 |
-
"role": "function",
|
| 99 |
-
"name": name,
|
| 100 |
-
"content": output
|
| 101 |
-
})
|
| 102 |
-
else:
|
| 103 |
-
# final answer
|
| 104 |
-
return msg.content.strip()
|
| 105 |
-
|
| 106 |
-
# ─── 5) Gradio / GAIA integration ────────────────────────────────────────────────
|
| 107 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 108 |
-
_cache = {}
|
| 109 |
|
| 110 |
-
|
| 111 |
-
def
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
return "Please login.", None
|
| 117 |
-
user = profile.username
|
| 118 |
-
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
|
| 119 |
-
data = resp.json()
|
| 120 |
-
agent = GaiaAgent()
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
return {
|
| 125 |
-
"task_id": item["task_id"],
|
| 126 |
-
"question": item["question"],
|
| 127 |
-
"submitted_answer": ans
|
| 128 |
-
}
|
| 129 |
|
| 130 |
-
results = await asyncio.gather(*(proc(it) for it in data))
|
| 131 |
-
_cache[user] = results
|
| 132 |
-
return f"Answered {len(results)} questions.", pd.DataFrame(results)
|
| 133 |
|
| 134 |
def submit_answers(profile: gr.OAuthProfile | None):
|
| 135 |
if not profile:
|
| 136 |
-
return "Please login.", None
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
| 143 |
]
|
|
|
|
| 144 |
space_id = os.getenv("SPACE_ID", "")
|
| 145 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
|
| 146 |
-
|
| 147 |
-
r = requests.post(f"{DEFAULT_API_URL}/submit", json=body, timeout=60)
|
| 148 |
-
r.raise_for_status()
|
| 149 |
-
res = r.json()
|
| 150 |
-
msg = (
|
| 151 |
-
f"Score: {res.get('score')}% "
|
| 152 |
-
f"({res.get('correct_count')}/{res.get('total_attempted')})"
|
| 153 |
-
)
|
| 154 |
-
return msg, pd.DataFrame(_cache[user])
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
with gr.Blocks() as demo:
|
| 157 |
-
gr.Markdown("# 🧠 GAIA
|
| 158 |
gr.LoginButton()
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
if __name__ == "__main__":
|
|
|
|
| 167 |
demo.launch(debug=True, share=False)
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import gradio as gr
|
| 3 |
import requests
|
| 4 |
import pandas as pd
|
| 5 |
import asyncio
|
| 6 |
+
from gaia_new import graph # Use your agent
|
| 7 |
+
from typing import Optional
|
| 8 |
|
| 9 |
+
# Constants
|
| 10 |
+
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 11 |
+
user_answers_cache = {} # session-based cache
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
class GaiaAgent:
|
| 14 |
+
def __init__(self):
|
| 15 |
+
print("Graph-based agent initialized.")
|
|
|
|
| 16 |
|
| 17 |
+
def __call__(self, question: str) -> str:
|
| 18 |
+
print("Received question:", question)
|
| 19 |
+
state = {"question": question, "answer": ""}
|
| 20 |
+
try:
|
| 21 |
+
result = graph.invoke(state)
|
| 22 |
+
print("Result type:", type(result))
|
| 23 |
+
print("Result value:", result)
|
| 24 |
+
if isinstance(result, dict):
|
| 25 |
+
return result.get("answer", "No answer generated.")
|
| 26 |
+
else:
|
| 27 |
+
return f"Unexpected output from graph: {result}"
|
| 28 |
+
except Exception as e:
|
| 29 |
+
return f"ERROR invoking graph: {e}"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Async runner
|
| 33 |
+
async def run_agent(profile: gr.OAuthProfile | None):
|
| 34 |
+
if not profile:
|
| 35 |
+
return "Please login to Hugging Face.", None
|
| 36 |
|
| 37 |
+
username = profile.username
|
| 38 |
+
agent = GaiaAgent()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
# 1. Load questions
|
| 41 |
try:
|
| 42 |
+
response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
|
| 43 |
+
response.raise_for_status()
|
| 44 |
+
questions_data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as e:
|
| 46 |
+
return f"Error fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
# 2. Process questions
|
| 49 |
+
async def process(item):
|
| 50 |
+
task_id = item.get("task_id")
|
| 51 |
+
question = item.get("question")
|
| 52 |
+
try:
|
| 53 |
+
answer = await asyncio.to_thread(agent, question)
|
| 54 |
+
return {"task_id": task_id, "question": question, "submitted_answer": answer}
|
| 55 |
+
except Exception as e:
|
| 56 |
+
return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
|
| 57 |
|
| 58 |
+
results = await asyncio.gather(*(process(item) for item in questions_data))
|
| 59 |
+
user_answers_cache[username] = results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
df = pd.DataFrame(results)
|
| 62 |
+
return f"Answered {len(results)} questions. Ready to submit.", df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
def submit_answers(profile: gr.OAuthProfile | None):
|
| 66 |
if not profile:
|
| 67 |
+
return "Please login to Hugging Face.", None
|
| 68 |
+
|
| 69 |
+
username = profile.username.strip()
|
| 70 |
+
if username not in user_answers_cache:
|
| 71 |
+
return "No cached answers. Please run the agent first.", None
|
| 72 |
+
|
| 73 |
+
answers_payload = [
|
| 74 |
+
{"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
|
| 75 |
+
for item in user_answers_cache[username]
|
| 76 |
]
|
| 77 |
+
|
| 78 |
space_id = os.getenv("SPACE_ID", "")
|
| 79 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
|
| 80 |
+
submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
# 3. Submit to scoring API
|
| 83 |
+
try:
|
| 84 |
+
response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
|
| 85 |
+
response.raise_for_status()
|
| 86 |
+
result = response.json()
|
| 87 |
+
final_status = (
|
| 88 |
+
f"✅ Submission Successful!\n"
|
| 89 |
+
f"👤 User: {result.get('username')}\n"
|
| 90 |
+
f"🎯 Score: {result.get('score', 'N/A')}% "
|
| 91 |
+
f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
|
| 92 |
+
f"📩 Message: {result.get('message', 'No message received.')}"
|
| 93 |
+
)
|
| 94 |
+
df = pd.DataFrame(user_answers_cache[username])
|
| 95 |
+
return final_status, df
|
| 96 |
+
except Exception as e:
|
| 97 |
+
return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ────────── Gradio UI ──────────
|
| 101 |
with gr.Blocks() as demo:
|
| 102 |
+
gr.Markdown("# 🧠 GAIA Agent Evaluation")
|
| 103 |
gr.LoginButton()
|
| 104 |
+
|
| 105 |
+
run_button = gr.Button("▶️ Run Agent on GAIA Questions")
|
| 106 |
+
submit_button = gr.Button("📤 Submit Cached Answers")
|
| 107 |
+
|
| 108 |
+
status = gr.Textbox(label="Status", lines=6, interactive=False)
|
| 109 |
+
results = gr.DataFrame(label="Answers", wrap=True)
|
| 110 |
+
|
| 111 |
+
run_button.click(run_agent, outputs=[status, results])
|
| 112 |
+
submit_button.click(submit_answers, outputs=[status, results])
|
| 113 |
|
| 114 |
if __name__ == "__main__":
|
| 115 |
+
print("Launching Gradio app...")
|
| 116 |
demo.launch(debug=True, share=False)
|
app_prior.py
DELETED
|
@@ -1,116 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import gradio as gr
|
| 3 |
-
import requests
|
| 4 |
-
import pandas as pd
|
| 5 |
-
import asyncio
|
| 6 |
-
from gaia_new import graph # Use your agent
|
| 7 |
-
from typing import Optional
|
| 8 |
-
|
| 9 |
-
# Constants
|
| 10 |
-
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 11 |
-
user_answers_cache = {} # session-based cache
|
| 12 |
-
|
| 13 |
-
class GaiaAgent:
|
| 14 |
-
def __init__(self):
|
| 15 |
-
print("Graph-based agent initialized.")
|
| 16 |
-
|
| 17 |
-
def __call__(self, question: str) -> str:
|
| 18 |
-
print("Received question:", question)
|
| 19 |
-
state = {"question": question, "answer": ""}
|
| 20 |
-
try:
|
| 21 |
-
result = graph.invoke(state)
|
| 22 |
-
print("Result type:", type(result))
|
| 23 |
-
print("Result value:", result)
|
| 24 |
-
if isinstance(result, dict):
|
| 25 |
-
return result.get("answer", "No answer generated.")
|
| 26 |
-
else:
|
| 27 |
-
return f"Unexpected output from graph: {result}"
|
| 28 |
-
except Exception as e:
|
| 29 |
-
return f"ERROR invoking graph: {e}"
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
# Async runner
|
| 33 |
-
async def run_agent(profile: gr.OAuthProfile | None):
|
| 34 |
-
if not profile:
|
| 35 |
-
return "Please login to Hugging Face.", None
|
| 36 |
-
|
| 37 |
-
username = profile.username
|
| 38 |
-
agent = GaiaAgent()
|
| 39 |
-
|
| 40 |
-
# 1. Load questions
|
| 41 |
-
try:
|
| 42 |
-
response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=10)
|
| 43 |
-
response.raise_for_status()
|
| 44 |
-
questions_data = response.json()
|
| 45 |
-
except Exception as e:
|
| 46 |
-
return f"Error fetching questions: {e}", None
|
| 47 |
-
|
| 48 |
-
# 2. Process questions
|
| 49 |
-
async def process(item):
|
| 50 |
-
task_id = item.get("task_id")
|
| 51 |
-
question = item.get("question")
|
| 52 |
-
try:
|
| 53 |
-
answer = await asyncio.to_thread(agent, question)
|
| 54 |
-
return {"task_id": task_id, "question": question, "submitted_answer": answer}
|
| 55 |
-
except Exception as e:
|
| 56 |
-
return {"task_id": task_id, "question": question, "submitted_answer": f"ERROR: {e}"}
|
| 57 |
-
|
| 58 |
-
results = await asyncio.gather(*(process(item) for item in questions_data))
|
| 59 |
-
user_answers_cache[username] = results
|
| 60 |
-
|
| 61 |
-
df = pd.DataFrame(results)
|
| 62 |
-
return f"Answered {len(results)} questions. Ready to submit.", df
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def submit_answers(profile: gr.OAuthProfile | None):
|
| 66 |
-
if not profile:
|
| 67 |
-
return "Please login to Hugging Face.", None
|
| 68 |
-
|
| 69 |
-
username = profile.username.strip()
|
| 70 |
-
if username not in user_answers_cache:
|
| 71 |
-
return "No cached answers. Please run the agent first.", None
|
| 72 |
-
|
| 73 |
-
answers_payload = [
|
| 74 |
-
{"task_id": item["task_id"], "submitted_answer": item["submitted_answer"]}
|
| 75 |
-
for item in user_answers_cache[username]
|
| 76 |
-
]
|
| 77 |
-
|
| 78 |
-
space_id = os.getenv("SPACE_ID", "")
|
| 79 |
-
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
|
| 80 |
-
submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
|
| 81 |
-
|
| 82 |
-
# 3. Submit to scoring API
|
| 83 |
-
try:
|
| 84 |
-
response = requests.post(f"{DEFAULT_API_URL}/submit", json=submission_data, timeout=60)
|
| 85 |
-
response.raise_for_status()
|
| 86 |
-
result = response.json()
|
| 87 |
-
final_status = (
|
| 88 |
-
f"✅ Submission Successful!\n"
|
| 89 |
-
f"👤 User: {result.get('username')}\n"
|
| 90 |
-
f"🎯 Score: {result.get('score', 'N/A')}% "
|
| 91 |
-
f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
|
| 92 |
-
f"📩 Message: {result.get('message', 'No message received.')}"
|
| 93 |
-
)
|
| 94 |
-
df = pd.DataFrame(user_answers_cache[username])
|
| 95 |
-
return final_status, df
|
| 96 |
-
except Exception as e:
|
| 97 |
-
return f"❌ Submission failed: {e}", pd.DataFrame(user_answers_cache[username])
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
# ────────── Gradio UI ──────────
|
| 101 |
-
with gr.Blocks() as demo:
|
| 102 |
-
gr.Markdown("# 🧠 GAIA Agent Evaluation")
|
| 103 |
-
gr.LoginButton()
|
| 104 |
-
|
| 105 |
-
run_button = gr.Button("▶️ Run Agent on GAIA Questions")
|
| 106 |
-
submit_button = gr.Button("📤 Submit Cached Answers")
|
| 107 |
-
|
| 108 |
-
status = gr.Textbox(label="Status", lines=6, interactive=False)
|
| 109 |
-
results = gr.DataFrame(label="Answers", wrap=True)
|
| 110 |
-
|
| 111 |
-
run_button.click(run_agent, outputs=[status, results])
|
| 112 |
-
submit_button.click(submit_answers, outputs=[status, results])
|
| 113 |
-
|
| 114 |
-
if __name__ == "__main__":
|
| 115 |
-
print("Launching Gradio app...")
|
| 116 |
-
demo.launch(debug=True, share=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_graph.py
DELETED
|
@@ -1,116 +0,0 @@
|
|
| 1 |
-
# gaia_graph.py
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import ast
|
| 5 |
-
import operator
|
| 6 |
-
from typing import TypedDict
|
| 7 |
-
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
-
from langchain.tools import Tool
|
| 10 |
-
from langchain.agents import initialize_agent, AgentType
|
| 11 |
-
from langchain_openai import ChatOpenAI
|
| 12 |
-
from langgraph.graph import StateGraph, END
|
| 13 |
-
|
| 14 |
-
# ─── Load Environment Variables ──────────────────────────────────────────────
|
| 15 |
-
load_dotenv()
|
| 16 |
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 17 |
-
assert OPENAI_API_KEY, "OPENAI_API_KEY is not set"
|
| 18 |
-
|
| 19 |
-
# ─── Define Calculator Tool ──────────────────────────────────────────────────
|
| 20 |
-
def safe_eval(expr: str) -> str:
|
| 21 |
-
ops = {
|
| 22 |
-
ast.Add: operator.add,
|
| 23 |
-
ast.Sub: operator.sub,
|
| 24 |
-
ast.Mult: operator.mul,
|
| 25 |
-
ast.Div: operator.truediv,
|
| 26 |
-
ast.Pow: operator.pow,
|
| 27 |
-
ast.USub: operator.neg,
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
-
def _eval(node):
|
| 31 |
-
if isinstance(node, ast.Constant):
|
| 32 |
-
return node.value
|
| 33 |
-
if isinstance(node, ast.BinOp):
|
| 34 |
-
return ops[type(node.op)](_eval(node.left), _eval(node.right))
|
| 35 |
-
if isinstance(node, ast.UnaryOp):
|
| 36 |
-
return ops[type(node.op)](_eval(node.operand))
|
| 37 |
-
raise TypeError(f"Unsupported AST node: {node!r}")
|
| 38 |
-
|
| 39 |
-
try:
|
| 40 |
-
node = ast.parse(expr, mode="eval").body
|
| 41 |
-
return str(_eval(node))
|
| 42 |
-
except Exception as e:
|
| 43 |
-
return f"Error: {e}"
|
| 44 |
-
|
| 45 |
-
calculator_tool = Tool(
|
| 46 |
-
name="calculator",
|
| 47 |
-
func=safe_eval,
|
| 48 |
-
description="Evaluate basic math expressions. Input: a math string like '2 + 2'. Output: the result.",
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
# ─── Define Search Tool using Tavily ─────────────────────────────────────────
|
| 52 |
-
from tavily import TavilyClient
|
| 53 |
-
|
| 54 |
-
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
| 55 |
-
assert TAVILY_API_KEY, "TAVILY_API_KEY environment variable is not set"
|
| 56 |
-
|
| 57 |
-
tavily = TavilyClient(api_key=TAVILY_API_KEY)
|
| 58 |
-
|
| 59 |
-
def search_tool_fn(query: str) -> str:
|
| 60 |
-
try:
|
| 61 |
-
resp = tavily.search(query)
|
| 62 |
-
results = resp.get("results", [])
|
| 63 |
-
if not results:
|
| 64 |
-
return "No results found."
|
| 65 |
-
return results[0].get("title") or results[0].get("snippet") or "No snippet."
|
| 66 |
-
except Exception as e:
|
| 67 |
-
return f"Search error: {e}"
|
| 68 |
-
|
| 69 |
-
search_tool = Tool(
|
| 70 |
-
name="search",
|
| 71 |
-
func=search_tool_fn,
|
| 72 |
-
description="Useful for answering factual questions using a search engine.",
|
| 73 |
-
)
|
| 74 |
-
|
| 75 |
-
# ─── Create LLM Agent ────────────────────────────────────────────────────────
|
| 76 |
-
llm = ChatOpenAI(
|
| 77 |
-
temperature=0.0,
|
| 78 |
-
model="gpt-4o-mini",
|
| 79 |
-
openai_api_key=OPENAI_API_KEY
|
| 80 |
-
)
|
| 81 |
-
|
| 82 |
-
agent_executor = initialize_agent(
|
| 83 |
-
tools=[calculator_tool, search_tool],
|
| 84 |
-
llm=llm,
|
| 85 |
-
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
| 86 |
-
verbose=False,
|
| 87 |
-
handle_parsing_errors=True,
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
# ─── Clean Output ────────────────────────────────────────────────────────────
|
| 91 |
-
def clean_answer(ans: str) -> str:
|
| 92 |
-
if "```" in ans:
|
| 93 |
-
ans = ans.split("```")[-1]
|
| 94 |
-
if "Answer:" in ans:
|
| 95 |
-
return ans.split("Answer:")[-1].strip()
|
| 96 |
-
if "β" in ans:
|
| 97 |
-
return ans.split("β")[-1].strip()
|
| 98 |
-
return ans.strip()
|
| 99 |
-
|
| 100 |
-
# ─── Define State ────────────────────────────────────────────────────────────
|
| 101 |
-
class GaiaState(TypedDict):
|
| 102 |
-
question: str
|
| 103 |
-
answer: str
|
| 104 |
-
|
| 105 |
-
# ─── Define Node Function ────────────────────────────────────────────────────
|
| 106 |
-
def agent_node(state: GaiaState) -> GaiaState:
|
| 107 |
-
raw = agent_executor.run(state["question"])
|
| 108 |
-
return {"question": state["question"], "answer": clean_answer(raw)}
|
| 109 |
-
|
| 110 |
-
# ─── Build LangGraph ─────────────────────────────────────────────────────────
|
| 111 |
-
builder = StateGraph(GaiaState)
|
| 112 |
-
builder.add_node("agent", agent_node)
|
| 113 |
-
builder.set_entry_point("agent")
|
| 114 |
-
builder.set_finish_point("agent")
|
| 115 |
-
|
| 116 |
-
graph = builder.compile()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gaia_graph_legacy.py
DELETED
|
@@ -1,188 +0,0 @@
|
|
| 1 |
-
# gaia_graph.py
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import yaml
|
| 6 |
-
from typing import TypedDict
|
| 7 |
-
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
-
from transformers import pipeline
|
| 10 |
-
from langchain_huggingface import HuggingFacePipeline
|
| 11 |
-
from langchain_core.tools.structured import StructuredTool
|
| 12 |
-
from langgraph.graph import StateGraph, START, END
|
| 13 |
-
from langgraph.prebuilt.chat_agent_executor import create_react_agent
|
| 14 |
-
|
| 15 |
-
#
|
| 16 |
-
# ─── 1) LOAD ENVIRONMENT VARIABLES ──────────────────────────────────────────────
|
| 17 |
-
#
|
| 18 |
-
# Make sure you have a valid HF token in your shell or .env:
|
| 19 |
-
# export HUGGINGFACE_API_TOKEN="<your token>"
|
| 20 |
-
load_dotenv()
|
| 21 |
-
HF_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
|
| 22 |
-
assert HF_TOKEN, "Please set HUGGINGFACE_API_TOKEN in your environment or .env."
|
| 23 |
-
|
| 24 |
-
#
|
| 25 |
-
# ─── 2) LOAD config.yaml ─────────────────────────────────────────────────────────
|
| 26 |
-
#
|
| 27 |
-
# Expect config.yaml with:
|
| 28 |
-
# tavily_api_key: "<your Tavily key>"
|
| 29 |
-
# huggingface_api_token: "<your HF token>" (optional duplication)
|
| 30 |
-
with open("config.yaml", "r") as f:
|
| 31 |
-
cfg = yaml.safe_load(f)
|
| 32 |
-
|
| 33 |
-
TAVILY_API_KEY = cfg.get("tavily_api_key")
|
| 34 |
-
assert TAVILY_API_KEY, "Put your Tavily key under 'tavily_api_key' in config.yaml."
|
| 35 |
-
|
| 36 |
-
#
|
| 37 |
-
# ─── 3) DEFINE “TOOL” WRAPPERS ────────────────────────────────────────────────────
|
| 38 |
-
#
|
| 39 |
-
|
| 40 |
-
# 3a) Calculator (a “safe eval” of simple expressions)
|
| 41 |
-
def _safe_eval(expr: str) -> str:
|
| 42 |
-
import ast, operator
|
| 43 |
-
|
| 44 |
-
ops = {
|
| 45 |
-
ast.Add: operator.add,
|
| 46 |
-
ast.Sub: operator.sub,
|
| 47 |
-
ast.Mult: operator.mul,
|
| 48 |
-
ast.Div: operator.truediv,
|
| 49 |
-
ast.Pow: operator.pow,
|
| 50 |
-
ast.USub: operator.neg,
|
| 51 |
-
}
|
| 52 |
-
|
| 53 |
-
def _eval(node):
|
| 54 |
-
if isinstance(node, ast.Constant):
|
| 55 |
-
return node.n
|
| 56 |
-
elif isinstance(node, ast.BinOp):
|
| 57 |
-
return ops[type(node.op)](_eval(node.left), _eval(node.right))
|
| 58 |
-
elif isinstance(node, ast.UnaryOp):
|
| 59 |
-
return ops[type(node.op)](_eval(node.operand))
|
| 60 |
-
else:
|
| 61 |
-
raise TypeError(f"Unsupported AST node: {node}")
|
| 62 |
-
|
| 63 |
-
node = ast.parse(expr, mode="eval").body
|
| 64 |
-
return str(_eval(node))
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
def _calculator_tool(text: str) -> str:
|
| 68 |
-
try:
|
| 69 |
-
return _safe_eval(text)
|
| 70 |
-
except Exception as e:
|
| 71 |
-
return f"Error evaluating expression: {e}"
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
calculator_tool = StructuredTool.from_function(
|
| 75 |
-
func=_calculator_tool,
|
| 76 |
-
name="calculator",
|
| 77 |
-
description="Evaluate simple arithmetic expressions; return the numeric result as a string.",
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
# 3b) Tavily-based search
|
| 81 |
-
from tavily import TavilyClient
|
| 82 |
-
|
| 83 |
-
class _TavilySearch:
|
| 84 |
-
def __init__(self, api_key: str):
|
| 85 |
-
self.client = TavilyClient(api_key=api_key)
|
| 86 |
-
|
| 87 |
-
def __call__(self, query: str) -> str:
|
| 88 |
-
resp = self.client.search(query)
|
| 89 |
-
results = resp.get("results", [])
|
| 90 |
-
if not results:
|
| 91 |
-
return "No results found."
|
| 92 |
-
snippets = []
|
| 93 |
-
for r in results[:3]:
|
| 94 |
-
title = r.get("title")
|
| 95 |
-
snippet = r.get("snippet")
|
| 96 |
-
if title:
|
| 97 |
-
snippets.append(title)
|
| 98 |
-
elif snippet:
|
| 99 |
-
snippets.append(snippet)
|
| 100 |
-
return " | ".join(snippets)
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
_tavily_search = _TavilySearch(api_key=TAVILY_API_KEY)
|
| 104 |
-
|
| 105 |
-
# Note: pass the instanceβs __call__, not the instance itself.
|
| 106 |
-
search_tool = StructuredTool.from_function(
|
| 107 |
-
func=_tavily_search.__call__,
|
| 108 |
-
name="search",
|
| 109 |
-
description="Look up facts via Tavily; return up to three summaries joined by ' | '.",
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
-
TOOLS = [calculator_tool, search_tool]
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
#
|
| 116 |
-
# ─── 4) PRELOAD A FREE HF MODEL & WRAP IT AS HuggingFacePipeline ───────────────────
|
| 117 |
-
#
|
| 118 |
-
# We choose “google/flan-t5-small” (free, CPU-friendly). Load as a text2text pipeline:
|
| 119 |
-
hf_gen = pipeline(
|
| 120 |
-
"text2text-generation",
|
| 121 |
-
model="google/flan-t5-small",
|
| 122 |
-
device=-1, # CPU only
|
| 123 |
-
max_new_tokens=128,
|
| 124 |
-
do_sample=False, # greedy
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
# Now wrap that pipeline into a HuggingFacePipeline LLM.
|
| 128 |
-
# (No API token needed here for a local “google/flan-t5-small”)
|
| 129 |
-
llm = HuggingFacePipeline(pipeline=hf_gen)
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
#
|
| 133 |
-
# ─── 5) CREATE A LANGGRAPH ReAct AGENT ─────────────────────────────────────────────
|
| 134 |
-
#
|
| 135 |
-
# This `create_react_agent` will add the Thought/Action/Observation framing
|
| 136 |
-
# so that the LLM can call “calculator” or “search” as needed,
|
| 137 |
-
# and then eventually emit “Final Answer: …”.
|
| 138 |
-
#
|
| 139 |
-
react_agent = create_react_agent(
|
| 140 |
-
llm=llm,
|
| 141 |
-
tools=TOOLS,
|
| 142 |
-
max_iterations=3,
|
| 143 |
-
verbose=False,
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
#
|
| 148 |
-
# ─── 6) DEFINE STATE SCHEMA & SINGLE GRAPH NODE ─────────────────────────────────
|
| 149 |
-
#
|
| 150 |
-
class AgentState(TypedDict):
|
| 151 |
-
question: str
|
| 152 |
-
tool_output: str # (ignored by ReAct, but must exist)
|
| 153 |
-
final_answer: str
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
def AgentNode(state: AgentState) -> AgentState:
|
| 157 |
-
q = state["question"].strip()
|
| 158 |
-
# Invoke the internal ReAct loop:
|
| 159 |
-
answer = react_agent.invoke(q).strip()
|
| 160 |
-
state["final_answer"] = answer
|
| 161 |
-
return state
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
#
|
| 165 |
-
# ─── 7) WIRE UP THE LANGGRAPH ─────────────────────────────────────────────────────
|
| 166 |
-
#
|
| 167 |
-
builder = StateGraph(AgentState)
|
| 168 |
-
builder.set_entry_point("AgentNode")
|
| 169 |
-
builder.add_node("AgentNode", AgentNode)
|
| 170 |
-
builder.add_edge(START, "AgentNode")
|
| 171 |
-
builder.add_edge("AgentNode", END)
|
| 172 |
-
|
| 173 |
-
graph = builder.compile()
|
| 174 |
-
|
| 175 |
-
#
|
| 176 |
-
# ─── 8) SMOKE TESTS ───────────────────────────────────────────────────────────────
|
| 177 |
-
#
|
| 178 |
-
if __name__ == "__main__":
|
| 179 |
-
print("Device set to use CPU\n")
|
| 180 |
-
tests = [
|
| 181 |
-
"How much is 2 + 2",
|
| 182 |
-
"What is the capital of France?",
|
| 183 |
-
"Which country had the fewest athletes at the 1928 Olympics? Give the IOC code."
|
| 184 |
-
]
|
| 185 |
-
for q in tests:
|
| 186 |
-
state = {"question": q, "tool_output": "", "final_answer": ""}
|
| 187 |
-
out = graph.invoke(state)
|
| 188 |
-
print(f"Q: {q!r}\nβ A: {out['final_answer']!r}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
langgraph_agents.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from typing import TypedDict, Annotated, List, Dict, Any
|
| 4 |
+
|
| 5 |
+
from langchain_openai import ChatOpenAI
|
| 6 |
+
from langchain_core.tools import tool
|
| 7 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 8 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, AnyMessage
|
| 9 |
+
from langchain_core.messages.ai import subtract_usage
|
| 10 |
+
from langsmith.run_helpers import trace_run
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
from langgraph.graph import StateGraph, MessagesState, START, END
|
| 14 |
+
from langgraph.graph.message import add_messages
|
| 15 |
+
from langgraph.prebuilt import ToolNode, tools_condition
|
| 16 |
+
|
| 17 |
+
from difflib import SequenceMatcher
|
| 18 |
+
|
| 19 |
+
load_dotenv()
|
| 20 |
+
|
| 21 |
+
# System prompt for assistant
|
| 22 |
+
system_prompt = """
|
| 23 |
+
You are a high-performance question-answering agent. Your job is to answer each question using the available tools (web search, Wikipedia, ArXiv, etc.) when necessary. Always return a clear and concise one-line answer. Never explain, disclaim, or use phrases like "I'm sorry", "I cannot", or "as an AI language model".
|
| 24 |
+
|
| 25 |
+
Your responses must follow these strict rules:
|
| 26 |
+
|
| 27 |
+
1. Only output the **final answer**, as a single line. No preamble, no reasoning, no markdown.
|
| 28 |
+
2. If a tool can help, invoke it to retrieve relevant information. Use tools assertively when the answer is not already cached or obvious.
|
| 29 |
+
3. If a question contains a known URL or document reference, try to infer or search its content based on what is available.
|
| 30 |
+
4. When dealing with long or obscure questions (e.g., academic papers, dataset entries, etc.), extract just the answer — even if based on partial context.
|
| 31 |
+
5. Never refuse to answer. Make your best informed guess based on the tools, data, and context available.
|
| 32 |
+
6. Repeat answers for duplicate questions.
|
| 33 |
+
7. If the question requires extracting a list or name, return the **bare** list or name, alphabetized if requested.
|
| 34 |
+
|
| 35 |
+
### Examples of valid answers:
|
| 36 |
+
LUX
|
| 37 |
+
Paris
|
| 38 |
+
28 September 1985
|
| 39 |
+
bell pepper, broccoli, celery, corn, green beans, lettuce, sweet potatoes, zucchini
|
| 40 |
+
4
|
| 41 |
+
|
| 42 |
+
You must be accurate, efficient, and concise. Begin.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
# Reflection prompt for the LLM to self-assess its answer
|
| 46 |
+
reflection_prompt = """
|
| 47 |
+
You are a reflective quality control agent.
|
| 48 |
+
|
| 49 |
+
Your task is to verify if the assistant's answer is a correct and complete response to the user question.
|
| 50 |
+
You will think carefully before responding.
|
| 51 |
+
|
| 52 |
+
Instructions:
|
| 53 |
+
1. Analyze the user question: What is being asked? Are there specific formats or constraints? (e.g. one-line, IOC code, alphabetical order, names only, no explanations)
|
| 54 |
+
2. Evaluate the assistant's answer: Does it answer the core question faithfully and clearly? Is it concise, accurate, and in the required format?
|
| 55 |
+
3. Reflect: If the answer is already optimal, return it unchanged.
|
| 56 |
+
4. If the answer has issues (wrong content, incomplete reasoning, extra text, wrong format, etc.), fix it. You may use reasoning, assumptions, or clarification based on context.
|
| 57 |
+
|
| 58 |
+
Respond with ONLY the improved answer (if changed), or the original if it's already optimal.
|
| 59 |
+
|
| 60 |
+
Begin.
|
| 61 |
+
"""
|
| 62 |
+
|
| 63 |
+
# Tools
# NOTE: the docstring of a @tool function doubles as the tool description
# shown to the model, so it is intentionally left short and unchanged; the
# contract is documented here instead.
#
# web_search(query) runs a Tavily search capped at three hits and returns
# {"web_results": <newline-joined "content" of each hit>}.
@tool
def web_search(query: str) -> Dict[str, str]:
    """Search the web for information."""
    results = TavilySearchResults(max_results=3).run(query)

    # NOTE(review): the Tavily tool appears to hand back an error string
    # instead of a list of hits when the request fails — confirm against the
    # installed langchain_community version. Indexing such a payload with
    # ["content"] would raise an unhelpful TypeError, so pass it through as
    # text and let the model see what actually went wrong.
    if not isinstance(results, list):
        return {"web_results": str(results)}

    # Skip malformed hits rather than failing the whole search on one of them.
    docs = "\n".join(
        str(hit["content"])
        for hit in results
        if isinstance(hit, dict) and hit.get("content")
    )
    return {"web_results": docs}


TOOLS = [web_search]
|
| 72 |
+
|
| 73 |
+
# Agent state
|
| 74 |
+
class AgentState(TypedDict):
    """State passed between graph nodes: the running conversation."""

    # Annotated with the add_messages reducer; presumably node outputs are
    # merged into this list rather than replacing it — confirm in LangGraph docs.
    messages: Annotated[List[AnyMessage], add_messages]
|
| 76 |
+
|
| 77 |
+
# LLMs
|
| 78 |
+
# Base chat model shared by the assistant node and the reflection step.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
)

# Same model with the tool schemas attached, used by the assistant node.
llm_with_tools = llm.bind_tools(TOOLS)
|
| 80 |
+
|
| 81 |
+
# Assistant node
|
| 82 |
+
def assistant(state: AgentState) -> Dict[str, Any]:
    """Run one model turn over the conversation and return its reply.

    Args:
        state: Current graph state; only ``state["messages"]`` is read.

    Returns:
        A state update of the form ``{"messages": [reply]}``.
    """
    messages = list(state["messages"])

    # The module-level system_prompt was defined but never reached the model.
    # Prepend it for the call unless the caller already supplied its own
    # system message. It is not written back into the state, so it is added
    # fresh on every turn and never duplicated in the history.
    if not any(isinstance(msg, SystemMessage) for msg in messages):
        messages = [SystemMessage(content=system_prompt), *messages]

    result = llm_with_tools.invoke(messages)

    # Backfill zeroed usage counters when the provider reports none.
    # NOTE(review): presumably so downstream usage accounting never sees None — confirm.
    if isinstance(result, AIMessage) and result.usage_metadata is None:
        result.usage_metadata = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}

    return {"messages": [result]}
|
| 87 |
+
|
| 88 |
+
# Reflection agent
|
| 89 |
+
def reflect_answer(question: str, answer: str) -> str:
    """Ask the model to review an answer and return the reviewed text.

    Args:
        question: The original user question.
        answer: The assistant's candidate answer.

    Returns:
        The reflection model's reply with surrounding whitespace removed.
    """
    review_request = f"Q: {question}\nAssistant's Answer: {answer}"
    conversation = [
        SystemMessage(content=reflection_prompt),
        HumanMessage(content=review_request),
    ]
    # Tag the call so reflection runs can be told apart in traces.
    reviewed = llm.with_config({"tags": ["reflection"]}).invoke(conversation)
    return reviewed.content.strip()
|
| 97 |
+
|
| 98 |
+
# Build LangGraph
|
| 99 |
+
# Two-node loop: the assistant either finishes or requests a tool call, and
# every tool result is routed back to the assistant.
builder = StateGraph(AgentState)

builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(TOOLS))

# Execution starts at the assistant node.
builder.add_edge(START, "assistant")

# tools_condition picks the next hop from the assistant's latest message.
builder.add_conditional_edges(
    "assistant",
    tools_condition,
    {"tools": "tools", END: END},
)
builder.add_edge("tools", "assistant")

graph = builder.compile()
|
| 115 |
+
|
| 116 |
+
# Evaluation helpers
|
| 117 |
+
def similarity_score(a: str, b: str) -> float:
    """Return the similarity of two strings as a ratio rounded to 2 decimals.

    Both inputs are stripped and lower-cased before comparison, so the score
    ignores case and surrounding whitespace.
    """
    left = a.strip().lower()
    right = b.strip().lower()
    matcher = SequenceMatcher(None, left, right)
    return round(matcher.ratio(), 2)
|
| 119 |
+
|
| 120 |
+
# Questions + Ground Truths
|
| 121 |
+
# (question, ground truth) rows used by the evaluation run.
_QA_ROWS = (
    (
        "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
        "Louvrier",
    ),
    (
        "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
        "Wojciech",
    ),
    (
        "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
        "LUX",
    ),
)

# Each entry maps "q" to the question text and "gt" to the expected answer.
qa_pairs = [{"q": prompt, "gt": expected} for prompt, expected in _QA_ROWS]
|
| 135 |
+
|
| 136 |
+
# Run evaluation
|
| 137 |
+
def _run_evaluation() -> None:
    """Answer every entry in ``qa_pairs``, reflect on it, and print a verdict.

    For each question the graph is invoked, the last message's text is passed
    through ``reflect_answer``, and the reflected answer is compared with the
    ground truth. A failure on one question is printed and the run continues.
    """
    # NOTE(review): the emoji in these messages were reconstructed from
    # mis-encoded bytes; the banner and error icons are best guesses — confirm.
    print("\n🔍 Evaluating QA Agent\n")

    for idx, qa in enumerate(qa_pairs, 1):
        question = qa["q"]
        ground_truth = qa["gt"]

        print(f"🔹 Q{idx}: {question}")

        # NOTE(review): confirm that `trace_run` is exported by the installed
        # langsmith version; the documented context manager there is `trace`.
        with trace_run(name=f"GAIA-Q{idx}", tags=["gaia", "reflection", "evaluation"]):
            try:
                result = graph.invoke({"messages": [HumanMessage(content=question)]})
                raw_answer = result["messages"][-1].content.strip()
                reflected = reflect_answer(question, raw_answer)
                score = similarity_score(reflected, ground_truth)
                # Decide the verdict on an exact normalized match: the score
                # is rounded to two decimals, so a near-miss on a long answer
                # could otherwise round up to 1.0 and be reported as correct.
                is_exact = reflected.strip().lower() == ground_truth.strip().lower()
                verdict = "✅" if is_exact else "❌"
                print(f"{verdict} A{idx}: {reflected} | GT: {ground_truth} | Similarity: {score}\n")
            except Exception as e:
                # Best-effort run: report the failure and move to the next question.
                print(f"❌ A{idx} ERROR: {e}\n")


# Only evaluate when executed as a script. Other modules import `graph` from
# this file, and importing must not trigger LLM and search calls.
if __name__ == "__main__":
    _run_evaluation()
|
| 155 |
+
|
test_gaia_questions.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# test_gaia_questions.py
|
| 2 |
|
| 3 |
import requests
|
| 4 |
-
from
|
| 5 |
|
| 6 |
def test_with_real_gaia_questions():
|
| 7 |
# Fetch questions directly from the benchmark API
|
|
|
|
| 1 |
# test_gaia_questions.py
|
| 2 |
|
| 3 |
import requests
|
| 4 |
+
from langgraph_agents import graph
|
| 5 |
|
| 6 |
def test_with_real_gaia_questions():
|
| 7 |
# Fetch questions directly from the benchmark API
|