# app.py — "Answeragent" Hugging Face Space (author: Nitinguleria, revision 2e5751e, ~16.6 kB).
# NOTE: the lines above originally contained Hugging Face file-viewer chrome
# ("raw / history / blame"), which is not Python; preserved here as a comment.
import os
import gradio as gr
import requests
import pandas as pd
import sympy
import re
from duckduckgo_search import DDGS
from langgraph.graph import StateGraph, END
from typing import TypedDict, Literal
# Default API URL - you may need to update this
# NOTE(review): this points at the generic Spaces API; the GAIA course scoring
# service normally lives at a dedicated scoring Space — confirm before running,
# otherwise the /questions and /submit requests below will fail.
DEFAULT_API_URL = "https://huggingface.co/api/spaces/evaluate"
# --- Enhanced Tools for GAIA Benchmark ---
def wikipedia_search_tool(input: str) -> str:
    """Search the web via DuckDuckGo and aggregate the top snippets.

    Args:
        input: Free-text search query.

    Returns:
        Up to three "Source N: ..." snippets joined by blank lines,
        "No relevant information found." when nothing useful came back,
        or an "Search Error: ..." message on failure.
    """
    try:
        # Only three snippets are ever used, so only request three.
        results = DDGS().text(input, max_results=3)
        snippets = []
        # Iterate instead of slicing: some duckduckgo_search versions return
        # a generator, which does not support `results[:3]`.
        for i, result in enumerate(results or []):
            if i >= 3:
                break
            body = result.get("body", "")
            # Skip empty / trivially short snippets.
            if body and len(body) > 10:
                snippets.append(f"Source {i+1}: {body}")
        if snippets:
            return "\n\n".join(snippets)
        return "No relevant information found."
    except Exception as e:
        return f"Search Error: {e}"
def math_solver_tool(input: str) -> str:
    """Solve a mathematical expression embedded in free text.

    Normalizes notation (caret powers, the division sign — including the
    mojibake form "Γ·" of "÷" seen in upstream data), extracts candidate
    arithmetic substrings, and evaluates the first one sympy accepts.
    Falls back to sympifying the whole string, then to a
    character-whitelisted eval.

    Args:
        input: Question text that may contain an arithmetic expression.

    Returns:
        The numeric result as a string, or a
        "Could not solve mathematical expression: ..." message.
    """
    try:
        cleaned_input = (
            input.replace("^", "**")
                 .replace("Γ·", "/")   # mis-decoded division sign
                 .replace("÷", "/")    # properly encoded division sign
        )
        math_patterns = [
            r'[\d\+\-\*/\^\(\)\.\s]+',           # bare arithmetic runs
            r'[a-zA-Z\d\+\-\*/\^\(\)\.\s]+=.*',  # equations / assignments
        ]
        for pattern in math_patterns:
            # Try EVERY candidate, not just the first: the bare-arithmetic
            # pattern also matches whitespace-only runs between words, which
            # previously shadowed the real expression (e.g. "what is 2*3").
            for candidate in re.findall(pattern, cleaned_input):
                candidate = candidate.strip()
                if not candidate:
                    continue
                try:
                    return str(sympy.sympify(candidate).evalf())
                except Exception:
                    # Candidate was not a valid expression; try the next one.
                    continue
        # Direct sympy attempt on the whole (cleaned) input.
        return str(sympy.sympify(cleaned_input).evalf())
    except Exception as e:
        # Last resort: eval, restricted to a strict arithmetic whitelist so
        # no names, attributes, or calls can be injected.
        try:
            safe_chars = set('0123456789+-*/.() ')
            if input and all(c in safe_chars for c in input):
                return str(eval(input))
        except Exception:
            pass
        return f"Could not solve mathematical expression: {e}"
def code_execution_tool(input: str) -> str:
    """Execute a small Python snippet in a restricted namespace.

    Expressions are evaluated directly; statement snippets (assignments,
    loops, prints) are exec'd, with printed output captured.  The previous
    version routed any snippet containing "return " to exec, where a bare
    `return` at module level is a SyntaxError, and it discarded printed
    output entirely.

    Args:
        input: Python source — either a single expression or statements.

    Returns:
        The expression value, the `result` variable, captured stdout, a
        generic success message, or "Code execution error: ..." on failure.
    """
    import contextlib
    import io
    try:
        # Restricted builtins: no __import__, open, getattr, etc.  This is a
        # best-effort sandbox, not a security boundary — do not feed it
        # untrusted input.
        safe_globals = {
            '__builtins__': {
                'len': len, 'str': str, 'int': int, 'float': float,
                'list': list, 'dict': dict, 'tuple': tuple, 'set': set,
                'sum': sum, 'max': max, 'min': min, 'abs': abs,
                'round': round, 'range': range, 'enumerate': enumerate,
                'zip': zip, 'sorted': sorted, 'reversed': reversed,
                'print': print
            },
            'math': __import__('math'),
            're': __import__('re'),
        }
        local_vars = {}
        stdout = io.StringIO()
        with contextlib.redirect_stdout(stdout):
            try:
                # Expression path first: "2+2", "sorted(xs)", even "print(x)".
                value = eval(input, safe_globals, local_vars)
            except SyntaxError:
                # Not an expression (assignment, loop, ...): run as statements.
                value = None
                exec(input, safe_globals, local_vars)
        if value is not None:
            return str(value)
        # Convention: snippets may leave their answer in `result`.
        if 'result' in local_vars:
            return str(local_vars['result'])
        printed = stdout.getvalue().strip()
        return printed if printed else "Code executed successfully"
    except Exception as e:
        return f"Code execution error: {e}"
def general_reasoning_tool(input: str) -> str:
    """Heuristic keyword-based analysis for questions no other tool fits.

    Placeholder for a real reasoning model: classifies the question as a
    comparison or a causation query and echoes a truncated prompt back.
    """
    lowered = input.lower()
    comparison_markers = ('compare', 'difference', 'similar', 'contrast')
    causation_markers = ('cause', 'reason', 'why', 'because')
    if any(marker in lowered for marker in comparison_markers):
        return f"Analysis: This appears to be a comparison question. Key factors to consider: {input[:200]}..."
    if any(marker in lowered for marker in causation_markers):
        return f"Reasoning: This is asking about causation. Consider multiple factors that might contribute to: {input[:200]}..."
    return f"General analysis: {input[:300]}..."
# --- State definition ---
class AgentState(TypedDict):
    """Shared LangGraph state passed from the router into one tool node."""
    question: str   # raw question text from the benchmark
    response: str   # answer produced by whichever tool node ran
    tool_used: str  # branch taken: "math" | "code" | "search" | "reasoning"
# --- Enhanced Routing logic for GAIA ---
def route_question(state: AgentState) -> Literal["math", "code", "search", "reasoning"]:
    """Pick the tool branch for a question.

    Branches are checked in priority order — math, then code, then search —
    with "reasoning" as the catch-all fallback.
    """
    q = state["question"].lower()

    math_keywords = (
        "solve", "calculate", "evaluate", "compute", "sum", "multiply",
        "divide", "percentage", "%", "=", "equation", "formula", "average",
        "total", "cost", "price", "number", "how many", "how much",
    )
    code_keywords = (
        "python", "code", "function", "return", "algorithm", "program",
        "script", "execute", "run", "implementation",
    )
    search_keywords = (
        "what", "who", "when", "where", "which", "capital", "country",
        "invented", "created", "founded", "established", "located", "known for",
    )

    # Inline arithmetic ("2+2"), dollar amounts, or percent signs also count
    # as math even without a keyword hit.
    looks_numeric = (
        re.search(r'\d+[\+\-\*/\^]\d+', q) is not None
        or re.search(r'\$\d+', q) is not None
        or '%' in q
    )

    if looks_numeric or any(k in q for k in math_keywords):
        return "math"
    if any(k in q for k in code_keywords):
        return "code"
    if any(k in q for k in search_keywords):
        return "search"
    return "reasoning"
# --- Node functions ---
def math_node(state: AgentState) -> AgentState:
    """Answer the question with the math solver; tag the branch as "math"."""
    answer = math_solver_tool(state["question"])
    return {"question": state["question"], "response": answer, "tool_used": "math"}
def code_node(state: AgentState) -> AgentState:
    """Answer the question with the code executor; tag the branch as "code"."""
    answer = code_execution_tool(state["question"])
    return {"question": state["question"], "response": answer, "tool_used": "code"}
def search_node(state: AgentState) -> AgentState:
    """Answer the question with the web-search tool; tag the branch as "search"."""
    answer = wikipedia_search_tool(state["question"])
    return {"question": state["question"], "response": answer, "tool_used": "search"}
def reasoning_node(state: AgentState) -> AgentState:
    """Answer the question with the reasoning heuristic; tag the branch as "reasoning"."""
    answer = general_reasoning_tool(state["question"])
    return {"question": state["question"], "response": answer, "tool_used": "reasoning"}
# --- LangGraph setup with corrected API ---
def create_agent_graph():
    """Build and compile the routing workflow.

    The graph routes from the start marker into exactly one tool node
    (math / code / search / reasoning); every node terminates the run.
    """
    workflow = StateGraph(AgentState)

    # One node per branch name; the router returns one of these names.
    branch_nodes = {
        "math": math_node,
        "code": code_node,
        "search": search_node,
        "reasoning": reasoning_node,
    }
    for name, node_fn in branch_nodes.items():
        workflow.add_node(name, node_fn)

    # Conditional fan-out from the start marker, keyed by route_question.
    workflow.add_conditional_edges(
        "__start__",
        route_question,
        {name: name for name in branch_nodes},
    )

    # Every branch ends the workflow.
    for name in branch_nodes:
        workflow.add_edge(name, END)

    return workflow.compile()
# Compile the graph once at import time; every BasicAgent instance shares it.
app_graph = create_agent_graph()
# --- Enhanced Agent wrapper ---
class BasicAgent:
    """Thin callable wrapper around the compiled LangGraph workflow."""

    def __init__(self):
        # Reuse the module-level compiled graph rather than rebuilding it.
        self.graph = app_graph
        print("Enhanced LangGraph Agent initialized for GAIA benchmark.")

    def __call__(self, question: str) -> str:
        """Route *question* through the graph and return a cleaned-up answer.

        For successful math answers, extracts the final number and strips a
        redundant ".000..." tail, since exact-match scoring expects "5"
        rather than sympy's "5.00000000000000".

        Returns an "Error: ..." message instead of raising.
        """
        try:
            state = {
                "question": question,
                "response": "",
                "tool_used": ""
            }
            result = self.graph.invoke(state)

            response = result.get("response", "No response generated")
            tool_used = result.get("tool_used", "unknown")

            # Only extract a bare number from SUCCESSFUL math responses.
            # Previously this also ran on the solver's error message
            # ("Could not solve mathematical expression: ..."), returning a
            # meaningless digit scraped from the exception text.
            if tool_used == "math" and response and not response.startswith("Could not"):
                numbers = re.findall(r'-?\d+\.?\d*', response)
                if numbers:
                    return self._normalize_number(numbers[-1])
            return str(response)
        except Exception as e:
            print(f"Error in agent processing: {e}")
            return f"Error: Could not process the question - {e}"

    @staticmethod
    def _normalize_number(token: str) -> str:
        """Render integral floats like '5.00000000000000' as '5'; pass others through."""
        try:
            value = float(token)
        except ValueError:
            return token
        return str(int(value)) if value.is_integer() else token
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the BasicAgent on them, submits all answers,
    and displays the results.

    Args:
        profile: OAuth profile injected by gr.LoginButton; None when the
            user is not logged in.

    Returns:
        A (status_message, results_table) pair for the two Gradio outputs;
        the table is None when the run aborts before processing questions.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # set automatically inside HF Spaces
    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        # Submission requires a username, so bail out early.
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    # 1. Instantiate Agent
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # Link to this Space's source so the scorer can audit the agent code.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
    print(f"Agent code location: {agent_code}")
    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        # e.g. JSON decoding errors from a non-JSON response body
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None
    # 3. Run Agent on all questions
    results_log = []      # human-readable rows for the results table
    answers_payload = []  # exact payload items for the submit endpoint
    print(f"Running agent on {len(questions_data)} questions...")
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": submitted_answer
            })
            results_log.append({
                "Task ID": task_id,
                # Truncate long questions so the results table stays readable.
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "Submitted Answer": submitted_answer
            })
        except Exception as e:
            # A failing task still gets an answer row so the submission
            # covers every question.
            print(f"Error running agent on task {task_id}: {e}")
            error_answer = f"AGENT ERROR: {e}"
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": error_answer
            })
            results_log.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "Submitted Answer": error_answer
            })
    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
    # 4. Prepare Submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }
    print(f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'...")
    # 5. Submit answers
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        # Generous timeout: the server scores every answer before replying.
        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username', username)}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        # Surface the server's error detail when the body is JSON.
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"Submission error: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
# --- Gradio Interface ---
# Declarative UI: a login button (provides the OAuthProfile consumed by
# run_and_submit_all), one action button, and two output widgets.
with gr.Blocks(title="GAIA Benchmark Agent") as demo:
    gr.Markdown("# Enhanced GAIA Benchmark Agent")
    gr.Markdown(
        """
        **Enhanced Agent for GAIA Benchmark - Targeting 60% Accuracy**
        **Features:**
        - Enhanced mathematical problem solving with symbolic computation
        - Improved search capabilities with multiple source aggregation
        - Safe code execution environment
        - Smart question routing (math/code/search/reasoning)
        - Better answer formatting and extraction
        **Instructions:**
        1. Log in to your Hugging Face account using the button below
        2. Click 'Run Evaluation & Submit All Answers' to start the benchmark
        3. The agent will process all questions and submit answers automatically
        **Note:** Processing may take several minutes depending on the number of questions.
        """
    )
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
    # Free-text status: progress messages and the final submission summary.
    status_output = gr.Textbox(
        label="Status & Results",
        lines=8,
        interactive=False,
        placeholder="Click the button above to start the evaluation..."
    )
    # Per-question table built from results_log in run_and_submit_all.
    results_table = gr.DataFrame(
        label="Questions and Agent Responses",
        wrap=True,
        interactive=False
    )
    # No explicit inputs: Gradio injects the OAuthProfile for the typed
    # `profile` parameter of run_and_submit_all automatically.
    run_button.click(
        fn=run_and_submit_all,
        inputs=[],
        outputs=[status_output, results_table]
    )
if __name__ == "__main__":
    # Startup banner with environment diagnostics.
    # NOTE(review): the emoji below appear mojibake-encoded in the source;
    # preserved byte-for-byte since they are runtime strings.
    print("\n" + "="*50)
    print("πŸš€ GAIA Benchmark Agent Starting")
    print("="*50)
    # Environment info — both variables are injected by the HF Spaces runtime.
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")
    if space_host:
        print(f"βœ… SPACE_HOST: {space_host}")
        print(f"   Runtime URL: https://{space_host}.hf.space")
    else:
        print("ℹ️ Running locally (SPACE_HOST not found)")
    if space_id:
        print(f"βœ… SPACE_ID: {space_id}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id}")
    else:
        print("ℹ️ SPACE_ID not found")
    print("="*50 + "\n")
    print("🎯 Target: 60% accuracy on GAIA benchmark")
    print("πŸ”§ Enhanced tools: Math, Code, Search, Reasoning")
    print("\nLaunching Gradio interface...")
    # debug=True surfaces tracebacks in the UI; share=False keeps the app
    # bound to the Space's own URL.
    demo.launch(debug=True, share=False)