Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| import requests | |
| import pandas as pd | |
| import sympy | |
| import re | |
| from duckduckgo_search import DDGS | |
| from langgraph.graph import StateGraph, END | |
| from typing import TypedDict, Literal | |
# Base URL of the evaluation service. run_and_submit_all() appends
# "/questions" (GET) and "/submit" (POST) to it.
# NOTE(review): the original comment says this may need updating -- confirm the
# value against the scoring service you actually target before running.
DEFAULT_API_URL = "https://huggingface.co/api/spaces/evaluate"
| # --- Enhanced Tools for GAIA Benchmark --- | |
def wikipedia_search_tool(input: str) -> str:
    """Run a DuckDuckGo text search and return up to three result snippets.

    Despite its name, this tool queries DuckDuckGo (``DDGS().text``), not
    Wikipedia directly.

    Args:
        input: The search query (the raw question text).

    Returns:
        The usable snippets joined by blank lines, each prefixed with
        ``Source N:`` where N is the 1-based position of the hit among the
        first three results; ``"No relevant information found."`` when no
        snippet is usable; or ``"Search Error: ..."`` if the search raised.
    """
    from itertools import islice

    try:
        results = DDGS().text(input, max_results=5)
        snippets = []
        # islice() accepts both a list and a lazy iterator, and ``or ()``
        # covers a None/empty result, so the tool no longer depends on the
        # search client returning a sliceable list.
        for position, hit in enumerate(islice(results or (), 3), start=1):
            body = hit.get("body", "")
            # Very short bodies carry no information worth returning.
            if body and len(body) > 10:
                snippets.append(f"Source {position}: {body}")
    except Exception as e:
        # Tool boundary: any network/client failure is reported as text so
        # the agent can still produce an answer string.
        return f"Search Error: {e}"

    if snippets:
        return "\n\n".join(snippets)
    return "No relevant information found."
# Regexes used to pull expression-like fragments out of free text: first a
# run of arithmetic characters, then anything shaped like "lhs = rhs".
_MATH_PATTERNS = (
    r'[\d\+\-\*/\^\(\)\.\s]+',
    r'[a-zA-Z\d\+\-\*/\^\(\)\.\s]+=.*',
)

# Characters accepted by the plain-arithmetic fallback evaluator.
_SAFE_ARITHMETIC_CHARS = frozenset('0123456789+-*/.() ')


def _math_candidates(text: str) -> list[str]:
    """Return expression-like fragments of *text*, most promising first."""
    found = []
    for pattern in _MATH_PATTERNS:
        for match in re.findall(pattern, text):
            fragment = match.strip()
            # A fragment without any digit (typically a lone space between
            # words) can never be a useful expression.
            if fragment not in found and any(ch.isdigit() for ch in fragment):
                found.append(fragment)

    def promise(fragment: str) -> tuple[bool, int]:
        # Prefer fragments that contain an operator, then longer ones.
        return (any(op in fragment for op in "+-*/"), len(fragment))

    # sorted() is stable, so equally ranked fragments keep text order.
    return sorted(found, key=promise, reverse=True)


def math_solver_tool(input: str) -> str:
    """Evaluate the arithmetic found in *input* and return it as a string.

    The text is normalised (``^`` becomes ``**``, ``÷`` becomes ``/``),
    expression-like fragments are extracted and tried with
    ``sympy.sympify(...).evalf()``, then the whole text is tried. If every
    sympy attempt fails, fragments made only of digits and ``+-*/.()`` are
    evaluated with plain ``eval`` as a last resort.

    Args:
        input: A bare expression or a sentence containing one.

    Returns:
        The evaluated result as text, or
        ``"Could not solve mathematical expression: <error>"`` when nothing
        could be evaluated.
    """
    cleaned_input = input.replace("^", "**").replace("÷", "/")
    candidates = _math_candidates(cleaned_input)

    # SECURITY NOTE: sympify() and eval() both execute what they parse, and
    # the text comes straight from the question. Review before exposing this
    # to untrusted input (e.g. huge power towers can hang the process).
    try:
        for candidate in candidates:
            try:
                return str(sympy.sympify(candidate).evalf())
            except Exception:
                # sympify/evalf fail in many ways (parse errors, objects
                # without evalf); any failure just means "try the next one".
                # NOTE(review): fragments containing a bare "=" appear never
                # to parse with sympify -- confirm, and consider solving them
                # as equations instead.
                continue
        # Direct sympy attempt on the whole text.
        return str(sympy.sympify(cleaned_input).evalf())
    except Exception as e:
        # Fallback: plain arithmetic restricted to a safe character set, run
        # on the normalised text so "^"/"÷" inputs also benefit.
        for expression in (*candidates, cleaned_input.strip()):
            if expression and all(c in _SAFE_ARITHMETIC_CHARS for c in expression):
                try:
                    return str(eval(expression))
                except Exception:
                    continue
        return f"Could not solve mathematical expression: {e}"
def code_execution_tool(input: str) -> str:
    """Run a small Python snippet with a reduced set of builtins.

    The snippet is evaluated as an expression when it compiles as one and is
    executed as statements otherwise. Text written by ``print`` is captured.

    Args:
        input: Python source code.

    Returns:
        * for an expression: its value as text, or the captured ``print``
          output when the value is ``None`` and something was printed;
        * for statements: ``str(result)`` if the snippet assigned a variable
          named ``result``, else the captured ``print`` output, else
          ``"Code executed successfully"``;
        * ``"Code execution error: <error>"`` if compiling or running failed.
    """
    import contextlib
    import io
    import math
    import re

    # SECURITY NOTE: trimming __builtins__ is NOT a sandbox. exec/eval on
    # untrusted text can still exhaust CPU or memory and may be escapable;
    # review before exposing this tool to untrusted input.
    #
    # A single namespace is used for globals and locals so that functions
    # defined by the snippet can call themselves and each other.
    namespace = {
        '__builtins__': {
            'len': len, 'str': str, 'int': int, 'float': float,
            'list': list, 'dict': dict, 'tuple': tuple, 'set': set,
            'sum': sum, 'max': max, 'min': min, 'abs': abs,
            'round': round, 'range': range, 'enumerate': enumerate,
            'zip': zip, 'sorted': sorted, 'reversed': reversed,
            'print': print,
        },
        'math': math,
        're': re,
    }
    captured = io.StringIO()

    try:
        try:
            compiled = compile(input, "<agent-code>", "eval")
            is_expression = True
        except SyntaxError:
            # Not a single expression: treat it as a block of statements.
            compiled = compile(input, "<agent-code>", "exec")
            is_expression = False

        # redirect_stdout swaps sys.stdout for the duration of the block so
        # the snippet's print() calls end up in ``captured``.
        with contextlib.redirect_stdout(captured):
            if is_expression:
                value = eval(compiled, namespace)
            else:
                exec(compiled, namespace)
                value = None
    except Exception as e:
        return f"Code execution error: {e}"

    printed = captured.getvalue().strip()
    if is_expression:
        if value is None and printed:
            return printed
        return str(value)
    if 'result' in namespace:
        return str(namespace['result'])
    if printed:
        return printed
    return "Code executed successfully"
def general_reasoning_tool(input: str) -> str:
    """Return a canned, keyword-driven "analysis" of the question.

    This is a placeholder, not real reasoning: it only labels the question as
    a comparison, a causation question, or neither, and echoes the start of
    the question back (first 200 characters for the two labelled cases, first
    300 otherwise).
    """
    lowered = input.lower()

    def mentions(cues) -> bool:
        # Plain substring test against the lower-cased question.
        return any(cue in lowered for cue in cues)

    if mentions(('compare', 'difference', 'similar', 'contrast')):
        prefix = "Analysis: This appears to be a comparison question. Key factors to consider: "
        return prefix + input[:200] + "..."

    if mentions(('cause', 'reason', 'why', 'because')):
        prefix = "Reasoning: This is asking about causation. Consider multiple factors that might contribute to: "
        return prefix + input[:200] + "..."

    return "General analysis: " + input[:300] + "..."
| # --- State definition --- | |
class AgentState(TypedDict):
    """State dictionary handed to, and returned by, every node of the agent graph."""

    # The question text being answered; node functions copy it through unchanged.
    question: str
    # Text produced by the tool that handled the question ("" before a node runs).
    response: str
    # Name of the node that produced ``response``: "math", "code", "search" or "reasoning".
    tool_used: str
| # --- Enhanced Routing logic for GAIA --- | |
# Keyword groups that steer a question to a tool. Entries starting with a
# letter or digit are matched at the start of a word; symbols match anywhere.
_MATH_KEYWORDS = (
    "solve", "calculate", "evaluate", "compute", "sum", "multiply",
    "divide", "percentage", "%", "=", "equation", "formula", "average",
    "total", "cost", "price", "number", "how many", "how much",
)
_CODE_KEYWORDS = (
    "python", "code", "function", "return", "algorithm", "program",
    "script", "execute", "run", "implementation",
)
_SEARCH_KEYWORDS = (
    "what", "who", "when", "where", "which", "capital", "country",
    "invented", "created", "founded", "established", "located", "known for",
)


def _keyword_regex(keywords):
    """Compile one alternation that matches any of *keywords*."""
    parts = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        # Anchor words at a word start so "sum" no longer fires inside
        # "assume" nor "run" inside "trunk". Inflected forms such as
        # "calculated" or "running" still match because only the start is
        # anchored.
        parts.append(rf"\b{escaped}" if keyword[0].isalnum() else escaped)
    return re.compile("|".join(parts))


_MATH_KEYWORD_RE = _keyword_regex(_MATH_KEYWORDS)
_CODE_KEYWORD_RE = _keyword_regex(_CODE_KEYWORDS)
_SEARCH_KEYWORD_RE = _keyword_regex(_SEARCH_KEYWORDS)


def route_question(state: AgentState) -> Literal["math", "code", "search", "reasoning"]:
    """Pick the tool node that should handle the question in *state*.

    The lower-cased question is checked in priority order: math (keyword,
    inline arithmetic such as ``3*4``, or a dollar amount such as ``$5``),
    then code keywords, then search keywords. Anything else goes to
    "reasoning".

    Args:
        state: Agent state; only ``state["question"]`` is read.

    Returns:
        One of "math", "code", "search" or "reasoning".
    """
    q = state["question"].lower()

    looks_like_math = (
        _MATH_KEYWORD_RE.search(q)
        or re.search(r'\d+[\+\-\*/\^]\d+', q)
        or re.search(r'\$\d+', q)
    )
    if looks_like_math:
        return "math"
    if _CODE_KEYWORD_RE.search(q):
        return "code"
    if _SEARCH_KEYWORD_RE.search(q):
        return "search"
    return "reasoning"
| # --- Node functions --- | |
def math_node(state: AgentState) -> AgentState:
    """Graph node: answer the question with math_solver_tool and tag the state as "math"."""
    question = state["question"]
    answer = math_solver_tool(question)
    return {"question": question, "response": answer, "tool_used": "math"}
def code_node(state: AgentState) -> AgentState:
    """Graph node: pass the question to code_execution_tool and tag the state as "code"."""
    question = state["question"]
    answer = code_execution_tool(question)
    return {"question": question, "response": answer, "tool_used": "code"}
def search_node(state: AgentState) -> AgentState:
    """Graph node: look the question up with wikipedia_search_tool and tag the state as "search"."""
    question = state["question"]
    answer = wikipedia_search_tool(question)
    return {"question": question, "response": answer, "tool_used": "search"}
def reasoning_node(state: AgentState) -> AgentState:
    """Graph node: run general_reasoning_tool on the question and tag the state as "reasoning"."""
    question = state["question"]
    answer = general_reasoning_tool(question)
    return {"question": question, "response": answer, "tool_used": "reasoning"}
| # --- LangGraph setup with corrected API --- | |
def create_agent_graph():
    """Build and compile the routing graph.

    The graph has four tool nodes ("math", "code", "search", "reasoning").
    route_question picks one of them from the start of the graph, and every
    node leads straight to END.
    """
    builder = StateGraph(AgentState)

    # Node name -> handler; the names double as route_question's return values.
    handlers = {
        "math": math_node,
        "code": code_node,
        "search": search_node,
        "reasoning": reasoning_node,
    }
    for node_name, handler in handlers.items():
        builder.add_node(node_name, handler)

    # Each route label maps onto the node of the same name.
    builder.add_conditional_edges(
        "__start__",
        route_question,
        {node_name: node_name for node_name in handlers},
    )

    # Single-step workflow: whichever node ran, the graph finishes after it.
    for node_name in handlers:
        builder.add_edge(node_name, END)

    return builder.compile()
# Compile the graph once at import time; BasicAgent.__init__ stores this
# shared instance as ``self.graph`` instead of rebuilding it per agent.
app_graph = create_agent_graph()
| # --- Enhanced Agent wrapper --- | |
class BasicAgent:
    """Callable wrapper that runs one question through the compiled agent graph."""

    # Prefix of the message math_solver_tool returns when it cannot evaluate
    # anything; such responses must not be mined for numbers.
    _MATH_FAILURE_PREFIX = "Could not solve mathematical expression"

    # A signed integer/decimal with an optional exponent, so a value such as
    # "1.23e-5" is kept whole instead of being split into "1.23" and "-5".
    _NUMBER_RE = re.compile(r'-?\d+\.?\d*(?:[eE][+-]?\d+)?')

    def __init__(self):
        self.graph = app_graph
        print("Enhanced LangGraph Agent initialized for GAIA benchmark.")

    def __call__(self, question: str) -> str:
        """Process a question and return an answer.

        Args:
            question: The question text.

        Returns:
            For a successful "math" response, the last number found in it;
            otherwise the tool's response as text. If anything raises, an
            ``"Error: Could not process the question - ..."`` string is
            returned instead of propagating the exception.
        """
        try:
            state = {
                "question": question,
                "response": "",
                "tool_used": ""
            }
            result = self.graph.invoke(state)

            response = result.get("response", "No response generated")
            tool_used = result.get("tool_used", "unknown")

            # For math problems, reduce the response to the final number --
            # but only when the solver actually produced a result; its
            # failure message echoes the input and may contain stray digits.
            solved = (
                tool_used == "math"
                and response
                and not str(response).startswith(self._MATH_FAILURE_PREFIX)
            )
            if solved:
                numbers = self._NUMBER_RE.findall(response)
                if numbers:
                    return numbers[-1]  # the last number found

            return str(response)
        except Exception as e:
            # Boundary of the agent: report the failure as the answer text.
            print(f"Error in agent processing: {e}")
            return f"Error: Could not process the question - {e}"
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetch all questions, run the BasicAgent on them, submit the answers,
    and report the outcome.

    Args:
        profile: The logged-in user's OAuth profile, or None when nobody is
            logged in. Only ``profile.username`` is read.

    Returns:
        A ``(status_message, results)`` tuple. ``results`` is a pandas
        DataFrame with one row per answered question ("Task ID", "Question",
        "Submitted Answer"), or None when the run stopped before any question
        was processed (not logged in, agent failed to initialise, or the
        questions could not be fetched).
    """
    if not profile:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None
    username = f"{profile.username}"
    print(f"User logged in: {username}")

    # --- Determine HF Space repo URL and service endpoints ---
    space_id = os.getenv("SPACE_ID")
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
    print(f"Agent code location: {agent_code}")

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # The loop below needs a non-empty list of dicts; anything else (empty
    # payload, a JSON object, a string) is the "invalid format" case.
    if not questions_data or not isinstance(questions_data, list):
        print("Fetched questions list is empty.")
        return "Fetched questions list is empty or invalid format.", None
    print(f"Fetched {len(questions_data)} questions.")

    # 3. Run Agent on all questions
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id") if isinstance(item, dict) else None
        question_text = item.get("question") if isinstance(item, dict) else None
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue

        print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
        try:
            submitted_answer = agent(question_text)
        except Exception as e:
            # Still submit something for this task so the run is complete.
            print(f"Error running agent on task {task_id}: {e}")
            submitted_answer = f"AGENT ERROR: {e}"

        # The results table shows at most the first 100 characters.
        shown_question = str(question_text)
        if len(shown_question) > 100:
            shown_question = shown_question[:100] + "..."

        answers_payload.append({
            "task_id": task_id,
            "submitted_answer": submitted_answer
        })
        results_log.append({
            "Task ID": task_id,
            "Question": shown_question,
            "Submitted Answer": submitted_answer
        })

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }
    print(f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'...")

    # 5. Submit answers
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
        status_message = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username', username)}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except Exception:
            # Best effort: a body that is not JSON, or not a JSON object,
            # falls back to the raw response text. (Was a bare ``except``,
            # which would also have swallowed KeyboardInterrupt/SystemExit.)
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
    except Exception as e:
        status_message = f"Submission error: {e}"
        print(status_message)

    return status_message, pd.DataFrame(results_log)
| # --- Gradio Interface --- | |
# Page layout, top to bottom: title, description, login button, run button,
# status box, results table. Components render in the order they are created.
with gr.Blocks(title="GAIA Benchmark Agent") as demo:
    gr.Markdown("# Enhanced GAIA Benchmark Agent")
    # Static description shown to the user; the text is Markdown.
    gr.Markdown(
        """
**Enhanced Agent for GAIA Benchmark - Targeting 60% Accuracy**
**Features:**
- Enhanced mathematical problem solving with symbolic computation
- Improved search capabilities with multiple source aggregation
- Safe code execution environment
- Smart question routing (math/code/search/reasoning)
- Better answer formatting and extraction
**Instructions:**
1. Log in to your Hugging Face account using the button below
2. Click 'Run Evaluation & Submit All Answers' to start the benchmark
3. The agent will process all questions and submit answers automatically
**Note:** Processing may take several minutes depending on the number of questions.
"""
    )
    # run_and_submit_all returns a "please log in" message when it receives
    # no profile, so the user has to log in with this button first.
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
    # Receives the first element of run_and_submit_all's return value.
    status_output = gr.Textbox(
        label="Status & Results",
        lines=8,
        interactive=False,
        placeholder="Click the button above to start the evaluation..."
    )
    # Receives the second element (a DataFrame, or None on early exit).
    results_table = gr.DataFrame(
        label="Questions and Agent Responses",
        wrap=True,
        interactive=False
    )
    # No explicit inputs are wired although the handler takes ``profile``.
    # NOTE(review): presumably Gradio supplies the OAuth profile based on the
    # parameter's type annotation -- confirm against the Gradio docs.
    run_button.click(
        fn=run_and_submit_all,
        inputs=[],
        outputs=[status_output, results_table]
    )
if __name__ == "__main__":
    # Startup banner. The emoji in these messages were mis-decoded in the
    # file (UTF-8 bytes read as ISO-8859-7, e.g. "π―" for "🎯"); they are
    # restored here so the log shows the intended characters.
    print("\n" + "="*50)
    print("🚀 GAIA Benchmark Agent Starting")
    print("="*50)

    # Environment info: both variables are only read for logging here.
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")

    if space_host:
        print(f"✅ SPACE_HOST: {space_host}")
        # NOTE(review): if SPACE_HOST already ends in ".hf.space", this
        # prints the suffix twice -- confirm against the Spaces docs.
        print(f" Runtime URL: https://{space_host}.hf.space")
    else:
        print("ℹ️ Running locally (SPACE_HOST not found)")

    if space_id:
        print(f"✅ SPACE_ID: {space_id}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id}")
    else:
        print("ℹ️ SPACE_ID not found")

    print("="*50 + "\n")
    print("🎯 Target: 60% accuracy on GAIA benchmark")
    print("🔧 Enhanced tools: Math, Code, Search, Reasoning")
    print("\nLaunching Gradio interface...")
    demo.launch(debug=True, share=False)