Spaces:
Sleeping
Sleeping
| """ | |
| Code Execution Framework for GAIA Agent | |
| Provides safe Python code execution for math/data processing questions. | |
| Uses local execution with timeout and safety constraints. | |
| Expected Impact: +15-20% accuracy improvement on math/calculation questions | |
| """ | |
| import re | |
| import os | |
| import sys | |
| import time | |
| import subprocess | |
| import tempfile | |
| from dataclasses import dataclass | |
| from typing import Optional, List | |
| from pathlib import Path | |
@dataclass
class ExecutionResult:
    """Result of one code execution attempt.

    Declared as a dataclass so it can be built with keyword arguments
    (``ExecutionResult(success=..., output=..., error=..., execution_time=...)``),
    which is how the executor constructs it.
    """
    # True when the child process exited with return code 0
    success: bool
    # Captured stdout (stripped) on success, otherwise None
    output: Optional[str]
    # Error description (stderr, timeout or security message) on failure, otherwise None
    error: Optional[str]
    # Wall-clock duration of the attempt, in seconds
    execution_time: float
def should_use_code_execution(question: str) -> bool:
    """
    Determine if a question would benefit from code execution.

    This is a keyword heuristic. Keywords are matched as whole words
    (optionally with a simple inflection such as "-s", "-ed", "-ing") so that
    "whole" does not count as "who" and "summary" does not count as "sum".
    Operator symbols only count when they touch a number, so hyphens and
    slashes in ordinary text ("best-selling", "and/or") are ignored, and the
    letter "x" only counts as multiplication between two numbers ("3 x 4").

    Args:
        question: The question text

    Returns:
        True if code execution should be used
    """
    question_lower = question.lower()

    def mentions_any(phrases) -> bool:
        """Return True if any phrase occurs in the question as a whole word/phrase."""
        alternatives = '|'.join(re.escape(phrase) for phrase in phrases)
        pattern = rf'\b(?:{alternatives})(?:s|es|d|ed|ing)?\b'
        return re.search(pattern, question_lower) is not None

    # An operator is only evidence of arithmetic when a number is adjacent to it
    # (ignoring spaces): "25 + 30", "a = 3", "15%", "3 x 4".
    has_numeric_operation = re.search(
        r'\d\s*[+\-*/^=%×]|[+\-*/^=×]\s*\d|\d\s*x\s*\d',
        question_lower,
    ) is not None

    # EXCLUSIONS: Research questions that should NOT use code
    research_indicators = [
        'who', 'whom', 'whose', 'when', 'where', 'which person', 'which company',
        'published by', 'written by', 'created by', 'founded by',
        'according to', 'wikipedia', 'article', 'biography',
        'history of', 'year of', 'born in', 'died in'
    ]

    # A research/lookup question skips code execution unless it carries actual
    # arithmetic, e.g. "Who scored 25 + 30 points?" should still use code.
    if mentions_any(research_indicators) and not has_numeric_operation:
        return False

    # Math keywords - direct operations
    math_keywords = [
        'calculate', 'compute', 'sum', 'average', 'mean', 'median',
        'multiply', 'divide', 'subtract', 'add', 'total',
        'square root', 'power', 'factorial', 'prime'
    ]
    if has_numeric_operation or mentions_any(math_keywords):
        return True

    # Data processing keywords - only for provided data
    data_processing_indicators = [
        'from the csv', 'in the file', 'in the spreadsheet',
        'from the table', 'in the data', 'given the values',
        'calculate from', 'based on the following'
    ]
    return any(indicator in question_lower for indicator in data_processing_indicators)
class CodeExecutor:
    """
    Python code executor with a timeout and a substring denylist.

    Generated code is written to a temporary file and run in a separate
    interpreter process. The denylist is a best-effort filter on the source
    text, not a sandbox: it rejects code containing any listed pattern but
    cannot stop every harmful construct.
    """

    # Lower-cased substring -> human-readable name used in the security error
    _DANGEROUS_PATTERNS = {
        'import os': 'os module',
        'import subprocess': 'subprocess module',
        'import sys': 'sys module',
        'import urllib': 'urllib module',
        'import requests': 'requests module',
        'import http': 'http module',
        'import socket': 'socket module',
        'open(': 'file operations',
        '__import__': '__import__ function',
        'eval(': 'eval function',
        'exec(': 'exec function',
        'compile(': 'compile function',
    }

    # Shared last line of generated snippets: print whole numbers without ".0"
    _PRINT_RESULT = "print(int(result) if result == int(result) else result)"

    # An integer or a decimal number such as "144" or "2.25"
    _NUMBER_RE = r'\d+(?:\.\d+)?'

    def __init__(self, timeout: int = 10, openrouter_client=None, model: str = "x-ai/grok-4.1-fast"):
        """
        Initialize code executor.

        Args:
            timeout: Maximum execution time in seconds
            openrouter_client: OpenAI client for OpenRouter (for code generation)
            model: Model to use for code generation
        """
        self.timeout = timeout
        self.openrouter_client = openrouter_client
        self.model = model

    def generate_code(self, question: str, context: Optional[str] = None) -> str:
        """
        Generate Python code to answer the question.

        Uses the LLM client when one was supplied, otherwise the rule-based
        fallback for basic arithmetic.

        Args:
            question: The question to solve
            context: Optional context/data for the question

        Returns:
            Python code as string
        """
        if self.openrouter_client:
            return self._generate_code_with_llm(question, context)
        return self._generate_code_simple(question)

    def _generate_code_with_llm(self, question: str, context: Optional[str] = None) -> str:
        """
        Generate code using the LLM.

        Falls back to the rule-based generator if the request fails or the
        completion has no content.
        """
        prompt = f"""Generate Python code to answer this question. Output ONLY the Python code, no explanations.
The code must print the final answer using print().
Question: {question}"""
        if context:
            prompt += f"\n\nContext/Data: {context}"
        prompt += """
Requirements:
1. Use only Python standard library (math, statistics, etc.)
2. Print the final answer
3. Keep it simple and direct
4. No external imports except math, statistics
5. Handle edge cases
Code:"""
        try:
            response = self.openrouter_client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.1
            )
            content = response.choices[0].message.content
            if content is None:
                # Routed through the handler below so the fallback is used
                raise ValueError("completion has no content")
            code = content.strip()
            # Extract code from markdown if present
            if "```python" in code:
                code = code.split("```python")[1].split("```")[0].strip()
            elif "```" in code:
                code = code.split("```")[1].split("```")[0].strip()
            return code
        except Exception as e:
            # Deliberately broad: any client/response problem degrades to the
            # rule-based generator instead of failing the question.
            print(f"❌ LLM code generation failed: {e}")
            return self._generate_code_simple(question)

    def _generate_code_simple(self, question: str) -> str:
        """
        Generate simple code without LLM (fallback).

        Handles a bare arithmetic expression (digits, + - * / . and
        parentheses, with word operators such as "plus" translated), then a
        square root of the first number, then the average of all numbers in
        the question.

        Returns:
            Python code as string; a snippet printing an "unable" message when
            no rule applies.
        """
        question_lower = question.lower()

        # Strip filler words. 'equals' must go before 'equal', otherwise the
        # shorter word is removed first and a stray "s" is left behind.
        expr = question_lower
        for word in ['what is', 'calculate', 'compute', 'the result of', '?', 'equals', 'equal']:
            expr = expr.replace(word, ' ')
        expr = expr.strip()

        # Convert word operations to symbols
        replacements = {
            ' plus ': '+',
            ' minus ': '-',
            ' times ': '*',
            ' divided by ': '/',
            ' multiply ': '*',
            ' divide ': '/',
            ' add ': '+',
            ' subtract ': '-'
        }
        for word, symbol in replacements.items():
            expr = expr.replace(word, symbol)

        # Clean up spaces
        expr = re.sub(r'\s+', '', expr)

        # Only arithmetic characters, and at least one digit so that a lone
        # operator such as "-" is not emitted as code.
        if re.match(r'^[\d+\-*/().\s]+$', expr) and re.search(r'\d', expr):
            return f"result = {expr}\n{self._PRINT_RESULT}"

        # Fallback for square root
        if 'square root' in question_lower:
            match = re.search(self._NUMBER_RE, question)
            if match:
                return f"import math\nresult = math.sqrt({match.group()})\n{self._PRINT_RESULT}"

        # Fallback for average
        if 'average' in question_lower or 'mean' in question_lower:
            numbers = re.findall(self._NUMBER_RE, question)
            if numbers:
                # Keep integers as int so the generated literal is unchanged
                # for whole numbers; decimals become floats.
                values = [float(n) if '.' in n else int(n) for n in numbers]
                return f"values = {values}\nresult = sum(values) / len(values)\n{self._PRINT_RESULT}"

        # Default fallback
        return "print('Unable to generate code for this question')"

    @staticmethod
    def _find_security_violation(code: str) -> Optional[str]:
        """
        Return the name of the first denylisted construct found in ``code``.

        The check is a case-insensitive substring search. The presence of an
        allowed import (e.g. ``import math``) does not excuse a denylisted
        pattern elsewhere in the code.

        Returns:
            The human-readable name of the violation, or None if none is found.
        """
        code_lower = code.lower()
        for pattern, name in CodeExecutor._DANGEROUS_PATTERNS.items():
            if pattern in code_lower:
                return name
        return None

    def execute(self, code: str) -> "ExecutionResult":
        """
        Execute Python code in a child interpreter with a timeout.

        Args:
            code: Python code to execute

        Returns:
            ExecutionResult with the stripped stdout on success, or with the
            security message, stderr, timeout message or exception text on
            failure.
        """
        start_time = time.time()

        # Safety check: block dangerous operations
        violation = self._find_security_violation(code)
        if violation is not None:
            return ExecutionResult(
                success=False,
                output=None,
                error=f"Security: {violation} is not allowed",
                execution_time=time.time() - start_time
            )

        code_file = None
        try:
            # UTF-8 is Python's default source encoding; writing with the
            # platform default could corrupt non-ASCII characters in the code.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False,
                                             encoding='utf-8') as f:
                code_file = f.name  # recorded first so cleanup runs even if write fails
                f.write(code)

            try:
                result = subprocess.run(
                    [sys.executable, code_file],
                    capture_output=True,
                    text=True,
                    timeout=self.timeout,
                    # Child inherits the environment, with PYTHONPATH set to
                    # the directory containing this file
                    env={**os.environ, 'PYTHONPATH': str(Path(__file__).parent)}
                )
            except subprocess.TimeoutExpired:
                return ExecutionResult(
                    success=False,
                    output=None,
                    error=f"Execution timeout ({self.timeout}s)",
                    execution_time=self.timeout
                )

            execution_time = time.time() - start_time
            if result.returncode == 0:
                return ExecutionResult(
                    success=True,
                    output=result.stdout.strip(),
                    error=None,
                    execution_time=execution_time
                )
            return ExecutionResult(
                success=False,
                output=None,
                error=result.stderr.strip(),
                execution_time=execution_time
            )
        except Exception as e:
            # Boundary handler: report any unexpected failure as a result
            return ExecutionResult(
                success=False,
                output=None,
                error=str(e),
                execution_time=time.time() - start_time
            )
        finally:
            # Clean up temp file (best effort)
            if code_file is not None:
                try:
                    os.unlink(code_file)
                except OSError:
                    pass

    def solve_question(self, question: str, context: Optional[str] = None) -> Optional[str]:
        """
        Complete workflow: generate code, execute, return answer.

        Args:
            question: Question to solve
            context: Optional context

        Returns:
            Answer string, or None if execution failed or printed nothing
        """
        print(f" 🧮 CODE EXECUTION: {question[:60]}...")

        # Generate code
        code = self.generate_code(question, context)
        print(f" 📝 Generated code ({len(code)} chars)")

        # Execute code
        result = self.execute(code)
        if result.success and result.output:
            print(f" ✅ Execution successful: {result.output}")
            return result.output
        print(f" ❌ Execution failed: {result.error}")
        return None
if __name__ == "__main__":
    # Smoke test: run three sample questions through the executor
    # (no LLM client is supplied, so the rule-based generator is used).
    banner = "=" * 60
    print(banner)
    print("Code Executor Test")
    print(banner)

    executor = CodeExecutor()

    sample_questions = (
        "What is 123 * 456?",                          # simple arithmetic
        "What is the average of 10, 20, 30, 40, 50?",  # average
        "What is the square root of 144?",             # square root
    )
    for number, sample in enumerate(sample_questions, start=1):
        print(f"\nTest {number}: {sample}")
        answer = executor.solve_question(sample)
        print(f"Answer: {answer}")