Spaces:
Runtime error
Runtime error
import random
import sys

import numpy as np

from models import Action, Score  # pyre-ignore
from server.env import DebugOpsEnv  # pyre-ignore
def random_agent(obs):
    """Baseline agent that picks one of four action types uniformly at random.

    Args:
        obs: Current observation; only ``obs.visible_files`` is read.

    Returns:
        An ``Action``. For ``open_file`` / ``edit_file`` the target is a random
        visible file (or ``"unknown.py"`` when none are visible); for
        ``edit_file`` the content is a fixed one-line print statement. For the
        other action types both target and content are ``None``.
    """
    kind = random.choice(["analyze_logs", "run_tests", "open_file", "edit_file"])

    # Actions that do not operate on a file carry no target and no content.
    if kind not in ("open_file", "edit_file"):
        return Action(type=kind, target=None, content=None)

    files = obs.visible_files
    chosen = random.choice(files) if files else "unknown.py"

    # The payload is Python source text, so the backslash stays escaped.
    payload = "print('random edit\\n')" if kind == "edit_file" else None
    return Action(type=kind, target=chosen, content=payload)
def heuristic_agent(obs):
    """Rule-based baseline: read logs, guess a file, open it, patch it, run tests.

    The agent keeps one piece of cross-call state in the function attribute
    ``heuristic_agent.memory`` (a dict holding ``"edit_attempted"``), so the
    caller must delete that attribute between episodes.

    Args:
        obs: Current observation; ``obs.logs`` (a string, empty/falsy before
            logs were analyzed) and ``obs.visible_files`` are read.

    Returns:
        The next ``Action``: ``analyze_logs`` while there are no logs,
        ``open_file`` while the guessed file is not visible, a single
        ``edit_file`` with a fixed patch, and ``run_tests`` afterwards.
    """
    # 1. Analyze logs if not done.
    if not obs.logs:
        return Action(type="analyze_logs")

    logs_lower = obs.logs.lower()

    # 2. Extract potential file names from noisy logs. Punctuation is stripped
    #    BEFORE the suffix test so that tokens such as `"utils.py",` or
    #    `utils.py:` are recognised; filtering first would discard them.
    stripped_tokens = (word.strip(":,.'\"") for word in obs.logs.split())
    candidates = [token for token in stripped_tokens if token.endswith(".py")]

    # 3. Keyword fallbacks for when no file name could be parsed at all.
    if not candidates:
        if "discount" in logs_lower:
            candidates = ["utils.py"]
        elif "validator" in logs_lower:
            candidates = ["parser.py"]  # trap (original author's note)
        elif "service" in logs_lower:
            candidates = ["service.py"]
        elif "inconsistent" in logs_lower:
            candidates = ["api.py"]
        else:
            candidates = ["unknown.py"]

    # Misleading-attribution trap (original author's note: the heuristic agent
    # fails on medium). Any mention of "validator" forces validator.py first.
    if "validator" in logs_lower:
        candidates.insert(0, "validator.py")

    target_file = candidates[0]

    # 4. Open the file if it is not visible yet.
    if target_file not in obs.visible_files:
        return Action(type="open_file", target=target_file)

    # 5. Blind heuristic edit, attempted once per episode (assumes the right
    #    file was opened; otherwise the patch simply does not help).
    if "edit_attempted" not in getattr(heuristic_agent, "memory", {}):
        heuristic_agent.memory = {"edit_attempted": True}
        return Action(type="edit_file", target=target_file, content="heuristic patch")

    # 6. Run tests.
    return Action(type="run_tests")
def evaluate_agent(agent_name, agent_fn, split, seeds=5):
    """Run an agent over every difficulty and print one row of mean±std scores.

    For each difficulty, ``seeds`` episodes are run with environment seeds
    ``0 .. seeds-1``. Each episode is capped at 30 steps and its score is
    ``env.grade(trajectory).final()``.

    Args:
        agent_name: Label printed at the start of the row.
        agent_fn: Callable mapping an observation to an action.
        split: Split name forwarded to ``env.reset``.
        seeds: Number of episodes (seeds) per difficulty.

    Returns:
        None. The formatted row is printed to stdout.
    """
    difficulties = ["easy", "medium", "hard", "extreme"]
    results_str = f"{agent_name:10s} "

    for diff in difficulties:
        scores = []
        for seed in range(seeds):
            # Reset heuristic memory per episode.
            if hasattr(heuristic_agent, "memory"):
                delattr(heuristic_agent, "memory")

            env = DebugOpsEnv(seed=seed)
            try:
                obs = env.reset(difficulty=diff, split=split)
            except Exception as exc:
                # Best effort: a cell that cannot be set up still scores 0.0,
                # but the failure is reported so that a broken environment is
                # not mistaken for a poorly performing agent.
                print(
                    f"[warn] reset failed (split={split}, difficulty={diff}, "
                    f"seed={seed}): {exc!r}",
                    file=sys.stderr,
                )
                scores.append(0.0)
                continue

            done = False
            trajectory = []
            # Simple loop limit to prevent an endless episode if the agent
            # never reaches a terminal state.
            step_count = 0
            while not done and step_count < 30:
                step_count += 1
                action = agent_fn(obs)
                obs, _, done, _ = env.step(action)
                trajectory.append(action)

            scores.append(env.grade(trajectory).final())

        mean_score = np.mean(scores)
        std_score = np.std(scores)
        results_str += f"{mean_score:.2f}±{std_score:.2f} "

    print(results_str)
def main():
    """Print the benchmark table for the random and heuristic agents.

    One table is printed per split ("train", "test", "ood"), with one row per
    agent and five seeds per cell.
    """
    print("================================================================")
    print("DebugOps-RX (Realistic eXecution) - Formal Benchmark Evaluation")
    print("================================================================")
    splits = ["train", "test", "ood"]
    for split in splits:
        # A real newline separates the tables; the previous "\\n" printed the
        # two characters backslash + n instead of a blank line.
        print(f"\n[{split.upper()} SPLIT]")
        print("Agent Easy Medium Hard Extreme")
        print("-" * 64)
        evaluate_agent("Random", random_agent, split, seeds=5)
        evaluate_agent("Heuristic", heuristic_agent, split, seeds=5)
    print("\n\n(Evaluation uses 5 random seeds per cell to calculate mean±std)")


if __name__ == "__main__":
    main()