"""Baseline agents and benchmark driver for the DebugOps-RX environment.

Defines a random agent and a log-parsing heuristic agent, and a script entry
point that prints a mean±std score table per split and difficulty.
"""

import random

import numpy as np

from models import Action, Score  # pyre-ignore
from server.env import DebugOpsEnv  # pyre-ignore

# Punctuation that may surround a file name inside a log token.
_TOKEN_PUNCTUATION = ":,.'\""

# Ordered (keyword, file) fallbacks used when no ``*.py`` token is found in
# the logs; the first keyword contained in the lower-cased logs wins.
_FALLBACK_TARGETS = (
    ("discount", "utils.py"),
    ("validator", "parser.py"),  # trap!
    ("service", "service.py"),
    ("inconsistent", "api.py"),
)

_DIFFICULTIES = ("easy", "medium", "hard", "extreme")


def random_agent(obs):
    """Return a uniformly random action for the given observation.

    For ``open_file``/``edit_file`` the target is a random entry of
    ``obs.visible_files`` (or ``"unknown.py"`` when that is empty); for
    ``edit_file`` a fixed dummy content string is attached.
    """
    actions = ["analyze_logs", "run_tests", "open_file", "edit_file"]
    action_type = random.choice(actions)

    target, content = None, None
    if action_type in ("open_file", "edit_file"):
        target = random.choice(obs.visible_files) if obs.visible_files else "unknown.py"
    if action_type == "edit_file":
        # The doubled backslash is deliberate: the written file should contain
        # the two characters ``\n`` inside its string literal, not a raw newline.
        content = "print('random edit\\n')"

    return Action(type=action_type, target=target, content=content)


def _extract_py_candidates(logs):
    """Return the ``*.py`` tokens found in *logs*, in order of appearance.

    Surrounding punctuation is stripped *before* the suffix test so that
    tokens such as ``"utils.py",`` or ``utils.py:`` are recognised, and a
    bare ``.py`` token is not turned into the junk candidate ``py``.
    """
    stripped_tokens = (token.strip(_TOKEN_PUNCTUATION) for token in logs.split())
    return [token for token in stripped_tokens if token.endswith(".py")]


def _fallback_target(logs_lower):
    """Map a keyword in the lower-cased logs to a file, else ``unknown.py``."""
    for keyword, filename in _FALLBACK_TARGETS:
        if keyword in logs_lower:
            return filename
    return "unknown.py"


def heuristic_agent(obs):
    """Pick the next action using simple log-parsing heuristics.

    Sequence: analyze logs until ``obs.logs`` is non-empty, choose a target
    file from the logs, open it if it is not in ``obs.visible_files``, attempt
    one edit per episode, then run tests.

    The "edit already attempted" flag is stored on the function object as
    ``heuristic_agent.memory``; callers must delete that attribute between
    episodes (``evaluate_agent`` does so).
    """
    # 1. Analyze logs if not done
    if not obs.logs:
        return Action(type="analyze_logs")

    logs_lower = obs.logs.lower()

    # 2. Extract potential file names from noisy logs
    candidates = _extract_py_candidates(obs.logs)

    # 3. Fallbacks based on known targets if parsing fails entirely due to noise
    if not candidates:
        candidates = [_fallback_target(logs_lower)]

    # Misleading attribution trap handling (heuristic agent fails on medium)
    if "validator" in logs_lower:
        candidates.insert(0, "validator.py")

    target_file = candidates[0]

    # 4. Open file if not visible
    if target_file not in obs.visible_files:
        return Action(type="open_file", target=target_file)

    # 5. Blind heuristic edit (assuming we opened the right file, else it fails)
    if "edit_attempted" not in getattr(heuristic_agent, "memory", {}):
        heuristic_agent.memory = {"edit_attempted": True}
        return Action(type="edit_file", target=target_file, content="heuristic patch")

    # 6. Run tests
    return Action(type="run_tests")


def evaluate_agent(agent_name, agent_fn, split, seeds=5, max_steps=30):
    """Run *agent_fn* on every difficulty of *split* and print one table row.

    Args:
        agent_name: Label printed in the first column (padded to 10 chars).
        agent_fn: Callable mapping an observation to an ``Action``.
        split: Split name passed to ``env.reset``.
        seeds: Number of episodes per difficulty; seeds are ``0..seeds-1``.
        max_steps: Per-episode step cap so a misbehaving agent cannot loop
            forever.

    An episode whose ``env.reset`` raises is scored 0.0 instead of aborting
    the whole evaluation.
    """
    results_str = f"{agent_name:10s} "

    for diff in _DIFFICULTIES:
        scores = []
        for seed in range(seeds):
            # Reset heuristic memory per episode
            if hasattr(heuristic_agent, "memory"):
                delattr(heuristic_agent, "memory")

            env = DebugOpsEnv(seed=seed)
            try:
                obs = env.reset(difficulty=diff, split=split)
            except Exception:
                # Best-effort: count a failed reset as a zero score.
                scores.append(0.0)
                continue

            done = False
            trajectory = []
            step_count = 0
            while not done and step_count < max_steps:
                step_count += 1
                action = agent_fn(obs)
                obs, _reward, done, _info = env.step(action)
                trajectory.append(action)

            scores.append(env.grade(trajectory).final())

        mean_score = np.mean(scores)
        std_score = np.std(scores)
        results_str += f"{mean_score:.2f}±{std_score:.2f} "

    print(results_str)


def main():
    """Print the benchmark table for the random and heuristic agents."""
    num_seeds = 5

    print("================================================================")
    print("DebugOps-RX (Realistic eXecution) - Formal Benchmark Evaluation")
    print("================================================================")

    for split in ("train", "test", "ood"):
        # A real newline here; the doubled backslash used to print a literal "\n".
        print(f"\n[{split.upper()} SPLIT]")
        print("Agent Easy Medium Hard Extreme")
        print("-" * 64)
        evaluate_agent("Random", random_agent, split, seeds=num_seeds)
        evaluate_agent("Heuristic", heuristic_agent, split, seeds=num_seeds)

    print(
        f"\n\n(Evaluation uses {num_seeds} random seeds per cell to calculate mean±std)"
    )


if __name__ == "__main__":
    main()