# CodeReviewEnv-Elite / run_baseline.py
# Krsnapriya: "Upload folder using huggingface_hub" (commit aa466c2, verified)
import numpy as np
import random
from models import Action, Score # pyre-ignore
from server.env import DebugOpsEnv # pyre-ignore
def random_agent(obs):
    """Baseline agent that picks one of four action types uniformly at random.

    ``open_file`` and ``edit_file`` receive a target drawn at random from
    ``obs.visible_files`` (``"unknown.py"`` when that is empty);
    ``edit_file`` additionally carries a fixed placeholder payload.
    All other action types are built with ``target`` and ``content`` set
    to ``None``.
    """
    kind = random.choice(["analyze_logs", "run_tests", "open_file", "edit_file"])

    # Log analysis and test runs need neither a target nor content.
    if kind not in ("open_file", "edit_file"):
        return Action(type=kind, target=None, content=None)

    chosen = random.choice(obs.visible_files) if obs.visible_files else "unknown.py"
    # The escaped "\\n" keeps the payload a single-line Python statement.
    payload = "print('random edit\\n')" if kind == "edit_file" else None
    return Action(type=kind, target=chosen, content=payload)
def heuristic_agent(obs):
    """Rule-based baseline: read logs, guess a file, open it, patch once, test.

    Decision order, one action per call:

    1. No logs yet -> ``analyze_logs``.
    2. Pick a target file from ``*.py`` tokens in the logs, falling back to
       keyword-based guesses when no such token is found.
    3. Target not in ``obs.visible_files`` -> ``open_file``.
    4. No edit attempted yet in this episode -> ``edit_file`` with a fixed
       placeholder patch.
    5. Otherwise -> ``run_tests``.

    The "edit attempted" flag is stored on the function object itself as
    ``heuristic_agent.memory``; the caller is expected to delete that
    attribute between episodes.
    """
    # 1. Analyze logs if not done
    if not obs.logs:
        return Action(type="analyze_logs")

    logs_lower = obs.logs.lower()

    # 2. Extract potential file names from noisy logs. Punctuation is stripped
    #    BEFORE the suffix test; the previous order dropped tokens such as
    #    "utils.py:" or "parser.py," because they do not end in ".py" until
    #    the trailing punctuation has been removed.
    stripped_tokens = (word.strip(":,.'\"") for word in obs.logs.split())
    candidates = [token for token in stripped_tokens if token.endswith(".py")]

    # 3. Fallbacks based on known targets if parsing fails entirely due to noise
    if not candidates:
        keyword_guesses = (
            ("discount", "utils.py"),
            ("validator", "parser.py"),  # trap!
            ("service", "service.py"),
            ("inconsistent", "api.py"),
        )
        guess = next(
            (name for keyword, name in keyword_guesses if keyword in logs_lower),
            "unknown.py",
        )
        candidates = [guess]

    # Misleading-attribution trap handling (heuristic agent fails on medium):
    # any mention of "validator" sends the agent to validator.py first.
    # NOTE(review): the flattened source does not show whether this check was
    # nested under the fallback branch; it is applied unconditionally here.
    if "validator" in logs_lower:
        candidates.insert(0, "validator.py")

    target_file = candidates[0]

    # 4. Open file if not visible
    if target_file not in obs.visible_files:
        return Action(type="open_file", target=target_file)

    # 5. Blind heuristic edit (assuming we opened the right file, else it fails)
    if "edit_attempted" not in getattr(heuristic_agent, "memory", {}):
        heuristic_agent.memory = {"edit_attempted": True}
        return Action(type="edit_file", target=target_file, content="heuristic patch")

    # 6. Run tests
    return Action(type="run_tests")
def evaluate_agent(agent_name, agent_fn, split, seeds=5, max_steps=30):
    """Run ``agent_fn`` on every difficulty and print one row of mean±std scores.

    For each difficulty in easy/medium/hard/extreme and each seed in
    ``range(seeds)``, a fresh ``DebugOpsEnv(seed=seed)`` is reset for the
    given ``split`` and stepped with the agent's actions until the
    environment reports ``done`` or ``max_steps`` actions have been taken.
    The episode score is ``env.grade(trajectory).final()``.

    Args:
        agent_name: Label printed at the start of the result row.
        agent_fn: Callable taking an observation and returning an action.
        split: Dataset split name forwarded to ``env.reset``.
        seeds: Number of seeds (``0 .. seeds-1``) evaluated per difficulty.
        max_steps: Per-episode cap on agent actions, guarding against agents
            that never reach a terminal state.

    Returns:
        None. The formatted row is written to stdout.
    """
    import sys  # local import: only needed for the reset-failure diagnostic

    difficulties = ["easy", "medium", "hard", "extreme"]
    results_str = f"{agent_name:10s} "

    for diff in difficulties:
        scores = []
        for seed in range(seeds):
            # Reset heuristic memory per episode
            if hasattr(heuristic_agent, "memory"):
                delattr(heuristic_agent, "memory")

            env = DebugOpsEnv(seed=seed)
            try:
                obs = env.reset(difficulty=diff, split=split)
            except Exception as exc:
                # Best-effort: the episode still counts as 0.0, but the failure
                # is reported so a broken environment is not mistaken for a
                # zero-scoring agent.
                print(
                    f"[warn] reset failed (difficulty={diff}, split={split}, "
                    f"seed={seed}): {exc!r}",
                    file=sys.stderr,
                )
                scores.append(0.0)
                continue

            trajectory = []
            # Bounded loop so a buggy agent cannot spin forever.
            for _ in range(max_steps):
                action = agent_fn(obs)
                obs, _reward, done, _info = env.step(action)
                trajectory.append(action)
                if done:
                    break

            scores.append(env.grade(trajectory).final())

        mean_score = np.mean(scores)
        std_score = np.std(scores)
        results_str += f"{mean_score:.2f}±{std_score:.2f} "

    print(results_str)
if __name__ == "__main__":
    # Script entry point: print a benchmark table for both baseline agents
    # on every split.
    num_seeds = 5  # seeds per (agent, difficulty) cell; also quoted in the footer

    print("================================================================")
    print("DebugOps-RX (Realistic eXecution) - Formal Benchmark Evaluation")
    print("================================================================")

    for split in ("train", "test", "ood"):
        # "\n" must be a real newline escape; the doubled backslash used
        # previously printed the two characters "\n" literally.
        print(f"\n[{split.upper()} SPLIT]")
        print("Agent Easy Medium Hard Extreme")
        print("-" * 64)
        evaluate_agent("Random", random_agent, split, seeds=num_seeds)
        evaluate_agent("Heuristic", heuristic_agent, split, seeds=num_seeds)

    print(f"\n\n(Evaluation uses {num_seeds} random seeds per cell to calculate mean±std)")