Spaces:

Krsnapriya
/

CodeReviewEnv-Elite

Runtime error

App Files Files Community

CodeReviewEnv-Elite / run_baseline.py

Krsnapriya

Upload folder using huggingface_hub

aa466c2 verified 2 months ago

raw

history blame contribute delete

4 kB

	import numpy as np
	import random
	from models import Action, Score # pyre-ignore
	from server.env import DebugOpsEnv # pyre-ignore


	def random_agent(obs):
	    """Baseline agent that ignores the task and acts at random.

	    One of the four action types is drawn uniformly. File-oriented actions
	    ("open_file" / "edit_file") additionally get a target drawn from
	    ``obs.visible_files`` (or the placeholder "unknown.py" when that list is
	    empty), and "edit_file" carries a fixed throwaway snippet as content.

	    Args:
	        obs: current observation; only ``obs.visible_files`` is read.

	    Returns:
	        An ``Action`` built with ``type``, ``target`` and ``content``.
	    """
	    kind = random.choice(["analyze_logs", "run_tests", "open_file", "edit_file"])

	    # Actions that do not touch a file carry neither a target nor content.
	    if kind not in ("open_file", "edit_file"):
	        return Action(type=kind, target=None, content=None)

	    files = obs.visible_files
	    path = random.choice(files) if files else "unknown.py"

	    # Only edits carry a payload; opening a file needs just the target.
	    payload = "print('random edit\\n')" if kind == "edit_file" else None

	    return Action(type=kind, target=path, content=payload)


	def heuristic_agent(obs):
	    """Rule-based baseline: read logs, guess a file, open it, patch once, test.

	    The decision ladder is evaluated top to bottom on every call:

	    1. No logs yet -> request log analysis.
	    2. Pick a target file from ``*.py`` tokens found in the logs, falling back
	       to keyword-based guesses when no such token exists.
	    3. Target not yet visible -> open it.
	    4. No edit attempted yet in this episode -> submit one blind patch.
	    5. Otherwise -> run the tests.

	    The "edit already attempted" flag is stored on the function object itself
	    as ``heuristic_agent.memory``; callers that run several episodes must
	    delete that attribute between episodes.

	    Args:
	        obs: current observation; ``obs.logs`` (used as a string) and
	            ``obs.visible_files`` are read.

	    Returns:
	        An ``Action`` describing the next step.
	    """
	    # 1. Analyze logs if not done
	    if not obs.logs:
	        return Action(type="analyze_logs")

	    logs_lower = obs.logs.lower()

	    # 2. Extract potential file names from noisy logs.
	    # Surrounding punctuation is stripped BEFORE the suffix test; testing the
	    # raw token first would reject tokens such as `utils.py:` or `"utils.py",`
	    # and make the trailing-punctuation strip a no-op.
	    stripped_tokens = (word.strip(":,.'\"") for word in obs.logs.split())
	    candidates = [token for token in stripped_tokens if token.endswith(".py")]

	    # 3. Fallbacks based on known targets if parsing fails entirely due to noise
	    if not candidates:
	        if "discount" in logs_lower:
	            candidates = ["utils.py"]
	        elif "validator" in logs_lower:
	            candidates = ["parser.py"]  # trap!
	        elif "service" in logs_lower:
	            candidates = ["service.py"]
	        elif "inconsistent" in logs_lower:
	            candidates = ["api.py"]
	        else:
	            candidates = ["unknown.py"]

	    # Misleading-attribution trap handling (heuristic agent fails on medium):
	    # any mention of "validator" forces validator.py to the front on purpose.
	    if "validator" in logs_lower:
	        candidates.insert(0, "validator.py")

	    target_file = candidates[0]

	    # 4. Open file if not visible
	    if target_file not in obs.visible_files:
	        return Action(type="open_file", target=target_file)

	    # 5. Blind heuristic edit (assuming we opened the right file, else it fails)
	    if "edit_attempted" not in getattr(heuristic_agent, "memory", {}):
	        heuristic_agent.memory = {"edit_attempted": True}
	        return Action(type="edit_file", target=target_file, content="heuristic patch")

	    # 6. Run tests
	    return Action(type="run_tests")


	def evaluate_agent(agent_name, agent_fn, split, seeds=5, max_steps=30):
	    """Run one agent on every difficulty of a split and print one table row.

	    For each difficulty, ``seeds`` episodes are played (episode ``i`` uses
	    ``DebugOpsEnv(seed=i)``). Each episode's action list is graded with
	    ``env.grade(trajectory).final()``, and the mean and standard deviation of
	    those scores are appended to the row as ``mean±std``.

	    Args:
	        agent_name: label printed in the first column (padded to 10 chars).
	        agent_fn: callable mapping an observation to an action.
	        split: split name forwarded to ``env.reset``.
	        seeds: number of episodes (seeds ``0 .. seeds-1``) per difficulty.
	        max_steps: cap on steps per episode, guarding against agents that
	            never finish. Defaults to 30.

	    Returns:
	        None. The row is printed to stdout; reset failures are reported on
	        stderr so the table itself stays intact.
	    """
	    import sys  # local import: only needed for the reset-failure warning

	    difficulties = ["easy", "medium", "hard", "extreme"]
	    results_str = f"{agent_name:10s} "

	    for diff in difficulties:
	        scores = []
	        for seed in range(seeds):
	            # Reset heuristic memory per episode
	            if hasattr(heuristic_agent, "memory"):
	                delattr(heuristic_agent, "memory")

	            env = DebugOpsEnv(seed=seed)
	            try:
	                obs = env.reset(difficulty=diff, split=split)
	            except Exception as exc:
	                # Best effort: a failed reset still counts as a zero score, but
	                # say so -- otherwise a broken environment is indistinguishable
	                # from an agent that genuinely scored 0.
	                print(
	                    f"[warn] reset failed for split={split!r} "
	                    f"difficulty={diff!r} seed={seed}: {exc!r}; scoring 0.0",
	                    file=sys.stderr,
	                )
	                scores.append(0.0)
	                continue

	            done = False
	            trajectory = []

	            # Simple loop limit to prevent infinity if agent bugs out
	            step_count = 0
	            while not done and step_count < max_steps:
	                step_count += 1
	                action = agent_fn(obs)
	                obs, reward, done, info = env.step(action)
	                trajectory.append(action)

	            score = env.grade(trajectory).final()
	            scores.append(score)

	        mean_score = np.mean(scores)
	        std_score = np.std(scores)
	        results_str += f"{mean_score:.2f}±{std_score:.2f} "

	    print(results_str)


	def main():
	    """Print the benchmark table for both baseline agents on every split.

	    For each of the "train", "test" and "ood" splits a header is printed,
	    followed by one row per agent produced by ``evaluate_agent`` with 5 seeds.
	    """
	    print("================================================================")
	    print("DebugOps-RX (Realistic eXecution) - Formal Benchmark Evaluation")
	    print("================================================================")

	    splits = ["train", "test", "ood"]

	    for split in splits:
	        # "\n" (not "\\n"): emit a real blank line before each split header
	        # instead of printing a literal backslash-n.
	        print(f"\n[{split.upper()} SPLIT]")
	        print("Agent Easy Medium Hard Extreme")
	        print("-" * 64)
	        evaluate_agent("Random", random_agent, split, seeds=5)
	        evaluate_agent("Heuristic", heuristic_agent, split, seeds=5)

	    print("\n\n(Evaluation uses 5 random seeds per cell to calculate mean±std)")


	if __name__ == "__main__":
	    main()