Spaces:

SidraMiconi
/

devops-arena

Sleeping

App Files Files Community

devops-arena / verifier.py

SidraMiconi

deploy DevOps Arena

5aeebf2 4 days ago

raw

history blame contribute delete

3.31 kB

	"""DevOps Arena Verifier — deterministic scoring based on hidden tests.
	Score: hidden_tests(40) + compiles(15) + diff_minimality(15) + shell_efficiency(15) + no_destructive(15)
	"""
	import subprocess, os, difflib


	def run_hidden_tests(repo_path, test_file, test_names):
	    """Run each hidden test in its own pytest process and record pass/fail.

	    Args:
	        repo_path: Directory of the repository under test; used as the
	            working directory of every pytest invocation.
	        test_file: Path of the hidden test file, joined onto ``repo_path``.
	        test_names: Iterable of test names, each passed to pytest's ``-k``
	            selector.

	    Returns:
	        dict mapping each test name to ``True`` when pytest exited with
	        status 0, and to ``False`` otherwise (non-zero exit, the 10 second
	        timeout expired, or the process could not be started).
	    """
	    # Resolve once to an absolute path: the child runs with cwd=repo_path, so
	    # a relative repo_path would otherwise be applied twice to the test file.
	    repo_root = os.path.abspath(repo_path)
	    test_path = os.path.join(repo_root, test_file)

	    results = {}
	    for name in test_names:
	        try:
	            proc = subprocess.run(
	                ["python3", "-m", "pytest", "-xvs", test_path, "-k", name],
	                capture_output=True, text=True, timeout=10, cwd=repo_root,
	            )
	        except (subprocess.SubprocessError, OSError, ValueError):
	            # Timeout, missing interpreter/cwd, undecodable output or unusable
	            # arguments: count the test as failed rather than aborting the
	            # whole run. KeyboardInterrupt/SystemExit are left to propagate.
	            results[name] = False
	        else:
	            results[name] = proc.returncode == 0
	    return results


	def check_syntax(repo_path, source_files):
	    """Report whether every listed source file can be read and compiled.

	    Args:
	        repo_path: Directory the file names are joined onto.
	        source_files: Iterable of file paths relative to ``repo_path``.

	    Returns:
	        ``True`` when every file compiles as Python source. ``False`` as soon
	        as one file is missing, cannot be read (for example it is a
	        directory), or fails to compile. An empty ``source_files`` yields
	        ``True``.
	    """
	    for rel_path in source_files:
	        full_path = os.path.join(repo_path, rel_path)
	        try:
	            # Read raw bytes so compile() performs its own encoding detection
	            # (BOM / coding cookie) instead of depending on the locale.
	            with open(full_path, "rb") as handle:
	                source = handle.read()
	            compile(source, full_path, "exec")
	        except (OSError, SyntaxError, ValueError):
	            # OSError: missing or unreadable path. ValueError: source that
	            # compile() rejects outright, e.g. embedded NUL bytes.
	            return False
	    return True


	def compute_diff_size(orig, curr):
	    """Count the added and removed lines between two text blobs.

	    Args:
	        orig: Original text.
	        curr: Current text.

	    Returns:
	        Number of ``+``/``-`` lines in the unified diff of the two texts
	        (split with ``str.splitlines``), excluding the two file-header lines.
	        Identical inputs give 0.
	    """
	    diff_lines = difflib.unified_diff(orig.splitlines(), curr.splitlines())
	    changed = sum(1 for line in diff_lines if line.startswith(("+", "-")))
	    # A non-empty unified diff opens with a '---' and a '+++' header line;
	    # both match the prefix test above, so take them back off. The max()
	    # keeps the empty-diff case at 0.
	    return max(0, changed - 2)


	def check_destructive(cmds):
	    """Return True when any command looks like a destructive shell operation.

	    A command is flagged when, after collapsing runs of whitespace, it
	    contains ``rm -rf``, ``rm -r /``, ``dd if=`` or ``mkfs``, or when it
	    redirects output into a ``/dev/`` path other than ``/dev/null``,
	    ``/dev/stdout`` or ``/dev/stderr``.

	    Args:
	        cmds: Iterable of shell command strings.

	    Returns:
	        ``True`` if at least one command matches, otherwise ``False``
	        (including for an empty iterable).
	    """
	    import re  # local import: the module header does not import re

	    markers = ("rm -rf", "rm -r /", "dd if=", "mkfs")
	    # Discarding output (> /dev/null) or writing to the standard streams is
	    # harmless; a redirect into any other device node is not. Whitespace
	    # after '>' is optional in shell syntax, so accept both spellings.
	    device_redirect = re.compile(r">\s*/dev/(?!(?:null|stdout|stderr)\b)")

	    for cmd in cmds:
	        # Collapse whitespace so "rm   -rf" is matched like "rm -rf".
	        normalized = " ".join(cmd.split())
	        if any(marker in normalized for marker in markers):
	            return True
	        if device_redirect.search(normalized):
	            return True
	    return False


	def verify_episode(task, repo_path, original_files, current_files, commands_run, steps_taken):
	    """Score one episode out of 100 and derive decision, grade and reward.

	    The score is the sum of five components: hidden tests (40), target file
	    compiles (15), diff minimality (15), shell efficiency (15) and absence of
	    destructive commands (15).

	    Args:
	        task: Mapping read for the keys ``"hidden_tests"``,
	            ``"all_hidden_tests"`` and ``"target_file"``.
	        repo_path: Repository directory handed to the test runner and the
	            syntax check.
	        original_files: Mapping of file path to original text.
	        current_files: Mapping of file path to current text; a path missing
	            here is treated as unchanged.
	        commands_run: Sequence of shell commands that were executed.
	        steps_taken: Accepted for interface compatibility; not used here.

	    Returns:
	        Tuple ``(reward, violations, report)`` where ``report`` is a dict with
	        the decision, score, grade, reward, per-component breakdown,
	        violations and per-test results.
	    """
	    violations = []

	    # Hidden tests: up to 40 points, proportional to the pass rate.
	    test_results = run_hidden_tests(repo_path, task["hidden_tests"], task["all_hidden_tests"])
	    passed_count = len([ok for ok in test_results.values() if ok])
	    total_count = len(test_results)
	    test_pts = round((passed_count / total_count * 40) if total_count else 0, 1)
	    violations.extend(f"Failed: {name}" for name, ok in test_results.items() if not ok)

	    # Target file must compile: all-or-nothing 15 points.
	    compiles = check_syntax(repo_path, [task["target_file"]])
	    if compiles:
	        comp_pts = 15.0
	    else:
	        comp_pts = 0.0
	        violations.append("Syntax error")

	    # Diff minimality: the first 5 changed lines are free, then 1.5 points
	    # are deducted per extra line.
	    total_diff = 0
	    for path, original_text in original_files.items():
	        total_diff += compute_diff_size(original_text, current_files.get(path, original_text))
	    extra_lines = max(0, total_diff - 5)
	    diff_pts = round(max(0, 15 - extra_lines * 1.5), 1)

	    # Shell efficiency: the first 3 commands are free, then 2 points are
	    # deducted per extra command.
	    command_count = len(commands_run)
	    extra_commands = max(0, command_count - 3)
	    shell_pts = round(max(0, 15 - extra_commands * 2), 1)

	    # Destructive commands forfeit the last 15 points.
	    destructive = check_destructive(commands_run)
	    if destructive:
	        destr_pts = 0.0
	        violations.append("Destructive op")
	    else:
	        destr_pts = 15.0

	    score = 0.0
	    for points in (test_pts, comp_pts, diff_pts, shell_pts, destr_pts):
	        score += points
	    score = round(min(100, max(0, score)), 1)

	    if score >= 80:
	        decision, reward = "PASS", 1.0
	    elif score >= 50:
	        decision, reward = "HOLD", 0.3
	    else:
	        decision, reward = "BLOCK", -0.5

	    grade = "F"
	    for threshold, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
	        if score >= threshold:
	            grade = letter
	            break

	    breakdown = {
	        "hidden_tests": {"points": test_pts, "max": 40, "passed": passed_count, "total": total_count},
	        "code_compiles": {"points": comp_pts, "max": 15},
	        "diff_minimality": {"points": diff_pts, "max": 15, "lines": total_diff},
	        "shell_efficiency": {"points": shell_pts, "max": 15, "commands": command_count},
	        "no_destructive": {"points": destr_pts, "max": 15},
	    }
	    report = {
	        "decision": decision, "score": score, "grade": grade, "reward": reward,
	        "breakdown": breakdown,
	        "violations": violations, "tests": test_results,
	        "tests_passed": passed_count, "tests_total": total_count,
	    }
	    return reward, violations, report