devops-arena / verifier.py
SidraMiconi's picture
deploy DevOps Arena
5aeebf2
"""DevOps Arena Verifier — deterministic scoring based on hidden tests.
Score: hidden_tests(40) + compiles(15) + diff_minimality(15) + shell_efficiency(15) + no_destructive(15)
"""
import subprocess, os, difflib
def run_hidden_tests(repo_path, test_file, test_names):
    """Run each hidden test by name with pytest and record pass/fail.

    Every name gets its own ``python3 -m pytest -xvs <test_file> -k <name>``
    subprocess, executed with ``repo_path`` as the working directory and a
    10 second timeout.

    Args:
        repo_path: Directory of the repository under test.
        test_file: Path of the hidden test file, joined onto ``repo_path``.
        test_names: Iterable of names handed to pytest's ``-k`` selector.

    Returns:
        Dict mapping each name to ``True`` when pytest exited with status 0,
        otherwise ``False`` (timeouts and launch failures included).
    """
    # The child process runs with cwd=repo_path, so a relative repo_path that
    # is also joined into the test path would be applied twice. Resolve it to
    # an absolute path once, up front.
    repo_root = os.path.abspath(repo_path)
    test_path = os.path.join(repo_root, test_file)

    results = {}
    for name in test_names:
        try:
            proc = subprocess.run(
                ["python3", "-m", "pytest", "-xvs", test_path, "-k", name],
                capture_output=True,
                text=True,
                timeout=10,
                cwd=repo_root,
            )
        except (subprocess.SubprocessError, OSError, ValueError):
            # Timeout, missing interpreter/cwd, or undecodable output: count
            # the test as failed. KeyboardInterrupt/SystemExit propagate.
            results[name] = False
        else:
            results[name] = proc.returncode == 0
    return results
def check_syntax(repo_path, source_files):
    """Return True when every listed file exists and compiles as Python.

    Args:
        repo_path: Directory the file paths are joined onto.
        source_files: Iterable of file paths relative to ``repo_path``.

    Returns:
        ``False`` as soon as one file is missing, unreadable, or fails to
        compile; ``True`` otherwise (including for an empty list).
    """
    for rel_path in source_files:
        full_path = os.path.join(repo_path, rel_path)
        try:
            # Read raw bytes and let compile() decode them, so the result does
            # not depend on the locale encoding and undecodable files count as
            # a compile failure instead of raising.
            with open(full_path, "rb") as handle:
                source = handle.read()
            compile(source, full_path, "exec")
        except (SyntaxError, ValueError, OSError):
            # SyntaxError: invalid code; ValueError: e.g. NUL bytes or decode
            # errors; OSError: missing file, directory, permission problem.
            return False
    return True
def compute_diff_size(orig, curr):
    """Count the added and removed lines between two text blobs.

    Args:
        orig: Original text.
        curr: Current text.

    Returns:
        Number of unified-diff lines that start with ``+`` or ``-``, minus
        two, floored at zero.
    """
    before = orig.splitlines()
    after = curr.splitlines()

    marked = 0
    for entry in difflib.unified_diff(before, after):
        if entry[:1] in ("+", "-"):
            marked += 1

    # A non-empty unified diff opens with the '---' and '+++' header lines,
    # which also match the prefix test; subtract them, never going below 0.
    return max(0, marked - 2)
def check_destructive(cmds):
    """Return True when any command looks like a destructive shell operation.

    A command is flagged when it contains one of the substrings ``rm -rf``,
    ``rm -r /``, ``dd if=`` or ``mkfs``, or when it redirects output into a
    path under ``/dev/`` other than the harmless pseudo-devices (``null``,
    ``zero``, ``full``, ``stdout``, ``stderr``, ``tty``, ``fd/N``).

    Args:
        cmds: Iterable of shell command strings.

    Returns:
        ``True`` if at least one command matches, else ``False``.
    """
    import re  # function-scope import keeps this block self-contained

    blunt_markers = ("rm -rf", "rm -r /", "dd if=", "mkfs")
    # Redirects such as `2> /dev/null` are routine and must not be penalised;
    # only writes to other /dev nodes are treated as destructive. Whitespace
    # after '>' is optional so `>/dev/sda` is caught as well.
    device_write = re.compile(
        r">\s*/dev/(?!(?:null|zero|full|stdout|stderr|tty|fd)\b)"
    )

    for cmd in cmds:
        if any(marker in cmd for marker in blunt_markers):
            return True
        if device_write.search(cmd):
            return True
    return False
def verify_episode(task, repo_path, original_files, current_files, commands_run, steps_taken):
    """Score one episode out of 100 and derive decision, grade and reward.

    Points: hidden tests (40, proportional to the pass rate), target file
    compiles (15), diff minimality (15, minus 1.5 per changed line beyond 5),
    shell efficiency (15, minus 2 per command beyond 3) and no destructive
    command (15).

    Args:
        task: Mapping read for the keys ``hidden_tests``, ``all_hidden_tests``
            and ``target_file``.
        repo_path: Directory of the repository under test.
        original_files: Mapping of file path to original text.
        current_files: Mapping of file path to current text; a path missing
            here is compared against its own original text.
        commands_run: Sequence of shell command strings that were executed.
        steps_taken: Accepted but not used by the scoring.

    Returns:
        Tuple ``(reward, violations, report)`` where ``report`` is a dict
        holding the decision, score, grade, reward, per-category breakdown,
        violations and per-test results.
    """
    # Hidden tests: up to 40 points.
    test_results = run_hidden_tests(repo_path, task["hidden_tests"], task["all_hidden_tests"])
    tests_passed = len([ok for ok in test_results.values() if ok])
    tests_total = len(test_results)
    if tests_total:
        hidden_points = round(tests_passed / tests_total * 40, 1)
    else:
        hidden_points = round(0, 1)
    violations = [f"Failed: {name}" for name, ok in test_results.items() if not ok]

    # Target file compiles: 15 points, all or nothing.
    compiles = check_syntax(repo_path, [task["target_file"]])
    if compiles:
        compile_points = 15.0
    else:
        compile_points = 0.0
        violations.append("Syntax error")

    # Diff minimality: the first 5 changed lines are free.
    changed_lines = 0
    for path, before in original_files.items():
        changed_lines += compute_diff_size(before, current_files.get(path, before))
    diff_points = round(max(0, 15 - max(0, changed_lines - 5) * 1.5), 1)

    # Shell efficiency: the first 3 commands are free.
    command_count = len(commands_run)
    shell_points = round(max(0, 15 - max(0, command_count - 3) * 2), 1)

    # No destructive command: 15 points, all or nothing.
    destructive = check_destructive(commands_run)
    if destructive:
        destructive_points = 0.0
        violations.append("Destructive op")
    else:
        destructive_points = 15.0

    raw_total = 0.0 + hidden_points + compile_points + diff_points + shell_points + destructive_points
    score = round(min(100, max(0, raw_total)), 1)

    if score >= 80:
        decision, reward = "PASS", 1.0
    elif score >= 50:
        decision, reward = "HOLD", 0.3
    else:
        decision, reward = "BLOCK", -0.5

    grade = "F"
    for floor, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
        if score >= floor:
            grade = letter
            break

    breakdown = {
        "hidden_tests": {"points": hidden_points, "max": 40, "passed": tests_passed, "total": tests_total},
        "code_compiles": {"points": compile_points, "max": 15},
        "diff_minimality": {"points": diff_points, "max": 15, "lines": changed_lines},
        "shell_efficiency": {"points": shell_points, "max": 15, "commands": command_count},
        "no_destructive": {"points": destructive_points, "max": 15},
    }
    report = {
        "decision": decision,
        "score": score,
        "grade": grade,
        "reward": reward,
        "breakdown": breakdown,
        "violations": violations,
        "tests": test_results,
        "tests_passed": tests_passed,
        "tests_total": tests_total,
    }
    return reward, violations, report