Spaces:
Sleeping
Sleeping
| """DevOps Arena Verifier — deterministic scoring based on hidden tests. | |
| Score: hidden_tests(40) + compiles(15) + diff_minimality(15) + shell_efficiency(15) + no_destructive(15) | |
| """ | |
| import subprocess, os, difflib | |
def run_hidden_tests(repo_path, test_file, test_names):
    """Run each hidden test by name with pytest and record whether it passed.

    One ``python3 -m pytest -xvs <repo_path>/<test_file> -k <name>``
    subprocess is started per entry of ``test_names``, with ``repo_path`` as
    the working directory and a 10 second timeout.

    Args:
        repo_path: Directory of the repository under test.
        test_file: Path of the hidden test file, joined onto ``repo_path``.
        test_names: Iterable of names handed to pytest's ``-k`` selector.

    Returns:
        Dict mapping each name to ``True`` when pytest exited with status 0,
        and ``False`` otherwise (test failure, timeout, undecodable output,
        or the process could not be started).
    """
    test_path = os.path.join(repo_path, test_file)
    results = {}
    for name in test_names:
        try:
            proc = subprocess.run(
                ["python3", "-m", "pytest", "-xvs", test_path, "-k", name],
                capture_output=True,
                text=True,
                timeout=10,
                cwd=repo_path,
            )
        except (subprocess.SubprocessError, OSError, ValueError):
            # A test that cannot be run counts as failed. Unlike a bare
            # ``except:`` this lets KeyboardInterrupt / SystemExit propagate,
            # so the verifier can still be interrupted.
            results[name] = False
        else:
            results[name] = proc.returncode == 0
    return results
def check_syntax(repo_path, source_files):
    """Return True when every listed file exists and compiles as Python.

    Args:
        repo_path: Directory the entries of ``source_files`` are joined onto.
        source_files: Iterable of file paths to check.

    Returns:
        ``False`` as soon as one file is missing, unreadable, or fails to
        compile; ``True`` otherwise (including for an empty list).
    """
    for rel_path in source_files:
        full_path = os.path.join(repo_path, rel_path)
        try:
            # Read bytes so compile() applies the file's own BOM / PEP 263
            # coding cookie instead of the locale's default text encoding.
            with open(full_path, "rb") as handle:
                compile(handle.read(), full_path, "exec")
        except (SyntaxError, ValueError, OSError):
            # SyntaxError: invalid code or undecodable source.
            # ValueError: e.g. null bytes in the source on older Pythons.
            # OSError: missing file, directory, or permission problem.
            return False
    return True
def compute_diff_size(orig, curr):
    """Count the added and removed lines between two versions of a text.

    Both strings are split into lines and compared with
    ``difflib.unified_diff``; the result is the number of ``+`` / ``-`` lines
    in that diff, not counting the two file-header lines.
    """
    marked = 0
    for line in difflib.unified_diff(orig.splitlines(), curr.splitlines()):
        if line.startswith(("+", "-")):
            marked += 1
    # The '---' and '+++' header lines match the prefix test too; discount
    # them, clamping at zero for the empty diff of identical inputs.
    return max(0, marked - 2)
def check_destructive(cmds):
    """Return True when any command contains a destructive shell pattern.

    A command is flagged when it contains one of the substrings ``rm -rf``
    (or the equivalent spellings ``rm -fr`` / ``rm -Rf`` / ``rm -fR``),
    ``rm -r /``, ``dd if=`` or ``mkfs``, or when it redirects output with
    ``> /dev/<name>`` to anything other than ``null``, ``stdout`` or
    ``stderr``.

    Args:
        cmds: Iterable of shell command strings.

    Returns:
        ``True`` if at least one command matches, ``False`` otherwise
        (including for an empty iterable).
    """
    import re  # function-scope import: the module's import line stays untouched

    markers = (
        "rm -rf", "rm -fr", "rm -Rf", "rm -fR",
        "rm -r /", "dd if=", "mkfs",
    )
    # Discarding output with `> /dev/null` (or sending it to stdout/stderr)
    # is harmless, so only redirects to other /dev entries are flagged.
    device_write = re.compile(r"> /dev/(?!(?:null|stdout|stderr)\b)")

    for cmd in cmds:
        if any(marker in cmd for marker in markers):
            return True
        if device_write.search(cmd) is not None:
            return True
    return False
def verify_episode(task, repo_path, original_files, current_files, commands_run, steps_taken):
    """Score one episode on five components and map the score to a reward.

    Components (maximum points): hidden tests passed (40), target file
    compiles (15), diff minimality (15), shell efficiency (15) and absence
    of destructive commands (15).

    Args:
        task: Mapping read for the keys ``"hidden_tests"``,
            ``"all_hidden_tests"`` and ``"target_file"``.
        repo_path: Repository directory handed to the test and syntax checks.
        original_files: Mapping of file path to original content.
        current_files: Mapping of file path to current content; a path
            missing here is compared against its own original content.
        commands_run: Sequence of shell command strings that were executed.
        steps_taken: Accepted but not used by this function.

    Returns:
        Tuple ``(reward, violations, report)``. ``reward`` is 1.0 for PASS
        (score >= 80), 0.3 for HOLD (score >= 50) and -0.5 for BLOCK.
        ``violations`` is a list of messages and ``report`` is a dict with
        the decision, score, grade and per-component breakdown.
    """
    problems = []

    # --- hidden tests: proportional share of 40 points -------------------
    outcomes = run_hidden_tests(repo_path, task["hidden_tests"], task["all_hidden_tests"])
    passed = sum(1 for ok in outcomes.values() if ok)
    total = len(outcomes)
    test_pts = round((passed / total * 40) if total else 0, 1)
    problems.extend(f"Failed: {name}" for name, ok in outcomes.items() if not ok)

    # --- target file compiles: all-or-nothing 15 points -------------------
    compiles = check_syntax(repo_path, [task["target_file"]])
    if compiles:
        comp_pts = 15.0
    else:
        comp_pts = 0.0
        problems.append("Syntax error")

    # --- diff minimality: 5 changed lines are free, then 1.5 points each --
    total_diff = 0
    for path, before in original_files.items():
        total_diff += compute_diff_size(before, current_files.get(path, before))
    diff_pts = round(max(0, 15 - max(0, total_diff - 5) * 1.5), 1)

    # --- shell efficiency: 3 commands are free, then 2 points each --------
    command_count = len(commands_run)
    shell_pts = round(max(0, 15 - max(0, command_count - 3) * 2), 1)

    # --- destructive commands: all-or-nothing 15 points -------------------
    destructive = check_destructive(commands_run)
    if destructive:
        destr_pts = 0.0
        problems.append("Destructive op")
    else:
        destr_pts = 15.0

    score = 0.0 + test_pts + comp_pts + diff_pts + shell_pts + destr_pts
    score = round(min(100, max(0, score)), 1)

    if score >= 80:
        decision, reward = "PASS", 1.0
    elif score >= 50:
        decision, reward = "HOLD", 0.3
    else:
        decision, reward = "BLOCK", -0.5

    if score >= 90:
        grade = "A"
    elif score >= 80:
        grade = "B"
    elif score >= 70:
        grade = "C"
    elif score >= 60:
        grade = "D"
    else:
        grade = "F"

    breakdown = {
        "hidden_tests": {"points": test_pts, "max": 40, "passed": passed, "total": total},
        "code_compiles": {"points": comp_pts, "max": 15},
        "diff_minimality": {"points": diff_pts, "max": 15, "lines": total_diff},
        "shell_efficiency": {"points": shell_pts, "max": 15, "commands": command_count},
        "no_destructive": {"points": destr_pts, "max": 15},
    }
    report = {
        "decision": decision, "score": score, "grade": grade, "reward": reward,
        "breakdown": breakdown,
        "violations": problems, "tests": outcomes, "tests_passed": passed, "tests_total": total,
    }
    return reward, problems, report