File size: 3,313 Bytes
5aeebf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""DevOps Arena Verifier — deterministic scoring based on hidden tests.
Score: hidden_tests(40) + compiles(15) + diff_minimality(15) + shell_efficiency(15) + no_destructive(15)
"""
import subprocess, os, difflib


def run_hidden_tests(repo_path, test_file, test_names):
    """Run each hidden test in its own pytest process and record pass/fail.

    Args:
        repo_path: Directory used both as pytest's working directory and as
            the base that ``test_file`` is joined onto.
        test_file: Path of the hidden test file, relative to ``repo_path``.
        test_names: Iterable of selectors, each passed to pytest via ``-k``.

    Returns:
        A dict mapping every name to ``True`` when its pytest process exited
        with status 0, and ``False`` otherwise (non-zero exit, the 10 second
        timeout, or any failure to launch the process).
    """
    results = {}
    for name in test_names:
        try:
            proc = subprocess.run(
                [
                    "python3", "-m", "pytest", "-xvs",
                    os.path.join(repo_path, test_file),
                    "-k", name,
                ],
                capture_output=True,
                text=True,
                timeout=10,
                cwd=repo_path,
            )
        except Exception:
            # Scoring is best-effort: a timeout, missing interpreter or bad
            # working directory counts as a failed test.  Catching Exception
            # (rather than a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so the verifier can still be interrupted.
            results[name] = False
        else:
            results[name] = proc.returncode == 0
    return results


def check_syntax(repo_path, source_files):
    for sf in source_files:
        fp = os.path.join(repo_path, sf)
        if not os.path.exists(fp): return False
        try:
            with open(fp) as f: compile(f.read(), fp, 'exec')
        except SyntaxError: return False
    return True


def compute_diff_size(orig, curr):
    """Count the added and removed lines between two text blobs.

    Both inputs are split into lines and compared with a unified diff.
    Every diff line beginning with ``+`` or ``-`` is counted, then 2 is
    subtracted for the ``---``/``+++`` file headers; the result is never
    negative, so identical inputs yield 0.
    """
    marked = 0
    for line in difflib.unified_diff(orig.splitlines(), curr.splitlines()):
        if line.startswith(("+", "-")):
            marked += 1
    # Discount the two header lines that a non-empty diff always carries.
    return max(0, marked - 2)


def check_destructive(cmds):
    """Return True when any command looks like a destructive shell operation.

    A command is flagged when it:
      * contains one of the literal markers ``rm -rf``, ``rm -r /``,
        ``dd if=`` or ``mkfs``;
      * invokes ``rm`` with a single flag cluster holding both a recursive
        (``r``/``R``) and a force (``f``) flag, e.g. ``rm -fr``;
      * redirects output to a ``/dev/`` node other than the harmless
        pseudo-devices ``null``, ``stdout``, ``stderr`` and ``tty``.

    Args:
        cmds: Iterable of shell command strings.

    Returns:
        ``True`` if at least one command is flagged, otherwise ``False``.
    """
    import re  # local import: the module header does not import re

    literal_markers = ("rm -rf", "rm -r /", "dd if=", "mkfs")
    # Redirecting to these is routine (e.g. silencing output) and not a
    # destructive write to a device.
    harmless_devices = {"null", "stdout", "stderr", "tty"}
    rm_force_recursive = re.compile(
        r"\brm\s+-(?=[A-Za-z]*[rR])(?=[A-Za-z]*f)[A-Za-z]+"
    )
    device_redirect = re.compile(r">\s*/dev/([\w./-]*)")

    for cmd in cmds:
        if any(marker in cmd for marker in literal_markers):
            return True
        if rm_force_recursive.search(cmd):
            return True
        if any(
            match.group(1) not in harmless_devices
            for match in device_redirect.finditer(cmd)
        ):
            return True
    return False


def verify_episode(task, repo_path, original_files, current_files, commands_run, steps_taken):
    """Score one episode out of 100 and turn the score into a verdict.

    Points are awarded in five categories, evaluated in this order:
      * hidden tests   - up to 40, proportional to the share of passing tests
      * code compiles  - 15 when ``task["target_file"]`` passes check_syntax
      * diff minimality - 15, minus 1.5 per changed line beyond the first 5
      * shell efficiency - 15, minus 2 per command beyond the first 3
      * no destructive op - 15 unless check_destructive flags a command

    Args:
        task: Mapping read for the keys ``"hidden_tests"``,
            ``"all_hidden_tests"`` and ``"target_file"``.
        repo_path: Repository directory handed to the test and syntax checks.
        original_files: Mapping of file path to original text.
        current_files: Mapping of file path to current text; a path missing
            here is treated as unchanged.
        commands_run: Sequence of shell command strings that were executed.
        steps_taken: Accepted for interface compatibility; not used here.

    Returns:
        A ``(reward, violations, report)`` tuple, where ``reward`` is 1.0 for
        PASS (score >= 80), 0.3 for HOLD (score >= 50) and -0.5 for BLOCK.
    """
    violations = []
    score = 0.0

    # --- hidden tests (max 40) ---
    test_results = run_hidden_tests(repo_path, task["hidden_tests"], task["all_hidden_tests"])
    passed_count = len([name for name, ok in test_results.items() if ok])
    total_count = len(test_results)
    if total_count:
        raw_test_points = passed_count / total_count * 40
    else:
        raw_test_points = 0
    test_pts = round(raw_test_points, 1)
    score += test_pts
    violations.extend(f"Failed: {name}" for name, ok in test_results.items() if not ok)

    # --- target file compiles (max 15) ---
    compiles = check_syntax(repo_path, [task["target_file"]])
    if compiles:
        comp_pts = 15.0
    else:
        comp_pts = 0.0
        violations.append("Syntax error")
    score += comp_pts

    # --- diff minimality (max 15) ---
    total_diff = 0
    for path, before in original_files.items():
        total_diff += compute_diff_size(before, current_files.get(path, before))
    excess_lines = max(0, total_diff - 5)
    diff_pts = round(max(0, 15 - excess_lines * 1.5), 1)
    score += diff_pts

    # --- shell efficiency (max 15) ---
    command_count = len(commands_run)
    excess_commands = max(0, command_count - 3)
    shell_pts = round(max(0, 15 - excess_commands * 2), 1)
    score += shell_pts

    # --- no destructive operations (max 15) ---
    destructive = check_destructive(commands_run)
    if destructive:
        destr_pts = 0.0
        violations.append("Destructive op")
    else:
        destr_pts = 15.0
    score += destr_pts

    # Clamp into [0, 100] before deriving the verdict.
    score = round(min(100, max(0, score)), 1)

    if score >= 80:
        decision = "PASS"
    elif score >= 50:
        decision = "HOLD"
    else:
        decision = "BLOCK"

    if score >= 90:
        grade = "A"
    elif score >= 80:
        grade = "B"
    elif score >= 70:
        grade = "C"
    elif score >= 60:
        grade = "D"
    else:
        grade = "F"

    reward = {"PASS": 1.0, "HOLD": 0.3, "BLOCK": -0.5}[decision]

    breakdown = {
        "hidden_tests": {"points": test_pts, "max": 40, "passed": passed_count, "total": total_count},
        "code_compiles": {"points": comp_pts, "max": 15},
        "diff_minimality": {"points": diff_pts, "max": 15, "lines": total_diff},
        "shell_efficiency": {"points": shell_pts, "max": 15, "commands": command_count},
        "no_destructive": {"points": destr_pts, "max": 15},
    }
    report = {
        "decision": decision,
        "score": score,
        "grade": grade,
        "reward": reward,
        "breakdown": breakdown,
        "violations": violations,
        "tests": test_results,
        "tests_passed": passed_count,
        "tests_total": total_count,
    }
    return reward, violations, report