File size: 6,770 Bytes
4ec75cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6284048
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ec75cf
 
 
 
 
 
 
 
 
 
 
6284048
4ec75cf
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# tests/test_grader_variance.py
# Phase 2 of judging runs a variance check. If all graders return the same score
# for different quality answers, the submission is DISQUALIFIED.
# Run: python -m pytest tests/test_grader_variance.py -v

import sys
sys.path.insert(0, '.')

from server.graders.base_grader import safe_score
from server.graders.security_grader import compute_correctness as sec_cc
from server.graders.dependency_grader import compute_correctness as dep_cc
from server.graders.clinical_grader import compute_correctness as cli_cc


# ── Security Case for Testing ──
# Minimal ground-truth fixture for the security grader. Only fields that
# sec_cc reads are populated — presumably real cases carry more; TODO confirm.
SEC_CASE = {
    'expected_vuln_type': 'sql_injection',            # ground-truth vulnerability class
    'cvss_range': [7.5, 9.8],                         # accepted CVSS score window
    'expected_severity': 'critical',                  # ground-truth severity label
    'required_fix_tokens': ['?', 'parameterized'],    # tokens a proposed fix must contain
    'current_feedback_keywords': ['sql', 'injection'],
    'original_vuln_pattern': 'query+',
}


def test_sec_identify_variance():
    """Security grader must return 3+ different scores for different quality answers."""
    # Three submissions of strictly descending quality against the same case.
    perfect = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'sql_injection',
        'cvss_score': 8.5,
        'severity': 'critical',
        'affected_line': 1,
    }
    # Wrong vuln_type, but CVSS and severity still match the case.
    partial = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',
        'cvss_score': 8.5,
        'severity': 'critical',
        'affected_line': 1,
    }
    # Wrong on every graded dimension.
    wrong = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',
        'cvss_score': 2.0,
        'severity': 'low',
        'affected_line': 1,
    }

    s1, s2, s3 = (safe_score(sec_cc(answer, SEC_CASE))
                  for answer in (perfect, partial, wrong))

    distinct = {round(score, 2) for score in (s1, s2, s3)}
    assert len(distinct) >= 3, f'No variance: {s1},{s2},{s3}'
    assert s1 > s2 > s3, f'Wrong ordering: {s1},{s2},{s3}'
    print(f'  Security identify variance: {s1:.4f} > {s2:.4f} > {s3:.4f} PASS')


def test_dep_resolve_variance():
    """Dependency grader must return different scores for different quality answers."""
    # torch 2.1.0 pins numpy>=1.24; the stated requirements are both older.
    case = {
        'conflict_packages': ['torch', 'numpy'],
        'compatibility_matrix': {
            'torch': {'2.1.0': {'numpy': '>=1.24'}, '1.9.0': {}},
            'numpy': {'1.24.0': {}, '1.16.0': {}},
        },
        'requirements': {'torch': '1.9.0', 'numpy': '1.16.0'},
    }

    def resolve(pkgs):
        # Wrap a package proposal in a resolve_conflict answer and score it.
        answer = {'action_type': 'resolve_conflict', 'packages': pkgs, 'reasoning': 'ok'}
        return safe_score(dep_cc(answer, case))

    s1 = resolve({'torch': '2.1.0', 'numpy': '1.24.0'})  # fully compatible upgrade
    s2 = resolve({'torch': '2.1.0', 'numpy': '1.16.0'})  # violates torch's numpy pin
    s3 = resolve({})                                     # no proposal at all

    assert s1 > s2 >= s3, f'No variance: {s1},{s2},{s3}'
    print(f'  Dependency resolve variance: {s1:.4f} > {s2:.4f} >= {s3:.4f} PASS')


def test_cli_order_variance():
    """Clinical grader must return different scores for correct vs violated dependency order."""
    # Dependency chain: insurance before pre-op, both before surgery.
    case = {
        'dependency_graph': {
            'schedule_surgery': ['resolve_insurance', 'complete_pre_op'],
            'complete_pre_op': ['resolve_insurance'],
            'resolve_insurance': [],
        },
        'required_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
    }

    def order(steps):
        # Score an order_steps answer whose recovery_steps is `steps`.
        answer = {'action_type': 'order_steps', 'recovery_steps': steps}
        return safe_score(cli_cc(answer, case))

    s1 = order(['resolve_insurance', 'complete_pre_op', 'schedule_surgery'])  # valid topological order
    s2 = order(['schedule_surgery', 'complete_pre_op', 'resolve_insurance'])  # reversed: violates deps
    s3 = order(['resolve_insurance', 'complete_pre_op'])                      # valid but incomplete

    assert s1 > s2, f'Violation not penalised: correct={s1}, violated={s2}'
    assert s1 > s3, f'Completeness not rewarded: correct={s1}, partial={s3}'
    print(f'  Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')


def test_safe_score_clamp():
    """
    safe_score clamps to [0.01, 0.99] — strictly between 0 and 1.

    WHY 0.01 not 0.0:  The official spec says scores must be strictly > 0.
                        A score of 0.0 from a crashed run looks indistinguishable
                        from a broken environment. 0.01 signals "ran but failed".

    WHY 0.99 not 1.0:  A score of exactly 1.0 means the grader is trivially solved
                        or broken. 0.99 signals "excellent but not perfect".
    """
    # Floor: None, negative, and non-numeric types all collapse to 0.01.
    assert safe_score(None)   == 0.01, f"Expected 0.01, got {safe_score(None)}"
    assert safe_score(-0.5)   == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
    assert safe_score(-999)   == 0.01, f"Expected 0.01, got {safe_score(-999)}"
    assert safe_score('bad')  == 0.01, f"Expected 0.01, got {safe_score('bad')}"
    assert safe_score([])     == 0.01, f"Expected 0.01, got {safe_score([])}"

    # Ceiling: anything above 1 collapses to 0.99.
    assert safe_score(1.5)    == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
    assert safe_score(2.0)    == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
    assert safe_score(100)    == 0.99, f"Expected 0.99, got {safe_score(100)}"

    # Exact boundary values are fixed points of the clamp.
    assert safe_score(0.01)   == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
    assert safe_score(0.99)   == 0.99, f"Expected 0.99, got {safe_score(0.99)}"

    # In-range values pass through unchanged; values just outside
    # [0.01, 0.99] (0.0001, 0.9999 below) are still clamped to the boundary.
    assert safe_score(0.5)    == 0.5,  f"Expected 0.5, got {safe_score(0.5)}"
    assert safe_score(0.85)   == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
    assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
    assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"

    print('  safe_score clamp [0.01, 0.99]: PASS')


def test_clinical_valid_actions():
    """Bug 2 fix: propose_recovery must NOT be in clinical VALID_ACTIONS."""
    from server.graders.clinical_grader import VALID_ACTIONS

    actions = set(VALID_ACTIONS)
    assert 'propose_recovery' not in actions, 'Bug 2 still present!'
    assert actions == {'detect_gap', 'rank_issues', 'order_steps'}
    print('  Clinical VALID_ACTIONS (Bug 2): PASS')


if __name__ == '__main__':
    # Run cheap structural checks first, then the variance suites.
    # NOTE: the original success banner contained mojibake ('βœ…', a cp1253
    # mis-decoding of the UTF-8 check-mark emoji); fixed to the intended '✅'.
    for check in (
        test_safe_score_clamp,
        test_clinical_valid_actions,
        test_sec_identify_variance,
        test_dep_resolve_variance,
        test_cli_order_variance,
    ):
        check()
    print('\nALL VARIANCE TESTS PASSED ✅')