# tests/test_grader_variance.py
# Phase 2 of judging runs a variance check. If all graders return the same score
# for different quality answers, the submission is DISQUALIFIED.
# Run: python -m pytest tests/test_grader_variance.py -v
import sys
sys.path.insert(0, '.')
from server.graders.base_grader import safe_score
from server.graders.security_grader import compute_correctness as sec_cc
from server.graders.dependency_grader import compute_correctness as dep_cc
from server.graders.clinical_grader import compute_correctness as cli_cc
# -- Security-grader test fixture: one SQL-injection case with its expected answers --
SEC_CASE = dict(
    expected_vuln_type='sql_injection',
    cvss_range=[7.5, 9.8],
    expected_severity='critical',
    required_fix_tokens=['?', 'parameterized'],
    current_feedback_keywords=['sql', 'injection'],
    original_vuln_pattern='query+',
)
def test_sec_identify_variance():
    """Security grader must return 3+ different scores for different quality answers."""
    # Fully correct identification of the fixture's vulnerability.
    full_ans = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'sql_injection',
        'cvss_score': 8.5,
        'severity': 'critical',
        'affected_line': 1,
    }
    # Wrong vuln_type, but CVSS and severity still correct.
    half_ans = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',
        'cvss_score': 8.5,
        'severity': 'critical',
        'affected_line': 1,
    }
    # Wrong on every graded dimension.
    bad_ans = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',
        'cvss_score': 2.0,
        'severity': 'low',
        'affected_line': 1,
    }
    s1, s2, s3 = (safe_score(sec_cc(ans, SEC_CASE)) for ans in (full_ans, half_ans, bad_ans))
    # Three distinct (to 2 dp) scores, strictly ordered best -> worst.
    assert len({round(s, 2) for s in (s1, s2, s3)}) >= 3, f'No variance: {s1},{s2},{s3}'
    assert s1 > s2 > s3, f'Wrong ordering: {s1},{s2},{s3}'
    print(f' Security identify variance: {s1:.4f} > {s2:.4f} > {s3:.4f} PASS')
def test_dep_resolve_variance():
    """Dependency grader must return different scores for different quality answers."""
    dep_case = {
        'conflict_packages': ['torch', 'numpy'],
        'compatibility_matrix': {
            'torch': {'2.1.0': {'numpy': '>=1.24'}, '1.9.0': {}},
            'numpy': {'1.24.0': {}, '1.16.0': {}},
        },
        'requirements': {'torch': '1.9.0', 'numpy': '1.16.0'},
    }
    # Proposals in descending quality: both packages resolved, one resolved, none given.
    proposals = (
        {'action_type': 'resolve_conflict', 'packages': {'torch': '2.1.0', 'numpy': '1.24.0'}, 'reasoning': 'ok'},
        {'action_type': 'resolve_conflict', 'packages': {'torch': '2.1.0', 'numpy': '1.16.0'}, 'reasoning': 'ok'},
        {'action_type': 'resolve_conflict', 'packages': {}, 'reasoning': 'ok'},
    )
    s1, s2, s3 = (safe_score(dep_cc(p, dep_case)) for p in proposals)
    assert s1 > s2 >= s3, f'No variance: {s1},{s2},{s3}'
    print(f' Dependency resolve variance: {s1:.4f} > {s2:.4f} >= {s3:.4f} PASS')
def test_cli_order_variance():
    """Clinical grader must return different scores for correct vs violated dependency order."""
    order_case = {
        'dependency_graph': {
            'schedule_surgery': ['resolve_insurance', 'complete_pre_op'],
            'complete_pre_op': ['resolve_insurance'],
            'resolve_insurance': [],
        },
        'required_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
    }

    def order_answer(steps):
        # Helper: wrap a step sequence in the grader's expected action shape.
        return {'action_type': 'order_steps', 'recovery_steps': list(steps)}

    # Correct topological order, reversed (violated) order, and an incomplete answer.
    s1 = safe_score(cli_cc(order_answer(['resolve_insurance', 'complete_pre_op', 'schedule_surgery']), order_case))
    s2 = safe_score(cli_cc(order_answer(['schedule_surgery', 'complete_pre_op', 'resolve_insurance']), order_case))
    s3 = safe_score(cli_cc(order_answer(['resolve_insurance', 'complete_pre_op']), order_case))
    assert s1 > s2, f'Violation not penalised: correct={s1}, violated={s2}'
    assert s1 > s3, f'Completeness not rewarded: correct={s1}, partial={s3}'
    print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')
def test_safe_score_clamp():
"""
safe_score clamps to [0.01, 0.99] β strictly between 0 and 1.
WHY 0.01 not 0.0: The official spec says scores must be strictly > 0.
A score of 0.0 from a crashed run looks indistinguishable
from a broken environment. 0.01 signals "ran but failed".
WHY 0.99 not 1.0: A score of exactly 1.0 means the grader is trivially solved
or broken. 0.99 signals "excellent but not perfect".
"""
# Floor: None, negative, bad types β 0.01
assert safe_score(None) == 0.01, f"Expected 0.01, got {safe_score(None)}"
assert safe_score(-0.5) == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
assert safe_score(-999) == 0.01, f"Expected 0.01, got {safe_score(-999)}"
assert safe_score('bad') == 0.01, f"Expected 0.01, got {safe_score('bad')}"
assert safe_score([]) == 0.01, f"Expected 0.01, got {safe_score([])}"
# Ceiling: values > 1 β 0.99
assert safe_score(1.5) == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
assert safe_score(2.0) == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
assert safe_score(100) == 0.99, f"Expected 0.99, got {safe_score(100)}"
# Exact boundary values
assert safe_score(0.01) == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
assert safe_score(0.99) == 0.99, f"Expected 0.99, got {safe_score(0.99)}"
# Pass-through: normal values in range stay unchanged
assert safe_score(0.5) == 0.5, f"Expected 0.5, got {safe_score(0.5)}"
assert safe_score(0.85) == 0.85, f"Expected 0.85, got {safe_score(0.85)}"
assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
print(' safe_score clamp [0.01, 0.99]: PASS')
def test_clinical_valid_actions():
"""Bug 2 fix: propose_recovery must NOT be in clinical VALID_ACTIONS."""
from server.graders.clinical_grader import VALID_ACTIONS
assert 'propose_recovery' not in VALID_ACTIONS, 'Bug 2 still present!'
assert set(VALID_ACTIONS) == {'detect_gap', 'rank_issues', 'order_steps'}
print(' Clinical VALID_ACTIONS (Bug 2): PASS')
if __name__ == '__main__':
    # Allow running the suite directly (without pytest) as a quick smoke check.
    # Fix: the final print's string literal was split across a physical newline
    # (an unterminated single-quoted string -> SyntaxError) and contained a
    # mojibake character; it is now a single valid line.
    test_safe_score_clamp()
    test_clinical_valid_actions()
    test_sec_identify_variance()
    test_dep_resolve_variance()
    test_cli_order_variance()
    print('\nALL VARIANCE TESTS PASSED')