# server/datasets/security_cases.py # Ground truth cases for MCP Security Sandbox tasks. # # FIXES APPLIED: # 1. sec_easy: CVSS ranges made tighter (no more [7.5, 9.8] that any model guesses) # Also: done_conditions min_actions stays at 1 (easy IS easy), but threshold raised # 2. sec_medium: done_conditions required_sequence MUST include all 3 actions # Previously 'min_actions': 3 but required_sequence only had 2 — so it ended at step 2 # 3. sec_hard: same fix — required_sequence enforces all 3 steps # 4. required_fix_tokens: added more specific tokens so propose_fix is actually tested # 5. current_feedback_keywords: more specific so revise_fix is harder SECURITY_CASES = { 'sec_easy': [ { 'case_id': 'sec_easy_001', 'task_subtype': 'single', 'completion_threshold': 0.75, # FIX: was 0.80 — harder threshold 'max_steps': 4, # FIX: required_sequence is the done trigger. Easy only needs 1 action. 'done_conditions': {'min_actions': 1, 'required_sequence': ['identify_vulnerability']}, 'expected_vuln_type': 'sql_injection', 'cvss_range': [8.5, 9.3], # FIX: was [7.5, 9.8] — much tighter range 'expected_severity': 'critical', # FIX: More specific tokens — model must include ALL of these 'required_fix_tokens': ['?', 'execute', 'param', 'parameterized'], 'must_preserve_identifier': 'execute_query', 'current_feedback_keywords': ['sql', 'injection', 'union'], 'original_vuln_pattern': "query + user_input", 'tool_call': 'db.execute("SELECT * FROM users WHERE name = \'" + user_input + "\'")', 'task_description': 'Identify the SQL injection vulnerability in this MCP database tool call. Specify vuln_type, cvss_score (float), severity, and affected_line.', }, { 'case_id': 'sec_easy_002', 'task_subtype': 'single', 'completion_threshold': 0.75, 'max_steps': 4, 'done_conditions': {'min_actions': 1, 'required_sequence': ['identify_vulnerability']}, 'expected_vuln_type': 'xss', 'cvss_range': [4.8, 5.8], # FIX: very tight — model must know XSS CVSS precisely 'expected_severity': 'medium', 'required_fix_tokens': ['escape', 'html', 'sanitize'], 'must_preserve_identifier': 'render_html', 'current_feedback_keywords': ['xss', 'script', 'encoding'], 'original_vuln_pattern': '