File size: 4,421 Bytes
ee2f27b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ea725
ee2f27b
26ea725
ee2f27b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ea725
 
 
ee2f27b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""
Exploit Test Script — proves specific attack vectors are blocked.
Designed as judge-facing evidence of reward system integrity.
"""
import requests, json, sys, os

# Root URL of the environment server that every request in this script targets.
BASE = "http://localhost:7860"

# Attack scenarios replayed against the server. Each entry names the task to
# reset, the actions to submit, the graded metric to read ("metric"), and the
# exclusive upper bound ("expect_score_below") that metric must stay under for
# the exploit to be reported as blocked.
EXPLOITS = [
    # One oversized-token inspection action, submitted five times.
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        "actions": 5 * [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": 1},
                "token_count": 100000,
            },
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    # The same code patch submitted three times on the cascade task.
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        "actions": 3 * [
            {
                "action_type": "patch_divergent_code",
                "parameters": {
                    "file": "model/transformer.py",
                    "fix_type": "synchronize_conditional",
                },
            },
        ],
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    # Forty inspection actions whose rank_id cycles through 0..7.
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": step_index % 8},
                "token_count": 120,
            }
            for step_index in range(40)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]


def run_exploit(exploit: dict) -> dict:
    """Replay one exploit scenario against the server and grade the outcome.

    The episode is reset for ``exploit["task"]`` with seed 42, every entry of
    ``exploit["actions"]`` is POSTed to ``/step``, and the ``/grade`` response
    is compared against ``exploit["expect_score_below"]``.

    Args:
        exploit: Scenario dict providing ``name``, ``description``, ``task``,
            ``actions``, ``expect_score_below`` and ``why_blocked``. The
            optional ``metric`` key (default ``"score"``) selects which graded
            value is compared with the threshold; ``"mer_score"`` reads
            ``breakdown.mer_score`` and falls back to the overall score.

    Returns:
        On success, a report dict with the keys ``name``, ``description``,
        ``metric``, ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked`` and ``verdict``.
        When the server cannot be reached, answers reset or grade with a
        non-200 status, or returns an unusable grade payload, a reduced dict
        ``{"name", "error", "blocked": False}`` is returned instead of
        raising, so one broken scenario does not abort the remaining ones.
    """

    def failure(reason: str) -> dict:
        # Same shape as the pre-existing "reset failed" result; callers read
        # the other report keys with .get().
        return {"name": exploit["name"], "error": reason, "blocked": False}

    try:
        reset_response = requests.post(
            f"{BASE}/reset",
            json={"task_id": exploit["task"], "seed": 42},
            timeout=10,
        )
    except requests.RequestException as exc:
        return failure(f"reset failed: {exc}")
    if reset_response.status_code != 200:
        return failure("reset failed")

    # Step status codes are not inspected; only transport-level failures
    # (connection refused, timeout, ...) abort the scenario.
    try:
        for action in exploit["actions"]:
            requests.post(f"{BASE}/step", json=action, timeout=10)
    except requests.RequestException as exc:
        return failure(f"step failed: {exc}")

    try:
        grade_response = requests.post(
            f"{BASE}/grade",
            json={"task_id": exploit["task"]},
            timeout=10,
        )
    except requests.RequestException as exc:
        return failure(f"grade failed: {exc}")
    if grade_response.status_code != 200:
        return failure(f"grade failed: HTTP {grade_response.status_code}")

    try:
        grade = grade_response.json()
    except (requests.RequestException, ValueError) as exc:
        # Depending on the installed requests version, a non-JSON body raises
        # either a requests exception or a plain ValueError subclass.
        return failure(f"grade failed: invalid JSON ({exc})")
    if not isinstance(grade, dict):
        return failure("grade failed: unexpected payload")

    metric = exploit.get("metric", "score")
    try:
        # A missing score defaults to 1.0 so that an incomplete grade can
        # never be mistaken for a blocked exploit.
        score = float(grade.get("score", 1.0))
        if metric == "mer_score":
            metric_value = float(
                grade.get("breakdown", {}).get("mer_score", grade.get("score", 1.0))
            )
        else:
            metric_value = score
    except (TypeError, ValueError, AttributeError):
        # e.g. "score": null, a non-numeric string, or "breakdown": null
        return failure("grade failed: non-numeric score")

    blocked = metric_value < exploit["expect_score_below"]

    return {
        "name": exploit["name"],
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": exploit["expect_score_below"],
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
    }


def main() -> None:
    """Run every scenario in ``EXPLOITS``, print a report and save it as JSON.

    For each scenario the name, description, measured metric, verdict and
    blocking rationale are printed; if the run produced an ``error`` entry its
    reason is printed as well. All per-scenario results are written to
    ``results/exploit_test.json`` (the directory is created if needed).

    The process always terminates via ``sys.exit``: status 0 when every
    exploit was blocked, status 1 otherwise.
    """
    print("NervousSystem-Env Exploit Test Suite")
    print("=" * 50)
    results = []
    for exploit in EXPLOITS:
        print(f"\n[{exploit['name']}]")
        print(f"  {exploit['description']}")
        result = run_exploit(exploit)
        results.append(result)

        # Error results carry no "metric_value", hence the N/A fallback.
        score_value = result.get("metric_value")
        score_text = "N/A" if score_value is None else f"{score_value:.3f}"

        print(
            f"  Score: {score_text} "
            f"(metric={result.get('metric', 'score')}, must be < {exploit['expect_score_below']})"
        )
        if "error" in result:
            # Surface why no score is available instead of only showing N/A.
            print(f"  Error: {result['error']}")
        print(f"  {result.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f"  Why: {exploit['why_blocked']}")

    # A result without a "blocked" flag counts as not blocked.
    all_blocked = all(result.get("blocked", False) for result in results)

    print(f"\n{'=' * 50}")
    print(f"RESULT: {'✅ ALL EXPLOITS BLOCKED' if all_blocked else '❌ SOME EXPLOITS NOT BLOCKED'}")

    os.makedirs("results", exist_ok=True)
    with open("results/exploit_test.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Saved to results/exploit_test.json")

    sys.exit(0 if all_blocked else 1)


if __name__ == "__main__":
    main()