#!/usr/bin/env python3
"""
Exploit Test Script — proves specific attack vectors are blocked.
Designed as judge-facing evidence of reward system integrity.

Each entry in ``EXPLOITS`` is replayed against the environment server at
``BASE`` (reset -> step ... -> grade).  An exploit counts as blocked when the
graded metric is strictly below its ``expect_score_below`` threshold.  The
per-exploit results are written to ``results/exploit_test.json`` and the
process exits with status 0 only if every exploit was blocked.
"""

import json
import os
import sys

import requests

BASE = "http://localhost:7860"

# Timeout, in seconds, applied to every HTTP request sent to the server.
REQUEST_TIMEOUT = 10

# Exploit scenarios.  Keys read by this script:
#   name / description / why_blocked -- text echoed in the report
#   task                             -- sent as "task_id" to /reset and /grade
#   actions                          -- JSON bodies posted to /step, in order
#   metric                           -- "score" or "mer_score" (see run_exploit)
#   expect_score_below               -- metric must be strictly below this
EXPLOITS = [
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": 1},
                "token_count": 100000,
            },
        ] * 5,
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        "actions": [
            {
                "action_type": "patch_divergent_code",
                "parameters": {
                    "file": "model/transformer.py",
                    "fix_type": "synchronize_conditional",
                },
            },
        ] * 3,
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": i % 8},
                "token_count": 120,
            }
            for i in range(40)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]


def _failure(exploit: dict, reason: str) -> dict:
    """Return the result for an exploit whose run could not be completed.

    The result carries no ``metric_value``; ``main`` prints "N/A" for it and
    counts the exploit as not blocked, so a broken run can never pass.
    """
    return {"name": exploit["name"], "error": reason, "blocked": False}


def _metric_value(grade: dict, metric: str) -> float:
    """Extract the metric to compare against the threshold from a grade payload.

    For ``"mer_score"`` the value is read from ``grade["breakdown"]`` and falls
    back to the top-level score; any other metric uses the top-level score.
    A missing score defaults to 1.0, i.e. "not blocked".

    Raises:
        TypeError, ValueError: if the selected value is not convertible to float.
    """
    score = grade.get("score", 1.0)
    if metric != "mer_score":
        return float(score)
    breakdown = grade.get("breakdown")
    if not isinstance(breakdown, dict):
        # Absent or null breakdown: fall back to the top-level score.
        breakdown = {}
    return float(breakdown.get("mer_score", score))


def run_exploit(exploit: dict) -> dict:
    """Replay one exploit against the server and report whether it was blocked.

    Args:
        exploit: One entry of ``EXPLOITS``.

    Returns:
        On success, a dict with the keys ``name``, ``description``, ``metric``,
        ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked`` and ``verdict``.  If the run cannot be completed
        (transport error, failed reset, unusable grade response) a dict with
        only ``name``, ``error`` and ``blocked=False`` is returned instead of
        raising, so the remaining exploits still run.
    """
    task = exploit["task"]
    try:
        reset = requests.post(
            f"{BASE}/reset",
            json={"task_id": task, "seed": 42},
            timeout=REQUEST_TIMEOUT,
        )
        if reset.status_code != 200:
            return _failure(exploit, "reset failed")

        for action in exploit["actions"]:
            # Step responses are intentionally not inspected: a rejected action
            # may be how the exploit is stopped; only the final grade is judged.
            requests.post(f"{BASE}/step", json=action, timeout=REQUEST_TIMEOUT)

        graded = requests.post(
            f"{BASE}/grade",
            json={"task_id": task},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        return _failure(exploit, f"request failed: {exc}")

    if not graded.ok:
        # An error response is not evidence of anything; do not read a score from it.
        return _failure(exploit, f"grade failed (HTTP {graded.status_code})")
    try:
        grade = graded.json()
    except ValueError:
        return _failure(exploit, "grade returned invalid JSON")
    if not isinstance(grade, dict):
        return _failure(exploit, "grade returned an unexpected payload")

    metric = exploit.get("metric", "score")
    try:
        score = float(grade.get("score", 1.0))
        metric_value = _metric_value(grade, metric)
    except (TypeError, ValueError):
        return _failure(exploit, "grade returned a non-numeric score")

    threshold = exploit["expect_score_below"]
    blocked = metric_value < threshold
    return {
        "name": exploit["name"],
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": threshold,
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
    }


def main() -> None:
    """Run every exploit, print a report, save the results and set the exit code.

    Writes ``results/exploit_test.json`` (creating the directory if needed) and
    exits with status 0 when all exploits were blocked, 1 otherwise.
    """
    print("NervousSystem-Env Exploit Test Suite")
    print("=" * 50)

    results = []
    for exploit in EXPLOITS:
        print(f"\n[{exploit['name']}]")
        print(f" {exploit['description']}")

        result = run_exploit(exploit)
        results.append(result)

        # Error results have no metric value to show.
        score_value = result.get("metric_value")
        score_text = "N/A" if score_value is None else f"{score_value:.3f}"
        print(
            f" Score: {score_text} "
            f"(metric={result.get('metric', 'score')}, must be < {exploit['expect_score_below']})"
        )
        print(f" {result.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f" Why: {exploit['why_blocked']}")
        if "error" in result:
            print(f" Error: {result['error']}")

    all_blocked = all(result.get("blocked", False) for result in results)

    print(f"\n{'=' * 50}")
    print(f"RESULT: {'✅ ALL EXPLOITS BLOCKED' if all_blocked else '❌ SOME EXPLOITS NOT BLOCKED'}")

    os.makedirs("results", exist_ok=True)
    with open("results/exploit_test.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Saved to results/exploit_test.json")

    sys.exit(0 if all_blocked else 1)


if __name__ == "__main__":
    main()