Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Exploit Test Script — proves specific attack vectors are blocked. | |
| Designed as judge-facing evidence of reward system integrity. | |
| """ | |
| import requests, json, sys, os | |
# Root URL of the environment server that every request in this script targets.
BASE = "http://localhost:7860"

# Attack scenarios replayed by run_exploit(). Each entry carries:
#   name / description  - human-readable labels printed in the report
#   task                - task id sent to /reset and /grade
#   actions             - JSON payloads posted to /step, in order
#   metric              - which graded value is compared with the threshold
#   expect_score_below  - the exploit counts as blocked when metric < this
#   why_blocked         - explanation printed next to the verdict
EXPLOITS = [
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        # The same oversized action, submitted five times.
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": 1},
                "token_count": 100000,
            }
            for _ in range(5)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        # The same patch action, submitted three times.
        "actions": [
            {
                "action_type": "patch_divergent_code",
                "parameters": {
                    "file": "model/transformer.py",
                    "fix_type": "synchronize_conditional",
                },
            }
            for _ in range(3)
        ],
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        # Forty inspections cycling through rank ids 0..7.
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": step % 8},
                "token_count": 120,
            }
            for step in range(40)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]
def run_exploit(exploit: dict) -> dict:
    """Replay one exploit scenario against the server and judge the outcome.

    The task named by ``exploit["task"]`` is reset with seed 42, every entry
    of ``exploit["actions"]`` is posted to ``/step`` in order, and ``/grade``
    is then asked for the final result. The exploit counts as blocked when
    the selected metric is strictly below ``exploit["expect_score_below"]``.

    Args:
        exploit: Scenario dict with the keys ``name``, ``description``,
            ``task``, ``actions``, ``expect_score_below`` and ``why_blocked``.
            The optional ``metric`` key selects the compared value:
            ``"score"`` (default) reads ``grade["score"]``; ``"mer_score"``
            reads ``grade["breakdown"]["mer_score"]`` and falls back to
            ``grade["score"]``.

    Returns:
        On success, a dict with ``name``, ``description``, ``metric``,
        ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked`` and ``verdict``.
        On any reset, transport or grade failure, a dict with only ``name``,
        ``error`` and ``blocked`` (always ``False``), so one broken scenario
        cannot abort the rest of the suite.
    """

    def failure(reason: str) -> dict:
        # Same shape the reset failure has always used; callers rely on .get().
        return {"name": exploit["name"], "error": reason, "blocked": False}

    try:
        reset_response = requests.post(
            f"{BASE}/reset",
            json={"task_id": exploit["task"], "seed": 42},
            timeout=10,
        )
        if reset_response.status_code != 200:
            return failure("reset failed")

        for action in exploit["actions"]:
            # Step responses are not inspected (unchanged from the original
            # behaviour); only the final grade decides the verdict.
            requests.post(f"{BASE}/step", json=action, timeout=10)

        grade_response = requests.post(
            f"{BASE}/grade",
            json={"task_id": exploit["task"]},
            timeout=10,
        )
    except requests.RequestException as exc:
        # Connection refused, timeout, etc. -- report instead of crashing.
        return failure(f"request failed: {exc}")

    if grade_response.status_code != 200:
        # Without this check an error body would be parsed as a grade and the
        # score would silently default to 1.0.
        return failure(f"grade failed (HTTP {grade_response.status_code})")

    try:
        grade = grade_response.json()
    except ValueError as exc:
        # Every JSON decode error requests can raise is a ValueError subclass.
        return failure(f"grade response was not valid JSON: {exc}")
    if not isinstance(grade, dict):
        return failure("grade response was not a JSON object")

    # A missing score defaults to 1.0, so an incomplete grade is never
    # reported as blocked for thresholds <= 1.0.
    score = float(grade.get("score", 1.0))
    metric = exploit.get("metric", "score")
    if metric == "mer_score":
        metric_value = float(
            grade.get("breakdown", {}).get("mer_score", grade.get("score", 1.0))
        )
    else:
        metric_value = score

    blocked = metric_value < exploit["expect_score_below"]
    return {
        "name": exploit["name"],
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": exploit["expect_score_below"],
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
    }
def main() -> None:
    """Run every scenario in ``EXPLOITS``, print a report and save the results.

    For each exploit the name, description, measured metric, verdict and
    explanation are printed. If ``run_exploit`` returned an ``"error"`` entry,
    the error text is printed too, so a server or transport failure is not
    mistaken for an exploit that genuinely slipped through. All result dicts
    are written to ``results/exploit_test.json``.

    Exits the process with status 0 when every exploit was blocked and with
    status 1 otherwise (results carrying an error count as not blocked).
    """
    print("NervousSystem-Env Exploit Test Suite")
    print("=" * 50)

    results = []
    for exploit in EXPLOITS:
        print(f"\n[{exploit['name']}]")
        print(f" {exploit['description']}")

        result = run_exploit(exploit)
        results.append(result)

        if "error" in result:
            # Surface the failure reason instead of showing only "N/A".
            print(f" Error: {result['error']}")

        # Error results carry no metric_value, hence the N/A fallback.
        score_value = result.get("metric_value")
        score_text = "N/A" if score_value is None else f"{score_value:.3f}"
        print(
            f" Score: {score_text} "
            f"(metric={result.get('metric', 'score')}, must be < {exploit['expect_score_below']})"
        )
        print(f" {result.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f" Why: {exploit['why_blocked']}")

    all_blocked = all(result.get("blocked", False) for result in results)

    print(f"\n{'=' * 50}")
    print(f"RESULT: {'✅ ALL EXPLOITS BLOCKED' if all_blocked else '❌ SOME EXPLOITS NOT BLOCKED'}")

    os.makedirs("results", exist_ok=True)
    # json.dump escapes non-ASCII by default; the explicit encoding just keeps
    # the file independent of the platform's locale.
    with open("results/exploit_test.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Saved to results/exploit_test.json")

    sys.exit(0 if all_blocked else 1)


if __name__ == "__main__":
    main()