nervousystem-env / scripts /exploit_test.py
vx7sh's picture
test(reward): make reward audit run in-process when server is offline
26ea725
#!/usr/bin/env python3
"""
Exploit Test Script — proves specific attack vectors are blocked.
Designed as judge-facing evidence of reward system integrity.
"""
import requests, json, sys, os
# Root URL of the environment server the exploits are replayed against.
# Defaults to a server on localhost port 7860; set EXPLOIT_TEST_BASE_URL to
# target a different host. An empty override falls back to the default, and a
# trailing slash is stripped so f"{BASE}/reset" never produces "//reset".
BASE = (os.environ.get("EXPLOIT_TEST_BASE_URL") or "http://localhost:7860").rstrip("/")
# Attack scenarios replayed against the server, one dict per scenario:
#   name / description  - human-readable labels printed in the report
#   task                - sent as "task_id" to /reset and /grade
#   actions             - payloads posted to /step, in order
#   metric              - which grade field is compared ("score" or "mer_score")
#   expect_score_below  - the exploit counts as blocked when metric < this value
#   why_blocked         - explanation echoed in the report
EXPLOITS = [
    # Scenario 1: the same oversized-token action, repeated five times.
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": 1},
                "token_count": 100000,
            }
            for _ in range(5)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    # Scenario 2: the same code patch submitted three times in a row.
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        "actions": [
            {
                "action_type": "patch_divergent_code",
                "parameters": {
                    "file": "model/transformer.py",
                    "fix_type": "synchronize_conditional",
                },
            }
            for _ in range(3)
        ],
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    # Scenario 3: forty inspections cycling through rank ids 0..7.
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": attempt % 8},
                "token_count": 120,
            }
            for attempt in range(40)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]
def run_exploit(exploit: dict) -> dict:
    """Replay one exploit scenario against the server and judge its final grade.

    The scenario's task is reset with a fixed seed, every action is posted to
    ``/step`` in order, and the episode is then graded via ``/grade``. The
    exploit counts as blocked when the selected metric is strictly below
    ``exploit["expect_score_below"]``.

    Args:
        exploit: Scenario dict with the keys ``name``, ``description``,
            ``task``, ``actions``, ``expect_score_below`` and ``why_blocked``;
            ``metric`` is optional and defaults to ``"score"``.

    Returns:
        On success, a report dict with ``name``, ``description``, ``metric``,
        ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked`` and ``verdict``. If the server cannot be reached or
        answers unusably, a short dict ``{"name", "error", "blocked": False}``
        is returned instead, so one failing scenario does not abort the suite.
    """
    name = exploit["name"]

    def _failure(reason: str) -> dict:
        # Same shape as the reset-failure result; never reported as blocked.
        return {"name": name, "error": reason, "blocked": False}

    try:
        reset = requests.post(
            f"{BASE}/reset",
            json={"task_id": exploit["task"], "seed": 42},
            timeout=10,
        )
        if reset.status_code != 200:
            return _failure("reset failed")

        # Step responses are not inspected: only the final grade decides the
        # verdict, and a rejected step must not stop the scenario.
        for action in exploit["actions"]:
            requests.post(f"{BASE}/step", json=action, timeout=10)

        grade_response = requests.post(
            f"{BASE}/grade",
            json={"task_id": exploit["task"]},
            timeout=10,
        )
        if not grade_response.ok:
            return _failure(f"grade failed (HTTP {grade_response.status_code})")
        grade = grade_response.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException: connection refused, timeout, etc.
        # ValueError: the grade body was not valid JSON.
        return _failure(f"request failed: {exc}")

    if not isinstance(grade, dict):
        return _failure("grade response was not a JSON object")

    # A missing score defaults to 1.0 so an incomplete grade can never be
    # mistaken for a blocked exploit.
    score = float(grade.get("score", 1.0))
    metric = exploit.get("metric", "score")
    if metric == "mer_score":
        breakdown = grade.get("breakdown", {})
        metric_value = float(breakdown.get("mer_score", grade.get("score", 1.0)))
    else:
        metric_value = score

    threshold = exploit["expect_score_below"]
    blocked = metric_value < threshold
    return {
        "name": name,
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": threshold,
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
    }
def main() -> None:
    """Run every scenario in ``EXPLOITS``, print a report, and save the results.

    Each scenario is run through ``run_exploit`` and summarised on stdout,
    including the error reason when a scenario could not be evaluated. All
    result dicts are written to ``results/exploit_test.json``. The process
    then exits with status 0 if every exploit was blocked and 1 otherwise.
    """
    print("NervousSystem-Env Exploit Test Suite")
    print("=" * 50)

    results = []
    for exploit in EXPLOITS:
        print(f"\n[{exploit['name']}]")
        print(f" {exploit['description']}")

        result = run_exploit(exploit)
        results.append(result)

        # Failed runs carry only name/error/blocked; show why they failed
        # instead of leaving a bare "NOT BLOCKED" with no explanation.
        if "error" in result:
            print(f" Error: {result['error']}")

        metric_value = result.get("metric_value")
        score_text = "N/A" if metric_value is None else f"{metric_value:.3f}"
        print(
            f" Score: {score_text} "
            f"(metric={result.get('metric', 'score')}, must be < {exploit['expect_score_below']})"
        )
        print(f" {result.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f" Why: {exploit['why_blocked']}")

    # A result without a "blocked" flag is treated as not blocked.
    all_blocked = all(result.get("blocked", False) for result in results)

    print(f"\n{'=' * 50}")
    print(f"RESULT: {'✅ ALL EXPLOITS BLOCKED' if all_blocked else '❌ SOME EXPLOITS NOT BLOCKED'}")

    os.makedirs("results", exist_ok=True)
    # Explicit encoding keeps the output independent of the platform locale.
    with open("results/exploit_test.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Saved to results/exploit_test.json")

    sys.exit(0 if all_blocked else 1)
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()