#!/usr/bin/env python3
"""
Exploit Test Script — proves specific attack vectors are blocked.
Designed as judge-facing evidence of reward system integrity.
"""
import requests, json, sys, os
# Base URL of the environment server the exploits are replayed against.
# Defaults to the local server; set EXPLOIT_TEST_BASE_URL to point the suite
# at another deployment. A trailing slash is dropped so that f"{BASE}/reset"
# style joins never produce a double slash.
BASE = (os.environ.get("EXPLOIT_TEST_BASE_URL") or "http://localhost:7860").rstrip("/")
# Catalogue of exploit scenarios. Each entry names the task to reset, the
# sequence of actions to submit, the graded metric to read, and the exclusive
# upper bound (``expect_score_below``) under which the exploit counts as blocked.

# Single oversized inspection request; "Token Stuffing" submits it five times.
_STUFFED_INSPECTION = {
    "action_type": "inspect_flight_recorder",
    "parameters": {"rank_id": 1},
    "token_count": 100000,
}

# Code patch submitted straight away; "Cascade Phase Skip" submits it three times.
_PREMATURE_PATCH = {
    "action_type": "patch_divergent_code",
    "parameters": {
        "file": "model/transformer.py",
        "fix_type": "synchronize_conditional",
    },
}

# Forty small inspection requests cycling through rank ids 0..7.
_INSPECTION_LOOP = [
    {
        "action_type": "inspect_flight_recorder",
        "parameters": {"rank_id": step % 8},
        "token_count": 120,
    }
    for step in range(40)
]

EXPLOITS = [
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        # List repetition: five references to the same payload dict.
        "actions": [_STUFFED_INSPECTION] * 5,
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        # List repetition: three references to the same payload dict.
        "actions": [_PREMATURE_PATCH] * 3,
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        "actions": _INSPECTION_LOOP,
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]
def _error_result(name: str, error: str, detail: str = "") -> dict:
    """Build the result for a scenario that could not be run or graded.

    Such a run is always reported with ``blocked=False``: a scenario that was
    never graded is no evidence that the exploit was stopped.
    """
    result = {"name": name, "error": error, "blocked": False}
    if detail:
        result["detail"] = detail
    return result


def run_exploit(exploit: dict) -> dict:
    """Replay one exploit scenario against the server and judge the outcome.

    The scenario's task is reset with a fixed seed, every action is posted to
    ``/step`` in order, and the episode is then graded via ``/grade``. The
    exploit counts as blocked when the selected metric is strictly below
    ``exploit["expect_score_below"]``.

    Args:
        exploit: Scenario dict with the keys ``name``, ``description``,
            ``task``, ``actions``, ``expect_score_below`` and ``why_blocked``,
            plus an optional ``metric`` (``"score"`` by default, or
            ``"mer_score"`` to read ``breakdown.mer_score`` from the grade).

    Returns:
        On success, a dict with ``name``, ``description``, ``metric``,
        ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked``, ``verdict`` and ``rejected_steps`` (the number of
        ``/step`` calls answered with a non-200 status). When the server cannot
        be reached or its grade cannot be read, a dict with ``name``,
        ``error``, ``blocked=False`` and, when available, ``detail``.
    """
    name = exploit["name"]
    task = exploit["task"]

    try:
        reset = requests.post(
            f"{BASE}/reset",
            json={"task_id": task, "seed": 42},
            timeout=10,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts must not take down the whole suite.
        return _error_result(name, "reset failed", str(exc))
    if reset.status_code != 200:
        return _error_result(name, "reset failed", f"HTTP {reset.status_code}")

    rejected_steps = 0
    for action in exploit["actions"]:
        try:
            step = requests.post(f"{BASE}/step", json=action, timeout=10)
        except requests.RequestException as exc:
            return _error_result(name, "step failed", str(exc))
        # NOTE(review): a non-200 step is counted but not treated as fatal,
        # because the server may reject an action with an error status as its
        # way of blocking it -- confirm against the /step contract.
        if step.status_code != 200:
            rejected_steps += 1

    try:
        response = requests.post(
            f"{BASE}/grade",
            json={"task_id": task},
            timeout=10,
        )
        response.raise_for_status()
        grade = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        return _error_result(name, "grade failed", str(exc))
    if not isinstance(grade, dict):
        return _error_result(name, "grade failed", "grade response is not a JSON object")

    metric = exploit.get("metric", "score")
    try:
        # A missing score defaults to 1.0 so that an incomplete grade can
        # never be mistaken for a blocked exploit.
        score = float(grade.get("score", 1.0))
        if metric == "mer_score":
            breakdown = grade.get("breakdown") or {}
            metric_value = float(breakdown.get("mer_score", score))
        else:
            metric_value = score
    except (TypeError, ValueError, AttributeError) as exc:
        return _error_result(name, "grade failed", f"malformed grade: {exc}")

    threshold = exploit["expect_score_below"]
    blocked = metric_value < threshold
    return {
        "name": name,
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": threshold,
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
        "rejected_steps": rejected_steps,
    }
def main() -> None:
    """Run every scenario in EXPLOITS, report the outcomes, and exit.

    Prints a per-scenario report and an overall verdict, writes the collected
    results to ``results/exploit_test.json``, then exits with status 0 when
    every exploit was blocked and 1 otherwise.
    """
    rule = "=" * 50
    print("NervousSystem-Env Exploit Test Suite")
    print(rule)

    outcomes = []
    for scenario in EXPLOITS:
        print(f"\n[{scenario['name']}]")
        print(f" {scenario['description']}")

        outcome = run_exploit(scenario)
        outcomes.append(outcome)

        # Error results carry no metric value, so show a placeholder instead.
        measured = outcome.get("metric_value")
        shown = "N/A" if measured is None else f"{measured:.3f}"
        print(
            f" Score: {shown} "
            f"(metric={outcome.get('metric', 'score')}, must be < {scenario['expect_score_below']})"
        )
        print(f" {outcome.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f" Why: {scenario['why_blocked']}")

    # A result without a "blocked" flag counts as not blocked.
    every_blocked = all(outcome.get("blocked", False) for outcome in outcomes)
    summary = "✅ ALL EXPLOITS BLOCKED" if every_blocked else "❌ SOME EXPLOITS NOT BLOCKED"
    print(f"\n{rule}")
    print(f"RESULT: {summary}")

    os.makedirs("results", exist_ok=True)
    with open("results/exploit_test.json", "w") as report_file:
        json.dump(outcomes, report_file, indent=2)
    print("Saved to results/exploit_test.json")

    sys.exit(0 if every_blocked else 1)
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|