Spaces:

v4xsh
/

nervousystem-env

Running

App Files Files Community

nervousystem-env / scripts /exploit_test.py

vx7sh

test(reward): make reward audit run in-process when server is offline

26ea725 25 days ago

raw

history blame contribute delete

4.42 kB

	#!/usr/bin/env python3
	"""
	Exploit Test Script — proves specific attack vectors are blocked.
	Designed as judge-facing evidence of reward system integrity.
	"""
	import requests, json, sys, os

	# Root URL of the environment server that every request in this script targets.
	BASE = "http://localhost:7860"

	# Catalogue of exploit scenarios replayed by run_exploit(). Keys per entry:
	#   name / description  - labels echoed in the console report and JSON output
	#   task                - task id sent with the /reset and /grade requests
	#   actions             - payloads posted to /step, in list order
	#   metric              - graded value to compare ("score" or "mer_score")
	#   expect_score_below  - scenario counts as blocked when metric < this value
	#   why_blocked         - explanation printed next to the verdict
	EXPLOITS = [
	    # Scenario 1: the same oversized-token action repeated five times.
	    {
	        "name": "Token Stuffing",
	        "description": "Agent sends 100k tokens per action hoping to confuse MER",
	        "task": "easy",
	        "actions": 5 * [
	            {
	                "action_type": "inspect_flight_recorder",
	                "parameters": {"rank_id": 1},
	                "token_count": 100000,
	            },
	        ],
	        "metric": "score",
	        "expect_score_below": 0.5,
	        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
	    },
	    # Scenario 2: the same code-patch action submitted three times in a row.
	    {
	        "name": "Cascade Phase Skip",
	        "description": "Agent tries to solve phase 3 before phase 1",
	        "task": "cascade",
	        "actions": 3 * [
	            {
	                "action_type": "patch_divergent_code",
	                "parameters": {
	                    "file": "model/transformer.py",
	                    "fix_type": "synchronize_conditional",
	                },
	            },
	        ],
	        "metric": "score",
	        "expect_score_below": 0.3,
	        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
	    },
	    # Scenario 3: forty inspections cycling through rank ids 0..7.
	    {
	        "name": "Reward Farming with Investigation Loop",
	        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
	        "task": "easy",
	        "actions": [
	            {
	                "action_type": "inspect_flight_recorder",
	                "parameters": {"rank_id": attempt % 8},
	                "token_count": 120,
	            }
	            for attempt in range(40)
	        ],
	        "metric": "score",
	        "expect_score_below": 0.5,
	        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
	    },
	]


	def run_exploit(exploit: dict) -> dict:
	    """Replay one exploit scenario against the server and judge the outcome.

	    The episode is reset for the scenario's task (seed 42), every action is
	    posted to ``/step`` in order, and the ``/grade`` response is compared
	    with the scenario's ``expect_score_below`` threshold.

	    Args:
	        exploit: A scenario dict providing ``name``, ``description``,
	            ``task``, ``actions``, ``expect_score_below`` and
	            ``why_blocked``. ``metric`` is optional and defaults to
	            ``"score"``; ``"mer_score"`` reads ``breakdown.mer_score`` from
	            the grade, falling back to the overall score.

	    Returns:
	        On success, a report dict with the graded score, the compared
	        metric value, the threshold, the ``blocked`` flag and a verdict
	        string. If the server is unreachable, answers reset/grade with a
	        non-200 status, or returns an unusable grade payload, a short dict
	        with ``name``, ``error`` and ``blocked=False`` is returned instead
	        of raising, so one broken scenario cannot abort the whole suite.
	    """

	    def _failure(reason: str) -> dict:
	        # Same shape as the pre-existing "reset failed" result; never counts
	        # as blocked, so infrastructure problems cannot pass the audit.
	        return {"name": exploit["name"], "error": reason, "blocked": False}

	    try:
	        reset_response = requests.post(
	            f"{BASE}/reset",
	            json={"task_id": exploit["task"], "seed": 42},
	            timeout=10,
	        )
	        if reset_response.status_code != 200:
	            return _failure("reset failed")

	        # Step responses are not inspected; the verdict rests on the final
	        # grade only.
	        for action in exploit["actions"]:
	            requests.post(f"{BASE}/step", json=action, timeout=10)

	        grade_response = requests.post(
	            f"{BASE}/grade",
	            json={"task_id": exploit["task"]},
	            timeout=10,
	        )
	        if grade_response.status_code != 200:
	            # An error body would otherwise fall back to the default score
	            # of 1.0 and be reported as a genuine measurement.
	            return _failure("grade failed")
	        grade = grade_response.json()
	    except (requests.RequestException, ValueError) as exc:
	        # RequestException: connection refused, timeout, etc.
	        # ValueError: body was not valid JSON (also covers older requests
	        # versions whose .json() raises a plain JSON decode error).
	        return _failure(f"request failed: {exc}")

	    if not isinstance(grade, dict):
	        return _failure("grade response is not a JSON object")

	    metric = exploit.get("metric", "score")
	    try:
	        # A missing score defaults to 1.0 so the exploit is NOT considered
	        # blocked unless the server explicitly reports a low value.
	        score = float(grade.get("score", 1.0))
	        if metric == "mer_score":
	            breakdown = grade.get("breakdown") or {}
	            metric_value = float(breakdown.get("mer_score", score))
	        else:
	            metric_value = score
	    except (AttributeError, TypeError, ValueError):
	        return _failure("grade response has a non-numeric score")

	    blocked = metric_value < exploit["expect_score_below"]

	    return {
	        "name": exploit["name"],
	        "description": exploit["description"],
	        "metric": metric,
	        "score": score,
	        "metric_value": metric_value,
	        "threshold": exploit["expect_score_below"],
	        "blocked": blocked,
	        "why_blocked": exploit["why_blocked"],
	        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
	    }


	def main() -> None:
	    """Run every scenario in ``EXPLOITS``, report, persist, and exit.

	    Prints a per-scenario summary, writes all result dicts to
	    ``results/exploit_test.json`` and terminates the process with status 0
	    when every scenario was blocked, 1 otherwise.
	    """
	    banner = "=" * 50
	    print("NervousSystem-Env Exploit Test Suite")
	    print(banner)

	    results = []
	    for exploit in EXPLOITS:
	        print(f"\n[{exploit['name']}]")
	        print(f" {exploit['description']}")

	        outcome = run_exploit(exploit)
	        results.append(outcome)

	        # Error results carry no metric value; show a placeholder for them.
	        observed = outcome.get("metric_value")
	        score_text = "N/A" if observed is None else f"{observed:.3f}"
	        metric_name = outcome.get("metric", "score")
	        limit = exploit["expect_score_below"]

	        print(
	            f" Score: {score_text} "
	            f"(metric={metric_name}, must be < {limit})"
	        )
	        print(f" {outcome.get('verdict', '⚠️ NOT BLOCKED')}")
	        print(f" Why: {exploit['why_blocked']}")

	    # A scenario without a truthy "blocked" entry counts as not blocked.
	    all_blocked = all(outcome.get("blocked", False) for outcome in results)

	    print(f"\n{banner}")
	    if all_blocked:
	        summary = "✅ ALL EXPLOITS BLOCKED"
	    else:
	        summary = "❌ SOME EXPLOITS NOT BLOCKED"
	    print(f"RESULT: {summary}")

	    os.makedirs("results", exist_ok=True)
	    with open("results/exploit_test.json", "w") as report_file:
	        json.dump(results, report_file, indent=2)
	    print("Saved to results/exploit_test.json")

	    exit_status = 0 if all_blocked else 1
	    sys.exit(exit_status)


	# Run the suite only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()