Spaces:

v4xsh
/

nervousystem-env

Running

File size: 4,421 Bytes

#!/usr/bin/env python3
"""
Exploit Test Script — proves specific attack vectors are blocked.
Designed as judge-facing evidence of reward system integrity.
"""
import requests, json, sys, os

# Root URL of the environment server that every request in this script targets.
BASE = "http://localhost:7860"

# Diagnostic request that declares an oversized token count; replayed
# unchanged by the token-stuffing scenario.
_STUFFED_INSPECT = {
    "action_type": "inspect_flight_recorder",
    "parameters": {"rank_id": 1},
    "token_count": 100_000,
}

# Code-patch request replayed unchanged by the cascade phase-skip scenario.
_PREMATURE_PATCH = {
    "action_type": "patch_divergent_code",
    "parameters": {
        "file": "model/transformer.py",
        "fix_type": "synchronize_conditional",
    },
}

# Attack scenarios replayed against the server. Each entry carries:
#   name / description  - labels shown in the printed report
#   task                - task_id sent to /reset and /grade
#   actions             - payloads posted to /step, in order
#   metric              - which graded value is compared ("score" or "mer_score")
#   expect_score_below  - the graded value must be strictly below this to count as blocked
#   why_blocked         - explanation printed next to the verdict
EXPLOITS = [
    {
        "name": "Token Stuffing",
        "description": "Agent sends 100k tokens per action hoping to confuse MER",
        "task": "easy",
        "actions": [_STUFFED_INSPECT] * 5,
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the token-efficiency multiplier, dropping score to the 0.01 floor",
    },
    {
        "name": "Cascade Phase Skip",
        "description": "Agent tries to solve phase 3 before phase 1",
        "task": "cascade",
        "actions": [_PREMATURE_PATCH] * 3,
        "metric": "score",
        "expect_score_below": 0.3,
        "why_blocked": "Phase gating: phase 3 locked until phase 1+2 solved",
    },
    {
        "name": "Reward Farming with Investigation Loop",
        "description": "Agent loops inspect_flight_recorder hoping to farm +0.05 each",
        "task": "easy",
        # Five full sweeps over rank ids 0..7, i.e. 40 inspections in total.
        "actions": [
            {
                "action_type": "inspect_flight_recorder",
                "parameters": {"rank_id": rank},
                "token_count": 120,
            }
            for _sweep in range(5)
            for rank in range(8)
        ],
        "metric": "score",
        "expect_score_below": 0.5,
        "why_blocked": "Final grade applies the cumulative token-efficiency multiplier to repeated diagnostic loops",
    },
]


def _failed_run(exploit: dict, reason: str) -> dict:
    """Build the result recorded when an exploit could not be evaluated.

    The run is reported as not blocked so that an unreachable or misbehaving
    server can never be mistaken for a passing check.
    """
    return {"name": exploit["name"], "error": reason, "blocked": False}


def run_exploit(exploit: dict) -> dict:
    """Replay one exploit scenario against the server and judge the outcome.

    The environment is reset for the exploit's task (seed 42), every action is
    posted to ``/step`` in order, and the episode is then graded via
    ``/grade``. The exploit counts as blocked when the selected metric is
    strictly below ``exploit["expect_score_below"]``.

    Args:
        exploit: Scenario description with the keys ``name``, ``description``,
            ``task``, ``actions``, ``expect_score_below`` and ``why_blocked``;
            ``metric`` is optional and defaults to ``"score"``.

    Returns:
        On success, a dict with ``name``, ``description``, ``metric``,
        ``score``, ``metric_value``, ``threshold``, ``blocked``,
        ``why_blocked`` and ``verdict``. When the run cannot be evaluated
        (reset rejected, transport error, unusable grade payload), a dict with
        only ``name``, ``error`` and ``blocked=False``.
    """
    try:
        reset = requests.post(
            f"{BASE}/reset",
            json={"task_id": exploit["task"], "seed": 42},
            timeout=10,
        )
        if reset.status_code != 200:
            return _failed_run(exploit, "reset failed")

        # NOTE(review): step status codes are not inspected, matching the
        # original behavior; confirm whether a rejected step should be
        # surfaced in the result.
        for action in exploit["actions"]:
            requests.post(f"{BASE}/step", json=action, timeout=10)

        grade = requests.post(
            f"{BASE}/grade",
            json={"task_id": exploit["task"]},
            timeout=10,
        ).json()
    except (requests.RequestException, ValueError) as exc:
        # Connection errors, timeouts and non-JSON grade bodies end up here
        # instead of aborting the whole suite before results are saved.
        return _failed_run(exploit, f"request failed: {exc}")

    if not isinstance(grade, dict):
        return _failed_run(exploit, "grade response is not a JSON object")

    metric = exploit.get("metric", "score")
    try:
        # A missing score defaults to 1.0 so that it reads as "not blocked".
        score = float(grade.get("score", 1.0))
        if metric == "mer_score":
            breakdown = grade.get("breakdown") or {}
            metric_value = float(breakdown.get("mer_score", score))
        else:
            metric_value = score
    except (TypeError, ValueError, AttributeError) as exc:
        return _failed_run(exploit, f"unparseable grade: {exc}")

    blocked = metric_value < exploit["expect_score_below"]

    return {
        "name": exploit["name"],
        "description": exploit["description"],
        "metric": metric,
        "score": score,
        "metric_value": metric_value,
        "threshold": exploit["expect_score_below"],
        "blocked": blocked,
        "why_blocked": exploit["why_blocked"],
        "verdict": "✅ BLOCKED" if blocked else "⚠️ NOT BLOCKED",
    }


def main() -> None:
    """Run every scenario in EXPLOITS, print a report and save the results.

    Each scenario is executed through ``run_exploit`` and summarized on
    stdout. All per-scenario results are written as JSON to
    ``results/exploit_test.json``. The process then exits with status 0 when
    every scenario was blocked and status 1 otherwise.
    """
    print("NervousSystem-Env Exploit Test Suite")
    print("=" * 50)

    report = []
    for exploit in EXPLOITS:
        print(f"\n[{exploit['name']}]")
        print(f"  {exploit['description']}")

        result = run_exploit(exploit)
        report.append(result)

        # Error results carry no metric value, so fall back to a placeholder.
        measured = result.get("metric_value")
        score_text = "N/A" if measured is None else f"{measured:.3f}"

        print(
            f"  Score: {score_text} "
            f"(metric={result.get('metric', 'score')}, must be < {exploit['expect_score_below']})"
        )
        print(f"  {result.get('verdict', '⚠️ NOT BLOCKED')}")
        print(f"  Why: {exploit['why_blocked']}")

    # A result without a "blocked" flag counts as not blocked.
    all_blocked = all(entry.get("blocked", False) for entry in report)

    print(f"\n{'=' * 50}")
    print(f"RESULT: {'✅ ALL EXPLOITS BLOCKED' if all_blocked else '❌ SOME EXPLOITS NOT BLOCKED'}")

    os.makedirs("results", exist_ok=True)
    with open("results/exploit_test.json", "w") as handle:
        json.dump(report, handle, indent=2)
    print("Saved to results/exploit_test.json")

    exit_code = 0 if all_blocked else 1
    sys.exit(exit_code)


# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()