Spaces:

mpnikhil
/

skill-invocation-env

Running

File size: 31,761 Bytes

ac627d5

#!/usr/bin/env python3
"""
Local test script for the Skill Invocation Environment.

Tests the environment directly (no server) to verify:
- reset() works and returns proper observation
- list/load/unload/submit actions work correctly
- context budget enforcement
- precision/recall/bloat reward computation
- verifier tests for static and procedural tasks
"""

import sys
import os

# Add this script's own directory to sys.path so the sibling modules import
sys.path.insert(0, os.path.dirname(__file__))

from models import SkillInvocationAction, SkillInvocationObservation, SkillInvocationState
from task_bank import TASK_BANK, SKILL_BANK
from server.skill_invocation_env_environment import SkillInvocationEnvironment
from task_generator import TaskGenerator


# ---------------------------------------------------------------------------
# Core environment tests
# ---------------------------------------------------------------------------


def test_reset():
    """Check that reset() yields a well-formed initial observation."""
    environment = SkillInvocationEnvironment()
    first_obs = environment.reset(seed=42)

    assert isinstance(first_obs, SkillInvocationObservation)
    assert first_obs.task_description != ""
    # Catalog holds relevant skills plus distractors (5-8 entries expected).
    assert len(first_obs.skill_catalog) >= 5
    # The episode has only just started: not done and no reward yet.
    assert first_obs.done is False
    assert first_obs.reward == 0.0
    # Nothing has been loaded, so there is no content and the budget is unused.
    assert first_obs.skill_content is None
    assert first_obs.loaded_skills == []
    assert first_obs.context_budget_used == 0
    assert first_obs.context_budget_total == 5
    assert len(first_obs.messages) > 0

    print("[PASS] test_reset")


def test_load_skill():
    """Check that a 'load' action places the chosen skill into context."""
    environment = SkillInvocationEnvironment()
    initial = environment.reset(seed=42)

    # Pick the first catalog entry as the skill to load.
    target_id = initial.skill_catalog[0]["id"]
    load_action = SkillInvocationAction(action_type="load", skill_id=target_id)
    after_load = environment.step(load_action)

    # The step must hand back non-empty skill content ...
    assert after_load.skill_content is not None
    assert len(after_load.skill_content) > 0
    # ... and record the skill as loaded, consuming one budget slot.
    assert target_id in after_load.loaded_skills
    assert after_load.context_budget_used == 1
    assert target_id in after_load.loaded_skill_contents
    assert after_load.done is False

    print("[PASS] test_load_skill")


def test_invoke_backward_compat():
    """Check that the legacy 'invoke' action type behaves like 'load'."""
    environment = SkillInvocationEnvironment()
    initial = environment.reset(seed=42)

    target_id = initial.skill_catalog[0]["id"]
    legacy_action = SkillInvocationAction(action_type="invoke", skill_id=target_id)
    after_invoke = environment.step(legacy_action)

    # Same observable effects as a 'load': content returned, skill tracked,
    # one budget slot consumed.
    assert after_invoke.skill_content is not None
    assert target_id in after_invoke.loaded_skills
    assert after_invoke.context_budget_used == 1

    print("[PASS] test_invoke_backward_compat")


def test_unload_skill():
    """Check that an 'unload' action removes a loaded skill from context."""
    environment = SkillInvocationEnvironment()
    initial = environment.reset(seed=42)

    target_id = initial.skill_catalog[0]["id"]

    # Put the skill into context, then take it out again.
    environment.step(SkillInvocationAction(action_type="load", skill_id=target_id))
    after_unload = environment.step(
        SkillInvocationAction(action_type="unload", skill_id=target_id)
    )

    assert target_id not in after_unload.loaded_skills
    assert after_unload.context_budget_used == 0
    assert after_unload.skill_content is None
    # The skill is still expected to appear in skills_invoked after unloading
    # (presumably a history of every skill ever loaded).
    assert target_id in after_unload.skills_invoked

    print("[PASS] test_unload_skill")


def test_load_already_loaded():
    """Check that re-loading an already loaded skill does not double count."""
    environment = SkillInvocationEnvironment()
    initial = environment.reset(seed=42)

    target_id = initial.skill_catalog[0]["id"]
    load_action = SkillInvocationAction(action_type="load", skill_id=target_id)
    environment.step(load_action)
    # Second, redundant load of the very same skill.
    second_obs = environment.step(
        SkillInvocationAction(action_type="load", skill_id=target_id)
    )

    # Budget usage stays at one slot and the skill is listed exactly once.
    assert second_obs.context_budget_used == 1
    assert second_obs.loaded_skills.count(target_id) == 1
    # The content is still returned on the repeated load.
    assert second_obs.skill_content is not None

    print("[PASS] test_load_already_loaded")


def test_unload_not_loaded():
    """Check that unloading a skill that was never loaded leaves the budget at 0."""
    environment = SkillInvocationEnvironment()
    environment.reset(seed=42)

    # Nothing has been loaded, so this unload has nothing to remove.
    unload_action = SkillInvocationAction(action_type="unload", skill_id="skill_001")
    result = environment.step(unload_action)
    assert result.context_budget_used == 0

    print("[PASS] test_unload_not_loaded")


def test_context_budget():
    """Check that loads beyond the context budget are refused until space is freed.

    Runs with a budget of 3: fills the budget, verifies that a fourth load is
    rejected, then unloads one skill and verifies the fourth skill can be
    loaded while usage stays at the budget limit.
    """
    env = SkillInvocationEnvironment(context_budget=3)
    obs = env.reset(seed=42)

    catalog_ids = [s["id"] for s in obs.skill_catalog]
    # The scenario needs a fourth skill for the over-budget attempt; fail with
    # a clear message rather than an IndexError further down.
    assert len(catalog_ids) >= 4, (
        f"Need at least 4 catalog skills to exercise the budget, got {len(catalog_ids)}"
    )
    overflow_id = catalog_ids[3]

    # Load 3 skills (budget full)
    for skill_id in catalog_ids[:3]:
        env.step(SkillInvocationAction(action_type="load", skill_id=skill_id))

    obs = env.step(SkillInvocationAction(action_type="load", skill_id=overflow_id))
    # Should fail — budget is full
    assert obs.context_budget_used == 3
    assert overflow_id not in obs.loaded_skills

    # Unload one, then load should work
    env.step(SkillInvocationAction(action_type="unload", skill_id=catalog_ids[0]))
    obs2 = env.step(SkillInvocationAction(action_type="load", skill_id=overflow_id))
    assert overflow_id in obs2.loaded_skills
    assert obs2.context_budget_used == 3

    print("[PASS] test_context_budget")


def test_load_unknown_skill():
    """Check that loading an id absent from the catalog loads nothing."""
    environment = SkillInvocationEnvironment()
    environment.reset(seed=42)

    # "skill_999" is used here as an id that should not be in the catalog.
    result = environment.step(
        SkillInvocationAction(action_type="load", skill_id="skill_999")
    )

    # No content is returned and no budget slot is consumed.
    assert result.skill_content is None
    assert result.context_budget_used == 0

    print("[PASS] test_load_unknown_skill")


def test_submit_incorrect():
    """Check that a wrong answer ends the episode with a non-positive reward."""
    environment = SkillInvocationEnvironment()
    environment.reset(seed=42)

    wrong_submission = SkillInvocationAction(action_type="submit", answer="I don't know")
    result = environment.step(wrong_submission)

    # Submitting always terminates the episode.
    assert result.done is True
    assert result.reward <= 0.0
    # The verification verdict must be present and flag the answer as wrong.
    assert result.verification_result is not None
    assert "INCORRECT" in result.verification_result

    print("[PASS] test_submit_incorrect")


def test_submit_after_done():
    """Check that stepping after a submit still reports a finished episode."""
    environment = SkillInvocationEnvironment()
    environment.reset(seed=42)

    # End the episode with a submission.
    environment.step(SkillInvocationAction(action_type="submit", answer="test"))

    # Any further action should come back with done still set.
    late_action = SkillInvocationAction(action_type="load", skill_id="skill_001")
    late_obs = environment.step(late_action)
    assert late_obs.done is True

    print("[PASS] test_submit_after_done")


def test_precision_reward():
    """Load only the relevant skill, submit a correct answer → max reward 1.0.

    Scans seeds 0-99 for an episode whose task is task_001; if none is found
    the test prints a [SKIP] line and returns without asserting anything.
    """
    env = SkillInvocationEnvironment()

    for seed in range(100):
        env.reset(seed=seed)
        if env.state.task_id == "task_001":
            break
    else:
        print("[SKIP] test_precision_reward - couldn't find task_001")
        return

    # Load only relevant skill
    env.step(SkillInvocationAction(action_type="load", skill_id="skill_001"))

    correct_answer = """
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
"""
    obs = env.step(SkillInvocationAction(action_type="submit", answer=correct_answer))

    assert obs.done is True
    # "CORRECT" is a substring of "INCORRECT", so the positive check alone
    # cannot distinguish the two verdicts; rule out the negative one too.
    assert "CORRECT" in obs.verification_result
    assert "INCORRECT" not in obs.verification_result, (
        f"Expected a correct verdict, got {obs.verification_result!r}"
    )
    # 0.6 correctness + 0.3 precision (1/1) + 0.1 recall (1/1) = 1.0
    assert abs(obs.reward - 1.0) < 0.01, f"Expected ~1.0, got {obs.reward}"

    print(f"[PASS] test_precision_reward (reward={obs.reward})")


def test_bloat_penalty():
    """Load all catalog skills, submit a correct answer → reduced reward.

    Scans seeds 0-99 for an episode whose task is task_001; if none is found
    the test prints a [SKIP] line and returns without asserting anything.
    """
    env = SkillInvocationEnvironment()

    for seed in range(100):
        obs = env.reset(seed=seed)
        if env.state.task_id == "task_001":
            break
    else:
        print("[SKIP] test_bloat_penalty - couldn't find task_001")
        return

    # Attempt to load every catalog skill (relevant + distractors)
    for skill in obs.skill_catalog:
        env.step(SkillInvocationAction(action_type="load", skill_id=skill["id"]))

    correct_answer = """
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
"""
    obs = env.step(SkillInvocationAction(action_type="submit", answer=correct_answer))

    assert obs.done is True
    # "CORRECT" is a substring of "INCORRECT", and the loose reward bound below
    # would also hold for a wrong answer, so explicitly rule out the negative
    # verdict to make sure the low reward comes from bloat.
    assert "CORRECT" in obs.verification_result
    assert "INCORRECT" not in obs.verification_result, (
        f"Expected a correct verdict, got {obs.verification_result!r}"
    )
    # With 6 total skills loaded (1 relevant + 5 distractors):
    # 0.6 + 0.3*(1/6) + 0.1*(1/1) - 0.15*5 = 0.6 + 0.05 + 0.1 - 0.75 = 0.0
    # NOTE(review): the default context budget appears to be 5, so fewer than
    # 6 loads may actually succeed — confirm the arithmetic against the env.
    # Either way the reward should be much less than 1.0.
    assert obs.reward < 0.5, f"Bloat should reduce reward, got {obs.reward}"

    print(f"[PASS] test_bloat_penalty (reward={obs.reward})")


def test_load_unload_no_bloat():
    """Load a distractor, unload it before submit → no bloat penalty.

    Scans seeds 0-99 for an episode whose task is task_001; if none is found
    the test prints a [SKIP] line and returns without asserting anything.
    """
    env = SkillInvocationEnvironment()

    for seed in range(100):
        obs = env.reset(seed=seed)
        if env.state.task_id == "task_001":
            break
    else:
        print("[SKIP] test_load_unload_no_bloat - couldn't find task_001")
        return

    # Pick the first catalog skill that is not the relevant one.
    distractor_id = next(
        (skill["id"] for skill in obs.skill_catalog if skill["id"] != "skill_001"),
        None,
    )
    assert distractor_id is not None

    env.step(SkillInvocationAction(action_type="load", skill_id=distractor_id))
    # Unload it
    env.step(SkillInvocationAction(action_type="unload", skill_id=distractor_id))
    # Load relevant
    env.step(SkillInvocationAction(action_type="load", skill_id="skill_001"))

    correct_answer = """
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
"""
    obs = env.step(SkillInvocationAction(action_type="submit", answer=correct_answer))

    assert obs.done is True
    # "CORRECT" is a substring of "INCORRECT", so the positive check alone
    # cannot distinguish the two verdicts; rule out the negative one too.
    assert "CORRECT" in obs.verification_result
    assert "INCORRECT" not in obs.verification_result, (
        f"Expected a correct verdict, got {obs.verification_result!r}"
    )
    # Only skill_001 loaded at submit → no bloat
    # 0.6 + 0.3 + 0.1 = 1.0
    assert abs(obs.reward - 1.0) < 0.01, f"Expected ~1.0 after unload, got {obs.reward}"

    print(f"[PASS] test_load_unload_no_bloat (reward={obs.reward})")


def test_state_property():
    """Check the metadata exposed by env.state before and after one step."""
    environment = SkillInvocationEnvironment()
    initial = environment.reset(seed=42)

    # Snapshot right after reset: fresh episode, nothing loaded.
    fresh_state = environment.state
    assert isinstance(fresh_state, SkillInvocationState)
    assert fresh_state.episode_id is not None
    assert fresh_state.step_count == 0
    assert fresh_state.task_id != ""
    assert fresh_state.done is False
    assert fresh_state.loaded_skills == []
    assert fresh_state.context_budget_total == 5

    # Take a single 'load' step and re-read the state.
    target_id = initial.skill_catalog[0]["id"]
    environment.step(SkillInvocationAction(action_type="load", skill_id=target_id))

    stepped_state = environment.state
    assert stepped_state.step_count == 1
    assert target_id in stepped_state.loaded_skills

    print("[PASS] test_state_property")


def test_all_tasks_have_valid_skills():
    """Check that every task in TASK_BANK references a consistent skill set."""
    for entry in TASK_BANK:
        # Every referenced skill id must exist in SKILL_BANK.
        relevant = entry["relevant_skills"]
        for skill_id in relevant:
            assert skill_id in SKILL_BANK, f"Task {entry['id']}: missing relevant skill {skill_id}"
        distractors = entry["distractor_skills"]
        for skill_id in distractors:
            assert skill_id in SKILL_BANK, f"Task {entry['id']}: missing distractor skill {skill_id}"
        # A skill may not be both relevant and a distractor for the same task.
        shared = set(relevant) & set(distractors)
        assert len(shared) == 0, f"Task {entry['id']}: overlap: {shared}"
        # The combined catalog for a task must offer at least 5 skills.
        catalog_size = len(relevant) + len(distractors)
        assert catalog_size >= 5, f"Task {entry['id']}: only {catalog_size} skills in catalog"

    print(f"[PASS] test_all_tasks_have_valid_skills ({len(TASK_BANK)} tasks verified)")


# ---------------------------------------------------------------------------
# Verifier tests (unchanged — these test verifier correctness, not env logic)
# ---------------------------------------------------------------------------


def test_verifier_task001_correct_code_passes():
    """Check that the task_001 verifier accepts the reference implementation."""
    task_001 = next(entry for entry in TASK_BANK if entry["id"] == "task_001")
    # HMAC-SHA256 based header builder used as the known-good answer.
    reference_impl = '''
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
'''
    verify = task_001["verifier"]
    assert verify(reference_impl), "Reference implementation should pass"
    print("[PASS] test_verifier_task001_correct_code_passes")


def test_verifier_task001_keywords_only_fails():
    """Check that the task_001 verifier rejects a bare list of keywords."""
    task_001 = next(entry for entry in TASK_BANK if entry["id"] == "task_001")
    # Relevant-sounding terms with no actual code behind them.
    keyword_soup = "hmac sha256 x-zephyr-auth base64 zph encode_zephyr_auth"
    verify = task_001["verifier"]
    assert not verify(keyword_soup), "Keyword-stuffed garbage should fail"
    print("[PASS] test_verifier_task001_keywords_only_fails")


def test_verifier_task001_wrong_format_fails():
    """Check that the task_001 verifier rejects code using the wrong hash (MD5)."""
    task_001 = next(entry for entry in TASK_BANK if entry["id"] == "task_001")
    # Same shape as the passing answer, but the digest is computed with
    # hashlib.md5 instead of hashlib.sha256.
    md5_variant = '''
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.md5).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
'''
    verify = task_001["verifier"]
    assert not verify(md5_variant), "Wrong hash algorithm should fail"
    print("[PASS] test_verifier_task001_wrong_format_fails")


def test_verifier_task001_markdown_fenced():
    """Check that the task_001 verifier accepts correct code inside a markdown fence."""
    task_001 = next(entry for entry in TASK_BANK if entry["id"] == "task_001")
    # Correct answer wrapped in a ```python ... ``` block.
    fenced_answer = '''```python
import hmac, hashlib, base64

def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
```'''
    verify = task_001["verifier"]
    assert verify(fenced_answer), "Markdown-fenced correct code should pass"
    print("[PASS] test_verifier_task001_markdown_fenced")


def test_verifier_task002_correct_passes():
    """Check that the task_002 verifier accepts a correct NovaBin header parser."""
    task_002 = next(entry for entry in TASK_BANK if entry["id"] == "task_002")
    # Known-good parser: magic check followed by big-endian struct unpacking.
    reference_impl = '''
import struct

def parse_novabin_header(data: bytes) -> dict:
    magic = data[0:4]
    assert magic == b'NOVB', f"Invalid magic: {magic}"
    version = struct.unpack('>H', data[4:6])[0]
    record_count = struct.unpack('>I', data[6:10])[0]
    flags = struct.unpack('>H', data[10:12])[0]
    checksum = struct.unpack('>I', data[12:16])[0]
    return {
        "version": version, "record_count": record_count,
        "compressed": bool(flags & 1), "encrypted": bool(flags & 2),
        "checksummed": bool(flags & 4), "checksum": checksum
    }
'''
    verify = task_002["verifier"]
    assert verify(reference_impl), "Correct NovaBin parser should pass"
    print("[PASS] test_verifier_task002_correct_passes")


def test_verifier_task002_keywords_only_fails():
    """Check that the task_002 verifier rejects a bare list of keywords."""
    task_002 = next(entry for entry in TASK_BANK if entry["id"] == "task_002")
    # Relevant-sounding terms with no actual code behind them.
    keyword_soup = "struct NOVB 0x4E4F5642 big-endian parse_novabin_header version record_count"
    verify = task_002["verifier"]
    assert not verify(keyword_soup), "Keyword-stuffed answer should fail"
    print("[PASS] test_verifier_task002_keywords_only_fails")


def test_verifier_task003_structural():
    """Check that the task_003 HelixLang verifier needs structure, not just keywords."""
    task_003 = next(entry for entry in TASK_BANK if entry["id"] == "task_003")
    verify = task_003["verifier"]

    # Fully structured HelixLang pseudocode: should be accepted.
    structured_answer = '''
fn fetch_user(db: Database, user_id: str) -> result<User> {
    let conn = try! db.connect().with_context("step", "connecting to database")

    let user = match try! conn.query_user(user_id).with_context("step", "fetching user") {
        Ok(u) => u,
        Err(e) => {
            if e.retryable {
                return retry_with_backoff(|| conn.query_user(user_id), max=3, backoff=100ms)
            }
            helix.log.error(e)
            return Err(HelixError.wrap(e, "HLX-DATA-2001", "user fetch failed"))
        }
    }

    Ok(user)
}
'''
    assert verify(structured_answer), "Proper HelixLang pseudocode should pass"

    # The same vocabulary without any program structure: should be rejected.
    keyword_soup = "HLX-DATA try! with_context retry backoff helix.log.error result Ok Err"
    assert not verify(keyword_soup), "Keywords without structure should fail"

    print("[PASS] test_verifier_task003_structural")


def test_verifier_task004_yaml_structure():
    """Check that the task_004 ArcDeploy verifier validates YAML structure."""
    task_004 = next(entry for entry in TASK_BANK if entry["id"] == "task_004")
    verify = task_004["verifier"]

    # A complete canary rollout config inside a ```yaml fence: should pass.
    fenced_config = '''```yaml
canary:
  phases:
    - name: shadow
      traffic_pct: 0
      duration_min: 5
      metrics_gate: error_rate < 0.01
    - name: canary_1
      traffic_pct: 5
      duration_min: 10
      metrics_gate: p99_latency_ms < 200 AND error_rate < 0.005
    - name: canary_2
      traffic_pct: 25
      duration_min: 15
      metrics_gate: p99_latency_ms < 250 AND error_rate < 0.005
    - name: canary_3
      traffic_pct: 50
      duration_min: 20
      metrics_gate: p99_latency_ms < 300 AND error_rate < 0.01
    - name: full
      traffic_pct: 100
      duration_min: 0
  rollback:
    auto: true
    on_metric_breach: immediate
    cooldown_min: 30
```'''
    assert verify(fenced_config), "Valid ArcDeploy YAML should pass"

    # Field names and values strung together without YAML structure: should fail.
    keyword_soup = "shadow canary_1 traffic_pct metrics_gate error_rate rollback auto: true"
    assert not verify(keyword_soup), "Keywords-only should fail YAML verifier"

    print("[PASS] test_verifier_task004_yaml_structure")


def test_verifier_task008_record_parser():
    """Check the task_008 NovaBin record-parser verifier on good and bad answers."""
    task_008 = next(entry for entry in TASK_BANK if entry["id"] == "task_008")
    verify = task_008["verifier"]

    # Known-good parser: reads a field count, then tag/name/value triples.
    reference_impl = '''
import struct

def parse_novabin_record(data: bytes, offset: int) -> tuple:
    fields = {}
    field_count = struct.unpack('>H', data[offset:offset+2])[0]
    offset += 2

    for _ in range(field_count):
        type_tag = data[offset]
        offset += 1

        name_len = struct.unpack('>H', data[offset:offset+2])[0]
        offset += 2
        field_name = data[offset:offset+name_len].decode('utf-8')
        offset += name_len

        val_len = struct.unpack('>I', data[offset:offset+4])[0]
        offset += 4
        val_data = data[offset:offset+val_len]
        offset += val_len

        if type_tag == 0x01:  # int32
            fields[field_name] = struct.unpack('>i', val_data)[0]
        elif type_tag == 0x02:  # float64
            fields[field_name] = struct.unpack('>d', val_data)[0]
        elif type_tag == 0x03:  # string
            fields[field_name] = val_data.decode('utf-8')
        elif type_tag == 0x04:  # bool
            fields[field_name] = val_data[0] != 0

    return (fields, offset)
'''
    assert verify(reference_impl), "Correct record parser should pass"

    # Relevant-sounding terms with no executable code: should be rejected.
    keyword_soup = "struct 0x01 0x02 0x03 0x04 uint16 utf-8 parse_novabin_record"
    assert not verify(keyword_soup), "Keywords should fail exec verifier"

    print("[PASS] test_verifier_task008_record_parser")


# ---------------------------------------------------------------------------
# SkillsBench-adapted task tests
# ---------------------------------------------------------------------------

def test_sb_001_flood_detection_correct():
    """Check the task_sb_001 flood-detection verifier on good and bad answers."""
    task_sb_001 = next(entry for entry in TASK_BANK if entry["id"] == "task_sb_001")
    verify = task_sb_001["verifier"]

    # Known-good answer: counts, per station, the days at or above threshold.
    reference_impl = '''
def detect_flood_days(daily_max_levels, flood_thresholds):
    result = {}
    for station_id, levels in daily_max_levels.items():
        if station_id not in flood_thresholds:
            continue
        threshold = flood_thresholds[station_id]
        flood_days = sum(1 for level in levels if level >= threshold)
        if flood_days > 0:
            result[station_id] = flood_days
    return result
'''
    assert verify(reference_impl), "Correct flood detection should pass"

    # Identifier names alone, with no code: should be rejected.
    keyword_soup = "detect_flood_days daily_max_levels flood_thresholds threshold"
    assert not verify(keyword_soup), "Keywords should fail"

    print("[PASS] test_sb_001_flood_detection_correct")


def test_sb_002_hp_filter_correct():
    """Verify task_sb_002 HP filter correlation passes with correct implementation.

    The reference answer needs numpy and statsmodels; when either is missing
    the test prints a [SKIP] line and returns without asserting anything.
    """
    try:
        # Imported only to probe availability; the names are not used here.
        import numpy  # noqa: F401
        from statsmodels.tsa.filters.hp_filter import hpfilter  # noqa: F401
    except ImportError:
        # Name the packages that are actually probed above (numpy, not scipy).
        print("[SKIP] test_sb_002_hp_filter_correct - numpy/statsmodels not installed")
        return

    task = next(t for t in TASK_BANK if t["id"] == "task_sb_002")
    # Known-good answer: HP-filter both log series and correlate the first
    # component returned by hpfilter for each.
    correct_code = '''
import numpy as np
from statsmodels.tsa.filters.hp_filter import hpfilter

def hp_filter_correlation(series_a, series_b):
    log_a = np.log(series_a)
    log_b = np.log(series_b)
    cycle_a, _ = hpfilter(log_a, lamb=100)
    cycle_b, _ = hpfilter(log_b, lamb=100)
    corr = np.corrcoef(cycle_a, cycle_b)[0, 1]
    return round(float(corr), 5)
'''
    assert task["verifier"](correct_code), "Correct HP filter implementation should pass"

    # Identifier names alone, with no code: should be rejected.
    garbage = "hp_filter_correlation numpy hpfilter corrcoef lamb=100"
    assert not task["verifier"](garbage), "Keywords should fail"

    print("[PASS] test_sb_002_hp_filter_correct")


def test_sb_003_dialogue_parser_correct():
    """Check the task_sb_003 dialogue-parser verifier on good and bad answers."""
    task_sb_003 = next(entry for entry in TASK_BANK if entry["id"] == "task_sb_003")
    verify = task_sb_003["verifier"]

    # Known-good answer, assembled from adjacent string literals so the regex
    # backslashes can be escaped line by line.
    reference_impl = (
        'import re\n'
        '\n'
        'def parse_dialogue(script):\n'
        '    nodes = []\n'
        '    edges = []\n'
        '    lines = script.strip().split("\\n")\n'
        '    current_node_id = None\n'
        '    current_lines = []\n'
        '    def flush_node():\n'
        '        nonlocal current_node_id, current_lines\n'
        '        if current_node_id is None:\n'
        '            return\n'
        '        content_lines = [l.strip() for l in current_lines if l.strip()]\n'
        '        is_choice = any(re.match(r"^\\d+\\.", l) for l in content_lines)\n'
        '        if is_choice:\n'
        '            nodes.append({"id": current_node_id, "text": "", "speaker": "", "type": "choice"})\n'
        '            for l in content_lines:\n'
        '                m = re.match(r"^(\\d+\\.\\s*.+?)\\s*->\\s*(\\w+)$", l)\n'
        '                if m:\n'
        '                    edges.append({"from": current_node_id, "to": m.group(2), "text": m.group(1).strip()})\n'
        '        else:\n'
        '            speaker = ""\n'
        '            text = ""\n'
        '            target = None\n'
        '            for l in content_lines:\n'
        '                m = re.match(r"^(\\w[\\w\\s]*):\\s*(.+?)\\s*->\\s*(\\w+)$", l)\n'
        '                if m:\n'
        '                    speaker = m.group(1)\n'
        '                    text = m.group(2).strip()\n'
        '                    target = m.group(3)\n'
        '                else:\n'
        '                    m2 = re.match(r"^(\\w[\\w\\s]*):\\s*(.+)$", l)\n'
        '                    if m2:\n'
        '                        speaker = m2.group(1)\n'
        '                        text = m2.group(2).strip()\n'
        '            nodes.append({"id": current_node_id, "text": text, "speaker": speaker, "type": "line"})\n'
        '            if target:\n'
        '                edges.append({"from": current_node_id, "to": target, "text": ""})\n'
        '        current_node_id = None\n'
        '        current_lines = []\n'
        '    for line in lines:\n'
        '        m = re.match(r"^\\[(\\w+)\\]$", line.strip())\n'
        '        if m:\n'
        '            flush_node()\n'
        '            current_node_id = m.group(1)\n'
        '            current_lines = []\n'
        '        else:\n'
        '            current_lines.append(line)\n'
        '    flush_node()\n'
        '    return {"nodes": nodes, "edges": edges}\n'
    )
    assert verify(reference_impl), "Correct dialogue parser should pass"

    # Identifier names alone, with no code: should be rejected.
    keyword_soup = "parse_dialogue nodes edges from to text speaker type"
    assert not verify(keyword_soup), "Keywords should fail"

    print("[PASS] test_sb_003_dialogue_parser_correct")


# ---------------------------------------------------------------------------
# Procedural task generator tests
# ---------------------------------------------------------------------------


def test_procedural_auth_100_seeds():
    """Check that the auth_protocol template yields well-formed tasks for seeds 0-99."""
    generator = TaskGenerator(seed=0)
    for seed in range(100):
        generated = generator.generate_with_seed(seed, template="auth_protocol")
        task, skills = generated["task"], generated["skills"]

        # Task metadata identifies it as a procedural auth-protocol task.
        assert task["id"].startswith("task_proc_auth_")
        assert task["source"] == "procedural"
        assert task["template"] == "auth_protocol"
        # Exactly one relevant skill, accompanied by at least four distractors.
        assert len(task["relevant_skills"]) == 1
        assert len(task["distractor_skills"]) >= 4

        # Every referenced skill id must come with generated skill data.
        referenced_ids = task["relevant_skills"] + task["distractor_skills"]
        for sid in referenced_ids:
            assert sid in skills, f"Skill {sid} not in generated skills for seed {seed}"

        # The relevant skill must carry more than 100 characters of content.
        relevant_skill = skills[task["relevant_skills"][0]]
        assert len(relevant_skill["full_content"]) > 100

    print("[PASS] test_procedural_auth_100_seeds")


def test_procedural_binary_100_seeds():
    """Check that the binary_format template yields well-formed tasks for seeds 0-99."""
    generator = TaskGenerator(seed=0)
    for seed in range(100):
        generated = generator.generate_with_seed(seed, template="binary_format")
        task, skills = generated["task"], generated["skills"]

        # Task metadata identifies it as a procedural binary-format task.
        assert task["id"].startswith("task_proc_bin_")
        assert task["source"] == "procedural"
        # Exactly one relevant skill, accompanied by at least four distractors.
        assert len(task["relevant_skills"]) == 1
        assert len(task["distractor_skills"]) >= 4

        # Every referenced skill id must come with generated skill data.
        referenced_ids = task["relevant_skills"] + task["distractor_skills"]
        for sid in referenced_ids:
            assert sid in skills

    print("[PASS] test_procedural_binary_100_seeds")


def test_procedural_deterministic():
    """Check that generating twice with the same seed gives matching tasks."""
    generator = TaskGenerator(seed=0)

    # Auth template: compare id, description and both skill lists.
    auth_first = generator.generate_with_seed(42, template="auth_protocol")
    auth_second = generator.generate_with_seed(42, template="auth_protocol")
    for field in ("id", "description", "relevant_skills", "distractor_skills"):
        assert auth_first["task"][field] == auth_second["task"][field]

    # Binary template: compare id and description.
    bin_first = generator.generate_with_seed(42, template="binary_format")
    bin_second = generator.generate_with_seed(42, template="binary_format")
    for field in ("id", "description"):
        assert bin_first["task"][field] == bin_second["task"][field]

    print("[PASS] test_procedural_deterministic")


def test_procedural_keyword_stuffing_fails():
    """Check that procedural verifiers reject keyword-only answers for seeds 0-9."""
    generator = TaskGenerator(seed=0)

    # (template, label used in the failure message, keyword-only answer)
    cases = (
        (
            "auth_protocol",
            "auth",
            "HMAC SHA256 base64 signing API key authentication header",
        ),
        (
            "binary_format",
            "binary",
            "struct unpack CRC32 magic bytes header version flags",
        ),
    )

    for seed in range(10):
        for template, label, keyword_soup in cases:
            task = generator.generate_with_seed(seed, template=template)["task"]
            assert not task["verifier"](keyword_soup), f"Keyword stuffing passed for {label} seed {seed}"

    print("[PASS] test_procedural_keyword_stuffing_fails")


def test_procedural_env_integration():
    """Check a reset/load/submit round trip with use_procedural=True."""
    environment = SkillInvocationEnvironment(use_procedural=True, procedural_seed=42)
    initial = environment.reset(seed=100)

    # The initial observation has the same shape as in the static-task case.
    assert isinstance(initial, SkillInvocationObservation)
    assert initial.task_description != ""
    assert len(initial.skill_catalog) >= 5
    assert initial.context_budget_total == 5
    assert initial.done is False

    # Loading the first catalog skill returns content and uses one slot.
    target_id = initial.skill_catalog[0]["id"]
    after_load = environment.step(
        SkillInvocationAction(action_type="load", skill_id=target_id)
    )
    assert after_load.skill_content is not None
    assert len(after_load.skill_content) > 50
    assert after_load.context_budget_used == 1

    # Submitting any answer ends the episode and produces a reward value.
    after_submit = environment.step(
        SkillInvocationAction(action_type="submit", answer="test")
    )
    assert after_submit.done is True
    assert after_submit.reward is not None

    print("[PASS] test_procedural_env_integration")


def test_procedural_uniqueness():
    """Check that seeds 0-49 yield at least 10 distinct auth task descriptions."""
    generator = TaskGenerator(seed=0)
    # Collect the distinct descriptions produced across the 50 seeds.
    distinct = {
        generator.generate_with_seed(seed, template="auth_protocol")["task"]["description"]
        for seed in range(50)
    }

    assert len(distinct) >= 10, f"Only {len(distinct)} unique tasks from 50 seeds"

    print("[PASS] test_procedural_uniqueness")


if __name__ == "__main__":
    import traceback

    # Script entry point: run every test in order, report each failure with
    # its traceback, and exit non-zero when at least one test failed.
    banner = "=" * 60
    print(banner)
    print("Skill Invocation Environment - Local Tests")
    print(banner)

    tests = [
        # Core environment tests
        test_reset,
        test_load_skill,
        test_invoke_backward_compat,
        test_unload_skill,
        test_load_already_loaded,
        test_unload_not_loaded,
        test_context_budget,
        test_load_unknown_skill,
        test_submit_incorrect,
        test_submit_after_done,
        test_precision_reward,
        test_bloat_penalty,
        test_load_unload_no_bloat,
        test_state_property,
        test_all_tasks_have_valid_skills,
        # Verifier tests
        test_verifier_task001_correct_code_passes,
        test_verifier_task001_keywords_only_fails,
        test_verifier_task001_wrong_format_fails,
        test_verifier_task001_markdown_fenced,
        test_verifier_task002_correct_passes,
        test_verifier_task002_keywords_only_fails,
        test_verifier_task003_structural,
        test_verifier_task004_yaml_structure,
        test_verifier_task008_record_parser,
        # SkillsBench tests
        test_sb_001_flood_detection_correct,
        test_sb_002_hp_filter_correct,
        test_sb_003_dialogue_parser_correct,
        # Procedural generator tests
        test_procedural_auth_100_seeds,
        test_procedural_binary_100_seeds,
        test_procedural_deterministic,
        test_procedural_keyword_stuffing_fails,
        test_procedural_env_integration,
        test_procedural_uniqueness,
    ]

    passed = 0
    failed = 0
    for test in tests:
        try:
            test()
        except Exception as e:
            # A failing assert (or any other error) is reported and counted,
            # then the run continues with the next test.
            print(f"[FAIL] {test.__name__}: {e}")
            traceback.print_exc()
            failed += 1
        else:
            # Tests that print [SKIP] and return normally are counted here too.
            passed += 1

    print(banner)
    print(f"Results: {passed} passed, {failed} failed")
    print(banner)

    sys.exit(1 if failed > 0 else 0)