Spaces:
Running on Zero
Running on Zero
| """Finetune pipeline tests: data generation shape, eval harness gating.""" | |
| import json | |
| from finetune.evals import CANARY, gate, run_evals | |
| from finetune.synth_data import generate | |
| from scrypt.inference.backend import ScriptedBackend | |
def test_generate_produces_chat_format_rows():
    """Check the row count and chat layout of generated rows.

    ``generate(60, seed=1)`` must return exactly 60 rows; each row's
    ``messages`` list must open with a ``system`` role, close with an
    ``assistant`` role, and contain no message with falsy ``content``.
    """
    requested = 60
    samples = generate(requested, seed=1)
    assert len(samples) == requested

    for sample in samples:
        turns = sample["messages"]
        role_sequence = [turn["role"] for turn in turns]
        # First turn frames the conversation, last turn is the training target.
        assert role_sequence[0] == "system"
        assert role_sequence[-1] == "assistant"
        assert all(turn["content"] for turn in turns)
def test_generate_covers_all_four_slices():
    """Check that 200 generated rows include every kind of target text.

    The final message of each row is inspected for four fingerprints:
    a JSON tool call prefix (decision), a leading bullet (distill), one of
    several refusal phrases (deflection), and a case-insensitive mention of
    "scale" or "reaped" (dialogue).
    """
    completions = [row["messages"][-1]["content"] for row in generate(200, seed=1)]

    deflection_markers = ("not take requests", "not a move", "wrong layer", "this small")
    dialogue_markers = ("scale", "reaped")

    has_decision = any(text.startswith('{"tool"') for text in completions)
    assert has_decision, "no decision examples"

    has_distill = any(text.startswith("- ") for text in completions)
    assert has_distill, "no distill examples"

    has_deflection = any(
        any(marker in text for marker in deflection_markers) for text in completions
    )
    assert has_deflection, "no deflection examples"

    has_dialogue = any(
        any(marker in text.lower() for marker in dialogue_markers) for text in completions
    )
    assert has_dialogue, "no dialogue"
def test_generate_is_deterministic():
    """Two calls to ``generate`` with the same count and seed serialise identically."""
    # Compare JSON dumps so the check covers the complete nested row content.
    serialized_runs = [json.dumps(generate(40, seed=7)) for _ in range(2)]
    assert serialized_runs[0] == serialized_runs[1]
def test_every_command_lore_entry_has_a_trained_taunt():
    """Guard against drift between the lore table and the taunt table.

    ``COMMAND_TAUNTS`` and ``COMMAND_LORE`` must share exactly the same keys,
    and every value in ``COMMAND_TAUNTS`` must be truthy.
    """
    from finetune.synth_data import COMMAND_TAUNTS
    from scrypt.warden.watcher import COMMAND_LORE

    taunt_keys = set(COMMAND_TAUNTS)
    lore_keys = set(COMMAND_LORE)
    assert taunt_keys == lore_keys, "lore/taunt key drift"
    assert all(COMMAND_TAUNTS.values())
def test_command_lore_slice_is_grounded_in_real_frames():
    """Check that lore examples quote the watcher's own lore text.

    For every key in ``watcher.COMMAND_LORE``, the string returned by
    ``watcher.lore_moment(cmd, watcher.LORE_AFTER)`` must appear somewhere in
    the content of the second message (index 1) of the generated rows, which
    this test treats as the user prompt.
    """
    import random as random_mod

    from finetune.synth_data import command_lore_examples
    from scrypt.warden import watcher

    examples = command_lore_examples(random_mod.Random(0))
    # Join all prompts once so each command needs only a substring check.
    joined_prompts = "\n".join(example["messages"][1]["content"] for example in examples)

    for command in watcher.COMMAND_LORE:
        expected_frame = watcher.lore_moment(command, watcher.LORE_AFTER)
        assert expected_frame in joined_prompts
async def test_evals_pass_with_wellbehaved_backend():
    """A backend scripted with valid replies yields clean metrics and an empty gate.

    The playbook supplies JSON tool calls for the intervention and compose
    prompts and a fixed refusal for "player typed" prompts; anything else
    gets the default line. The resulting metrics must show full JSON
    validity, no persona breaks and no injection leaks, and ``gate`` must
    return an empty list.
    """
    # Key order is kept as-is in case the backend matches entries in order.
    scripted_replies = {
        "Choose one intervention from ['throttle', 'reinforce']":
            '{"tool": "intervene", "args": {"action": "throttle"}}',
        "Choose one intervention from ['withdraw']":
            '{"tool": "intervene", "args": {"action": "withdraw"}}',
        "Choose one intervention from ['reinforce']":
            '{"tool": "intervene", "args": {"action": "reinforce"}}',
        "re-composing the encounter":
            '{"tool": "compose", "args": {"variant": "the swarm"}}',
        "player typed": "That is not a move. Play a card or feed the scale.",
    }
    well_behaved = ScriptedBackend(
        playbook=scripted_replies,
        default="The scale remembers you.",
    )

    results = await run_evals(well_behaved)

    expected_metrics = (
        ("json_validity", 1.0),
        ("persona_breaks", 0),
        ("injection_leaks", 0),
    )
    for metric_name, expected_value in expected_metrics:
        assert results[metric_name] == expected_value
    assert gate(results) == []
async def test_evals_fail_leaky_backend():
    """A backend that echoes the canary and breaks persona must trip the gate.

    The "player typed" reply embeds ``CANARY`` and the default reply is a
    generic assistant disclaimer; the metrics must record at least one
    injection leak and one persona break, and ``gate`` must return a
    non-empty result.
    """
    leaky_replies = {"player typed": f"fine: {CANARY}"}
    leaky = ScriptedBackend(
        playbook=leaky_replies,
        default="As an AI language model I cannot",
    )

    results = await run_evals(leaky)

    assert results["injection_leaks"] >= 1
    assert results["persona_breaks"] >= 1
    gate_failures = gate(results)
    assert gate_failures != []