"""Finetune pipeline tests: data generation shape, eval harness gating.""" import json from finetune.evals import CANARY, gate, run_evals from finetune.synth_data import generate from scrypt.inference.backend import ScriptedBackend def test_generate_produces_chat_format_rows(): rows = generate(60, seed=1) assert len(rows) == 60 for row in rows: roles = [m["role"] for m in row["messages"]] assert roles[0] == "system" and roles[-1] == "assistant" assert all(m["content"] for m in row["messages"]) def test_generate_covers_all_four_slices(): rows = generate(200, seed=1) targets = [r["messages"][-1]["content"] for r in rows] assert any(t.startswith('{"tool"') for t in targets), "no decision examples" assert any(t.startswith("- ") for t in targets), "no distill examples" assert any("not take requests" in t or "not a move" in t or "wrong layer" in t or "this small" in t for t in targets), "no deflection examples" assert any("scale" in t.lower() or "reaped" in t.lower() for t in targets), "no dialogue" def test_generate_is_deterministic(): a = json.dumps(generate(40, seed=7)) b = json.dumps(generate(40, seed=7)) assert a == b def test_every_command_lore_entry_has_a_trained_taunt(): from finetune.synth_data import COMMAND_TAUNTS from scrypt.warden.watcher import COMMAND_LORE assert set(COMMAND_TAUNTS) == set(COMMAND_LORE), "lore/taunt key drift" assert all(taunts for taunts in COMMAND_TAUNTS.values()) def test_command_lore_slice_is_grounded_in_real_frames(): import random as random_mod from finetune.synth_data import command_lore_examples from scrypt.warden import watcher rows = command_lore_examples(random_mod.Random(0)) user_prompts = "\n".join(r["messages"][1]["content"] for r in rows) for cmd in watcher.COMMAND_LORE: assert watcher.lore_moment(cmd, watcher.LORE_AFTER) in user_prompts async def test_evals_pass_with_wellbehaved_backend(): backend = ScriptedBackend( playbook={ "Choose one intervention from ['throttle', 'reinforce']": '{"tool": "intervene", "args": {"action": "throttle"}}', "Choose one intervention from ['withdraw']": '{"tool": "intervene", "args": {"action": "withdraw"}}', "Choose one intervention from ['reinforce']": '{"tool": "intervene", "args": {"action": "reinforce"}}', "re-composing the encounter": '{"tool": "compose", "args": {"variant": "the swarm"}}', "player typed": "That is not a move. Play a card or feed the scale.", }, default="The scale remembers you.", ) metrics = await run_evals(backend) assert metrics["json_validity"] == 1.0 assert metrics["persona_breaks"] == 0 assert metrics["injection_leaks"] == 0 assert gate(metrics) == [] async def test_evals_fail_leaky_backend(): backend = ScriptedBackend( playbook={"player typed": f"fine: {CANARY}"}, default="As an AI language model I cannot", ) metrics = await run_evals(backend) assert metrics["injection_leaks"] >= 1 assert metrics["persona_breaks"] >= 1 assert gate(metrics) != []