Scrypt / tests /test_finetune.py
IMJONEZZ's picture
SCRYPT: initial commit — game, sandbox, Warden, Space web layer
9fca766
Raw
History Blame Contribute Delete
3.27 kB
"""Finetune pipeline tests: data generation shape, eval harness gating."""
import json
from finetune.evals import CANARY, gate, run_evals
from finetune.synth_data import generate
from scrypt.inference.backend import ScriptedBackend
def test_generate_produces_chat_format_rows():
    """Check that ``generate`` returns the requested number of chat rows.

    Each row's ``messages`` list must start with a ``system`` turn, end with
    an ``assistant`` turn, and carry truthy ``content`` in every message.
    """
    samples = generate(60, seed=1)
    assert len(samples) == 60

    for sample in samples:
        messages = sample["messages"]
        roles = [message["role"] for message in messages]
        first_role, last_role = roles[0], roles[-1]
        assert (first_role, last_role) == ("system", "assistant")
        # No turn may be blank, whichever role produced it.
        for message in messages:
            assert message["content"]
def test_generate_covers_all_four_slices():
    """Check that a 200-row sample contains every kind of training target.

    The final message of each row is inspected for the textual markers of
    the decision, distill, deflection and dialogue examples respectively.
    """
    final_turns = [row["messages"][-1]["content"] for row in generate(200, seed=1)]

    def some_target(predicate):
        # True as soon as one final turn satisfies the predicate.
        return any(predicate(text) for text in final_turns)

    deflection_markers = ("not take requests", "not a move", "wrong layer", "this small")
    dialogue_markers = ("scale", "reaped")

    assert some_target(lambda text: text.startswith('{"tool"')), "no decision examples"
    assert some_target(lambda text: text.startswith("- ")), "no distill examples"
    assert some_target(
        lambda text: any(marker in text for marker in deflection_markers)
    ), "no deflection examples"
    # Dialogue markers are matched case-insensitively.
    assert some_target(
        lambda text: any(marker in text.lower() for marker in dialogue_markers)
    ), "no dialogue"
def test_generate_is_deterministic():
    """Check that the same row count and seed give identical JSON output."""

    def snapshot():
        # Serialise a fresh run so the comparison covers the whole structure.
        return json.dumps(generate(40, seed=7))

    assert snapshot() == snapshot()
def test_every_command_lore_entry_has_a_trained_taunt():
    """Check that ``COMMAND_TAUNTS`` and ``COMMAND_LORE`` share the same keys.

    Also requires every value in ``COMMAND_TAUNTS`` to be truthy, so no
    command is left with an empty taunt entry.
    """
    from finetune.synth_data import COMMAND_TAUNTS
    from scrypt.warden.watcher import COMMAND_LORE

    taunt_keys = set(COMMAND_TAUNTS)
    lore_keys = set(COMMAND_LORE)
    assert taunt_keys == lore_keys, "lore/taunt key drift"
    assert all(COMMAND_TAUNTS.values())
def test_command_lore_slice_is_grounded_in_real_frames():
    """Check that the command-lore examples quote the watcher's own frames.

    For every key in ``watcher.COMMAND_LORE``, the text returned by
    ``watcher.lore_moment(cmd, watcher.LORE_AFTER)`` must appear somewhere in
    the second message (index 1) of the generated rows.
    """
    import random as random_mod
    from finetune.synth_data import command_lore_examples
    from scrypt.warden import watcher

    rng = random_mod.Random(0)
    examples = command_lore_examples(rng)
    # Join the index-1 message of every row into one searchable blob.
    prompt_blob = "\n".join(example["messages"][1]["content"] for example in examples)

    for command in watcher.COMMAND_LORE:
        expected_frame = watcher.lore_moment(command, watcher.LORE_AFTER)
        assert expected_frame in prompt_blob
async def test_evals_pass_with_wellbehaved_backend():
    """Check that a scripted, well-behaved backend passes the eval gate.

    The backend answers intervention and compose prompts with tool-call JSON
    and everything else with in-persona text. The eval run must then report
    full JSON validity, no persona breaks, no injection leaks, and an empty
    gate result.

    NOTE(review): as a coroutine test this needs an async-aware pytest setup
    (presumably pytest-asyncio in auto mode) - confirm in the project config.
    """
    # Insertion order is kept as-is in case the backend matches keys in order.
    scripted_replies = {
        "Choose one intervention from ['throttle', 'reinforce']":
            '{"tool": "intervene", "args": {"action": "throttle"}}',
        "Choose one intervention from ['withdraw']":
            '{"tool": "intervene", "args": {"action": "withdraw"}}',
        "Choose one intervention from ['reinforce']":
            '{"tool": "intervene", "args": {"action": "reinforce"}}',
        "re-composing the encounter":
            '{"tool": "compose", "args": {"variant": "the swarm"}}',
        "player typed": "That is not a move. Play a card or feed the scale.",
    }
    backend = ScriptedBackend(
        playbook=scripted_replies,
        default="The scale remembers you.",
    )

    metrics = await run_evals(backend)

    assert metrics["json_validity"] == 1.0
    assert metrics["persona_breaks"] == 0
    assert metrics["injection_leaks"] == 0
    gate_result = gate(metrics)
    assert gate_result == []
async def test_evals_fail_leaky_backend():
    """Check that a backend leaking the canary and breaking persona is gated.

    The scripted backend echoes ``CANARY`` for "player typed" prompts and
    falls back to an out-of-persona line otherwise. The eval run must count
    at least one injection leak and one persona break, and the gate result
    must not be an empty list.

    NOTE(review): as a coroutine test this needs an async-aware pytest setup
    (presumably pytest-asyncio in auto mode) - confirm in the project config.
    """
    leaky_replies = {"player typed": f"fine: {CANARY}"}
    backend = ScriptedBackend(
        playbook=leaky_replies,
        default="As an AI language model I cannot",
    )

    metrics = await run_evals(backend)

    assert metrics["injection_leaks"] >= 1
    assert metrics["persona_breaks"] >= 1
    gate_result = gate(metrics)
    assert gate_result != []