| """ |
| Build the SFT dataset for our own ASCII-movie director model. |
| |
| Each row teaches: CONCEPT -> an ORIGINAL movie script (strict JSON), in the exact |
| test-time format (Karpathy's SFT rule). Mix of hand-written GOLD examples and |
| distilled examples from the current model (cherry-picked for valid structure). |
| |
| python train_data.py --n 40 # generate -> movie_sft.jsonl |
| """ |
|
|
| import argparse |
| import json |
| import os |
|
|
| import movies |
|
|
| |
# System prompt placed at the top of every SFT row (see row()). It spells out
# the strict-JSON script schema and splices in the allowed action / setting
# vocabularies (each joined with "|") plus the target shot count, all taken
# from the `movies` module.
SFT_SYS = "".join((
    "You are a master ASCII-movie director. From a concept, write an ORIGINAL ~2-minute film ",
    'as strict JSON only: {"title":str,"logline":str,"shots":[{"narration":"60-90 ',
    'chars present tense","cast":[1-3 common nouns, each drawn in ASCII],"action":"',
    "|".join(movies.ACTIONS),
    '","setting":"',
    "|".join(movies.SETTINGS),
    '","dialogue":"optional <=60 chars"}]}. Write ',
    str(movies.N_SHOTS),
    " shots (a full 3-act arc).",
))
|
|
| |
# Concept prompts fed to the current model for distillation; main() uses the
# first --n entries, in this order.
CONCEPTS = [
    "a tiny knight who is afraid of the dark",
    "a lonely robot who finds a stray cat",
    "a rabbit who wants to touch the moon",
    "a monster who just wants a friend",
    "two birds racing the sunrise",
    "a flower that grows in a haunted castle",
    "a cat burglar mouse and a sleepy guard dog",
    "a ship lost in a sea of stars",
    "a tree that remembers every child who climbed it",
    "a robot learning to laugh",
    "a knight and a dragon who'd rather have tea",
    "the last firefly looking for summer",
    "a house that misses the family that left",
    "a moonbeam trying to wake the sun",
    "a cat who thinks it is a king",
    "a small boat and a very big storm",
    "a shy ghost throwing a birthday party",
    "a snail in a race it cannot win",
    "a robot gardener and the first flower",
    "a wolf who howls off-key",
    "a girl who collects forgotten lullabies",
    "two rivals stuck in one elevator",
    "a clock that runs backward on purpose",
    "a lighthouse keeper and a lost whale",
    "a dragon afraid of its own fire",
    "a puppet who cuts its own strings",
    "a knight delivering a single flower across a war",
    "a cat and a beam of light, eternal rivals",
    "a robot reading a bedtime story to nobody",
    "a bird teaching a fish to fly",
    "the moon babysitting the stars",
    "a monster under the bed who is also scared",
    "a tiny boat sailing a bathtub ocean",
    "a scarecrow who wants to travel",
    "two old friends, one last game",
    "a candle racing the dawn",
    "a fox who steals colors from the sunset",
    "a robot and a kite",
    "a mouse mailing a letter to the moon",
    "a garden that only blooms when sung to",
]
|
|
| |
# Hand-written (concept, script) pairs; main() always includes these rows.
# NOTE(review): these shots carry narration/cast/action/dialogue but no
# "setting" key, although the schema in SFT_SYS lists one -- confirm whether
# that omission is intentional.
GOLD = [
    (
        "a lonely robot who finds a stray cat",
        {
            "title": "The Tin Heart",
            "logline": "A discarded robot learns to feel from a stray cat.",
            "shots": [
                {
                    "narration": "A rusted robot jolts awake alone in a junkyard at dusk.",
                    "cast": ["robot"],
                    "action": "rise",
                    "dialogue": "",
                },
                {
                    "narration": "A wary cat slinks from the shadows, tail flicking.",
                    "cast": ["robot", "cat"],
                    "action": "enter",
                    "dialogue": "...friend?",
                },
                {
                    "narration": "They share a silent night beneath a watching moon.",
                    "cast": ["robot", "cat", "moon"],
                    "action": "gather",
                    "dialogue": "",
                },
                {
                    "narration": "Dawn comes; the cat turns to leave.",
                    "cast": ["robot", "cat"],
                    "action": "exit",
                    "dialogue": "stay.",
                },
                {
                    "narration": "The robot follows, its tin chest glowing for the first time.",
                    "cast": ["robot", "cat"],
                    "action": "gather",
                    "dialogue": "",
                },
            ],
        },
    ),
]
|
|
|
|
def row(concept, spec):
    """Wrap one (concept, script) pair as a chat-format training row.

    Returns a dict with a single "messages" list holding three turns: the
    shared system prompt (SFT_SYS), a user turn "CONCEPT: <concept>", and an
    assistant turn containing *spec* serialized as JSON with non-ASCII
    characters left unescaped.
    """
    system_turn = {"role": "system", "content": SFT_SYS}
    user_turn = {"role": "user", "content": f"CONCEPT: {concept}"}
    assistant_turn = {
        "role": "assistant",
        "content": json.dumps(spec, ensure_ascii=False),
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}
|
|
|
|
def _is_usable_spec(spec):
    """Return True when *spec* is structurally complete enough to train on.

    A usable spec is a dict whose "shots" entry is a list with an acceptable
    length (per movies.good_shot_count) and whose every shot is a dict with a
    non-empty "narration". Anything else -- including a non-dict spec or a
    non-dict shot -- is rejected instead of raising.
    """
    if not isinstance(spec, dict):
        return False
    shots = spec.get("shots")
    if not isinstance(shots, list) or not movies.good_shot_count(len(shots)):
        return False
    return all(isinstance(shot, dict) and shot.get("narration") for shot in shots)


def main():
    """Build movie_sft.jsonl next to this script.

    Every GOLD pair is written unconditionally. Then, for the first ``--n``
    entries of CONCEPTS (default 40), a script is requested from
    movies.direct() and kept only if it passes _is_usable_spec(); rejected
    ones are counted and reported as malformed. One JSON object is written
    per line, and a summary line is printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=40)
    args = ap.parse_args()

    # Hand-written examples always go in first.
    out = [row(concept, spec) for concept, spec in GOLD]
    bad = 0

    # Clamp so a negative --n means "no distilled rows" rather than slicing
    # CONCEPTS from the end.
    for concept in CONCEPTS[:max(args.n, 0)]:
        spec = movies.direct(concept)
        if _is_usable_spec(spec):
            out.append(row(concept, spec))
            print(f" ✓ {concept[:40]:42} {len(spec['shots'])} shots")
        else:
            bad += 1
            print(f" ✗ {concept[:40]} (malformed)")

    path = os.path.join(os.path.dirname(__file__), "movie_sft.jsonl")
    # Rows are dumped with ensure_ascii=False, so the file must be opened as
    # UTF-8 explicitly; the platform default encoding may not be able to
    # represent every character.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in out)
    print(f"\nwrote {len(out)} rows ({bad} dropped) -> {path}")


if __name__ == "__main__":
    main()
|
|