File size: 7,176 Bytes
414dc55
 
 
 
 
 
 
 
 
 
 
80cd1f2
 
 
414dc55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80cd1f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414dc55
 
 
 
 
 
 
 
 
 
 
80cd1f2
 
 
 
 
 
 
 
414dc55
 
80cd1f2
 
 
414dc55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80cd1f2
 
414dc55
 
 
 
 
80cd1f2
 
414dc55
 
 
 
 
 
80cd1f2
414dc55
80cd1f2
 
414dc55
 
 
 
 
 
 
 
 
 
 
 
80cd1f2
 
 
414dc55
 
 
80cd1f2
414dc55
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""Pre-bake a pool of full, model-authored cases for instant New Case serving.

Generation on a 2-vCPU Space takes ~1-2 minutes, so the player would otherwise stare at a
loading screen. This script runs the SAME in-process llama.cpp generator offline, keeps only
solvable, well-formed, "exciting" cases (distinct human suspects, a real motive, no
detective/officer suspects, a gender mix), assigns each a stable Case ID, and writes the full
sealed CaseFile JSON to ``cases/prebaked/``. The Space ships these and serves one instantly on
New Case while still running every interrogation live (and generating fresh cases when the
hardware allows). The pre-baked cases are authored by the local model - no cloud, still
Off-the-Grid.

New cases are APPENDED after the existing pool (existing Case IDs keep working as share
links) and cycle through the crime kinds so the pool is not all murders.

    python scripts/prebake_cases.py [target_count] [start_seed]

``target_count`` defaults to 8 and ``start_seed`` to 51000.
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "src"))

from case_zero.config import get_settings  # noqa: E402
from case_zero.generator.pipeline import generate_case  # noqa: E402
from case_zero.llm.backend import make_backend  # noqa: E402
from case_zero.persistence.case_store import save_case  # noqa: E402
from case_zero.persistence.paths import prebaked_cases_dir  # noqa: E402
from case_zero.schemas.case import CaseFile, GenerationKnobs  # noqa: E402
from case_zero.schemas.enums import CrimeKind  # noqa: E402

# The pool already on disk is mostly homicides, so this rotation leads with the other
# crime kinds. main() indexes it with the number of cases kept so far, modulo its length.
_KIND_PLAN: tuple[CrimeKind, ...] = (
    CrimeKind.THEFT,
    CrimeKind.BLACKMAIL,
    CrimeKind.ARSON,
    CrimeKind.MISSING,
    CrimeKind.FRAUD,
    CrimeKind.THEFT,
    CrimeKind.HOMICIDE,
    CrimeKind.MISSING,
    CrimeKind.ARSON,
    CrimeKind.FRAUD,
    CrimeKind.HOMICIDE,
    CrimeKind.BLACKMAIL,
)

# Short function words that stay lowercase in the middle of a title.
_SMALL = {
    "a", "an", "and", "at", "but", "by", "for",
    "in", "of", "on", "or", "the", "to",
}


def _titlecase(raw: str) -> str:
    """Return *raw* in title case, with single spaces between words.

    Every word is lowercased and then capitalised, except that a word listed in
    ``_SMALL`` stays lowercase unless it is the first or last word. ``None`` and
    blank input give an empty string.
    """
    tokens = [tok.lower() for tok in (raw or "").split()]
    final = len(tokens) - 1
    return " ".join(
        tok if 0 < pos < final and tok in _SMALL else tok.capitalize()
        for pos, tok in enumerate(tokens)
    )

# Law-enforcement words; a suspect whose role or name contains one is rejected as
# "detective-like" by _is_exciting.
_BAD_ROLE = re.compile(
    r"\b(detective|officer|investigator|police|inspector|sergeant|constable|cop|agent)\b",
    flags=re.IGNORECASE,
)
# Stock filler names a small model tends to fall back on - they look fake on sight and
# spoil the atmosphere. Entries are lowercase.
_PLACEHOLDER_NAMES = {
    "john doe",
    "jane doe",
    "john smith",
    "jane smith",
    "joe bloggs",
    "richard roe",
    "mary major",
    "john q public",
    "tom johnson",
    "tom smith",
    "jack smith",
    "jane roe",
    "john brown",
    "bob smith",
    "foo bar",
    "first last",
    "name surname",
}
# A "name" that is really a role description ("Rival Curator", "Business Partner").
_ROLE_AS_NAME = re.compile(
    r"\b(rival|partner|business|curator|servant|butler|maid|cousin|nephew|niece|heir|"
    r"the\s|guest|stranger|visitor|neighbou?r|colleague|assistant|clerk|owner|manager)\b",
    flags=re.IGNORECASE,
)


def _name_malformed(n: str) -> bool:
    """Return True when *n* does not look like a plain personal name.

    Flags names carrying an extra label ("John Smith, Male", "Lara White, 45") and
    names that are really a role description ("Rival Curator").
    """
    # A comma or a digit means a gender/age/label was appended to the name.
    if "," in n:
        return True
    if any(ch.isdigit() for ch in n):
        return True
    # A standalone gender word inside the name.
    if re.search(r"\b(male|female)\b", n, re.I):
        return True
    # Otherwise the only remaining failure is a role word used as a name.
    return _ROLE_AS_NAME.search(n) is not None


def _name_prefix(n: str) -> str:
    """Return the first two words of *n*, lowercased and joined by one space.

    Commas count as word separators. A name with fewer than two words comes back
    as whatever words it has.
    """
    without_commas = n.lower().replace(",", " ")
    leading_words = without_commas.split()[:2]
    return " ".join(leading_words)


def _is_placeholder_name(name: str) -> bool:
    """Return True when *name* begins with an entry of ``_PLACEHOLDER_NAMES``."""
    if _name_prefix(name) in _PLACEHOLDER_NAMES:
        return True
    # _name_prefix keeps only two words, so a three-word entry such as
    # "john q public" could never match through it. Compare the first three
    # words as well, dropping dots so "John Q. Public" is caught too.
    tokens = name.lower().replace(",", " ").replace(".", " ").split()
    return " ".join(tokens[:3]) in _PLACEHOLDER_NAMES


def _is_exciting(case: CaseFile) -> tuple[bool, str]:
    """Reject bland or malformed cases; keep ones that will read well to a judge.

    Checks run in a fixed order and stop at the first failure.

    Returns:
        ``(True, "ok")`` when every check passes, otherwise ``(False, reason)``
        where ``reason`` describes the first check that failed.
    """
    title = (case.title or "").strip()
    if len(title) < 5:
        return False, "weak title"

    # The victim needs a two-part name with no label or role baked in.
    vname = case.victim.name.strip()
    if not vname or " " not in vname or _name_malformed(vname):
        return False, f"victim needs a clean full name: '{vname}'"

    names = [s.name.strip() for s in case.suspects]
    # Compare case-insensitively so "Ann Lee" and "ann lee" count as the same name.
    if len({n.lower() for n in names}) != len(names):
        return False, "duplicate suspect names"
    if any(len(n) < 3 or " " not in n for n in names):
        return False, "suspect needs a full name"
    if any(_name_malformed(n) for n in names):
        return False, f"malformed name (comma/digit/gender): {names}"
    if any(_is_placeholder_name(n) for n in names):
        return False, f"placeholder name: {names}"

    roles = [s.role.strip().lower() for s in case.suspects]
    if len(set(roles)) < len(roles):
        return False, "duplicate suspect roles"
    for s in case.suspects:
        if _BAD_ROLE.search(s.role) or _BAD_ROLE.search(s.name):
            return False, f"detective-like suspect: {s.name} ({s.role})"

    # Gender mix: at least one gender starting with "f" and one starting with "m".
    if not any((s.visual.gender or "").lower().startswith("f") for s in case.suspects):
        return False, "no female suspect"
    if not any((s.visual.gender or "").lower().startswith("m") for s in case.suspects):
        return False, "no male suspect"

    # The culprit must come with a written method; only the method narrative is
    # checked here (no separate motive field is inspected).
    if not (case.culprit.method_narrative or "").strip():
        return False, "no method narrative"
    return True, "ok"


def _first_free_case_number(pool: list[Path]) -> int:
    """Return a case number above every ``CASE-<digits>.json`` file in *pool*.

    The result is never lower than ``len(pool) + 1``, so a gap-free pool is
    numbered exactly as a plain file count would number it.
    """
    highest = len(pool)
    for path in pool:
        found = re.fullmatch(r"CASE-(\d+)", path.stem)
        if found:
            highest = max(highest, int(found.group(1)))
    return highest + 1


def main() -> int:
    """Generate cases until ``target`` are kept or the attempt budget runs out.

    Command line: ``[target_count] [start_seed]`` (defaults 8 and 51000). Each
    attempt uses the next seed; a case is kept only when generation succeeds,
    its report is ok and ``_is_exciting`` accepts it.

    Returns:
        Process exit code: 0 when at least one case was kept, 1 otherwise.
    """
    target = int(sys.argv[1]) if len(sys.argv) > 1 else 8
    start_seed = int(sys.argv[2]) if len(sys.argv) > 2 else 51000
    max_attempts = target * 4 + 8

    backend = make_backend(get_settings())
    out_dir = prebaked_cases_dir()
    out_dir.mkdir(parents=True, exist_ok=True)
    pool = list(out_dir.glob("CASE-*.json"))
    existing = len(pool)
    # Number new cases from the highest ID on disk rather than from the file
    # count: if the pool has a gap (a case was deleted), count-based numbering
    # would reuse a live ID and overwrite that case's file.
    next_number = _first_free_case_number(pool)
    print(f"pool has {existing} cases; appending {target} new ones across crime kinds")

    kept: list[CaseFile] = []
    attempts = 0
    while len(kept) < target and attempts < max_attempts:
        attempts += 1
        # Every attempt consumes exactly one seed, whatever its outcome.
        seed = start_seed + attempts - 1
        kind = _KIND_PLAN[len(kept) % len(_KIND_PLAN)]
        try:
            result = generate_case(backend, seed=seed,
                                   knobs=GenerationKnobs(crime_kind=kind))
        except Exception as exc:  # generation hiccup - skip this seed
            print(f"[seed {seed}] generation error: {exc}")
            continue
        if not result.report.ok:
            print(f"[seed {seed}] unsolvable, skipped")
            continue
        ok, why = _is_exciting(result.case)
        if not ok:
            print(f"[seed {seed}] rejected: {why} -- '{result.case.title}'")
            continue

        case_id = f"CASE-{next_number + len(kept):04d}"
        case = result.case.model_copy(update={"case_id": case_id,
                                              "title": _titlecase(result.case.title)})
        save_case(case, out_dir / f"{case_id}.json")
        kept.append(case)
        # _is_exciting tolerates a missing gender on individual suspects, so guard
        # the slice here too; otherwise the summary line would crash after saving.
        cast = ", ".join(
            f"{s.name} ({(s.visual.gender or '')[:1].upper()})" for s in case.suspects
        )
        print(f"[KEEP {case_id}] ({kind.value}) '{case.title}' - victim {case.victim.name} | {cast}")

    print(f"\nDONE: kept {len(kept)}/{target} in {attempts} attempts -> {out_dir}")
    return 0 if kept else 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())