File size: 21,426 Bytes
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6736c51
 
 
 
7563305
6736c51
 
 
 
 
7563305
 
6736c51
 
 
 
 
 
 
 
 
7563305
 
6736c51
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6736c51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76f2051
 
 
 
6736c51
 
 
76f2051
6736c51
 
 
 
 
 
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6736c51
 
 
 
 
 
7563305
 
 
 
6736c51
 
 
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d0b820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae15cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7563305
 
1d0b820
 
 
7563305
 
 
 
 
 
 
1d0b820
 
 
 
 
 
 
 
 
 
 
 
 
7563305
 
ae15cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7563305
 
 
 
 
ae15cb7
 
 
7563305
 
 
 
 
 
 
 
 
 
 
ae15cb7
 
 
 
7563305
 
 
 
 
6736c51
 
 
 
 
 
 
 
 
 
7563305
 
6736c51
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
"""
Recall — shared inference wrapper.  OWNER: Nikolai (Module B)

Everything that touches the model goes through `chat()`. Both content_pipeline
and learning_engine import this and nothing else model-related.

Default is STUB mode (RECALL_STUB=1) so `python app.py` runs with no GPU and no
model download. Flip RECALL_STUB=0 once the real MiniCPM call works on the Space.

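Model is a one-env-var config flip (NAH-9). Set RECALL_MODEL to a known alias
or any full HF id; default is `v46` (openbmb/MiniCPM-V-4.6, multimodal). If the
Space is too slow / OOM, swap to a smaller model with no code change (note: the
legacy text-only aliases below need transformers<5.0):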

    RECALL_MODEL=1b RECALL_STUB=0 python app.py   # MiniCPM5-1B  (fast fallback)
    RECALL_MODEL=4b RECALL_STUB=0 python app.py   # MiniCPM3-4B  (Tiny Titan badge)

Aliases resolve via MODELS below; an unknown value is treated as a literal HF id.

Load dtype/device default to bf16 + device_map="auto" (correct for the Space's
CUDA GPU). For a local real-model smoke test on Apple Silicon, override them —
bf16 on MPS produces garbage, so use CPU/float32:

    RECALL_STUB=0 RECALL_MODEL=1b RECALL_DTYPE=float32 RECALL_DEVICE=cpu python app.py
"""
from __future__ import annotations

import json
import os
import re

# Stub mode is ON unless RECALL_STUB is set to something other than the exact
# string "1" (unset -> "1" -> stubbed). Note the comparison is literal: values
# such as "true" or "yes" turn stubbing OFF, just like "0" does.
STUB = os.getenv("RECALL_STUB", "1") == "1"

# Known models, keyed by short alias so swapping is a single env-var flip.
MODELS = {
    "v46": "openbmb/MiniCPM-V-4.6",  # default / primary — multimodal (text + image)
    "8b": "openbmb/MiniCPM4.1-8B",   # legacy text-only (needs transformers<5.0)
    "1b": "openbmb/MiniCPM5-1B",     # legacy fast fallback
    "4b": "openbmb/MiniCPM3-4B",     # legacy mid fallback (Tiny Titan badge)
}
# Default is the multimodal MiniCPM-V 4.6 so the same model grades text AND reads
# image-only / scanned PDFs. The legacy text aliases need transformers<5.0 and no
# longer load against the pinned transformers 5.x — keep them only for reference.
_requested = os.getenv("RECALL_MODEL", "v46")
# Accept an alias ("v46") or a full HF id ("org/model") passed through verbatim.
# Alias lookup is an exact, case-sensitive dict match; anything that is not a
# key of MODELS is used unchanged as the model id.
MODEL_ID = MODELS.get(_requested, _requested)


def _is_vision_model(model_id: str) -> bool:
    """Report whether `model_id` names a MiniCPM-V (vision) checkpoint.

    Vision ids load through a different model class and a processor rather
    than the text-only path, so callers branch on this. The check is a
    case-insensitive substring match on the '-V' family marker."""
    family_marker = "minicpm-v"
    normalized = model_id.lower()
    return family_marker in normalized


# True when the configured model id belongs to the MiniCPM-V (vision) family.
# Computed once at import time from MODEL_ID.
VISION = _is_vision_model(MODEL_ID)

# Lazily populated module-level singletons; all start as None. _load() fills
# _model/_tokenizer, _load_vision() fills _model/_processor, and both return
# early once _model is set.
_model = None
_tokenizer = None
_processor = None  # MiniCPM-V uses an AutoProcessor (image+text) instead of a tokenizer


def active_model() -> str:
    """Return the configured HF model id, or the literal 'stub' in stub mode."""
    if STUB:
        return "stub"
    return MODEL_ID


# Load-time dtype/device, overridable for local dev (defaults are correct for
# the Space's CUDA GPU). bf16 on Apple-Silicon MPS produces garbage output, so a
# Mac real-model smoke test needs RECALL_DTYPE=float32 RECALL_DEVICE=cpu; unset,
# behavior is unchanged (bf16 + device_map="auto").
_DTYPE_ALIASES = {
    "bfloat16": "bfloat16", "bf16": "bfloat16",
    "float16": "float16", "fp16": "float16", "half": "float16",
    "float32": "float32", "fp32": "float32", "float": "float32",
}


def _resolve_dtype_name() -> str:
    """Return the normalized torch dtype name selected by RECALL_DTYPE.

    The env value is matched case-insensitively against _DTYPE_ALIASES after
    surrounding whitespace is removed, so values copied from a .env file or a
    shell with a trailing space/newline ("float32 ") still resolve instead of
    silently degrading to the default.

    Returns:
        One of 'bfloat16', 'float16' or 'float32'. Unset, empty or unknown
        values fall back to 'bfloat16' rather than erroring at load.
    """
    requested = os.getenv("RECALL_DTYPE", "bfloat16").strip().lower()
    return _DTYPE_ALIASES.get(requested, "bfloat16")


def _resolve_device_map():
    """Return the device_map value to pass to from_pretrained.

    RECALL_DEVICE overrides the default, e.g. 'cpu' for stable local CPU
    inference. Surrounding whitespace is stripped, so a padded value ("cpu ")
    is passed on clean and a blank / whitespace-only value is treated as unset.

    Returns:
        The stripped RECALL_DEVICE value, or 'auto' when it is unset or blank.
    """
    requested = os.getenv("RECALL_DEVICE", "").strip()
    return requested or "auto"


def _load():
    """Load the text model and its tokenizer into the module globals, once.

    A no-op when `_model` is already populated. torch/transformers are imported
    here rather than at module top so stub mode never needs them installed."""
    global _model, _tokenizer
    if _model is not None:
        return
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    dtype_name = _resolve_dtype_name()
    placement = _resolve_device_map()
    print(f"[recall] loading model: {MODEL_ID} (dtype={dtype_name}, "
          f"device_map={placement})")

    model_kwargs = {
        "torch_dtype": getattr(torch, dtype_name),
        "device_map": placement,
        "trust_remote_code": True,
    }
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    _model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)


# ---- MiniCPM-V (multimodal) path -------------------------------------------
# NEEDS GPU VERIFICATION: the calls below mirror the official MiniCPM-V-4.6 demo
# Space (openbmb/MiniCPM-V-4.6-Demo) but can't be exercised without a GPU + the
# ~9B model. The stub and legacy text paths are unchanged and remain testable.


def _maybe_gpu(fn):
    """Decorate `fn` with HF ZeroGPU's @spaces.GPU(duration=120) if possible.

    When the `spaces` package cannot be imported (stub/local environments where
    it isn't installed) `fn` is returned untouched. Registering a @spaces.GPU
    function is ALSO what keeps a ZeroGPU Space healthy (a ZeroGPU Space with
    none flips to RUNTIME_ERROR — see server.py)."""
    try:
        import spaces
    except Exception:  # noqa: BLE001 — not installed (stub/local): run un-wrapped
        return fn
    else:
        gpu_decorator = spaces.GPU(duration=120)
        return gpu_decorator(fn)


def _load_vision() -> None:
    """Load the MiniCPM-V model and its processor into the module globals, once.

    A no-op when `_model` is already populated. The model is switched to eval
    mode before being stored."""
    global _model, _processor
    if _model is not None:
        return
    import torch
    from transformers import AutoProcessor, MiniCPMV4_6ForConditionalGeneration

    dtype_name = _resolve_dtype_name()
    placement = _resolve_device_map()
    print(f"[recall] loading vision model: {MODEL_ID} (dtype={dtype_name}, "
          f"device_map={placement})")

    _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    loaded = MiniCPMV4_6ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=getattr(torch, dtype_name),
        attn_implementation="sdpa",
        trust_remote_code=True,
        device_map=placement,
    )
    _model = loaded.eval()


def _to_vision_content(content):
    """Convert a message's `content` into a list of MiniCPM-V content parts.

    A plain string becomes a single text part. Any other iterable is walked
    item by item: strings become text parts and everything else (a PIL.Image,
    or whatever image-like object the processor accepts) becomes an image part.
    """
    def _as_part(item):
        if isinstance(item, str):
            return {"type": "text", "text": item}
        return {"type": "image", "image": item}

    if isinstance(content, str):
        return [_as_part(content)]
    return [_as_part(item) for item in content]


@_maybe_gpu
def _chat_vision(messages: list[dict], max_tokens: int) -> str:
    """MiniCPM-V 4.6 inference, mirroring the official demo's processor+generate
    call (non-streaming). enable_thinking=False keeps the tight token budget for
    the JSON answer instead of a <think> preamble.

    Args:
        messages: chat turns; each must have "role" and "content" keys. Content
            is normalized through _to_vision_content, so it may be a string or
            a list mixing strings and image objects.
        max_tokens: passed to generate() as max_new_tokens.

    Returns:
        The decoded completion only (prompt tokens sliced off), with special
        tokens skipped and surrounding whitespace stripped.
    """
    _load_vision()
    import torch

    # Re-shape every turn into the {"type": ..., ...} part lists the processor's
    # chat template consumes.
    msgs = [{"role": m["role"], "content": _to_vision_content(m["content"])}
            for m in messages]
    inputs = _processor.apply_chat_template(
        msgs,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        enable_thinking=False,
        processor_kwargs={
            "downsample_mode": "16x",
            "max_slice_nums": 9,
            "use_image_id": True,
        },
    ).to(_model.device)
    # MiniCPM-V wants floating inputs (e.g. pixel_values) in the model dtype.
    # NOTE(review): the cast target is the dtype named by RECALL_DTYPE at call
    # time, which presumably matches what the model was loaded with — confirm.
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
            inputs[k] = v.to(dtype=getattr(torch, _resolve_dtype_name()))

    with torch.no_grad():
        # Greedy decoding (do_sample=False): every caller wants a strict JSON
        # object/array, and greedy is markedly more reliable at that than sampling
        # for MiniCPM-V — verified on GPU. enable_thinking is already False so the
        # tight token budget goes to the answer, not a <think> preamble.
        out = _model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            downsample_mode="16x",
        )
    # Keep only the tokens after the prompt: slice the first sequence from the
    # prompt length (input_ids' second dimension) onward.
    gen = out[0][inputs["input_ids"].shape[1]:]
    return _processor.tokenizer.decode(gen, skip_special_tokens=True).strip()


def _render_prompt(messages: list[dict]) -> str:
    """Render `messages` to a prompt string via the tokenizer's chat template.

    The first attempt passes enable_thinking=False: MiniCPM4.1/MiniCPM5 are
    hybrid reasoning models, and a <think> preamble would spend the
    (deliberately tight) token budget and push the JSON answer past max_tokens
    — and slow the demo. If that call raises for any reason (e.g. a template
    that rejects the flag), the prompt is rendered again without it.
    extract_json() still strips any <think> that leaks through, so this is an
    optimization, not a correctness dependency."""
    render_kwargs = {"tokenize": False, "add_generation_prompt": True}
    try:
        return _tokenizer.apply_chat_template(
            messages, **render_kwargs, enable_thinking=False,
        )
    except Exception:  # noqa: BLE001 — template can't take the flag; render plain
        return _tokenizer.apply_chat_template(messages, **render_kwargs)


def chat(messages: list[dict], max_tokens: int = 512) -> str:
    """
    Run one chat completion and return the assistant's text.

    messages: [{"role": "system"|"user"|"assistant", "content": str}, ...]
    `content` is normally a str. For the multimodal model it may also be a list
    mixing strings and PIL.Image objects (image+text) — e.g. for image-only PDFs.

    Dispatch order: stub mode returns a canned reply; a vision model goes
    through the vision path (where GPU work is wrapped with @spaces.GPU — that
    decorator is also what keeps a ZeroGPU Space healthy); otherwise the
    text-only model is lazy-loaded and sampled (temperature 0.7, top_p 0.9).
    Keep max_tokens tight — latency is the demo killer.
    """
    if STUB:
        return _stub_reply(messages)
    if VISION:
        return _chat_vision(messages, max_tokens)

    _load()
    prompt = _render_prompt(messages)
    encoded = _tokenizer(prompt, return_tensors="pt").to(_model.device)
    sampling = {"do_sample": True, "temperature": 0.7, "top_p": 0.9}
    generated = _model.generate(**encoded, max_new_tokens=max_tokens, **sampling)

    # Drop the echoed prompt tokens; only the completion is decoded.
    prompt_len = encoded["input_ids"].shape[1]
    completion = generated[0][prompt_len:]
    return _tokenizer.decode(completion, skip_special_tokens=True).strip()


# ---- JSON helper: model output is never trusted ----------------------------

_THINK_CLOSE = re.compile(r"</think\s*>", re.IGNORECASE)


def _strip_think(text: str) -> str:
    """Return what follows the LAST closing </think> tag, stripped.

    Reasoning models emit <think>…</think> before the real answer, and when the
    chat template pre-fills the opening tag only the closing one appears in the
    reply — so only the closing tag is searched for. Anchoring on the last one
    also keeps stray braces inside the reasoning away from the JSON search.
    Text with no closing tag (including a truncated, never-closed <think>) is
    returned unchanged, without stripping."""
    closers = list(_THINK_CLOSE.finditer(text))
    if not closers:
        return text
    answer_start = closers[-1].end()
    return text[answer_start:].strip()


def _loads(s: str):
    """Parse `s` as JSON, tolerating output that was escaped like a string body.

    Some models emit JSON as if it were a string literal (backslash-quote for
    quotes, backslash-n for newlines). The strategy, in order:

    1. Plain json.loads — valid JSON returns here, so legitimately escaped
       quotes inside a string are never touched by the fallbacks.
    2. Only if the text contains a backslash-quote: wrap it in quotes, decode
       it once as a JSON string (undoing quote, newline, tab and unicode
       escapes together), then parse the decoded text.
    3. Backstop: replace every backslash-quote with a bare quote and parse.

    Returns the parsed value, or None when every attempt fails."""
    try:
        return json.loads(s)
    except Exception:
        pass
    if '\\"' not in s:
        return None

    unescapers = (
        lambda raw: json.loads('"' + raw + '"'),   # decode as a JSON string body
        lambda raw: raw.replace('\\"', '"'),       # collapse escaped quotes only
    )
    for unescape in unescapers:
        try:
            return json.loads(unescape(s))
        except Exception:
            continue
    return None


def _scan_json_values(text: str) -> list:
    """Collect every JSON object/array that can be decoded while scanning `text`.

    The scan moves left to right; at each '{' or '[' it tries to decode a full
    value starting there. On success the value is kept and scanning resumes
    right after it; on failure the scan advances one character. This copes with
    models that emit several values with no array wrapper — e.g. MiniCPM-V on
    image input returns `{...} {...} {...}` — and skips junk between/around
    them (stray quotes, prose, trailing `"}`)."""
    decoder = json.JSONDecoder()
    found = []
    pos = 0
    limit = len(text)
    while pos < limit:
        if text[pos] not in "{[":
            pos += 1
            continue
        try:
            value, pos_after = decoder.raw_decode(text, pos)
        except ValueError:
            pos += 1
        else:
            found.append(value)
            pos = pos_after
    return found


def _open_brackets(s: str) -> tuple[list, bool]:
    """Return (closers still pending at the end of `s`, ended-inside-string).

    The closers are listed in opening order, so callers reverse them to close
    innermost-first. The scan is string-aware (honouring backslash escapes), so
    brackets inside quotes are ignored. A closing bracket pops the most recent
    opener whatever its kind, and is ignored when nothing is open."""
    closer_for = {"{": "}", "[": "]"}
    pending: list[str] = []
    inside_string = False
    escaped = False
    for char in s:
        if inside_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                inside_string = False
        elif char == '"':
            inside_string = True
        elif char in closer_for:
            pending.append(closer_for[char])
        elif char in "}]" and pending:
            pending.pop()
    return pending, inside_string


def _repair_truncated(text: str):
    """Best-effort recovery of a JSON value cut off mid-stream by the model's
    token limit — a common cause of an otherwise-clean grade/deck failing to
    parse (e.g. a reasoning preamble eats the budget). Closes a dangling string
    and any still-open brackets; if that won't parse, walks back to each completed
    top-level element and retries. Returns the parsed value or None.

    Args:
        text: the model reply; everything before the first '{' or '[' is
            discarded before repair is attempted.

    Returns:
        The parsed value of the longest repairable prefix, or None when the
        text has no bracket, has nothing left open, or no candidate parses.
    """
    # Start from whichever of '{' / '[' appears first; find() yields -1 when a
    # bracket kind is absent, which the filter drops.
    starts = [p for p in (text.find("{"), text.find("[")) if p >= 0]
    if not starts:
        return None
    s = text[min(starts):]
    stack, in_str = _open_brackets(s)
    if not stack and not in_str:
        return None  # nothing left open — not a truncation we can repair
    # Attempt 1: close the value as-is (a trailing complete pair survives).
    # The dangling string (if any) is closed first, then the pending brackets
    # innermost-first.
    data = _loads(s + ('"' if in_str else "") + "".join(reversed(stack)))
    if data is not None:
        return data
    # Attempt 2..n: drop the trailing incomplete element. Top-level element
    # boundaries are commas seen at bracket-depth 1.
    depth = 0
    in_str = esc = False
    boundaries: list[int] = []
    for i, ch in enumerate(s):
        if in_str:
            # Inside a string only escapes and the closing quote matter;
            # brackets and commas here are literal text.
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch in "{[":
            depth += 1
        elif ch in "}]":
            depth -= 1
        elif ch == "," and depth == 1:
            boundaries.append(i)
    # Try the latest boundary first so the largest valid prefix wins. Each
    # candidate is cut just before the comma and re-closed with its own
    # pending brackets.
    for cut in reversed(boundaries):
        head = s[:cut]
        st, _ = _open_brackets(head)
        data = _loads(head + "".join(reversed(st)))
        if data is not None:
            return data
    return None


def extract_json(text: str):
    """
    Pull JSON out of a model reply. Returns the parsed object/array, a list when
    the model emitted several values without an array wrapper, or None. Callers
    must handle None (skip card / use fallback grade).

    Strategies, tried in order until one yields a value:
      1. parse the cleaned text whole (after dropping any <think> preamble and
         ```json fences);
      2. scan for every decodable object/array — one hit is returned as-is,
         several are flattened into a single list;
      3. parse the widest [...] or {...} span embedded in prose;
      4. repair a value truncated by the token limit.
    """
    cleaned = _strip_think(text.strip())
    # Remove a leading ``` / ```json and a trailing ``` on any line.
    cleaned = re.sub(r"^```(?:json)?|```$", "", cleaned, flags=re.MULTILINE).strip()

    parsed = _loads(cleaned)
    if parsed is not None:
        return parsed

    # The whole text didn't parse — commonly because the model concatenated
    # several JSON values. Flatten them so callers expecting an array still work.
    found = _scan_json_values(cleaned)
    if len(found) == 1:
        return found[0]
    if found:
        merged: list = []
        for value in found:
            if isinstance(value, list):
                merged.extend(value)
            else:
                merged.append(value)
        return merged

    # A single value embedded in prose and/or over-escaped: the plain scan
    # can't read that, but _loads can.
    embedded = re.search(r"(\[.*\]|\{.*\})", cleaned, re.DOTALL)
    if embedded is not None:
        parsed = _loads(embedded.group(1))
        if parsed is not None:
            return parsed

    # Last resort: cut off by the token limit — recover the largest valid prefix.
    return _repair_truncated(cleaned)


def _augment_last_user(messages: list[dict]) -> list[dict]:
    """Return a copy of `messages` with a terse 'JSON only' reminder appended
    to the FINAL user turn — used for the repair pass.

    Appending to the existing instruction (rather than injecting a standalone
    'that was not valid JSON' user turn, the previous approach) changes the prompt
    enough to break a deterministic bad reply WITHOUT handing the model a meta
    message it would otherwise *grade as if it were the student's answer* — which
    produced real-looking but nonsensical grades like score 0 / "incorrect JSON
    syntax". We also drop the bad reply rather than echo it back, so the model
    doesn't anchor on its own malformed output.

    Args:
        messages: chat turns (dicts with "role" and usually "content"). Neither
            the list nor its dicts are mutated; each dict is shallow-copied.

    Returns:
        A new list. The last turn whose role is "user" gets the reminder:
        appended as an extra part for multimodal (list) content, used alone when
        the content is missing/None, and concatenated onto the text otherwise.
        If there is no user turn at all, a new user turn holding just the
        reminder is appended.
    """
    out = [dict(m) for m in messages]
    reminder = ("\n\nIMPORTANT: reply with ONLY the raw JSON value — no prose, no "
                "markdown fences, no commentary before or after it.")
    for m in reversed(out):
        if m.get("role") != "user":
            continue
        c = m.get("content")
        if isinstance(c, list):  # multimodal content (images + text parts)
            m["content"] = [*c, reminder]
        elif c is None:
            # Missing content must not be formatted into the literal text
            # "None"; the reminder alone is the whole turn.
            m["content"] = reminder.strip()
        else:
            m["content"] = f"{c}{reminder}"
        return out
    out.append({"role": "user", "content": reminder.strip()})
    return out


def chat_json(messages: list[dict], max_tokens: int = 256, retries: int = 1):
    """
    Ask the model and parse its reply as JSON, allowing up to `retries` repair
    passes after the first attempt. Model output is never trusted: when a reply
    doesn't yield JSON, the SAME task is re-asked with a terse "ONLY JSON"
    reminder folded into the final user turn (no separate 'that was not valid
    JSON' turn is injected, and the bad reply is not echoed back).

    Returns the parsed object/array, or None if every attempt fails (callers
    must handle None with a safe default — never crash the study loop).
    """
    final_attempt = retries
    request = list(messages)
    for attempt in range(retries + 1):
        parsed = extract_json(chat(request, max_tokens=max_tokens))
        if parsed is not None:
            return parsed
        if attempt == final_attempt:
            break
        # Repair pass: always rebuilt from the original messages, so reminders
        # never stack up across retries.
        request = _augment_last_user(messages)
    return None


# ---- Stub replies so the app runs with no model ----------------------------

def _msg_text(content) -> str:
    """Return only the textual part of a message's content.

    A string is returned as-is; a list contributes its string items joined by
    single spaces (images and other non-string items are skipped); any other
    type yields an empty string."""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    text_parts = [part for part in content if isinstance(part, str)]
    return " ".join(text_parts)


def _stub_reply(messages: list[dict]) -> str:
    """Return a canned reply chosen by keywords found in the conversation text.

    All message text is joined and lower-cased, then matched in this order:
    'generate' + 'question' -> a three-card deck; 'grade' or 'score' -> a grade
    object; 'follow' -> a single follow-up card; anything else -> a plain
    placeholder string. The first three are JSON-encoded."""
    def _card(question, answer, difficulty=None):
        card = {"question": question, "answer": answer, "topic": "Stub Topic"}
        if difficulty is not None:
            card["difficulty"] = difficulty
        return card

    pieces = [_msg_text(m.get("content", "")) for m in messages]
    haystack = " ".join(pieces).lower()

    wants_deck = "generate" in haystack and "question" in haystack
    if wants_deck:
        deck = [
            _card("[stub] What is the main idea of the source text?",
                  "The main concept described in the passage.",
                  1),
            _card("[stub] How does the key concept apply in this context?",
                  "It applies by connecting the described mechanism to the outcome.",
                  2),
            _card("[stub] Compare and contrast the two ideas presented.",
                  "They differ in scope but share the same underlying principle.",
                  3),
        ]
        return json.dumps(deck)

    wants_grade = "grade" in haystack or "score" in haystack
    if wants_grade:
        grade = {
            "score": 4,
            "explanation": "[stub] Close — you captured the main idea but missed a detail.",
            "missed_concept": "the specific detail",
        }
        return json.dumps(grade)

    if "follow" in haystack:
        follow_up = _card("[stub follow-up] Can you restate the missed detail?",
                          "The specific detail from the passage.")
        return json.dumps([follow_up])

    return "[stub] model reply"