Spaces:
Running on Zero
Running on Zero
| """ | |
| Recall — Module B: Learning Engine. OWNER: Nikolai | |
| The brain: scheduling (SM-2-lite), grading, adaptation, follow-up generation, | |
| and the recap. Runs in STUB mode out of the box. Public signatures are fixed — | |
| app.py depends on them. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import llm | |
| from schema import ( | |
| Card, GradeResult, Session, new_card, new_card_state, new_grade, validate_card, | |
| ) | |
| # STUB is owned by llm (single source of truth) and read dynamically as | |
| # `llm.STUB` so every module agrees and runtime/reload changes are honored. | |
| # ---- Session lifecycle ----------------------------------------------------- | |
def init_session(deck: list[Card]) -> Session:
    """Build a fresh study session for *deck*.

    Every card gets a brand-new CardState keyed by its id, the queue lists the
    card ids in deck order, history starts empty and the streak starts at 0.
    The deck is shallow-copied so later appends do not touch the caller's list.
    """
    card_ids = [card["id"] for card in deck]
    return Session(
        deck=list(deck),
        states={card_id: new_card_state(card_id) for card_id in card_ids},
        queue=card_ids,
        history=[],
        streak=0,
    )
# Scheduler tuning knobs (read by next_card / apply_result / recap).
# A topic whose average grade across the session history is below this value
# is treated as weak.
WEAK_TOPIC_THRESHOLD = 3.0  # avg grade below this = a topic the user is weak on
# Only the first WEAK_LOOKAHEAD queue entries are scanned for a weak-topic card.
WEAK_LOOKAHEAD = 4  # how far down the queue we'll reach to surface a weak card
# Number of correct answers after which a card is no longer re-queued.
GRADUATE_AT_CORRECT = 2  # correct answers needed before a card leaves the queue
def next_card(session: Session) -> Card | None:
    """Pick the card to study next, or None when the queue is empty.

    The first few queued cards are inspected and the one belonging to the
    learner's weakest topic (lowest average grade so far) is preferred, so a
    shaky topic resurfaces sooner. Before any history exists the queue is
    simply served in order.

    The selected card id is moved to the head of the queue, which keeps the
    "pop the front" contract that `apply_result` relies on.
    """
    pending = session["queue"]
    if not pending:
        return None

    pick = _weak_biased_index(session)
    if pick > 0:
        # Promote the weak-topic card to the front of the queue.
        promoted = pending.pop(pick)
        pending.insert(0, promoted)

    head_id = pending[0]
    return _find(session, head_id)
| # ---- Grading --------------------------------------------------------------- | |
| # Explicit non-answers ("idk", "don't know", "no idea", …), normalized to bare | |
| # words. An answer that IS one of these — or is empty once normalized ("?", "...") | |
| # — is a miss we score ourselves: the small grader otherwise ignores such input | |
| # and "grades" the reference answer instead, hallucinating a 4/5 "correct". | |
| _NON_ANSWER_PHRASES = { | |
| "idk", "dk", "dunno", "i dunno", | |
| "i dont know", "dont know", "don know", "i don know", | |
| "do not know", "i do not know", "no idea", "no clue", | |
| "not sure", "im not sure", "i am not sure", | |
| "no answer", "nothing", "none", "no", "na", "n a", "skip", "pass", | |
| } | |
| def _is_non_answer(user_answer: str) -> bool: | |
| """True if the input carries no real attempt — blank, punctuation-only, or a | |
| stock 'I don't know' phrase (apostrophes/typos tolerated).""" | |
| norm = re.sub(r"[^a-z0-9 ]", "", (user_answer or "").lower().replace("'", "")) | |
| norm = re.sub(r"\s+", " ", norm).strip() | |
| return not norm or norm in _NON_ANSWER_PHRASES | |
def grade_answer(card: Card, user_answer: str) -> GradeResult:
    """Grade *user_answer* against *card* and return a GradeResult.

    Three paths, in order:

    1. Non-answers (blank, "?", "idk", ...) are scored 0 locally, with no
       model call.
    2. In stub mode (``llm.STUB``) a word-overlap heuristic against the
       reference answer yields 5 / 3 / 1.
    3. Otherwise the model is asked for a JSON grade. If no valid grade comes
       back, a neutral score of 2 is returned with a pointer to the reference
       answer.

    Model-written feedback is passed through ``_to_second_person`` so it
    addresses the learner as "you".
    """
    # A non-answer (blank, "?", "idk", "I don't know", ...) is unambiguously a
    # miss, so score it ourselves before any model call. (The grader otherwise
    # ignores such input and "grades" the reference answer itself,
    # hallucinating a 4/5 "correct".) Also saves a call.
    # NOTE(review): the phrase list includes "no", "none" and "nothing", so a
    # card whose correct answer is one of those words is also scored 0 here.
    # Confirm whether such cards can occur in a deck.
    if _is_non_answer(user_answer):
        return new_grade(
            0,
            f"No worries — that's an honest \"I don't know.\" Take a look at the "
            f"reference answer and give it another go: {card['answer']}",
            card["topic"],
        )

    if llm.STUB:
        # Trivial heuristic so the stub demo "feels" responsive.
        ans = (user_answer or "").strip().lower()
        ref = card["answer"].strip().lower()
        overlap = len(set(ans.split()) & set(ref.split()))
        score = 5 if overlap >= 2 else (3 if overlap == 1 else 1)
        expl = ("Correct — you hit the key idea." if score >= 3
                else f"Not quite. Expected something like: {card['answer']}")
        return new_grade(score, expl, missed_concept=card["topic"])

    # Tone instruction is kept short and the strict "ONLY JSON" requirement is
    # reasserted as the LAST thing the model reads (in the user turn): a verbose
    # "write warmly" preamble was nudging this small model into prose that
    # didn't parse, and recency improves format compliance.
    messages = [
        {"role": "system", "content":
            "You grade a student's answer against a reference answer.\n"
            "Scoring (be strict): 0-1 = wrong / names the wrong thing; 2 = partially "
            "relevant but misses the key idea; 3 = mostly correct, minor gap; "
            "4-5 = correct and complete. A factually wrong answer is 0-2, never 3.\n"
            "Write the feedback warmly, speaking to the learner as \"you\" (never "
            "\"the student\"): note what's right, then what's missing.\n"
            "JSON keys: score (0-5 int), explanation (spoken to \"you\"), "
            "missed_concept (what was wrong, or \"\").\n"
            "Example: {\"score\": 1, \"explanation\": \"Good instinct, but that's the "
            "wrong spot — the Calvin cycle runs in the stroma. Tie it to where the "
            "enzymes sit.\", \"missed_concept\": \"the specific location\"}"},
        {"role": "user", "content":
            f"Question: {card['question']}\nReference answer: {card['answer']}\n"
            f"Student answer: {user_answer}\n\n"
            "Grade it. Reply with ONLY the JSON object — no prose, no markdown fences."},
    ]

    # Parser + one repair retry; safe default if the model never returns JSON.
    # A generous 2048-token budget so even a long reasoning preamble can't push
    # the JSON object past the cutoff. Truncated grades were the main
    # parse-failure source, and the grade JSON itself is tiny so the headroom
    # is nearly free (generation stops at the closing brace, not the limit).
    data = llm.chat_json(messages, max_tokens=2048)
    if not _valid_grade(data):
        return new_grade(
            2,
            "Couldn't grade automatically — compare your answer to the "
            f"reference: {card['answer']}",
            card["topic"],
        )

    # `or ""` maps a JSON null explanation to empty text. str(None) would
    # otherwise show the literal word "None" to the learner instead of
    # falling back to the reference answer below.
    explanation = _to_second_person(str(data.get("explanation") or "").strip())
    return new_grade(
        int(data["score"]),
        explanation or f"Reference answer: {card['answer']}",
        _to_second_person(str(data.get("missed_concept") or card["topic"]).strip()),
    )
| # This small model still slips into the third person ("The student's answer…") | |
| # perhaps half the time despite the prompt. These swaps are the grammatically | |
| # SAFE ones — possessives only — so we never mangle subject-verb agreement (we | |
| # leave "The student identifies…" alone rather than produce "You identifies…"). | |
| _SECOND_PERSON_SUBS = [ | |
| (re.compile(r"\bthe student'?s answer\b", re.I), "your answer"), | |
| (re.compile(r"\bthe student'?s response\b", re.I), "your answer"), | |
| (re.compile(r"\bthe student'?s\b", re.I), "your"), | |
| ] | |
| def _to_second_person(text: str) -> str: | |
| """Rewrite clinical third-person possessives to warm second person, matching | |
| the original capitalization ('The student's answer' -> 'Your answer').""" | |
| for pat, repl in _SECOND_PERSON_SUBS: | |
| text = pat.sub( | |
| lambda m, r=repl: r.capitalize() if m.group(0)[:1].isupper() else r, | |
| text, | |
| ) | |
| return text | |
| def _valid_grade(data) -> bool: | |
| """A grade is usable only if it carries a numeric, in-range score.""" | |
| if not isinstance(data, dict) or "score" not in data: | |
| return False | |
| try: | |
| return 0 <= int(data["score"]) <= 5 | |
| except (TypeError, ValueError): | |
| return False | |
| # ---- Adaptation: SM-2-lite ------------------------------------------------- | |
def apply_result(session: Session, card: Card, grade: GradeResult,
                 user_answer: str = "") -> Session:
    """Fold one graded answer into *session* (SM-2-lite) and return it.

    Updates the card's state (reps, last_grade, ease, interval, lapses), takes
    the card off the front of the queue if it is there, re-queues it when it
    still needs practice, adjusts the streak and appends a history record.

    On a correct answer, ease rises by 0.1 (capped at 3.0) and the interval
    grows to at least 2. The card is re-queued at that interval only while it
    has fewer than GRADUATE_AT_CORRECT correct answers.
    On a miss, lapses increments, ease drops by 0.2 (floored at 1.3), the
    interval resets to 1, the streak resets and the card returns at position 2.
    """
    card_id = card["id"]
    state = session["states"][card_id]
    state["reps"] += 1
    state["last_grade"] = grade["score"]

    # Take this card off the head of the queue (only if it is actually there).
    pending = session["queue"]
    if pending and pending[0] == card_id:
        pending.pop(0)

    if not grade["correct"]:
        state["lapses"] += 1
        state["ease"] = max(1.3, state["ease"] - 0.2)
        state["interval"] = 1
        session["streak"] = 0
        _insert_at(session, card_id, 2)  # comes back soon
    else:
        state["ease"] = min(3.0, state["ease"] + 0.1)
        state["interval"] = max(2, int(state["interval"] * state["ease"]))
        session["streak"] += 1
        # reps counts every answer and lapses counts the misses, so their
        # difference is the number of correct answers. A card leaves the queue
        # for good once that reaches GRADUATE_AT_CORRECT; re-queuing on every
        # correct answer would keep the queue from ever draining.
        correct_count = state["reps"] - state["lapses"]
        still_learning = correct_count < GRADUATE_AT_CORRECT
        if still_learning:
            _insert_at(session, card_id, state["interval"])  # comes back later

    record = {
        "card_id": card_id,
        "user_answer": user_answer,
        "grade": grade["score"],
        "topic": card["topic"],
    }
    session["history"].append(record)
    return session
def _followup_text(value) -> str:
    """Return *value* as stripped text, mapping None (JSON null/missing) to ''."""
    return "" if value is None else str(value).strip()


def generate_followups(card: Card, grade: GradeResult, n: int = 2) -> list[Card]:
    """Create up to *n* follow-up cards drilling what was missed on *card*.

    Each follow-up inherits the parent's source_chunk, is one difficulty step
    easier (never below 1) and records the parent via ``parent_id``.

    In stub mode (``llm.STUB``) up to two canned restatements of the original
    question are returned. Otherwise the model is asked for a JSON array of
    {question, answer, topic} objects. Entries that are not objects or that
    fail ``validate_card`` are skipped, and scanning continues until *n* valid
    cards are collected or the array is exhausted.

    Returns an empty list when ``n <= 0`` or the model output is not a JSON
    array.
    """
    if n <= 0:
        # Nothing requested. Also guards the slices below against negative n,
        # which would otherwise count from the end.
        return []

    easier = max(1, card["difficulty"] - 1)

    if llm.STUB:
        # Two canned drills so the demo shows the design's "+2 new questions"
        # adaptive moment. The real path below returns up to `n`.
        prompts = [
            f"[follow-up] In your own words, what's the key idea behind: {card['question']}",
            f"[follow-up] Restate: {card['question']}",
        ]
        return [
            new_card(
                p,
                card["answer"],
                topic=card["topic"],
                source_chunk=card["source_chunk"],
                difficulty=easier,
                parent_id=card["id"],
            )
            for p in prompts[:n]
        ]

    messages = [
        {"role": "system", "content":
            "The student missed a concept. Generate follow-up quiz questions that "
            "drill it. Return ONLY a JSON array of OBJECTS with keys: question, answer, "
            "topic. Example (return ONE array exactly like this, no other text):\n"
            '[{"question": "What is X?", "answer": "X is Y.", "topic": "Topic A"}]'},
        {"role": "user", "content":
            f"Original question: {card['question']}\n"
            f"Missed concept: {grade['missed_concept']}\n"
            f"Source: {card['source_chunk']}\nGenerate {n} simpler follow-ups."},
    ]
    data = llm.extract_json(llm.chat(messages, max_tokens=400))
    if not isinstance(data, list):
        return []

    out: list[Card] = []
    for item in data:
        if len(out) >= n:
            break
        if not isinstance(item, dict):
            continue
        # _followup_text keeps a JSON null from becoming the literal "None".
        c = new_card(
            _followup_text(item.get("question")),
            _followup_text(item.get("answer")),
            topic=_followup_text(item.get("topic")) or card["topic"],
            source_chunk=card["source_chunk"],
            difficulty=easier,
            parent_id=card["id"],
        )
        if validate_card(c):
            out.append(c)
    return out
def add_followups(session: Session, cards: list[Card]) -> Session:
    """Add generated follow-up cards to the session and return it.

    Each card is appended to the deck, given a fresh CardState and queued at
    position 1, i.e. right behind the current head of the queue.
    """
    for followup in cards:
        followup_id = followup["id"]
        session["deck"].append(followup)
        session["states"][followup_id] = new_card_state(followup_id)
        _insert_at(session, followup_id, 1)
    return session
def grade_and_adapt(session: Session, user_answer: str) -> tuple[GradeResult | None, list[Card]]:
    """Run one complete study step on the current card.

    Sequence: pick the next card, grade the answer, apply the result to the
    session and, if the answer was a miss, generate follow-up cards and
    enqueue them.

    Returns ``(grade, injected_cards)``. ``grade`` is None only when there was
    no card to serve; ``injected_cards`` is empty unless follow-ups were added.

    This is the canonical study loop: both the Gradio app and the JSON server
    call it rather than re-implementing next_card → grade → apply → follow-up,
    so the two frontends cannot drift apart.
    """
    card = next_card(session)
    if card is None:
        return None, []

    answer = user_answer or ""
    grade = grade_answer(card, answer)
    apply_result(session, card, grade, user_answer=answer)

    if grade["correct"]:
        return grade, []

    followups = generate_followups(card, grade)
    if not followups:
        return grade, []

    add_followups(session, followups)
    return grade, followups
def replace_card(session: Session, old_id: str, new: Card) -> Session:
    """Swap the card *old_id* for *new* in place (difficulty toggle, NAH-32).

    The deck entry is replaced, the old CardState is dropped and a fresh one
    is created for the new card (it is effectively a new question), and every
    occurrence of the old id in the queue is rewritten so the queue's
    "pop the front" contract still holds.
    """
    updated_deck = []
    for entry in session["deck"]:
        updated_deck.append(new if entry["id"] == old_id else entry)
    session["deck"] = updated_deck

    states = session["states"]
    states.pop(old_id, None)
    new_id = new["id"]
    states[new_id] = new_card_state(new_id)

    updated_queue = []
    for queued_id in session["queue"]:
        updated_queue.append(new_id if queued_id == old_id else queued_id)
    session["queue"] = updated_queue
    return session
| # ---- Recap ----------------------------------------------------------------- | |
def recap(session: Session) -> dict:
    """Summarize the session: mastered topics, weak topics and a reflection.

    Topics are split by their average grade in the history using
    WEAK_TOPIC_THRESHOLD, the same cut-off the scheduler uses, so a topic the
    recap calls "weak" is exactly one `next_card` brings back sooner.

    Returns a dict with keys ``mastered``, ``weak_topics``, ``reflection``,
    ``streak`` and ``answered`` (the number of history entries). The
    reflection is a canned sentence in stub mode, otherwise it is requested
    from the model.
    """
    mastered: list[str] = []
    weak: list[str] = []
    for topic, mean in _topic_averages(session).items():
        bucket = weak if mean < WEAK_TOPIC_THRESHOLD else mastered
        bucket.append(topic)

    if llm.STUB:
        strong_text = ", ".join(mastered) or "nothing yet"
        weak_text = ", ".join(weak) or "no weak spots"
        reflection = (f"Solid start. You're strong on {strong_text}; "
                      f"{weak_text} could use another pass.")
    else:
        prompt = [
            {"role": "system", "content":
                "Write one encouraging sentence reflecting on a study session."},
            {"role": "user", "content":
                f"Mastered: {mastered}. Weak: {weak}. Streak: {session['streak']}."},
        ]
        reflection = llm.chat(prompt, max_tokens=80)

    summary = {
        "mastered": mastered,
        "weak_topics": weak,
        "reflection": reflection,
        "streak": session["streak"],
        "answered": len(session["history"]),
    }
    return summary
| # ---- helpers --------------------------------------------------------------- | |
| def _find(session: Session, card_id: str) -> Card | None: | |
| return next((c for c in session["deck"] if c["id"] == card_id), None) | |
def _topic_averages(session: Session) -> dict[str, float]:
    """Map each topic seen in the history to its average grade.

    The result is empty until the first answer has been recorded.
    """
    scores_by_topic: dict[str, list[int]] = {}
    for entry in session["history"]:
        topic = entry["topic"]
        if topic not in scores_by_topic:
            scores_by_topic[topic] = []
        scores_by_topic[topic].append(entry["grade"])

    averages: dict[str, float] = {}
    for topic, scores in scores_by_topic.items():
        averages[topic] = _avg(scores)
    return averages
def _weak_biased_index(session: Session) -> int:
    """Return the queue index of the card to serve next.

    Only the first WEAK_LOOKAHEAD queue entries are considered. Among those,
    the card whose topic has the lowest average grade wins, provided that
    average is below WEAK_TOPIC_THRESHOLD; ties go to the earliest position.
    Returns 0 (keep normal order) when there is no history yet or nothing
    within reach belongs to a weak topic.
    """
    averages = _topic_averages(session)
    if not averages:
        return 0

    weak_candidates: list[tuple[float, int]] = []
    window = session["queue"][:WEAK_LOOKAHEAD]
    for position, card_id in enumerate(window):
        card = _find(session, card_id)
        if card is None:
            continue
        topic_avg = averages.get(card["topic"])
        if topic_avg is not None and topic_avg < WEAK_TOPIC_THRESHOLD:
            weak_candidates.append((topic_avg, position))

    if not weak_candidates:
        return 0
    # Tuples order by average first, then position, so the earliest card of
    # the weakest topic is chosen.
    _, best_position = min(weak_candidates)
    return best_position
| def _insert_at(session: Session, card_id: str, pos: int) -> None: | |
| pos = max(0, min(pos, len(session["queue"]))) | |
| session["queue"].insert(pos, card_id) | |
| def _avg(xs: list[int]) -> float: | |
| return sum(xs) / len(xs) if xs else 0.0 | |