Spaces:

build-small-hackathon
/

recall

Runtime error

File size: 11,828 Bytes

"""
Recall — Module B: Learning Engine.  OWNER: Nikolai

The brain: scheduling (SM-2-lite), grading, adaptation, follow-up generation,
and the recap. Runs in STUB mode out of the box. Public signatures are fixed —
app.py depends on them.
"""
from __future__ import annotations

import llm
from schema import (
    Card, GradeResult, Session, new_card, new_card_state, new_grade, validate_card,
)

# STUB is owned by llm (single source of truth) and read dynamically as
# `llm.STUB` so every module agrees and runtime/reload changes are honored.


# ---- Session lifecycle -----------------------------------------------------

def init_session(deck: list[Card]) -> Session:
    """Create a fresh Session for ``deck``.

    Every card gets a brand-new card state keyed by its id, the study queue
    lists the card ids in deck order, and history/streak start empty/zero.
    The deck is shallow-copied so later appends don't touch the caller's list.
    """
    fresh_states = {}
    order = []
    for card in deck:
        card_id = card["id"]
        fresh_states[card_id] = new_card_state(card_id)
        order.append(card_id)

    return Session(
        deck=[*deck],
        states=fresh_states,
        queue=order,
        history=[],
        streak=0,
    )


# Used by both the scheduler (_weak_biased_index) and recap(): a topic whose
# average grade falls below this value is treated as weak.
WEAK_TOPIC_THRESHOLD = 3.0   # avg grade below this = a topic the user is weak on
# Only the first WEAK_LOOKAHEAD entries of the queue are inspected when
# deciding whether to pull a weak-topic card forward.
WEAK_LOOKAHEAD = 4           # how far down the queue we'll reach to surface a weak card


def next_card(session: Session) -> Card | None:
    """
    Pick the card to study next, or None when the queue is empty.

    The scheduler peeks at the first few queued cards and prefers one whose
    topic currently has the lowest (weak) average grade; before any answers
    are recorded the queue is simply served in order.

    Whatever card is picked is moved to position 0 of the queue first, which
    keeps `apply_result`'s "pop the front" contract valid.
    """
    pending = session["queue"]
    if not pending:
        return None

    pick = _weak_biased_index(session)
    if pick > 0:
        # Rotate the weak-topic card to the head of the queue.
        chosen_id = pending.pop(pick)
        pending.insert(0, chosen_id)

    head_id = pending[0]
    return _find(session, head_id)


# ---- Grading ---------------------------------------------------------------

def grade_answer(card: Card, user_answer: str) -> GradeResult:
    """Grade ``user_answer`` against ``card``'s reference answer.

    Stub mode (``llm.STUB`` truthy): a word-overlap heuristic — two or more
    shared words score 5, exactly one scores 3, none scores 1.

    Model mode: the model is asked for a JSON object with ``score``,
    ``explanation`` and ``missed_concept``. If the reply has no usable score
    (see ``_valid_grade``) a neutral score of 2 is returned together with the
    reference answer so the student can self-check.

    A missing/None ``user_answer`` is treated as an empty answer in both modes.
    """
    answer_text = user_answer or ""

    if llm.STUB:
        # Trivial heuristic so the stub demo "feels" responsive.
        ans = answer_text.strip().lower()
        ref = card["answer"].strip().lower()
        overlap = len(set(ans.split()) & set(ref.split()))
        score = 5 if overlap >= 2 else (3 if overlap == 1 else 1)
        expl = ("Correct — you hit the key idea." if score >= 3
                else f"Not quite. Expected something like: {card['answer']}")
        return new_grade(score, expl, missed_concept=card["topic"])

    messages = [
        {"role": "system", "content":
         "You grade a student's answer against a reference answer. "
         "Return ONLY a JSON object with keys: "
         "score (integer 0-5), explanation (string for the student), "
         "missed_concept (short string naming what they got wrong, or \"\")."},
        {"role": "user", "content":
         f"Question: {card['question']}\nReference answer: {card['answer']}\n"
         f"Student answer: {answer_text}\nGrade it."},
    ]
    # Parser + one repair retry; safe default if the model never returns JSON.
    data = llm.chat_json(messages, max_tokens=256)
    if not _valid_grade(data):
        return new_grade(
            2,
            "Couldn't grade automatically — compare your answer to the "
            f"reference: {card['answer']}",
            card["topic"],
        )

    # A JSON null must not leak to the student as the literal text "None".
    raw_explanation = data.get("explanation")
    explanation = "" if raw_explanation is None else str(raw_explanation).strip()

    # Empty, null, or whitespace-only concept falls back to the card's topic.
    missed = str(data.get("missed_concept") or "").strip() or card["topic"]

    return new_grade(
        int(data["score"]),
        explanation or f"Reference answer: {card['answer']}",
        missed,
    )


def _valid_grade(data) -> bool:
    """A grade is usable only if it carries a numeric, in-range score."""
    if not isinstance(data, dict) or "score" not in data:
        return False
    try:
        return 0 <= int(data["score"]) <= 5
    except (TypeError, ValueError):
        return False


# ---- Adaptation: SM-2-lite -------------------------------------------------

def apply_result(session: Session, card: Card, grade: GradeResult,
                 user_answer: str = "") -> Session:
    """Fold one graded answer into the session (SM-2-lite) and return it.

    Updates the card's state (reps, last grade, ease, interval, lapses), takes
    the card off the front of the queue if it is there, re-queues it (at its
    new interval after a correct answer, at position 2 after a miss), adjusts
    the streak, and appends a history record.
    """
    card_id = card["id"]
    state = session["states"][card_id]
    state["reps"] += 1
    state["last_grade"] = grade["score"]

    # Drop the card from the head of the queue when it is the one being served.
    pending = session["queue"]
    if pending and pending[0] == card_id:
        del pending[0]

    if not grade["correct"]:
        # Miss: harder ease, shortest interval, streak broken, back soon.
        state["lapses"] += 1
        state["ease"] = max(1.3, state["ease"] - 0.2)
        state["interval"] = 1
        session["streak"] = 0
        reinsert_pos = 2
    else:
        # Hit: ease grows (capped at 3.0), interval stretches, back later.
        state["ease"] = min(3.0, state["ease"] + 0.1)
        state["interval"] = max(2, int(state["interval"] * state["ease"]))
        session["streak"] += 1
        reinsert_pos = state["interval"]
    _insert_at(session, card_id, reinsert_pos)

    record = {
        "card_id": card_id,
        "user_answer": user_answer,
        "grade": grade["score"],
        "topic": card["topic"],
    }
    session["history"].append(record)
    return session


def _followup_text(value) -> str:
    """Normalise one model-supplied field: None becomes "", anything else is str()+strip()."""
    return "" if value is None else str(value).strip()


def generate_followups(card: Card, grade: GradeResult, n: int = 2) -> list[Card]:
    """The money feature: new cards drilling exactly what was missed.

    Returns at most ``n`` follow-up cards derived from ``card``; each one
    inherits the source chunk, is one difficulty step easier (floor 1), and
    records ``card`` as its parent. A non-positive ``n`` yields ``[]``.

    Stub mode (``llm.STUB`` truthy) returns canned rephrasings (at most two).
    Model mode asks for a JSON array of question/answer/topic objects and keeps
    the first ``n`` items that pass ``validate_card``; malformed items are
    skipped rather than counted against ``n``.
    """
    if n <= 0:
        return []

    easier = max(1, card["difficulty"] - 1)

    if llm.STUB:
        # Two canned drills so the demo shows the design's "+2 new questions"
        # adaptive moment. The real path below returns up to `n`.
        prompts = [
            f"[follow-up] In your own words, what's the key idea behind: {card['question']}",
            f"[follow-up] Restate: {card['question']}",
        ]
        return [
            new_card(
                p,
                card["answer"],
                topic=card["topic"],
                source_chunk=card["source_chunk"],
                difficulty=easier,
                parent_id=card["id"],
            )
            for p in prompts[:n]
        ]

    messages = [
        {"role": "system", "content":
         "The student missed a concept. Generate follow-up quiz questions that "
         "drill it. Return ONLY a JSON array with keys: question, answer, topic."},
        {"role": "user", "content":
         f"Original question: {card['question']}\n"
         f"Missed concept: {grade['missed_concept']}\n"
         f"Source: {card['source_chunk']}\nGenerate {n} simpler follow-ups."},
    ]
    data = llm.extract_json(llm.chat(messages, max_tokens=400))
    out: list[Card] = []
    if not isinstance(data, list):
        return out

    for item in data:
        if len(out) >= n:
            break
        if not isinstance(item, dict):
            continue
        # JSON null fields must not turn into the literal string "None".
        c = new_card(
            _followup_text(item.get("question")),
            _followup_text(item.get("answer")),
            topic=_followup_text(item.get("topic")) or card["topic"],
            source_chunk=card["source_chunk"],
            difficulty=easier,
            parent_id=card["id"],
        )
        if validate_card(c):
            out.append(c)
    return out


def add_followups(session: Session, cards: list[Card]) -> Session:
    """Add generated follow-up cards to the session and return it.

    Each card is appended to the deck, given a fresh card state, and queued
    at position 1 so it shows up in the near term.
    """
    deck = session["deck"]
    states = session["states"]
    for followup in cards:
        followup_id = followup["id"]
        deck.append(followup)
        states[followup_id] = new_card_state(followup_id)
        _insert_at(session, followup_id, 1)
    return session


def grade_and_adapt(session: Session, user_answer: str) -> tuple[GradeResult | None, list[Card]]:
    """Run one complete study step and return ``(grade, injected_cards)``.

    Sequence: take the next card, grade the answer, apply the result to the
    session, and — only when the answer was not correct — generate follow-up
    cards and enqueue them. ``grade`` is None only when the queue is empty;
    ``injected_cards`` is empty unless follow-ups were actually added.

    This is the single canonical implementation of the
    next_card → grade → apply → follow-up loop, intended to be shared by the
    frontends so they cannot drift apart.
    """
    current = next_card(session)
    if current is None:
        return None, []

    answer_text = user_answer or ""
    result = grade_answer(current, answer_text)
    apply_result(session, current, result, user_answer=answer_text)

    if result["correct"]:
        return result, []

    drills = generate_followups(current, result)
    if not drills:
        return result, []

    add_followups(session, drills)
    return result, drills


def replace_card(session: Session, old_id: str, new: Card) -> Session:
    """Substitute ``new`` for the card with id ``old_id`` (difficulty toggle, NAH-32).

    The deck entry is swapped at its existing position, the old card's state
    is discarded in favour of a fresh state for the new card (it is treated
    as a new question), and every occurrence of ``old_id`` in the queue is
    rewritten so the "pop the front" contract keeps working.
    """
    swapped_deck = []
    for existing in session["deck"]:
        swapped_deck.append(new if existing["id"] == old_id else existing)
    session["deck"] = swapped_deck

    states = session["states"]
    states.pop(old_id, None)
    new_id = new["id"]
    states[new_id] = new_card_state(new_id)

    rewritten_queue = []
    for queued_id in session["queue"]:
        rewritten_queue.append(new["id"] if queued_id == old_id else queued_id)
    session["queue"] = rewritten_queue
    return session


# ---- Recap -----------------------------------------------------------------

def recap(session: Session) -> dict:
    """Summarise the session so far.

    Returns a dict with:
      * ``mastered``    — topics whose average grade is >= WEAK_TOPIC_THRESHOLD
      * ``weak_topics`` — topics whose average grade is below it
      * ``reflection``  — a one-sentence summary (canned in stub mode,
                          otherwise produced by ``llm.chat``)
      * ``streak``      — the session's current streak
      * ``answered``    — number of history entries
    """
    # Reuse the scheduler's own per-topic averages (and the same threshold) so
    # a topic the recap calls "weak" is exactly one next_card brings back sooner.
    averages = _topic_averages(session)
    mastered = [t for t, avg in averages.items() if avg >= WEAK_TOPIC_THRESHOLD]
    weak = [t for t, avg in averages.items() if avg < WEAK_TOPIC_THRESHOLD]

    if llm.STUB:
        reflection = ("Solid start. You're strong on "
                      f"{', '.join(mastered) or 'nothing yet'}; "
                      f"{', '.join(weak) or 'no weak spots'} could use another pass.")
    else:
        msg = [
            {"role": "system", "content":
             "Write one encouraging sentence reflecting on a study session."},
            {"role": "user", "content":
             f"Mastered: {mastered}. Weak: {weak}. Streak: {session['streak']}."},
        ]
        reflection = llm.chat(msg, max_tokens=80)

    return {
        "mastered": mastered,
        "weak_topics": weak,
        "reflection": reflection,
        "streak": session["streak"],
        "answered": len(session["history"]),
    }


# ---- helpers ---------------------------------------------------------------

def _find(session: Session, card_id: str) -> Card | None:
    return next((c for c in session["deck"] if c["id"] == card_id), None)


def _topic_averages(session: Session) -> dict[str, float]:
    """Map each topic seen in history to its mean grade (empty dict before any answer)."""
    by_topic: dict[str, list[int]] = {}
    for entry in session["history"]:
        topic = entry["topic"]
        if topic not in by_topic:
            by_topic[topic] = []
        by_topic[topic].append(entry["grade"])

    averages = {}
    for topic, topic_grades in by_topic.items():
        averages[topic] = _avg(topic_grades)
    return averages


def _weak_biased_index(session: Session) -> int:
    """
    Choose which queue position to serve next.

    Only the first WEAK_LOOKAHEAD queue entries are examined. Among those, the
    card whose topic has the lowest average grade wins, provided that average
    is below WEAK_TOPIC_THRESHOLD; ties go to the earliest position. The
    result is 0 (normal order) when there is no history yet or no card within
    reach belongs to a weak topic.
    """
    pending = session["queue"]
    topic_scores = _topic_averages(session)
    if len(topic_scores) == 0:
        return 0

    chosen_pos = 0
    lowest = None
    for pos, queued_id in enumerate(pending[:WEAK_LOOKAHEAD]):
        queued_card = _find(session, queued_id)
        if queued_card is None:
            continue  # queue entry with no matching deck card
        topic_score = topic_scores.get(queued_card["topic"])
        is_weak = not (topic_score is None or topic_score >= WEAK_TOPIC_THRESHOLD)
        if is_weak and (lowest is None or topic_score < lowest):
            chosen_pos = pos
            lowest = topic_score
    return chosen_pos


def _insert_at(session: Session, card_id: str, pos: int) -> None:
    pos = max(0, min(pos, len(session["queue"])))
    session["queue"].insert(pos, card_id)


def _avg(xs: list[int]) -> float:
    return sum(xs) / len(xs) if xs else 0.0