Spaces:

build-small-hackathon
/

study-partner

Running on Zero

App Files Files Community

study-partner / learning_engine.py

nz-nz

Sync from GitHub via hub-sync

6505c2d verified 9 days ago

Raw

History Blame Contribute Delete

16.5 kB

	"""
	Recall — Module B: Learning Engine. OWNER: Nikolai

	The brain: scheduling (SM-2-lite), grading, adaptation, follow-up generation,
	and the recap. Runs in STUB mode out of the box. Public signatures are fixed —
	app.py depends on them.
	"""
	from __future__ import annotations

	import re

	import llm
	from schema import (
	Card, GradeResult, Session, new_card, new_card_state, new_grade, validate_card,
	)

	# STUB is owned by llm (single source of truth) and read dynamically as
	# `llm.STUB` so every module agrees and runtime/reload changes are honored.


	# ---- Session lifecycle -----------------------------------------------------

	def init_session(deck: list[Card]) -> Session:
	    """Build a fresh study session over ``deck``.

	    Every card gets a new CardState keyed by its id, the queue lists the card
	    ids in deck order, and the history and streak start empty / at zero.
	    """
	    fresh_states = {}
	    for card in deck:
	        card_id = card["id"]
	        fresh_states[card_id] = new_card_state(card_id)

	    study_order = [card["id"] for card in deck]
	    return Session(
	        deck=list(deck),  # a new list, so the session does not alias the caller's
	        states=fresh_states,
	        queue=study_order,
	        history=[],
	        streak=0,
	    )


	# Average-grade cutoff: a topic whose average grade so far is below this counts
	# as one the user is weak on.
	WEAK_TOPIC_THRESHOLD = 3.0
	# How far down the queue we'll reach to surface a weak-topic card.
	WEAK_LOOKAHEAD = 4
	# Correct answers needed before a card leaves the queue.
	GRADUATE_AT_CORRECT = 2


	def next_card(session: Session) -> Card | None:
	    """
	    Return the next card to study, or None when the queue is empty.

	    Among the next few due cards we bias toward the user's weakest topic
	    (lowest average grade so far) — so once the model sees you're shaky on a
	    topic, that topic comes back sooner. With no history yet this is a no-op
	    and we serve the queue in order.

	    The chosen card is rotated to the front of the queue so `apply_result`'s
	    "pop the front" contract still holds. None is also returned if the id at
	    the front of the queue has no matching card in the deck.
	    """
	    queue = session["queue"]
	    if not queue:
	        return None

	    idx = _weak_biased_index(session)
	    if idx > 0:
	        # Bring the weak-topic card to the front; everything it jumps over
	        # keeps its relative order.
	        queue.insert(0, queue.pop(idx))
	    return _find(session, queue[0])


	# ---- Grading ---------------------------------------------------------------

	# Explicit non-answers ("idk", "don't know", "no idea", …), normalized to bare
	# words. An answer that IS one of these — or is empty once normalized ("?", "...")
	# — is a miss we score ourselves: the small grader otherwise ignores such input
	# and "grades" the reference answer instead, hallucinating a 4/5 "correct".
	_NON_ANSWER_PHRASES = {
	"idk", "dk", "dunno", "i dunno",
	"i dont know", "dont know", "don know", "i don know",
	"do not know", "i do not know", "no idea", "no clue",
	"not sure", "im not sure", "i am not sure",
	"no answer", "nothing", "none", "no", "na", "n a", "skip", "pass",
	}


	def _is_non_answer(user_answer: str) -> bool:
	"""True if the input carries no real attempt — blank, punctuation-only, or a
	stock 'I don't know' phrase (apostrophes/typos tolerated)."""
	norm = re.sub(r"[^a-z0-9 ]", "", (user_answer or "").lower().replace("'", ""))
	norm = re.sub(r"\s+", " ", norm).strip()
	return not norm or norm in _NON_ANSWER_PHRASES


	def grade_answer(card: Card, user_answer: str) -> GradeResult:
	    """Grade ``user_answer`` against ``card`` and return a GradeResult.

	    Three paths, tried in order:

	    1. A non-answer (blank, "?", "idk", "I don't know", …) is unambiguously a
	       miss — it is scored 0 here, before any model call. (The grader otherwise
	       ignores such input and "grades" the reference answer itself,
	       hallucinating a 4/5 "correct".) Also saves a call.
	    2. In stub mode (``llm.STUB``) a word-overlap heuristic stands in for the
	       model.
	    3. Otherwise the model is asked for a JSON grade. A reply without a usable
	       score falls back to a score of 2 that points at the reference answer.
	       Missing, null or blank ``explanation`` / ``missed_concept`` fields fall
	       back to the reference answer / the card's topic respectively.
	    """
	    if _is_non_answer(user_answer):
	        return new_grade(
	            0,
	            f"No worries — that's an honest \"I don't know.\" Take a look at the "
	            f"reference answer and give it another go: {card['answer']}",
	            card["topic"],
	        )

	    if llm.STUB:
	        # Trivial heuristic so the stub demo "feels" responsive.
	        ans = (user_answer or "").strip().lower()
	        ref = card["answer"].strip().lower()
	        overlap = len(set(ans.split()) & set(ref.split()))
	        score = 5 if overlap >= 2 else (3 if overlap == 1 else 1)
	        expl = ("Correct — you hit the key idea." if score >= 3
	                else f"Not quite. Expected something like: {card['answer']}")
	        return new_grade(score, expl, missed_concept=card["topic"])

	    # Tone instruction is kept short and the strict "ONLY JSON" requirement is
	    # reasserted as the LAST thing the model reads (in the user turn) — a verbose
	    # "write warmly" preamble was nudging this small model into prose that didn't
	    # parse, and recency improves format compliance.
	    messages = [
	        {"role": "system", "content":
	            "You grade a student's answer against a reference answer.\n"
	            "Scoring (be strict): 0-1 = wrong / names the wrong thing; 2 = partially "
	            "relevant but misses the key idea; 3 = mostly correct, minor gap; "
	            "4-5 = correct and complete. A factually wrong answer is 0-2, never 3.\n"
	            "Write the feedback warmly, speaking to the learner as \"you\" (never "
	            "\"the student\"): note what's right, then what's missing.\n"
	            "JSON keys: score (0-5 int), explanation (spoken to \"you\"), "
	            "missed_concept (what was wrong, or \"\").\n"
	            "Example: {\"score\": 1, \"explanation\": \"Good instinct, but that's the "
	            "wrong spot — the Calvin cycle runs in the stroma. Tie it to where the "
	            "enzymes sit.\", \"missed_concept\": \"the specific location\"}"},
	        {"role": "user", "content":
	            f"Question: {card['question']}\nReference answer: {card['answer']}\n"
	            f"Student answer: {user_answer}\n\n"
	            "Grade it. Reply with ONLY the JSON object — no prose, no markdown fences."},
	    ]
	    # Parser + one repair retry; safe default if the model never returns JSON.
	    # A generous 2048-token budget so even a long reasoning preamble can't push
	    # the JSON object past the cutoff — truncated grades were the main
	    # parse-failure source, and the grade JSON itself is tiny so the headroom is
	    # nearly free (generation stops at the closing brace, not the limit).
	    data = llm.chat_json(messages, max_tokens=2048)
	    if not _valid_grade(data):
	        return new_grade(
	            2,
	            "Couldn't grade automatically — compare your answer to the "
	            f"reference: {card['answer']}",
	            card["topic"],
	        )

	    # `or ""` before str(): a JSON null must become an empty string, not the
	    # literal text "None" shown to the learner.
	    explanation = _to_second_person(str(data.get("explanation") or "").strip())
	    # Strip BEFORE falling back, so a whitespace-only concept also yields the topic.
	    missed = str(data.get("missed_concept") or "").strip() or card["topic"]
	    return new_grade(
	        int(data["score"]),
	        explanation or f"Reference answer: {card['answer']}",
	        _to_second_person(str(missed).strip()),
	    )


	# This small model still slips into the third person ("The student's answer…")
	# perhaps half the time despite the prompt. These swaps are the grammatically
	# SAFE ones — possessives only — so we never mangle subject-verb agreement (we
	# leave "The student identifies…" alone rather than produce "You identifies…").
	_SECOND_PERSON_SUBS = [
	(re.compile(r"\bthe student'?s answer\b", re.I), "your answer"),
	(re.compile(r"\bthe student'?s response\b", re.I), "your answer"),
	(re.compile(r"\bthe student'?s\b", re.I), "your"),
	]


	def _to_second_person(text: str) -> str:
	"""Rewrite clinical third-person possessives to warm second person, matching
	the original capitalization ('The student's answer' -> 'Your answer')."""
	for pat, repl in _SECOND_PERSON_SUBS:
	text = pat.sub(
	lambda m, r=repl: r.capitalize() if m.group(0)[:1].isupper() else r,
	text,
	)
	return text


	def _valid_grade(data) -> bool:
	"""A grade is usable only if it carries a numeric, in-range score."""
	if not isinstance(data, dict) or "score" not in data:
	return False
	try:
	return 0 <= int(data["score"]) <= 5
	except (TypeError, ValueError):
	return False


	# ---- Adaptation: SM-2-lite -------------------------------------------------

	def apply_result(session: Session, card: Card, grade: GradeResult,
	                 user_answer: str = "") -> Session:
	    """Fold one graded answer into the session (SM-2-lite) and return it.

	    Updates the card's state (reps, last grade, ease, interval, lapses), takes
	    the card off the front of the queue if it is there, re-enqueues it unless
	    it has graduated, maintains the streak, and appends a history entry.
	    The session is mutated in place.
	    """
	    card_id = card["id"]
	    state = session["states"][card_id]
	    state["reps"] += 1
	    state["last_grade"] = grade["score"]

	    # Only the front entry is ever removed, and only when it is this card.
	    queue = session["queue"]
	    if queue and queue[0] == card_id:
	        queue.pop(0)

	    if not grade["correct"]:
	        # A miss: count the lapse, make the card harder, reset the streak and
	        # bring the card back soon.
	        state["lapses"] += 1
	        state["ease"] = max(1.3, state["ease"] - 0.2)
	        state["interval"] = 1
	        session["streak"] = 0
	        _insert_at(session, card_id, 2)
	    else:
	        state["ease"] = min(3.0, state["ease"] + 0.1)
	        state["interval"] = max(2, int(state["interval"] * state["ease"]))
	        session["streak"] += 1
	        # reps counts every answer and lapses counts the misses, so the
	        # difference is the number of correct answers. Once that reaches
	        # GRADUATE_AT_CORRECT the card is not re-enqueued — it leaves the queue
	        # for good, which is what lets the queue drain and the session end.
	        correct_count = state["reps"] - state["lapses"]
	        if correct_count < GRADUATE_AT_CORRECT:
	            _insert_at(session, card_id, state["interval"])  # comes back later

	    entry = {
	        "card_id": card_id,
	        "user_answer": user_answer,
	        "grade": grade["score"],
	        "topic": card["topic"],
	    }
	    session["history"].append(entry)
	    return session


	def generate_followups(card: Card, grade: GradeResult, n: int = 2) -> list[Card]:
	    """The money feature: new cards drilling exactly what was missed.

	    Returns up to ``n`` follow-up cards derived from ``card``. Each one keeps
	    the parent's source chunk, points back at it via ``parent_id`` and is one
	    difficulty step easier (never below 1). ``n <= 0`` returns an empty list
	    without calling the model.

	    In stub mode (``llm.STUB``) the cards are canned rephrasings of the
	    original question. Otherwise the model is asked for a JSON array; entries
	    that are not objects, or that fail ``validate_card``, are skipped and do
	    not count toward ``n``.
	    """
	    if n <= 0:
	        return []

	    if llm.STUB:
	        # Two canned drills so the demo shows the design's "+2 new questions"
	        # adaptive moment. The real path below returns up to `n`.
	        prompts = [
	            f"[follow-up] In your own words, what's the key idea behind: {card['question']}",
	            f"[follow-up] Restate: {card['question']}",
	        ]
	        return [
	            new_card(
	                p,
	                card["answer"],
	                topic=card["topic"],
	                source_chunk=card["source_chunk"],
	                difficulty=max(1, card["difficulty"] - 1),
	                parent_id=card["id"],
	            )
	            for p in prompts[:n]
	        ]

	    messages = [
	        {"role": "system", "content":
	            "The student missed a concept. Generate follow-up quiz questions that "
	            "drill it. Return ONLY a JSON array of OBJECTS with keys: question, answer, "
	            "topic. Example (return ONE array exactly like this, no other text):\n"
	            '[{"question": "What is X?", "answer": "X is Y.", "topic": "Topic A"}]'},
	        {"role": "user", "content":
	            f"Original question: {card['question']}\n"
	            f"Missed concept: {grade['missed_concept']}\n"
	            f"Source: {card['source_chunk']}\nGenerate {n} simpler follow-ups."},
	    ]
	    data = llm.extract_json(llm.chat(messages, max_tokens=400))
	    if not isinstance(data, list):
	        return []

	    out: list[Card] = []
	    for item in data:
	        if len(out) >= n:
	            break
	        if not isinstance(item, dict):
	            continue
	        # `or ""` before str(): a JSON null must become an empty field, not the
	        # literal text "None". (Presumably validate_card then rejects the empty
	        # question/answer — confirm against schema.)
	        c = new_card(
	            str(item.get("question") or "").strip(),
	            str(item.get("answer") or "").strip(),
	            topic=str(item.get("topic") or "").strip() or card["topic"],
	            source_chunk=card["source_chunk"],
	            difficulty=max(1, card["difficulty"] - 1),
	            parent_id=card["id"],
	        )
	        if validate_card(c):
	            out.append(c)
	    return out


	def add_followups(session: Session, cards: list[Card]) -> Session:
	    """Register generated follow-ups into the deck + queue (near-term).

	    Each card is appended to the deck, given a fresh CardState, and inserted at
	    queue position 1. Because every insert targets the same position, a later
	    card in ``cards`` ends up ahead of an earlier one. Returns the same
	    (mutated) session.
	    """
	    for followup in cards:
	        session["deck"].append(followup)
	        followup_id = followup["id"]
	        session["states"][followup_id] = new_card_state(followup_id)
	        _insert_at(session, followup_id, 1)
	    return session


	def grade_and_adapt(session: Session, user_answer: str) -> tuple[GradeResult | None, list[Card]]:
	    """One full study step: grade the current card, apply the result, and on a
	    miss generate + enqueue follow-ups.

	    Returns ``(grade, injected_cards)``. ``grade`` is None — and nothing is
	    changed — when ``next_card`` has no card to serve (e.g. the queue is
	    empty). ``injected_cards`` is the list of follow-ups added to the session,
	    empty on a correct answer or when none could be generated. A ``None``
	    answer is treated as an empty string.

	    This is the canonical study-loop sequence. Both the Gradio app and the JSON
	    server call it instead of re-implementing the next_card → grade → apply →
	    follow-up dance, so the loop can never drift between the two frontends.
	    """
	    card = next_card(session)
	    if card is None:
	        return None, []

	    answer = user_answer or ""
	    grade = grade_answer(card, answer)
	    apply_result(session, card, grade, user_answer=answer)

	    injected: list[Card] = []
	    if not grade["correct"]:
	        fups = generate_followups(card, grade)
	        if fups:
	            add_followups(session, fups)
	            injected = fups
	    return grade, injected


	def replace_card(session: Session, old_id: str, new: Card) -> Session:
	    """Swap a card in place (used by the difficulty toggle, NAH-32).

	    The deck entry with id ``old_id`` becomes ``new``, the old CardState is
	    dropped and ``new`` gets a fresh one (it's effectively a new question), and
	    every occurrence of ``old_id`` in the queue is rewritten to the new id so
	    the queue's "pop the front" contract still holds. Returns the same
	    (mutated) session.
	    """
	    swapped_deck = []
	    for existing in session["deck"]:
	        swapped_deck.append(new if existing["id"] == old_id else existing)
	    session["deck"] = swapped_deck

	    states = session["states"]
	    states.pop(old_id, None)  # tolerate an id that has no state
	    new_id = new["id"]
	    states[new_id] = new_card_state(new_id)

	    rewritten_queue = []
	    for queued_id in session["queue"]:
	        rewritten_queue.append(new["id"] if queued_id == old_id else queued_id)
	    session["queue"] = rewritten_queue
	    return session


	# ---- Recap -----------------------------------------------------------------

	def recap(session: Session) -> dict:
	    """Summarize the session: mastered vs. weak topics plus a short reflection.

	    Topics are split by their average grade across the history using
	    WEAK_TOPIC_THRESHOLD — the same threshold the scheduler uses to decide
	    what to resurface, so a topic the recap calls "weak" is exactly one
	    next_card brings back sooner. The reflection is a canned sentence in stub
	    mode (``llm.STUB``) and otherwise comes from ``llm.chat``.

	    Returns a dict with keys ``mastered``, ``weak_topics``, ``reflection``,
	    ``streak`` and ``answered`` (the number of history entries).
	    """
	    mastered: list[str] = []
	    weak: list[str] = []
	    for topic, average in _topic_averages(session).items():
	        bucket = weak if average < WEAK_TOPIC_THRESHOLD else mastered
	        bucket.append(topic)

	    if not llm.STUB:
	        msg = [
	            {"role": "system", "content":
	                "Write one encouraging sentence reflecting on a study session."},
	            {"role": "user", "content":
	                f"Mastered: {mastered}. Weak: {weak}. Streak: {session['streak']}."},
	        ]
	        reflection = llm.chat(msg, max_tokens=80)
	    else:
	        strong_text = ", ".join(mastered) or "nothing yet"
	        weak_text = ", ".join(weak) or "no weak spots"
	        reflection = (f"Solid start. You're strong on {strong_text}; "
	                      f"{weak_text} could use another pass.")

	    summary = {
	        "mastered": mastered,
	        "weak_topics": weak,
	        "reflection": reflection,
	        "streak": session["streak"],
	        "answered": len(session["history"]),
	    }
	    return summary


	# ---- helpers ---------------------------------------------------------------

	def _find(session: Session, card_id: str) -> Card \| None:
	return next((c for c in session["deck"] if c["id"] == card_id), None)


	def _topic_averages(session: Session) -> dict[str, float]:
	    """Map each topic seen in the history to its average grade.

	    The result is empty until the first answer has been recorded.
	    """
	    grades_by_topic: dict[str, list[int]] = {}
	    for entry in session["history"]:
	        bucket = grades_by_topic.setdefault(entry["topic"], [])
	        bucket.append(entry["grade"])

	    averages: dict[str, float] = {}
	    for topic, topic_grades in grades_by_topic.items():
	        averages[topic] = _avg(topic_grades)
	    return averages


	def _weak_biased_index(session: Session) -> int:
	    """
	    Pick the queue index of the card to serve next.

	    Only the first WEAK_LOOKAHEAD queue entries are considered. Among those
	    whose topic has an average grade below WEAK_TOPIC_THRESHOLD, the one with
	    the lowest average wins; on a tie the entry nearest the front is kept.
	    Returns 0 (keep normal order) when there is no history yet or nothing in
	    reach is weak. Queue ids with no matching deck card, and topics with no
	    recorded grades, are skipped.
	    """
	    queue = session["queue"]
	    averages = _topic_averages(session)
	    if not averages:
	        return 0

	    weak_in_reach = []  # (average, queue position) for every weak candidate
	    for position, card_id in enumerate(queue[:WEAK_LOOKAHEAD]):
	        card = _find(session, card_id)
	        if card is None:
	            continue
	        topic_avg = averages.get(card["topic"])
	        if topic_avg is not None and topic_avg < WEAK_TOPIC_THRESHOLD:
	            weak_in_reach.append((topic_avg, position))

	    if not weak_in_reach:
	        return 0
	    # Tuples order by average first, then position, so ties go to the front-most.
	    return min(weak_in_reach)[1]


	def _insert_at(session: Session, card_id: str, pos: int) -> None:
	pos = max(0, min(pos, len(session["queue"])))
	session["queue"].insert(pos, card_id)


	def _avg(xs: list[int]) -> float:
	return sum(xs) / len(xs) if xs else 0.0