"""
Recall — custom frontend server (NAH-36).
Serves the polished `Recall.dc.html` design (frontend/index.html) and exposes a
thin JSON API over the EXISTING backend. The learning/content logic and the
`schema.py` data contract are treated as an API and are NOT modified here.
The Session dict lives server-side, keyed by a short id the client carries
around (mirrors the single-session gr.State model the Gradio app uses) — so the
reference answers in the deck never leave the server.
Run it (stub mode is on by default):

    pip install -r requirements.txt
    python server.py            # http://127.0.0.1:7860

Flip RECALL_STUB=0 once the real model is wired:

    RECALL_STUB=0 python server.py
The legacy Gradio UI is still mounted at /gradio.
"""
from __future__ import annotations
import os
import tempfile
import threading
import time
import uuid
from collections import OrderedDict
from pathlib import Path
import gradio as gr
from fastapi import File, Form, UploadFile
from fastapi.responses import FileResponse, JSONResponse, Response
from pydantic import BaseModel
from starlette.concurrency import run_in_threadpool
import content_pipeline as cp
import learning_engine as le
# Directory holding the static custom frontend; `index.html` is served from it.
FRONTEND = Path(__file__).parent.joinpath("frontend")

# Single-process session store. It is bounded so a public Space can't be OOM'd
# by a client looping /api/generate: once more than MAX_SESSIONS are held the
# least-recently-used ones are evicted, and any session idle for longer than
# SESSION_TTL_SECONDS expires.
MAX_SESSIONS = int(os.environ.get("RECALL_MAX_SESSIONS", "500"))
SESSION_TTL_SECONDS = int(os.environ.get("RECALL_SESSION_TTL", "7200"))  # 2h

# Caps on input size so a single request can't exhaust memory/disk.
MAX_UPLOAD_BYTES = int(os.environ.get("RECALL_MAX_UPLOAD_MB", "10")) * 1024 ** 2
MAX_TEXT_CHARS = int(os.environ.get("RECALL_MAX_TEXT_CHARS", "200000"))  # ~50k tokens

# sid -> (session, last_access_epoch). An OrderedDict keeps the entries in
# access order, which is what makes least-recently-used eviction O(1).
SESSIONS: "OrderedDict[str, tuple[dict, float]]" = OrderedDict()
def _purge_expired(now: float | None = None) -> None:
    """Drop every stored session that has been idle longer than the TTL.

    Args:
        now: Reference epoch time to measure idleness against; defaults to the
            current wall-clock time.
    """
    if now is None:
        now = time.time()
    # Collect first, delete afterwards: the dict must not change size while it
    # is being iterated.
    expired_ids = [
        session_id
        for session_id, entry in SESSIONS.items()
        if now - entry[1] > SESSION_TTL_SECONDS
    ]
    for session_id in expired_ids:
        SESSIONS.pop(session_id, None)
def get_session(sid: str) -> dict | None:
    """Look up a live session and mark it as the most recently used.

    Expired sessions are purged first, so an unknown or timed-out ``sid`` yields
    ``None`` (callers turn that into the friendly 'session expired' reply).
    """
    _purge_expired()
    try:
        session, _last_seen = SESSIONS[sid]
    except KeyError:
        return None
    # Refresh the timestamp and move the entry to the "newest" end of the LRU.
    SESSIONS[sid] = (session, time.time())
    SESSIONS.move_to_end(sid)
    return session
def put_session(sid: str, session: dict) -> None:
    """Insert or refresh ``session`` under ``sid``, then enforce the size cap.

    Expired entries are purged first; afterwards the least-recently-used
    entries are dropped until at most MAX_SESSIONS remain.
    """
    _purge_expired()
    SESSIONS[sid] = (session, time.time())
    SESSIONS.move_to_end(sid)
    surplus = len(SESSIONS) - MAX_SESSIONS
    for _ in range(surplus):
        # last=False pops from the oldest (least recently used) end.
        SESSIONS.popitem(last=False)
# The photosynthesis notes the design's "sample" affordances load. Lets the
# Upload screen's sample chip work even with no real PDF on disk. Written one
# sentence per item; the items are joined with single spaces.
SAMPLE_NOTES = " ".join((
    "Photosynthesis happens in the chloroplast.",
    "The light-dependent reactions occur in the thylakoid membranes, where "
    "water is split, ATP and NADPH are produced, and oxygen is released.",
    "The Calvin cycle takes place in the stroma, where the enzyme RuBisCO "
    "fixes CO2 onto RuBP.",
    "Cellular respiration occurs in the mitochondria; most ATP is made during "
    "oxidative phosphorylation, as the electron transport chain pumps protons "
    "and oxygen acts as the final electron acceptor, forming water.",
))
# The "scanned-slides.pdf" sample renders a slide to an IMAGE (no text layer) so
# the chip demonstrates the real image-PDF OCR path end-to-end: the vision model
# reads the rendered page and writes a deck from it (image PDFs are now supported,
# so the old "image-only = error" copy no longer applies).
#
# One entry per rendered line, top to bottom. The first entry is drawn as the
# slide title in the larger font; an empty entry just leaves a blank line.
SCAN_SAMPLE_LINES = (
    "Photosynthesis - Lecture 3",
    "",
    "Occurs in the chloroplast.",
    "Light-dependent reactions happen in the thylakoid",
    "membranes: water is split, ATP and NADPH are made,",
    "and oxygen is released.",
    "The Calvin cycle runs in the stroma, where RuBisCO",
    "fixes CO2 onto RuBP to produce glucose.",
)
def _render_scan_sample():
    """Rasterize SCAN_SAMPLE_LINES onto a white 1024x720 RGB image.

    The result stands in for a scanned slide: the text exists only as pixels
    (no text layer), so one click on the sample chip exercises the vision/OCR
    path. Returns the PIL image.
    """
    from PIL import Image, ImageDraw, ImageFont

    font_candidates = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "DejaVuSans.ttf",
    )

    def _load_font(px):
        # Prefer a real TrueType face; try each known location in turn.
        for candidate in font_candidates:
            try:
                face = ImageFont.truetype(candidate, px)
            except Exception:  # noqa: BLE001 — no TTF on this host; use the bitmap default
                continue
            return face
        # Pillow >=10.1 lets load_default scale; without it the 5px bitmap font
        # is too small for the model to OCR reliably.
        try:
            return ImageFont.load_default(size=px)
        except TypeError:
            return ImageFont.load_default()

    canvas = Image.new("RGB", (1024, 720), "white")
    pen = ImageDraw.Draw(canvas)
    heading_font = _load_font(40)
    text_font = _load_font(30)
    ink = (15, 15, 15)

    top = 70
    for index, text in enumerate(SCAN_SAMPLE_LINES):
        is_title = index == 0
        pen.text(
            (70, top),
            text,
            fill=ink,
            font=heading_font if is_title else text_font,
        )
        # The title gets a little more room beneath it than body lines do.
        top += 64 if is_title else 52
    return canvas
_SCAN_SAMPLE_PNG: bytes | None = None


def _scan_sample_png() -> bytes:
    """Return the synthetic scanned slide encoded as PNG bytes.

    The slide is rendered and encoded on the first call only; the bytes are
    then kept in the module-level ``_SCAN_SAMPLE_PNG`` cache. Because the render
    is deterministic, the preview thumbnail is byte-identical to the image the
    vision/OCR path consumes, so the preview can't drift from what's OCR'd.
    """
    global _SCAN_SAMPLE_PNG
    if _SCAN_SAMPLE_PNG is not None:
        return _SCAN_SAMPLE_PNG

    from io import BytesIO

    stream = BytesIO()
    _render_scan_sample().save(stream, format="PNG")
    _SCAN_SAMPLE_PNG = stream.getvalue()
    return _SCAN_SAMPLE_PNG
# `gradio.Server` extends FastAPI: we keep all our own routes (custom frontend +
# JSON API) AND get gradio's launcher, which owns port 7860 on a Hugging Face
# gradio-SDK Space and wires up the queue + ZeroGPU GPU allocation. A plain
# FastAPI + manual `uvicorn.run(7860)` collides with the Space's own gradio
# server ("address already in use") — `gradio.Server` is the supported way to run
# a custom frontend on a gradio Space.
#
# Every route in this module is registered on this single `app` instance.
app = gr.Server(title="Recall")
# A gradio.Server MUST register at least one `@app.api(...)` endpoint: launch()
# builds its internal Blocks from these, and on a Space an *empty* Blocks doesn't
# stay "running" (launch returns and the process exits → RUNTIME_ERROR). This is
# also the gradio-native handle (`gradio_client` / the JS Client) onto the same
# backend the custom frontend drives over plain JSON `/api/*` routes below.
@app.api(name="health")
def health() -> str:
    """Liveness probe + the Server's required gradio endpoint.

    Returns:
        The constant string ``"ok"``.
    """
    return "ok"
# ---- serialization ---------------------------------------------------------
def _card_out(card: dict | None) -> dict | None:
"""The client never needs (or should see) the reference answer or the raw
source chunk — strip the card down to what the UI renders."""
if not card:
return None
return {
"id": card["id"],
"question": card["question"],
"topic": card["topic"],
"difficulty": card["difficulty"],
"parent_id": card.get("parent_id"),
}
def _view(session: dict) -> dict:
    """Build the display state behind the header, mastery bars and deck rail.

    Reads the session's ``deck``, ``history``, ``queue`` and ``streak`` and
    returns a plain dict with ``total``, ``answered``, ``posDisplay``,
    ``streak``, ``topicStats`` and ``rail``.
    """
    cards = session["deck"]
    attempts = session["history"]
    deck_size = len(cards)

    # Cards that have graduated out of the queue (mastered this session).
    # Progress tracks this rather than the number of answers so the bar climbs
    # as cards are mastered and reaches the total exactly when the queue drains,
    # instead of pinning at total/total once every card has been seen once.
    graduated = deck_size - len(session["queue"])

    # Per-topic tally; a grade of 3 or above counts as correct.
    per_topic: dict[str, dict] = {}
    for attempt in attempts:
        tally = per_topic.setdefault(attempt["topic"], {"correct": 0, "total": 0})
        tally["total"] += 1
        if attempt["grade"] >= 3:
            tally["correct"] += 1

    if deck_size:
        position = min(graduated + 1, deck_size)
    else:
        position = 0

    rail = []
    for entry in cards:
        rail.append({
            "id": entry["id"],
            "topic": entry["topic"],
            "injected": bool(entry.get("parent_id")),
        })

    return {
        "total": deck_size,
        "answered": len(attempts),
        "posDisplay": position,
        "streak": session["streak"],
        "topicStats": per_topic,
        "rail": rail,
    }
# ---- request models --------------------------------------------------------
class SidBody(BaseModel):
    """Request body for the endpoints that only need to identify a session
    (/api/next, /api/recap, /api/restart)."""

    sid: str  # session id handed out by /api/generate
class GradeBody(BaseModel):
    """Request body for /api/grade: the session plus the learner's answer."""

    sid: str  # session id handed out by /api/generate
    answer: str = ""  # free-text answer; optional, defaults to the empty string
class RegenBody(BaseModel):
    """Request body for /api/regenerate: which session's current card to
    rewrite, and in which direction."""

    sid: str  # session id handed out by /api/generate
    # Not validated here: the value is passed to the regenerator as-is, and the
    # response labels anything other than "harder" as "easier".
    direction: str # "harder" | "easier"
# ---- API -------------------------------------------------------------------
@app.post("/api/generate")
async def api_generate(
    text: str = Form(""),
    sample: str = Form(""),
    file: UploadFile | None = File(None),
):
    """Build a deck from pasted text, a built-in sample or an upload, then open
    a new server-side session for it.

    Inputs are tried in priority order: non-blank ``text`` (truncated to
    MAX_TEXT_CHARS), then ``sample`` ("bio" loads the canned notes, "scan" feeds
    the rendered slide to the vision path), then ``file``.

    Returns ``{"sid", "card", "view"}`` on success. Failures are reported as a
    JSON ``{"error": ...}`` body with status 400 (not enough input), 413 (upload
    larger than MAX_UPLOAD_BYTES) or 422 (text extraction or deck generation
    failed, or produced no cards).
    """
    need_more = ("I need a little more to work with — paste a paragraph of "
                 "notes or pick a PDF, and I'll build your deck.")

    def deck_failure(exc: Exception) -> JSONResponse:
        # One friendly reply shared by every model-backed deck-building call.
        return JSONResponse(
            {"error": f"Couldn't build a deck from that ({type(exc).__name__}). "
                      "Try different material."},
            status_code=422,
        )

    source = ""
    # Set when a scanned/image-only PDF is turned into a deck by the vision model
    # (no text to chunk); `None` means "fall through to the text path below".
    deck_from_images = None

    if text and text.strip():
        source = text.strip()[:MAX_TEXT_CHARS]  # cap to bound chunking work
        if len(source) < 40:
            return JSONResponse({"error": need_more}, status_code=400)
    elif sample == "bio":
        source = SAMPLE_NOTES
    elif sample == "scan":
        # Render a slide to an image and run the OCR/vision path — demonstrates
        # image-PDF support end-to-end (stub returns the canned deck). Guarded
        # like the text path so a model failure is friendly copy, not a 500.
        try:
            deck_from_images = await run_in_threadpool(
                cp.generate_deck_from_images, [_render_scan_sample()])
        except Exception as e:  # noqa: BLE001 — surface as friendly copy, never crash
            return deck_failure(e)
    elif file is not None:
        suffix = Path(file.filename or "upload").suffix or ".txt"
        tmp_path = None
        # The try/finally wraps the spooling loop as well as the extraction, so
        # the temp file is removed even if reading the upload fails midway.
        try:
            # Stream to a temp file in capped chunks so an oversized upload
            # never gets fully buffered in memory.
            too_large = False
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = tmp.name
                size = 0
                while chunk := await file.read(1024 * 1024):
                    size += len(chunk)
                    if size > MAX_UPLOAD_BYTES:
                        too_large = True
                        break
                    tmp.write(chunk)
            if too_large:
                return JSONResponse(
                    {"error": f"That file is too large (limit "
                              f"{MAX_UPLOAD_BYTES // (1024 * 1024)} MB). Upload a "
                              "smaller PDF or paste the notes instead."},
                    status_code=413,
                )
            try:
                # Extraction (PDF parsing) is blocking — keep it off the event loop.
                source = await run_in_threadpool(cp.extract_text, tmp_path)
            except cp.ExtractionError as e:
                # A scanned/image-only PDF has no text to chunk — render its
                # pages and let the multimodal model read them directly. Fall
                # back to the original error if it's not image-only or no pages
                # render.
                try:
                    if await run_in_threadpool(cp.is_image_only_pdf, tmp_path):
                        images = await run_in_threadpool(
                            cp.render_pdf_images, tmp_path)
                        if images:
                            deck_from_images = await run_in_threadpool(
                                cp.generate_deck_from_images, images)
                except Exception as vision_err:  # noqa: BLE001 — friendly copy, never crash
                    return deck_failure(vision_err)
                if deck_from_images is None:
                    return JSONResponse({"error": str(e)}, status_code=422)
        finally:
            if tmp_path is not None:
                os.unlink(tmp_path)
    else:
        return JSONResponse({"error": need_more}, status_code=400)

    if deck_from_images is not None:
        deck = deck_from_images
    else:
        try:
            # Deck generation hits the model — run it in a worker thread so a
            # slow call doesn't block every other request on the event loop.
            deck = await run_in_threadpool(cp.generate_deck, source)
        except Exception as e:  # noqa: BLE001 — surface as friendly copy, never crash
            return deck_failure(e)

    if not deck:
        return JSONResponse(
            {"error": "Couldn't generate questions from that. Try different material."},
            status_code=422,
        )

    session = le.init_session(deck)
    card = le.next_card(session)
    sid = uuid.uuid4().hex
    put_session(sid, session)
    return {"sid": sid, "card": _card_out(card), "view": _view(session)}
@app.post("/api/grade")
async def api_grade(body: GradeBody):
    """Grade the learner's answer for the session and report the outcome.

    Replies 404 with 'session expired' for an unknown session, ``{"done": True,
    "view"}`` when the study step yields no grade, and otherwise the grade
    summary, the ids of any injected follow-up cards and the refreshed view.
    """
    if (session := get_session(body.sid)) is None:
        return JSONResponse({"error": "session expired"}, status_code=404)

    # Grading + follow-up generation hit the model; run the whole study step in
    # a worker thread so it doesn't block the event loop.
    answer = body.answer or ""
    verdict, follow_ups = await run_in_threadpool(
        le.grade_and_adapt, session, answer)
    if verdict is None:
        return {"done": True, "view": _view(session)}

    new_ids = [follow_up["id"] for follow_up in follow_ups]
    put_session(body.sid, session)

    # (response key, key in the grade dict) — only "missed" is renamed.
    exposed = (
        ("score", "score"),
        ("correct", "correct"),
        ("explanation", "explanation"),
        ("missed", "missed_concept"),
    )
    feedback = {public: verdict[internal] for public, internal in exposed}
    return {"grade": feedback, "injectedIds": new_ids, "view": _view(session)}
@app.post("/api/regenerate")
async def api_regenerate(body: RegenBody):
    """Rewrite the session's current card to be harder or easier.

    Replies 404 with 'session expired' for an unknown session, ``{"card": None,
    "view"}`` when there is no current card, and 422 with a friendly error when
    the regenerator raises or returns nothing. In the 422 case the session is
    left untouched. On success the new card replaces the old one and is returned
    with a ``diffLabel`` ("harder" only when ``direction`` is exactly "harder",
    otherwise "easier").
    """
    session = get_session(body.sid)
    if session is None:
        return JSONResponse({"error": "session expired"}, status_code=404)
    card = le.next_card(session)
    if card is None:
        return {"card": None, "view": _view(session)}
    try:
        new = await run_in_threadpool(cp.regenerate, card, body.direction)  # hits the model
    except Exception as e:  # noqa: BLE001 — surface as friendly copy, never crash
        return JSONResponse(
            {"error": f"Couldn't rewrite that card ({type(e).__name__}). Try again."},
            status_code=422,
        )
    if not new:
        # Bail out before replace_card: an empty result must not be written into
        # the deck, and _card_out would return None for it, which cannot take a
        # diffLabel.
        return JSONResponse(
            {"error": "Couldn't rewrite that card. Try again."},
            status_code=422,
        )
    session = le.replace_card(session, card["id"], new)
    put_session(body.sid, session)
    out = _card_out(new)
    out["diffLabel"] = "harder" if body.direction == "harder" else "easier"
    return {"card": out, "view": _view(session)}
@app.post("/api/next")
async def api_next(body: SidBody):
    """Return the session's next card (client-safe form) and the current view,
    or a 404 'session expired' reply when the session is unknown."""
    if (session := get_session(body.sid)) is None:
        return JSONResponse({"error": "session expired"}, status_code=404)
    upcoming = le.next_card(session)
    return {"card": _card_out(upcoming), "view": _view(session)}
@app.post("/api/recap")
async def api_recap(body: SidBody):
    """Return the end-of-session recap together with the current view, or a
    404 'session expired' reply when the session is unknown."""
    if (session := get_session(body.sid)) is None:
        return JSONResponse({"error": "session expired"}, status_code=404)
    # The reflection line hits the model, so build the recap off the event loop.
    summary = await run_in_threadpool(le.recap, session)
    return {"recap": summary, "view": _view(session)}
@app.post("/api/restart")
async def api_restart(body: SidBody):
    """Start over on the same source deck with a clean session (Recap → restart).

    The fresh session is stored under the same ``sid``; injected follow-up
    cards (those carrying a ``parent_id``) are left out of it. Replies 404 with
    'session expired' when the session is unknown.
    """
    if (previous := get_session(body.sid)) is None:
        return JSONResponse({"error": "session expired"}, status_code=404)

    # Rebuild from the original (non-injected) cards only.
    originals = []
    for entry in previous["deck"]:
        if entry.get("parent_id"):
            continue
        originals.append(entry)

    fresh = le.init_session(originals)
    first_card = le.next_card(fresh)
    put_session(body.sid, fresh)
    return {"card": _card_out(first_card), "view": _view(fresh)}
# ---- frontend --------------------------------------------------------------
@app.get("/api/sample/scan.png")
async def sample_scan_png():
    """Preview of the scanned-slides sample — served from the same renderer the
    vision/OCR path consumes, so the thumbnail stays honest about what's OCR'd.

    Returns the cached PNG bytes with an ``image/png`` media type.
    """
    return Response(_scan_sample_png(), media_type="image/png")
@app.get("/")
async def index():
    """Serve the custom frontend's `index.html` from the FRONTEND directory."""
    return FileResponse(FRONTEND / "index.html")
# HF runs `python server.py` (README app_file) and proxies port 7860. `gradio.Server`
# launches at MODULE TOP LEVEL; on a Space gradio binds 0.0.0.0:7860 automatically.
# We launch non-blocking and hold the main thread ourselves so the (daemon) uvicorn
# server keeps serving. (Locally, set GRADIO_SERVER_PORT to use a different port.)
#
# Two deploy gotchas, both learned the hard way:
# * gradio is pinned to 6.10.0. On 6.17.x a custom gradio.Server doesn't stay up
#   under the Space runtime. 6.10.0 is the version gradio's own `Server` example
#   ships, and it still allows huggingface-hub<1.0 (needed by the real model).
# * Stub mode registers no `@spaces.GPU` function, which makes a ZeroGPU Space's
#   `spaces` reload server error out and flip the Space to RUNTIME_ERROR even
#   though uvicorn serves fine. So the stub demo runs on CPU-basic hardware. Once
#   the real model (with `@spaces.GPU`) is wired, switch hardware back to ZeroGPU.
# Keep the README sdk_version and requirements*.txt gradio pins in lockstep.
#
# NOTE: there is no `if __name__ == "__main__"` guard, so importing this module
# also launches the server and then blocks.
app.launch(show_error=True, prevent_thread_lock=True)
# This Event is never set, so wait() parks the main thread indefinitely.
threading.Event().wait()
|