Spaces:
Runtime error
Runtime error
File size: 5,594 Bytes
a62b942 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | """
Loads the quantised language head (GGUF / int4) for numeracy feedback generation.
Model: TinyLlama-1.1B-Chat-v1.0 quantised to Q4_K_M GGUF (~669 MB full, ~350 MB Q4).
We strip it further with a LoRA adapter trained only on numeracy instruction pairs,
then re-quantise the merged model to Q4_K_M targeting ≤ 55 MB for the language head.
At runtime we use llama-cpp-python for CPU inference (no GPU required).
"""
from __future__ import annotations
import re
import time
from pathlib import Path
from typing import Optional
# Cached llama-cpp model instance; stays None until _load() succeeds.
_llm = None
# Location of the GGUF model file; None until set_model_path() is called.
_MODEL_PATH: Optional[Path] = None
# System prompts handed to the chat model, keyed by language code.
# Each value is a single string assembled from adjacent literals.
SYSTEM_PROMPT = {
    # English
    "en": (
        "You are a friendly math tutor for young children aged 5–9. "
        "Give very short, warm feedback (≤ 2 sentences). "
        "Use simple words. Never use markdown. "
        "If the child is correct, celebrate. If wrong, gently show the right answer."
    ),
    # French
    "fr": (
        "Tu es un tuteur de mathématiques sympa pour les enfants de 5 à 9 ans. "
        "Donne un retour très court et chaleureux (≤ 2 phrases). "
        "Utilise des mots simples. Pas de markdown."
    ),
    # Kinyarwanda
    "kin": (
        "Uri umwigisha w'imibare w'inshuti ku bana b'imyaka 5-9. "
        "Tanga igisubizo gito kandi cyiza (inyandiko ≤ 2). "
        "Koresha amagambo yoroheje."
    ),
    # Swahili
    "sw": (
        "Wewe ni mwalimu wa hisabu wa kirafiki kwa watoto wenye umri wa miaka 5-9. "
        "Toa maoni mafupi na ya joto (≤ sentensi 2). "
        "Tumia maneno rahisi. Usiandike markdown."
    ),
}
def set_model_path(path: str | Path) -> None:
global _MODEL_PATH
_MODEL_PATH = Path(path)
# Signature (path, size, mtime) of the model file whose most recent load
# attempt failed, so the same broken file is not re-tried on every call.
_failed_attempt: Optional[tuple] = None


def _load():
    """Return the shared llama-cpp model, loading it on first use.

    Returns:
        The cached ``Llama`` instance, or ``None`` when no usable model is
        available: the path is unset, the file cannot be stat'ed, or
        importing/constructing ``Llama`` raised. A failed load is logged and
        remembered for that exact file (path, size, modification time), so
        it is only attempted again once the file changes.
    """
    global _llm, _failed_attempt
    if _llm is not None:
        return _llm
    if _MODEL_PATH is None:
        return None
    try:
        info = _MODEL_PATH.stat()
    except OSError:
        return None  # graceful degradation to template responses
    attempt = (str(_MODEL_PATH), info.st_size, info.st_mtime_ns)
    if attempt == _failed_attempt:
        # Already known to be unloadable; skip the expensive retry.
        return None
    try:
        # Imported lazily so the module still works without llama-cpp-python.
        from llama_cpp import Llama

        _llm = Llama(
            model_path=str(_MODEL_PATH),
            n_ctx=512,
            n_threads=2,
            verbose=False,
        )
    except Exception:
        import logging

        logging.getLogger(__name__).warning(
            "Could not load GGUF model from %s; using template feedback",
            _MODEL_PATH,
            exc_info=True,
        )
        _failed_attempt = attempt
        _llm = None
    return _llm
# ------------------------------------------------------------------
# Template fallback (works without any model loaded)
# ------------------------------------------------------------------

# Praise messages per language code. "{answer}" is filled in via str.format.
_CORRECT_TEMPLATES = {
    "en": [
        "Great job! That's exactly right! 🎉",
        "Wonderful! You got it!",
        "Yes! {answer} is correct! Well done!",
    ],
    "fr": [
        "Bravo ! C'est exactement ça ! 🎉",
        "Super ! Tu as trouvé !",
        "Oui ! {answer} est correct ! Bien joué !",
    ],
    "kin": [
        "Ni byiza cyane! Ni yo! 🎉",
        "Yego! Wabikoze neza!",
        "Ni {answer}! Wabikoze neza cyane!",
    ],
    "sw": [
        "Hongera! Hiyo ndiyo jibu sahihi! 🎉",
        "Vizuri sana! Umefaulu!",
        "Ndiyo! {answer} ni sahihi! Umefanya vizuri!",
    ],
}

# Gentle correction messages per language code; each one states "{answer}".
_WRONG_TEMPLATES = {
    "en": [
        "Good try! The answer is {answer}. Let's try again!",
        "Almost! It's {answer}. You'll get it next time!",
        "Not quite — the answer is {answer}. Keep going!",
    ],
    "fr": [
        "Bon essai ! La réponse est {answer}. Réessaie !",
        "Presque ! C'est {answer}. Tu y arriveras !",
        "Pas tout à fait — la réponse est {answer}. Continue !",
    ],
    "kin": [
        "Gerageza neza! Igisubizo ni {answer}. Ongera ugerageze!",
        "Hafi! Ni {answer}. Uzagera!",
        "Si byo — igisubizo ni {answer}. Komeza!",
    ],
    "sw": [
        "Jaribu tena! Jibu ni {answer}. Endelea!",
        "Karibu! Ni {answer}. Utafaulu wakati ujao!",
        "Sivyo — jibu ni {answer}. Jaribu tena!",
    ],
}
import random as _random
def _template_feedback(is_correct: bool, answer: int, lang: str) -> str:
    """Pick a random canned feedback sentence and fill in the answer.

    Args:
        is_correct: Selects the praise table when true, the correction
            table otherwise.
        answer: Value substituted for ``{answer}`` in the chosen template.
        lang: Language code; anything other than "en", "fr", "kin" or "sw"
            is treated as "en".

    Returns:
        The formatted feedback sentence.
    """
    if lang not in ("en", "fr", "kin", "sw"):
        lang = "en"
    table = _CORRECT_TEMPLATES if is_correct else _WRONG_TEMPLATES
    chosen = _random.choice(table[lang])
    return chosen.format(answer=answer)
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
# Longest a single model generation may take (seconds) before the model is
# considered too slow for interactive feedback.
_LATENCY_BUDGET_S = 2.0

# Language names used in the user prompt, keyed by supported language code.
_LANGUAGE_NAMES = {
    "en": "English",
    "fr": "French",
    "kin": "Kinyarwanda",
    "sw": "Swahili",
}

# Set once a generation overruns the latency budget; later calls then go
# straight to templates. Reset to False to give the model another chance.
_llm_too_slow = False


def generate_feedback(
    is_correct: bool,
    answer: int,
    lang: str = "en",
    child_response: str = "",
    max_tokens: int = 60,
) -> str:
    """Generate a short feedback string for the child.

    The language model is used when it is loaded and has not previously
    overrun the latency budget; otherwise a canned template is returned.

    Args:
        is_correct: Whether the child's answer was right.
        answer: The correct answer, quoted in corrective feedback.
        lang: Language code ("en", "fr", "kin" or "sw"). Any other value is
            treated as "en" for the system prompt, the requested output
            language and the template fallback alike.
        child_response: What the child said, quoted verbatim in the prompt.
        max_tokens: Upper bound on generated tokens.

    Returns:
        A non-empty feedback sentence. Model errors and empty model output
        fall back to a template rather than raising.
    """
    global _llm_too_slow

    # Normalise once so prompt, language name and fallback always agree.
    if lang not in _LANGUAGE_NAMES:
        lang = "en"

    if _llm_too_slow:
        return _template_feedback(is_correct, answer, lang)
    llm = _load()
    if llm is None:
        return _template_feedback(is_correct, answer, lang)

    system = SYSTEM_PROMPT.get(lang, SYSTEM_PROMPT["en"])
    verdict = "correct" if is_correct else f"incorrect (correct answer is {answer})"
    user_msg = (
        f"The child said: '{child_response}'. Their answer was {verdict}. "
        f"Give feedback in {_LANGUAGE_NAMES[lang]}."
    )

    # Time only the generation (not the one-off model load), on a clock that
    # cannot jump backwards.
    started = time.monotonic()
    try:
        out = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=max_tokens,
            temperature=0.7,
            stop=["\n\n"],
        )
        text = (out["choices"][0]["message"]["content"] or "").strip()
    except Exception:
        # Best-effort: any model failure degrades to a template below.
        text = ""

    # Latency guard: a slow generation disables the model for later calls.
    # The text already produced is still used, since the wait is already paid.
    if time.monotonic() - started > _LATENCY_BUDGET_S:
        _llm_too_slow = True

    return text or _template_feedback(is_correct, answer, lang)
|