# NOTE(review): the original capture began with "Spaces: / Running / Running" —
# Hugging Face Spaces page header residue, not source code.
"""
Loads the quantised language head (GGUF / int4) for numeracy feedback generation.

Model: TinyLlama-1.1B-Chat-v1.0 quantised to Q4_K_M GGUF (~669 MB full, ~350 MB Q4).
We strip it further with a LoRA adapter trained only on numeracy instruction pairs,
then re-quantise the merged model to Q4_K_M targeting ≤ 55 MB for the language head.
At runtime we use llama-cpp-python for CPU inference (no GPU required).
"""
from __future__ import annotations

import re  # NOTE(review): appears unused in this module — confirm before removing
import time
from pathlib import Path
from typing import Optional
| _llm = None | |
| _MODEL_PATH: Optional[Path] = None | |
| SYSTEM_PROMPT = { | |
| "en": ( | |
| "You are a friendly math tutor for young children aged 5–9. " | |
| "Give very short, warm feedback (≤ 2 sentences). " | |
| "Use simple words. Never use markdown. " | |
| "If the child is correct, celebrate. If wrong, gently show the right answer." | |
| ), | |
| "fr": ( | |
| "Tu es un tuteur de mathématiques sympa pour les enfants de 5 à 9 ans. " | |
| "Donne un retour très court et chaleureux (≤ 2 phrases). " | |
| "Utilise des mots simples. Pas de markdown." | |
| ), | |
| "kin": ( | |
| "Uri umwigisha w'imibare w'inshuti ku bana b'imyaka 5-9. " | |
| "Tanga igisubizo gito kandi cyiza (inyandiko ≤ 2). " | |
| "Koresha amagambo yoroheje." | |
| ), | |
| "sw": ( | |
| "Wewe ni mwalimu wa hisabu wa kirafiki kwa watoto wenye umri wa miaka 5-9. " | |
| "Toa maoni mafupi na ya joto (≤ sentensi 2). " | |
| "Tumia maneno rahisi. Usiandike markdown." | |
| ), | |
| } | |
| def set_model_path(path: str | Path) -> None: | |
| global _MODEL_PATH | |
| _MODEL_PATH = Path(path) | |
| def _load(): | |
| global _llm | |
| if _llm is not None: | |
| return _llm | |
| if _MODEL_PATH is None or not _MODEL_PATH.exists(): | |
| return None # graceful degradation to template responses | |
| try: | |
| from llama_cpp import Llama | |
| _llm = Llama( | |
| model_path=str(_MODEL_PATH), | |
| n_ctx=512, | |
| n_threads=2, | |
| verbose=False, | |
| ) | |
| except Exception: | |
| _llm = None | |
| return _llm | |
# ------------------------------------------------------------------
# Template fallback (works without any model loaded)
# ------------------------------------------------------------------
| _CORRECT_TEMPLATES = { | |
| "en": [ | |
| "Great job! That's exactly right! 🎉", | |
| "Wonderful! You got it!", | |
| "Yes! {answer} is correct! Well done!", | |
| ], | |
| "fr": [ | |
| "Bravo ! C'est exactement ça ! 🎉", | |
| "Super ! Tu as trouvé !", | |
| "Oui ! {answer} est correct ! Bien joué !", | |
| ], | |
| "kin": [ | |
| "Ni byiza cyane! Ni yo! 🎉", | |
| "Yego! Wabikoze neza!", | |
| "Ni {answer}! Wabikoze neza cyane!", | |
| ], | |
| "sw": [ | |
| "Hongera! Hiyo ndiyo jibu sahihi! 🎉", | |
| "Vizuri sana! Umefaulu!", | |
| "Ndiyo! {answer} ni sahihi! Umefanya vizuri!", | |
| ], | |
| } | |
| _WRONG_TEMPLATES = { | |
| "en": [ | |
| "Good try! The answer is {answer}. Let's try again!", | |
| "Almost! It's {answer}. You'll get it next time!", | |
| "Not quite — the answer is {answer}. Keep going!", | |
| ], | |
| "fr": [ | |
| "Bon essai ! La réponse est {answer}. Réessaie !", | |
| "Presque ! C'est {answer}. Tu y arriveras !", | |
| "Pas tout à fait — la réponse est {answer}. Continue !", | |
| ], | |
| "kin": [ | |
| "Gerageza neza! Igisubizo ni {answer}. Ongera ugerageze!", | |
| "Hafi! Ni {answer}. Uzagera!", | |
| "Si byo — igisubizo ni {answer}. Komeza!", | |
| ], | |
| "sw": [ | |
| "Jaribu tena! Jibu ni {answer}. Endelea!", | |
| "Karibu! Ni {answer}. Utafaulu wakati ujao!", | |
| "Sivyo — jibu ni {answer}. Jaribu tena!", | |
| ], | |
| } | |
| import random as _random | |
| def _template_feedback(is_correct: bool, answer: int, lang: str) -> str: | |
| lang = lang if lang in ("en", "fr", "kin", "sw") else "en" | |
| pool = _CORRECT_TEMPLATES[lang] if is_correct else _WRONG_TEMPLATES[lang] | |
| return _random.choice(pool).format(answer=answer) | |
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
| def generate_feedback( | |
| is_correct: bool, | |
| answer: int, | |
| lang: str = "en", | |
| child_response: str = "", | |
| max_tokens: int = 60, | |
| ) -> str: | |
| """ | |
| Generate a short feedback string for the child. | |
| Falls back to templates if the GGUF model is not loaded, | |
| ensuring < 2.5 s latency even on slow hardware. | |
| """ | |
| t0 = time.time() | |
| llm = _load() | |
| if llm is None: | |
| return _template_feedback(is_correct, answer, lang) | |
| system = SYSTEM_PROMPT.get(lang, SYSTEM_PROMPT["en"]) | |
| verdict = "correct" if is_correct else f"incorrect (correct answer is {answer})" | |
| user_msg = ( | |
| f"The child said: '{child_response}'. Their answer was {verdict}. " | |
| f"Give feedback in {'English' if lang=='en' else 'French' if lang=='fr' else 'Swahili' if lang=='sw' else 'Kinyarwanda'}." | |
| ) | |
| try: | |
| out = llm.create_chat_completion( | |
| messages=[ | |
| {"role": "system", "content": system}, | |
| {"role": "user", "content": user_msg}, | |
| ], | |
| max_tokens=max_tokens, | |
| temperature=0.7, | |
| stop=["\n\n"], | |
| ) | |
| text = out["choices"][0]["message"]["content"].strip() | |
| elapsed = time.time() - t0 | |
| # Latency guard: if model took > 2 s, use template next time | |
| if elapsed > 2.0: | |
| return _template_feedback(is_correct, answer, lang) | |
| return text | |
| except Exception: | |
| return _template_feedback(is_correct, answer, lang) | |