""" Loads the quantised language head (GGUF / int4) for numeracy feedback generation. Model: TinyLlama-1.1B-Chat-v1.0 quantised to Q4_K_M GGUF (~669 MB full, ~350 MB Q4). We strip it further with a LoRA adapter trained only on numeracy instruction pairs, then re-quantise the merged model to Q4_K_M targeting ≤ 55 MB for the language head. At runtime we use llama-cpp-python for CPU inference (no GPU required). """ from __future__ import annotations import re import time from pathlib import Path from typing import Optional _llm = None _MODEL_PATH: Optional[Path] = None SYSTEM_PROMPT = { "en": ( "You are a friendly math tutor for young children aged 5–9. " "Give very short, warm feedback (≤ 2 sentences). " "Use simple words. Never use markdown. " "If the child is correct, celebrate. If wrong, gently show the right answer." ), "fr": ( "Tu es un tuteur de mathématiques sympa pour les enfants de 5 à 9 ans. " "Donne un retour très court et chaleureux (≤ 2 phrases). " "Utilise des mots simples. Pas de markdown." ), "kin": ( "Uri umwigisha w'imibare w'inshuti ku bana b'imyaka 5-9. " "Tanga igisubizo gito kandi cyiza (inyandiko ≤ 2). " "Koresha amagambo yoroheje." ), "sw": ( "Wewe ni mwalimu wa hisabu wa kirafiki kwa watoto wenye umri wa miaka 5-9. " "Toa maoni mafupi na ya joto (≤ sentensi 2). " "Tumia maneno rahisi. Usiandike markdown." ), } def set_model_path(path: str | Path) -> None: global _MODEL_PATH _MODEL_PATH = Path(path) def _load(): global _llm if _llm is not None: return _llm if _MODEL_PATH is None or not _MODEL_PATH.exists(): return None # graceful degradation to template responses try: from llama_cpp import Llama _llm = Llama( model_path=str(_MODEL_PATH), n_ctx=512, n_threads=2, verbose=False, ) except Exception: _llm = None return _llm # ------------------------------------------------------------------ # Template fallback (works without any model loaded) # ------------------------------------------------------------------ _CORRECT_TEMPLATES = { "en": [ "Great job! That's exactly right! 🎉", "Wonderful! You got it!", "Yes! {answer} is correct! Well done!", ], "fr": [ "Bravo ! C'est exactement ça ! 🎉", "Super ! Tu as trouvé !", "Oui ! {answer} est correct ! Bien joué !", ], "kin": [ "Ni byiza cyane! Ni yo! 🎉", "Yego! Wabikoze neza!", "Ni {answer}! Wabikoze neza cyane!", ], "sw": [ "Hongera! Hiyo ndiyo jibu sahihi! 🎉", "Vizuri sana! Umefaulu!", "Ndiyo! {answer} ni sahihi! Umefanya vizuri!", ], } _WRONG_TEMPLATES = { "en": [ "Good try! The answer is {answer}. Let's try again!", "Almost! It's {answer}. You'll get it next time!", "Not quite — the answer is {answer}. Keep going!", ], "fr": [ "Bon essai ! La réponse est {answer}. Réessaie !", "Presque ! C'est {answer}. Tu y arriveras !", "Pas tout à fait — la réponse est {answer}. Continue !", ], "kin": [ "Gerageza neza! Igisubizo ni {answer}. Ongera ugerageze!", "Hafi! Ni {answer}. Uzagera!", "Si byo — igisubizo ni {answer}. Komeza!", ], "sw": [ "Jaribu tena! Jibu ni {answer}. Endelea!", "Karibu! Ni {answer}. Utafaulu wakati ujao!", "Sivyo — jibu ni {answer}. Jaribu tena!", ], } import random as _random def _template_feedback(is_correct: bool, answer: int, lang: str) -> str: lang = lang if lang in ("en", "fr", "kin", "sw") else "en" pool = _CORRECT_TEMPLATES[lang] if is_correct else _WRONG_TEMPLATES[lang] return _random.choice(pool).format(answer=answer) # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def generate_feedback( is_correct: bool, answer: int, lang: str = "en", child_response: str = "", max_tokens: int = 60, ) -> str: """ Generate a short feedback string for the child. Falls back to templates if the GGUF model is not loaded, ensuring < 2.5 s latency even on slow hardware. """ t0 = time.time() llm = _load() if llm is None: return _template_feedback(is_correct, answer, lang) system = SYSTEM_PROMPT.get(lang, SYSTEM_PROMPT["en"]) verdict = "correct" if is_correct else f"incorrect (correct answer is {answer})" user_msg = ( f"The child said: '{child_response}'. Their answer was {verdict}. " f"Give feedback in {'English' if lang=='en' else 'French' if lang=='fr' else 'Swahili' if lang=='sw' else 'Kinyarwanda'}." ) try: out = llm.create_chat_completion( messages=[ {"role": "system", "content": system}, {"role": "user", "content": user_msg}, ], max_tokens=max_tokens, temperature=0.7, stop=["\n\n"], ) text = out["choices"][0]["message"]["content"].strip() elapsed = time.time() - t0 # Latency guard: if model took > 2 s, use template next time if elapsed > 2.0: return _template_feedback(is_correct, answer, lang) return text except Exception: return _template_feedback(is_correct, answer, lang)