File size: 5,594 Bytes
a62b942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Loads the quantised language head (GGUF / int4) for numeracy feedback generation.

Model: TinyLlama-1.1B-Chat-v1.0 quantised to Q4_K_M GGUF (~669 MB full, ~350 MB Q4).
We strip it further with a LoRA adapter trained only on numeracy instruction pairs,
then re-quantise the merged model to Q4_K_M targeting ≤ 55 MB for the language head.

At runtime we use llama-cpp-python for CPU inference (no GPU required).
"""
from __future__ import annotations

import re
import time
from pathlib import Path
from typing import Optional

# Cached llama-cpp model instance; stays None until _load() succeeds.
_llm = None
# Location of the model file on disk; None until set_model_path() is called.
_MODEL_PATH: Optional[Path] = None

# System prompts sent to the chat model, keyed by language code
# ("en", "fr", "kin", "sw"). generate_feedback() uses the "en" entry when the
# requested code is not a key of this dict.
# NOTE(review): only the "en" prompt carries the "celebrate / gently show the
# right answer" instruction, and the "kin" prompt does not mention markdown —
# confirm whether the shorter non-English prompts are intentional.
SYSTEM_PROMPT = {
    "en": (
        "You are a friendly math tutor for young children aged 5–9. "
        "Give very short, warm feedback (≤ 2 sentences). "
        "Use simple words. Never use markdown. "
        "If the child is correct, celebrate. If wrong, gently show the right answer."
    ),
    "fr": (
        "Tu es un tuteur de mathématiques sympa pour les enfants de 5 à 9 ans. "
        "Donne un retour très court et chaleureux (≤ 2 phrases). "
        "Utilise des mots simples. Pas de markdown."
    ),
    "kin": (
        "Uri umwigisha w'imibare w'inshuti ku bana b'imyaka 5-9. "
        "Tanga igisubizo gito kandi cyiza (inyandiko ≤ 2). "
        "Koresha amagambo yoroheje."
    ),
    "sw": (
        "Wewe ni mwalimu wa hisabu wa kirafiki kwa watoto wenye umri wa miaka 5-9. "
        "Toa maoni mafupi na ya joto (≤ sentensi 2). "
        "Tumia maneno rahisi. Usiandike markdown."
    ),
}


def set_model_path(path: str | Path) -> None:
    """Point the loader at the model file to use for feedback generation.

    The path is only recorded here; the model itself is loaded lazily by
    ``_load()``. If a model has already been loaded and *path* differs from
    the previously configured one, the cached instance is dropped so the
    next ``_load()`` call reads the new file instead of silently reusing the
    old model. Setting the same path again keeps the loaded model.

    Args:
        path: Filesystem location of the model file (string or ``Path``).
    """
    global _MODEL_PATH, _llm
    new_path = Path(path)
    if new_path != _MODEL_PATH:
        # A different file was requested: invalidate the cached model.
        _llm = None
    _MODEL_PATH = new_path


def _load():
    """Return the shared llama-cpp model, loading it on first use.

    Returns:
        The cached ``Llama`` instance, or ``None`` when no model path is
        configured, the file does not exist, or loading failed (for example
        because ``llama_cpp`` is not installed or the file is not a valid
        model). Callers treat ``None`` as "use template responses".

    A failed load is logged as a warning with its traceback rather than
    being swallowed silently; it never raises.
    """
    global _llm
    if _llm is not None:
        return _llm
    if _MODEL_PATH is None or not _MODEL_PATH.exists():
        return None  # graceful degradation to template responses
    try:
        # Imported lazily so the module works without llama-cpp-python.
        from llama_cpp import Llama
        _llm = Llama(
            model_path=str(_MODEL_PATH),
            n_ctx=512,
            n_threads=2,
            verbose=False,
        )
    except Exception:
        import logging

        logging.getLogger(__name__).warning(
            "Could not load language model from %s; using template feedback",
            _MODEL_PATH,
            exc_info=True,
        )
        _llm = None
    return _llm


# ------------------------------------------------------------------
# Template fallback (works without any model loaded)
# ------------------------------------------------------------------

# Canned praise messages per language code, used when the answer is correct.
# Entries may contain an "{answer}" placeholder, which _template_feedback()
# fills in via str.format.
_CORRECT_TEMPLATES = {
    "en": [
        "Great job! That's exactly right! 🎉",
        "Wonderful! You got it!",
        "Yes! {answer} is correct! Well done!",
    ],
    "fr": [
        "Bravo ! C'est exactement ça ! 🎉",
        "Super ! Tu as trouvé !",
        "Oui ! {answer} est correct ! Bien joué !",
    ],
    "kin": [
        "Ni byiza cyane! Ni yo! 🎉",
        "Yego! Wabikoze neza!",
        "Ni {answer}! Wabikoze neza cyane!",
    ],
    "sw": [
        "Hongera! Hiyo ndiyo jibu sahihi! 🎉",
        "Vizuri sana! Umefaulu!",
        "Ndiyo! {answer} ni sahihi! Umefanya vizuri!",
    ],
}
# Canned encouragement messages per language code, used when the answer is
# wrong; every entry states the right answer through the "{answer}" placeholder.
_WRONG_TEMPLATES = {
    "en": [
        "Good try! The answer is {answer}. Let's try again!",
        "Almost! It's {answer}. You'll get it next time!",
        "Not quite — the answer is {answer}. Keep going!",
    ],
    "fr": [
        "Bon essai ! La réponse est {answer}. Réessaie !",
        "Presque ! C'est {answer}. Tu y arriveras !",
        "Pas tout à fait — la réponse est {answer}. Continue !",
    ],
    "kin": [
        "Gerageza neza! Igisubizo ni {answer}. Ongera ugerageze!",
        "Hafi! Ni {answer}. Uzagera!",
        "Si byo — igisubizo ni {answer}. Komeza!",
    ],
    "sw": [
        "Jaribu tena! Jibu ni {answer}. Endelea!",
        "Karibu! Ni {answer}. Utafaulu wakati ujao!",
        "Sivyo — jibu ni {answer}. Jaribu tena!",
    ],
}

import random as _random


def _template_feedback(is_correct: bool, answer: int, lang: str) -> str:
    """Pick a random canned feedback message and fill in the answer.

    Args:
        is_correct: Selects the praise templates when true, otherwise the
            encouragement templates.
        answer: Value substituted for the ``{answer}`` placeholder.
        lang: Language code; anything other than "en", "fr", "kin" or "sw"
            is treated as "en".

    Returns:
        The formatted feedback string.
    """
    if lang not in ("en", "fr", "kin", "sw"):
        lang = "en"
    templates = _CORRECT_TEMPLATES if is_correct else _WRONG_TEMPLATES
    chosen = _random.choice(templates[lang])
    return chosen.format(answer=answer)


# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------

# Language names used when telling the model which language to answer in.
_LANGUAGE_NAMES = {
    "en": "English",
    "fr": "French",
    "sw": "Swahili",
    "kin": "Kinyarwanda",
}
# Seconds one completion may take before the model is considered too slow.
_LATENCY_BUDGET_S = 2.0
# Model instance that exceeded the latency budget; later calls skip it.
_slow_llm = None


def generate_feedback(
    is_correct: bool,
    answer: int,
    lang: str = "en",
    child_response: str = "",
    max_tokens: int = 60,
) -> str:
    """
    Generate a short feedback string for the child.

    Uses the loaded model when one is available; otherwise, or when the
    model fails or returns empty text, falls back to the canned templates.

    Latency guard: if a completion takes longer than ``_LATENCY_BUDGET_S``
    seconds, the text that was already generated is still returned, but that
    model instance is remembered as too slow and every later call answers
    from the templates instead. Only the completion itself is timed, so a
    one-off model load on the first call does not trip the guard.

    Args:
        is_correct: Whether the child's answer was right.
        answer: The correct answer, shown to the child when they were wrong.
        lang: Language code ("en", "fr", "kin" or "sw"); any other value is
            treated as "en" for the prompt, the requested reply language and
            the template fallback alike.
        child_response: What the child said, quoted in the prompt.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The feedback text; never raises on model errors.
    """
    global _slow_llm

    llm = _load()
    if llm is None or llm is _slow_llm:
        return _template_feedback(is_correct, answer, lang)

    # Normalise once so prompt, reply language and fallback all agree.
    if lang not in _LANGUAGE_NAMES:
        lang = "en"
    system = SYSTEM_PROMPT.get(lang, SYSTEM_PROMPT["en"])
    verdict = "correct" if is_correct else f"incorrect (correct answer is {answer})"
    user_msg = (
        f"The child said: '{child_response}'. Their answer was {verdict}. "
        f"Give feedback in {_LANGUAGE_NAMES[lang]}."
    )

    started = time.monotonic()  # monotonic: immune to wall-clock adjustments
    try:
        out = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=max_tokens,
            temperature=0.7,
            stop=["\n\n"],
        )
        text = out["choices"][0]["message"]["content"].strip()
    except Exception:
        # Best-effort boundary: any model failure degrades to a template.
        import logging

        logging.getLogger(__name__).warning(
            "Model feedback generation failed; using template feedback",
            exc_info=True,
        )
        return _template_feedback(is_correct, answer, lang)

    if time.monotonic() - started > _LATENCY_BUDGET_S:
        # The wait has already happened, so keep this text, but stop
        # calling this model on subsequent requests.
        _slow_llm = llm

    return text or _template_feedback(is_correct, answer, lang)