"""
Loads the quantised language head (GGUF / int4) for numeracy feedback generation.

Model: TinyLlama-1.1B-Chat-v1.0 quantised to Q4_K_M GGUF (~669 MB full, ~350 MB Q4).
We strip it further with a LoRA adapter trained only on numeracy instruction pairs,
then re-quantise the merged model to Q4_K_M targeting ≤ 55 MB for the language head.

At runtime we use llama-cpp-python for CPU inference (no GPU required).
"""
from __future__ import annotations

import re
import time
from pathlib import Path
from typing import Optional

_llm = None
_MODEL_PATH: Optional[Path] = None

SYSTEM_PROMPT = {
    "en": (
        "You are a friendly math tutor for young children aged 5–9. "
        "Give very short, warm feedback (≤ 2 sentences). "
        "Use simple words. Never use markdown. "
        "If the child is correct, celebrate. If wrong, gently show the right answer."
    ),
    "fr": (
        "Tu es un tuteur de mathématiques sympa pour les enfants de 5 à 9 ans. "
        "Donne un retour très court et chaleureux (≤ 2 phrases). "
        "Utilise des mots simples. Pas de markdown."
    ),
    "kin": (
        "Uri umwigisha w'imibare w'inshuti ku bana b'imyaka 5-9. "
        "Tanga igisubizo gito kandi cyiza (inyandiko ≤ 2). "
        "Koresha amagambo yoroheje."
    ),
    "sw": (
        "Wewe ni mwalimu wa hisabu wa kirafiki kwa watoto wenye umri wa miaka 5-9. "
        "Toa maoni mafupi na ya joto (≤ sentensi 2). "
        "Tumia maneno rahisi. Usiandike markdown."
    ),
}


def set_model_path(path: str | Path) -> None:
    global _MODEL_PATH
    _MODEL_PATH = Path(path)


def _load():
    global _llm
    if _llm is not None:
        return _llm
    if _MODEL_PATH is None or not _MODEL_PATH.exists():
        return None  # graceful degradation to template responses
    try:
        from llama_cpp import Llama
        _llm = Llama(
            model_path=str(_MODEL_PATH),
            n_ctx=512,
            n_threads=2,
            verbose=False,
        )
    except Exception:
        _llm = None
    return _llm


# ------------------------------------------------------------------
# Template fallback (works without any model loaded)
# ------------------------------------------------------------------

_CORRECT_TEMPLATES = {
    "en": [
        "Great job! That's exactly right! 🎉",
        "Wonderful! You got it!",
        "Yes! {answer} is correct! Well done!",
    ],
    "fr": [
        "Bravo ! C'est exactement ça ! 🎉",
        "Super ! Tu as trouvé !",
        "Oui ! {answer} est correct ! Bien joué !",
    ],
    "kin": [
        "Ni byiza cyane! Ni yo! 🎉",
        "Yego! Wabikoze neza!",
        "Ni {answer}! Wabikoze neza cyane!",
    ],
    "sw": [
        "Hongera! Hiyo ndiyo jibu sahihi! 🎉",
        "Vizuri sana! Umefaulu!",
        "Ndiyo! {answer} ni sahihi! Umefanya vizuri!",
    ],
}
_WRONG_TEMPLATES = {
    "en": [
        "Good try! The answer is {answer}. Let's try again!",
        "Almost! It's {answer}. You'll get it next time!",
        "Not quite — the answer is {answer}. Keep going!",
    ],
    "fr": [
        "Bon essai ! La réponse est {answer}. Réessaie !",
        "Presque ! C'est {answer}. Tu y arriveras !",
        "Pas tout à fait — la réponse est {answer}. Continue !",
    ],
    "kin": [
        "Gerageza neza! Igisubizo ni {answer}. Ongera ugerageze!",
        "Hafi! Ni {answer}. Uzagera!",
        "Si byo — igisubizo ni {answer}. Komeza!",
    ],
    "sw": [
        "Jaribu tena! Jibu ni {answer}. Endelea!",
        "Karibu! Ni {answer}. Utafaulu wakati ujao!",
        "Sivyo — jibu ni {answer}. Jaribu tena!",
    ],
}

import random as _random


def _template_feedback(is_correct: bool, answer: int, lang: str) -> str:
    lang = lang if lang in ("en", "fr", "kin", "sw") else "en"
    pool = _CORRECT_TEMPLATES[lang] if is_correct else _WRONG_TEMPLATES[lang]
    return _random.choice(pool).format(answer=answer)


# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------

def generate_feedback(
    is_correct: bool,
    answer: int,
    lang: str = "en",
    child_response: str = "",
    max_tokens: int = 60,
) -> str:
    """
    Generate a short feedback string for the child.

    Falls back to templates if the GGUF model is not loaded,
    ensuring < 2.5 s latency even on slow hardware.
    """
    t0 = time.time()
    llm = _load()

    if llm is None:
        return _template_feedback(is_correct, answer, lang)

    system = SYSTEM_PROMPT.get(lang, SYSTEM_PROMPT["en"])
    verdict = "correct" if is_correct else f"incorrect (correct answer is {answer})"
    user_msg = (
        f"The child said: '{child_response}'. Their answer was {verdict}. "
        f"Give feedback in {'English' if lang=='en' else 'French' if lang=='fr' else 'Swahili' if lang=='sw' else 'Kinyarwanda'}."
    )

    try:
        out = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=max_tokens,
            temperature=0.7,
            stop=["\n\n"],
        )
        text = out["choices"][0]["message"]["content"].strip()
        elapsed = time.time() - t0
        # Latency guard: if model took > 2 s, use template next time
        if elapsed > 2.0:
            return _template_feedback(is_correct, answer, lang)
        return text
    except Exception:
        return _template_feedback(is_correct, answer, lang)