File size: 5,278 Bytes
096b19d
 
 
 
 
 
 
 
 
 
 
61e52d7
 
 
 
 
 
 
 
 
096b19d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61e52d7
096b19d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
GemmaClient — wraps the HuggingFace Serverless Inference API for an instruction-tuned chat model (Qwen by default; see the model notes below).

The system prompt implements the 'adult-child' logic:
  - The LLM is a child learning Bambara/Fula from the user (adult/teacher)
  - vocabulary.jsonl is its primary memory / source of truth
  - It detects TEACHING intent and returns structured JSON so MemoryManager
    can persist the new word
  - It answers QUESTIONS using the vocabulary it has learned

Model: configurable via LLM_MODEL_ID env var.
Default: Qwen/Qwen2.5-72B-Instruct  — reliably available on HF Serverless free tier.

Tested models that work on HF Serverless (no paid provider needed):
  Qwen/Qwen2.5-72B-Instruct        ← default, best quality
  Qwen/Qwen2.5-7B-Instruct         ← faster, slightly lower quality
  mistralai/Mistral-7B-Instruct-v0.3
  HuggingFaceH4/zephyr-7b-beta

google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider.
"""
from __future__ import annotations

import json
import logging
import re
from typing import Optional

logger = logging.getLogger(__name__)

# System prompt sent with every chat request. It is a ``str.format`` template:
# ``{vocabulary_context}`` is the only placeholder (filled in by
# ``GemmaClient.chat``), so every literal JSON brace in the examples below is
# doubled (``{{`` / ``}}``) to survive formatting. A trailing backslash joins
# the next source line onto the same prompt line (no newline is emitted).
# The prompt asks the model to answer with exactly one JSON object whose
# "intent" is "teaching", "question" or "conversation".
SYSTEM_PROMPT_TEMPLATE = """\
You are an AI language assistant learning Bambara and Fula — two West African languages. \
You behave like an eager child learner: you absorb every word the user teaches you, \
and you use what you have already learned to answer questions.

YOUR CURRENT VOCABULARY (your only source of truth):
{vocabulary_context}

RESPONSE RULES — always reply with a single valid JSON object, nothing else:

1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \
"X se dit Y en bambara" / "X veut dire Y"), reply:
{{
  "intent": "teaching",
  "word": "<the word/phrase being taught>",
  "language": "<bam | ful | fr | en>",
  "translation": "<the translation given>",
  "translation_language": "<bam | ful | fr | en>",
  "response": "<warm acknowledgment in the same language the user used, \
1-2 sentences, use the word in a sentence if possible>"
}}

2. If the user is ASKING a question you can answer using the vocabulary:
{{
  "intent": "question",
  "response": "<answer using vocabulary — be honest if you don't know>"
}}

3. For general CONVERSATION or GREETING:
{{
  "intent": "conversation",
  "response": "<natural, friendly reply — 1-3 sentences>"
}}

Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\
"""


class GemmaClient:
    """Chat client for an LLM served through the HF Serverless Inference API.

    Despite the class name, the model is whatever ``model_id`` names; the
    default is ``Qwen/Qwen2.5-72B-Instruct``. The underlying
    ``huggingface_hub.InferenceClient`` is created lazily on first use so that
    constructing a ``GemmaClient`` performs no import of ``huggingface_hub``.
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
        hf_token: Optional[str] = None,
    ) -> None:
        """Store the configuration; no client is created yet.

        Args:
            model_id: Model identifier passed as ``model=`` to
                ``chat_completion``.
            hf_token: Token passed to ``InferenceClient(token=...)``; may be
                ``None``.
        """
        self.model_id = model_id
        self.hf_token = hf_token
        self._client = None  # lazy init, see _get_client()

    def _get_client(self):
        """Return the cached ``InferenceClient``, creating it on first call."""
        if self._client is None:
            # Imported here so the dependency is only needed once a request
            # is actually made.
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    def chat(self, user_text: str, vocabulary_context: str) -> dict:
        """Send one user message and return the model's structured reply.

        Args:
            user_text: The user's message, sent as the ``user`` turn.
            vocabulary_context: Text substituted into the system prompt's
                vocabulary section; a placeholder is used when it is empty.

        Returns:
            A dict with at minimum ``intent`` and ``response``. On any error
            (network, API, missing content) the dict is
            ``{"intent": "error", "response": "(LLM unavailable: <error>)"}``.
        """
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
        )

        try:
            client = self._get_client()
            completion = client.chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=512,
                temperature=0.4,
            )
            content = completion.choices[0].message.content
            if content is None:
                # Report this explicitly instead of letting ``None.strip()``
                # surface as an opaque AttributeError in the error message.
                raise ValueError("model returned no message content")
            raw = content.strip()
            logger.debug("Gemma raw response: %s", raw[:200])
            return self._parse(raw)

        except Exception as exc:
            # Deliberate catch-all boundary: callers always get a dict back.
            logger.error("GemmaClient error: %s", exc)
            return {
                "intent": "error",
                "response": f"(LLM unavailable: {exc})",
            }

    # ── Parsing ───────────────────────────────────────────────────────────────

    @staticmethod
    def _first_json_object(text: str) -> Optional[dict]:
        """Return the first JSON object (dict) that decodes in *text*, else None.

        Decoding is attempted at each ``{`` in turn with ``raw_decode``, which
        stops at the end of the object, so surrounding prose or later braces
        do not break an otherwise valid object.
        """
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find("{", start + 1)
        return None

    def _parse(self, raw: str) -> dict:
        """Extract the JSON reply from the model output.

        A JSON object inside a markdown code fence is preferred; otherwise the
        first decodable JSON object anywhere in the text is used. Missing
        ``intent`` defaults to ``"conversation"`` and missing ``response``
        defaults to the raw text. If no JSON object can be decoded (plain
        text, or JSON that is not an object such as ``42`` or ``null``), the
        raw text is returned as a ``conversation`` response.
        """
        text = raw.strip()
        data = None

        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
        if fence_match:
            data = self._first_json_object(fence_match.group(1))
        if data is None:
            data = self._first_json_object(text)

        if data is None:
            # Not structured output: treat the whole reply as conversation.
            return {"intent": "conversation", "response": raw}

        data.setdefault("intent", "conversation")
        data.setdefault("response", raw)  # fall back to raw text
        return data