"""
GemmaClient — wraps the HuggingFace Serverless Inference API for a chat LLM
(Gemma by name; the default model is Qwen, see below).
The system prompt implements the 'adult-child' logic:
  - The LLM is a child learning Bambara/Fula from the user (adult/teacher)
  - vocabulary.jsonl is its primary memory / source of truth
  - It detects TEACHING intent and returns structured JSON so MemoryManager
    can persist the new word
  - It answers QUESTIONS using the vocabulary it has learned
Model: configurable via LLM_MODEL_ID env var.
Default: Qwen/Qwen2.5-72B-Instruct — reliably available on HF Serverless free tier.
Tested models that work on HF Serverless (no paid provider needed):
    Qwen/Qwen2.5-72B-Instruct           ← default, best quality
    Qwen/Qwen2.5-7B-Instruct            ← faster, slightly lower quality
    mistralai/Mistral-7B-Instruct-v0.3
    HuggingFaceH4/zephyr-7b-beta
Note: google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# System prompt sent as the "system" message by GemmaClient.chat, which fills
# the single {vocabulary_context} placeholder via str.format.
# - Literal braces in the JSON examples are doubled ({{ and }}) so that
#   str.format emits them as single braces instead of treating them as fields.
# - A trailing backslash inside the literal joins the next line without
#   inserting a newline, so long sentences stay on one line of the prompt.
SYSTEM_PROMPT_TEMPLATE = """\
You are an AI language assistant learning Bambara and Fula — two West African languages. \
You behave like an eager child learner: you absorb every word the user teaches you, \
and you use what you have already learned to answer questions.
YOUR CURRENT VOCABULARY (your only source of truth):
{vocabulary_context}
RESPONSE RULES — always reply with a single valid JSON object, nothing else:
1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \
"X se dit Y en bambara" / "X veut dire Y"), reply:
{{
"intent": "teaching",
"word": "<the word/phrase being taught>",
"language": "<bam | ful | fr | en>",
"translation": "<the translation given>",
"translation_language": "<bam | ful | fr | en>",
"response": "<warm acknowledgment in the same language the user used, \
1-2 sentences, use the word in a sentence if possible>"
}}
2. If the user is ASKING a question you can answer using the vocabulary:
{{
"intent": "question",
"response": "<answer using vocabulary — be honest if you don't know>"
}}
3. For general CONVERSATION or GREETING:
{{
"intent": "conversation",
"response": "<natural, friendly reply — 1-3 sentences>"
}}
Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\
"""
class GemmaClient:
    """Chat client for an instruction-tuned LLM on the HF Serverless Inference API.

    Despite the name, the model used is whatever ``model_id`` selects (the
    default is a Qwen model). Each :meth:`chat` call sends one system message
    built from ``SYSTEM_PROMPT_TEMPLATE`` plus one user message, and returns
    the model's reply parsed into a dict.
    """

    # A JSON object wrapped in a markdown code fence, with or without "json".
    _FENCE_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL)
    # raw_decode() parses one JSON value at an offset and ignores what follows.
    _DECODER = json.JSONDecoder()

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
        hf_token: Optional[str] = None,
    ) -> None:
        """Store the configuration; no network or import work happens here.

        Args:
            model_id: Model identifier passed as ``model`` on every request.
            hf_token: Token handed to ``InferenceClient``; may be ``None``.
        """
        self.model_id = model_id
        self.hf_token = hf_token
        self._client = None  # created on first use by _get_client()

    def _get_client(self):
        """Return the cached ``InferenceClient``, creating it on first call.

        ``huggingface_hub`` is imported here rather than at module level, so
        the module can be imported without that package being loaded.
        """
        if self._client is None:
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    def chat(self, user_text: str, vocabulary_context: str) -> dict:
        """Send one user message and return the model's structured reply.

        Args:
            user_text: The user's message, sent as the "user" turn.
            vocabulary_context: Text substituted into the system prompt; a
                falsy value is replaced by ``"(no vocabulary yet)"``.

        Returns:
            A dict with at least ``intent`` and ``response``. If the request
            or the handling of its result raises, the dict is
            ``{"intent": "error", "response": "(LLM unavailable: ...)"}``;
            this method does not propagate exceptions from the call.
        """
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
        )
        try:
            client = self._get_client()
            completion = client.chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=512,
                temperature=0.4,
            )
            raw = completion.choices[0].message.content.strip()
            logger.debug("Gemma raw response: %s", raw[:200])
            return self._parse(raw)
        except Exception as exc:
            # Boundary handler: callers get an error dict instead of an
            # exception. logger.exception keeps the traceback in the log.
            logger.exception("GemmaClient error: %s", exc)
            return {
                "intent": "error",
                "response": f"(LLM unavailable: {exc})",
            }

    # ── Parsing ───────────────────────────────────────────────────────────────
    def _parse(self, raw: str) -> dict:
        """Turn raw model output into a dict with ``intent`` and ``response``.

        A JSON object is looked for first inside a markdown code fence, then
        anywhere in the text. Missing keys are defaulted: ``intent`` to
        ``"conversation"`` and ``response`` to the raw text. When no JSON
        object can be decoded (plain prose, or JSON that is not an object
        such as a bare string, number or list), the raw text is returned as a
        conversation reply.
        """
        data = self._extract_json_object(raw.strip())
        if data is None:
            return {"intent": "conversation", "response": raw}
        data.setdefault("intent", "conversation")
        data.setdefault("response", raw)  # fall back to raw text
        return data

    @classmethod
    def _extract_json_object(cls, text: str) -> Optional[dict]:
        """Return the first JSON object decodable from *text*, or ``None``."""
        fenced = cls._FENCE_RE.search(text)
        if fenced:
            try:
                data = json.loads(fenced.group(1))
            except json.JSONDecodeError:
                pass  # malformed fence content: fall through to the scan
            else:
                if isinstance(data, dict):
                    return data

        # Try each "{" as the start of an object. raw_decode stops at the end
        # of the value, so text after the object (even text containing braces)
        # does not break decoding.
        start = text.find("{")
        while start != -1:
            try:
                data, _ = cls._DECODER.raw_decode(text, start)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(data, dict):
                    return data
            start = text.find("{", start + 1)
        return None
|