Spaces:

MataStrategy
/

ground-zero

Sleeping

File size: 5,278 Bytes

"""
GemmaClient — wraps the HuggingFace Serverless Inference API for chat LLMs (originally written for Gemma; see the default model below).

The system prompt implements the 'adult-child' logic:
  - The LLM is a child learning Bambara/Fula from the user (adult/teacher)
  - vocabulary.jsonl is its primary memory / source of truth
  - It detects TEACHING intent and returns structured JSON so MemoryManager
    can persist the new word
  - It answers QUESTIONS using the vocabulary it has learned

Model: configurable via LLM_MODEL_ID env var.
Default: Qwen/Qwen2.5-72B-Instruct  — reliably available on HF Serverless free tier.

Tested models that work on HF Serverless (no paid provider needed):
  Qwen/Qwen2.5-72B-Instruct        ← default, best quality
  Qwen/Qwen2.5-7B-Instruct         ← faster, slightly lower quality
  mistralai/Mistral-7B-Instruct-v0.3
  HuggingFaceH4/zephyr-7b-beta

google/gemma-3-4b-it is NOT on the free tier — it requires a paid provider.
"""
from __future__ import annotations

import json
import logging
import re
from typing import Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System prompt template, rendered with ``str.format``.
#   - ``{vocabulary_context}`` is the only placeholder.
#   - The JSON examples use doubled braces (``{{`` / ``}}``) so that
#     ``str.format`` emits literal ``{`` / ``}`` instead of treating them as
#     replacement fields.
#   - A trailing backslash inside the literal joins the next physical line
#     onto the current one, so long sentences reach the model unbroken.
SYSTEM_PROMPT_TEMPLATE = """\
You are an AI language assistant learning Bambara and Fula — two West African languages. \
You behave like an eager child learner: you absorb every word the user teaches you, \
and you use what you have already learned to answer questions.

YOUR CURRENT VOCABULARY (your only source of truth):
{vocabulary_context}

RESPONSE RULES — always reply with a single valid JSON object, nothing else:

1. If the user is TEACHING you a word or phrase (e.g. "I ni ce means hello" / \
"X se dit Y en bambara" / "X veut dire Y"), reply:
{{
  "intent": "teaching",
  "word": "<the word/phrase being taught>",
  "language": "<bam | ful | fr | en>",
  "translation": "<the translation given>",
  "translation_language": "<bam | ful | fr | en>",
  "response": "<warm acknowledgment in the same language the user used, \
1-2 sentences, use the word in a sentence if possible>"
}}

2. If the user is ASKING a question you can answer using the vocabulary:
{{
  "intent": "question",
  "response": "<answer using vocabulary — be honest if you don't know>"
}}

3. For general CONVERSATION or GREETING:
{{
  "intent": "conversation",
  "response": "<natural, friendly reply — 1-3 sentences>"
}}

Always be warm, encouraging, and curious. If unsure of intent, choose "conversation".\
"""


class GemmaClient:
    """Chat client for an instruction-tuned LLM on the HF Serverless Inference API.

    Despite the class name, the model that is called is whatever ``model_id``
    selects (the default is ``Qwen/Qwen2.5-72B-Instruct``).  Each call sends a
    system prompt rendered from ``SYSTEM_PROMPT_TEMPLATE`` plus the user's
    message, and the model's reply is parsed into a dict that always contains
    at least ``intent`` and ``response``.
    """

    def __init__(
        self,
        model_id: str = "Qwen/Qwen2.5-72B-Instruct",
        hf_token: Optional[str] = None,
        *,
        max_tokens: int = 512,
        temperature: float = 0.4,
    ) -> None:
        """Store the configuration; no network client is created yet.

        Args:
            model_id: Model identifier passed to ``chat_completion``.
            hf_token: Token handed to ``InferenceClient``; may be ``None``.
            max_tokens: Completion length limit passed to ``chat_completion``.
            temperature: Sampling temperature passed to ``chat_completion``.
        """
        self.model_id = model_id
        self.hf_token = hf_token
        self.max_tokens = max_tokens
        self.temperature = temperature
        self._client = None  # created on first use by _get_client()

    def _get_client(self):
        """Return the cached ``InferenceClient``, creating it on first call."""
        if self._client is None:
            # Imported lazily so that importing this module does not require
            # huggingface_hub until a request is actually made.
            from huggingface_hub import InferenceClient

            self._client = InferenceClient(token=self.hf_token)
        return self._client

    def chat(self, user_text: str, vocabulary_context: str) -> dict:
        """Send one user message and return the model's structured reply.

        Args:
            user_text: The message typed by the user.
            vocabulary_context: Text inserted into the system prompt as the
                known vocabulary; a falsy value is replaced by
                ``"(no vocabulary yet)"``.

        Returns:
            A dict with at minimum ``intent`` and ``response``.  If the request
            or the parsing fails, the dict is
            ``{"intent": "error", "response": "(LLM unavailable: <reason>)"}``.
        """
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            vocabulary_context=vocabulary_context or "(no vocabulary yet)"
        )

        try:
            client = self._get_client()
            completion = client.chat_completion(
                model=self.model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text},
                ],
                max_tokens=self.max_tokens,
                temperature=self.temperature,
            )
            content = completion.choices[0].message.content
            if content is None:
                # Report a clear reason instead of the AttributeError that
                # ``None.strip()`` would otherwise produce.
                raise ValueError("model returned no content")
            raw = content.strip()
            logger.debug("Gemma raw response: %s", raw[:200])
            return self._parse(raw)

        except Exception as exc:
            # Deliberately broad: this is the boundary where every failure is
            # converted into an "error" reply.  The traceback is kept in the log.
            logger.exception("GemmaClient error: %s", exc)
            return {
                "intent": "error",
                "response": f"(LLM unavailable: {exc})",
            }

    # ── Parsing ───────────────────────────────────────────────────────────────

    @staticmethod
    def _first_json_object(text: str) -> Optional[dict]:
        """Return the first JSON object that decodes at some ``{`` in *text*.

        Tries ``raw_decode`` at each ``{`` from left to right, so trailing
        text after a valid object is ignored.  Returns ``None`` when no
        position yields a JSON object.
        """
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                obj, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(obj, dict):
                    return obj
            start = text.find("{", start + 1)
        return None

    def _parse(self, raw: str) -> dict:
        """Extract the JSON object from the model output.

        Handles a markdown code fence around the JSON, prose before or after
        it, and replies that are not a JSON object at all.

        Args:
            raw: The model's reply text.

        Returns:
            The decoded object with ``intent`` defaulting to ``"conversation"``
            and ``response`` defaulting to *raw*.  When no JSON object can be
            recovered, ``{"intent": "conversation", "response": raw}``.
        """
        text = raw.strip()
        candidate = text

        # Prefer the contents of a ```json ... ``` fence when there is one.
        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
        if fence_match:
            candidate = fence_match.group(1)
        else:
            # Otherwise take everything from the first "{" to the last "}".
            brace_match = re.search(r"\{.*\}", text, re.DOTALL)
            if brace_match:
                candidate = brace_match.group(0)

        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            # The greedy span can swallow trailing text that contains "}";
            # fall back to decoding the first complete object in the reply.
            data = self._first_json_object(text)

        # Valid JSON that is not an object (number, string, list, ...) cannot
        # carry an intent, so treat the whole reply as plain conversation.
        if not isinstance(data, dict):
            return {"intent": "conversation", "response": raw}

        data.setdefault("intent", "conversation")
        data.setdefault("response", raw)  # fall back to raw text
        return data