File size: 5,593 Bytes
5d8fd4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
LLM Engine β€” local Qwen2.5-0.5B-Instruct fallback via llama-cpp-python.

This is the bottom layer of the AnveshAI hierarchy:

    Math          β†’  math_engine         (instant, rule-based)
    Knowledge     β†’  knowledge_engine    (keyword retrieval from knowledge.txt)
      └─ no match β†’  LLMEngine.generate  (Qwen2.5-0.5B)
    Conversation  β†’  conversation_engine (pattern matching from conversation.txt)
      └─ no match β†’  LLMEngine.generate  (Qwen2.5-0.5B)

Model: Qwen/Qwen2.5-0.5B-Instruct  (Q4_K_M GGUF, ~350 MB)
    ─ Best-in-class quality at 0.5B parameters
    ─ Runs entirely on CPU via llama.cpp
    ─ Downloaded once into ~/.cache/huggingface/ on first use
    ─ Loaded LAZILY: the model only loads when first needed,
      keeping startup instant.
"""

# ---------------------------------------------------------------------------
# Model identity: HuggingFace repo and the exact GGUF file pulled from it.
# ---------------------------------------------------------------------------
MODEL_REPO   = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
MODEL_FILE   = "qwen2.5-0.5b-instruct-q4_k_m.gguf"

# Default system prompt for general (non-math) generation.
SYSTEM_PROMPT = (
    "You are AnveshAI Edge, a helpful offline AI assistant. "
    "Answer questions thoroughly and completely. Show full working steps "
    "for math or technical questions. Do not repeat the question back. "
    "If you are unsure about something, say so clearly."
)

# System prompt used when a symbolic math engine has already computed the
# answer: the LLM only narrates the steps, it must not recompute the result.
MATH_SYSTEM_PROMPT = (
    "You are a mathematics tutor. "
    "You will be given a VERIFIED ANSWER computed by a symbolic engine. "
    "That answer is 100% correct β€” do NOT change it, do NOT recompute it. "
    "Your ONLY job is to explain, step by step, HOW a student would work through "
    "the problem and arrive at that exact answer. "
    "Every step must lead logically toward the verified answer. "
    "State the verified answer word-for-word at the end of your explanation."
)

# ---------------------------------------------------------------------------
# Sampling / context defaults.
# ---------------------------------------------------------------------------
MAX_TOKENS  = 1024   # enough for detailed explanations and step-by-step answers
TEMPERATURE = 0.7
MATH_TEMPERATURE = 0.1   # near-deterministic for math explanations
TOP_P       = 0.9
N_CTX       = 16384  # match model's trained context (supports up to 32768)


class LLMEngine:
    """
    Lazy-loading wrapper around Qwen2.5-0.5B-Instruct (GGUF via llama.cpp).

    Usage:
        engine = LLMEngine()
        response = engine.generate("What is photosynthesis?")

    The GGUF model is downloaded from HuggingFace on the first call to
    generate() and cached locally. Every subsequent call reuses the
    in-memory model — no re-loading. A load failure is sticky: it is
    reported once and generate() then degrades to a plain error message
    instead of retrying (and re-failing) on every call.
    """

    def __init__(self, n_threads: int = 4) -> None:
        """
        Args:
            n_threads : CPU threads llama.cpp may use (default 4, matching
                        the previous hard-coded value).
        """
        self._llm = None                    # Llama instance once loaded
        self._n_threads: int = n_threads    # forwarded to Llama.from_pretrained
        self._loaded: bool = False          # True after a successful load
        self._failed: bool = False          # True after a load error (sticky)
        self._fail_reason: str = ""         # human-readable cause of the failure

    def is_available(self) -> bool:
        """True once the model has loaded without error."""
        return self._loaded and not self._failed

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _load(self) -> None:
        """Download (first run only) and load the GGUF model into memory.

        Idempotent: returns immediately if the model is already loaded or
        a previous attempt failed, so the expensive path runs at most once.
        """
        if self._loaded or self._failed:
            return

        try:
            print(
                f"\n  [LLM] Loading {MODEL_FILE} … "
                "(first run downloads ~350 MB, then cached locally)",
                flush=True,
            )

            # Imported lazily so the rest of the application still works
            # when llama-cpp-python is not installed; the ImportError is
            # caught below like any other load failure.
            from llama_cpp import Llama

            self._llm = Llama.from_pretrained(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                n_ctx=N_CTX,
                n_threads=self._n_threads,
                verbose=False,
            )

            self._loaded = True
            print("  [LLM] Qwen2.5-0.5B-Instruct ready\n", flush=True)

        except Exception as exc:
            # Any failure (missing package, download error, corrupt file)
            # is remembered so generate() can degrade gracefully.
            self._failed = True
            self._fail_reason = str(exc)
            print(f"  [LLM] Failed to load: {exc}\n", flush=True)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate(
        self,
        user_input: str,
        context: str = "",
        system_prompt: str = "",
        temperature: float = TEMPERATURE,
    ) -> str:
        """
        Generate a response using the local LLM.

        Args:
            user_input    : The user's message or question.
            context       : Optional retrieved text to inject as background.
            system_prompt : Override the default system prompt (e.g. for math).
            temperature   : Sampling temperature; use low values for math.

        Returns:
            The model's reply as a plain string. If the model could not be
            loaded or generation raised, a human-readable error string is
            returned instead — this method never raises.
        """
        self._load()

        if self._failed:
            return (
                "The local LLM is currently unavailable "
                f"({self._fail_reason}). "
                "Ensure 'llama-cpp-python' is installed and the model "
                "could be downloaded."
            )

        try:
            system_content = system_prompt if system_prompt else SYSTEM_PROMPT
            if context:
                system_content += f"\n\nRelevant background:\n{context}"

            messages = [
                {"role": "system", "content": system_content},
                {"role": "user",   "content": user_input},
            ]

            output = self._llm.create_chat_completion(
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=temperature,
                top_p=TOP_P,
            )

            # "content" can legitimately be None in chat-completion
            # responses (e.g. when the model emits only a tool call);
            # coerce to "" so .strip() cannot raise and the caller gets
            # an empty string rather than a spurious error message.
            content = output["choices"][0]["message"].get("content") or ""
            return content.strip()

        except Exception as exc:
            # Never let a generation failure crash the caller; surface it
            # as text, consistent with the load-failure path above.
            return f"LLM generation error: {exc}"