Spaces:

Mobiworks
/

rag-chatbot

Running

File size: 4,813 Bytes

"""
llm_handler.py
--------------
Loads and runs the open-source LLM (Phi-2 GGUF) via llama-cpp-python.

Step 3 Enhancement:
- Added generate_stream() which yields tokens one by one for streaming UI.
- generate() kept unchanged — still used by non-streaming code paths.

Design decisions
----------------
* GGUF 4-bit quantisation (Q4_K_M) keeps RAM usage low.
* Model downloaded via HuggingFace Hub global cache (~/.cache/huggingface/hub/)
  which persists between Space restarts on code-only pushes — no re-download.
* GPU layers default to 0 (CPU-only) but can be set via LLM_N_GPU_LAYERS env var.
"""

import logging
import os
from pathlib import Path
from typing import Generator

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from app.config import (
    LLM_CACHE_DIR,
    LLM_CONTEXT_LEN,
    LLM_MAX_TOKENS,
    LLM_MODEL_FILE,
    LLM_MODEL_REPO,
    LLM_N_GPU_LAYERS,
    LLM_N_THREADS,
    LLM_TEMPERATURE,
)

logger = logging.getLogger(__name__)


class LLMHandler:
    """
    Wraps llama-cpp-python to provide generate() and generate_stream() interfaces.

    The model is lazily loaded on the first call to avoid blocking UI startup.
    """

    def __init__(self) -> None:
        self._llm: Llama | None = None

    # ── Public API ───────────────────────────────────────────────────────────

    def generate(self, prompt: str) -> str:
        """
        Run inference on the given prompt and return the full generated text.

        Args:
            prompt: Fully formatted RAG prompt string.

        Returns:
            Generated answer string (stripped of whitespace).
        """
        llm = self._get_or_load_model()
        logger.debug("Running LLM inference (prompt length=%d chars) …", len(prompt))

        output = llm(
            prompt,
            max_tokens=LLM_MAX_TOKENS,
            temperature=LLM_TEMPERATURE,
            stop=["Sources:", "</s>"],
            echo=False,
        )
        answer = output["choices"][0]["text"].strip()
        logger.debug("LLM generated %d chars.", len(answer))
        return answer

    def generate_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Run inference and yield tokens one by one as the model generates them.

        Used by chat_stream() in chatbot.py to enable word-by-word UI streaming.
        The only API difference from generate() is stream=True and yield instead
        of return. The "if token:" guard skips empty strings llama-cpp may emit.

        Args:
            prompt: Fully formatted RAG prompt string.

        Yields:
            Individual token strings as the model produces them.
        """
        llm = self._get_or_load_model()
        logger.debug(
            "Running streaming LLM inference (prompt length=%d chars) …", len(prompt)
        )

        output = llm(
            prompt,
            max_tokens=LLM_MAX_TOKENS,
            temperature=LLM_TEMPERATURE,
            stop=["Sources:", "</s>"],
            echo=False,
            stream=True,   # ← only difference from generate()
        )

        for chunk in output:
            token = chunk["choices"][0]["text"]
            if token:   # skip empty strings llama-cpp occasionally emits
                yield token

    # ── Private helpers ──────────────────────────────────────────────────────

    def _get_or_load_model(self) -> Llama:
        if self._llm is None:
            model_path = self._download_model()
            logger.info("Loading LLM from '%s' …", model_path)
            self._llm = Llama(
                model_path=str(model_path),
                n_ctx=LLM_CONTEXT_LEN,
                n_threads=LLM_N_THREADS,
                n_gpu_layers=LLM_N_GPU_LAYERS,
                verbose=False,
            )
            logger.info("LLM ready.")
        return self._llm

    @staticmethod
    def _download_model() -> Path:
        # Use locally cached model — no download needed
        local_path = Path(LLM_CACHE_DIR) / LLM_MODEL_FILE
        
        if local_path.exists():
            logger.info("Model found locally at '%s'.", local_path)
            return local_path

        # Fallback — download from HuggingFace Hub if not found locally
        logger.info("Local model not found, downloading from HuggingFace Hub …")
        downloaded = hf_hub_download(
            repo_id=LLM_MODEL_REPO,
            filename=LLM_MODEL_FILE,
            token=os.environ.get("HF_TOKEN"),
        )
        logger.info("Model downloaded to '%s'.", downloaded)
        return Path(downloaded)