"""
llm_handler.py
--------------
Loads and runs the open-source LLM (Phi-2 GGUF) via llama-cpp-python.
Step 3 Enhancement:
- Added generate_stream() which yields tokens one by one for streaming UI.
- generate() kept unchanged — still used by non-streaming code paths.
Design decisions
----------------
* GGUF 4-bit quantisation (Q4_K_M) keeps RAM usage low.
* Model downloaded via HuggingFace Hub global cache (~/.cache/huggingface/hub/)
which persists between Space restarts on code-only pushes — no re-download.
* GPU layers default to 0 (CPU-only) but can be set via LLM_N_GPU_LAYERS env var.
"""
import logging
import os
from pathlib import Path
from typing import Generator
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from app.config import (
LLM_CACHE_DIR,
LLM_CONTEXT_LEN,
LLM_MAX_TOKENS,
LLM_MODEL_FILE,
LLM_MODEL_REPO,
LLM_N_GPU_LAYERS,
LLM_N_THREADS,
LLM_TEMPERATURE,
)
logger = logging.getLogger(__name__)
class LLMHandler:
    """
    Wraps llama-cpp-python to provide generate() and generate_stream() interfaces.

    The model is lazily loaded on the first inference call to avoid blocking
    UI startup; the loaded ``Llama`` instance is then cached on the handler
    and reused by every later call.

    NOTE(review): the lazy load is not guarded by a lock. If a fresh handler
    can be hit from several threads at once the model may be loaded more than
    once — confirm how callers share this object.
    """

    # Stop sequences handed to llama-cpp for every completion. Kept in one
    # place so the streaming and non-streaming paths cannot drift apart.
    _STOP_SEQUENCES = ("Sources:", "</s>")

    def __init__(self) -> None:
        # Populated by _get_or_load_model() on first use; None until then.
        self._llm: Llama | None = None

    # ── Public API ───────────────────────────────────────────────────────────

    def generate(self, prompt: str) -> str:
        """
        Run inference on the given prompt and return the full generated text.

        Args:
            prompt: Fully formatted RAG prompt string.

        Returns:
            Generated answer string (stripped of leading/trailing whitespace).
        """
        llm = self._get_or_load_model()
        logger.debug("Running LLM inference (prompt length=%d chars) …", len(prompt))
        output = self._complete(llm, prompt, stream=False)
        answer = output["choices"][0]["text"].strip()
        logger.debug("LLM generated %d chars.", len(answer))
        return answer

    def generate_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Run inference and yield text pieces one by one as the model emits them.

        Uses the same completion settings as generate(); the only difference
        is that the completion is requested with ``stream=True`` and each
        chunk's text is yielded instead of being returned in one string.

        Because this is a generator function, nothing runs (including the
        lazy model load) until the caller starts iterating.

        Args:
            prompt: Fully formatted RAG prompt string.

        Yields:
            Non-empty text fragments in the order the model produces them.
        """
        llm = self._get_or_load_model()
        logger.debug(
            "Running streaming LLM inference (prompt length=%d chars) …", len(prompt)
        )
        for chunk in self._complete(llm, prompt, stream=True):
            token = chunk["choices"][0]["text"]
            if token:  # skip empty strings so consumers never see a blank piece
                yield token

    # ── Private helpers ──────────────────────────────────────────────────────

    @classmethod
    def _complete(cls, llm: Llama, prompt: str, *, stream: bool):
        """
        Invoke the model with the completion settings shared by both public paths.

        Args:
            llm: Loaded model to call.
            prompt: Fully formatted prompt string.
            stream: Forwarded to llama-cpp; False for one response object,
                True for an iterable of chunks.

        Returns:
            Whatever ``llm(...)`` returns for the requested mode.
        """
        return llm(
            prompt,
            max_tokens=LLM_MAX_TOKENS,
            temperature=LLM_TEMPERATURE,
            stop=list(cls._STOP_SEQUENCES),
            echo=False,
            stream=stream,
        )

    def _get_or_load_model(self) -> Llama:
        """
        Return the cached model, loading it on the first call.

        Returns:
            The ``Llama`` instance stored on this handler.
        """
        if self._llm is None:
            model_path = self._download_model()
            logger.info("Loading LLM from '%s' …", model_path)
            self._llm = Llama(
                model_path=str(model_path),
                n_ctx=LLM_CONTEXT_LEN,
                n_threads=LLM_N_THREADS,
                n_gpu_layers=LLM_N_GPU_LAYERS,
                verbose=False,
            )
            logger.info("LLM ready.")
        return self._llm

    @staticmethod
    def _download_model() -> Path:
        """
        Locate the model file, downloading it only when no local copy exists.

        Looks for ``LLM_MODEL_FILE`` inside ``LLM_CACHE_DIR`` first and falls
        back to ``hf_hub_download`` (authenticated with the ``HF_TOKEN``
        environment variable when it is set).

        Returns:
            Path to the model file on disk.
        """
        local_path = Path(LLM_CACHE_DIR) / LLM_MODEL_FILE
        # is_file() rather than exists(): a directory with the model's name
        # must not be handed to the loader as if it were the model file.
        if local_path.is_file():
            logger.info("Model found locally at '%s'.", local_path)
            return local_path

        # Fallback — download from HuggingFace Hub if not found locally.
        logger.info("Local model not found, downloading from HuggingFace Hub …")
        downloaded = hf_hub_download(
            repo_id=LLM_MODEL_REPO,
            filename=LLM_MODEL_FILE,
            token=os.environ.get("HF_TOKEN"),
        )
        logger.info("Model downloaded to '%s'.", downloaded)
        return Path(downloaded)