File size: 4,813 Bytes
d456104
 
 
9e2482d
 
 
 
 
d456104
 
 
9e2482d
 
 
 
d456104
 
 
 
 
9e2482d
d456104
 
 
 
 
37351ab
d456104
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2482d
d456104
9e2482d
d456104
 
 
 
 
 
 
 
 
9e2482d
d456104
 
 
 
 
9e2482d
d456104
 
 
 
 
 
 
 
4cf1913
d456104
 
 
 
 
 
9e2482d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cf1913
9e2482d
 
 
 
 
 
 
 
 
d456104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e2482d
37351ab
9e2482d
3779ff3
9e2482d
3779ff3
 
9e2482d
 
d456104
 
 
9e2482d
d456104
3779ff3
9e2482d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
llm_handler.py
--------------
Loads and runs the open-source LLM (Phi-2 GGUF) via llama-cpp-python.

Step 3 Enhancement:
- Added generate_stream() which yields tokens one by one for streaming UI.
- generate() kept unchanged — still used by non-streaming code paths.

Design decisions
----------------
* GGUF 4-bit quantisation (Q4_K_M) keeps RAM usage low.
* Model downloaded via HuggingFace Hub global cache (~/.cache/huggingface/hub/)
  which persists between Space restarts on code-only pushes — no re-download.
* GPU layers default to 0 (CPU-only) but can be set via LLM_N_GPU_LAYERS env var.
"""

import logging
import os
from pathlib import Path
from typing import Generator

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

from app.config import (
    LLM_CACHE_DIR,
    LLM_CONTEXT_LEN,
    LLM_MAX_TOKENS,
    LLM_MODEL_FILE,
    LLM_MODEL_REPO,
    LLM_N_GPU_LAYERS,
    LLM_N_THREADS,
    LLM_TEMPERATURE,
)

# Module-level logger named after this module, so its records can be
# filtered or configured independently of the rest of the application.
logger = logging.getLogger(__name__)


class LLMHandler:
    """
    Wraps llama-cpp-python to provide generate() and generate_stream() interfaces.

    The model is loaded lazily: constructing the handler is cheap, and the
    first inference call resolves the model file (local copy or Hub download)
    and loads it. The loaded ``Llama`` instance is then reused by every later
    call on the same handler.
    """

    # Sequences that end a completion. Defined once so the blocking and the
    # streaming code paths cannot drift apart.
    _STOP_SEQUENCES: tuple[str, ...] = ("Sources:", "</s>")

    def __init__(self) -> None:
        # Populated on first use by _get_or_load_model(); None means "not loaded".
        self._llm: Llama | None = None

    # ── Public API ───────────────────────────────────────────────────────────

    def generate(self, prompt: str) -> str:
        """
        Run inference on the given prompt and return the full generated text.

        Errors raised while locating, downloading, or loading the model, or by
        the model call itself, are not caught here and propagate to the caller.

        Args:
            prompt: Fully formatted RAG prompt string.

        Returns:
            Generated answer string (stripped of leading/trailing whitespace).
        """
        llm = self._get_or_load_model()
        logger.debug("Running LLM inference (prompt length=%d chars) …", len(prompt))

        output = llm(prompt, **self._completion_kwargs())
        answer = output["choices"][0]["text"].strip()
        logger.debug("LLM generated %d chars.", len(answer))
        return answer

    def generate_stream(self, prompt: str) -> Generator[str, None, None]:
        """
        Run inference and yield text pieces as the model generates them.

        Uses the same sampling settings as generate(); the only difference in
        the model call is ``stream=True``. Because this is a generator
        function, nothing (including lazy model loading) happens until the
        caller starts iterating.

        Args:
            prompt: Fully formatted RAG prompt string.

        Yields:
            Non-empty text fragments in the order the model produces them.
        """
        llm = self._get_or_load_model()
        logger.debug(
            "Running streaming LLM inference (prompt length=%d chars) …", len(prompt)
        )

        chunks = llm(prompt, stream=True, **self._completion_kwargs())

        for chunk in chunks:
            token = chunk["choices"][0]["text"]
            if token:  # skip empty fragments so consumers never see ""
                yield token

    # ── Private helpers ──────────────────────────────────────────────────────

    @classmethod
    def _completion_kwargs(cls) -> dict[str, object]:
        """
        Build the keyword arguments shared by every completion call.

        Returns:
            A fresh dict (safe for the callee to mutate) holding the token
            limit, temperature, stop sequences and ``echo=False``.
        """
        return {
            "max_tokens": LLM_MAX_TOKENS,
            "temperature": LLM_TEMPERATURE,
            # Hand out a new list each time; the class-level tuple stays immutable.
            "stop": list(cls._STOP_SEQUENCES),
            "echo": False,
        }

    def _get_or_load_model(self) -> Llama:
        """
        Return the cached ``Llama`` instance, loading it on first use.

        Returns:
            The loaded model, configured from the LLM_* settings.
        """
        if self._llm is None:
            model_path = self._download_model()
            logger.info("Loading LLM from '%s' …", model_path)
            self._llm = Llama(
                model_path=str(model_path),
                n_ctx=LLM_CONTEXT_LEN,
                n_threads=LLM_N_THREADS,
                n_gpu_layers=LLM_N_GPU_LAYERS,
                verbose=False,
            )
            logger.info("LLM ready.")
        return self._llm

    @staticmethod
    def _download_model() -> Path:
        """
        Locate the model file, downloading it from the Hub only if necessary.

        A copy at ``LLM_CACHE_DIR / LLM_MODEL_FILE`` is preferred. Otherwise
        the file is fetched with ``hf_hub_download`` from ``LLM_MODEL_REPO``,
        authenticating with the ``HF_TOKEN`` environment variable when it is
        set. No ``cache_dir`` is passed, so the library's default cache
        location is used for the download.

        Returns:
            Path to the model file on disk.
        """
        # Use the locally cached model when present — no download needed.
        local_path = Path(LLM_CACHE_DIR) / LLM_MODEL_FILE

        # is_file() rather than exists(): a directory with the model's name
        # must not be handed to the loader as if it were the model.
        if local_path.is_file():
            logger.info("Model found locally at '%s'.", local_path)
            return local_path

        # Fallback — download from HuggingFace Hub if not found locally.
        logger.info("Local model not found, downloading from HuggingFace Hub …")
        downloaded = hf_hub_download(
            repo_id=LLM_MODEL_REPO,
            filename=LLM_MODEL_FILE,
            token=os.environ.get("HF_TOKEN"),
        )
        logger.info("Model downloaded to '%s'.", downloaded)
        return Path(downloaded)