File size: 4,211 Bytes
0daff5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Lulluna — Story Engine
========================
Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation)
via llama-cpp-python with Metal backend on M5 MacBook Air.

MiniCPM5-1B is the right model choice here:
  - Text-only task (no vision needed for story generation)
  - 1B parameters → ~8 second generation on M5 Air
  - OpenBMB model family → qualifies for OpenBMB $5k prize
  - GGUF Q4_K_M fits in ~800MB RAM
"""

import os
import logging
import time
from pathlib import Path

from dotenv import load_dotenv

# Read a .env file (if one is present) before the settings below are resolved.
load_dotenv()

log = logging.getLogger("lulluna.engine")

# --- Where the model lives -------------------------------------------------
# Local GGUF file, plus the Hub repo/filename used when that file is missing.
MODEL_PATH = os.environ.get("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf")
MODEL_HF_REPO = os.environ.get("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf")

# --- Runtime / sampling settings -------------------------------------------
# Every value can be overridden through the environment; the defaults below
# apply otherwise. A non-numeric override raises ValueError at import time.
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "99"))  # all layers to Metal/CUDA
N_CTX = int(os.environ.get("N_CTX", "2048"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.75"))
REPEAT_PENALTY = float(os.environ.get("REPEAT_PENALTY", "1.1"))


class StoryEngine:
    """
    Wrapper around a llama-cpp-python ``Llama`` model for story generation.

    Call ``load()`` once at startup, then ``generate()`` per request.
    No locking is done here: the engine is intended for single-user local
    use, with the caller (Gradio's queue) serializing requests.
    """

    def __init__(self):
        self._model = None      # the Llama instance once load() has succeeded
        self._loaded = False    # flipped to True only after the model is built

    def load(self) -> "StoryEngine":
        """Load the model. Called once at app startup; later calls are no-ops.

        If the GGUF file is missing, a download from the Hugging Face Hub is
        attempted first. A failed download is logged as a warning and leaves
        the engine not ready instead of raising, so the app can still start.

        Returns:
            This engine, so the call can be chained.
        """
        if self._loaded:
            return self

        model_path = Path(MODEL_PATH)

        # Auto-download when running on HF Spaces (model not pre-baked into the repo)
        if not model_path.exists():
            log.info(f"Model not found at {MODEL_PATH} — attempting HF Hub download...")
            try:
                from huggingface_hub import hf_hub_download
                # NOTE(review): local_dir_use_symlinks appears to be deprecated in
                # recent huggingface_hub releases — confirm before removing it.
                downloaded = hf_hub_download(
                    repo_id=MODEL_HF_REPO,
                    filename=MODEL_FILENAME,
                    local_dir=str(model_path.parent),
                    local_dir_use_symlinks=False,
                )
                model_path = Path(downloaded)
                log.info(f"✓ Model downloaded to {model_path}")
            except Exception as e:
                # Deliberate best-effort: a missing dependency or network
                # failure must not crash startup.
                log.warning(
                    f"Could not download model ({e}). "
                    "Run: python download_model.py — inference will be unavailable until then."
                )
                return self

        # Imported lazily so the module can be imported without llama-cpp-python.
        from llama_cpp import Llama

        # Log the path actually used; after a download it may differ from MODEL_PATH.
        log.info(f"Loading model: {model_path}")
        t0 = time.perf_counter()

        self._model = Llama(
            model_path=str(model_path),
            n_gpu_layers=N_GPU_LAYERS,   # Metal acceleration on M5
            n_ctx=N_CTX,
            # Default of 8 targets the M5 Air (10 cores, leave 2 for OS);
            # set N_THREADS to override on hosts with fewer cores.
            n_threads=int(os.getenv("N_THREADS", "8")),
            verbose=False,
            seed=42,
        )
        self._loaded = True
        log.info(f"✓ Model loaded in {time.perf_counter()-t0:.1f}s")
        return self

    @property
    def ready(self) -> bool:
        """True once ``load()`` has successfully constructed the model."""
        return self._loaded and self._model is not None

    def generate(self, system_prompt: str, user_message: str) -> str:
        """
        Run a single generation. Returns the raw model output string.
        Caller is responsible for parsing.

        Args:
            system_prompt: Content of the ``system`` chat message.
            user_message: Content of the ``user`` chat message.

        Returns:
            The completion text with surrounding whitespace stripped, or an
            empty string when the model returns no content.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if not self.ready:
            raise RuntimeError(
                "Model not loaded. "
                "Run 'python download_model.py' first, then restart."
            )

        messages = [
            {"role": "system",  "content": system_prompt},
            {"role": "user",    "content": user_message},
        ]

        t0 = time.perf_counter()
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stop=["<|im_end|>", "[END]", "---"],
        )
        elapsed = time.perf_counter() - t0

        message = response["choices"][0]["message"]
        # The message content is optional; treat a missing/None value as empty
        # text rather than crashing on .strip().
        text = (message.get("content") or "").strip()

        usage = response.get("usage") or {}
        tokens = usage.get("completion_tokens", 0)
        # Guard the rate: a very fast call can measure as zero elapsed time.
        rate = tokens / elapsed if elapsed > 0 else 0.0
        log.info(f"Generated {tokens} tokens in {elapsed:.1f}s ({rate:.0f} tok/s)")

        return text