lulluna / engine.py
mbkv's picture
Initial deployment — Lulluna bedtime story weaver
0daff5d
Raw
History Blame Contribute Delete
4.21 kB
"""
Lulluna — Story Engine
========================
Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation)
via llama-cpp-python with Metal backend on M5 MacBook Air.
MiniCPM5-1B is the right model choice here:
- Text-only task (no vision needed for story generation)
- 1B parameters → ~8 second generation on M5 Air
- OpenBMB model family → qualifies for OpenBMB $5k prize
- GGUF Q4_K_M fits in ~800MB RAM
"""
import os
import logging
import time
from pathlib import Path
from dotenv import load_dotenv
# Read a local .env file (python-dotenv) first so the overrides below can be
# supplied there as well as through the real process environment.
load_dotenv()

log = logging.getLogger("lulluna.engine")

# --- Where the model lives ---------------------------------------------------
# Local GGUF file, plus the Hub repo/filename used as a fallback download source.
MODEL_PATH = os.environ.get("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf")
MODEL_HF_REPO = os.environ.get("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf")

# --- Runtime / sampling knobs ------------------------------------------------
# Each value is parsed eagerly, so a malformed override fails at import time.
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "99"))  # all layers to Metal/CUDA
N_CTX = int(os.environ.get("N_CTX", "2048"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.75"))
REPEAT_PENALTY = float(os.environ.get("REPEAT_PENALTY", "1.1"))
class StoryEngine:
    """
    Load the GGUF model once at startup and generate stories on demand.

    Typical use is ``StoryEngine().load()`` followed by repeated
    ``generate()`` calls. The class does no locking of its own; it is meant
    for single-user local use where the caller (Gradio's queue) serializes
    requests.
    """

    def __init__(self):
        # Backend model object; stays None until load() succeeds.
        self._model = None
        # Set to True only after the backend model has been constructed.
        self._loaded = False

    def load(self) -> "StoryEngine":
        """
        Load the model, downloading it from the HF Hub first if it is missing.

        Safe to call repeatedly: once the model is loaded, later calls return
        immediately.

        Environment:
            N_THREADS: optional CPU thread count for inference (default 8).

        Returns:
            ``self``, so the call can be chained. If the model file is absent
            and the download fails, a warning is logged and the engine is
            returned unloaded (``ready`` stays False).
        """
        if self._loaded:
            return self

        model_path = Path(MODEL_PATH)

        # Auto-download when running on HF Spaces (model not pre-baked into the repo)
        if not model_path.exists():
            log.info(
                "Model not found at %s — attempting HF Hub download...",
                MODEL_PATH,
            )
            try:
                from huggingface_hub import hf_hub_download

                downloaded = hf_hub_download(
                    repo_id=MODEL_HF_REPO,
                    filename=MODEL_FILENAME,
                    local_dir=str(model_path.parent),
                    local_dir_use_symlinks=False,
                )
            except Exception as e:
                # Deliberate best-effort: the app should still start without
                # a model; generate() will raise a clear error later.
                log.warning(
                    "Could not download model (%s). "
                    "Run: python download_model.py — inference will be "
                    "unavailable until then.",
                    e,
                )
                return self
            model_path = Path(downloaded)
            log.info("✓ Model downloaded to %s", model_path)

        # Imported lazily so the module can be imported without llama-cpp-python.
        from llama_cpp import Llama

        # Default of 8 matches the previously hard-coded value (10-core M5 Air,
        # leaving 2 cores for the OS); override via N_THREADS on other hosts.
        n_threads = int(os.getenv("N_THREADS", "8"))

        # Log the path actually being loaded; after a download it can differ
        # from MODEL_PATH.
        log.info("Loading model: %s", model_path)
        t0 = time.perf_counter()
        self._model = Llama(
            model_path=str(model_path),
            n_gpu_layers=N_GPU_LAYERS,  # Metal acceleration on M5
            n_ctx=N_CTX,
            n_threads=n_threads,
            verbose=False,
            seed=42,
        )
        self._loaded = True
        log.info("✓ Model loaded in %.1fs", time.perf_counter() - t0)
        return self

    @property
    def ready(self) -> bool:
        """True when the model has been loaded and can serve ``generate()``."""
        return self._loaded and self._model is not None

    def generate(self, system_prompt: str, user_message: str) -> str:
        """
        Run a single chat completion and return the raw model output.

        Args:
            system_prompt: content of the ``system`` message.
            user_message: content of the ``user`` message.

        Returns:
            The generated text with surrounding whitespace stripped; an empty
            string if the backend returns no content. Parsing is left to the
            caller.

        Raises:
            RuntimeError: if the model has not been loaded.
        """
        if not self.ready:
            raise RuntimeError(
                "Model not loaded. "
                "Run 'python download_model.py' first, then restart."
            )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        # perf_counter is monotonic, so the measured duration is never negative.
        t0 = time.perf_counter()
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stop=["<|im_end|>", "[END]", "---"],
        )
        elapsed = time.perf_counter() - t0

        # The content field may be None; treat that as empty output rather
        # than crashing on .strip().
        content = response["choices"][0]["message"]["content"]
        text = (content or "").strip()

        tokens = (response.get("usage") or {}).get("completion_tokens", 0)
        # Guard the throughput figure against a zero-length interval.
        rate = tokens / elapsed if elapsed > 0 else 0.0
        log.info(
            "Generated %d tokens in %.1fs (%.0f tok/s)", tokens, elapsed, rate
        )
        return text