Spaces:
Runtime error
Runtime error
File size: 4,211 Bytes
0daff5d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | """
Lulluna — Story Engine
========================
Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation)
via llama-cpp-python with Metal backend on M5 MacBook Air.
MiniCPM5-1B is the right model choice here:
- Text-only task (no vision needed for story generation)
- 1B parameters — ~8-second generation on M5 Air
- OpenBMB model family — qualifies for OpenBMB $5k prize
- GGUF Q4_K_M fits in ~800MB RAM
"""
import os
import logging
import time
from pathlib import Path
from dotenv import load_dotenv
# Load variables from a .env file (if one is present) before the os.getenv()
# lookups further down this module read their settings.
load_dotenv()
# Module-level logger used by everything in this file.
log = logging.getLogger("lulluna.engine")
# --- Model location ---------------------------------------------------------
# MODEL_PATH is the local GGUF file; MODEL_HF_REPO / MODEL_FILENAME identify
# the Hugging Face Hub artifact fetched when that local file is missing.
MODEL_PATH = os.environ.get("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf")
MODEL_HF_REPO = os.environ.get("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf")

# --- Runtime sizing (integers) ----------------------------------------------
# 99 layers: all layers to Metal/CUDA.
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "99"))
N_CTX = int(os.environ.get("N_CTX", "2048"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))

# --- Sampling parameters (floats) -------------------------------------------
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.75"))
REPEAT_PENALTY = float(os.environ.get("REPEAT_PENALTY", "1.1"))
class StoryEngine:
    """
    Lazily loaded wrapper around a llama-cpp-python chat model.

    Call :meth:`load` once at startup, then :meth:`generate` per request.
    No locking is performed here; the engine is intended for single-user
    local use and relies on the caller (Gradio's queue) to serialize calls.
    """

    def __init__(self):
        self._model = None     # Llama instance, set by a successful load()
        self._loaded = False   # True only after the model has been built

    def load(self) -> "StoryEngine":
        """
        Load the model. Called once at app startup.

        If the file at ``MODEL_PATH`` is missing, a download from the
        Hugging Face Hub (``MODEL_HF_REPO`` / ``MODEL_FILENAME``) is
        attempted first. A failed download is logged as a warning and
        leaves the engine not ready instead of raising.

        Returns:
            This engine, so the call can be chained.
        """
        if self._loaded:
            return self

        model_path = Path(MODEL_PATH)

        # Auto-download when running on HF Spaces (model not pre-baked into the repo)
        if not model_path.exists():
            log.info(
                "Model not found at %s β attempting HF Hub download...",
                MODEL_PATH,
            )
            try:
                from huggingface_hub import hf_hub_download

                # ``local_dir_use_symlinks`` is deliberately not passed: it is
                # deprecated and ignored by current huggingface_hub releases,
                # and passing an unsupported keyword would raise a TypeError
                # that the handler below would misreport as a failed download.
                downloaded = hf_hub_download(
                    repo_id=MODEL_HF_REPO,
                    filename=MODEL_FILENAME,
                    local_dir=str(model_path.parent),
                )
                model_path = Path(downloaded)
                log.info("β Model downloaded to %s", model_path)
            except Exception as e:
                # Best-effort: the app keeps running, generate() will raise.
                log.warning(
                    "Could not download model (%s). "
                    "Run: python download_model.py β inference will be unavailable until then.",
                    e,
                )
                return self

        from llama_cpp import Llama

        # Default of 8 was chosen for a 10-core M5 Air (leave 2 for the OS);
        # N_THREADS lets hosts with fewer cores avoid oversubscription.
        n_threads = int(os.getenv("N_THREADS", "8"))

        # Log the path actually used, which differs from MODEL_PATH after a
        # Hub download.
        log.info("Loading model: %s", model_path)
        t0 = time.perf_counter()
        self._model = Llama(
            model_path=str(model_path),
            n_gpu_layers=N_GPU_LAYERS,  # Metal acceleration on M5
            n_ctx=N_CTX,
            n_threads=n_threads,
            verbose=False,
            seed=42,
        )
        self._loaded = True
        log.info("β Model loaded in %.1fs", time.perf_counter() - t0)
        return self

    @property
    def ready(self) -> bool:
        """True once load() has succeeded and a model object is held."""
        return self._loaded and self._model is not None

    def generate(self, system_prompt: str, user_message: str) -> str:
        """
        Run a single generation. Returns the raw model output string.
        Caller is responsible for parsing.

        Args:
            system_prompt: Content of the ``system`` chat message.
            user_message: Content of the ``user`` chat message.

        Returns:
            The assistant message content with surrounding whitespace
            stripped; an empty string if the model returned no content.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if not self.ready:
            raise RuntimeError(
                "Model not loaded. "
                "Run 'python download_model.py' first, then restart."
            )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        # perf_counter is monotonic, so elapsed can never go negative.
        t0 = time.perf_counter()
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stop=["<|im_end|>", "[END]", "---"],
        )
        elapsed = time.perf_counter() - t0

        # ``content`` may be None; treat that as empty output rather than
        # crashing on ``None.strip()``.
        content = response["choices"][0]["message"].get("content") or ""
        text = content.strip()

        # Usage statistics are only used for logging, so tolerate absence.
        usage = response.get("usage") or {}
        tokens = usage.get("completion_tokens") or 0
        rate = tokens / elapsed if elapsed > 0 else 0.0
        log.info(
            "Generated %d tokens in %.1fs (%.0f tok/s)", tokens, elapsed, rate
        )
        return text
|