Spaces:
Runtime error
Runtime error
| """ | |
| Lulluna — Story Engine | |
| ======================== | |
| Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation) | |
| via llama-cpp-python with Metal backend on M5 MacBook Air. | |
| MiniCPM5-1B is the right model choice here: | |
| - Text-only task (no vision needed for story generation) | |
| - 1B parameters → ~8 second generation on M5 Air | |
| - OpenBMB model family → qualifies for OpenBMB $5k prize | |
| - GGUF Q4_K_M fits in ~800MB RAM | |
| """ | |
| import os | |
| import logging | |
| import time | |
| from pathlib import Path | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Module-level logger shared by everything in the story engine.
log = logging.getLogger("lulluna.engine")

# --- Model location -------------------------------------------------------
# Every setting below can be overridden through an environment variable of
# the same name; the second argument is the fallback used when it is unset.
MODEL_PATH = os.environ.get("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf")
MODEL_HF_REPO = os.environ.get("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf")

# --- Runtime / sampling parameters ----------------------------------------
# Values arrive as strings from the environment and are converted here, so a
# malformed value raises ValueError at import time.
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "99"))  # all layers to Metal/CUDA
N_CTX = int(os.environ.get("N_CTX", "2048"))
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.75"))
REPEAT_PENALTY = float(os.environ.get("REPEAT_PENALTY", "1.1"))
class StoryEngine:
    """
    Loads the model once at startup and generates stories on demand.

    The engine holds a single llama-cpp model instance. No locking is done
    here; the original design relies on Gradio's request queuing for
    single-user local use.

    Typical use::

        engine = StoryEngine().load()
        if engine.ready():
            text = engine.generate(system_prompt, user_message)
    """

    def __init__(self):
        # The llama_cpp.Llama instance once load() succeeds, otherwise None.
        self._model = None
        # True only after the model has been constructed successfully.
        self._loaded = False

    def load(self) -> "StoryEngine":
        """
        Load the model. Called once at app startup.

        Loading is idempotent: a second call on a loaded engine is a no-op.
        If the model file is missing and cannot be downloaded, a warning is
        logged and the engine is returned unloaded (``ready()`` stays False)
        so the application can still start.

        Returns:
            This engine, to allow ``StoryEngine().load()`` chaining.
        """
        if self._loaded:
            return self

        model_path = self._resolve_model_path()
        if model_path is None:
            # Deliberate best-effort: the failure was already logged, and
            # generate() will raise a clear error if it is called.
            return self

        # Imported here rather than at module top, so importing this module
        # does not itself require llama-cpp-python.
        from llama_cpp import Llama

        # Log the path actually being loaded; after a Hub download it may
        # differ from the configured MODEL_PATH.
        log.info("Loading model: %s", model_path)
        started = time.perf_counter()
        self._model = Llama(
            model_path=str(model_path),
            n_gpu_layers=N_GPU_LAYERS,  # Metal acceleration on M5
            n_ctx=N_CTX,
            n_threads=8,  # M5 Air has 10 cores, leave 2 for OS
            verbose=False,
            seed=42,
        )
        self._loaded = True
        log.info("✓ Model loaded in %.1fs", time.perf_counter() - started)
        return self

    def _resolve_model_path(self) -> "Path | None":
        """Return the path of a usable model file, or None if unavailable.

        Uses MODEL_PATH when it exists; otherwise tries to fetch
        MODEL_FILENAME from MODEL_HF_REPO into MODEL_PATH's directory.
        """
        model_path = Path(MODEL_PATH)
        if model_path.exists():
            return model_path

        # Auto-download when running on HF Spaces (model not pre-baked into the repo)
        log.info("Model not found at %s — attempting HF Hub download...", MODEL_PATH)
        try:
            from huggingface_hub import hf_hub_download

            downloaded = hf_hub_download(
                repo_id=MODEL_HF_REPO,
                filename=MODEL_FILENAME,
                local_dir=str(model_path.parent),
                local_dir_use_symlinks=False,
            )
        except Exception as e:
            # Broad on purpose: a missing package, network failure or auth
            # error must all degrade to "model unavailable", not crash startup.
            log.warning(
                "Could not download model (%s). "
                "Run: python download_model.py — inference will be unavailable until then.",
                e,
            )
            return None

        model_path = Path(downloaded)
        log.info("✓ Model downloaded to %s", model_path)
        return model_path

    def ready(self) -> bool:
        """Return True when the model has been loaded and can generate."""
        return self._loaded and self._model is not None

    def generate(self, system_prompt: str, user_message: str) -> str:
        """
        Run a single generation. Returns the raw model output string.
        Caller is responsible for parsing.

        Args:
            system_prompt: Content of the ``system`` chat message.
            user_message: Content of the ``user`` chat message.

        Returns:
            The generated text with surrounding whitespace stripped.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        # ready is a method: it must be *called*. Testing the bound method
        # itself is always truthy and would skip this guard entirely.
        if not self.ready():
            raise RuntimeError(
                "Model not loaded. "
                "Run 'python download_model.py' first, then restart."
            )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        started = time.perf_counter()
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            repeat_penalty=REPEAT_PENALTY,
            stop=["<|im_end|>", "[END]", "---"],
        )
        elapsed = time.perf_counter() - started

        # Treat a missing/None content as an empty generation, not a crash.
        text = (response["choices"][0]["message"]["content"] or "").strip()

        # Throughput is diagnostics only: never let it fail a finished
        # generation (missing usage block, or a zero-length interval).
        tokens = (response.get("usage") or {}).get("completion_tokens", 0)
        rate = tokens / elapsed if elapsed > 0 else 0.0
        log.info("Generated %s tokens in %.1fs (%.0f tok/s)", tokens, elapsed, rate)
        return text