""" Lulluna — Story Engine ======================== Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation) via llama-cpp-python with Metal backend on M5 MacBook Air. MiniCPM5-1B is the right model choice here: - Text-only task (no vision needed for story generation) - 1B parameters → ~8 second generation on M5 Air - OpenBMB model family → qualifies for OpenBMB $5k prize - GGUF Q4_K_M fits in ~800MB RAM """ import os import logging import time from pathlib import Path from dotenv import load_dotenv load_dotenv() log = logging.getLogger("lulluna.engine") MODEL_PATH = os.getenv("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf") MODEL_HF_REPO = os.getenv("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF") MODEL_FILENAME = os.getenv("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf") N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "99")) # all layers to Metal/CUDA N_CTX = int(os.getenv("N_CTX", "2048")) MAX_TOKENS = int(os.getenv("MAX_TOKENS", "1024")) TEMPERATURE = float(os.getenv("TEMPERATURE", "0.75")) REPEAT_PENALTY = float(os.getenv("REPEAT_PENALTY", "1.1")) class StoryEngine: """ Loads once at startup, generates stories on demand. Thread-safe for single-user local use (Gradio handles queuing). """ def __init__(self): self._model = None self._loaded = False def load(self) -> "StoryEngine": """Load the model. Called once at app startup.""" if self._loaded: return self model_path = Path(MODEL_PATH) # Auto-download when running on HF Spaces (model not pre-baked into the repo) if not model_path.exists(): log.info(f"Model not found at {MODEL_PATH} — attempting HF Hub download...") try: from huggingface_hub import hf_hub_download downloaded = hf_hub_download( repo_id=MODEL_HF_REPO, filename=MODEL_FILENAME, local_dir=str(model_path.parent), local_dir_use_symlinks=False, ) model_path = Path(downloaded) log.info(f"✓ Model downloaded to {model_path}") except Exception as e: log.warning( f"Could not download model ({e}). " "Run: python download_model.py — inference will be unavailable until then." ) return self from llama_cpp import Llama log.info(f"Loading model: {MODEL_PATH}") t0 = time.time() self._model = Llama( model_path=str(model_path), n_gpu_layers=N_GPU_LAYERS, # Metal acceleration on M5 n_ctx=N_CTX, n_threads=8, # M5 Air has 10 cores, leave 2 for OS verbose=False, seed=42, ) self._loaded = True log.info(f"✓ Model loaded in {time.time()-t0:.1f}s") return self @property def ready(self) -> bool: return self._loaded and self._model is not None def generate(self, system_prompt: str, user_message: str) -> str: """ Run a single generation. Returns the raw model output string. Caller is responsible for parsing. """ if not self.ready: raise RuntimeError( "Model not loaded. " "Run 'python download_model.py' first, then restart." ) messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}, ] t0 = time.time() response = self._model.create_chat_completion( messages=messages, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, repeat_penalty=REPEAT_PENALTY, stop=["<|im_end|>", "[END]", "---"], ) elapsed = time.time() - t0 text = response["choices"][0]["message"]["content"].strip() tokens = response["usage"]["completion_tokens"] log.info(f"Generated {tokens} tokens in {elapsed:.1f}s ({tokens/elapsed:.0f} tok/s)") return text