Spaces:

build-small-hackathon
/

lulluna

Runtime error

App Files Files Community

lulluna / engine.py

mbkv

Initial deployment — Lulluna bedtime story weaver

0daff5d 15 days ago

Raw

History Blame Contribute Delete

4.21 kB

	"""
	Lulluna — Story Engine
	========================
	Loads MiniCPM5-1B (text-only, faster than MiniCPM-V for pure generation)
	via llama-cpp-python with Metal backend on M5 MacBook Air.

	MiniCPM5-1B is the right model choice here:
	- Text-only task (no vision needed for story generation)
	- 1B parameters → ~8-second generation on M5 Air
	- OpenBMB model family → qualifies for OpenBMB $5k prize
	- GGUF Q4_K_M fits in ~800MB RAM
	"""

	import os
	import logging
	import time
	from pathlib import Path

	from dotenv import load_dotenv

	load_dotenv()
	# Module-level logger shared by everything in the story engine.
	log = logging.getLogger("lulluna.engine")

	# --- Model location -------------------------------------------------------
	# Local GGUF path that is tried first; the Hub repo/filename pair is the
	# fallback used when that local file does not exist.
	MODEL_PATH = os.environ.get("MODEL_PATH", "./models/MiniCPM5-1B-Q4_K_M.gguf")
	MODEL_HF_REPO = os.environ.get("MODEL_HF_REPO", "openbmb/MiniCPM5-1B-GGUF")
	MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q4_K_M.gguf")

	# --- Runtime / sampling knobs ---------------------------------------------
	# Every value can be overridden through the environment; the defaults are
	# parsed from strings so an override and a default take the same code path.
	N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "99"))  # all layers to Metal/CUDA
	N_CTX = int(os.environ.get("N_CTX", "2048"))
	MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))
	TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.75"))
	REPEAT_PENALTY = float(os.environ.get("REPEAT_PENALTY", "1.1"))


	class StoryEngine:
	    """
	    Lazily loaded wrapper around a llama-cpp-python chat model.

	    Call ``load()`` once at startup, then ``generate()`` for each request.
	    When the GGUF file is missing and cannot be downloaded, ``load()`` logs
	    a warning and leaves the engine in the "not ready" state instead of
	    raising, so the hosting app can still start.

	    No locking is performed here; the engine is intended for single-user
	    local use where the UI layer (Gradio) queues requests.
	    """

	    # Sequences that end a generation early. Kept as an immutable tuple and
	    # copied to a list per call so one request cannot alter the next.
	    _STOP_SEQUENCES = ("<|im_end|>", "[END]", "---")

	    # Upper bound on inference threads (M5 Air has 10 cores, leave 2 for OS).
	    _MAX_THREADS = 8

	    def __init__(self):
	        """Create an empty engine; no model is loaded until ``load()``."""
	        self._model = None
	        self._loaded = False

	    def load(self) -> "StoryEngine":
	        """
	        Load the model. Called once at app startup.

	        Returns:
	            ``self`` in every case, so calls can be chained. Check ``ready``
	            to find out whether a model is actually available.

	        Raises:
	            Whatever ``llama_cpp.Llama`` raises when the model file exists
	            but cannot be loaded. Download failures are logged, not raised.
	        """
	        if self._loaded:
	            return self

	        model_path = Path(MODEL_PATH)

	        # Auto-download when running on HF Spaces (model not pre-baked into the repo)
	        if not model_path.exists():
	            model_path = self._download_model(model_path)
	            if model_path is None:
	                return self

	        # Imported lazily so the module can be imported without llama_cpp.
	        from llama_cpp import Llama

	        # Log the path that is really loaded; after a Hub download it can
	        # differ from the configured MODEL_PATH.
	        log.info("Loading model: %s", model_path)
	        started = time.perf_counter()

	        # Never request more threads than the host reports, otherwise small
	        # machines get oversubscribed.
	        n_threads = max(1, min(self._MAX_THREADS, os.cpu_count() or self._MAX_THREADS))

	        self._model = Llama(
	            model_path=str(model_path),
	            n_gpu_layers=N_GPU_LAYERS,  # Metal acceleration on M5
	            n_ctx=N_CTX,
	            n_threads=n_threads,
	            verbose=False,
	            seed=42,
	        )
	        self._loaded = True
	        log.info("✓ Model loaded in %.1fs", time.perf_counter() - started)
	        return self

	    @staticmethod
	    def _download_model(target: Path) -> "Path | None":
	        """Fetch the GGUF file from the HF Hub next to *target*; None on failure."""
	        log.info("Model not found at %s — attempting HF Hub download...", MODEL_PATH)
	        try:
	            from huggingface_hub import hf_hub_download

	            # NOTE(review): recent huggingface_hub releases reportedly
	            # deprecate/ignore ``local_dir_use_symlinks`` — confirm against
	            # the pinned version before removing it.
	            downloaded = hf_hub_download(
	                repo_id=MODEL_HF_REPO,
	                filename=MODEL_FILENAME,
	                local_dir=str(target.parent),
	                local_dir_use_symlinks=False,
	            )
	        except Exception as e:
	            # Deliberately broad: a failed download (missing package, no
	            # network, bad repo id) must not stop the app from starting.
	            log.warning(
	                "Could not download model (%s). "
	                "Run: python download_model.py — inference will be unavailable until then.",
	                e,
	            )
	            return None

	        model_path = Path(downloaded)
	        log.info("✓ Model downloaded to %s", model_path)
	        return model_path

	    @property
	    def ready(self) -> bool:
	        """True once ``load()`` has succeeded and a model object is held."""
	        return self._loaded and self._model is not None

	    def generate(self, system_prompt: str, user_message: str) -> str:
	        """
	        Run a single generation. Returns the raw model output string.
	        Caller is responsible for parsing.

	        Args:
	            system_prompt: Content of the ``system`` chat message.
	            user_message: Content of the ``user`` chat message.

	        Returns:
	            The completion text with surrounding whitespace stripped, or an
	            empty string when the model returns no content.

	        Raises:
	            RuntimeError: If the model has not been loaded.
	        """
	        if not self.ready:
	            raise RuntimeError(
	                "Model not loaded. "
	                "Run 'python download_model.py' first, then restart."
	            )

	        messages = [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_message},
	        ]

	        started = time.perf_counter()
	        response = self._model.create_chat_completion(
	            messages=messages,
	            max_tokens=MAX_TOKENS,
	            temperature=TEMPERATURE,
	            repeat_penalty=REPEAT_PENALTY,
	            stop=list(self._STOP_SEQUENCES),
	        )
	        elapsed = time.perf_counter() - started

	        # ``content`` may be None; treat that as an empty completion rather
	        # than failing on ``None.strip()``.
	        content = response["choices"][0]["message"]["content"]
	        text = (content or "").strip()

	        tokens = response["usage"]["completion_tokens"]
	        # Guard the throughput figure against a zero-length timing interval.
	        rate = tokens / elapsed if elapsed > 0 else 0.0
	        log.info("Generated %d tokens in %.1fs (%.0f tok/s)", tokens, elapsed, rate)

	        return text