Spaces:
Running
Running
Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)
414dc55 | """In-process llama.cpp backend via ``llama-cpp-python``. | |
| The shipped runtime: fully in-process on the CPU - no server, no GPU, no network. The | |
| model is loaded once (lazily, on first use) and reused for every call. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from collections.abc import Iterator | |
| from pathlib import Path | |
| from ..config import Settings | |
| from .backend import GenParams, LLMError | |
class LlamaCppBackend:
    """Wraps a single ``llama_cpp.Llama`` instance, loaded lazily on first use.

    Construction is cheap: the weights are only read the first time
    ``generate`` or ``stream`` needs the model, and the same ``Llama`` object
    is reused for every later call.
    """

    def __init__(self, model_path: Path, *, n_ctx: int, n_threads: int) -> None:
        """Record the load parameters without touching the model file.

        Args:
            model_path: Location of the model weights handed to ``Llama``.
            n_ctx: Context window size passed to ``Llama``.
            n_threads: Thread count used for both generation and batch work.
        """
        self.model_path = model_path
        self._n_ctx = n_ctx
        self._n_threads = n_threads
        # Populated by ``_ensure`` on first use; ``None`` means "not loaded yet".
        self._llama: object | None = None

    @classmethod
    def from_settings(cls, settings: Settings) -> LlamaCppBackend:
        """Build a backend from ``settings``, failing fast on a broken install.

        Must be a classmethod: it is an alternate constructor that is called on
        the class itself and instantiates via ``cls``.

        Args:
            settings: Provides ``llm_model_path``, ``llm_n_ctx`` and
                ``llm_n_threads``.

        Returns:
            A backend whose model has not been loaded yet.

        Raises:
            LLMError: If ``llama_cpp`` cannot be imported, or if the model
                weights file does not exist.
        """
        try:
            import llama_cpp  # noqa: F401
        except ImportError as exc:  # pragma: no cover
            raise LLMError(
                "llama-cpp-python is not installed. Install it with "
                "'pip install -r requirements.txt'."
            ) from exc
        model_path = settings.llm_model_path
        if not model_path.exists():
            raise LLMError(
                f"model weights not found at {model_path}. Run scripts/fetch_models.py."
            )
        return cls(model_path, n_ctx=settings.llm_n_ctx, n_threads=settings.llm_n_threads)

    def _ensure(self) -> object:
        """Return the shared ``Llama`` instance, creating it on the first call."""
        if self._llama is None:
            # Imported here so merely constructing the backend never pulls in
            # llama_cpp.
            from llama_cpp import Llama

            self._llama = Llama(
                model_path=str(self.model_path),
                n_ctx=self._n_ctx,
                n_threads=self._n_threads,
                n_threads_batch=self._n_threads,
                n_gpu_layers=0,
                # RAM is plentiful on the Space (model is ~1GB of 16GB); lock the weights
                # resident so they are never paged out mid-game. Ignored if unsupported.
                use_mlock=True,
                verbose=False,
            )
        return self._llama

    def _grammar(self, params: GenParams) -> object | None:
        """Build the output constraint requested by ``params``, if any.

        An explicit ``params.grammar`` string takes precedence over
        ``params.json_schema``. Returns ``None`` when neither is set.
        """
        if params.grammar:
            from llama_cpp import LlamaGrammar

            return LlamaGrammar.from_string(params.grammar, verbose=False)
        if params.json_schema:
            from llama_cpp import LlamaGrammar

            return LlamaGrammar.from_json_schema(json.dumps(params.json_schema), verbose=False)
        # Unconstrained generation: no need to import llama_cpp at all.
        return None

    def _messages(self, prompt: str) -> list[dict[str, str]]:
        """Wrap ``prompt`` as a single-turn chat: one ``user`` message."""
        return [{"role": "user", "content": prompt}]

    def _completion_kwargs(self, prompt: str, params: GenParams) -> dict[str, object]:
        """Assemble the ``create_chat_completion`` arguments shared by both call paths."""
        return {
            "messages": self._messages(prompt),
            "max_tokens": params.max_tokens,
            "temperature": params.temperature,
            "top_p": params.top_p,
            # An empty (or missing) stop sequence collection is sent as None
            # rather than as an empty list.
            "stop": list(params.stop or ()) or None,
            "grammar": self._grammar(params),
            "seed": params.seed,
            "repeat_penalty": params.repeat_penalty,
            "frequency_penalty": params.frequency_penalty,
            "presence_penalty": params.presence_penalty,
        }

    def generate(self, prompt: str, params: GenParams) -> str:
        """Run one blocking chat completion and return the reply text.

        Returns:
            The content of the first choice's message, or ``""`` when that
            content is empty or ``None``.
        """
        llama = self._ensure()
        result = llama.create_chat_completion(  # type: ignore[attr-defined]
            **self._completion_kwargs(prompt, params)
        )
        return result["choices"][0]["message"]["content"] or ""

    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
        """Run one streaming chat completion, yielding text pieces as they arrive.

        This is a generator, so the model is loaded and the request started
        only once iteration begins. Chunks whose first choice carries no
        ``delta`` content are skipped.
        """
        llama = self._ensure()
        chunks = llama.create_chat_completion(  # type: ignore[attr-defined]
            **self._completion_kwargs(prompt, params),
            stream=True,
        )
        for chunk in chunks:
            delta = chunk["choices"][0].get("delta", {})
            text = delta.get("content")
            if text:
                yield text