Spaces:

build-small-hackathon
/

case0

Running

case0 / src /case_zero /llm /llamacpp_backend.py

Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)

414dc55 3 days ago

raw

history blame contribute delete

4.03 kB

	"""In-process llama.cpp backend via ``llama-cpp-python``.

	The shipped runtime: fully in-process on the CPU - no server, no GPU, no network. The
	model is loaded once (lazily, on first use) and reused for every call.
	"""

	from __future__ import annotations

	import json
	from collections.abc import Iterator
	from pathlib import Path

	from ..config import Settings
	from .backend import GenParams, LLMError


	class LlamaCppBackend:
	    """Wraps a single ``llama_cpp.Llama`` instance, loaded lazily on first use.

	    Constructing the backend only records the configuration; the model itself
	    is instantiated by the first call to :meth:`generate` or :meth:`stream`
	    and then reused for every later call.
	    """

	    def __init__(self, model_path: Path, *, n_ctx: int, n_threads: int) -> None:
	        """Record the model location and runtime sizing without loading anything.

	        Args:
	            model_path: Path to the model weights handed to ``llama_cpp.Llama``.
	            n_ctx: Context size passed to ``Llama`` as ``n_ctx``.
	            n_threads: Thread count passed to ``Llama`` as both ``n_threads``
	                and ``n_threads_batch``.
	        """
	        self.model_path = model_path
	        self._n_ctx = n_ctx
	        self._n_threads = n_threads
	        # Populated by _ensure() on first use; None means "not loaded yet".
	        self._llama: object | None = None

	    @classmethod
	    def from_settings(cls, settings: Settings) -> LlamaCppBackend:
	        """Build a backend from ``settings``, failing fast on missing prerequisites.

	        Reads ``settings.llm_model_path``, ``settings.llm_n_ctx`` and
	        ``settings.llm_n_threads``.

	        Raises:
	            LLMError: If ``llama_cpp`` cannot be imported, or if the configured
	                model path does not exist.
	        """
	        # Import purely as an availability probe so a missing dependency is
	        # reported here rather than on the first generation call.
	        try:
	            import llama_cpp  # noqa: F401
	        except ImportError as exc:  # pragma: no cover
	            raise LLMError(
	                "llama-cpp-python is not installed. Install it with "
	                "'pip install -r requirements.txt'."
	            ) from exc

	        model_path = settings.llm_model_path
	        if not model_path.exists():
	            raise LLMError(
	                f"model weights not found at {model_path}. Run scripts/fetch_models.py."
	            )
	        return cls(model_path, n_ctx=settings.llm_n_ctx, n_threads=settings.llm_n_threads)

	    def _ensure(self) -> object:
	        """Return the shared ``Llama`` instance, creating it on the first call."""
	        if self._llama is None:
	            from llama_cpp import Llama

	            self._llama = Llama(
	                model_path=str(self.model_path),
	                n_ctx=self._n_ctx,
	                n_threads=self._n_threads,
	                n_threads_batch=self._n_threads,
	                n_gpu_layers=0,
	                # RAM is plentiful on the Space (model is ~1GB of 16GB); lock the weights
	                # resident so they are never paged out mid-game. Ignored if unsupported.
	                use_mlock=True,
	                verbose=False,
	            )
	        return self._llama

	    def _grammar(self, params: GenParams) -> object | None:
	        """Build the sampling grammar requested by ``params``, if any.

	        ``params.grammar`` (a grammar string) takes precedence over
	        ``params.json_schema`` (serialised with ``json.dumps``). Returns ``None``
	        when neither is set.
	        """
	        if not params.grammar and not params.json_schema:
	            # Nothing to constrain: skip the llama_cpp import entirely.
	            return None

	        from llama_cpp import LlamaGrammar

	        if params.grammar:
	            return LlamaGrammar.from_string(params.grammar, verbose=False)
	        return LlamaGrammar.from_json_schema(json.dumps(params.json_schema), verbose=False)

	    def _messages(self, prompt: str) -> list[dict[str, str]]:
	        """Wrap ``prompt`` as a single-turn chat consisting of one user message."""
	        return [{"role": "user", "content": prompt}]

	    def _completion_kwargs(self, prompt: str, params: GenParams) -> dict[str, object]:
	        """Assemble the keyword arguments shared by ``generate`` and ``stream``.

	        Keeping them in one place guarantees both entry points sample with the
	        same settings.
	        """
	        return {
	            "messages": self._messages(prompt),
	            "max_tokens": params.max_tokens,
	            "temperature": params.temperature,
	            "top_p": params.top_p,
	            # An empty (or missing) stop list is passed as None rather than [].
	            "stop": list(params.stop or ()) or None,
	            "grammar": self._grammar(params),
	            "seed": params.seed,
	            "repeat_penalty": params.repeat_penalty,
	            "frequency_penalty": params.frequency_penalty,
	            "presence_penalty": params.presence_penalty,
	        }

	    def generate(self, prompt: str, params: GenParams) -> str:
	        """Run one blocking chat completion and return the reply text.

	        Returns the ``content`` of the first choice's message, or ``""`` when
	        that content is empty or ``None``.
	        """
	        llama = self._ensure()
	        result = llama.create_chat_completion(  # type: ignore[attr-defined]
	            **self._completion_kwargs(prompt, params)
	        )
	        return result["choices"][0]["message"]["content"] or ""

	    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
	        """Run one chat completion with ``stream=True`` and yield text fragments.

	        Only non-empty ``content`` values from each chunk's first-choice
	        ``delta`` are yielded; chunks without a delta or without content are
	        skipped.
	        """
	        llama = self._ensure()
	        chunks = llama.create_chat_completion(  # type: ignore[attr-defined]
	            **self._completion_kwargs(prompt, params),
	            stream=True,
	        )
	        for chunk in chunks:
	            delta = chunk["choices"][0].get("delta", {})
	            text = delta.get("content")
	            if text:
	                yield text