Spaces:

build-small-hackathon
/

case0

Running

Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)

414dc55 3 days ago

raw

history blame

2.17 kB

	"""LLM backend protocol and factory.

	The in-process ``LlamaCppBackend`` implements this protocol. Everything above this layer
	depends only on the protocol, so the engine never reaches into the runtime directly.
	"""

	from __future__ import annotations

	from collections.abc import Iterator
	from dataclasses import dataclass
	from typing import Protocol, runtime_checkable

	from ..config import Settings


	@dataclass(frozen=True)
	class GenParams:
	    """Sampling parameters for one call.

	    Provide at most one constraint: ``grammar`` (GBNF text) or ``json_schema``
	    (a JSON-Schema dict the backend converts to a grammar). Leave both unset for the
	    fast, grammar-free path.

	    The dataclass is frozen, so fields cannot be reassigned after construction;
	    build a new instance to vary a parameter between calls.
	    """

	    # Optional output constraints (see the class docstring: set at most one).
	    grammar: str | None = None
	    json_schema: dict | None = None
	    # Upper bound on the number of generated tokens.
	    max_tokens: int = 512
	    # Sampling controls.
	    temperature: float = 0.7
	    top_p: float = 0.95
	    # Stop sequences; a tuple (not a list) so the default is immutable.
	    stop: tuple[str, ...] = ()
	    # RNG seed; ``None`` leaves seeding to the backend.
	    seed: int | None = None
	    # Anti-repetition. Defaults match llama.cpp's own defaults so generation behaviour is
	    # unchanged; the interrogation hot path raises these to stop a small model from copying
	    # its previous answer verbatim across turns.
	    # NOTE(review): confirm these still match the defaults of the installed llama.cpp binding.
	    repeat_penalty: float = 1.1
	    frequency_penalty: float = 0.0
	    presence_penalty: float = 0.0


	class LLMError(RuntimeError):
	    """Raised on unrecoverable backend failures.

	    Subclasses ``RuntimeError``, so handlers that catch ``RuntimeError`` also
	    catch this exception.
	    """


	@runtime_checkable
	class LLMBackend(Protocol):
	    """A minimal text-in / text-out interface with optional grammar constraint.

	    The protocol is ``runtime_checkable``, so ``isinstance(obj, LLMBackend)`` is
	    allowed. That check only confirms that ``generate`` and ``stream`` exist on
	    the object; it does not validate their signatures or return types.
	    """

	    def generate(self, prompt: str, params: GenParams) -> str:
	        """Return the full completion as a single string."""
	        ...

	    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
	        """Yield completion text deltas as they are produced."""
	        ...


	def make_backend(settings: Settings) -> LLMBackend:
	    """Construct the in-process llama.cpp backend.

	    The import is local so the heavy native dependency is only loaded when a
	    backend is actually built.

	    Args:
	        settings: Application settings, passed unchanged to
	            ``LlamaCppBackend.from_settings``.

	    Returns:
	        The backend instance produced by ``LlamaCppBackend.from_settings``.
	    """
	    # Deferred on purpose: importing this module must not pull in llama.cpp.
	    from .llamacpp_backend import LlamaCppBackend

	    return LlamaCppBackend.from_settings(settings)


	def join_stream(deltas: Iterator[str]) -> str:
	    """Collect a streamed completion into one string.

	    Args:
	        deltas: Text fragments in the order they were produced. The iterator
	            is fully consumed.

	    Returns:
	        The fragments concatenated with no separator; ``""`` when ``deltas``
	        yields nothing.
	    """
	    return "".join(deltas)