case0 / src /case_zero /llm /backend.py
HusseinEid's picture
Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)
414dc55
raw
history blame
2.17 kB
"""LLM backend protocol and factory.
The in-process ``LlamaCppBackend`` implements this protocol. Everything above this layer
depends only on the protocol, so the engine never reaches into the runtime directly.
"""
from __future__ import annotations
from collections.abc import Iterator
from dataclasses import dataclass
from typing import Protocol, runtime_checkable
from ..config import Settings
@dataclass(frozen=True)
class GenParams:
    """Immutable bundle of sampling settings for a single generation call.

    Output may be constrained in one of two ways, and at most one of them should
    be supplied: ``grammar`` carries GBNF text directly, while ``json_schema``
    carries a JSON-Schema dict that the backend converts to a grammar. When neither
    is set, generation takes the fast, grammar-free path.

    NOTE(review): the "at most one constraint" rule is a convention for callers;
    this class itself performs no check.
    """

    # --- Output constraints (set at most one) ---
    grammar: str | None = None
    json_schema: dict | None = None

    # --- Sampling controls ---
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95
    stop: tuple[str, ...] = ()
    seed: int | None = None

    # --- Anti-repetition ---
    # These defaults are meant to mirror llama.cpp's own, so out-of-the-box
    # generation behaviour is unchanged. The interrogation hot path raises them
    # to keep a small model from repeating its previous answer verbatim across
    # turns.
    repeat_penalty: float = 1.1
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
class LLMError(RuntimeError):
    """Signals a backend failure that cannot be recovered from."""
@runtime_checkable
class LLMBackend(Protocol):
    """Structural interface for a text-in / text-out model backend.

    Any object exposing ``generate`` and ``stream`` satisfies this protocol; a
    grammar constraint may optionally be supplied through ``params``. Because the
    protocol is runtime-checkable, ``isinstance`` checks against it are allowed.
    """

    def generate(self, prompt: str, params: GenParams) -> str:
        """Produce the whole completion for ``prompt`` and return it as one string."""
        ...

    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
        """Produce the completion for ``prompt`` incrementally, yielding text deltas."""
        ...
def make_backend(settings: Settings) -> LLMBackend:
    """Build and return the in-process llama.cpp backend for ``settings``.

    The backend module is imported inside the function rather than at module
    level, so the heavy native dependency is loaded only when a backend is
    actually constructed.
    """
    from .llamacpp_backend import LlamaCppBackend

    backend = LlamaCppBackend.from_settings(settings)
    return backend
def join_stream(deltas: Iterator[str]) -> str:
    """Drain a streamed completion and return its pieces concatenated in order."""
    pieces = list(deltas)
    return "".join(pieces)