"""LLM backend protocol and factory. The in-process ``LlamaCppBackend`` implements this protocol. Everything above this layer depends only on the protocol, so the engine never reaches into the runtime directly. """ from __future__ import annotations from collections.abc import Iterator from dataclasses import dataclass from typing import Protocol, runtime_checkable from ..config import Settings @dataclass(frozen=True) class GenParams: """Sampling parameters for one call. Provide at most one constraint: ``grammar`` (GBNF text) or ``json_schema`` (a JSON-Schema dict the backend converts to a grammar). Leave both unset for the fast, grammar-free path. """ grammar: str | None = None json_schema: dict | None = None max_tokens: int = 512 temperature: float = 0.7 top_p: float = 0.95 stop: tuple[str, ...] = () seed: int | None = None # Anti-repetition. Defaults match llama.cpp's own defaults so generation behaviour is # unchanged; the interrogation hot path raises these to stop a small model from copying # its previous answer verbatim across turns. repeat_penalty: float = 1.1 frequency_penalty: float = 0.0 presence_penalty: float = 0.0 class LLMError(RuntimeError): """Raised on unrecoverable backend failures.""" @runtime_checkable class LLMBackend(Protocol): """A minimal text-in / text-out interface with optional grammar constraint.""" def generate(self, prompt: str, params: GenParams) -> str: """Return the full completion as a single string.""" ... def stream(self, prompt: str, params: GenParams) -> Iterator[str]: """Yield completion text deltas as they are produced.""" ... 
def make_backend(settings: Settings) -> LLMBackend:
    """Build the in-process llama.cpp backend from ``settings``.

    The backend module is imported inside the function body so that the heavy
    native dependency is loaded only when a backend is actually constructed.
    """
    from .llamacpp_backend import LlamaCppBackend

    backend = LlamaCppBackend.from_settings(settings)
    return backend


def join_stream(deltas: Iterator[str]) -> str:
    """Concatenate every delta of a streamed completion into a single string."""
    separator = ""
    return separator.join(deltas)