File size: 2,172 Bytes
414dc55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""LLM backend protocol and factory.

The in-process ``LlamaCppBackend`` implements this protocol. Everything above this layer
depends only on the protocol, so the engine never reaches into the runtime directly.
"""

from __future__ import annotations

from collections.abc import Iterator
from dataclasses import dataclass
from typing import Protocol, runtime_checkable

from ..config import Settings


@dataclass(frozen=True)
class GenParams:
    """Immutable bundle of generation settings for a single backend call.

    Output can be constrained in one of two ways, and callers should set at most
    one of them: ``grammar`` carries GBNF text, while ``json_schema`` carries a
    JSON-Schema dict that the backend converts to a grammar. When neither is set
    the call takes the fast, grammar-free path.
    """

    # Output constraints -- set at most one (nothing in this class enforces it).
    grammar: str | None = None
    json_schema: dict | None = None

    # Length, sampling, stopping and seeding controls.
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.95
    stop: tuple[str, ...] = ()
    seed: int | None = None

    # Repetition controls. These defaults are chosen to match llama.cpp's own, so
    # generation behaves the same unless a caller overrides them; the interrogation
    # hot path raises them to keep a small model from repeating its previous answer
    # word for word on later turns.
    repeat_penalty: float = 1.1
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0


class LLMError(RuntimeError):
    """Signals a backend failure that cannot be recovered from."""


@runtime_checkable
class LLMBackend(Protocol):
    """Structural interface for a backend: prompt text in, completion text out.

    Both methods take the prompt together with a ``GenParams`` instance, which is
    where an optional grammar constraint is supplied.
    """

    def generate(self, prompt: str, params: GenParams) -> str:
        """Produce the whole completion and return it as one string."""
        ...

    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
        """Produce the completion incrementally, yielding each text delta."""
        ...


def make_backend(settings: Settings) -> LLMBackend:
    """Build and return the in-process llama.cpp backend for ``settings``.

    The backend module is imported inside this function rather than at module
    import time, so the heavy native dependency is loaded only when a backend is
    actually constructed.
    """
    from .llamacpp_backend import LlamaCppBackend

    backend = LlamaCppBackend.from_settings(settings)
    return backend


def join_stream(deltas: Iterator[str]) -> str:
    """Concatenate every delta of a streamed completion into a single string."""
    # Deltas are joined back to back, with nothing inserted between them.
    separator = ""
    return separator.join(deltas)