hymenjj committed on Mar 9

Commit

6f8aedf

verified ·

1 Parent(s): 7d58434

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
llama_cpp/__init__.py +4 -0
llama_cpp/__pycache__/__init__.cpython-311.pyc +0 -0
llama_cpp/__pycache__/_ctypes_extensions.cpython-311.pyc +0 -0
llama_cpp/__pycache__/_ggml.cpython-311.pyc +0 -0
llama_cpp/__pycache__/_internals.cpython-311.pyc +0 -0
llama_cpp/__pycache__/_logger.cpython-311.pyc +0 -0
llama_cpp/__pycache__/_utils.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama_cache.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama_chat_format.cpython-311.pyc +3 -0
llama_cpp/__pycache__/llama_cpp.cpython-311.pyc +3 -0
llama_cpp/__pycache__/llama_grammar.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama_speculative.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama_tokenizer.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llama_types.cpython-311.pyc +0 -0
llama_cpp/__pycache__/llava_cpp.cpython-311.pyc +0 -0
llama_cpp/__pycache__/mtmd_cpp.cpython-311.pyc +0 -0
llama_cpp/_ctypes_extensions.py +131 -0
llama_cpp/_ggml.py +12 -0
llama_cpp/_internals.py +856 -0
llama_cpp/_logger.py +47 -0
llama_cpp/_utils.py +78 -0
llama_cpp/lib/libggml-base.so +3 -0
llama_cpp/lib/libggml-cpu.so +3 -0
llama_cpp/lib/libggml.so +0 -0
llama_cpp/lib/libllama.so +3 -0
llama_cpp/lib/libmtmd.so +3 -0
llama_cpp/llama.py +2422 -0
llama_cpp/llama_cache.py +155 -0
llama_cpp/llama_chat_format.py +0 -0
llama_cpp/llama_cpp.py +0 -0
llama_cpp/llama_grammar.py +953 -0
llama_cpp/llama_speculative.py +64 -0
llama_cpp/llama_tokenizer.py +120 -0
llama_cpp/llama_types.py +316 -0
llama_cpp/llava_cpp.py +158 -0
llama_cpp/mtmd_cpp.py +280 -0
llama_cpp/py.typed +0 -0
llama_cpp/server/__init__.py +0 -0
llama_cpp/server/__main__.py +100 -0
llama_cpp/server/__pycache__/__init__.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/__main__.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/app.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/cli.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/errors.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/model.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/settings.cpython-311.pyc +0 -0
llama_cpp/server/__pycache__/types.cpython-311.pyc +0 -0
llama_cpp/server/app.py +597 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llama_cpp/__pycache__/llama_chat_format.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+llama_cpp/__pycache__/llama_cpp.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
+llama_cpp/lib/libggml-base.so filter=lfs diff=lfs merge=lfs -text
+llama_cpp/lib/libggml-cpu.so filter=lfs diff=lfs merge=lfs -text
+llama_cpp/lib/libllama.so filter=lfs diff=lfs merge=lfs -text
+llama_cpp/lib/libmtmd.so filter=lfs diff=lfs merge=lfs -text

llama_cpp/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .llama_cpp import *
+from .llama import *
+__version__ = "0.3.16"

llama_cpp/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (259 Bytes). View file

llama_cpp/__pycache__/_ctypes_extensions.cpython-311.pyc ADDED Viewed

Binary file (6.13 kB). View file

llama_cpp/__pycache__/_ggml.cpython-311.pyc ADDED Viewed

Binary file (800 Bytes). View file

llama_cpp/__pycache__/_internals.cpython-311.pyc ADDED Viewed

Binary file (51.3 kB). View file

llama_cpp/__pycache__/_logger.cpython-311.pyc ADDED Viewed

Binary file (1.74 kB). View file

llama_cpp/__pycache__/_utils.cpython-311.pyc ADDED Viewed

Binary file (4.31 kB). View file

llama_cpp/__pycache__/llama.cpython-311.pyc ADDED Viewed

Binary file (93.7 kB). View file

llama_cpp/__pycache__/llama_cache.cpython-311.pyc ADDED Viewed

Binary file (9.74 kB). View file

llama_cpp/__pycache__/llama_chat_format.cpython-311.pyc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1112a8090af3509b71ef46f87b57b23dbf0d410dda331b3612b506983a312b9
+size 138859

llama_cpp/__pycache__/llama_cpp.cpython-311.pyc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff68308c4b1951f3e8bdc79275ad24faa6cd18669939a1233be626713764d8a1
+size 100194

llama_cpp/__pycache__/llama_grammar.cpython-311.pyc ADDED Viewed

Binary file (42.8 kB). View file

llama_cpp/__pycache__/llama_speculative.cpython-311.pyc ADDED Viewed

Binary file (3.39 kB). View file

llama_cpp/__pycache__/llama_tokenizer.cpython-311.pyc ADDED Viewed

Binary file (6.47 kB). View file

llama_cpp/__pycache__/llama_types.cpython-311.pyc ADDED Viewed

Binary file (16.8 kB). View file

llama_cpp/__pycache__/llava_cpp.cpython-311.pyc ADDED Viewed

Binary file (4.49 kB). View file

llama_cpp/__pycache__/mtmd_cpp.cpython-311.pyc ADDED Viewed

Binary file (8.75 kB). View file

llama_cpp/_ctypes_extensions.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from __future__ import annotations
+import sys
+import os
+import ctypes
+import functools
+import pathlib
+from typing import (
+    Any,
+    Callable,
+    List,
+    Union,
+    Optional,
+    TYPE_CHECKING,
+    TypeVar,
+    Generic,
+)
+from typing_extensions import TypeAlias
+# Load the library
+def load_shared_library(lib_base_name: str, base_path: pathlib.Path):
+    """Platform independent shared library loader"""
+    # Searching for the library in the current directory under the name "libllama" (default name
+    # for llamacpp) and "llama" (default name for this repo)
+    lib_paths: List[pathlib.Path] = []
+    # Determine the file extension based on the platform
+    if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"):
+        lib_paths += [
+            base_path / f"lib{lib_base_name}.so",
+        ]
+    elif sys.platform == "darwin":
+        lib_paths += [
+            base_path / f"lib{lib_base_name}.so",
+            base_path / f"lib{lib_base_name}.dylib",
+        ]
+    elif sys.platform == "win32":
+        lib_paths += [
+            base_path / f"{lib_base_name}.dll",
+            base_path / f"lib{lib_base_name}.dll",
+        ]
+    else:
+        raise RuntimeError("Unsupported platform")
+    cdll_args = dict()  # type: ignore
+    # Add the library directory to the DLL search path on Windows (if needed)
+    if sys.platform == "win32":
+        os.add_dll_directory(str(base_path))
+        os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"]
+    if sys.platform == "win32" and sys.version_info >= (3, 8):
+        os.add_dll_directory(str(base_path))
+        if "CUDA_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        if "HIP_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
+    # Try to load the shared library, handling potential errors
+    for lib_path in lib_paths:
+        if lib_path.exists():
+            try:
+                return ctypes.CDLL(str(lib_path), **cdll_args)  # type: ignore
+            except Exception as e:
+                raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}")
+    raise FileNotFoundError(
+        f"Shared library with base name '{lib_base_name}' not found"
+    )
+# ctypes sane type hint helpers
+#
+# - Generic Pointer and Array types
+# - PointerOrRef type with a type hinted byref function
+#
+# NOTE: Only use these for static type checking not for runtime checks
+# no good will come of that
+if TYPE_CHECKING:
+    # Everything in this branch exists only for static type checkers; none of these
+    # names are bound at runtime.
+    # Type variable restricted to ctypes data types
+    CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData)  # type: ignore
+    # Generic spellings of the ctypes array and pointer types
+    CtypesArray: TypeAlias = ctypes.Array[CtypesCData]  # type: ignore
+    CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData]  # type: ignore
+    CtypesVoidPointer: TypeAlias = ctypes.c_void_p
+    # Placeholder type used as the declared return type of the annotated byref helper
+    class CtypesRef(Generic[CtypesCData]):
+        pass
+    # Either a real pointer or a byref() reference to the same underlying type
+    CtypesPointerOrRef: TypeAlias = Union[
+        CtypesPointer[CtypesCData], CtypesRef[CtypesCData]
+    ]
+    CtypesFuncPointer: TypeAlias = ctypes._FuncPointer  # type: ignore
+# Callable type variable used by the decorator factory so the decorated stub's
+# signature is what type checkers see
+F = TypeVar("F", bound=Callable[..., Any])
+def ctypes_function_for_shared_library(lib: ctypes.CDLL):
+    """Decorator for defining ctypes functions with type hints"""
+    def ctypes_function(
+        name: str, argtypes: List[Any], restype: Any, enabled: bool = True
+    ):
+        def decorator(f: F) -> F:
+            if enabled:
+                func = getattr(lib, name)
+                func.argtypes = argtypes
+                func.restype = restype
+                functools.wraps(f)(func)
+                return func
+            else:
+                return f
+        return decorator
+    return ctypes_function
+def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
+    """Type-annotated version of ctypes.byref
+
+    This stub has no implementation; it only supplies a signature for type checkers.
+    """
+    ...
+# Type checkers see the annotated stub; at runtime the name is the real ctypes.byref
+byref = _byref if TYPE_CHECKING else ctypes.byref

llama_cpp/_ggml.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Internal module; use at your own risk.
+
+This module provides a minimal interface for working with ggml tensors from llama-cpp-python.
+"""
+import os
+import pathlib
+import llama_cpp._ctypes_extensions as ctypes_ext
+# The shared libraries are looked up in the "lib" directory next to this file
+libggml_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
+# Loaded at import time; load_shared_library raises if the ggml library cannot be found or loaded
+libggml = ctypes_ext.load_shared_library("ggml", libggml_base_path)

llama_cpp/_internals.py ADDED Viewed

	@@ -0,0 +1,856 @@

+from __future__ import annotations
+import os
+import ctypes
+from typing import (
+    Dict,
+    List,
+    Tuple,
+    Optional,
+    Sequence,
+    Callable,
+    Union,
+)
+from dataclasses import dataclass, field
+from contextlib import ExitStack
+import numpy as np
+import numpy.typing as npt
+from .llama_types import *
+from .llama_grammar import LlamaGrammar
+from ._utils import suppress_stdout_stderr
+import llama_cpp.llama_cpp as llama_cpp
+# Python wrappers over llama.h structs
+class LlamaModel:
+    """Intermediate Python wrapper for a llama.cpp llama_model.
+    NOTE: For stability it's recommended you use the Llama class instead."""
+    def __init__(
+        self,
+        *,
+        path_model: str,
+        params: llama_cpp.llama_model_params,
+        verbose: bool = True,
+    ):
+        self.path_model = path_model
+        self.params = params
+        self.verbose = verbose
+        self._exit_stack = ExitStack()
+        model = None
+        if not os.path.exists(path_model):
+            raise ValueError(f"Model path does not exist: {path_model}")
+        with suppress_stdout_stderr(disable=verbose):
+            model = llama_cpp.llama_model_load_from_file(
+                self.path_model.encode("utf-8"), self.params
+            )
+        if model is None:
+            raise ValueError(f"Failed to load model from file: {path_model}")
+        vocab = llama_cpp.llama_model_get_vocab(model)
+        if vocab is None:
+            raise ValueError(f"Failed to get vocab from model: {path_model}")
+        self.model = model
+        self.vocab = vocab
+        self.sampler = None  # LlamaModel doesn't use samplers, but some cleanup code expects this attribute
+        def free_model():
+            if self.model is None:
+                return
+            llama_cpp.llama_model_free(self.model)
+            self.model = None
+        self._exit_stack.callback(free_model)
+    def close(self):
+        if self.sampler is not None:
+            # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
+            for i, _ in reversed(self.custom_samplers):
+                llama_cpp.llama_sampler_chain_remove(self.sampler, i)
+            self.custom_samplers.clear()
+        self._exit_stack.close()
+    def __del__(self):
+        self.close()
+    def vocab_type(self) -> int:
+        return llama_cpp.llama_vocab_type(self.vocab)
+    def n_vocab(self) -> int:
+        return llama_cpp.llama_vocab_n_tokens(self.vocab)
+    def n_ctx_train(self) -> int:
+        return llama_cpp.llama_model_n_ctx_train(self.model)
+    def n_embd(self) -> int:
+        return llama_cpp.llama_model_n_embd(self.model)
+    def rope_freq_scale_train(self) -> float:
+        return llama_cpp.llama_model_rope_freq_scale_train(self.model)
+    def desc(self) -> str:
+        buf = ctypes.create_string_buffer(1024)
+        llama_cpp.llama_model_desc(self.model, buf, 1024)
+        return buf.value.decode("utf-8")
+    def size(self) -> int:
+        return llama_cpp.llama_model_size(self.model)
+    def n_params(self) -> int:
+        return llama_cpp.llama_model_n_params(self.model)
+    def get_tensor(self, name: str) -> ctypes.c_void_p:
+        raise NotImplementedError("get_tensor is not implemented in llama.cpp")
+    # Vocab
+    def token_get_text(self, token: int) -> str:
+        return llama_cpp.llama_vocab_get_text(self.vocab, token).decode("utf-8")
+    def token_get_score(self, token: int) -> float:
+        return llama_cpp.llama_vocab_get_score(self.vocab, token)
+    def token_get_attr(self, token: int) -> int:
+        return llama_cpp.llama_vocab_get_attr(self.vocab, token)
+    # Special tokens
+    def token_bos(self) -> int:
+        return llama_cpp.llama_vocab_bos(self.vocab)
+    def token_eos(self) -> int:
+        return llama_cpp.llama_vocab_eos(self.vocab)
+    def token_cls(self) -> int:
+        return llama_cpp.llama_vocab_cls(self.vocab)
+    def token_sep(self) -> int:
+        return llama_cpp.llama_vocab_sep(self.vocab)
+    def token_nl(self) -> int:
+        return llama_cpp.llama_vocab_nl(self.vocab)
+    def token_prefix(self) -> int:
+        return llama_cpp.llama_vocab_fim_pre(self.vocab)
+    def token_middle(self) -> int:
+        return llama_cpp.llama_vocab_fim_mid(self.vocab)
+    def token_suffix(self) -> int:
+        return llama_cpp.llama_vocab_fim_suf(self.vocab)
+    def token_eot(self) -> int:
+        return llama_cpp.llama_vocab_eot(self.vocab)
+    def add_bos_token(self) -> bool:
+        return llama_cpp.llama_vocab_get_add_bos(self.vocab)
+    def add_eos_token(self) -> bool:
+        return llama_cpp.llama_vocab_get_add_eos(self.vocab)
+    # Tokenization
+    def tokenize(self, text: bytes, add_bos: bool, special: bool):
+        n_ctx = self.n_ctx_train()
+        tokens = (llama_cpp.llama_token * n_ctx)()
+        n_tokens = llama_cpp.llama_tokenize(
+            self.vocab, text, len(text), tokens, n_ctx, add_bos, special
+        )
+        if n_tokens < 0:
+            n_tokens = abs(n_tokens)
+            tokens = (llama_cpp.llama_token * n_tokens)()
+            n_tokens = llama_cpp.llama_tokenize(
+                self.vocab, text, len(text), tokens, n_tokens, add_bos, special
+            )
+            if n_tokens < 0:
+                raise RuntimeError(
+                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
+                )
+        return list(tokens[:n_tokens])
+    def token_to_piece(self, token: int, special: bool = False) -> bytes:
+        buf = ctypes.create_string_buffer(32)
+        llama_cpp.llama_token_to_piece(self.vocab, token, buf, 32, 0, special)
+        return bytes(buf)
+    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
+        output = b""
+        size = 32
+        buffer = (ctypes.c_char * size)()
+        for token in tokens:
+            n = llama_cpp.llama_token_to_piece(
+                self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special
+            )
+            assert n <= size
+            output += bytes(buffer[:n])
+        # NOTE: Llama1 models automatically added a space at the start of the prompt
+        # this line removes a leading space if the first token is a beginning of sentence token
+        return (
+            output[1:]
+            if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" "
+            else output
+        )
+    # Extra
+    def metadata(self) -> Dict[str, str]:
+        metadata: Dict[str, str] = {}
+        buffer_size = 1024
+        buffer = ctypes.create_string_buffer(buffer_size)
+        # zero the buffer
+        buffer.value = b"\0" * buffer_size
+        # iterate over model keys
+        for i in range(llama_cpp.llama_model_meta_count(self.model)):
+            nbytes = llama_cpp.llama_model_meta_key_by_index(
+                self.model, i, buffer, buffer_size
+            )
+            if nbytes > buffer_size:
+                buffer_size = nbytes + 1
+                buffer = ctypes.create_string_buffer(buffer_size)
+                nbytes = llama_cpp.llama_model_meta_key_by_index(
+                    self.model, i, buffer, buffer_size
+                )
+            key = buffer.value.decode("utf-8")
+            nbytes = llama_cpp.llama_model_meta_val_str_by_index(
+                self.model, i, buffer, buffer_size
+            )
+            if nbytes > buffer_size:
+                buffer_size = nbytes + 1
+                buffer = ctypes.create_string_buffer(buffer_size)
+                nbytes = llama_cpp.llama_model_meta_val_str_by_index(
+                    self.model, i, buffer, buffer_size
+                )
+            value = buffer.value.decode("utf-8")
+            metadata[key] = value
+        return metadata
+    @staticmethod
+    def default_params():
+        """Get the default llama_model_params."""
+        return llama_cpp.llama_model_default_params()
+class LlamaContext:
+    """Intermediate Python wrapper for a llama.cpp llama_context.
+
+    Most methods forward directly to the llama_cpp function of the matching name.
+    The legacy ``sample_*`` methods are kept only as stubs that raise
+    NotImplementedError.
+
+    NOTE: For stability it's recommended you use the Llama class instead."""
+    def __init__(
+        self,
+        *,
+        model: LlamaModel,
+        params: llama_cpp.llama_context_params,
+        verbose: bool = True,
+    ):
+        """Create a context for ``model``; raises ValueError if creation returns None."""
+        self.model = model
+        self.params = params
+        self.verbose = verbose
+        self._exit_stack = ExitStack()
+        ctx = llama_cpp.llama_init_from_model(self.model.model, self.params)
+        if ctx is None:
+            raise ValueError("Failed to create llama_context")
+        self.ctx = ctx
+        # Memory handle used by all kv_cache_* methods below
+        self.memory = llama_cpp.llama_get_memory(self.ctx)
+        self.sampler = None  # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute
+        def free_ctx():
+            # Idempotent: the handle is cleared after the first free
+            if self.ctx is None:
+                return
+            llama_cpp.llama_free(self.ctx)
+            self.ctx = None
+        self._exit_stack.callback(free_ctx)
+    def close(self):
+        """Run the registered cleanup callback, which frees the native context."""
+        self._exit_stack.close()
+    def __del__(self):
+        self.close()
+    def n_ctx(self) -> int:
+        return llama_cpp.llama_n_ctx(self.ctx)
+    def pooling_type(self) -> int:
+        return llama_cpp.llama_pooling_type(self.ctx)
+    def kv_cache_clear(self):
+        assert self.memory is not None, "Memory is not initialized"
+        llama_cpp.llama_memory_clear(self.memory, True)
+    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        # Negative sequence ids are replaced by 0 before the call
+        seq_id = seq_id if seq_id >= 0 else 0
+        llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
+    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)
+    def kv_cache_seq_keep(self, seq_id: int):
+        assert self.memory is not None, "Memory is not initialized"
+        llama_cpp.llama_memory_seq_keep(self.memory, seq_id)
+    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.memory is not None, "Memory is not initialized"
+        # Forwards to llama_memory_seq_add with ``shift`` as the last argument
+        llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)
+    def get_state_size(self) -> int:
+        return llama_cpp.llama_state_get_size(self.ctx)
+    # TODO: copy_state_data
+    # TODO: set_state_data
+    # TODO: llama_load_session_file
+    # TODO: llama_save_session_file
+    def decode(self, batch: LlamaBatch):
+        """Run llama_decode on ``batch``; raises RuntimeError on a non-zero return code."""
+        return_code = llama_cpp.llama_decode(
+            self.ctx,
+            batch.batch,
+        )
+        if return_code != 0:
+            raise RuntimeError(f"llama_decode returned {return_code}")
+    def encode(self, batch: LlamaBatch):
+        """Run llama_encode on ``batch``; raises RuntimeError on a non-zero return code."""
+        return_code = llama_cpp.llama_encode(
+            self.ctx,
+            batch.batch,
+        )
+        if return_code != 0:
+            raise RuntimeError(f"llama_encode returned {return_code}")
+    def set_n_threads(self, n_threads: int, n_threads_batch: int):
+        llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch)
+    def get_logits(self):
+        return llama_cpp.llama_get_logits(self.ctx)
+    def get_logits_ith(self, i: int):
+        return llama_cpp.llama_get_logits_ith(self.ctx, i)
+    def get_embeddings(self):
+        return llama_cpp.llama_get_embeddings(self.ctx)
+    def get_embeddings_ith(self, i: int):
+        return llama_cpp.llama_get_embeddings_ith(self.ctx, i)
+    def get_embeddings_seq(self, seq_id: int):
+        return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id)
+    # Sampling functions - deprecated, use LlamaSampler instead
+    # Every method in this section only raises NotImplementedError.
+    def set_rng_seed(self, seed: int):
+        raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
+    def sample_repetition_penalties(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]",
+        penalty_last_n: int,
+        penalty_repeat: float,
+        penalty_freq: float,
+        penalty_present: float,
+    ):
+        raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
+    def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
+        raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
+    def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
+        raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
+    def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
+        raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
+    def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
+        raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
+    def sample_typical(
+        self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
+    ):
+        raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
+    def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
+        raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")
+    def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
+        raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
+    def sample_token_mirostat(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        tau: float,
+        eta: float,
+        m: int,
+        mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
+    ) -> int:
+        raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
+    def sample_token_mirostat_v2(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        tau: float,
+        eta: float,
+        mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
+    ) -> int:
+        raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
+    def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
+        raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
+    def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
+        raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
+    # Grammar
+    def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
+        raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
+    def reset_timings(self):
+        llama_cpp.llama_perf_context_reset(self.ctx)
+    def print_timings(self):
+        llama_cpp.llama_perf_context_print(self.ctx)
+    # Utility functions
+    @staticmethod
+    def default_params():
+        """Get the default llama_context_params."""
+        return llama_cpp.llama_context_default_params()
+class LlamaBatch:
+    def __init__(
+        self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
+    ):
+        self._n_tokens = n_tokens
+        self.embd = embd
+        self.n_seq_max = n_seq_max
+        self.verbose = verbose
+        self._exit_stack = ExitStack()
+        batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max)
+        if batch is None:
+            raise ValueError("Failed to create llama_batch")
+        self.batch = batch
+        self.sampler = None  # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute
+        def free_batch():
+            if self.batch is None:
+                return
+            llama_cpp.llama_batch_free(self.batch)
+            self.batch = None
+        self._exit_stack.callback(free_batch)
+    def close(self):
+        self._exit_stack.close()
+    def __del__(self):
+        self.close()
+    def n_tokens(self) -> int:
+        return self.batch.n_tokens
+    def reset(self):
+        self.batch.n_tokens = 0
+    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
+        n_tokens = len(batch)
+        self.batch.n_tokens = n_tokens
+        for i in range(n_tokens):
+            self.batch.token[i] = batch[i]
+            self.batch.pos[i] = n_past + i
+            self.batch.seq_id[i][0] = 0
+            self.batch.n_seq_id[i] = 1
+            self.batch.logits[i] = logits_all
+        self.batch.logits[n_tokens - 1] = True
+    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
+        n_tokens = len(batch)
+        n_tokens0 = self.batch.n_tokens
+        self.batch.n_tokens += n_tokens
+        for i in range(n_tokens):
+            j = n_tokens0 + i
+            self.batch.token[j] = batch[i]
+            self.batch.pos[j] = i
+            self.batch.seq_id[j][0] = seq_id
+            self.batch.n_seq_id[j] = 1
+            self.batch.logits[j] = logits_all
+        self.batch.logits[n_tokens - 1] = True
+class LlamaTokenDataArray:
+    def __init__(self, *, n_vocab: int):
+        self.n_vocab = n_vocab
+        self.candidates_data = np.recarray(
+            (self.n_vocab,),
+            dtype=np.dtype(
+                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
+            ),
+        )
+        self.candidates = llama_cpp.llama_token_data_array(
+            data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
+            size=self.n_vocab,
+            sorted=False,
+        )
+        self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc)  # type: ignore
+        self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)
+        self.sampler = None  # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute
+    def copy_logits(self, logits: npt.NDArray[np.single]):
+        self.candidates_data.id[:] = self.default_candidates_data_id
+        self.candidates_data.logit[:] = logits
+        self.candidates_data.p[:] = self.default_candidates_data_p
+        self.candidates.sorted = False
+        self.candidates.size = self.n_vocab
+# Embedding functions
+def normalize_embedding(embedding):
+    norm = float(np.linalg.norm(embedding))
+    if norm == 0.0:
+        return embedding
+    return [v / norm for v in embedding]
+# Python wrappers over common/sampling structs
+@dataclass
+class LlamaSamplingParams:
+    # Plain container of sampling settings with defaults. Nothing in this class
+    # reads the fields; NOTE(review): field meanings presumably mirror llama.cpp's
+    # common sampling parameters of the same names - confirm before relying on them.
+    n_prev: int = 64
+    n_probs: int = 0
+    # Candidate filtering thresholds
+    top_k: int = 40
+    top_p: float = 0.95
+    min_p: float = 0.05
+    tfs_z: float = 1.00
+    typical_p: float = 1.00
+    temp: float = 0.80
+    # Repetition / frequency / presence penalties
+    penalty_last_n: int = 64
+    penalty_repeat: float = 1.0
+    penalty_freq: float = 0.00
+    penalty_present: float = 0.00
+    # Mirostat settings
+    mirostat: int = 0
+    mirostat_tau: float = 5.00
+    mirostat_eta: float = 0.10
+    penalize_nl: bool = True
+    grammar: str = ""
+    cfg_negative_prompt: str = ""
+    cfg_scale: float = 1.00
+    # default_factory gives every instance its own dict
+    logit_bias: dict[int, float] = field(default_factory=dict)
+@dataclass
+class LlamaSamplingContext:
+    params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams)
+    mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float)
+    grammar: Optional[LlamaGrammar] = None
+    # NOTE: Missing parsed_grammar
+    prev: list[int] = field(default_factory=list)
+    cur: list[llama_cpp.llama_token_data] = field(default_factory=list)
+    def reset(self):
+        self.prev = []
+        self.cur = []
+        if self.grammar is not None:
+            self.grammar.reset()
+    def cp(self):
+        return LlamaSamplingContext(
+            params=self.params,
+            mirostat_mu=self.mirostat_mu,
+            grammar=self.grammar,
+            prev=self.prev.copy(),
+            cur=self.cur.copy(),
+        )
+    def last(self) -> Optional[int]:
+        if len(self.prev) > 0:
+            return self.prev[-1]
+        else:
+            return None
+    def prev_str(self, ctx_main: LlamaContext, n: int) -> str:
+        return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8")
+    def sample(
+        self,
+        ctx_main: LlamaContext,
+        idx: int = 0,
+        logits_array: Optional[npt.NDArray[np.single]] = None,
+    ):
+        # This method is deprecated in favor of using LlamaSampler directly
+        raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
+    def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
+        self.prev.append(id)
+class CustomSampler:
+    """Wrap a Python callable as a llama_sampler whose only callback is ``apply``.
+
+    ``apply_func`` is called with the candidate array pointer each time the
+    sampler's apply callback is invoked.
+    """
+    def __init__(
+        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
+    ):
+        self.apply_func = apply_func
+        def apply_wrapper(
+            sampler: llama_cpp.llama_sampler_p,
+            cur_p: llama_cpp.llama_token_data_array_p,
+        ):
+            # The sampler argument is ignored; only the candidates are forwarded
+            self.apply_func(cur_p)
+        # NOTE(review): free_wrapper is defined but never installed; the ``free``
+        # slot below is set to a null callback instead.
+        def free_wrapper(sampler: llama_cpp.llama_sampler_p):
+            pass
+        sampler_i = llama_cpp.llama_sampler_i()
+        sampler_i.apply = llama_cpp.llama_sampler_i_apply(apply_wrapper)
+        # Keep a reference to the Python callback on the instance
+        self._apply_wrapper_ref = apply_wrapper
+        # All remaining interface slots are constructed from 0 (no callback)
+        sampler_i.name = llama_cpp.llama_sampler_i_name(0)
+        sampler_i.accept = llama_cpp.llama_sampler_i_accept(0)
+        sampler_i.reset = llama_cpp.llama_sampler_i_reset(0)
+        sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)
+        sampler_i.free = llama_cpp.llama_sampler_i_free(0)
+        self.sampler = llama_cpp.llama_sampler()
+        self.sampler.iface = ctypes.pointer(sampler_i)
+        self.sampler.ctx = None
+    def get_sampler(self) -> llama_cpp.llama_sampler_p:
+        """Return a new ctypes pointer to the wrapped llama_sampler struct."""
+        return ctypes.pointer(self.sampler)
+class LlamaSampler:
+    def __init__(self):
+        params = llama_cpp.llama_sampler_chain_default_params()
+        self.sampler = llama_cpp.llama_sampler_chain_init(params)
+        self.custom_samplers: List[Tuple[int, CustomSampler]] = []
+        self._exit_stack = ExitStack()
+        def free_sampler():
+            if self.sampler is not None:
+                # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
+                for i, _ in reversed(self.custom_samplers):
+                    llama_cpp.llama_sampler_chain_remove(self.sampler, i)
+                llama_cpp.llama_sampler_free(self.sampler)
+                self.sampler = None
+        self._exit_stack.callback(free_sampler)
+    def close(self):
+        self._exit_stack.close()
+    def __del__(self):
+        self.close()
+    def add_greedy(self):
+        sampler = llama_cpp.llama_sampler_init_greedy()
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_dist(self, seed: int):
+        sampler = llama_cpp.llama_sampler_init_dist(seed)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_softmax(self):
+        sampler = llama_cpp.llama_sampler_init_softmax()
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_top_k(self, k: int):
+        sampler = llama_cpp.llama_sampler_init_top_k(k)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_top_p(self, p: float, min_keep: int = 1):
+        sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_min_p(self, p: float, min_keep: int = 1):
+        sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_typical(self, p: float, min_keep: int = 1):
+        sampler = llama_cpp.llama_sampler_init_typical(p, min_keep)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_temp(self, temp: float):
+        sampler = llama_cpp.llama_sampler_init_temp(temp)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_temp_ext(self, t: float, delta: float, exponent: float):
+        sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_xtc(self, p: float, t: float, min_keep: int, seed: int):
+        sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_top_n_sigma(self, n: float):
+        sampler = llama_cpp.llama_sampler_init_top_n_sigma(n)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int):
+        sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_mirostat_v2(self, seed: int, tau: float, eta: float):
+        sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
+        sampler = llama_cpp.llama_sampler_init_grammar(
+            model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8")
+        )
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_grammar_lazy_patterns(
+        self,
+        model: LlamaModel,
+        grammar: LlamaGrammar,
+        trigger_patterns: List[str],
+        trigger_tokens: List[int]
+    ):
+        # Convert patterns to C array
+        pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
+        for i, pattern in enumerate(trigger_patterns):
+            pattern_ptrs[i] = pattern.encode("utf-8")
+        # Convert tokens to C array
+        token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)
+        sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
+            model.vocab,
+            grammar._grammar.encode("utf-8"),
+            grammar._root.encode("utf-8"),
+            pattern_ptrs,
+            len(trigger_patterns),
+            token_array,
+            len(trigger_tokens)
+        )
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_penalties(
+        self,
+        penalty_last_n: int,
+        penalty_repeat: float,
+        penalty_freq: float,
+        penalty_present: float,
+    ):
+        sampler = llama_cpp.llama_sampler_init_penalties(
+            penalty_last_n,
+            penalty_repeat,
+            penalty_freq,
+            penalty_present,
+        )
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_dry(
+        self,
+        model: LlamaModel,
+        n_ctx_train: int,
+        dry_multiplier: float,
+        dry_base: float,
+        dry_allowed_length: int,
+        dry_penalty_last_n: int,
+        seq_breakers: List[str]
+    ):
+        # Convert seq_breakers to C array
+        breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
+        for i, breaker in enumerate(seq_breakers):
+            breaker_ptrs[i] = breaker.encode("utf-8")
+        sampler = llama_cpp.llama_sampler_init_dry(
+            model.vocab,
+            n_ctx_train,
+            dry_multiplier,
+            dry_base,
+            dry_allowed_length,
+            dry_penalty_last_n,
+            breaker_ptrs,
+            len(seq_breakers)
+        )
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_logit_bias(
+        self,
+        n_vocab: int,
+        logit_bias: Dict[int, float]
+    ):
+        # Convert logit_bias dict to C array
+        bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
+        for i, (token, bias) in enumerate(logit_bias.items()):
+            bias_array[i].token = token
+            bias_array[i].bias = bias
+        sampler = llama_cpp.llama_sampler_init_logit_bias(
+            n_vocab,
+            len(logit_bias),
+            bias_array
+        )
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_infill(self, model: LlamaModel):
+        sampler = llama_cpp.llama_sampler_init_infill(model.vocab)
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+    def add_custom(
+        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
+    ):
+        custom_sampler = CustomSampler(apply_func)
+        sampler = custom_sampler.get_sampler()
+        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+        # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
+        self.custom_samplers.append(
+            (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler)
+        )
+    def get_seed(self) -> int:
+        return llama_cpp.llama_sampler_get_seed(self.sampler)
+    def sample(self, ctx: LlamaContext, idx: int = -1) -> int:
+        return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx)
+    def accept(self, token: int):
+        llama_cpp.llama_sampler_accept(self.sampler, token)
+    def reset(self):
+        llama_cpp.llama_sampler_reset(self.sampler)
+    def clone(self):
+        # NOTE: Custom samplers cannot be cloned due to Python callback limitations
+        if self.custom_samplers:
+            raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")
+        cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
+        # Create a new wrapper around the cloned sampler
+        new_sampler = LlamaSampler.__new__(LlamaSampler)
+        new_sampler.sampler = cloned_sampler
+        new_sampler.custom_samplers = []
+        new_sampler._exit_stack = ExitStack()
+        def free_sampler():
+            if new_sampler.sampler is not None:
+                llama_cpp.llama_sampler_free(new_sampler.sampler)
+                new_sampler.sampler = None
+        new_sampler._exit_stack.callback(free_sampler)
+        return new_sampler

llama_cpp/_logger.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import sys
+import ctypes
+import logging
+import llama_cpp
+# enum ggml_log_level {
+#     GGML_LOG_LEVEL_NONE  = 0,
+#     GGML_LOG_LEVEL_INFO  = 1,
+#     GGML_LOG_LEVEL_WARN  = 2,
+#     GGML_LOG_LEVEL_ERROR = 3,
+#     GGML_LOG_LEVEL_DEBUG = 4,
+#     GGML_LOG_LEVEL_CONT  = 5, // continue previous log
+# };
+# Maps the ggml levels above to `logging` levels. NONE (0) is mapped to
+# CRITICAL. CONT (5) is listed as DEBUG, but llama_log_callback substitutes
+# the previous message's level when it computes the level for a CONT message.
+GGML_LOG_LEVEL_TO_LOGGING_LEVEL = {
+    0: logging.CRITICAL,
+    1: logging.INFO,
+    2: logging.WARNING,
+    3: logging.ERROR,
+    4: logging.DEBUG,
+    5: logging.DEBUG,
+}
+# Module logger; only its level is consulted here (see set_verbose).
+logger = logging.getLogger("llama-cpp-python")
+# Level of the most recent message handled by llama_log_callback.
+_last_log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[0]
+# typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
+@llama_cpp.llama_log_callback
+def llama_log_callback(
+    level: int,
+    text: bytes,
+    user_data: ctypes.c_void_p,
+):
+    # TODO: Correctly implement continue previous log
+    global _last_log_level
+    log_level = GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level] if level != 5 else _last_log_level
+    if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
+        print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+    _last_log_level = log_level
+llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
+def set_verbose(verbose: bool):
+    logger.setLevel(logging.DEBUG if verbose else logging.ERROR)

llama_cpp/_utils.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import os
+import sys
+from typing import Any, Dict
+# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
+# Both handles are opened once at import time and used by
+# suppress_stdout_stderr as redirect targets; this module never closes them.
+outnull_file = open(os.devnull, "w")
+errnull_file = open(os.devnull, "w")
+# File descriptor numbers that suppress_stdout_stderr saves and redirects.
+STDOUT_FILENO = 1
+STDERR_FILENO = 2
+class suppress_stdout_stderr(object):
+    """Context manager that redirects stdout and stderr to the null device.
+    Both the file descriptors (1 and 2) and the Python-level ``sys.stdout`` /
+    ``sys.stderr`` objects are redirected on entry and restored on exit.
+    When ``disable`` is true (the default) the manager does nothing.
+    """
+    # NOTE: these must be "saved" here to avoid exceptions when using
+    #       this context manager inside of a __del__ method
+    sys = sys
+    os = os
+    def __init__(self, disable: bool = True):
+        """Store the flag; ``disable=True`` turns suppression off entirely."""
+        self.disable = disable
+    # Oddly enough this works better than the contextlib version
+    def __enter__(self):
+        if self.disable:
+            return self
+        self.old_stdout_fileno_undup = STDOUT_FILENO
+        self.old_stderr_fileno_undup = STDERR_FILENO
+        # Duplicate the current descriptors so they can be restored in __exit__.
+        self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+        self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)
+        self.old_stdout = self.sys.stdout
+        self.old_stderr = self.sys.stderr
+        # Point descriptors 1 and 2 at the null device.
+        self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup)
+        self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup)
+        # Swap the Python-level streams for the null-device file objects.
+        self.sys.stdout = outnull_file
+        self.sys.stderr = errnull_file
+        return self
+    def __exit__(self, *_):
+        if self.disable:
+            return
+        # Restore the Python-level streams first, then the saved descriptors.
+        self.sys.stdout = self.old_stdout
+        self.sys.stderr = self.old_stderr
+        self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+        # Close the duplicates made in __enter__.
+        self.os.close(self.old_stdout_fileno)
+        self.os.close(self.old_stderr_fileno)
+class MetaSingleton(type):
+    """
+    Metaclass for implementing the Singleton pattern.
+    """
+    _instances: Dict[type, Any] = {}
+    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
+        if cls not in cls._instances:
+            cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+class Singleton(object, metaclass=MetaSingleton):
+    """
+    Base class for implementing the Singleton pattern.
+    """
+    def __init__(self):
+        super(Singleton, self).__init__()

llama_cpp/lib/libggml-base.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a33aad406e2803734808d1a05e8a45834fa726a4066f03eb6aaff5b7a3c155a7
+size 615864

llama_cpp/lib/libggml-cpu.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57833bdcf97a60d84e9ae3089678ab06771fb1c7d4affbdd7360283ccd8e8e16
+size 791480

llama_cpp/lib/libggml.so ADDED Viewed

Binary file (47.6 kB). View file

llama_cpp/lib/libllama.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9369b069f7df66e1bfb7afc0ff31c18117aba769fbf9cbe7b209ab6254de90cf
+size 2150632

llama_cpp/lib/libmtmd.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58d6765b2f621fc5feb7e2e188810129be5ff8d5151e1bfef35231f22f6b9b08
+size 722296

llama_cpp/llama.py ADDED Viewed

	@@ -0,0 +1,2422 @@

+from __future__ import annotations
+import os
+import sys
+import uuid
+import time
+import json
+import ctypes
+import typing
+import random
+import fnmatch
+import warnings
+import contextlib
+import multiprocessing
+from typing import (
+    Any,
+    List,
+    Literal,
+    Optional,
+    Union,
+    Generator,
+    Sequence,
+    Iterator,
+    Deque,
+    Callable,
+    Dict,
+)
+from collections import deque
+from pathlib import Path
+from .llama_types import *
+from .llama_grammar import LlamaGrammar
+from .llama_cache import (
+    BaseLlamaCache,
+    LlamaCache,  # type: ignore
+    LlamaDiskCache,  # type: ignore
+    LlamaRAMCache,  # type: ignore
+)
+from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer
+import llama_cpp.llama_cpp as llama_cpp
+import llama_cpp.llama_chat_format as llama_chat_format
+from llama_cpp.llama_speculative import LlamaDraftModel
+import numpy as np
+import numpy.typing as npt
+import llama_cpp._internals as internals
+from ._logger import set_verbose
+from ._utils import suppress_stdout_stderr
+class Llama:
+    """High-level Python wrapper for a llama.cpp model."""
+    __backend_initialized = False
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        # Model Params
+        n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        vocab_only: bool = False,
+        use_mmap: bool = True,
+        use_mlock: bool = False,
+        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
+        # Context Params
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
+        n_ctx: int = 512,
+        n_batch: int = 512,
+        n_ubatch: int = 512,
+        n_threads: Optional[int] = None,
+        n_threads_batch: Optional[int] = None,
+        rope_scaling_type: Optional[
+            int
+        ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
+        rope_freq_base: float = 0.0,
+        rope_freq_scale: float = 0.0,
+        yarn_ext_factor: float = -1.0,
+        yarn_attn_factor: float = 1.0,
+        yarn_beta_fast: float = 32.0,
+        yarn_beta_slow: float = 1.0,
+        yarn_orig_ctx: int = 0,
+        logits_all: bool = False,
+        embedding: bool = False,
+        offload_kqv: bool = True,
+        flash_attn: bool = False,
+        op_offload: Optional[bool] = None,
+        swa_full: Optional[bool] = None,
+        # Sampling Params
+        no_perf: bool = False,
+        last_n_tokens_size: int = 64,
+        # LoRA Params
+        lora_base: Optional[str] = None,
+        lora_scale: float = 1.0,
+        lora_path: Optional[str] = None,
+        # Backend Params
+        numa: Union[bool, int] = False,
+        # Chat Format Params
+        chat_format: Optional[str] = None,
+        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
+        # Speculative Decoding
+        draft_model: Optional[LlamaDraftModel] = None,
+        # Tokenizer Override
+        tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
+        # Misc
+        spm_infill: bool = False,
+        verbose: bool = True,
+        # Extra Params
+        **kwargs,  # type: ignore
+    ):
+        """Load a llama.cpp model from `model_path`.
+        Examples:
+            Basic usage
+            >>> import llama_cpp
+            >>> model = llama_cpp.Llama(
+            ...     model_path="path/to/model",
+            ... )
+            >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"])
+            the lazy dog
+            Loading a chat model
+            >>> import llama_cpp
+            >>> model = llama_cpp.Llama(
+            ...     model_path="path/to/model",
+            ...     chat_format="llama-2",
+            ... )
+            >>> print(model.create_chat_completion(
+            ...     messages=[{
+            ...         "role": "user",
+            ...         "content": "what is the meaning of life?"
+            ...     }]
+            ... ))
+        Args:
+            model_path: Path to the model.
+            n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
+            tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            vocab_only: Only load the vocabulary no weights.
+            use_mmap: Use mmap if possible.
+            use_mlock: Force the system to keep the model in RAM.
+            kv_overrides: Key-value overrides for the model.
+            seed: RNG seed, -1 for random
+            n_ctx: Text context, 0 = from model
+            n_batch: Prompt processing maximum batch size
+            n_ubatch: Physical batch size
+            n_threads: Number of threads to use for generation
+            n_threads_batch: Number of threads to use for batch processing
+            rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
+            pooling_type: Pooling type, from `enum llama_pooling_type`.
+            rope_freq_base: RoPE base frequency, 0 = from model
+            rope_freq_scale: RoPE frequency scaling factor, 0 = from model
+            yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
+            yarn_attn_factor: YaRN magnitude scaling factor
+            yarn_beta_fast: YaRN low correction dim
+            yarn_beta_slow: YaRN high correction dim
+            yarn_orig_ctx: YaRN original context size
+            logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
+            embedding: Embedding mode only.
+            offload_kqv: Offload K, Q, V to GPU.
+            flash_attn: Use flash attention.
+            op_offload: offload host tensor operations to device
+            swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+            no_perf: Measure performance timings.
+            last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
+            lora_path: Path to a LoRA file to apply to the model.
+            numa: numa policy
+            chat_format: String specifying the chat format to use when calling create_chat_completion.
+            chat_handler: Optional chat handler to use when calling create_chat_completion.
+            draft_model: Optional draft model to use for speculative decoding.
+            tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
+            verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)
+            spm_infill: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
+        Raises:
+            ValueError: If the model path does not exist.
+        Returns:
+            A Llama instance.
+        """
+        self.verbose = verbose
+        self._stack = contextlib.ExitStack()
+        set_verbose(verbose)
+        if not Llama.__backend_initialized:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_backend_init()
+            Llama.__backend_initialized = True
+        if isinstance(numa, bool):
+            self.numa = (
+                llama_cpp.GGML_NUMA_STRATEGY_DISTRIBUTE
+                if numa
+                else llama_cpp.GGML_NUMA_STRATEGY_DISABLED
+            )
+        else:
+            self.numa = numa
+        if self.numa != llama_cpp.GGML_NUMA_STRATEGY_DISABLED:
+            with suppress_stdout_stderr(disable=verbose):
+                llama_cpp.llama_numa_init(self.numa)
+        self.model_path = model_path
+        # Model Params
+        self.model_params = llama_cpp.llama_model_default_params()
+        self.model_params.n_gpu_layers = (
+            0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
+        )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.split_mode = split_mode
+        self.model_params.main_gpu = main_gpu
+        self.tensor_split = tensor_split
+        self._c_tensor_split = None
+        if self.tensor_split is not None:
+            if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES:
+                raise ValueError(
+                    f"Attempt to split tensors that exceed maximum supported devices. Current LLAMA_MAX_DEVICES={llama_cpp.LLAMA_MAX_DEVICES}"
+                )
+            # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
+            self._c_tensor_split = FloatArray(
+                *tensor_split  # type: ignore
+            )  # keep a reference to the array so it is not gc'd
+            self.model_params.tensor_split = self._c_tensor_split
+        self.model_params.vocab_only = vocab_only
+        self.model_params.use_mmap = use_mmap if lora_path is None else False
+        self.model_params.use_mlock = use_mlock
+        # kv_overrides is the original python dict
+        self.kv_overrides = kv_overrides
+        if kv_overrides is not None:
+            # _kv_overrides_array is a ctypes.Array of llama_model_kv_override Structs
+            kvo_array_len = len(kv_overrides) + 1  # for sentinel element
+            self._kv_overrides_array = (
+                llama_cpp.llama_model_kv_override * kvo_array_len
+            )()
+            for i, (k, v) in enumerate(kv_overrides.items()):
+                self._kv_overrides_array[i].key = k.encode("utf-8")
+                if isinstance(v, bool):
+                    self._kv_overrides_array[
+                        i
+                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
+                    self._kv_overrides_array[i].value.val_bool = v
+                elif isinstance(v, int):
+                    self._kv_overrides_array[
+                        i
+                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
+                    self._kv_overrides_array[i].value.val_i64 = v
+                elif isinstance(v, float):
+                    self._kv_overrides_array[
+                        i
+                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
+                    self._kv_overrides_array[i].value.val_f64 = v
+                elif isinstance(v, str):  # type: ignore
+                    v_bytes = v.encode("utf-8")
+                    if len(v_bytes) > 128:  # TODO: Make this a constant
+                        raise ValueError(f"Value for {k} is too long: {v}")
+                    v_bytes = v_bytes.ljust(128, b"\0")
+                    self._kv_overrides_array[
+                        i
+                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
+                    # copy min(v_bytes, 128) to str_value
+                    address = typing.cast(
+                        int,
+                        ctypes.addressof(self._kv_overrides_array[i].value)
+                        + llama_cpp.llama_model_kv_override_value.val_str.offset,
+                    )
+                    buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
+                    ctypes.memmove(
+                        buffer_start,
+                        v_bytes,
+                        128,
+                    )
+                else:
+                    raise ValueError(f"Unknown value type for {k}: {v}")
+            self._kv_overrides_array[
+                -1
+            ].key = b"\0"  # ensure sentinel element is zeroed
+            self.model_params.kv_overrides = self._kv_overrides_array
+        self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
+        self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count()
+        # Used by the sampler
+        self._seed = seed or llama_cpp.LLAMA_DEFAULT_SEED
+        # Context Params
+        self.context_params = llama_cpp.llama_context_default_params()
+        self.context_params.n_ctx = n_ctx
+        self.context_params.n_batch = self.n_batch
+        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
+        self.context_params.n_threads = self.n_threads
+        self.context_params.n_threads_batch = self.n_threads_batch
+        self.context_params.rope_scaling_type = (
+            rope_scaling_type
+            if rope_scaling_type is not None
+            else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+        )
+        self.context_params.pooling_type = pooling_type
+        self.context_params.rope_freq_base = (
+            rope_freq_base if rope_freq_base != 0.0 else 0
+        )
+        self.context_params.rope_freq_scale = (
+            rope_freq_scale if rope_freq_scale != 0.0 else 0
+        )
+        self.context_params.yarn_ext_factor = (
+            yarn_ext_factor if yarn_ext_factor != 0.0 else 0
+        )
+        self.context_params.yarn_attn_factor = (
+            yarn_attn_factor if yarn_attn_factor != 0.0 else 0
+        )
+        self.context_params.yarn_beta_fast = (
+            yarn_beta_fast if yarn_beta_fast != 0.0 else 0
+        )
+        self.context_params.yarn_beta_slow = (
+            yarn_beta_slow if yarn_beta_slow != 0.0 else 0
+        )
+        self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
+        self._logits_all = logits_all if draft_model is None else True
+        self.context_params.embeddings = embedding  # TODO: Rename to embeddings
+        self.context_params.offload_kqv = offload_kqv
+        self.context_params.flash_attn = flash_attn
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
+        if swa_full is not None:
+            self.context_params.swa_full = swa_full
+        #  KV cache quantization
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
+        # Sampling Params
+        self.context_params.no_perf = no_perf
+        self.last_n_tokens_size = last_n_tokens_size
+        self.cache: Optional[BaseLlamaCache] = None
+        self.lora_base = lora_base
+        self.lora_scale = lora_scale
+        self.lora_path = lora_path
+        self.spm_infill = spm_infill
+        if not os.path.exists(model_path):
+            raise ValueError(f"Model path does not exist: {model_path}")
+        self._model = self._stack.enter_context(
+            contextlib.closing(
+                internals.LlamaModel(
+                    path_model=self.model_path,
+                    params=self.model_params,
+                    verbose=self.verbose,
+                )
+            )
+        )
+        # Override tokenizer
+        self.tokenizer_ = tokenizer or LlamaTokenizer(self)
+        # Set the default value for the context and correct the batch
+        if n_ctx == 0:
+            n_ctx = self._model.n_ctx_train()
+            self.n_batch = min(n_ctx, n_batch)
+            self.context_params.n_ctx = self._model.n_ctx_train()
+            self.context_params.n_batch = self.n_batch
+            self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
+        self._ctx = self._stack.enter_context(
+            contextlib.closing(
+                internals.LlamaContext(
+                    model=self._model,
+                    params=self.context_params,
+                    verbose=self.verbose,
+                )
+            )
+        )
+        self._batch = self._stack.enter_context(
+            contextlib.closing(
+                internals.LlamaBatch(
+                    n_tokens=self.n_batch,
+                    embd=0,
+                    n_seq_max=self.context_params.n_ctx,
+                    verbose=self.verbose,
+                )
+            )
+        )
+        self._lora_adapter: Optional[llama_cpp.llama_adapter_lora_p] = None
+        if self.lora_path:
+            self._lora_adapter = llama_cpp.llama_adapter_lora_init(
+                self._model.model,
+                self.lora_path.encode("utf-8"),
+            )
+            if self._lora_adapter is None:
+                raise RuntimeError(
+                    f"Failed to initialize LoRA adapter from lora path: {self.lora_path}"
+                )
+            def free_lora_adapter():
+                if self._lora_adapter is None:
+                    return
+                llama_cpp.llama_adapter_lora_free(self._lora_adapter)
+                self._lora_adapter = None
+            self._stack.callback(free_lora_adapter)
+            if llama_cpp.llama_set_adapter_lora(
+                self._ctx.ctx, self._lora_adapter, self.lora_scale
+            ):
+                raise RuntimeError(
+                    f"Failed to set LoRA adapter from lora path: {self.lora_path}"
+                )
+        if self.verbose:
+            print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
+        self.chat_format = chat_format
+        self.chat_handler = chat_handler
+        self._chat_handlers: Dict[
+            str, llama_chat_format.LlamaChatCompletionHandler
+        ] = {}
+        self.draft_model = draft_model
+        self._n_vocab = self.n_vocab()
+        self._n_ctx = self.n_ctx()
+        self._token_nl = self.token_nl()
+        self._token_eos = self.token_eos()
+        self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab)
+        self.n_tokens = 0
+        self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
+        self.scores: npt.NDArray[np.single] = np.ndarray(
+            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
+        )
+        self._mirostat_mu = ctypes.c_float(
+            2.0 * 5.0
+        )  # TODO: Move this to sampling context
+        try:
+            self.metadata = self._model.metadata()
+        except Exception as e:
+            self.metadata = {}
+            if self.verbose:
+                print(f"Failed to load metadata: {e}", file=sys.stderr)
+        if self.verbose:
+            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+        eos_token_id = self.token_eos()
+        bos_token_id = self.token_bos()
+        eos_token = (
+            self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
+        )
+        bos_token = (
+            self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
+        )
+        # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
+        template_choices = dict(
+            (name[10:], template)
+            for name, template in self.metadata.items()
+            if name.startswith("tokenizer.chat_template.")
+        )
+        if "tokenizer.chat_template" in self.metadata:
+            template_choices["chat_template.default"] = self.metadata[
+                "tokenizer.chat_template"
+            ]
+        if self.verbose and template_choices:
+            print(
+                f"Available chat formats from metadata: {', '.join(template_choices.keys())}",
+                file=sys.stderr,
+            )
+        for name, template in template_choices.items():
+            self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
+                template=template,
+                eos_token=eos_token,
+                bos_token=bos_token,
+                stop_token_ids=[eos_token_id],
+            ).to_chat_handler()
+        if (
+            self.chat_format is None
+            and self.chat_handler is None
+            and "chat_template.default" in template_choices
+        ):
+            chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
+                self.metadata
+            )
+            if chat_format is not None:
+                self.chat_format = chat_format
+                if self.verbose:
+                    print(f"Guessed chat format: {chat_format}", file=sys.stderr)
+            else:
+                if self.verbose:
+                    print(
+                        f"Using gguf chat template: {template_choices['chat_template.default']}",
+                        file=sys.stderr,
+                    )
+                    print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
+                    print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
+                self.chat_format = "chat_template.default"
+        if self.chat_format is None and self.chat_handler is None:
+            self.chat_format = "llama-2"
+            if self.verbose:
+                print(
+                    f"Using fallback chat format: {self.chat_format}", file=sys.stderr
+                )
+        self._sampler = None
+    @property
+    def ctx(self) -> llama_cpp.llama_context_p:
+        return self._ctx.ctx
+    @property
+    def model(self) -> llama_cpp.llama_model_p:
+        return self._model.model
+    @property
+    def _input_ids(self) -> npt.NDArray[np.intc]:
+        return self.input_ids[: self.n_tokens]
+    @property
+    def _scores(self) -> npt.NDArray[np.single]:
+        return self.scores[: self.n_tokens, :]
+    @property
+    def eval_tokens(self) -> Deque[int]:
+        return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx)
+    @property
+    def eval_logits(self) -> Deque[List[float]]:
+        return deque(
+            self.scores[: self.n_tokens, :].tolist(),
+            maxlen=self._n_ctx if self._logits_all else 1,
+        )
+    def tokenize(
+        self, text: bytes, add_bos: bool = True, special: bool = False
+    ) -> List[int]:
+        """Tokenize a string.
+        Args:
+            text: The utf-8 encoded string to tokenize.
+            add_bos: Whether to add a beginning of sequence token.
+            special: Whether to tokenize special tokens.
+        Raises:
+            RuntimeError: If the tokenization failed.
+        Returns:
+            A list of tokens.
+        """
+        return self.tokenizer_.tokenize(text, add_bos, special)
+    def detokenize(
+        self,
+        tokens: List[int],
+        prev_tokens: Optional[List[int]] = None,
+        special: bool = False,
+    ) -> bytes:
+        """Detokenize a list of tokens.
+        Args:
+            tokens: The list of tokens to detokenize.
+            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
+            special: Whether to detokenize special tokens.
+        Returns:
+            The detokenized string.
+        """
+        return self.tokenizer_.detokenize(
+            tokens, prev_tokens=prev_tokens, special=special
+        )
+    def set_cache(self, cache: Optional[BaseLlamaCache]):
+        """Set the cache.
+        Args:
+            cache: The cache to set.
+        """
+        self.cache = cache
+    def set_seed(self, seed: int):
+        """Set the random seed.
+        Args:
+            seed: The random seed.
+        """
+        self._seed = seed
+    def reset(self):
+        """Reset the model state."""
+        self.n_tokens = 0
+    def eval(self, tokens: Sequence[int]):
+        """Evaluate a list of tokens.
+        Args:
+            tokens: The list of tokens to evaluate.
+        """
+        self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+        for i in range(0, len(tokens), self.n_batch):
+            batch = tokens[i : min(len(tokens), i + self.n_batch)]
+            n_past = self.n_tokens
+            n_tokens = len(batch)
+            self._batch.set_batch(
+                batch=batch, n_past=n_past, logits_all=self._logits_all
+            )
+            self._ctx.decode(self._batch)
+            # Save tokens
+            self.input_ids[n_past : n_past + n_tokens] = batch
+            # Save logits
+            if self._logits_all:
+                rows = n_tokens
+                cols = self._n_vocab
+                logits = np.ctypeslib.as_array(
+                    self._ctx.get_logits(), shape=(rows * cols,)
+                )
+                self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
+            else:
+                # rows = 1
+                # cols = self._n_vocab
+                # logits = np.ctypeslib.as_array(
+                #     self._ctx.get_logits(), shape=(rows * cols,)
+                # )
+                # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits
+                # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all
+                pass
+            # Update n_tokens
+            self.n_tokens += n_tokens
+    def _init_sampler(
+        self,
+        top_k: int = 40,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_eta: float = 0.1,
+        mirostat_tau: float = 5.0,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+    ):
+        sampler = internals.LlamaSampler()
+        if logits_processor is not None:
+            # Create and add a custom sampler
+            def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
+                size = token_data_array.contents.size
+                data_soa = token_data_array.contents.data
+                data_soa_address = ctypes.addressof(data_soa.contents)
+                # NOTE: This is probably broken
+                recarray = np.recarray(
+                    shape=(size,),
+                    dtype=np.dtype(
+                        [("id", np.intc), ("logit", np.single), ("p", np.single)],
+                        align=True,
+                    ),
+                    buf=(llama_cpp.llama_token_data * size).from_address(
+                        data_soa_address
+                    ),
+                )
+                for logit_processor in logits_processor:
+                    recarray.logit[:] = logit_processor(self._input_ids, recarray.logit)
+            sampler.add_custom(apply_func)
+        sampler.add_penalties(
+            # n_vocab=self._n_vocab,
+            # special_eos_id=self._token_eos,
+            # linefeed_id=self._token_nl,
+            penalty_last_n=self.last_n_tokens_size,
+            penalty_repeat=repeat_penalty,
+            penalty_freq=frequency_penalty,
+            penalty_present=presence_penalty,
+            # penalize_nl=penalize_nl,
+            # ignore_eos=False,
+        )
+        if grammar is not None:
+            sampler.add_grammar(self._model, grammar)
+        if temp < 0.0:
+            sampler.add_softmax()
+            sampler.add_dist(self._seed)
+        elif temp == 0.0:
+            sampler.add_greedy()
+        else:
+            if mirostat_mode == 1:
+                mirostat_m = 100
+                sampler.add_mirostat(
+                    self._n_vocab,
+                    self._seed,
+                    mirostat_tau,
+                    mirostat_eta,
+                    mirostat_m,
+                )
+            elif mirostat_mode == 2:
+                sampler.add_mirostat_v2(
+                    self._seed,
+                    mirostat_tau,
+                    mirostat_eta,
+                )
+            else:
+                n_probs = 0
+                min_keep = max(1, n_probs)
+                sampler.add_top_k(top_k)
+                sampler.add_typical(typical_p, min_keep)
+                sampler.add_top_p(top_p, min_keep)
+                sampler.add_min_p(min_p, min_keep)
+                sampler.add_temp(temp)
+                sampler.add_dist(self._seed)
+        return sampler
+    def sample(
+        self,
+        top_k: int = 40,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_eta: float = 0.1,
+        mirostat_tau: float = 5.0,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+        idx: Optional[int] = None,
+    ):
+        """Sample a token from the model.
+        Args:
+            top_k: The top-k sampling parameter.
+            top_p: The top-p sampling parameter.
+            temp: The temperature parameter.
+            repeat_penalty: The repeat penalty parameter.
+        Returns:
+            The sampled token.
+        """
+        assert self.n_tokens > 0
+        tmp_sampler = False
+        if self._sampler is None:
+            tmp_sampler = True
+            self._sampler = self._init_sampler(
+                top_k=top_k,
+                top_p=top_p,
+                min_p=min_p,
+                typical_p=typical_p,
+                temp=temp,
+                repeat_penalty=repeat_penalty,
+                frequency_penalty=frequency_penalty,
+                presence_penalty=presence_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
+                penalize_nl=penalize_nl,
+                logits_processor=logits_processor,
+                grammar=grammar,
+            )
+        ridx = idx - self.n_tokens if idx is not None else -1
+        assert self.ctx is not None
+        token = self._sampler.sample(self._ctx, ridx)
+        if tmp_sampler:
+            self._sampler = None
+        return token
+    def generate(
+        self,
+        tokens: Sequence[int],
+        top_k: int = 40,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.0,
+        reset: bool = True,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+    ) -> Generator[int, Optional[Sequence[int]], None]:
+        """Create a generator of tokens from a prompt.
+        Examples:
+            >>> llama = Llama("models/ggml-7b.bin")
+            >>> tokens = llama.tokenize(b"Hello, world!")
+            >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.0):
+            ...     print(llama.detokenize([token]))
+        Args:
+            tokens: The prompt tokens.
+            top_k: The top-k sampling parameter.
+            top_p: The top-p sampling parameter.
+            temp: The temperature parameter.
+            repeat_penalty: The repeat penalty parameter.
+            reset: Whether to reset the model state.
+        Yields:
+            The generated tokens.
+        """
+        # Reset mirostat sampling
+        self._mirostat_mu = ctypes.c_float(2.0 * mirostat_tau)
+        self._sampler = self._init_sampler(
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            temp=temp,
+            repeat_penalty=repeat_penalty,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            penalize_nl=penalize_nl,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        )
+        # Check for kv cache prefix match
+        if reset and self.n_tokens > 0:
+            longest_prefix = 0
+            for a, b in zip(self._input_ids, tokens[:-1]):
+                if a == b:
+                    longest_prefix += 1
+                else:
+                    break
+            if longest_prefix > 0:
+                reset = False
+                tokens = tokens[longest_prefix:]
+                self.n_tokens = longest_prefix
+                if self.verbose:
+                    print(
+                        f"Llama.generate: {longest_prefix} prefix-match hit, "
+                        f"remaining {len(tokens)} prompt tokens to eval",
+                        file=sys.stderr,
+                    )
+        # Reset the model state
+        if reset:
+            self.reset()
+        # # Reset the grammar
+        # if grammar is not None:
+        #     grammar.reset()
+        sample_idx = self.n_tokens + len(tokens) - 1
+        tokens = list(tokens)
+        # Eval and sample
+        while True:
+            self.eval(tokens)
+            while sample_idx < self.n_tokens:
+                token = self.sample(
+                    top_k=top_k,
+                    top_p=top_p,
+                    min_p=min_p,
+                    typical_p=typical_p,
+                    temp=temp,
+                    repeat_penalty=repeat_penalty,
+                    frequency_penalty=frequency_penalty,
+                    presence_penalty=presence_penalty,
+                    tfs_z=tfs_z,
+                    mirostat_mode=mirostat_mode,
+                    mirostat_tau=mirostat_tau,
+                    mirostat_eta=mirostat_eta,
+                    logits_processor=logits_processor,
+                    grammar=grammar,
+                    penalize_nl=penalize_nl,
+                    idx=sample_idx,
+                )
+                sample_idx += 1
+                if stopping_criteria is not None and stopping_criteria(
+                    self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                ):
+                    return
+                tokens_or_none = yield token
+                tokens.clear()
+                tokens.append(token)
+                if tokens_or_none is not None:
+                    tokens.extend(tokens_or_none)
+                if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
+                    self.n_tokens = sample_idx
+                    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    break
+            if self.draft_model is not None:
+                self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens
+                draft_tokens = self.draft_model(
+                    self.input_ids[: self.n_tokens + len(tokens)]
+                )
+                tokens.extend(
+                    draft_tokens.astype(int)[
+                        : self._n_ctx - self.n_tokens - len(tokens)
+                    ]
+                )
+    def create_embedding(
+        self, input: Union[str, List[str]], model: Optional[str] = None
+    ) -> CreateEmbeddingResponse:
+        """Embed a string.
+        Args:
+            input: The utf-8 encoded string to embed.
+        Returns:
+            An embedding object.
+        """
+        model_name: str = model if model is not None else self.model_path
+        input = input if isinstance(input, list) else [input]
+        # get numeric embeddings
+        embeds: Union[List[List[float]], List[List[List[float]]]]
+        total_tokens: int
+        embeds, total_tokens = self.embed(input, return_count=True)  # type: ignore
+        # convert to CreateEmbeddingResponse
+        data: List[Embedding] = [
+            {
+                "object": "embedding",
+                "embedding": emb,
+                "index": idx,
+            }
+            for idx, emb in enumerate(embeds)
+        ]
+        return {
+            "object": "list",
+            "data": data,
+            "model": model_name,
+            "usage": {
+                "prompt_tokens": total_tokens,
+                "total_tokens": total_tokens,
+            },
+        }
+    def embed(
+        self,
+        input: Union[str, List[str]],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: bool = False,
+    ):
+        """Embed a string.
+        Args:
+            input: The utf-8 encoded string to embed.
+        Returns:
+            A list of embeddings
+        """
+        n_embd = self.n_embd()
+        n_batch = self.n_batch
+        # get pooling information
+        pooling_type = self.pooling_type()
+        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        if self.context_params.embeddings is False:
+            raise RuntimeError(
+                "Llama model must be created with embedding=True to call this method"
+            )
+        if self.verbose:
+            llama_cpp.llama_perf_context_reset(self._ctx.ctx)
+        if isinstance(input, str):
+            inputs = [input]
+        else:
+            inputs = input
+        # reset batch
+        self._batch.reset()
+        # decode and fetch embeddings
+        data: Union[List[List[float]], List[List[List[float]]]] = []
+        def decode_batch(seq_sizes: List[int]):
+            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            self._ctx.decode(self._batch)
+            self._batch.reset()
+            # store embeddings
+            if pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE:
+                pos: int = 0
+                for i, size in enumerate(seq_sizes):
+                    ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx)
+                    embedding: List[List[float]] = [
+                        ptr[pos + j * n_embd : pos + (j + 1) * n_embd]
+                        for j in range(size)
+                    ]
+                    if normalize:
+                        embedding = [
+                            internals.normalize_embedding(e) for e in embedding
+                        ]
+                    data.append(embedding)
+                    pos += size
+            else:
+                for i in range(len(seq_sizes)):
+                    ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i)
+                    embedding: List[float] = ptr[:n_embd]
+                    if normalize:
+                        embedding = internals.normalize_embedding(embedding)
+                    data.append(embedding)
+        # init state
+        total_tokens = 0
+        s_batch = []
+        t_batch = 0
+        p_batch = 0
+        # accumulate batches and encode
+        for text in inputs:
+            tokens = self.tokenize(text.encode("utf-8"))
+            if truncate:
+                tokens = tokens[:n_batch]
+            n_tokens = len(tokens)
+            total_tokens += n_tokens
+            # check for overrun
+            if n_tokens > n_batch:
+                raise ValueError(
+                    f"Requested tokens ({n_tokens}) exceed batch size of {n_batch}"
+                )
+            # time to eval batch
+            if t_batch + n_tokens > n_batch:
+                decode_batch(s_batch)
+                s_batch = []
+                t_batch = 0
+                p_batch = 0
+            # add to batch
+            self._batch.add_sequence(tokens, p_batch, logits_all)
+            # update batch stats
+            s_batch.append(n_tokens)
+            t_batch += n_tokens
+            p_batch += 1
+        # hanlde last batch
+        decode_batch(s_batch)
+        if self.verbose:
+            llama_cpp.llama_perf_context_print(self._ctx.ctx)
+        output = data[0] if isinstance(input, str) else data
+        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        self.reset()
+        if return_count:
+            return output, total_tokens
+        else:
+            return output
+    def _create_completion(
+        self,
+        prompt: Union[str, List[int]],
+        suffix: Optional[str] = None,
+        max_tokens: Optional[int] = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repeat_penalty: float = 1.0,
+        top_k: int = 40,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
+    ) -> Union[
+        Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
+    ]:
+        assert suffix is None or suffix.__class__ is str
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        bos_token_id: int = self.token_bos()
+        cls_token_id: int = self._model.token_cls()
+        sep_token_id: int = self._model.token_sep()
+        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        add_space_prefix: bool = (
+            self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
+        )
+        bos_tokens: List[int] = [cls_token_id if cls_token_id != -1 else bos_token_id]
+        eos_tokens: List[int] = [
+            sep_token_id if sep_token_id != -1 else self.token_eos()
+        ]
+        if (
+            (isinstance(prompt, list) and suffix is None)
+            or not self._model.add_bos_token()
+            or bos_tokens[:1] == [-1]
+        ):
+            bos_tokens = []
+        if (isinstance(prompt, list) and suffix is None) or (
+            not self._model.add_eos_token() and sep_token_id == -1
+        ):
+            eos_tokens = []
+        suffix_space_prefix: int = 0
+        # Tokenizer hack to remove leading space
+        if add_space_prefix and suffix_token_id >= 0 and suffix:
+            suffix = "☺" + suffix
+            suffix_space_prefix = 2
+        # If prompt is empty, initialize completion with BOS token to avoid
+        # detokenization including a space at the beginning of the completion
+        completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id]
+        # Add blank space to start of prompt to match OG llama tokenizer
+        prefix_tokens: List[int] = (
+            [prefix_token_id] if prefix_token_id >= 0 and suffix is not None else []
+        ) + (
+            (
+                self.tokenize(
+                    prompt.encode("utf-8"),
+                    add_bos=False,
+                    special=(prefix_token_id < 0 or suffix is None),
+                )
+                if prompt != ""
+                else []
+            )
+            if isinstance(prompt, str)
+            else prompt
+        )
+        suffix_tokens: List[int] = (
+            (
+                [suffix_token_id]
+                + (
+                    self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)[
+                        suffix_space_prefix:
+                    ]
+                    if suffix
+                    else []
+                )
+            )
+            if suffix_token_id >= 0 and suffix is not None
+            else []
+        )
+        middle_tokens: List[int] = (
+            [middle_token_id] if middle_token_id >= 0 and suffix is not None else []
+        )
+        prompt_tokens: List[int] = (
+            bos_tokens
+            + (
+                (suffix_tokens + prefix_tokens + middle_tokens)
+                if self.spm_infill
+                else (prefix_tokens + suffix_tokens + middle_tokens)
+            )
+            + eos_tokens
+        )
+        text: bytes = b""
+        returned_tokens: int = 0
+        stop = (
+            stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
+        )
+        model_name: str = model if model is not None else self.model_path
+        if prompt_tokens[:2] == [self.token_bos()] * 2:
+            warnings.warn(
+                f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...',
+                RuntimeWarning,
+            )
+        # NOTE: This likely doesn't work correctly for the first token in the prompt
+        # because of the extra space added to the start of the prompt_tokens
+        if logit_bias is not None:
+            logit_bias_map = {int(k): float(v) for k, v in logit_bias.items()}
+            def logit_bias_processor(
+                input_ids: npt.NDArray[np.intc],
+                scores: npt.NDArray[np.single],
+            ) -> npt.NDArray[np.single]:
+                new_scores = np.copy(
+                    scores
+                )  # Does it make sense to copy the whole array or can we just overwrite the original one?
+                for input_id, score in logit_bias_map.items():
+                    new_scores[input_id] = score + scores[input_id]
+                return new_scores
+            _logit_bias_processor = LogitsProcessorList([logit_bias_processor])
+            if logits_processor is None:
+                logits_processor = _logit_bias_processor
+            else:
+                logits_processor = logits_processor.extend(_logit_bias_processor)
+        if self.verbose:
+            self._ctx.reset_timings()
+        if len(prompt_tokens) >= self._n_ctx:
+            raise ValueError(
+                f"Requested tokens ({len(prompt_tokens)}) exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+            )
+        if max_tokens is None or max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            max_tokens = self._n_ctx - len(prompt_tokens)
+        # Truncate max_tokens if requested tokens would exceed the context window
+        max_tokens = (
+            max_tokens
+            if max_tokens + len(prompt_tokens) < self._n_ctx
+            else (self._n_ctx - len(prompt_tokens))
+        )
+        if stop != []:
+            stop_sequences = [s.encode("utf-8") for s in stop]
+        else:
+            stop_sequences = []
+        if logprobs is not None and self._logits_all is False:
+            raise ValueError(
+                "logprobs is not supported for models created with logits_all=False"
+            )
+        if self.cache:
+            try:
+                cache_item = self.cache[prompt_tokens]
+                cache_prefix_len = Llama.longest_token_prefix(
+                    cache_item.input_ids.tolist(), prompt_tokens
+                )
+                eval_prefix_len = Llama.longest_token_prefix(
+                    self._input_ids.tolist(), prompt_tokens
+                )
+                if cache_prefix_len > eval_prefix_len:
+                    self.load_state(cache_item)
+                    if self.verbose:
+                        print("Llama._create_completion: cache hit", file=sys.stderr)
+            except KeyError:
+                if self.verbose:
+                    print("Llama._create_completion: cache miss", file=sys.stderr)
+        if seed is not None:
+            self.set_seed(seed)
+        else:
+            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+        finish_reason = "length"
+        multibyte_fix = 0
+        for token in self.generate(
+            prompt_tokens,
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            temp=temperature,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            repeat_penalty=repeat_penalty,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        ):
+            if llama_cpp.llama_token_is_eog(self._model.vocab, token):
+                text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+                finish_reason = "stop"
+                break
+            completion_tokens.append(token)
+            all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+            # Contains multi-byte UTF8
+            for k, char in enumerate(all_text[-3:]):
+                k = 3 - k
+                for num, pattern in [(2, 192), (3, 224), (4, 240)]:
+                    # Bitwise AND check
+                    if num > k and pattern & char == pattern:
+                        multibyte_fix = num - k
+            # Stop incomplete bytes from passing
+            if multibyte_fix > 0:
+                multibyte_fix -= 1
+                continue
+            any_stop = [s for s in stop_sequences if s in all_text]
+            if len(any_stop) > 0:
+                first_stop = any_stop[0]
+                text = all_text[: all_text.index(first_stop)]
+                finish_reason = "stop"
+                break
+            if stream:
+                remaining_tokens = completion_tokens[returned_tokens:]
+                remaining_text = self.detokenize(
+                    remaining_tokens,
+                    prev_tokens=prompt_tokens + completion_tokens[:returned_tokens],
+                )
+                remaining_length = len(remaining_text)
+                # We want to avoid yielding any characters from
+                # the generated text if they are part of a stop
+                # sequence.
+                first_stop_position = 0
+                for s in stop_sequences:
+                    for i in range(min(len(s), remaining_length), 0, -1):
+                        if remaining_text.endswith(s[:i]):
+                            if i > first_stop_position:
+                                first_stop_position = i
+                            break
+                token_end_position = 0
+                if logprobs is not None:
+                    # not sure how to handle this branch when dealing
+                    # with CJK output, so keep it unchanged
+                    for token in remaining_tokens:
+                        if token == bos_token_id:
+                            continue
+                        token_end_position += len(
+                            self.detokenize(
+                                [token],
+                                prev_tokens=prompt_tokens
+                                + completion_tokens[:returned_tokens],
+                            )
+                        )
+                        # Check if stop sequence is in the token
+                        if token_end_position > (
+                            remaining_length - first_stop_position
+                        ):
+                            break
+                        token_str = self.detokenize(
+                            [token],
+                            prev_tokens=prompt_tokens
+                            + completion_tokens[:returned_tokens],
+                        ).decode("utf-8", errors="ignore")
+                        text_offset = len(prompt) + len(
+                            self.detokenize(
+                                completion_tokens[:returned_tokens],
+                                prev_tokens=prompt_tokens
+                                + completion_tokens[:returned_tokens],
+                            ).decode("utf-8", errors="ignore")
+                        )
+                        token_offset = len(prompt_tokens) + returned_tokens
+                        logits = self._scores[token_offset - 1, :]
+                        current_logprobs = Llama.logits_to_logprobs(logits).tolist()
+                        sorted_logprobs = list(
+                            sorted(
+                                zip(current_logprobs, range(len(current_logprobs))),
+                                reverse=True,
+                            )
+                        )
+                        top_logprob = {
+                            self.detokenize([i]).decode(
+                                "utf-8", errors="ignore"
+                            ): logprob
+                            for logprob, i in sorted_logprobs[:logprobs]
+                        }
+                        top_logprob.update({token_str: current_logprobs[int(token)]})
+                        logprobs_or_none = {
+                            "tokens": [
+                                self.detokenize(
+                                    [token],
+                                    prev_tokens=prompt_tokens
+                                    + completion_tokens[:returned_tokens],
+                                ).decode("utf-8", errors="ignore")
+                            ],
+                            "text_offset": [text_offset],
+                            "token_logprobs": [current_logprobs[int(token)]],
+                            "top_logprobs": [top_logprob],
+                        }
+                        returned_tokens += 1
+                        yield {
+                            "id": completion_id,
+                            "object": "text_completion",
+                            "created": created,
+                            "model": model_name,
+                            "choices": [
+                                {
+                                    "text": self.detokenize(
+                                        [token],
+                                        prev_tokens=prompt_tokens
+                                        + completion_tokens[:returned_tokens],
+                                    ).decode("utf-8", errors="ignore"),
+                                    "index": 0,
+                                    "logprobs": logprobs_or_none,
+                                    "finish_reason": None,
+                                }
+                            ],
+                        }
+                else:
+                    while len(remaining_tokens) > 0:
+                        decode_success = False
+                        for i in range(1, len(remaining_tokens) + 1):
+                            try:
+                                bs = self.detokenize(
+                                    remaining_tokens[:i],
+                                    prev_tokens=prompt_tokens
+                                    + completion_tokens[:returned_tokens],
+                                )
+                                ts = bs.decode("utf-8")
+                                decode_success = True
+                                break
+                            except UnicodeError:
+                                pass
+                        else:
+                            break
+                        if not decode_success:
+                            # all remaining tokens cannot be decoded to a UTF-8 character
+                            break
+                        token_end_position += len(bs)
+                        if token_end_position > (
+                            remaining_length - first_stop_position
+                        ):
+                            break
+                        remaining_tokens = remaining_tokens[i:]
+                        returned_tokens += i
+                        yield {
+                            "id": completion_id,
+                            "object": "text_completion",
+                            "created": created,
+                            "model": model_name,
+                            "choices": [
+                                {
+                                    "text": ts,
+                                    "index": 0,
+                                    "logprobs": None,
+                                    "finish_reason": None,
+                                }
+                            ],
+                        }
+            if len(completion_tokens) >= max_tokens:
+                text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+                finish_reason = "length"
+                break
+        if stopping_criteria is not None and stopping_criteria(
+            self._input_ids, self._scores[-1, :]
+        ):
+            text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+            finish_reason = "stop"
+        if self.verbose:
+            self._ctx.print_timings()
+        if stream:
+            remaining_tokens = completion_tokens[returned_tokens:]
+            remaining_text = self.detokenize(
+                remaining_tokens,
+                prev_tokens=prompt_tokens + completion_tokens[:returned_tokens],
+            )
+            any_stop = [s for s in stop_sequences if s in remaining_text]
+            if len(any_stop) > 0:
+                end = min(remaining_text.index(stop) for stop in any_stop)
+            else:
+                end = len(remaining_text)
+            token_end_position = 0
+            for token in remaining_tokens:
+                token_end_position += len(
+                    self.detokenize(
+                        [token],
+                        prev_tokens=prompt_tokens + completion_tokens[:returned_tokens],
+                    )
+                )
+                logprobs_or_none: Optional[CompletionLogprobs] = None
+                if logprobs is not None:
+                    if token == bos_token_id:
+                        continue
+                    token_str = self.detokenize([token]).decode(
+                        "utf-8", errors="ignore"
+                    )
+                    text_offset = len(prompt) + len(
+                        self.detokenize(
+                            completion_tokens[:returned_tokens],
+                            prev_tokens=prompt_tokens
+                            + completion_tokens[:returned_tokens],
+                        )
+                    )
+                    token_offset = len(prompt_tokens) + returned_tokens - 1
+                    logits = self._scores[token_offset, :]
+                    current_logprobs = Llama.logits_to_logprobs(logits).tolist()
+                    sorted_logprobs = list(
+                        sorted(
+                            zip(current_logprobs, range(len(current_logprobs))),
+                            reverse=True,
+                        )
+                    )
+                    top_logprob = {
+                        self.detokenize([i]).decode("utf-8", errors="ignore"): logprob
+                        for logprob, i in sorted_logprobs[:logprobs]
+                    }
+                    top_logprob.update({token_str: current_logprobs[int(token)]})
+                    logprobs_or_none = {
+                        "tokens": [
+                            self.detokenize([token]).decode("utf-8", errors="ignore")
+                        ],
+                        "text_offset": [text_offset],
+                        "token_logprobs": [current_logprobs[int(token)]],
+                        "top_logprobs": [top_logprob],
+                    }
+                if token_end_position >= end:
+                    last_text = self.detokenize([token])
+                    if token_end_position == end - 1:
+                        break
+                    returned_tokens += 1
+                    yield {
+                        "id": completion_id,
+                        "object": "text_completion",
+                        "created": created,
+                        "model": model_name,
+                        "choices": [
+                            {
+                                "text": last_text[
+                                    : len(last_text) - (token_end_position - end)
+                                ].decode("utf-8", errors="ignore"),
+                                "index": 0,
+                                "logprobs": logprobs_or_none,
+                                "finish_reason": None,
+                            }
+                        ],
+                    }
+                    break
+                returned_tokens += 1
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "text": self.detokenize([token]).decode(
+                                "utf-8", errors="ignore"
+                            ),
+                            "index": 0,
+                            "logprobs": logprobs_or_none,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": "",
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
+                if self.verbose:
+                    print("Llama._create_completion: cache saved", file=sys.stderr)
+            return
+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+        text_str = text.decode("utf-8", errors="ignore")
+        if echo:
+            text_str = prompt + text_str
+        if suffix_token_id < 0 and suffix is not None:
+            text_str = text_str + suffix
+        logprobs_or_none: Optional[CompletionLogprobs] = None
+        if logprobs is not None:
+            text_offset = 0 if echo else len(prompt)
+            token_offset = 0 if echo else len(prompt_tokens[1:])
+            text_offsets: List[int] = []
+            token_logprobs: List[Optional[float]] = []
+            tokens: List[str] = []
+            top_logprobs: List[Optional[Dict[str, float]]] = []
+            if echo:
+                # Remove leading BOS token if exists
+                all_tokens = (
+                    prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0 :]
+                    + completion_tokens
+                )
+            else:
+                all_tokens = completion_tokens
+            all_token_strs = [
+                self.detokenize([token], prev_tokens=all_tokens[:i]).decode(
+                    "utf-8", errors="ignore"
+                )
+                for i, token in enumerate(all_tokens)
+            ]
+            all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
+            # TODO: may be able to change this loop to use np.take_along_dim
+            for idx, (token, token_str, logprobs_token) in enumerate(
+                zip(all_tokens, all_token_strs, all_logprobs)
+            ):
+                if token == bos_token_id:
+                    continue
+                text_offsets.append(
+                    text_offset
+                    + len(
+                        self.detokenize(all_tokens[:idx]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                )
+                tokens.append(token_str)
+                sorted_logprobs = list(
+                    sorted(
+                        zip(logprobs_token, range(len(logprobs_token))), reverse=True
+                    )
+                )
+                token_logprobs.append(logprobs_token[int(token)])
+                top_logprob: Optional[Dict[str, float]] = {
+                    self.detokenize([i], prev_tokens=all_tokens[:idx]).decode(
+                        "utf-8", errors="ignore"
+                    ): logprob
+                    for logprob, i in sorted_logprobs[:logprobs]
+                }
+                top_logprob.update({token_str: logprobs_token[int(token)]})
+                top_logprobs.append(top_logprob)
+            # Weird idosincracy of the OpenAI API where
+            # token_logprobs and top_logprobs are null for
+            # the first token.
+            if echo and len(all_tokens) > 0:
+                token_logprobs[0] = None
+                top_logprobs[0] = None
+            logprobs_or_none = {
+                "tokens": tokens,
+                "text_offset": text_offsets,
+                "token_logprobs": token_logprobs,
+                "top_logprobs": top_logprobs,
+            }
+        yield {
+            "id": completion_id,
+            "object": "text_completion",
+            "created": created,
+            "model": model_name,
+            "choices": [
+                {
+                    "text": text_str,
+                    "index": 0,
+                    "logprobs": logprobs_or_none,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": {
+                "prompt_tokens": len(prompt_tokens),
+                "completion_tokens": len(completion_tokens),
+                "total_tokens": len(prompt_tokens) + len(completion_tokens),
+            },
+        }
+    def create_completion(
+        self,
+        prompt: Union[str, List[int]],
+        suffix: Optional[str] = None,
+        max_tokens: Optional[int] = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repeat_penalty: float = 1.0,
+        top_k: int = 40,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
+        """Generate text from a prompt.
+        Args:
+            prompt: The prompt to generate text from.
+            suffix: A suffix to append to the generated text. If None, no suffix is appended.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+            logprobs: The number of logprobs to return. If None, no logprobs are returned.
+            echo: Whether to echo the prompt.
+            stop: A list of strings to stop generation when encountered.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            stream: Whether to stream the results.
+            seed: The seed to use for sampling.
+            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+            model: The name to use for the model in the completion object.
+            stopping_criteria: A list of stopping criteria to use.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use for constrained sampling.
+            logit_bias: A logit bias to use.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Response object containing the generated text.
+        """
+        completion_or_chunks = self._create_completion(
+            prompt=prompt,
+            suffix=suffix,
+            max_tokens=-1 if max_tokens is None else max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            logprobs=logprobs,
+            echo=echo,
+            stop=stop,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            repeat_penalty=repeat_penalty,
+            top_k=top_k,
+            stream=stream,
+            seed=seed,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
+            grammar=grammar,
+            logit_bias=logit_bias,
+        )
+        if stream:
+            chunks: Iterator[CreateCompletionStreamResponse] = completion_or_chunks
+            return chunks
+        completion: Completion = next(completion_or_chunks)  # type: ignore
+        return completion
+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: Optional[int] = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repeat_penalty: float = 1.0,
+        top_k: int = 40,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
+    ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
+        """Generate text from a prompt.
+        Args:
+            prompt: The prompt to generate text from.
+            suffix: A suffix to append to the generated text. If None, no suffix is appended.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+            logprobs: The number of logprobs to return. If None, no logprobs are returned.
+            echo: Whether to echo the prompt.
+            stop: A list of strings to stop generation when encountered.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            stream: Whether to stream the results.
+            seed: The seed to use for sampling.
+            tfs_z: The tail-free sampling parameter. Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+            mirostat_eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+            model: The name to use for the model in the completion object.
+            stopping_criteria: A list of stopping criteria to use.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use for constrained sampling.
+            logit_bias: A logit bias to use.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Response object containing the generated text.
+        """
+        return self.create_completion(
+            prompt=prompt,
+            suffix=suffix,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            logprobs=logprobs,
+            echo=echo,
+            stop=stop,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            repeat_penalty=repeat_penalty,
+            top_k=top_k,
+            stream=stream,
+            seed=seed,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
+            grammar=grammar,
+            logit_bias=logit_bias,
+        )
+    def create_chat_completion(
+        self,
+        messages: List[ChatCompletionRequestMessage],
+        functions: Optional[List[ChatCompletionFunction]] = None,
+        function_call: Optional[ChatCompletionRequestFunctionCall] = None,
+        tools: Optional[List[ChatCompletionTool]] = None,
+        tool_choice: Optional[ChatCompletionToolChoiceOption] = None,
+        temperature: float = 0.2,
+        top_p: float = 0.95,
+        top_k: int = 40,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        stream: bool = False,
+        stop: Optional[Union[str, List[str]]] = [],
+        seed: Optional[int] = None,
+        response_format: Optional[ChatCompletionRequestResponseFormat] = None,
+        max_tokens: Optional[int] = None,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
+        repeat_penalty: float = 1.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+    ) -> Union[
+        CreateChatCompletionResponse, Iterator[CreateChatCompletionStreamResponse]
+    ]:
+        """Generate a chat completion from a list of messages.
+        Args:
+            messages: A list of messages to generate a response for.
+            functions: A list of functions to use for the chat completion.
+            function_call: A function call to use for the chat completion.
+            tools: A list of tools to use for the chat completion.
+            tool_choice: A tool choice to use for the chat completion.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for nucleus sampling. Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            top_k: The top-k value to use for sampling. Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+            min_p: The min-p value to use for minimum p sampling. Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+            typical_p: The typical-p value to use for sampling. Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+            stream: Whether to stream the results.
+            stop: A list of strings to stop generation when encountered.
+            seed: The seed to use for sampling.
+            response_format: The response format to use for the chat completion. Use { "type": "json_object" } to contstrain output to only valid json.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0 or None, the maximum number of tokens to generate is unlimited and depends on n_ctx.
+            presence_penalty: The penalty to apply to tokens based on their presence in the prompt.
+            frequency_penalty: The penalty to apply to tokens based on their frequency in the prompt.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            tfs_z: The tail-free sampling parameter.
+            mirostat_mode: The mirostat sampling mode.
+            mirostat_tau: The mirostat sampling tau parameter.
+            mirostat_eta: The mirostat sampling eta parameter.
+            model: The name to use for the model in the completion object.
+            logits_processor: A list of logits processors to use.
+            grammar: A grammar to use.
+            logit_bias: A logit bias to use.
+        Returns:
+            Generated chat completion or a stream of chat completion chunks.
+        """
+        handler = (
+            self.chat_handler
+            or self._chat_handlers.get(self.chat_format)
+            or llama_chat_format.get_chat_completion_handler(self.chat_format)
+        )
+        return handler(
+            llama=self,
+            messages=messages,
+            functions=functions,
+            function_call=function_call,
+            tools=tools,
+            tool_choice=tool_choice,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            typical_p=typical_p,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+            stream=stream,
+            stop=stop,
+            seed=seed,
+            response_format=response_format,
+            max_tokens=max_tokens,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            repeat_penalty=repeat_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            model=model,
+            logits_processor=logits_processor,
+            grammar=grammar,
+            logit_bias=logit_bias,
+        )
+    def create_chat_completion_openai_v1(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        """Generate a chat completion with return type based on the the OpenAI v1 API.
+        OpenAI python package is required to use this method.
+        You can install it with `pip install openai`.
+        Args:
+            *args: Positional arguments to pass to create_chat_completion.
+            **kwargs: Keyword arguments to pass to create_chat_completion.
+        Returns:
+            Generated chat completion or a stream of chat completion chunks.
+        """
+        try:
+            from openai.types.chat import ChatCompletion, ChatCompletionChunk
+            stream = kwargs.get("stream", False)  # type: ignore
+            assert isinstance(stream, bool)
+            if stream:
+                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+            else:
+                return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "To use create_chat_completion_openai_v1, you must install the openai package."
+                "You can install it with `pip install openai`."
+            )
+    def __getstate__(self):
+        return dict(
+            model_path=self.model_path,
+            # Model Params
+            n_gpu_layers=self.model_params.n_gpu_layers,
+            split_mode=self.model_params.split_mode,
+            main_gpu=self.model_params.main_gpu,
+            tensor_split=self.tensor_split,
+            vocab_only=self.model_params.vocab_only,
+            use_mmap=self.model_params.use_mmap,
+            use_mlock=self.model_params.use_mlock,
+            kv_overrides=self.kv_overrides,
+            # Context Params
+            seed=self._seed,
+            n_ctx=self.context_params.n_ctx,
+            n_batch=self.n_batch,
+            n_ubatch=self.context_params.n_ubatch,
+            n_threads=self.context_params.n_threads,
+            n_threads_batch=self.context_params.n_threads_batch,
+            rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
+            rope_freq_base=self.context_params.rope_freq_base,
+            rope_freq_scale=self.context_params.rope_freq_scale,
+            yarn_ext_factor=self.context_params.yarn_ext_factor,
+            yarn_attn_factor=self.context_params.yarn_attn_factor,
+            yarn_beta_fast=self.context_params.yarn_beta_fast,
+            yarn_beta_slow=self.context_params.yarn_beta_slow,
+            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
+            logits_all=self._logits_all,
+            embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
+            flash_attn=self.context_params.flash_attn,
+            op_offload=self.context_params.op_offload,
+            swa_full=self.context_params.swa_full,
+            # Sampling Params
+            no_perf=self.context_params.no_perf,
+            last_n_tokens_size=self.last_n_tokens_size,
+            # LoRA Params
+            lora_base=self.lora_base,
+            lora_scale=self.lora_scale,
+            lora_path=self.lora_path,
+            # Backend Params
+            numa=self.numa,
+            # Chat Format Params
+            chat_format=self.chat_format,
+            chat_handler=self.chat_handler,
+            # Speculative Decidng
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
+            # Misc
+            spm_infill=self.spm_infill,
+            verbose=self.verbose,
+        )
+    def __setstate__(self, state):
+        self.__init__(**state)
+    def save_state(self) -> LlamaState:
+        if self.verbose:
+            print("Llama.save_state: saving llama state", file=sys.stderr)
+        state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
+        if self.verbose:
+            print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
+        llama_state = (ctypes.c_uint8 * int(state_size))()
+        if self.verbose:
+            print("Llama.save_state: allocated state", file=sys.stderr)
+        n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
+        if self.verbose:
+            print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
+        if int(n_bytes) > int(state_size):
+            raise RuntimeError("Failed to copy llama state data")
+        llama_state_compact = (ctypes.c_uint8 * int(n_bytes))()
+        llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes))
+        if self.verbose:
+            print(
+                f"Llama.save_state: saving {n_bytes} bytes of llama state",
+                file=sys.stderr,
+            )
+        return LlamaState(
+            scores=self._scores.copy(),
+            input_ids=self.input_ids.copy(),
+            n_tokens=self.n_tokens,
+            llama_state=bytes(llama_state_compact),
+            llama_state_size=n_bytes,
+            seed=self._seed,
+        )
+    def load_state(self, state: LlamaState) -> None:
+        # Only filling in up to `n_tokens` and then zero-ing out the rest
+        self.scores[: state.n_tokens, :] = state.scores.copy()
+        rest = self.scores[state.n_tokens :, :]
+        rest[rest > 0] = 0.0
+        self.input_ids = state.input_ids.copy()
+        self.n_tokens = state.n_tokens
+        self._seed = state.seed
+        state_size = state.llama_state_size
+        LLamaStateArrayType = ctypes.c_uint8 * state_size
+        llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
+        if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
+            raise RuntimeError("Failed to set llama state data")
+    def n_ctx(self) -> int:
+        """Return the context window size."""
+        return self._ctx.n_ctx()
+    def n_embd(self) -> int:
+        """Return the embedding size."""
+        return self._model.n_embd()
+    def n_vocab(self) -> int:
+        """Return the vocabulary size."""
+        return self._model.n_vocab()
+    def tokenizer(self) -> LlamaTokenizer:
+        """Return the llama tokenizer for this model."""
+        return LlamaTokenizer(self)
+    def token_eos(self) -> int:
+        """Return the end-of-sequence token."""
+        return self._model.token_eos()
+    def token_bos(self) -> int:
+        """Return the beginning-of-sequence token."""
+        return self._model.token_bos()
+    def token_nl(self) -> int:
+        """Return the newline token."""
+        return self._model.token_nl()
+    def pooling_type(self) -> str:
+        """Return the pooling type."""
+        return self._ctx.pooling_type()
+    def close(self) -> None:
+        """Explicitly free the model from memory."""
+        self._stack.close()
+    def __del__(self) -> None:
+        self.close()
+    @staticmethod
+    def logits_to_logprobs(
+        logits: Union[npt.NDArray[np.single], List], axis: int = -1
+    ) -> npt.NDArray[np.single]:
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html
+        logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True)
+        if logits_maxs.ndim > 0:
+            logits_maxs[~np.isfinite(logits_maxs)] = 0
+        elif not np.isfinite(logits_maxs):
+            logits_maxs = 0
+        subtract_maxs = np.subtract(logits, logits_maxs, dtype=np.single)
+        exp = np.exp(subtract_maxs)
+        # Suppress warnings about log of zero
+        with np.errstate(divide="ignore"):
+            summed = np.sum(exp, axis=axis, keepdims=True)
+            out = np.log(summed)
+        return subtract_maxs - out
+    @staticmethod
+    def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
+        longest_prefix = 0
+        for _a, _b in zip(a, b):
+            if _a == _b:
+                longest_prefix += 1
+            else:
+                break
+        return longest_prefix
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id: str,
+        filename: Optional[str],
+        additional_files: Optional[List] = None,
+        local_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+        cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
+        **kwargs: Any,
+    ) -> "Llama":
+        """Create a Llama model from a pretrained model name or path.
+        This method requires the huggingface-hub package.
+        You can install it with `pip install huggingface-hub`.
+        Args:
+            repo_id: The model repo id.
+            filename: A filename or glob pattern to match the model file in the repo.
+            additional_files: A list of filenames or glob patterns to match additional model files in the repo.
+            local_dir: The local directory to save the model to.
+            local_dir_use_symlinks: Whether to use symlinks when downloading the model.
+            **kwargs: Additional keyword arguments to pass to the Llama constructor.
+        Returns:
+            A Llama model."""
+        try:
+            from huggingface_hub import hf_hub_download, HfFileSystem
+            from huggingface_hub.utils import validate_repo_id
+        except ImportError:
+            raise ImportError(
+                "Llama.from_pretrained requires the huggingface-hub package. "
+                "You can install it with `pip install huggingface-hub`."
+            )
+        validate_repo_id(repo_id)
+        hffs = HfFileSystem()
+        files = [
+            file["name"] if isinstance(file, dict) else file
+            for file in hffs.ls(repo_id, recursive=True)
+        ]
+        # split each file into repo_id, subfolder, filename
+        file_list: List[str] = []
+        for file in files:
+            rel_path = Path(file).relative_to(repo_id)
+            file_list.append(str(rel_path))
+        # find the only/first shard file:
+        matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)]  # type: ignore
+        if len(matching_files) == 0:
+            raise ValueError(
+                f"No file found in {repo_id} that match {filename}\n\n"
+                f"Available Files:\n{json.dumps(file_list)}"
+            )
+        if len(matching_files) > 1:
+            raise ValueError(
+                f"Multiple files found in {repo_id} matching {filename}\n\n"
+                f"Available Files:\n{json.dumps(files)}"
+            )
+        (matching_file,) = matching_files
+        subfolder = str(Path(matching_file).parent)
+        filename = Path(matching_file).name
+        # download the file
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            subfolder=subfolder,
+            local_dir=local_dir,
+            local_dir_use_symlinks=local_dir_use_symlinks,
+            cache_dir=cache_dir,
+        )
+        if additional_files:
+            for additonal_file_name in additional_files:
+                # find the additional shard file:
+                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                if len(matching_additional_files) == 0:
+                    raise ValueError(
+                        f"No file found in {repo_id} that match {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(file_list)}"
+                    )
+                if len(matching_additional_files) > 1:
+                    raise ValueError(
+                        f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n"
+                        f"Available Files:\n{json.dumps(files)}"
+                    )
+                (matching_additional_file,) = matching_additional_files
+                # download the additional file
+                hf_hub_download(
+                    repo_id=repo_id,
+                    filename=matching_additional_file,
+                    subfolder=subfolder,
+                    local_dir=local_dir,
+                    local_dir_use_symlinks=local_dir_use_symlinks,
+                    cache_dir=cache_dir,
+                )
+        if local_dir is None:
+            model_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                subfolder=subfolder,
+                local_dir=local_dir,
+                local_dir_use_symlinks=local_dir_use_symlinks,
+                cache_dir=cache_dir,
+                local_files_only=True,
+            )
+        else:
+            model_path = os.path.join(local_dir, filename)
+        # loading the first file of a sharded GGUF loads all remaining shard files in the subfolder
+        return cls(
+            model_path=model_path,
+            **kwargs,
+        )
+class LlamaState:
+    def __init__(
+        self,
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
+        n_tokens: int,
+        llama_state: bytes,
+        llama_state_size: int,
+        seed: int,
+    ):
+        self.input_ids = input_ids
+        self.scores = scores
+        self.n_tokens = n_tokens
+        self.llama_state = llama_state
+        self.llama_state_size = llama_state_size
+        self.seed = seed
+LogitsProcessor = Callable[
+    [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single]
+]
+class LogitsProcessorList(List[LogitsProcessor]):
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        for processor in self:
+            scores = processor(input_ids, scores)
+        return scores
+StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool]
+class StoppingCriteriaList(List[StoppingCriteria]):
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
+    ) -> bool:
+        return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores

llama_cpp/llama_cache.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import sys
+from abc import ABC, abstractmethod
+from typing import (
+    Optional,
+    Sequence,
+    Tuple,
+)
+from collections import OrderedDict
+import diskcache
+import llama_cpp.llama
+from .llama_types import *
+class BaseLlamaCache(ABC):
+    """Base cache class for a llama.cpp model."""
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        self.capacity_bytes = capacity_bytes
+    @property
+    @abstractmethod
+    def cache_size(self) -> int:
+        raise NotImplementedError
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        pass
+    @abstractmethod
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        raise NotImplementedError
+    @abstractmethod
+    def __contains__(self, key: Sequence[int]) -> bool:
+        raise NotImplementedError
+    @abstractmethod
+    def __setitem__(
+        self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"
+    ) -> None:
+        raise NotImplementedError
+class LlamaRAMCache(BaseLlamaCache):
+    """Cache for a llama.cpp model using RAM."""
+    def __init__(self, capacity_bytes: int = (2 << 30)):
+        super().__init__(capacity_bytes)
+        self.capacity_bytes = capacity_bytes
+        self.cache_state: OrderedDict[
+            Tuple[int, ...], "llama_cpp.llama.LlamaState"
+        ] = OrderedDict()
+    @property
+    def cache_size(self):
+        return sum([state.llama_state_size for state in self.cache_state.values()])
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        min_len = 0
+        min_key = None
+        keys = (
+            (k, llama_cpp.llama.Llama.longest_token_prefix(k, key))
+            for k in self.cache_state.keys()
+        )
+        for k, prefix_len in keys:
+            if prefix_len > min_len:
+                min_len = prefix_len
+                min_key = k
+        return min_key
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        key = tuple(key)
+        _key = self._find_longest_prefix_key(key)
+        if _key is None:
+            raise KeyError("Key not found")
+        value = self.cache_state[_key]
+        self.cache_state.move_to_end(_key)
+        return value
+    def __contains__(self, key: Sequence[int]) -> bool:
+        return self._find_longest_prefix_key(tuple(key)) is not None
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
+        key = tuple(key)
+        if key in self.cache_state:
+            del self.cache_state[key]
+        self.cache_state[key] = value
+        while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0:
+            self.cache_state.popitem(last=False)
+# Alias for backwards compatibility
+LlamaCache = LlamaRAMCache
+class LlamaDiskCache(BaseLlamaCache):
+    """Cache for a llama.cpp model using disk."""
+    def __init__(
+        self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
+    ):
+        super().__init__(capacity_bytes)
+        self.cache = diskcache.Cache(cache_dir)
+    @property
+    def cache_size(self):
+        return int(self.cache.volume())  # type: ignore
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
+        min_len = 0
+        min_key: Optional[Tuple[int, ...]] = None
+        for k in self.cache.iterkeys():  # type: ignore
+            prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key)
+            if prefix_len > min_len:
+                min_len = prefix_len
+                min_key = k  # type: ignore
+        return min_key
+    def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
+        key = tuple(key)
+        _key = self._find_longest_prefix_key(key)
+        if _key is None:
+            raise KeyError("Key not found")
+        value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key)  # type: ignore
+        # NOTE: This puts an integer as key in cache, which breaks,
+        # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
+        # self.cache.push(_key, side="front")  # type: ignore
+        return value
+    def __contains__(self, key: Sequence[int]) -> bool:
+        return self._find_longest_prefix_key(tuple(key)) is not None
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
+        print("LlamaDiskCache.__setitem__: called", file=sys.stderr)
+        key = tuple(key)
+        if key in self.cache:
+            print("LlamaDiskCache.__setitem__: delete", file=sys.stderr)
+            del self.cache[key]
+        self.cache[key] = value
+        print("LlamaDiskCache.__setitem__: set", file=sys.stderr)
+        while self.cache_size > self.capacity_bytes and len(self.cache) > 0:
+            key_to_remove = next(iter(self.cache))
+            del self.cache[key_to_remove]
+        print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)

llama_cpp/llama_chat_format.py ADDED Viewed

The diff for this file is too large to render. See raw diff

llama_cpp/llama_cpp.py ADDED Viewed

The diff for this file is too large to render. See raw diff

llama_cpp/llama_grammar.py ADDED Viewed

	@@ -0,0 +1,953 @@

+"""Python implementation of llama grammar parser directly translated from C++ source file in vendor/llama.cpp/common/grammar-parser.cpp."""
+# flake8: noqa
+from pathlib import Path
+from itertools import groupby
+from typing import (
+    Any,
+    Set,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+LLAMA_GRAMMAR_DEFAULT_ROOT = "root"
+class LlamaGrammar:
+    def __init__(self, *args, _grammar: str, **kwargs):
+        self._grammar = _grammar
+        self._root = LLAMA_GRAMMAR_DEFAULT_ROOT
+    @classmethod
+    def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar":
+        return cls(_grammar=grammar)
+    @classmethod
+    def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar":
+        try:
+            with open(file) as f:
+                grammar = f.read()
+        except Exception as err:
+            raise Exception(
+                f"{cls.from_file.__name__}: error reading grammar file: {err}"
+            )
+        if grammar:
+            return cls.from_string(grammar, verbose=verbose)
+        raise ValueError(
+            f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty"
+        )
+    @classmethod
+    def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar":
+        return cls.from_string(json_schema_to_gbnf(json_schema), verbose=verbose)
+"""llama.cpp gbnf rules from vendor/llama.cpp/grammars"""
+# Grammar for one or more lines of the form `expr = term`, where expressions
+# combine identifiers, numbers and parenthesised sub-expressions with - + * /.
+ARITHMETIC_GBNF = r"""
+root  ::= (expr "=" ws term "\n")+
+expr  ::= term ([-+*/] term)*
+term  ::= ident | num | "(" ws expr ")" ws
+ident ::= [a-z] [a-z0-9_]* ws
+num   ::= [0-9]+ ws
+ws    ::= [ \t\n]*
+"""
+# Grammar for a small C-like subset: function declarations returning int, float
+# or char, whose bodies contain assignments, calls, return, while/for/if
+# statements and comments.
+C_GBNF = r"""
+root ::= (declaration)*
+declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
+dataType  ::= "int" ws | "float" ws | "char" ws
+identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
+parameter ::= dataType identifier
+statement ::=
+    ( dataType identifier ws "=" ws expression ";" ) |
+    ( identifier ws "=" ws expression ";" ) |
+    ( identifier ws "(" argList? ")" ";" ) |
+    ( "return" ws expression ";" ) |
+    ( "while" "(" condition ")" "{" statement* "}" ) |
+    ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
+    ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
+    ( singleLineComment ) |
+    ( multiLineComment )
+forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
+forUpdate ::= identifier ws "=" ws expression
+condition ::= expression relationOperator expression
+relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
+expression ::= term (("+" | "-") term)*
+term ::= factor(("*" | "/") factor)*
+factor ::= identifier | number | unaryTerm | funcCall | parenExpression
+unaryTerm ::= "-" factor
+funcCall ::= identifier "(" argList? ")"
+parenExpression ::= "(" ws expression ws ")"
+argList ::= expression ("," ws expression)*
+number ::= [0-9]+
+singleLineComment ::= "//" [^\n]* "\n"
+multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
+ws ::= ([ \t\n]+)
+"""
+# NOTE(review): despite its name, this text is a JSON grammar (root ::= object)
+# with no chess-specific rules; it is identical to JAPANESE_GBNF. Confirm
+# whether a dedicated chess grammar was intended.
+CHESS_GBNF = r"""
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+"""
+# NOTE(review): despite its name, this text is a JSON grammar (root ::= object)
+# with no Japanese-specific rules; it is identical to CHESS_GBNF. Confirm
+# whether a dedicated Japanese grammar was intended.
+JAPANESE_GBNF = r"""
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+"""
+# JSON grammar whose root is an array: the top-level array opens with "[\n",
+# separates elements with ",\n" and allows no whitespace after the closing "]".
+JSON_ARR_GBNF = r"""
+# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
+# Useful for generating JSON arrays
+root   ::= arr
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+arr  ::=
+  "[\n" ws (
+            value
+    (",\n" ws value)*
+  )? "]"
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+string ::=
+  "\"" (
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+"""
+# JSON object grammar written with bounded repetitions ({4}, {0,15}, {0,20})
+# for unicode escapes, number digits and indentation whitespace.
+JSON_GBNF = r"""
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+string ::=
+  "\"" (
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
+  )* "\"" ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= | " " | "\n" [ \t]{0,20}
+"""
+# Grammar for one or more list items, each written as "- <text>" on its own line.
+LIST_GBNF = r"""
+root ::= item+
+# Excludes various line break characters
+item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"
+"""
+"""llama.cpp json-schema to grammar converter from vendor/llama.cpp/examples/json-schema-to-grammar.py"""
+import json
+import re
+from typing import List, Optional
+# whitespace is constrained to a single space char to prevent model "running away" in
+# whitespace. Also maybe improves generation quality?
+SPACE_RULE = '" "?'
+INVALID_RULE_CHARS_RE = re.compile(r"[^a-zA-Z0-9-]+")
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
+GRAMMAR_LITERAL_ESCAPES = {"\r": "\\r", "\n": "\\n", '"': '\\"'}
+# whitespace is constrained to a single space char to prevent model "running away" in
+# whitespace. Also maybe improves generation quality?
+SPACE_RULE = '" "?'
+def _build_repetition(
+    item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False
+):
+    if not separator_rule:
+        if min_items == 0 and max_items == 1:
+            return f"{item_rule}?"
+        elif min_items == 1 and max_items is None:
+            return f"{item_rule}+"
+    result = ""
+    if min_items > 0:
+        if item_rule_is_literal and separator_rule is None:
+            result = '"' + (item_rule[1:-1] * min_items) + '"'
+        else:
+            result = (f" {separator_rule} " if separator_rule else " ").join(
+                [item_rule] * min_items
+            )
+    def opt_repetitions(up_to_n, prefix_with_sep=False):
+        """
+        - n=4, no sep:             '(a (a (a (a)?)?)?)?'
+        - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
+        - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+        """
+        content = (
+            f"{separator_rule} {item_rule}"
+            if prefix_with_sep and separator_rule
+            else item_rule
+        )
+        if up_to_n == 0:
+            return ""
+        elif up_to_n == 1:
+            return f"({content})?"
+        elif separator_rule and not prefix_with_sep:
+            return f"({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?"
+        else:
+            return (f"({content} " * up_to_n).rstrip() + (")?" * up_to_n)
+    if min_items > 0 and max_items != min_items:
+        result += " "
+    if max_items is not None:
+        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+    else:
+        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+        if min_items == 0 and separator_rule:
+            result = f"({item_rule} {item_operator}*)?"
+        else:
+            result += f"{item_operator}*"
+    return result
+class BuiltinRule:
+    def __init__(self, content: str, deps: list = None):
+        self.content = content
+        self.deps = deps or []
+# Zero to fifteen optional digits, shared by the numeric rules below.
+_up_to_15_digits = _build_repetition("[0-9]", 0, 15)
+# Built-in rules keyed by rule name; each entry lists the rule names that its
+# content references.
+PRIMITIVE_RULES = {
+    "boolean": BuiltinRule('("true" | "false") space', []),
+    "decimal-part": BuiltinRule("[0-9] " + _up_to_15_digits, []),
+    "integral-part": BuiltinRule("[0-9] | [1-9] " + _up_to_15_digits, []),
+    "number": BuiltinRule(
+        '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space',
+        ["integral-part", "decimal-part"],
+    ),
+    "integer": BuiltinRule('("-"? integral-part) space', ["integral-part"]),
+    "value": BuiltinRule(
+        "object | array | string | number | boolean | null",
+        ["object", "array", "string", "number", "boolean", "null"],
+    ),
+    "object": BuiltinRule(
+        '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+        ["string", "value"],
+    ),
+    "array": BuiltinRule(
+        '"[" space ( value ("," space value)* )? "]" space', ["value"]
+    ),
+    # Quoted hex groups of 8-4-4-4-12 digits separated by "-".
+    "uuid": BuiltinRule(
+        r'"\"" '
+        + ' "-" '.join("[0-9a-fA-F]" * n for n in [8, 4, 4, 4, 12])
+        + r' "\"" space',
+        [],
+    ),
+    "char": BuiltinRule(
+        r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])',
+        [],
+    ),
+    "string": BuiltinRule(r'"\"" char* "\"" space', ["char"]),
+    "null": BuiltinRule('"null" space', []),
+}
+# TODO: support "uri", "email" string formats
+# Built-in rules for date/time string formats. The "*-string" variants wrap
+# the bare rule in double quotes followed by the `space` rule.
+STRING_FORMAT_RULES = {
+    # Four-digit year, month 01-12, day 01-31.
+    "date": BuiltinRule(
+        '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+        [],
+    ),
+    # HH:MM:SS, optional three-digit fraction, then "Z" or a +/-HH:MM offset.
+    "time": BuiltinRule(
+        '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+        [],
+    ),
+    "date-time": BuiltinRule('date "T" time', ["date", "time"]),
+    "date-string": BuiltinRule('"\\"" date "\\"" space', ["date"]),
+    "time-string": BuiltinRule('"\\"" time "\\"" space', ["time"]),
+    "date-time-string": BuiltinRule('"\\"" date-time "\\"" space', ["date-time"]),
+}
+# Character class spanning the code point range U+00000000..U+0010FFFF.
+DOTALL = "[\\U00000000-\\U0010FFFF]"
+# Character class excluding \x0A and \x0D (line feed and carriage return).
+DOT = "[^\\x0A\\x0D]"
+# Rule names already taken: "root", "dot" and every built-in rule name.
+RESERVED_NAMES = set(
+    ["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]
+)
+# NOTE(review): presumably the regex metacharacters consulted when converting
+# patterns to grammar rules; confirm against the code that uses these sets.
+NON_LITERAL_SET = set("|.()[]{}*+?")
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set("[]()|{}*+?")
+class SchemaConverter:
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
+        self._prop_order = prop_order
+        self._allow_fetch = allow_fetch
+        self._dotall = dotall
+        self._raw_pattern = raw_pattern
+        self._rules = {
+            "space": SPACE_RULE,
+        }
+        self._refs = {}
+        self._refs_being_resolved = set()
+    def _format_literal(self, literal):
+        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
+        )
+        return f'"{escaped}"'
+    def not_literal(
+        self, literal: str, dotall: bool = True, maybe_escaped_underscores=False
+    ) -> str:
+        """
+        not_literal('a') -> '[^a]'
+        not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
+        """
+        assert len(literal) > 0, "Empty literal not supported"
+        def recurse(i: int):
+            c = literal[i]
+            if maybe_escaped_underscores and c == "_":
+                yield f"[^{c}\\\\]"
+                yield " | "
+                yield f'"\\\\"? "{c}"'
+            else:
+                yield f"[^{c}]"
+            if i < len(literal) - 1:
+                yield " | "
+                yield self._format_literal(c)
+                yield " ("
+                yield from recurse(i + 1)
+                yield ")?"
+        return "".join(("(", *recurse(0), ")"))
+    def _add_rule(self, name, rule):
+        esc_name = INVALID_RULE_CHARS_RE.sub("-", name)
+        if esc_name not in self._rules or self._rules[esc_name] == rule:
+            key = esc_name
+        else:
+            i = 0
+            while (
+                f"{esc_name}{i}" in self._rules
+                and self._rules[f"{esc_name}{i}"] != rule
+            ):
+                i += 1
+            key = f"{esc_name}{i}"
+        self._rules[key] = rule
+        return key
+    def resolve_refs(self, schema: dict, url: str):
+        """
+        Resolves all $ref fields in the given schema, fetching any remote schemas,
+        replacing $ref with absolute reference URL and populating self._refs with the
+        respective referenced (sub)schema dictionaries.
+        schema: parsed schema to walk; local "#/..." refs are looked up inside it.
+        url: prefix prepended to local "#/..." refs to make them absolute.
+        Returns the result of visiting schema: the same object, unless the top-level
+        node is an "https://" $ref without a fragment, in which case the fetched
+        schema is returned.
+        Raises ValueError for a $ref that starts with neither "https://" nor "#/", and
+        AssertionError when fetching is not allowed or a ref path segment is missing.
+        """
+        def visit(n: dict):
+            # Lists are walked element by element; the new list is only used by the caller
+            # that receives it (nested calls below discard the return value).
+            if isinstance(n, list):
+                return [visit(x) for x in n]
+            elif isinstance(n, dict):
+                ref = n.get("$ref")
+                # Only refs that have not been recorded yet are resolved here.
+                if ref is not None and ref not in self._refs:
+                    if ref.startswith("https://"):
+                        assert (
+                            self._allow_fetch
+                        ), "Fetching remote schemas is not allowed (use --allow-fetch for force)"
+                        # Imported lazily so the dependency is only needed when fetching.
+                        import requests
+                        frag_split = ref.split("#")
+                        base_url = frag_split[0]
+                        # Remote documents are cached in self._refs under their base URL.
+                        target = self._refs.get(base_url)
+                        if target is None:
+                            target = self.resolve_refs(
+                                requests.get(ref).json(), base_url
+                            )
+                            self._refs[base_url] = target
+                        # No fragment: the whole remote document is the target.
+                        if len(frag_split) == 1 or frag_split[-1] == "":
+                            return target
+                    elif ref.startswith("#/"):
+                        # Local ref: resolve against the current schema and rewrite the
+                        # node in place so that its $ref becomes absolute.
+                        target = schema
+                        ref = f"{url}{ref}"
+                        n["$ref"] = ref
+                    else:
+                        raise ValueError(f"Unsupported ref {ref}")
+                    # Walk the fragment path ("/a/b" -> "a", "b") down from the target.
+                    for sel in ref.split("#")[-1].split("/")[1:]:
+                        assert (
+                            target is not None and sel in target
+                        ), f"Error resolving ref {ref}: {sel} not in {target}"
+                        target = target[sel]
+                    self._refs[ref] = target
+                else:
+                    # No unresolved $ref on this node: recurse into every value.
+                    for v in n.values():
+                        visit(v)
+            return n
+        return visit(schema)
+    def _generate_union_rule(self, name, alt_schemas):
+        return " | ".join(
+            (
+                self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
+                for i, alt_schema in enumerate(alt_schemas)
+            )
+        )
+    def _visit_pattern(self, pattern, name):
+        """
+        Transforms a regular expression pattern into a GBNF rule.
+        Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
+        Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+        Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
+        Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
+        we define sub-rules to keep the output lean.
+        Returns the name under which the resulting rule was registered.
+        Raises AssertionError for patterns not anchored with "^" and "$", for "(?"
+        groups and for unbalanced brackets, and ValueError for a malformed {..} quantifier.
+        """
+        assert pattern.startswith("^") and pattern.endswith(
+            "$"
+        ), 'Pattern must start with "^" and end with "$"'
+        # Drop the anchors; everything below works on the inner pattern.
+        pattern = pattern[1:-1]
+        # Maps a quantified sub-expression to the helper rule created for it.
+        sub_rule_ids = {}
+        # Cursor into pattern, shared by all nested transform() calls.
+        i = 0
+        length = len(pattern)
+        def to_rule(s: Tuple[str, bool]) -> str:
+            # Literals are stored unquoted and only get their quotes here.
+            (txt, is_literal) = s
+            return '"' + txt + '"' if is_literal else txt
+        def transform() -> Tuple[str, bool]:
+            """
+            Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
+            """
+            nonlocal i
+            nonlocal pattern
+            nonlocal sub_rule_ids
+            start = i
+            # For each component of this sequence, store its string representation and whether it's a literal.
+            # We only need a flat structure here to apply repetition operators to the last item, and
+            # to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
+            # (GBNF's syntax is luckily very close to regular expressions!)
+            seq: list[Tuple[str, bool]] = []
+            def get_dot():
+                if self._dotall:
+                    rule = DOTALL
+                else:
+                    # Accept any character... except \n and \r line break chars (\x0A and \x0D)
+                    rule = DOT
+                return self._add_rule(f"dot", rule)
+            def join_seq():
+                nonlocal seq
+                # ret holds seq with adjacent literals merged into one literal.
+                ret = []
+                for is_literal, g in groupby(seq, lambda x: x[1]):
+                    if is_literal:
+                        ret.append(("".join(x[0] for x in g), True))
+                    else:
+                        ret.extend(g)
+                if len(ret) == 1:
+                    return ret[0]
+                # NOTE(review): this joins the unmerged seq rather than ret, so the merged
+                # literals are only used in the single-item case above - confirm whether
+                # ret was intended here.
+                return (" ".join(to_rule(x) for x in seq), False)
+            while i < length:
+                c = pattern[i]
+                if c == ".":
+                    seq.append((get_dot(), False))
+                    i += 1
+                elif c == "(":
+                    i += 1
+                    # "(?" introduces group modifiers / lookarounds, which are rejected.
+                    if i < length:
+                        assert (
+                            pattern[i] != "?"
+                        ), f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
+                    # The group body is parsed recursively until its closing ")".
+                    seq.append((f"({to_rule(transform())})", False))
+                elif c == ")":
+                    i += 1
+                    # A ")" is only valid if this call was started right after a "(".
+                    assert (
+                        start > 0 and pattern[start - 1] == "("
+                    ), f"Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}"
+                    return join_seq()
+                elif c == "[":
+                    # Character classes are copied through verbatim, keeping escapes paired.
+                    square_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != "]":
+                        if pattern[i] == "\\":
+                            square_brackets += pattern[i : i + 2]
+                            i += 2
+                        else:
+                            square_brackets += pattern[i]
+                            i += 1
+                    assert (
+                        i < length
+                    ), f"Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    square_brackets += "]"
+                    i += 1
+                    seq.append((square_brackets, False))
+                elif c == "|":
+                    seq.append(("|", False))
+                    i += 1
+                elif c in ("*", "+", "?"):
+                    # The operator is attached to the previous item.
+                    # NOTE(review): seq[-1] raises IndexError if nothing precedes the operator.
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
+                    i += 1
+                elif c == "{":
+                    # Collect the text of the {..} quantifier up to the closing brace.
+                    curly_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != "}":
+                        curly_brackets += pattern[i]
+                        i += 1
+                    assert (
+                        i < length
+                    ), f"Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}"
+                    curly_brackets += "}"
+                    i += 1
+                    nums = [s.strip() for s in curly_brackets[1:-1].split(",")]
+                    min_times = 0
+                    max_times = None
+                    try:
+                        if len(nums) == 1:
+                            # {x}: exactly x repetitions.
+                            min_times = int(nums[0])
+                            max_times = min_times
+                        else:
+                            # {x,y} with either bound optional: missing min is 0, missing max is unbounded.
+                            assert len(nums) == 2
+                            min_times = int(nums[0]) if nums[0] else 0
+                            max_times = int(nums[1]) if nums[1] else None
+                    except ValueError:
+                        raise ValueError(
+                            f"Invalid quantifier {curly_brackets} in /{pattern}/"
+                        )
+                    # NOTE(review): seq[-1] raises IndexError if nothing precedes the quantifier.
+                    (sub, sub_is_literal) = seq[-1]
+                    if not sub_is_literal:
+                        # Non-literal items are moved into a named helper rule, reused
+                        # when the same sub-expression is quantified again.
+                        id = sub_rule_ids.get(sub)
+                        if id is None:
+                            id = self._add_rule(f"{name}-{len(sub_rule_ids) + 1}", sub)
+                            sub_rule_ids[sub] = id
+                        sub = id
+                    seq[-1] = (
+                        _build_repetition(
+                            f'"{sub}"' if sub_is_literal else sub,
+                            min_times,
+                            max_times,
+                            item_rule_is_literal=sub_is_literal,
+                        ),
+                        False,
+                    )
+                else:
+                    # Anything else starts a run of literal characters.
+                    literal = ""
+                    while i < length:
+                        if pattern[i] == "\\" and i < length - 1:
+                            next = pattern[i + 1]
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
+                                # Regex-only escape: keep the character, drop the backslash.
+                                i += 1
+                                literal += pattern[i]
+                                i += 1
+                            else:
+                                # Other escapes are copied through together with the backslash.
+                                literal += pattern[i : i + 2]
+                                i += 2
+                        elif pattern[i] == '"' and not self._raw_pattern:
+                            literal += '\\"'
+                            i += 1
+                        elif pattern[i] not in NON_LITERAL_SET and (
+                            i == length - 1
+                            or literal == ""
+                            or pattern[i + 1] == "."
+                            or pattern[i + 1] not in NON_LITERAL_SET
+                        ):
+                            # A character directly followed by an operator is left out of a
+                            # non-empty run, so it starts its own literal on the next pass
+                            # and the operator applies to it alone.
+                            literal += pattern[i]
+                            i += 1
+                        else:
+                            break
+                    if literal:
+                        seq.append((literal, True))
+            return join_seq()
+        # Unless raw_pattern is set, the rule is wrapped in quote literals and followed by space.
+        return self._add_rule(
+            name,
+            (
+                to_rule(transform())
+                if self._raw_pattern
+                else '"\\"" ' + to_rule(transform()) + ' "\\"" space'
+            ),
+        )
+    def _resolve_ref(self, ref):
+        ref_name = ref.split("/")[-1]
+        if ref_name not in self._rules and ref not in self._refs_being_resolved:
+            self._refs_being_resolved.add(ref)
+            resolved = self._refs[ref]
+            ref_name = self.visit(resolved, ref_name)
+            self._refs_being_resolved.remove(ref)
+        return ref_name
+    def _generate_constant_rule(self, value):
+        return self._format_literal(json.dumps(value))
+    def visit(self, schema, name):
+        """
+        Converts one schema node into grammar rules and returns the name of the rule
+        that represents it.
+        The branches are tried in order and the first match wins: $ref, oneOf/anyOf,
+        list-valued type, const, enum, object with properties or non-True
+        additionalProperties, allOf, array with items/prefixItems, string pattern,
+        uuid formats, other known string formats, string length bounds, generic
+        object or empty schema, and finally the primitive types.
+        Raises AssertionError when the schema type is not a known primitive.
+        """
+        schema_type = schema.get("type")
+        schema_format = schema.get("format")
+        # Reserved names get a "-" suffix; an empty name becomes "root".
+        rule_name = name + "-" if name in RESERVED_NAMES else name or "root"
+        if (ref := schema.get("$ref")) is not None:
+            return self._add_rule(rule_name, self._resolve_ref(ref))
+        elif "oneOf" in schema or "anyOf" in schema:
+            # oneOf and anyOf are both emitted as a plain alternation.
+            return self._add_rule(
+                rule_name,
+                self._generate_union_rule(name, schema.get("oneOf") or schema["anyOf"]),
+            )
+        elif isinstance(schema_type, list):
+            # A list of types is treated as a union of single-type schemas.
+            return self._add_rule(
+                rule_name,
+                self._generate_union_rule(name, [{"type": t} for t in schema_type]),
+            )
+        elif "const" in schema:
+            return self._add_rule(
+                rule_name, self._generate_constant_rule(schema["const"])
+            )
+        elif "enum" in schema:
+            rule = " | ".join((self._generate_constant_rule(v) for v in schema["enum"]))
+            return self._add_rule(rule_name, rule)
+        elif schema_type in (None, "object") and (
+            "properties" in schema
+            or (
+                "additionalProperties" in schema
+                and schema["additionalProperties"] is not True
+            )
+        ):
+            required = set(schema.get("required", []))
+            properties = list(schema.get("properties", {}).items())
+            return self._add_rule(
+                rule_name,
+                self._build_object_rule(
+                    properties, required, name, schema.get("additionalProperties")
+                ),
+            )
+        elif schema_type in (None, "object") and "allOf" in schema:
+            # allOf: merge the properties of all components into a single object rule.
+            required = set()
+            properties = []
+            hybrid_name = name
+            def add_component(comp_schema, is_required):
+                # A $ref component is replaced by its already-resolved target.
+                if (ref := comp_schema.get("$ref")) is not None:
+                    comp_schema = self._refs[ref]
+                if "properties" in comp_schema:
+                    for prop_name, prop_schema in comp_schema["properties"].items():
+                        properties.append((prop_name, prop_schema))
+                        if is_required:
+                            required.add(prop_name)
+            for t in schema["allOf"]:
+                if "anyOf" in t:
+                    # Properties that come from anyOf alternatives are optional.
+                    for tt in t["anyOf"]:
+                        add_component(tt, is_required=False)
+                else:
+                    # Properties of every other component are all marked required.
+                    add_component(t, is_required=True)
+            return self._add_rule(
+                rule_name,
+                self._build_object_rule(
+                    properties, required, hybrid_name, additional_properties=[]
+                ),
+            )
+        elif schema_type in (None, "array") and (
+            "items" in schema or "prefixItems" in schema
+        ):
+            items = schema.get("items") or schema["prefixItems"]
+            if isinstance(items, list):
+                # Tuple form: one rule per position, comma separated, fixed length.
+                return self._add_rule(
+                    rule_name,
+                    '"[" space '
+                    + ' "," space '.join(
+                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
+                        for i, item in enumerate(items)
+                    )
+                    + ' "]" space',
+                )
+            else:
+                # Homogeneous array: repeat the item rule within minItems/maxItems.
+                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
+                min_items = schema.get("minItems", 0)
+                max_items = schema.get("maxItems")
+                return self._add_rule(
+                    rule_name,
+                    '"[" space '
+                    + _build_repetition(
+                        item_rule_name, min_items, max_items, separator_rule='"," space'
+                    )
+                    + ' "]" space',
+                )
+        elif schema_type in (None, "string") and "pattern" in schema:
+            return self._visit_pattern(schema["pattern"], rule_name)
+        elif schema_type in (None, "string") and re.match(
+            r"^uuid[1-5]?$", schema_format or ""
+        ):
+            # uuid, uuid1 ... uuid5 all share the same "uuid" primitive rule body.
+            return self._add_primitive(
+                "root" if rule_name == "root" else schema_format,
+                PRIMITIVE_RULES["uuid"],
+            )
+        elif (
+            schema_type in (None, "string")
+            and f"{schema_format}-string" in STRING_FORMAT_RULES
+        ):
+            prim_name = f"{schema_format}-string"
+            return self._add_rule(
+                rule_name,
+                self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]),
+            )
+        elif schema_type == "string" and (
+            "minLength" in schema or "maxLength" in schema
+        ):
+            # Length-bounded string: a quoted repetition of the "char" primitive.
+            char_rule = self._add_primitive("char", PRIMITIVE_RULES["char"])
+            min_len = schema.get("minLength", 0)
+            max_len = schema.get("maxLength")
+            return self._add_rule(
+                rule_name,
+                r'"\"" '
+                + _build_repetition(char_rule, min_len, max_len)
+                + r' "\"" space',
+            )
+        elif (schema_type == "object") or (len(schema) == 0):
+            # Unconstrained object, or an empty schema: fall back to the generic object rule.
+            return self._add_rule(
+                rule_name, self._add_primitive("object", PRIMITIVE_RULES["object"])
+            )
+        else:
+            assert schema_type in PRIMITIVE_RULES, f"Unrecognized schema: {schema}"
+            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+            return self._add_primitive(
+                "root" if rule_name == "root" else schema_type,
+                PRIMITIVE_RULES[schema_type],
+            )
+    def _add_primitive(self, name: str, rule: BuiltinRule):
+        n = self._add_rule(name, rule.content)
+        for dep in rule.deps:
+            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+            assert dep_rule, f"Rule {dep} not known"
+            if dep not in self._rules:
+                self._add_primitive(dep, dep_rule)
+        return n
+    def _build_object_rule(
+        self,
+        properties: List[Tuple[str, Any]],
+        required: Set[str],
+        name: str,
+        additional_properties: Union[bool, Any],
+    ):
+        """
+        Builds and returns the rule body (not the rule name) for a JSON object.
+        properties: (property name, property schema) pairs.
+        required: names of the properties that must be present.
+        name: prefix for the helper rules registered along the way.
+        additional_properties: True or a schema dict enables extra "*" key/value
+            pairs after the declared ones; any other value disables them.
+        Required properties are emitted first, in sorted order, followed by one
+        optional group covering the remaining properties in sorted order.
+        """
+        prop_order = self._prop_order
+        # sort by position in prop_order (if specified) then by original order
+        sorted_props = [
+            kv[0]
+            for _, kv in sorted(
+                enumerate(properties),
+                key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]),
+            )
+        ]
+        # One "<name>-<prop>-kv" rule per property: quoted key, ":", then the value rule.
+        prop_kv_rule_names = {}
+        for prop_name, prop_schema in properties:
+            prop_rule_name = self.visit(
+                prop_schema, f'{name}{"-" if name else ""}{prop_name}'
+            )
+            prop_kv_rule_names[prop_name] = self._add_rule(
+                f'{name}{"-" if name else ""}{prop_name}-kv',
+                rf'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}',
+            )
+        required_props = [k for k in sorted_props if k in required]
+        optional_props = [k for k in sorted_props if k not in required]
+        if additional_properties == True or isinstance(additional_properties, dict):
+            # Extra properties are modelled as a pseudo-property "*" with a string key
+            # and a value rule built from the additionalProperties schema ({} for True).
+            sub_name = f'{name}{"-" if name else ""}additional'
+            value_rule = self.visit(
+                {} if additional_properties == True else additional_properties,
+                f"{sub_name}-value",
+            )
+            prop_kv_rule_names["*"] = self._add_rule(
+                f"{sub_name}-kv",
+                self._add_primitive("string", PRIMITIVE_RULES["string"])
+                + f' ":" space {value_rule}',
+            )
+            optional_props.append("*")
+        rule = '"{" space '
+        rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
+        if optional_props:
+            rule += " ("
+            if required_props:
+                # A comma is needed between the required part and the first optional one.
+                rule += ' "," space ( '
+            def get_recursive_refs(ks, first_is_optional):
+                # Builds the expression for "property ks[0], then optionally each of the rest";
+                # the tail is stored in a "<name>-<k>-rest" helper rule.
+                [k, *rest] = ks
+                kv_rule_name = prop_kv_rule_names[k]
+                if k == "*":
+                    # The wildcard may repeat: one pair followed by any number of ", pair".
+                    res = self._add_rule(
+                        f'{name}{"-" if name else ""}additional-kvs',
+                        f'{kv_rule_name} ( "," space ' + kv_rule_name + " )*",
+                    )
+                elif first_is_optional:
+                    res = f'( "," space {kv_rule_name} )?'
+                else:
+                    res = kv_rule_name
+                if len(rest) > 0:
+                    res += " " + self._add_rule(
+                        f'{name}{"-" if name else ""}{k}-rest',
+                        get_recursive_refs(rest, first_is_optional=True),
+                    )
+                return res
+            # One alternative per possible first optional property, keeping sorted order.
+            rule += " | ".join(
+                get_recursive_refs(optional_props[i:], first_is_optional=False)
+                for i in range(len(optional_props))
+            )
+            if required_props:
+                rule += " )"
+            rule += " )?"
+        rule += ' "}" space'
+        return rule
+    def format_grammar(self):
+        return "\n".join(
+            f"{name} ::= {rule}"
+            for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
+        )
+def json_schema_to_gbnf(schema: str, prop_order: Optional[List[str]] = None):
+    prop_order = prop_order or []
+    schema = json.loads(schema)
+    prop_order = {name: idx for idx, name in enumerate(prop_order)}
+    converter = SchemaConverter(
+        prop_order=prop_order, allow_fetch=False, dotall=False, raw_pattern=False
+    )
+    schema = converter.resolve_refs(schema, "stdin")
+    converter.visit(schema, "")
+    return converter.format_grammar()

llama_cpp/llama_speculative.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import abc
+from typing import Any
+import numpy as np
+import numpy.typing as npt
+class LlamaDraftModel(abc.ABC):
+    """Abstract callable interface: takes an array of token ids and returns an array of token ids."""
+    @abc.abstractmethod
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
+    ) -> npt.NDArray[np.intc]:
+        """Must be overridden by subclasses; input_ids is positional-only."""
+        raise NotImplementedError()
+class LlamaPromptLookupDecoding(LlamaDraftModel):
+    """Based on https://github.com/apoorvumang/prompt-lookup-decoding"""
+    def __init__(self, max_ngram_size: int = 2, num_pred_tokens: int = 10):
+        self.max_ngram_size = max_ngram_size
+        self.num_pred_tokens = num_pred_tokens
+    @staticmethod
+    def find_candidate_pred_tokens(
+        input_ids: npt.NDArray[np.intc],
+        max_ngram_size: int,
+        num_pred_tokens: int,
+    ):
+        input_length = input_ids.shape[0]
+        for ngram_size in range(min(max_ngram_size, input_length - 1), 0, -1):
+            # Create sliding windows of size ngram_size
+            windows = np.lib.stride_tricks.sliding_window_view(input_ids, (ngram_size,))
+            # Convert ngram to an array for comparison
+            ngram_array = input_ids[-ngram_size:]
+            # Find where the windows match the ngram
+            matches = np.all(windows == ngram_array, axis=1)
+            # Get the indices of matches
+            match_indices = np.nonzero(matches)[0]
+            # Iterate through match indices to find a valid continuation
+            for idx in match_indices:
+                start_idx = idx + ngram_size
+                end_idx = start_idx + num_pred_tokens
+                end_idx = min(end_idx, input_length)
+                if start_idx < end_idx:
+                    return input_ids[start_idx:end_idx]
+        # If no match is found, return an empty array
+        return np.array([], dtype=np.intc)
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
+    ) -> npt.NDArray[np.intc]:
+        return self.find_candidate_pred_tokens(
+            input_ids=input_ids,
+            max_ngram_size=self.max_ngram_size,
+            num_pred_tokens=self.num_pred_tokens,
+        )

llama_cpp/llama_tokenizer.py ADDED Viewed

	@@ -0,0 +1,120 @@

+from __future__ import annotations
+import abc
+from typing import (
+    List,
+    Optional,
+    Any,
+)
+import llama_cpp
+from llama_cpp.llama_types import List
+class BaseLlamaTokenizer(abc.ABC):
+    """Abstract interface for tokenizers that convert between bytes and lists of token ids."""
+    @abc.abstractmethod
+    def tokenize(
+        self, text: bytes, add_bos: bool = True, special: bool = True
+    ) -> List[int]:
+        """Tokenize the text into tokens.
+        Args:
+            text: The utf-8 encoded string to tokenize.
+            add_bos: Whether to add a beginning of sequence token.
+            special: Whether to tokenize special tokens.
+        Returns:
+            The token ids as a list of ints.
+        """
+        raise NotImplementedError
+    @abc.abstractmethod
+    def detokenize(
+        self,
+        tokens: List[int],
+        prev_tokens: Optional[List[int]] = None,
+        special: bool = False,
+    ) -> bytes:
+        """Detokenize the tokens into text.
+        Args:
+            tokens: The list of tokens to detokenize.
+            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
+            special: Whether to detokenize special tokens.
+        Returns:
+            The detokenized text as bytes.
+        """
+        raise NotImplementedError
+class LlamaTokenizer(BaseLlamaTokenizer):
+    def __init__(self, llama: llama_cpp.Llama):
+        self._model = llama._model  # type: ignore
+    def tokenize(
+        self, text: bytes, add_bos: bool = True, special: bool = True
+    ) -> List[int]:
+        return self._model.tokenize(text, add_bos=add_bos, special=special)
+    def detokenize(
+        self,
+        tokens: List[int],
+        prev_tokens: Optional[List[int]] = None,
+        special: bool = False,
+    ) -> bytes:
+        return self._model.detokenize(tokens, special=special)
+    def encode(
+        self, text: str, add_bos: bool = True, special: bool = True
+    ) -> List[int]:
+        return self.tokenize(
+            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
+        )
+    def decode(self, tokens: List[int]) -> str:
+        return self.detokenize(tokens).decode("utf-8", errors="ignore")
+    @classmethod
+    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
+        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
+class LlamaHFTokenizer(BaseLlamaTokenizer):
+    def __init__(self, hf_tokenizer: Any):
+        self.hf_tokenizer = hf_tokenizer
+    def tokenize(
+        self, text: bytes, add_bos: bool = True, special: bool = True
+    ) -> List[int]:
+        return self.hf_tokenizer.encode(
+            text.decode("utf-8", errors="ignore"), add_special_tokens=special
+        )
+    def detokenize(
+        self,
+        tokens: List[int],
+        prev_tokens: Optional[List[int]] = None,
+        special: bool = False,
+    ) -> bytes:
+        skip_special_tokens = not special
+        if prev_tokens is not None:
+            text = self.hf_tokenizer.decode(
+                prev_tokens + tokens, skip_special_tokens=skip_special_tokens
+            ).encode("utf-8", errors="ignore")
+            prev_text = self.hf_tokenizer.decode(
+                prev_tokens, skip_special_tokens=skip_special_tokens
+            ).encode("utf-8", errors="ignore")
+            return text[len(prev_text) :]
+        else:
+            return self.hf_tokenizer.decode(
+                tokens, skip_special_tokens=skip_special_tokens
+            ).encode("utf-8", errors="ignore")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
+        try:
+            from transformers import AutoTokenizer
+        except ImportError:
+            raise ImportError(
+                "The `transformers` library is required to use the `HFTokenizer`."
+                "You can install it with `pip install transformers`."
+            )
+        hf_tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path=pretrained_model_name_or_path
+        )
+        return cls(hf_tokenizer)

llama_cpp/llama_types.py ADDED Viewed

	@@ -0,0 +1,316 @@

+"""Types and request signatures for OpenAI compatibility
+NOTE: These types may change to match the OpenAI OpenAPI specification.
+Based on the OpenAI OpenAPI specification:
+https://github.com/openai/openai-openapi/blob/master/openapi.yaml
+"""
+from typing import Any, List, Optional, Dict, Union
+from typing_extensions import TypedDict, NotRequired, Literal
+# NOTE: Defining this correctly using annotations seems to break pydantic validation.
+#       This is a workaround until we can figure out how to do this correctly.
+#       The recursive definition being avoided is kept below for reference:
+# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
+# Nested lists and dicts are therefore typed loosely as List[Any] / Dict[str, Any].
+JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]]
+class EmbeddingUsage(TypedDict):
+    """Token counts carried in ``CreateEmbeddingResponse.usage``."""
+    prompt_tokens: int
+    total_tokens: int
+class Embedding(TypedDict):
+    """One entry of ``CreateEmbeddingResponse.data``."""
+    index: int
+    object: str
+    # Either a flat list of floats or a list of such lists.
+    embedding: Union[List[float], List[List[float]]]
+class CreateEmbeddingResponse(TypedDict):
+    """Embedding response envelope; ``object`` is always the literal "list"."""
+    object: Literal["list"]
+    model: str
+    data: List[Embedding]
+    usage: EmbeddingUsage
+class CompletionLogprobs(TypedDict):
+    """Log-probability lists used as ``CompletionChoice.logprobs``."""
+    # NOTE(review): the four lists are presumably parallel (one entry per token) - confirm.
+    text_offset: List[int]
+    token_logprobs: List[Optional[float]]
+    tokens: List[str]
+    top_logprobs: List[Optional[Dict[str, float]]]
+class CompletionChoice(TypedDict):
+    """One entry of ``CreateCompletionResponse.choices``."""
+    text: str
+    index: int
+    logprobs: Optional[CompletionLogprobs]
+    # None, "stop" or "length".
+    finish_reason: Optional[Literal["stop", "length"]]
+class CompletionUsage(TypedDict):
+    """Token counts used as ``usage`` in ``CreateCompletionResponse`` and ``CreateChatCompletionResponse``."""
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+class CreateCompletionResponse(TypedDict):
+    """Text completion response envelope; ``object`` is always "text_completion"."""
+    id: str
+    object: Literal["text_completion"]
+    created: int
+    model: str
+    choices: List[CompletionChoice]
+    # The only key that may be absent.
+    usage: NotRequired[CompletionUsage]
+class ChatCompletionResponseFunctionCall(TypedDict):
+    """Name/arguments pair used as ``ChatCompletionResponseMessage.function_call``; ``arguments`` is a string."""
+    name: str
+    arguments: str
+class ChatCompletionResponseMessage(TypedDict):
+    """Message carried in ``ChatCompletionResponseChoice.message``; ``tool_calls`` and ``function_call`` may be absent."""
+    content: Optional[str]
+    tool_calls: NotRequired["ChatCompletionMessageToolCalls"]
+    role: Literal["assistant", "function"]  # NOTE: "function" may be incorrect here
+    function_call: NotRequired[ChatCompletionResponseFunctionCall]  # DEPRECATED
+class ChatCompletionFunction(TypedDict):
+    """Function description: a name, an optional description and a ``parameters`` mapping."""
+    # Declares the same fields as ChatCompletionFunctions further down in this module.
+    name: str
+    description: NotRequired[str]
+    parameters: Dict[str, JsonType]  # TODO: make this more specific
+class ChatCompletionTopLogprobToken(TypedDict):
+    """A token with its log-probability; element type of ``ChatCompletionLogprobToken.top_logprobs``."""
+    token: str
+    logprob: float
+    # May be None.
+    bytes: Optional[List[int]]
+class ChatCompletionLogprobToken(ChatCompletionTopLogprobToken):
+    """``ChatCompletionTopLogprobToken`` extended with a ``top_logprobs`` list."""
+    # The first three fields repeat the base class declarations with the same types.
+    token: str
+    logprob: float
+    bytes: Optional[List[int]]
+    top_logprobs: List[ChatCompletionTopLogprobToken]
+class ChatCompletionLogprobs(TypedDict):
+    """Log-probability payload with separate ``content`` and ``refusal`` token lists; either may be None."""
+    content: Optional[List[ChatCompletionLogprobToken]]
+    refusal: Optional[List[ChatCompletionLogprobToken]]
+class ChatCompletionResponseChoice(TypedDict):
+    """One entry of ``CreateChatCompletionResponse.choices``."""
+    index: int
+    message: "ChatCompletionResponseMessage"
+    logprobs: Optional[ChatCompletionLogprobs]
+    # Typed as a free-form optional string here, unlike the Literal used by the streaming choice.
+    finish_reason: Optional[str]
+class CreateChatCompletionResponse(TypedDict):
+    """Chat completion response envelope; ``object`` is always "chat.completion"."""
+    id: str
+    object: Literal["chat.completion"]
+    created: int
+    model: str
+    choices: List["ChatCompletionResponseChoice"]
+    usage: CompletionUsage
+class ChatCompletionMessageToolCallChunkFunction(TypedDict):
+    """``function`` payload of ``ChatCompletionMessageToolCallChunk``; ``name`` may be None."""
+    name: Optional[str]
+    arguments: str
+class ChatCompletionMessageToolCallChunk(TypedDict):
+    """Element of ``ChatCompletionStreamResponseDelta.tool_calls``; ``id`` may be absent."""
+    index: int
+    id: NotRequired[str]
+    type: Literal["function"]
+    function: ChatCompletionMessageToolCallChunkFunction
+class ChatCompletionStreamResponseDeltaEmpty(TypedDict):
+    """Delta with no fields; one of the two alternatives for ``ChatCompletionStreamResponseChoice.delta``."""
+    pass
+class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
+    """Name/arguments pair used as ``ChatCompletionStreamResponseDelta.function_call``."""
+    name: str
+    arguments: str
+class ChatCompletionStreamResponseDelta(TypedDict):
+    """Streaming delta; every key may be absent and, when present, may be None."""
+    content: NotRequired[Optional[str]]
+    function_call: NotRequired[
+        Optional[ChatCompletionStreamResponseDeltaFunctionCall]
+    ]  # DEPRECATED
+    tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
+    role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
+class ChatCompletionStreamResponseChoice(TypedDict):
+    """One entry of ``CreateChatCompletionStreamResponse.choices``."""
+    index: int
+    # Either a populated delta or the field-less empty variant.
+    delta: Union[
+        ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
+    ]
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
+    logprobs: NotRequired[Optional[ChatCompletionLogprobs]]
+class CreateChatCompletionStreamResponse(TypedDict):
+    """Streaming chat completion chunk; ``object`` is always "chat.completion.chunk"."""
+    id: str
+    model: str
+    object: Literal["chat.completion.chunk"]
+    created: int
+    choices: List[ChatCompletionStreamResponseChoice]
+class ChatCompletionFunctions(TypedDict):  # same three keys as ChatCompletionToolFunction
+    name: str
+    description: NotRequired[str]  # key may be absent
+    parameters: Dict[str, JsonType]  # TODO: make this more specific
+class ChatCompletionFunctionCallOption(TypedDict):  # same shape as ChatCompletionRequestFunctionCallOption
+    name: str
+class ChatCompletionRequestResponseFormat(TypedDict):  # requested response format
+    type: Literal["text", "json_object"]  # only these two values are allowed
+    schema: NotRequired[
+        JsonType
+    ]  # https://docs.endpoints.anyscale.com/guides/json_mode/
+class ChatCompletionRequestMessageContentPartText(TypedDict):  # text arm of ChatCompletionRequestMessageContentPart
+    type: Literal["text"]  # discriminator
+    text: str
+class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict):  # object form of an image part's `image_url`
+    url: str
+    detail: NotRequired[Literal["auto", "low", "high"]]  # key may be absent
+class ChatCompletionRequestMessageContentPartImage(TypedDict):  # image arm of ChatCompletionRequestMessageContentPart
+    type: Literal["image_url"]  # discriminator
+    image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl]  # either a bare string or the object form
+ChatCompletionRequestMessageContentPart = Union[  # element type of a user message's list-form `content`
+    ChatCompletionRequestMessageContentPartText,
+    ChatCompletionRequestMessageContentPartImage,
+]
+class ChatCompletionRequestSystemMessage(TypedDict):  # request message with role "system"
+    role: Literal["system"]
+    content: Optional[str]  # key required, value may be None
+class ChatCompletionRequestUserMessage(TypedDict):  # request message with role "user"
+    role: Literal["user"]
+    content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]]  # None, plain text, or a list of text/image parts
+class ChatCompletionMessageToolCallFunction(TypedDict):  # `function` payload of a complete (non-streamed) tool call
+    name: str
+    arguments: str
+class ChatCompletionMessageToolCall(TypedDict):  # complete tool call; all keys required (the chunk variant makes `id` optional)
+    id: str
+    type: Literal["function"]  # only "function" is allowed
+    function: ChatCompletionMessageToolCallFunction
+ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall]  # alias used by the assistant request message's `tool_calls`
+class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict):  # payload of the assistant message's deprecated `function_call` key
+    name: str
+    arguments: str
+class ChatCompletionRequestAssistantMessage(TypedDict):  # request message with role "assistant"; only `role` is required
+    role: Literal["assistant"]
+    content: NotRequired[str]  # key may be absent; when present it is typed str (not Optional)
+    tool_calls: NotRequired[ChatCompletionMessageToolCalls]
+    function_call: NotRequired[
+        ChatCompletionRequestAssistantMessageFunctionCall
+    ]  # DEPRECATED
+class ChatCompletionRequestToolMessage(TypedDict):  # request message with role "tool"
+    role: Literal["tool"]
+    content: Optional[str]  # key required, value may be None
+    tool_call_id: str  # presumably matches ChatCompletionMessageToolCall.id — verify against the caller
+class ChatCompletionRequestFunctionMessage(TypedDict):  # request message with role "function"
+    role: Literal["function"]
+    content: Optional[str]  # key required, value may be None
+    name: str
+ChatCompletionRequestMessage = Union[
+    ChatCompletionRequestSystemMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestAssistantMessage,
+    ChatCompletionRequestUserMessage,
+    ChatCompletionRequestToolMessage,
+    ChatCompletionRequestFunctionMessage,
+]
+class ChatCompletionRequestFunctionCallOption(TypedDict):  # object arm of ChatCompletionRequestFunctionCall
+    name: str
+ChatCompletionRequestFunctionCall = Union[  # either a mode string or an object naming one function
+    Literal["none", "auto"], ChatCompletionRequestFunctionCallOption
+]
+ChatCompletionFunctionParameters = Dict[str, JsonType]  # TODO: make this more specific
+class ChatCompletionToolFunction(TypedDict):  # `function` payload of a ChatCompletionTool
+    name: str
+    description: NotRequired[str]  # key may be absent
+    parameters: ChatCompletionFunctionParameters  # alias for Dict[str, JsonType], declared on the line above
+class ChatCompletionTool(TypedDict):  # a tool definition; both keys required
+    type: Literal["function"]  # only "function" is allowed
+    function: ChatCompletionToolFunction
+class ChatCompletionNamedToolChoiceFunction(TypedDict):  # `function` payload of ChatCompletionNamedToolChoice
+    name: str
+class ChatCompletionNamedToolChoice(TypedDict):  # object arm of ChatCompletionToolChoiceOption
+    type: Literal["function"]  # only "function" is allowed
+    function: ChatCompletionNamedToolChoiceFunction
+ChatCompletionToolChoiceOption = Union[  # a mode string or an object naming one tool; adds "required" compared with ChatCompletionRequestFunctionCall
+    Literal["none", "auto", "required"], ChatCompletionNamedToolChoice
+]
+# NOTE: The following type names are not part of the OpenAI OpenAPI specification
+# and will be removed in a future major release.
+EmbeddingData = Embedding
+CompletionChunk = CreateCompletionResponse  # same target as Completion and CreateCompletionStreamResponse below
+Completion = CreateCompletionResponse
+CreateCompletionStreamResponse = CreateCompletionResponse
+ChatCompletionMessage = ChatCompletionResponseMessage
+ChatCompletionChoice = ChatCompletionResponseChoice
+ChatCompletion = CreateChatCompletionResponse
+ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
+ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
+ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
+ChatCompletionChunk = CreateChatCompletionStreamResponse  # same target as ChatCompletionStreamResponse below
+ChatCompletionStreamResponse = CreateChatCompletionStreamResponse
+ChatCompletionResponseFunction = ChatCompletionFunction  # target is declared outside this section of the file
+ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall  # target is declared outside this section of the file

llama_cpp/llava_cpp.py ADDED Viewed

	@@ -0,0 +1,158 @@

+from __future__ import annotations
+import os
+from ctypes import (
+    c_bool,
+    c_char_p,
+    c_int,
+    c_uint8,
+    c_float,
+    c_void_p,
+    POINTER,
+    _Pointer,  # type: ignore
+    Structure,
+)
+import pathlib
+from typing import (
+    Union,
+    NewType,
+    Optional,
+    TYPE_CHECKING,
+)
+import llama_cpp.llama_cpp as llama_cpp
+from llama_cpp._ctypes_extensions import (
+    load_shared_library,
+    ctypes_function_for_shared_library,
+)
+if TYPE_CHECKING:
+    from llama_cpp._ctypes_extensions import (
+        CtypesArray,
+    )
+# Specify the base name of the shared library to load
+_libllava_base_name = "llava"
+_libllava_override_path = os.environ.get("LLAVA_CPP_LIB")  # None when the variable is unset
+_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path()  # NOTE(review): when LLAVA_CPP_LIB is set its value is not used here; the base path becomes the empty relative Path() — confirm this is intended
+# Load the library (happens at import time)
+_libllava = load_shared_library(_libllava_base_name, _libllava_base_path)
+ctypes_function = ctypes_function_for_shared_library(_libllava)  # decorator factory used by every binding below
+################################################
+# llava.h
+################################################
+# struct clip_ctx;
+clip_ctx_p = NewType("clip_ctx_p", int)  # static-typing handle only; at runtime a NewType is just its base (int)
+clip_ctx_p_ctypes = c_void_p  # ctypes type passed to the decorator for clip_ctx pointers
+# struct llava_image_embed {
+#     float * embed;
+#     int n_image_pos;
+# };
+class llava_image_embed(Structure):  # ctypes mirror of the C struct quoted above; field order follows the C declaration
+    _fields_ = [
+        ("embed", POINTER(c_float)),  # float *
+        ("n_image_pos", c_int),  # int
+    ]
+# /** sanity check for clip <-> llava embed size match */
+# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+@ctypes_function(
+    "llava_validate_embed_size",
+    [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],  # argument types, in the order of the C prototype above
+    c_bool,  # return type of the C prototype
+)
+def llava_validate_embed_size(
+    ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /  # the trailing "/" makes both parameters positional-only
+) -> bool:
+    ...  # placeholder body; presumably the decorator routes calls to the native symbol — see _ctypes_extensions
+# /** build an image embed from image file bytes */
+# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+@ctypes_function(
+    "llava_image_embed_make_with_bytes",
+    [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],  # mirrors the four C parameters above
+    POINTER(llava_image_embed),  # pointer to the struct declared in this module
+)
+def llava_image_embed_make_with_bytes(
+    ctx_clip: clip_ctx_p,
+    n_threads: Union[c_int, int],
+    image_bytes: CtypesArray[c_uint8],  # CtypesArray is imported only under TYPE_CHECKING; safe because annotations are deferred by the __future__ import
+    image_bytes_length: Union[c_int, int],
+    /,
+) -> "_Pointer[llava_image_embed]":
+    ...  # placeholder body; per the comment on llava_image_embed_free, the result is released with that function
+# /** build an image embed from a path to an image filename */
+# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+@ctypes_function(
+    "llava_image_embed_make_with_filename",
+    [clip_ctx_p_ctypes, c_int, c_char_p],  # mirrors the three C parameters above
+    POINTER(llava_image_embed),
+)
+def llava_image_embed_make_with_filename(
+    ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /  # image_path is bytes (c_char_p), not str
+) -> "_Pointer[llava_image_embed]":
+    ...  # placeholder body; per the comment on llava_image_embed_free, the result is released with that function
+# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
+# /** free an embedding made with llava_image_embed_make_* */
+@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)  # None return type mirrors the C `void`
+def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):  # positional-only; no return annotation
+    ...  # placeholder body
+# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
+# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+@ctypes_function(
+    "llava_eval_image_embed",
+    [
+        llama_cpp.llama_context_p_ctypes,
+        POINTER(llava_image_embed),
+        c_int,
+        POINTER(c_int),  # n_past is an in/out pointer according to the C comment above
+    ],
+    c_bool,
+)
+def llava_eval_image_embed(
+    ctx_llama: llama_cpp.llama_context_p,
+    embed: "_Pointer[llava_image_embed]",
+    n_batch: Union[c_int, int],
+    n_past: "_Pointer[c_int]",  # must be a pointer to c_int, not a plain int
+    /,
+) -> bool:
+    ...  # placeholder body
+################################################
+# clip.h
+################################################
+# /** load mmproj model */
+# CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
+@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)  # returns an opaque c_void_p handle
+def clip_model_load(
+    fname: bytes, verbosity: Union[c_int, int], /  # fname is bytes (c_char_p), not str
+) -> Optional[clip_ctx_p]:  # annotated Optional: presumably None when the native call returns NULL — confirm
+    ...  # placeholder body
+# /** free mmproj model */
+# CLIP_API void clip_free(struct clip_ctx * ctx);
+@ctypes_function("clip_free", [clip_ctx_p_ctypes], None)  # None return type mirrors the C `void`
+def clip_free(ctx: clip_ctx_p, /):  # positional-only handle
+    ...  # placeholder body

llama_cpp/mtmd_cpp.py ADDED Viewed

	@@ -0,0 +1,280 @@

+from __future__ import annotations
+import os
+from ctypes import (
+    c_bool,
+    c_char_p,
+    c_int,
+    c_uint8,
+    c_uint32,
+    c_float,
+    c_void_p,
+    c_size_t,
+    POINTER,
+    _Pointer,  # type: ignore
+    Structure,
+    byref,
+)
+import pathlib
+from typing import (
+    Union,
+    NewType,
+    Optional,
+    TYPE_CHECKING,
+)
+import llama_cpp.llama_cpp as llama_cpp
+from llama_cpp._ctypes_extensions import (
+    load_shared_library,
+    ctypes_function_for_shared_library,
+)
+if TYPE_CHECKING:
+    from llama_cpp._ctypes_extensions import (
+        CtypesArray,
+    )
+# Specify the base name of the shared library to load
+_libmtmd_base_name = "mtmd"
+_libmtmd_override_path = os.environ.get("MTMD_CPP_LIB")  # None when the variable is unset
+_libmtmd_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libmtmd_override_path is None else pathlib.Path()  # NOTE(review): when MTMD_CPP_LIB is set its value is not used here; the base path becomes the empty relative Path() — confirm this is intended
+# Load the library (happens at import time)
+_libmtmd = load_shared_library(_libmtmd_base_name, _libmtmd_base_path)
+ctypes_function = ctypes_function_for_shared_library(_libmtmd)  # decorator factory used by every binding below
+################################################
+# mtmd.h types
+################################################
+# Opaque types: each handle is a NewType over int for static typing, paired with c_void_p as its ctypes type
+mtmd_context_p = NewType("mtmd_context_p", int)
+mtmd_context_p_ctypes = c_void_p
+mtmd_bitmap_p = NewType("mtmd_bitmap_p", int)
+mtmd_bitmap_p_ctypes = c_void_p
+mtmd_image_tokens_p = NewType("mtmd_image_tokens_p", int)  # not referenced by any binding in this module's visible code
+mtmd_image_tokens_p_ctypes = c_void_p
+mtmd_input_chunk_p = NewType("mtmd_input_chunk_p", int)
+mtmd_input_chunk_p_ctypes = c_void_p
+mtmd_input_chunks_p = NewType("mtmd_input_chunks_p", int)
+mtmd_input_chunks_p_ctypes = c_void_p
+# Enums (plain ints; presumably mirror `enum mtmd_input_chunk_type` returned by mtmd_input_chunk_get_type — confirm values against mtmd.h)
+MTMD_INPUT_CHUNK_TYPE_TEXT = 0
+MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
+MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
+# Structures
+class mtmd_context_params(Structure):  # passed by value to mtmd_init_from_file; NOTE(review): field order must match the C struct — confirm against the mtmd.h of the bundled library
+    _fields_ = [
+        ("use_gpu", c_bool),
+        ("print_timings", c_bool),
+        ("n_threads", c_int),
+        ("verbosity", c_int),  # ggml_log_level
+        ("image_marker", c_char_p),
+        ("media_marker", c_char_p),
+    ]
+class mtmd_input_text(Structure):  # passed by pointer as the `text` argument of mtmd_tokenize
+    _fields_ = [
+        ("text", c_char_p),  # bytes, not str
+        ("add_special", c_bool),
+        ("parse_special", c_bool),
+    ]
+################################################
+# mtmd.h functions
+################################################
+# MTMD_API const char * mtmd_default_marker(void);
+@ctypes_function("mtmd_default_marker", [], c_char_p)  # no arguments; c_char_p result
+def mtmd_default_marker() -> bytes:
+    ...  # placeholder body; presumably the decorator routes calls to the native symbol — see _ctypes_extensions
+# MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
+@ctypes_function("mtmd_context_params_default", [], mtmd_context_params)  # the struct is returned by value, not by pointer
+def mtmd_context_params_default() -> mtmd_context_params:
+    ...  # placeholder body
+# MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+#                                             const struct llama_model * text_model,
+#                                             const struct mtmd_context_params ctx_params);
+@ctypes_function(
+    "mtmd_init_from_file",
+    [c_char_p, llama_cpp.llama_model_p_ctypes, mtmd_context_params],  # ctx_params is passed by value (struct type, not POINTER)
+    mtmd_context_p_ctypes
+)
+def mtmd_init_from_file(
+    mmproj_fname: bytes,  # bytes (c_char_p), not str
+    text_model: llama_cpp.llama_model_p,
+    ctx_params: mtmd_context_params,
+    /,
+) -> Optional[mtmd_context_p]:  # annotated Optional: presumably None when the native call returns NULL — confirm
+    ...  # placeholder body
+# MTMD_API void mtmd_free(mtmd_context * ctx);
+@ctypes_function("mtmd_free", [mtmd_context_p_ctypes], None)  # None return type mirrors the C `void`
+def mtmd_free(ctx: mtmd_context_p, /):  # positional-only handle
+    ...  # placeholder body
+# MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+@ctypes_function("mtmd_support_vision", [mtmd_context_p_ctypes], c_bool)
+def mtmd_support_vision(ctx: mtmd_context_p, /) -> bool:  # positional-only handle
+    ...  # placeholder body
+# MTMD_API mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, uint32_t ny, const unsigned char * data);
+@ctypes_function(
+    "mtmd_bitmap_init",
+    [c_uint32, c_uint32, POINTER(c_uint8)],  # mirrors the three C parameters above
+    mtmd_bitmap_p_ctypes
+)
+def mtmd_bitmap_init(
+    nx: Union[c_uint32, int],
+    ny: Union[c_uint32, int],
+    data: CtypesArray[c_uint8],  # NOTE(review): expected buffer length for nx*ny is not visible here — check mtmd.h
+    /,
+) -> Optional[mtmd_bitmap_p]:  # annotated Optional: presumably None on NULL — confirm
+    ...  # placeholder body
+# MTMD_API void mtmd_bitmap_free(mtmd_bitmap * bitmap);
+@ctypes_function("mtmd_bitmap_free", [mtmd_bitmap_p_ctypes], None)  # None return type mirrors the C `void`
+def mtmd_bitmap_free(bitmap: mtmd_bitmap_p, /):  # positional-only handle
+    ...  # placeholder body
+# MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
+@ctypes_function("mtmd_input_chunks_init", [], mtmd_input_chunks_p_ctypes)  # no arguments
+def mtmd_input_chunks_init() -> Optional[mtmd_input_chunks_p]:  # annotated Optional: presumably None on NULL — confirm
+    ...  # placeholder body
+# MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+@ctypes_function("mtmd_input_chunks_free", [mtmd_input_chunks_p_ctypes], None)  # None return type mirrors the C `void`
+def mtmd_input_chunks_free(chunks: mtmd_input_chunks_p, /):  # positional-only handle
+    ...  # placeholder body
+# MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
+@ctypes_function("mtmd_input_chunks_size", [mtmd_input_chunks_p_ctypes], c_size_t)  # size_t result surfaces as a Python int
+def mtmd_input_chunks_size(chunks: mtmd_input_chunks_p, /) -> int:
+    ...  # placeholder body
+# MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get(const mtmd_input_chunks * chunks, size_t idx);
+@ctypes_function(
+    "mtmd_input_chunks_get",
+    [mtmd_input_chunks_p_ctypes, c_size_t],  # mirrors the two C parameters above
+    mtmd_input_chunk_p_ctypes
+)
+def mtmd_input_chunks_get(
+    chunks: mtmd_input_chunks_p, idx: Union[c_size_t, int], /  # both positional-only
+) -> Optional[mtmd_input_chunk_p]:  # the C prototype returns a const pointer; presumably None on NULL — confirm
+    ...  # placeholder body
+# MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+#                                mtmd_input_chunks * output,
+#                                const mtmd_input_text * text,
+#                                const mtmd_bitmap ** bitmaps,
+#                                size_t n_bitmaps);
+@ctypes_function(
+    "mtmd_tokenize",
+    [
+        mtmd_context_p_ctypes,
+        mtmd_input_chunks_p_ctypes,
+        POINTER(mtmd_input_text),
+        POINTER(mtmd_bitmap_p_ctypes),  # pointer to c_void_p, matching `const mtmd_bitmap **`
+        c_size_t,
+    ],
+    c_int,  # the C prototype returns int32_t
+)
+def mtmd_tokenize(
+    ctx: mtmd_context_p,
+    output: mtmd_input_chunks_p,  # non-const in the C prototype; presumably filled by the call — confirm
+    text: "_Pointer[mtmd_input_text]",
+    bitmaps: CtypesArray[mtmd_bitmap_p_ctypes],
+    n_bitmaps: Union[c_size_t, int],
+    /,
+) -> int:  # NOTE(review): meaning of the status code is not visible here — check mtmd.h
+    ...  # placeholder body
+# MTMD_API size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk);
+@ctypes_function("mtmd_input_chunk_get_n_tokens", [mtmd_input_chunk_p_ctypes], c_size_t)  # size_t result surfaces as a Python int
+def mtmd_input_chunk_get_n_tokens(chunk: mtmd_input_chunk_p, /) -> int:
+    ...  # placeholder body
+# MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type(const mtmd_input_chunk * chunk);
+@ctypes_function("mtmd_input_chunk_get_type", [mtmd_input_chunk_p_ctypes], c_int)  # the C enum is declared as c_int
+def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p, /) -> int:  # presumably one of the MTMD_INPUT_CHUNK_TYPE_* constants — confirm
+    ...  # placeholder body
+# MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text(const mtmd_input_chunk * chunk, size_t * n_tokens_output);
+@ctypes_function(
+    "mtmd_input_chunk_get_tokens_text",
+    [mtmd_input_chunk_p_ctypes, POINTER(c_size_t)],  # second argument is an output pointer per its C name
+    POINTER(llama_cpp.llama_token)
+)
+def mtmd_input_chunk_get_tokens_text(
+    chunk: mtmd_input_chunk_p, n_tokens_output: "_Pointer[c_size_t]", /  # both positional-only
+) -> Optional["_Pointer[llama_cpp.llama_token]"]:  # const pointer in C; presumably None on NULL — confirm
+    ...  # placeholder body
+################################################
+# mtmd-helper.h functions
+################################################
+# MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+@ctypes_function(
+    "mtmd_helper_bitmap_init_from_buf",
+    [mtmd_context_p_ctypes, POINTER(c_uint8), c_size_t],  # mirrors the three C parameters above
+    mtmd_bitmap_p_ctypes
+)
+def mtmd_helper_bitmap_init_from_buf(
+    ctx: mtmd_context_p,
+    buf: CtypesArray[c_uint8],
+    length: Union[c_size_t, int],  # the C parameter is named `len`; renamed here, which is harmless because arguments are positional-only
+    /,
+) -> Optional[mtmd_bitmap_p]:  # annotated Optional: presumably None on NULL — confirm
+    ...  # placeholder body
+# MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+@ctypes_function("mtmd_helper_get_n_tokens", [mtmd_input_chunks_p_ctypes], c_size_t)  # takes the whole chunk list, not a single chunk
+def mtmd_helper_get_n_tokens(chunks: mtmd_input_chunks_p, /) -> int:
+    ...  # placeholder body
+# MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+#                                                struct llama_context * lctx,
+#                                                const mtmd_input_chunk * chunk,
+#                                                llama_pos n_past,
+#                                                llama_seq_id seq_id,
+#                                                int32_t n_batch,
+#                                                bool logits_last,
+#                                                llama_pos * new_n_past);
+@ctypes_function(
+    "mtmd_helper_eval_chunk_single",
+    [
+        mtmd_context_p_ctypes,
+        llama_cpp.llama_context_p_ctypes,
+        mtmd_input_chunk_p_ctypes,
+        llama_cpp.llama_pos,
+        llama_cpp.llama_seq_id,
+        c_int,
+        c_bool,
+        POINTER(llama_cpp.llama_pos),  # output pointer per its C name `new_n_past`
+    ],
+    c_int,  # the C prototype returns int32_t
+)
+def mtmd_helper_eval_chunk_single(
+    ctx: mtmd_context_p,
+    lctx: llama_cpp.llama_context_p,
+    chunk: mtmd_input_chunk_p,
+    n_past: llama_cpp.llama_pos,
+    seq_id: llama_cpp.llama_seq_id,
+    n_batch: Union[c_int, int],
+    logits_last: Union[c_bool, bool],
+    new_n_past: "_Pointer[llama_cpp.llama_pos]",  # must be a pointer, not a plain int
+    /,
+) -> int:  # NOTE(review): meaning of the status code is not visible here — check mtmd-helper.h
+    ...  # placeholder body

llama_cpp/py.typed ADDED Viewed

File without changes

llama_cpp/server/__init__.py ADDED Viewed

File without changes

llama_cpp/server/__main__.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""Example FastAPI server for llama.cpp.
+To run this example:
+```bash
+pip install fastapi uvicorn sse-starlette pydantic-settings
+export MODEL=../models/7B/...
+```
+Then run:
+```
+uvicorn llama_cpp.server.app:create_app --reload
+```
+or
+```
+python3 -m llama_cpp.server
+```
+Then visit http://localhost:8000/docs to see the interactive API docs.
+"""
+from __future__ import annotations
+import os
+import sys
+import argparse
+import uvicorn
+from llama_cpp.server.app import create_app
+from llama_cpp.server.settings import (
+    Settings,
+    ServerSettings,
+    ModelSettings,
+    ConfigFileSettings,
+)
+from llama_cpp.server.cli import add_args_from_model, parse_model_from_args
+def main():
+    description = "🦙 Llama.cpp python server. Host your own LLMs!🚀"
+    parser = argparse.ArgumentParser(description=description)
+    add_args_from_model(parser, Settings)
+    parser.add_argument(
+        "--config_file",
+        type=str,
+        help="Path to a config file to load.",
+    )
+    server_settings: ServerSettings | None = None
+    model_settings: list[ModelSettings] = []
+    args = parser.parse_args()
+    try:
+        # Load server settings from config_file if provided
+        config_file = os.environ.get("CONFIG_FILE", args.config_file)
+        if config_file:
+            if not os.path.exists(config_file):
+                raise ValueError(f"Config file {config_file} not found!")
+            with open(config_file, "rb") as f:
+                # Check if yaml file
+                if config_file.endswith(".yaml") or config_file.endswith(".yml"):
+                    import yaml
+                    import json
+                    config_file_settings = ConfigFileSettings.model_validate_json(
+                        json.dumps(yaml.safe_load(f))
+                    )
+                else:
+                    config_file_settings = ConfigFileSettings.model_validate_json(
+                        f.read()
+                    )
+                server_settings = ServerSettings.model_validate(config_file_settings)
+                model_settings = config_file_settings.models
+        else:
+            server_settings = parse_model_from_args(ServerSettings, args)
+            model_settings = [parse_model_from_args(ModelSettings, args)]
+    except Exception as e:
+        print(e, file=sys.stderr)
+        parser.print_help()
+        sys.exit(1)
+    assert server_settings is not None
+    assert model_settings is not None
+    app = create_app(
+        server_settings=server_settings,
+        model_settings=model_settings,
+    )
+    uvicorn.run(
+        app,
+        host=os.getenv("HOST", server_settings.host),
+        port=int(os.getenv("PORT", server_settings.port)),
+        ssl_keyfile=server_settings.ssl_keyfile,
+        ssl_certfile=server_settings.ssl_certfile,
+    )
+if __name__ == "__main__":  # entry point for `python3 -m llama_cpp.server`, as described in the module docstring
+    main()

llama_cpp/server/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (177 Bytes). View file

llama_cpp/server/__pycache__/__main__.cpython-311.pyc ADDED Viewed

Binary file (4.12 kB). View file

llama_cpp/server/__pycache__/app.cpython-311.pyc ADDED Viewed

Binary file (23.4 kB). View file

llama_cpp/server/__pycache__/cli.cpython-311.pyc ADDED Viewed

Binary file (5.44 kB). View file

llama_cpp/server/__pycache__/errors.cpython-311.pyc ADDED Viewed

Binary file (8.14 kB). View file

llama_cpp/server/__pycache__/model.cpython-311.pyc ADDED Viewed

Binary file (12.7 kB). View file

llama_cpp/server/__pycache__/settings.cpython-311.pyc ADDED Viewed

Binary file (11.5 kB). View file

llama_cpp/server/__pycache__/types.cpython-311.pyc ADDED Viewed

Binary file (15.8 kB). View file

llama_cpp/server/app.py ADDED Viewed

	@@ -0,0 +1,597 @@

+from __future__ import annotations
+import os
+import json
+import typing
+import contextlib
+from anyio import Lock
+from functools import partial
+from typing import List, Optional, Union, Dict
+import llama_cpp
+import anyio
+from anyio.streams.memory import MemoryObjectSendStream
+from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
+from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
+from fastapi.middleware import Middleware
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.security import HTTPBearer
+from sse_starlette.sse import EventSourceResponse
+from starlette_context.plugins import RequestIdPlugin  # type: ignore
+from starlette_context.middleware import RawContextMiddleware
+from llama_cpp.server.model import (
+    LlamaProxy,
+)
+from llama_cpp.server.settings import (
+    ConfigFileSettings,
+    Settings,
+    ModelSettings,
+    ServerSettings,
+)
+from llama_cpp.server.types import (
+    CreateCompletionRequest,
+    CreateEmbeddingRequest,
+    CreateChatCompletionRequest,
+    ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
+)
+from llama_cpp.server.errors import RouteErrorHandler
+router = APIRouter(route_class=RouteErrorHandler)  # every route registered below uses RouteErrorHandler; create_app attaches this router to the app
+_server_settings: Optional[ServerSettings] = None  # module-level state; stays None until set_server_settings is called
+def set_server_settings(server_settings: ServerSettings):  # called from create_app
+    global _server_settings
+    _server_settings = server_settings  # replaces any previously stored settings
+def get_server_settings():  # generator used both as a FastAPI dependency and directly via next(...) in get_event_publisher
+    yield _server_settings  # yields the current module-level value exactly once; may be None if never set
+_llama_proxy: Optional[LlamaProxy] = None  # module-level state; stays None until set_llama_proxy is called
+llama_outer_lock = Lock()  # anyio locks created at import time; both are used by get_llama_proxy
+llama_inner_lock = Lock()
+def set_llama_proxy(model_settings: List[ModelSettings]):  # called from create_app
+    global _llama_proxy
+    _llama_proxy = LlamaProxy(models=model_settings)  # builds a new proxy and replaces any previous one
+async def get_llama_proxy():  # async generator: yields the module-level proxy while holding the inner lock
+    # NOTE: This double lock allows the currently streaming llama model to
+    # check if any other requests are pending in the same thread and cancel
+    # the stream if so.
+    await llama_outer_lock.acquire()  # a waiting request holds the outer lock, which the streamer can observe via .locked()
+    release_outer_lock = True  # tracks whether the outer lock still has to be released on the way out
+    try:
+        await llama_inner_lock.acquire()  # waits here until the request currently using the proxy finishes
+        try:
+            llama_outer_lock.release()  # inner lock obtained: let the next request queue on the outer lock
+            release_outer_lock = False
+            yield _llama_proxy  # may be None if set_llama_proxy was never called
+        finally:
+            llama_inner_lock.release()  # always released when the consumer is done or fails
+    finally:
+        if release_outer_lock:  # only true if acquiring the inner lock raised (e.g. cancellation)
+            llama_outer_lock.release()
+_ping_message_factory: typing.Optional[typing.Callable[[], bytes]] = None  # passed to EventSourceResponse as ping_message_factory; None unless overridden
+def set_ping_message_factory(factory: typing.Callable[[], bytes]):  # called from create_app when disable_ping_events is set
+    global _ping_message_factory
+    _ping_message_factory = factory
+def create_app(
+    settings: Settings | None = None,
+    server_settings: ServerSettings | None = None,
+    model_settings: List[ModelSettings] | None = None,
+):
+    config_file = os.environ.get("CONFIG_FILE", None)
+    if config_file is not None:
+        if not os.path.exists(config_file):
+            raise ValueError(f"Config file {config_file} not found!")
+        with open(config_file, "rb") as f:
+            # Check if yaml file
+            if config_file.endswith(".yaml") or config_file.endswith(".yml"):
+                import yaml
+                config_file_settings = ConfigFileSettings.model_validate_json(
+                    json.dumps(yaml.safe_load(f))
+                )
+            else:
+                config_file_settings = ConfigFileSettings.model_validate_json(f.read())
+            server_settings = ServerSettings.model_validate(config_file_settings)
+            model_settings = config_file_settings.models
+    if server_settings is None and model_settings is None:
+        if settings is None:
+            settings = Settings()
+        server_settings = ServerSettings.model_validate(settings)
+        model_settings = [ModelSettings.model_validate(settings)]
+    assert (
+        server_settings is not None and model_settings is not None
+    ), "server_settings and model_settings must be provided together"
+    set_server_settings(server_settings)
+    middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))]
+    app = FastAPI(
+        middleware=middleware,
+        title="🦙 llama.cpp Python API",
+        version=llama_cpp.__version__,
+        root_path=server_settings.root_path,
+    )
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+    app.include_router(router)
+    assert model_settings is not None
+    set_llama_proxy(model_settings=model_settings)
+    if server_settings.disable_ping_events:
+        set_ping_message_factory(lambda: bytes())
+    return app
+def prepare_request_resources(
+    body: CreateCompletionRequest | CreateChatCompletionRequest,
+    llama_proxy: LlamaProxy,
+    body_model: str | None,
+    kwargs,
+) -> llama_cpp.Llama:
+    if llama_proxy is None:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Service is not available",
+        )
+    llama = llama_proxy(body_model)
+    if body.logit_bias is not None:
+        kwargs["logit_bias"] = (
+            _logit_bias_tokens_to_input_ids(llama, body.logit_bias)
+            if body.logit_bias_type == "tokens"
+            else body.logit_bias
+        )
+    if body.grammar is not None:
+        kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
+    if body.min_tokens > 0:
+        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
+            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
+        )
+        if "logits_processor" not in kwargs:
+            kwargs["logits_processor"] = _min_tokens_logits_processor
+        else:
+            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+    return llama
+async def get_event_publisher(  # streams model output chunks into inner_send_chan; used as EventSourceResponse's data_sender_callable
+    request: Request,
+    inner_send_chan: MemoryObjectSendStream[typing.Any],
+    body: CreateCompletionRequest | CreateChatCompletionRequest,
+    body_model: str | None,
+    llama_call,  # callable invoked as llama_call(llama, **kwargs); its result is iterated for chunks
+    kwargs,
+):
+    server_settings = next(get_server_settings())  # get_server_settings is a generator; next() pulls its single value
+    interrupt_requests = (
+        server_settings.interrupt_requests if server_settings else False
+    )  # defaults to False when no server settings were stored
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:  # wraps the async generator so its locks are released on exit
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+        async with inner_send_chan:  # the send channel is closed when this block exits
+            try:
+                iterator = await run_in_threadpool(llama_call, llama, **kwargs)  # the call itself runs in a worker thread
+                async for chunk in iterate_in_threadpool(iterator):  # each next() on the iterator also runs in a worker thread
+                    await inner_send_chan.send(dict(data=json.dumps(chunk)))
+                    if await request.is_disconnected():  # client went away: stop by raising the backend's cancellation exception
+                        raise anyio.get_cancelled_exc_class()()
+                    if interrupt_requests and llama_outer_lock.locked():  # outer lock held means another request is waiting in get_llama_proxy
+                        await inner_send_chan.send(dict(data="[DONE]"))  # finish the stream cleanly before yielding the model
+                        raise anyio.get_cancelled_exc_class()()
+                await inner_send_chan.send(dict(data="[DONE]"))  # normal end-of-stream marker
+            except anyio.get_cancelled_exc_class() as e:  # covers real cancellation and the two raises above
+                print("disconnected")
+                with anyio.move_on_after(1, shield=True):  # shielded scope with a 1-second deadline around the log line
+                    print(
+                        f"Disconnected from client (via refresh/close) {request.client}"
+                    )
+                    raise e  # the cancellation is always re-raised
+def _logit_bias_tokens_to_input_ids(
+    llama: llama_cpp.Llama,
+    logit_bias: Dict[str, float],
+) -> Dict[str, float]:
+    to_bias: Dict[str, float] = {}
+    for token, score in logit_bias.items():
+        token = token.encode("utf-8")
+        for input_id in llama.tokenize(token, add_bos=False, special=True):
+            to_bias[str(input_id)] = score
+    return to_bias
+# Setup Bearer authentication scheme (consumed by `authenticate` below, which treats a falsy result as missing credentials)
+bearer_scheme = HTTPBearer(auto_error=False)  # presumably auto_error=False yields None instead of raising when the header is absent — confirm in FastAPI docs
+async def authenticate(
+    settings: Settings = Depends(get_server_settings),
+    authorization: Optional[str] = Depends(bearer_scheme),
+):
+    # Skip API key check if it's not set in settings
+    if settings.api_key is None:
+        return True
+    # check bearer credentials against the api_key
+    if authorization and authorization.credentials == settings.api_key:
+        # api key is valid
+        return authorization.credentials
+    # raise http error 401
+    raise HTTPException(
+        status_code=status.HTTP_401_UNAUTHORIZED,
+        detail="Invalid API key",
+    )
+openai_v1_tag = "OpenAI V1"  # value passed as `tags=[...]` to the route decorators below
+@router.post(
+    "/v1/completions",
+    summary="Completion",
+    dependencies=[Depends(authenticate)],
+    response_model=Union[
+        llama_cpp.CreateCompletionResponse,
+        str,
+    ],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {"$ref": "#/components/schemas/CreateCompletionResponse"}
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream": {
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True. "
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""",
+                    }
+                },
+            },
+        }
+    },
+    tags=[openai_v1_tag],
+)
+@router.post(
+    "/v1/engines/copilot-codex/completions",
+    include_in_schema=False,
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
+async def create_completion(
+    request: Request,
+    body: CreateCompletionRequest,
+) -> llama_cpp.Completion:
+    if isinstance(body.prompt, list):
+        assert len(body.prompt) <= 1
+        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
+    body_model = (
+        body.model
+        if request.url.path != "/v1/engines/copilot-codex/completions"
+        else "copilot-codex"
+    )
+    exclude = {
+        "n",
+        "best_of",
+        "logit_bias_type",
+        "user",
+        "min_tokens",
+    }
+    kwargs = body.model_dump(exclude=exclude)
+    # handle streaming request
+    if kwargs.get("stream", False):
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        return EventSourceResponse(
+            recv_chan,
+            data_sender_callable=partial(  # type: ignore
+                get_event_publisher,
+                request=request,
+                inner_send_chan=send_chan,
+                body=body,
+                body_model=body_model,
+                llama_call=llama_cpp.Llama.__call__,
+                kwargs=kwargs,
+            ),
+            sep="\n",
+            ping_message_factory=_ping_message_factory,
+        )
+    # handle regular request
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+        if await request.is_disconnected():
+            print(
+                f"Disconnected from client (via refresh/close) before llm invoked {request.client}"
+            )
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Client closed request",
+            )
+        return await run_in_threadpool(llama, **kwargs)
+@router.post(
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
+async def create_embedding(
+    request: CreateEmbeddingRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+):
+    return await run_in_threadpool(
+        llama_proxy(request.model).create_embedding,
+        **request.model_dump(exclude={"user"}),
+    )
+@router.post(
+    "/v1/chat/completions",
+    summary="Chat",
+    dependencies=[Depends(authenticate)],
+    response_model=Union[llama_cpp.ChatCompletion, str],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {
+                                "$ref": "#/components/schemas/CreateChatCompletionResponse"
+                            }
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream": {
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True"
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""",
+                    }
+                },
+            },
+        }
+    },
+    tags=[openai_v1_tag],
+)
+async def create_chat_completion(
+    request: Request,
+    body: CreateChatCompletionRequest = Body(
+        openapi_examples={
+            "normal": {
+                "summary": "Chat Completion",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "What is the capital of France?"},
+                    ],
+                },
+            },
+            "json_mode": {
+                "summary": "JSON Mode",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "Who won the world series in 2020"},
+                    ],
+                    "response_format": {"type": "json_object"},
+                },
+            },
+            "tool_calling": {
+                "summary": "Tool Calling",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "Extract Jason is 30 years old."},
+                    ],
+                    "tools": [
+                        {
+                            "type": "function",
+                            "function": {
+                                "name": "User",
+                                "description": "User record",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "name": {"type": "string"},
+                                        "age": {"type": "number"},
+                                    },
+                                    "required": ["name", "age"],
+                                },
+                            },
+                        }
+                    ],
+                    "tool_choice": {
+                        "type": "function",
+                        "function": {
+                            "name": "User",
+                        },
+                    },
+                },
+            },
+            "logprobs": {
+                "summary": "Logprobs",
+                "value": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "system", "content": "You are a helpful assistant."},
+                        {"role": "user", "content": "What is the capital of France?"},
+                    ],
+                    "logprobs": True,
+                    "top_logprobs": 10,
+                },
+            },
+        }
+    ),
+) -> llama_cpp.ChatCompletion:
+    # This is a workaround for an issue in FastAPI dependencies
+    # where the dependency is cleaned up before a StreamingResponse
+    # is complete.
+    # https://github.com/tiangolo/fastapi/issues/11143
+    body_model = body.model
+    exclude = {
+        "n",
+        "logit_bias_type",
+        "user",
+        "min_tokens",
+    }
+    kwargs = body.model_dump(exclude=exclude)
+    # handle streaming request
+    if kwargs.get("stream", False):
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        return EventSourceResponse(
+            recv_chan,
+            data_sender_callable=partial(  # type: ignore
+                get_event_publisher,
+                request=request,
+                inner_send_chan=send_chan,
+                body=body,
+                body_model=body_model,
+                llama_call=llama_cpp.Llama.create_chat_completion,
+                kwargs=kwargs,
+            ),
+            sep="\n",
+            ping_message_factory=_ping_message_factory,
+        )
+    # handle regular request
+    async with contextlib.asynccontextmanager(get_llama_proxy)() as llama_proxy:
+        llama = prepare_request_resources(body, llama_proxy, body_model, kwargs)
+        if await request.is_disconnected():
+            print(
+                f"Disconnected from client (via refresh/close) before llm invoked {request.client}"
+            )
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Client closed request",
+            )
+        return await run_in_threadpool(llama.create_chat_completion, **kwargs)
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
+async def get_models(
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> ModelList:
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": model_alias,
+                "object": "model",
+                "owned_by": "me",
+                "permissions": [],
+            }
+            for model_alias in llama_proxy
+        ],
+    }
+extras_tag = "Extras"  # passed as tags=[...] to the /extras/* routes below
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+    return TokenizeInputResponse(tokens=tokens)
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+    return TokenizeInputCountResponse(count=len(tokens))
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+    return DetokenizeInputResponse(text=text)