import dataclasses
import pickle
import warnings
from typing import (
TYPE_CHECKING,
Dict,
Iterator,
List,
Optional,
Set,
Tuple,
TypedDict,
Union,
)
from typing_extensions import Unpack
from outlines.generate.api import GenerationParameters, SamplingParameters
from outlines.models.tokenizer import Tokenizer
if TYPE_CHECKING:
from llama_cpp import Llama, LogitsProcessorList
class LlamaCppTokenizer(Tokenizer):
def __init__(self, model: "Llama"):
self.eos_token_id = model.token_eos()
self.eos_token = model.tokenizer().decode([self.eos_token_id])
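        # llama.cpp models expose no dedicated padding token, so reuse EOS.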
self.pad_token_id = self.eos_token_id
self.special_tokens: Set[str] = set()
self.vocabulary: Dict[str, int] = dict()
self.tokenizer = model.tokenizer()
# TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
self._hf_tokenizer = None
try:
self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
self._hf_tokenizer = model.tokenizer_.hf_tokenizer
except AttributeError:
            # No HF tokenizer is attached: build the vocabulary token by token
            # from the llama.cpp tokenizer instead.
for t in range(model.n_vocab()):
token_piece = model.tokenizer().decode([t])
self.vocabulary[token_piece] = t
# ensure stable ordering of vocabulary
self.vocabulary = {
tok: tok_id
for tok, tok_id in sorted(self.vocabulary.items(), key=lambda x: x[1])
}
self._hash = None
def decode(self, token_ids: List[int]) -> List[str]:
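        # `llama-cpp-python` detokenizes to raw bytes; decode them as UTF-8,
        # dropping byte sequences that are not valid on their own.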
decoded_bytes = self.tokenizer.detokenize(token_ids)
return [decoded_bytes.decode("utf-8", errors="ignore")]
def encode(
self, prompt: Union[str, List[str]], add_bos: bool = True, special: bool = True
) -> Tuple[List[int], List[int]]:
if isinstance(prompt, list):
raise NotImplementedError(
"llama-cpp-python tokenizer doesn't support batch tokenization"
)
token_ids = self.tokenizer.tokenize(
prompt.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
)
# generate attention mask, missing from llama-cpp-python
attention_mask = [
1 if token_id != self.pad_token_id else 0 for token_id in token_ids
]
return token_ids, attention_mask
def convert_token_to_string(self, token: str) -> str:
if self._hf_tokenizer is not None:
from transformers.file_utils import SPIECE_UNDERLINE
token_str = self._hf_tokenizer.convert_tokens_to_string([token])
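            # SentencePiece marks word-initial tokens with SPIECE_UNDERLINE
            # ("▁"); `convert_tokens_to_string` drops it for a single token,
            # so restore the leading space here ("<0x20>" is the raw space
            # byte token).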
if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
token_str = " " + token_str
return token_str
else:
return token
def __eq__(self, other):
if not isinstance(other, LlamaCppTokenizer):
return False
return self.__getstate__() == other.__getstate__()
def __hash__(self):
if self._hash is None:
self._hash = hash(pickle.dumps(self))
return self._hash
def __getstate__(self):
"""Create a stable representation for outlines.caching"""
return (
self.vocabulary,
self.eos_token_id,
self.eos_token,
self.pad_token_id,
sorted(self.special_tokens),
)
def __setstate__(self, state):
raise NotImplementedError("Cannot load a pickled llamacpp tokenizer")
class LlamaCppParams(TypedDict, total=False):
suffix: Optional[str]
temperature: float
top_p: float
min_p: float
typical_p: float
seed: int
max_tokens: int
logits_processor: "LogitsProcessorList"
stop: Optional[Union[str, List[str]]]
    frequency_penalty: float
presence_penalty: float
repeat_penalty: float
top_k: int
tfs_z: float
mirostat_mode: int
mirostat_tau: float
mirostat_eta: float
stream: bool
class LlamaCpp:
"""Represents a model provided by the `llama-cpp-python` library.
    We wrap models from model-providing libraries in order to give all of
them the same interface in Outlines and allow users to easily switch
between providers. This class wraps the `llama_cpp.Llama` class from the
`llama-cpp-python` library.
"""
def __init__(self, model: "Llama"):
self.model = model
@property
def tokenizer(self):
return LlamaCppTokenizer(self.model)
def prepare_generation_parameters(
self,
generation_parameters: GenerationParameters,
sampling_parameters: SamplingParameters,
structure_logits_processor,
**llama_cpp_params: Unpack[LlamaCppParams],
):
"""Prepare the generation parameters.
`llama-cpp-python` uses different default values
"""
from llama_cpp import LogitsProcessorList
max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters)
# We update `llama_cpp_params` with the values the user passed to the
# generator.
if "stop" not in llama_cpp_params:
llama_cpp_params["stop"] = stop_at
if "seed" not in llama_cpp_params:
llama_cpp_params["seed"] = seed
# Somehow `llama-cpp-python` generates `max_tokens + 1` tokens
if "max_tokens" not in llama_cpp_params:
if max_tokens is None:
llama_cpp_params["max_tokens"] = -1 # indicates unlimited tokens
else:
llama_cpp_params["max_tokens"] = max_tokens - 1
else:
llama_cpp_params["max_tokens"] = llama_cpp_params["max_tokens"] - 1
sampler, num_samples, top_p, top_k, temperature = dataclasses.astuple(
sampling_parameters
)
# We update the `llama_cpp_params` with the sampling values that
# were specified by the user via the `Sampler` class, unless they
# are also specified in `llama_cpp_params`. We also disable other
# sampling methods that are enabled by default and reset the temperature
# value.
#
# See https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22
# for the default values in `llama.cpp` and indications to disable the sampling modes.
# Mirostat sampling, tail-free sampling and all penalties are disabled by default.
#
# See https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
# for default values in `llama-cpp-python`
if sampler == "beam_search":
raise NotImplementedError(
"The `llama_cpp_python` library does not support Beam Search."
)
if num_samples != 1:
raise NotImplementedError(
"The `llama_cpp_python` library does not allow to take several samples."
)
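        # In `llama.cpp`, `top_p=1.0`, `min_p=0.0` and `top_k=-1` leave the
        # corresponding sampling filter disabled.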
if "top_p" not in llama_cpp_params:
if top_p is not None:
llama_cpp_params["top_p"] = top_p
else:
llama_cpp_params["top_p"] = 1.0
if "min_p" not in llama_cpp_params:
llama_cpp_params["min_p"] = 0.0
if "top_k" not in llama_cpp_params:
if top_k is not None:
llama_cpp_params["top_k"] = top_k
else:
llama_cpp_params["top_k"] = -1
if "temperature" not in llama_cpp_params:
if temperature is not None:
llama_cpp_params["temperature"] = temperature
else:
llama_cpp_params["temperature"] = 1.0
if "repeat_penalty" not in llama_cpp_params:
llama_cpp_params["repeat_penalty"] = 1.0
# The choice to stream or not should happen via the high-level API
llama_cpp_params["stream"] = False
if structure_logits_processor is not None:
if "logits_processor" in llama_cpp_params:
llama_cpp_params["logits_processor"].append(structure_logits_processor)
else:
llama_cpp_params["logits_processor"] = LogitsProcessorList(
[structure_logits_processor]
)
return llama_cpp_params
def generate(
self,
prompts: Union[str, List[str]],
generation_parameters: GenerationParameters,
structure_logits_processor,
sampling_parameters: SamplingParameters,
**llama_cpp_params: Unpack[LlamaCppParams],
) -> str:
"""Generate text using `llama-cpp-python`.
Arguments
---------
prompts
A prompt or list of prompts.
generation_parameters
            An instance of `GenerationParameters` that contains the maximum
            number of tokens, the stop sequences and the seed, i.e. the
            arguments passed to `SequenceGeneratorAdapter`'s `__call__` method.
        structure_logits_processor
            The logits processor to use when generating text.
sampling_parameters
An instance of `SamplingParameters`, a dataclass that contains
the name of the sampler to use and related parameters as available
in Outlines.
llama_cpp_params
Keyword arguments that can be passed to
`llama_cpp_python.Llama.__call__`. The values in `llama_cpp_params`
supersede the values of the parameters in `generation_parameters` and
`sampling_parameters`. See the `llama_cpp_python` documentation for
a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
Returns
-------
The generated text.
"""
if not isinstance(prompts, str):
raise NotImplementedError(
"The `llama-cpp-python` library does not support batch inference."
)
llama_cpp_params = self.prepare_generation_parameters(
generation_parameters,
sampling_parameters,
structure_logits_processor,
**llama_cpp_params,
)
completion = self.model(prompts, **llama_cpp_params)
result = completion["choices"][0]["text"]
self.model.reset()
return result
def stream(
self,
prompts: Union[str, List[str]],
generation_parameters: GenerationParameters,
structure_logits_processor,
sampling_parameters: SamplingParameters,
**llama_cpp_params: Unpack[LlamaCppParams],
) -> Iterator[str]:
"""Stream text using `llama-cpp-python`.
Arguments
---------
prompts
A prompt or list of prompts.
generation_parameters
            An instance of `GenerationParameters` that contains the maximum
            number of tokens, the stop sequences and the seed, i.e. the
            arguments passed to `SequenceGeneratorAdapter`'s `__call__` method.
        structure_logits_processor
            The logits processor to use when generating text.
sampling_parameters
An instance of `SamplingParameters`, a dataclass that contains
the name of the sampler to use and related parameters as available
in Outlines.
llama_cpp_params
Keyword arguments that can be passed to
`llama_cpp_python.Llama.__call__`. The values in `llama_cpp_params`
supersede the values of the parameters in `generation_parameters` and
`sampling_parameters`. See the `llama_cpp_python` documentation for
a list of possible values: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__
Returns
-------
        A generator that returns strings.
"""
if not isinstance(prompts, str):
raise NotImplementedError(
"The `llama-cpp-python` library does not support batch inference."
)
llama_cpp_params = self.prepare_generation_parameters(
generation_parameters,
sampling_parameters,
structure_logits_processor,
**llama_cpp_params,
)
llama_cpp_params["stream"] = True
generator = self.model(prompts, **llama_cpp_params)
def token_generator() -> Iterator[str]:
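            # Drain the llama-cpp-python stream and reset the model once the
            # underlying generator is exhausted.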
while True:
try:
result = next(generator)
yield result["choices"][0]["text"]
except StopIteration:
self.model.reset()
return
return token_generator()
def load_lora(self, adapter_path: str):
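        # A truthy return value from `apply_lora_from_file` is treated as a
        # failure to apply the adapter.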
if self.model._model.apply_lora_from_file(
adapter_path,
1.0,
):
raise RuntimeError(f"Failed to apply LoRA from lora path: {adapter_path}")
def llamacpp(
repo_id: str, filename: Optional[str] = None, **llamacpp_model_params
) -> LlamaCpp:
"""Load a model from the `llama-cpp-python` library.
We use the `Llama.from_pretrained` classmethod that downloads models
directly from the HuggingFace hub, instead of asking users to specify
a path to the downloaded model. One can still load a local model
by initializing `llama_cpp.Llama` directly.
Arguments
---------
repo_id
The name of the model repository.
filename:
        A filename or glob pattern to match the model file in the repo.
    llamacpp_model_params
Llama-specific model parameters. See the `llama-cpp-python` documentation
for the full list: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__
"""
from llama_cpp import Llama
# Default to using the model's full context length
if "n_ctx" not in llamacpp_model_params:
llamacpp_model_params["n_ctx"] = 0
if "verbose" not in llamacpp_model_params:
llamacpp_model_params["verbose"] = False
# TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
if "tokenizer" not in llamacpp_model_params:
warnings.warn(
"The pre-tokenizer in `llama.cpp` handles unicode improperly "
+ "(https://github.com/ggerganov/llama.cpp/pull/5613)\n"
+ "Outlines may raise a `RuntimeError` when building the regex index.\n"
+ "To circumvent this error when using `models.llamacpp()` you may pass the argument"
+ "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(<hf_repo_id>)`\n"
)
model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)
return LlamaCpp(model)
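# A minimal usage sketch (hedged, not part of the module): the repo id and
# filename are placeholders, and the positional arguments below follow the
# field order implied by the `dataclasses.astuple` calls above, i.e.
# (max_tokens, stop_at, seed) and (sampler, num_samples, top_p, top_k,
# temperature).
#
#     from outlines.generate.api import GenerationParameters, SamplingParameters
#
#     model = llamacpp("<hf_repo_id>", "<model_file>.gguf")
#     text = model.generate(
#         "Write a haiku about llamas.",
#         GenerationParameters(64, None, 42),  # max_tokens, stop_at, seed
#         None,  # no structure logits processor
#         SamplingParameters("multinomial", 1, None, None, 0.7),
#     )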