| | from __future__ import annotations |
| |
|
| | import abc |
| | from typing import ( |
| | List, |
| | Optional, |
| | Any, |
| | ) |
| |
|
| | import llama_cpp |
| | from llama_cpp.llama_types import List |
| |
|
| |
|
class BaseLlamaTokenizer(abc.ABC):
    """Abstract interface for converting between UTF-8 bytes and token ids."""

    @abc.abstractmethod
    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Convert *text* into a list of token ids.

        Args:
            text: The utf-8 encoded string to tokenize.
            add_bos: If True, prepend a beginning-of-sequence token.
            special: If True, tokenize special tokens as such.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Convert token ids back into utf-8 encoded bytes.

        Args:
            tokens: The token ids to detokenize.
            prev_tokens: Earlier tokens in the stream; when provided, offset
                mapping is performed against them.
            special: If True, render special tokens into the output.
        """
        raise NotImplementedError
| |
|
| |
|
class LlamaTokenizer(BaseLlamaTokenizer):
    """Tokenizer backed by a llama.cpp model's built-in vocabulary."""

    def __init__(self, llama: llama_cpp.Llama):
        # Keep a handle on the underlying model object, which owns the vocab.
        self._model = llama._model

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize utf-8 encoded bytes via the underlying llama.cpp model."""
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Render token ids back into utf-8 encoded bytes.

        NOTE(review): prev_tokens is accepted for interface compatibility
        but is not forwarded to this backend — confirm intended.
        """
        return self._model.detokenize(tokens, special=special)

    def encode(
        self, text: str, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Convenience wrapper: tokenize a str (lossy utf-8 encode)."""
        raw = text.encode("utf-8", errors="ignore")
        return self.tokenize(raw, add_bos=add_bos, special=special)

    def decode(self, tokens: List[int]) -> str:
        """Convenience wrapper: detokenize to a str (lossy utf-8 decode)."""
        raw = self.detokenize(tokens)
        return raw.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Build a tokenizer from a model file, loading only the vocabulary."""
        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
|
| |
|
class LlamaHFTokenizer(BaseLlamaTokenizer):
    """Tokenizer that delegates to a Hugging Face ``transformers`` tokenizer."""

    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize utf-8 encoded bytes with the HF tokenizer.

        NOTE(review): add_bos is accepted for interface compatibility but is
        not forwarded; special-token insertion is governed by ``special``.
        """
        decoded = text.decode("utf-8", errors="ignore")
        return self.hf_tokenizer.encode(decoded, add_special_tokens=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Render token ids back into utf-8 encoded bytes.

        When prev_tokens is provided, the combined stream is decoded and only
        the byte suffix contributed by ``tokens`` is returned, so streaming
        callers receive exact continuations.
        """
        skip_special_tokens = not special
        if prev_tokens is None:
            text = self.hf_tokenizer.decode(
                tokens, skip_special_tokens=skip_special_tokens
            )
            return text.encode("utf-8", errors="ignore")
        # Decode with and without the new tokens, then slice off the shared
        # prefix; decoding in context avoids partial-merge artifacts.
        combined = self.hf_tokenizer.decode(
            prev_tokens + tokens, skip_special_tokens=skip_special_tokens
        ).encode("utf-8", errors="ignore")
        prefix = self.hf_tokenizer.decode(
            prev_tokens, skip_special_tokens=skip_special_tokens
        ).encode("utf-8", errors="ignore")
        return combined[len(prefix):]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        """Load a HF tokenizer by name or path and wrap it.

        Raises:
            ImportError: if the ``transformers`` package is not installed.
        """
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library is required to use the `HFTokenizer`."
                "You can install it with `pip install transformers`."
            )
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)
| |
|