| from typing import Dict, Hashable, List, Protocol, Set, Tuple, Union | |
| import numpy as np | |
| from numpy.typing import NDArray | |
class Tokenizer(Hashable, Protocol):
    """Structural (duck-typed) interface for tokenizers.

    Any hashable object exposing these attributes and methods satisfies the
    protocol — no inheritance required. The ``Hashable`` base requires
    implementations to be usable as dict keys / set members.
    """

    # End-of-sequence token string and its integer id.
    eos_token: str
    eos_token_id: int
    # Token id used for padding (presumably to align batched sequences — confirm with implementations).
    pad_token_id: int
    # Mapping from token string to token id.
    vocabulary: Dict[str, int]
    # Set of special token strings (e.g. EOS/PAD markers — verify against concrete tokenizers).
    special_tokens: Set[str]

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        """Translate the input prompts into arrays of token ids and attention mask.

        Accepts a single prompt or a list of prompts; returns a
        ``(token_ids, attention_mask)`` pair of int64 arrays.
        """
        ...

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        """Translate an array of token ids into a list of strings."""
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is for instance useful for BPE tokenizers where whitespaces are
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...