koichi12's picture
Add files using upload-large-folder tool
762d748 verified
raw
history blame
996 Bytes
from typing import Dict, Hashable, List, Protocol, Set, Tuple, Union
import numpy as np
from numpy.typing import NDArray
class Tokenizer(Hashable, Protocol):
    """Structural interface for tokenizers.

    Any object exposing these attributes and methods satisfies the protocol;
    no inheritance is required (``typing.Protocol``). ``Hashable`` is mixed in
    because implementations are expected to be usable as cache/dict keys.
    """

    # NOTE(review): attribute semantics below are inferred from standard
    # tokenizer naming — confirm against concrete implementations.
    eos_token: str  # end-of-sequence token string
    eos_token_id: int  # integer id of the end-of-sequence token
    pad_token_id: int  # integer id of the padding token
    vocabulary: Dict[str, int]  # maps token string -> token id
    special_tokens: Set[str]  # token strings treated as special (not plain text)

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
        """Translate the input prompts into arrays of token ids and attention mask."""
        ...

    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
        """Translate an array of token ids to a string or list of strings."""
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is for instance useful for BPE tokenizers where whitespaces are
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...