# Paste residue (not part of the module): "Spaces:" / "Runtime error" / "Runtime error"
# Standard library.
import csv  # NOTE(review): appears unused in this chunk — confirm against the rest of the file before removing.
from typing import Any, Dict, List

# Third-party.
import sentencepiece as sp
class SentencePieceBPETokenizer(object):
    r"""
    Thin wrapper around a pretrained `SentencePiece
    <https://github.com/google/sentencepiece>`_ BPE model. Converts caption
    strings to lists of integer token IDs (and back), and maps individual
    tokens to IDs.

    Parameters
    ----------
    model_path: str
        Path to the ``.model`` file trained by SentencePiece.
    """

    # Meta-symbol SentencePiece uses to mark a word-initial piece.
    SP_SPACE = u"▁"

    def __init__(self, model_path: str):
        # Keep the path so the processor can be re-created after unpickling
        # (see ``__getstate__`` / ``__setstate__``).
        self.model_path = model_path
        self.model = sp.SentencePieceProcessor()
        self.model.Load(model_path)

    def __getstate__(self):
        r"""
        Drop the (unpicklable) native processor from the pickled state.
        Together with ``__setstate__`` this keeps instances usable across
        pickling, e.g. while data loading with multiple workers.
        """
        picklable = self.__dict__.copy()
        picklable["model"] = None
        return picklable

    def __setstate__(self, state_dict: Dict[str, Any]):
        r"""Restore attributes, then rebuild the processor from ``model_path``."""
        self.__dict__ = state_dict
        self.model = sp.SentencePieceProcessor()
        self.model.Load(self.model_path)

    def get_vocab_size(self) -> int:
        r"""Return number of tokens in vocabulary (including special tokens)."""
        return len(self.model)

    def token_to_id(self, token: str) -> int:
        r"""Get integer ID of a string token (``<unk>`` if does not exist)."""
        return self.model.piece_to_id(token)

    def id_to_token(self, token_id: int) -> str:
        r"""Get string token of an integer ID (``<unk>`` if does not exist)."""
        return self.model.id_to_piece(token_id)

    def encode(self, text: str) -> List[int]:
        r"""Convert a text string to a list of integer token ids."""
        return self.model.EncodeAsIds(text)

    def decode(self, token_ids: List[int]) -> str:
        r"""Convert a sequence of token IDs to a text string."""
        return self.model.DecodeIds(token_ids)