| from pathlib import Path |
| from typing import Iterable |
| from typing import List |
| from typing import Union |
|
|
| import sentencepiece as spm |
| from typeguard import check_argument_types |
|
|
| from espnet2.text.abs_tokenizer import AbsTokenizer |
|
|
|
|
class SentencepiecesTokenizer(AbsTokenizer):
    """Tokenizer that delegates to a SentencePiece model file.

    The ``SentencePieceProcessor`` is not created in ``__init__``; it is
    built and loaded on the first call to :meth:`text2tokens` or
    :meth:`tokens2text` and then reused for every later call.

    Args:
        model: Location of the SentencePiece model file. It is stored as a
            string in ``self.model`` and handed to the processor's ``load``.
    """

    def __init__(self, model: Union[Path, str]):
        assert check_argument_types()
        self.model = str(model)
        # NOTE(review): the processor is intentionally left unset here and
        # built on first use. Presumably this keeps the tokenizer picklable
        # (e.g. for multiprocessing workers) -- confirm before moving the
        # construction into __init__.
        self.sp = None

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.model}")'

    def _build_sentence_piece_processor(self):
        """Create and load the processor on first use; no-op afterwards."""
        if self.sp is not None:
            return
        processor = spm.SentencePieceProcessor()
        processor.load(self.model)
        # Publish the processor only once load() has returned. If load()
        # raises, self.sp stays None, so the next call retries the load
        # instead of silently using a processor that has no model.
        self.sp = processor

    def text2tokens(self, line: str) -> List[str]:
        """Split ``line`` into SentencePiece pieces.

        Args:
            line: Text to tokenize.

        Returns:
            The pieces produced by ``EncodeAsPieces`` for ``line``.
        """
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join SentencePiece pieces back into text.

        Args:
            tokens: Pieces to decode. Any iterable is accepted; it is
                materialized into a list before being passed on.

        Returns:
            The text produced by ``DecodePieces`` for ``tokens``.
        """
        self._build_sentence_piece_processor()
        return self.sp.DecodePieces(list(tokens))
|
|