# Step-Audio-EditX — funasr_detach/tokenizer/sentencepiece_tokenizer.py
# Author: xieli ("audio edit"), commit 6852edb
# (Header reconstructed from file-page metadata: raw / history blame / 1.66 kB)
from pathlib import Path
from typing import Iterable
from typing import List
from typing import Union
import sentencepiece as spm
from funasr_detach.tokenizer.abs_tokenizer import BaseTokenizer
from funasr_detach.register import tables
@tables.register("tokenizer_classes", "SentencepiecesTokenizer")
class SentencepiecesTokenizer(BaseTokenizer):
    """Tokenizer backed by a trained SentencePiece (BPE) model.

    The underlying ``SentencePieceProcessor`` wraps a SWIG object that cannot
    be pickled ("TypeError: can't pickle SwigPyObject objects"), which breaks
    passing this tokenizer to ``multiprocessing.Process()``.  It is therefore
    built lazily on first use and excluded from the pickled state; after
    unpickling it is transparently rebuilt on the next call.
    """

    def __init__(self, bpemodel: Union[Path, str], **kwargs):
        """Store the model path; the processor itself is built lazily.

        Args:
            bpemodel: Path to the trained SentencePiece model file.
            **kwargs: Forwarded unchanged to ``BaseTokenizer``.
        """
        super().__init__(**kwargs)
        self.bpemodel = str(bpemodel)
        # NOTE(kamo): don't build SentencePieceProcessor here — it is not
        # picklable; see the class docstring for the full rationale.
        self.sp = None

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.bpemodel}")'

    def __getstate__(self):
        # Drop the non-picklable SWIG processor from the pickled state so the
        # tokenizer stays picklable even after lazy construction has happened.
        state = self.__dict__.copy()
        state["sp"] = None
        return state

    def __setstate__(self, state):
        # Restore attributes; ``sp`` is rebuilt lazily on the next call.
        self.__dict__.update(state)

    def _build_sentence_piece_processor(self):
        # Build the SentencePieceProcessor lazily, exactly once.
        if self.sp is None:
            self.sp = spm.SentencePieceProcessor()
            self.sp.load(self.bpemodel)

    def text2tokens(self, line: str) -> List[str]:
        """Split ``line`` into SentencePiece subword pieces."""
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join subword ``tokens`` back into surface text."""
        self._build_sentence_piece_processor()
        return self.sp.DecodePieces(list(tokens))

    def encode(self, line: str) -> List[int]:
        """Encode ``line`` into a list of SentencePiece token ids."""
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsIds(line)

    def decode(self, line: List[int]) -> str:
        """Decode a list of token ids back into surface text."""
        self._build_sentence_piece_processor()
        return self.sp.DecodeIds(line)