| from transformers import DebertaV2Tokenizer | |
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DeBERTa-V2 tokenizer that pre-segments text with Juman++.

    Input text is run through a Juman++ morphological analyzer (producing
    whitespace-separated morphemes) before the standard DeBERTa-V2 subword
    tokenization is applied.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Morphological pre-segmenter applied in prepare_for_tokenization.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment *text* with Juman++ and optionally prepend a space.

        Returns the (possibly space-prefixed) segmented text together with
        the remaining kwargs, per the HF tokenizer hook contract.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or prefix_space:
            segmented = " " + segmented
        return (segmented, kwargs)
class JumanppTokenizer:
    """Thin wrapper around rhoknp's Juman++ binding.

    Segments Japanese text into morphemes and returns their surface forms
    joined by single spaces, suitable as a pre-tokenization step.
    """

    def __init__(self):
        """Create a Juman++ applier.

        Raises:
            ImportError: if the optional ``rhoknp`` dependency is not installed.
        """
        try:
            import rhoknp
        except ImportError:
            # Fixed: the message previously named "JumanppPreTokenizer",
            # which does not match this class's actual name.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            )
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return *text* as whitespace-separated morpheme surface forms."""
        morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        if not morphemes:
            # Sentence-level analysis can come back empty (e.g. for input
            # spanning multiple sentences); retry at document granularity.
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)