|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod |
|
|
from typing import List |
|
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = ["TokenizerSpec"]
|
|
|
|
|
|
|
|
class TokenizerSpec(ABC):
    """Abstract base class describing the interface of a tokenizer.

    Inherit from this class to implement a new tokenizer. A concrete
    subclass must override all six abstract conversion methods
    (``text_to_tokens``, ``tokens_to_text``, ``tokens_to_ids``,
    ``ids_to_tokens``, ``text_to_ids`` and ``ids_to_text``); until it does,
    instantiating it raises ``TypeError``.

    The exact types of "text", "tokens" and "ids" are not fixed here; they
    are defined by each concrete implementation.
    """

    @abstractmethod
    def text_to_tokens(self, text):
        """Convert ``text`` to tokens (must be overridden by subclasses)."""

    @abstractmethod
    def tokens_to_text(self, tokens):
        """Convert ``tokens`` back to text (must be overridden by subclasses)."""

    @abstractmethod
    def tokens_to_ids(self, tokens):
        """Convert ``tokens`` to ids (must be overridden by subclasses)."""

    @abstractmethod
    def ids_to_tokens(self, ids):
        """Convert ``ids`` to tokens (must be overridden by subclasses)."""

    @abstractmethod
    def text_to_ids(self, text):
        """Convert ``text`` to ids (must be overridden by subclasses)."""

    @abstractmethod
    def ids_to_text(self, ids):
        """Convert ``ids`` back to text (must be overridden by subclasses)."""

    def add_special_tokens(self, special_tokens: List[str]):
        """Register additional special tokens with the tokenizer.

        This hook is optional (it is not abstract); the base implementation
        provides no behavior and always raises.

        Args:
            special_tokens: The special tokens to add.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("To be implemented")

    @property
    def name(self):
        """Return the class name of the concrete tokenizer instance."""
        return type(self).__name__
|
|
|