| from abc import ABC, abstractmethod, abstractproperty |
| from typing import Dict, List, Tuple, Union |
|
|
| T = Union[str, bytes] |  # Token type: a token is either text (str) or raw bytes.
|
|
|
|
class BaseTokenizer(ABC):
    """Abstract interface for converting between text, tokens and integer ids.

    Subclasses must provide the four primitive conversions
    (``text2tokens``, ``tokens2text``, ``tokens2ids``, ``ids2tokens``) as
    well as ``vocab_size`` and ``symbol_table``.  ``tokenize`` and
    ``detokenize`` are implemented here in terms of those primitives.
    """

    def tokenize(self, line: str) -> Tuple[List[T], List[int]]:
        """Split *line* into tokens and map those tokens to ids.

        Args:
            line: Text to tokenize.

        Returns:
            A ``(tokens, ids)`` pair, where ``tokens`` is the result of
            ``text2tokens(line)`` and ``ids`` is the result of
            ``tokens2ids(tokens)``.
        """
        tokens = self.text2tokens(line)
        ids = self.tokens2ids(tokens)
        return tokens, ids

    def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]:
        """Map *ids* back to tokens and join those tokens into text.

        Args:
            ids: Token ids to convert.

        Returns:
            A ``(text, tokens)`` pair, where ``tokens`` is the result of
            ``ids2tokens(ids)`` and ``text`` is the result of
            ``tokens2text(tokens)``.  Note the order is the mirror image of
            the pair returned by ``tokenize``.
        """
        tokens = self.ids2tokens(ids)
        text = self.tokens2text(tokens)
        return text, tokens

    @abstractmethod
    def text2tokens(self, line: str) -> List[T]:
        """Convert the text *line* into a list of tokens."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def tokens2text(self, tokens: List[T]) -> str:
        """Convert a list of tokens into a single text string."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def tokens2ids(self, tokens: List[T]) -> List[int]:
        """Convert a list of tokens into a list of integer ids."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def ids2tokens(self, ids: List[int]) -> List[T]:
        """Convert a list of integer ids into a list of tokens."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def vocab_size(self) -> int:
        """Return the vocabulary size.

        This is a regular method (call it as ``vocab_size()``), unlike
        ``symbol_table``, which is a property.
        NOTE(review): presumably the number of entries in ``symbol_table``;
        confirm against the concrete implementations.
        """
        raise NotImplementedError("abstract method")

    # ``abc.abstractproperty`` has been deprecated since Python 3.3; stacking
    # ``@property`` on ``@abstractmethod`` is the supported equivalent and
    # keeps ``symbol_table`` an abstract, read-only property.
    @property
    @abstractmethod
    def symbol_table(self) -> Dict[T, int]:
        """Mapping from token to integer.

        NOTE(review): the integers are presumably the token ids used by
        ``tokens2ids`` / ``ids2tokens``; confirm against implementations.
        """
        raise NotImplementedError("abstract method")
|
|