Spaces:
Running
on
Zero
Running
on
Zero
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| import logging | |
| from typing import Any | |
| from pydantic import BaseModel | |
| from bytelatent.tokenizers.blt_tokenizer import BltTokenizer | |
| from bytelatent.tokenizers.tiktoken_tokenizer import TikTokenTokenizer | |
| try: | |
| from sentencepiece import SentencePieceProcessor | |
| has_sp = True | |
| except ImportError: | |
| has_sp = False | |
| try: | |
| import tiktoken | |
| from tiktoken.load import load_tiktoken_bpe | |
| has_tiktoken = True | |
| except ImportError: | |
| has_tiktoken = False | |
| from bytelatent.tokenizers.abstract_tokenizer import Tokenizer | |
| from bytelatent.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer | |
| logger = logging.getLogger(__name__) | |
| class MockTokenizer(Tokenizer): | |
| n_words: int = 256 | |
| def encode(self, text: str, add_bos: bool, add_eos: bool): | |
| return text | |
| def decode(self, tokens): | |
| raise NotImplementedError() | |
| def get_token_offsets( | |
| self, text: str, tokens: list[int] | None = None | |
| ) -> tuple[list[str]]: | |
| raise NotImplementedError() | |
| class TokenizerArgs(BaseModel): | |
| name: str = "bytes" | |
| init_kwargs: dict[str, Any] | None = None | |
| def build(self) -> Tokenizer: | |
| if self.init_kwargs is None: | |
| init_kwargs = {} | |
| else: | |
| init_kwargs = self.init_kwargs | |
| if self.name == "blt": | |
| return BltTokenizer(**init_kwargs) | |
| elif self.name == "mock": | |
| return MockTokenizer(**init_kwargs) | |
| elif self.name == "sp": | |
| assert has_sp, "sentencepiece not installed" | |
| return SentencePieceTokenizer(**init_kwargs) | |
| elif self.name == "tiktoken": | |
| assert has_tiktoken, "tiktoken not installed" | |
| return TikTokenTokenizer(**init_kwargs) | |
| else: | |
| raise NotImplementedError(f"{self.name} tokenizer type is not implemented") | |