| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer |
| |
|
| |
|
class NullTokenizer(MegatronTokenizer):
    """Synthetic tokenizer for performance benchmarking and debugging.

    "Tokens" are just the decimal integers found in a space-separated
    string, so no vocabulary files are needed. A single extra id beyond
    the configured vocabulary size is reserved for the EOD token.

    Args:
        vocab_size: vocabulary size for embedding (EOD id is appended
            after this range, so the effective size is ``vocab_size + 1``)
    """

    def __init__(self, vocab_size):
        super().__init__(None, vocab_size=vocab_size)
        # EOD sits just past the user-visible vocabulary range.
        self._vocab_size_without_eod = int(vocab_size)
        self._eod_id = self._vocab_size_without_eod

    def tokenize(self, text):
        """Parse a space-separated string of integers into token ids."""
        return list(map(int, text.split(' ')))

    def detokenize(self, ids):
        """Render token ids back into a space-separated string."""
        return ' '.join(map(str, ids))

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Return the character offset of each token within `text`.

        Assumes `text` is the single-space-joined form produced by
        `detokenize`, so each token occupies ``len(str(id))`` characters
        followed by one separator.
        """
        positions: list[int] = []
        cursor = 0
        for token_id in ids:
            positions.append(cursor)
            cursor += len(str(token_id)) + 1
        return positions

    @property
    def vocab_size(self):
        # +1 accounts for the reserved EOD id.
        return self._vocab_size_without_eod + 1

    @property
    def vocab(self):
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    @property
    def cls(self):
        return -1

    @property
    def sep(self):
        return -1

    @property
    def mask(self):
        return -1

    @property
    def eod(self):
        return self._eod_id

    @property
    def additional_special_tokens_ids(self):
        return None
|