ishanjmukherjee committed
Commit db299d8 · Parent: 78c1cf3

Add tokenizer files verbatim from Together's Evo 1 HF

Files changed (2)
  1. tokenizer.py +130 -0
  2. tokenizer_config.json +14 -0
tokenizer.py ADDED
@@ -0,0 +1,130 @@
+ # Copied verbatim from Together's Evo 1
+ # based on https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
+ from __future__ import annotations
+
+ import torch
+
+ import numpy as np
+
+ from os import PathLike
+ from typing import Dict, List, Tuple
+
+ from tokenizers import Tokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import BatchEncoding, TruncationStrategy
+ from transformers.utils.generic import TensorType, PaddingStrategy
+
+
+ EMPTY: str = ""
+
+
+ class ByteTokenizer(PreTrainedTokenizer):
+
+     """UTF-8 Encoder."""
+
+     @classmethod
+     def from_pretrained(cls, model_id: str | PathLike, **kwargs) -> ByteTokenizer:
+
+         return cls(**kwargs, byte_level=True)
+
+     @property
+     def vocab_size(self) -> int:
+
+         return 512
+
+     @property
+     def byte_level(self) -> bool:
+
+         return self.init_kwargs.get('byte_level', True)
+
+     def get_vocab(self) -> Dict[str, int]:
+
+         return {chr(i): i for i in range(self.vocab_size)}
+
+     def __len__(self) -> int:
+
+         return self.vocab_size
+
+     def clamp(self, n: int) -> int:
+
+         return max(32, min(n, self.vocab_size))
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+
+         return list(text)
+
+     def byte_tokenize(self, text: str) -> np.ndarray:
+
+         return np.frombuffer(text.encode('utf-8'), dtype=np.uint8)
+
+     def _convert_token_to_id(self, token: str) -> int:
+
+         return self.clamp(ord(token))
+
+     def _convert_id_to_token(self, index: int) -> str:
+
+         return chr(self.clamp(index))
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+
+         return EMPTY.join(tokens)
+
+     def _decode(self, token_ids: List[int], **kwargs) -> str:
+
+         indices = np.asarray(token_ids, dtype=np.uint8)
+
+         return (
+             indices.clip(min=32, max=self.vocab_size, out=indices)
+             .tobytes()
+             .decode('utf-8')
+         )
+
+     def _encode_plus(self, text: str, **kwargs) -> BatchEncoding:
+
+         first_ids = self.byte_tokenize(text).tolist()
+
+         return self.prepare_for_model(
+             first_ids,
+             pair_ids=None,
+             add_special_tokens=kwargs.get('add_special_tokens', False),
+             padding=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD).value,
+             truncation=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE).value,
+             max_length=kwargs.get('max_length'),
+             stride=kwargs.get('stride', 0),
+             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
+             return_tensors=kwargs.get('return_tensors'),
+             prepend_batch_axis=True,
+             return_attention_mask=kwargs.get('return_attention_mask'),
+             return_token_type_ids=kwargs.get('return_token_type_ids'),
+             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
+             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
+             return_length=kwargs.get('return_length', False),
+             verbose=kwargs.get('verbose', True),
+         )
+
+     def _batch_encode_plus(self, batch_text_or_text_pairs: List[str], **kwargs) -> BatchEncoding:
+
+         input_ids = [(self.byte_tokenize(text).tolist(), None) for text in batch_text_or_text_pairs]
+
+         return self._batch_prepare_for_model(
+             input_ids,
+             add_special_tokens=kwargs.get('add_special_tokens', False),
+             padding_strategy=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD),
+             truncation_strategy=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE),
+             max_length=kwargs.get('max_length'),
+             stride=kwargs.get('stride', 0),
+             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
+             return_attention_mask=kwargs.get('return_attention_mask'),
+             return_token_type_ids=kwargs.get('return_token_type_ids'),
+             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
+             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
+             return_length=kwargs.get('return_length', False),
+             return_tensors=kwargs.get('return_tensors'),
+             verbose=kwargs.get('verbose', True),
+         )
+
+     def _save_pretrained(
+         self, save_directory: str | PathLike, file_names: Tuple[str], **kwargs
+     ) -> Tuple[str]:
+
+         return file_names
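For quick sanity-checking, a minimal round-trip sketch (not part of the commit), assuming tokenizer.py is on the import path; since from_pretrained ignores its model_id argument, any placeholder string works:

# Hypothetical usage sketch, not part of the committed files.
from tokenizer import ByteTokenizer

tok = ByteTokenizer.from_pretrained("placeholder")  # model_id is ignored; byte_level=True is forced

ids = tok.byte_tokenize("ACGT")      # array([65, 67, 71, 84], dtype=uint8): one id per UTF-8 byte
print(tok.decode(ids.tolist()))      # "ACGT" -- _decode clips ids below 32, then UTF-8-decodes
print(tok("ACGT")["input_ids"])      # the same byte ids via the standard __call__ path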
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "added_tokens_decoder": {},
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer.ByteTokenizer",
+       null
+     ]
+   },
+   "byte_level": true,
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "padding_side": "left",
+   "truncation_side": "left"
+ }
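Once both files are in a model repo, the auto_map entry above lets AutoTokenizer resolve the custom class from the repo's tokenizer.py. A hedged loading sketch, with <repo-id> as a placeholder for the actual repository:

# Hypothetical loading sketch, not part of the committed files.
from transformers import AutoTokenizer

# trust_remote_code=True is required because ByteTokenizer is defined in the repo's
# tokenizer.py rather than in the transformers library itself; the null second entry
# in auto_map means no fast (Rust-backed) tokenizer is provided.
tok = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)
print(tok("ACGT")["input_ids"])  # raw UTF-8 byte values: [65, 67, 71, 84]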