JakeOh committed on
Commit
f4f74cb
·
verified ·
1 Parent(s): eb8366f

Upload folder using huggingface_hub

Browse files
char_tokenizer.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Character-level tokenizer compatible with HuggingFace transformers.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ from typing import Dict, List, Optional
8
+
9
+ from transformers import PreTrainedTokenizer
10
+
11
+
12
+ class CharTokenizer(PreTrainedTokenizer):
13
+ """
14
+ Character-level tokenizer that treats each character as a token.
15
+ Compatible with HuggingFace transformers.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ vocab_file: Optional[str] = None,
21
+ characters: Optional[str] = None,
22
+ model_max_length: int = 512,
23
+ padding_side: str = "right",
24
+ **kwargs,
25
+ ):
26
+ """
27
+ Initialize character tokenizer.
28
+
29
+ Args:
30
+ vocab_file: Path to vocabulary file (vocab.json) to load.
31
+ This is the first argument for HuggingFace compatibility.
32
+ characters: String of characters to include in vocabulary.
33
+ If None, will be built from training data or loaded from vocab_file.
34
+ model_max_length: Maximum sequence length.
35
+ padding_side: Which side to pad on ("left" or "right").
36
+ """
37
+ # Define special tokens before super().__init__
38
+ pad_token = kwargs.pop("pad_token", "<pad>")
39
+ unk_token = kwargs.pop("unk_token", "<unk>")
40
+ bos_token = kwargs.pop("bos_token", "<s>")
41
+ eos_token = kwargs.pop("eos_token", "</s>")
42
+
43
+ # Initialize vocab dictionaries first
44
+ self.char_to_id = {}
45
+ self.id_to_char = {}
46
+
47
+ # Load or build vocabulary
48
+ if vocab_file is not None and os.path.exists(vocab_file):
49
+ # Load vocabulary from file
50
+ with open(vocab_file, "r", encoding="utf-8") as f:
51
+ self.char_to_id = json.load(f)
52
+ self.id_to_char = {int(idx): char for char, idx in self.char_to_id.items()}
53
+ # Convert string keys to int keys for id_to_char
54
+ self.char_to_id = {
55
+ char: int(idx) if isinstance(idx, str) else idx
56
+ for char, idx in self.char_to_id.items()
57
+ }
58
+ elif characters is not None:
59
+ # Build vocabulary from characters
60
+ special_tokens = [pad_token, unk_token, bos_token, eos_token]
61
+ unique_chars = []
62
+ for char in characters:
63
+ if char not in unique_chars and char not in special_tokens:
64
+ unique_chars.append(char)
65
+ all_tokens = special_tokens + sorted(unique_chars)
66
+ self.char_to_id = {char: idx for idx, char in enumerate(all_tokens)}
67
+ self.id_to_char = {idx: char for char, idx in self.char_to_id.items()}
68
+
69
+ super().__init__(
70
+ pad_token=pad_token,
71
+ unk_token=unk_token,
72
+ bos_token=bos_token,
73
+ eos_token=eos_token,
74
+ model_max_length=model_max_length,
75
+ padding_side=padding_side,
76
+ **kwargs,
77
+ )
78
+
79
+ # Register special tokens to _added_tokens_encoder for proper tokenization.
80
+ # This ensures special tokens are recognized by tokens_trie and not split
81
+ # into individual characters during tokenization.
82
+ special_tokens_to_register = [pad_token, unk_token, bos_token, eos_token]
83
+ for token in special_tokens_to_register:
84
+ if token is not None and token in self.char_to_id:
85
+ token_id = self.char_to_id[token]
86
+ if token not in self._added_tokens_encoder:
87
+ from transformers.tokenization_utils import AddedToken
88
+
89
+ added_token = AddedToken(token, special=True, normalized=False)
90
+ self._added_tokens_encoder[token] = token_id
91
+ self._added_tokens_decoder[token_id] = added_token
92
+ self._update_trie()
93
+
94
+ @property
95
+ def vocab_size(self) -> int:
96
+ """Return vocabulary size including added tokens."""
97
+ base_size = len(self.char_to_id)
98
+ # Check if there are added tokens beyond base vocabulary
99
+ if hasattr(self, "added_tokens_decoder") and self.added_tokens_decoder:
100
+ max_added_id = max(int(k) for k in self.added_tokens_decoder.keys())
101
+ return max(base_size, max_added_id + 1)
102
+ return base_size
103
+
104
+ def get_vocab(self) -> Dict[str, int]:
105
+ """Return vocabulary dictionary."""
106
+ return self.char_to_id.copy()
107
+
108
+ def _tokenize(self, text: str) -> List[str]:
109
+ """Tokenize text into characters."""
110
+ return list(text)
111
+
112
+ def _convert_token_to_id(self, token: str) -> int:
113
+ """Convert a token (character) to an id."""
114
+ # Handle AddedToken objects from transformers
115
+ token_str = str(token) if not isinstance(token, str) else token
116
+ return self.char_to_id.get(token_str, self.char_to_id.get(self.unk_token, 1))
117
+
118
+ def _convert_id_to_token(self, index: int) -> str:
119
+ """Convert an id to a token (character)."""
120
+ return self.id_to_char.get(index, self.unk_token)
121
+
122
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
123
+ """Convert tokens back to string."""
124
+ return "".join(tokens)
125
+
126
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
127
+ """Save vocabulary to file."""
128
+ if not os.path.isdir(save_directory):
129
+ os.makedirs(save_directory, exist_ok=True)
130
+
131
+ vocab_file = os.path.join(
132
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
133
+ )
134
+
135
+ with open(vocab_file, "w", encoding="utf-8") as f:
136
+ json.dump(self.char_to_id, f, ensure_ascii=False, indent=2)
137
+
138
+ return (vocab_file,)
139
+
140
+ def build_inputs_with_special_tokens(
141
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
142
+ ) -> List[int]:
143
+ """
144
+ Build model inputs by adding special tokens.
145
+ Format: <s> token_ids_0 </s> [<s> token_ids_1 </s>]
146
+ """
147
+ bos = [self.bos_token_id] if self.bos_token_id is not None else []
148
+ eos = [self.eos_token_id] if self.eos_token_id is not None else []
149
+
150
+ if token_ids_1 is None:
151
+ return bos + token_ids_0 + eos
152
+
153
+ return bos + token_ids_0 + eos + bos + token_ids_1 + eos
154
+
155
+ def get_special_tokens_mask(
156
+ self,
157
+ token_ids_0: List[int],
158
+ token_ids_1: Optional[List[int]] = None,
159
+ already_has_special_tokens: bool = False,
160
+ ) -> List[int]:
161
+ """
162
+ Get mask for special tokens.
163
+ """
164
+ if already_has_special_tokens:
165
+ return super().get_special_tokens_mask(
166
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
167
+ )
168
+
169
+ bos_mask = [1] if self.bos_token_id is not None else []
170
+ eos_mask = [1] if self.eos_token_id is not None else []
171
+
172
+ if token_ids_1 is None:
173
+ return bos_mask + ([0] * len(token_ids_0)) + eos_mask
174
+
175
+ return (
176
+ bos_mask
177
+ + ([0] * len(token_ids_0))
178
+ + eos_mask
179
+ + bos_mask
180
+ + ([0] * len(token_ids_1))
181
+ + eos_mask
182
+ )
183
+
184
+
185
def create_char_tokenizer_from_file(
    file_path: str, save_directory: str, model_max_length: int = 512, **kwargs
) -> CharTokenizer:
    """
    Build a character tokenizer from the contents of a text file and save it.

    Args:
        file_path: Path to text file to build vocabulary from.
        save_directory: Directory to save the tokenizer.
        model_max_length: Maximum sequence length.
        **kwargs: Additional arguments for CharTokenizer.

    Returns:
        Initialized CharTokenizer.
    """
    # Load the whole corpus; every character it contains becomes a vocab entry.
    with open(file_path, "r", encoding="utf-8") as fh:
        corpus = fh.read()

    # Build the tokenizer from the raw text, then persist it to disk.
    tokenizer = CharTokenizer(characters=corpus, model_max_length=model_max_length, **kwargs)
    tokenizer.save_pretrained(save_directory)

    print(f"Character tokenizer created with vocabulary size: {tokenizer.vocab_size}")
    print(f"Saved to: {save_directory}")

    return tokenizer
214
+
215
+
216
def create_char_tokenizer_from_dataset(
    dataset,
    text_column: str,
    save_directory: str,
    model_max_length: int = 512,
    max_samples: Optional[int] = None,
    **kwargs,
) -> CharTokenizer:
    """
    Build a character tokenizer from a HuggingFace dataset column and save it.

    Args:
        dataset: HuggingFace dataset object.
        text_column: Name of the column containing text.
        save_directory: Directory to save the tokenizer.
        model_max_length: Maximum sequence length.
        max_samples: Maximum number of samples to use (None for all).
        **kwargs: Additional arguments for CharTokenizer.

    Returns:
        Initialized CharTokenizer.
    """
    # Optionally restrict to a prefix of the dataset.
    if max_samples is None:
        subset = dataset
    else:
        subset = dataset.select(range(min(max_samples, len(dataset))))

    # Accumulate every distinct character that appears in the text column.
    seen = set()
    for row in subset:
        seen.update(row[text_column])

    # Deterministic vocabulary order: sorted character string.
    vocab_chars = "".join(sorted(seen))
    tokenizer = CharTokenizer(characters=vocab_chars, model_max_length=model_max_length, **kwargs)

    tokenizer.save_pretrained(save_directory)

    print(f"Character tokenizer created with vocabulary size: {tokenizer.vocab_size}")
    print(f"Saved to: {save_directory}")

    return tokenizer
chat_template.jinja ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
{#- Chat template: emits bos_token once, then each message wrapped in
    <|system|>/<|user|>/<|assistant|> role markers terminated by <|end|>.
    A user turn also opens the assistant header so generation continues
    from <|assistant|>. Unknown roles are silently skipped. -#}
{{ bos_token }}
{%- for message in messages %}
{%- if (message['role'] == 'system') -%}
{{'<|system|>' + '\n' + message['content'].strip() + '<|end|>' + '\n'}}
{%- elif (message['role'] == 'user') -%}
{{'<|user|>' + '\n' + message['content'].strip() + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}
{%- elif message['role'] == 'assistant' -%}
{{message['content'].strip() + '<|end|>' + '\n'}}
{%- endif %}
{%- endfor %}
special_tokens_map.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|system|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|user|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|assistant|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ ],
32
+ "bos_token": "<|startoftext|>",
33
+ "eos_token": "<|endoftext|>",
34
+ "mask_token": "<|mdm_mask|>",
35
+ "pad_token": "<|endoftext|>"
36
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "2": {
4
+ "content": "<|startoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "3": {
12
+ "content": "<|endoftext|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "19": {
20
+ "content": "<|system|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "20": {
28
+ "content": "<|user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "21": {
36
+ "content": "<|assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "22": {
44
+ "content": "<|end|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "additional_special_tokens": [
53
+ "<|system|>",
54
+ "<|user|>",
55
+ "<|assistant|>",
56
+ "<|end|>"
57
+ ],
58
+ "bos_token": "<|startoftext|>",
59
+ "clean_up_tokenization_spaces": false,
60
+ "eos_token": "<|endoftext|>",
61
+ "extra_special_tokens": {},
62
+ "mask_token": "<|mdm_mask|>",
63
+ "model_max_length": 4096,
64
+ "pad_token": "<|endoftext|>",
65
+ "padding_side": "right",
66
+ "tokenizer_class": "CharTokenizer",
67
+ "unk_token": null,
68
+ "auto_map": {
69
+ "AutoTokenizer": [
70
+ "char_tokenizer.CharTokenizer",
71
+ null
72
+ ]
73
+ }
74
+ }
vocab.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 3,
3
+ "null": 1,
4
+ "<|startoftext|>": 2,
5
+ "*": 4,
6
+ "+": 5,
7
+ "-": 6,
8
+ "/": 7,
9
+ "0": 8,
10
+ "1": 9,
11
+ "2": 10,
12
+ "3": 11,
13
+ "4": 12,
14
+ "5": 13,
15
+ "6": 14,
16
+ "7": 15,
17
+ "8": 16,
18
+ "9": 17,
19
+ "=": 18,
20
+ "?": 19
21
+ }