JorgeVanco commited on
Commit
8c2cc2d
·
verified ·
1 Parent(s): 7c53e69

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<eos>": 50259,
3
+ "<mask>": 50258,
4
+ "<pad>": 50257,
5
+ "<|delete|>": 50260,
6
+ "<|im_end|>": 50262,
7
+ "<|im_start|>": 50261
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% for message in messages %}<|im_start|>{{ message['role'] }}
2
+ {% if message['role'] == 'assistant' %}{% generation %}{{ message['content'] }}<|im_end|>{% endgeneration %}{% else %}{{ message['content'] }}<|im_end|>{% endif %}
3
+ {% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
4
+ {% endif %}
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertForMaskedLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50256,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "mean",
12
+ "cls_token_id": 50281,
13
+ "custom_pipelines": {
14
+ "text-diffusion": {
15
+ "impl": "pipeline.TextDiffusionPipeline",
16
+ "pt": [
17
+ "AutoModelForMaskedLM"
18
+ ],
19
+ "tf": []
20
+ }
21
+ },
22
+ "decoder_bias": true,
23
+ "deterministic_flash_attn": false,
24
+ "dtype": "float32",
25
+ "embedding_dropout": 0.0,
26
+ "eos_token_id": 50259,
27
+ "global_attn_every_n_layers": 3,
28
+ "global_rope_theta": 160000.0,
29
+ "gradient_checkpointing": false,
30
+ "hidden_activation": "gelu",
31
+ "hidden_size": 1280,
32
+ "initializer_cutoff_factor": 2.0,
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 5120,
35
+ "layer_norm_eps": 1e-05,
36
+ "local_attention": 128,
37
+ "local_rope_theta": 10000.0,
38
+ "mask_token_id": 50258,
39
+ "max_position_embeddings": 8192,
40
+ "mlp_bias": false,
41
+ "mlp_dropout": 0.0,
42
+ "model_type": "modernbert",
43
+ "norm_bias": false,
44
+ "norm_eps": 1e-05,
45
+ "num_attention_heads": 10,
46
+ "num_hidden_layers": 20,
47
+ "pad_token_id": 50257,
48
+ "position_embedding_type": "absolute",
49
+ "repad_logits_with_grad": false,
50
+ "sep_token_id": 50282,
51
+ "seq_length": 2048,
52
+ "sparse_pred_ignore_index": -100,
53
+ "sparse_prediction": false,
54
+ "transformers_version": "4.56.2",
55
+ "use_cache": false,
56
+ "vocab_size": 50263
57
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2451886549ee8a6934888a3c296b22d212b3d6725322ab8204fd89db2e532354
3
+ size 2361481436
pipeline.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BatchEncoding, Pipeline
2
+ import torch
3
+ from typing import Any, Generator
4
+
5
class TextDiffusionPipeline(Pipeline):
    """Custom pipeline that generates text with a masked-diffusion model.

    Starts from a (partially) masked token sequence and iteratively unmasks
    tokens over a number of diffusion steps.
    """

    def _sanitize_parameters(
        self,
        num_steps: int = 50,
        allow_edits: bool = True,
        use_confidence: bool = False,
        stop_token: str | None = None,
        **kwargs
    ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
        """Split pipeline kwargs into preprocess / forward / postprocess kwargs.

        Args:
            num_steps: Number of diffusion steps (defaults to 50).
            allow_edits: Whether already-visible tokens may be edited.
            use_confidence: Use confidence-based (top-k) unmasking instead of
                random unmasking.
            stop_token: Token string at which generation may stop early.
            **kwargs: May contain ``max_length``, forwarded to ``preprocess``.

        Returns:
            ``(preprocess_kwargs, forward_kwargs, postprocess_kwargs)`` as
            required by the ``transformers.Pipeline`` contract.
        """
        # Everything that controls the diffusion loop goes to _forward.
        forward_kwargs = {
            "num_steps": num_steps,
            "allow_edits": allow_edits,
            "use_confidence": use_confidence,
            "stop_token": stop_token
        }

        preprocess_kwargs = {}
        if "max_length" in kwargs:
            preprocess_kwargs["max_length"] = kwargs["max_length"]

        return preprocess_kwargs, forward_kwargs, {}
28
+
29
+ def preprocess(self, input_text, max_length=None) -> BatchEncoding | Any:
30
+ if self.tokenizer is None:
31
+ raise ValueError("Tokenizer was not passed to the pipeline!")
32
+ # Standard tokenization
33
+ if max_length is None:
34
+ # Safely access config if it exists, default to 512
35
+ max_length = getattr(self.model.config, "seq_length", 512)
36
+
37
+ if input_text is None:
38
+ input_text = ""
39
+
40
+ tokenized_text = self.tokenizer.encode(input_text)
41
+
42
+ if len(tokenized_text) < max_length:
43
+ input_ids = torch.full((1, max_length), self.tokenizer.mask_token_id, dtype=torch.long) # type: ignore
44
+ input_ids[0, :len(tokenized_text)] = torch.tensor(tokenized_text, dtype=torch.long)
45
+
46
+ return BatchEncoding({
47
+ "input_ids": input_ids,
48
+ "attention_mask": torch.ones_like(input_ids)
49
+ })
50
+
51
+ return self.tokenizer(
52
+ input_text,
53
+ return_tensors="pt",
54
+ padding="max_length",
55
+ max_length=max_length,
56
+ truncation=True,
57
+ )
58
+
59
    @torch.no_grad()
    def diffusion_generator(
        self,
        input_ids: torch.Tensor,
        num_steps: int,
        allow_edits: bool = True,
        use_confidence: bool = False
    ) -> Generator[torch.Tensor, None, None]:
        """Run the diffusion denoising loop, yielding the token state after
        every step (the untouched input is yielded first as step 0).

        Args:
            input_ids: Token tensor; mask/pad positions are the ones denoised.
            num_steps: Number of denoising steps.
            allow_edits: Also allow re-sampling of already-visible generated
                tokens (Seed Diffusion-style editing).
            use_confidence: Unmask the top-k most confident samples instead of
                unmasking at random.

        Yields:
            torch.Tensor: A clone of the current state after each step.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer was not passed to the pipeline!")

        current_state: torch.Tensor = input_ids.clone()
        yield current_state.clone()  # Yield Step 0

        # Determine which tokens can be re-masked (i.e., mask and pad tokens);
        # positions that were visible in the original prompt are never touched.
        initial_mask = (current_state == self.tokenizer.mask_token_id) | \
            (current_state == self.tokenizer.pad_token_id)

        for step in range(num_steps):
            # Continuous time runs from 1 down toward 0 across the schedule.
            t_current = 1 - step / num_steps
            t_next = 1 - (step + 1) / num_steps

            # Predict full text with model
            output = self.model(input_ids=current_state)
            logits = output.logits

            # Set logit that corresponds to the mask token to -inf so the
            # model can never sample a mask token back in.
            logits[:, :, self.tokenizer.mask_token_id] = torch.finfo(logits.dtype).min

            # Ancestral sampling logic
            probs = torch.softmax(logits, dim=-1)
            dist = torch.distributions.Categorical(probs)
            sampled_ids = dist.sample()

            # Calculate Unmasking Probability (Equation 7 https://arxiv.org/pdf/2406.07524)
            # P(unmask | masked) = (alpha_s - alpha_t) / (1 - alpha_t)
            # mapping: alpha_t = (1 - t_current), alpha_s = (1 - t_next)
            # resulting simplified formula: (t_current - t_next) / t_current
            if step < num_steps - 1:
                unmasking_prob = (t_current - t_next) / t_current
            else:
                unmasking_prob = 1.0  # Force unmask at the end

            # Recomputed each step: positions still holding mask/pad tokens.
            remasking_mask: torch.Tensor = (current_state == self.tokenizer.mask_token_id) | \
                (current_state == self.tokenizer.pad_token_id)  # type: ignore

            if use_confidence:
                # Get the confidence (probability) of the tokens we just sampled
                sample_probs = probs.gather(-1, sampled_ids.unsqueeze(-1)).squeeze(-1)

                # Determine how many tokens to unmask this step
                if step < num_steps - 1:
                    num_masked = remasking_mask.sum(dim=1, keepdim=True)
                    num_to_unmask = (num_masked.float() * unmasking_prob).ceil().long()
                else:
                    num_to_unmask = remasking_mask.sum(dim=1, keepdim=True)

                # Select Top-K most confident tokens
                # Set confidence of already visible tokens to -inf so they aren't picked
                candidate_confidences = sample_probs.clone()
                candidate_confidences[~remasking_mask] = -float('inf')

                unmasking_mask = torch.zeros_like(remasking_mask, dtype=torch.bool)

                # k = largest per-row quota; per-row cutoff applied below so
                # each batch row only keeps its own num_to_unmask entries.
                max_k = num_to_unmask.max().item()
                if max_k > 0:
                    _, top_indices = candidate_confidences.topk(k=max_k, dim=1)
                    range_tensor = torch.arange(max_k, device=current_state.device).unsqueeze(0)
                    mask_k = range_tensor < num_to_unmask
                    unmasking_mask.scatter_(1, top_indices, mask_k)

            else:
                # Random Unmasking
                unmasking_mask = torch.rand_like(current_state, dtype=torch.float) < unmasking_prob

            # Only positions that are currently masked AND were masked in the
            # original input are eligible for unmasking.
            update_mask = unmasking_mask & remasking_mask & initial_mask

            if allow_edits:  # Apply Seed Diffusion Editing Logic (Section 3.1 in https://arxiv.org/pdf/2508.02193)
                alpha_t = 0.1 * (1 - step / num_steps)  # alpha_t decreases from 0.1 to 0 (Seed Diffusion)

                edit_mask = torch.rand_like(current_state, dtype=torch.float) < alpha_t

                is_visible = (current_state != self.tokenizer.mask_token_id) & \
                    (current_state != self.tokenizer.pad_token_id) & \
                    (current_state != self.tokenizer.eos_token_id)
                edit_mask = is_visible & edit_mask & initial_mask  # Use initial_mask to avoid editing original prompt

                # Combine both masks
                update_mask = update_mask | edit_mask

            # Update current state
            current_state[update_mask] = sampled_ids[update_mask]

            yield current_state.clone()  # Yield after each step
153
+
154
+ @torch.no_grad()
155
+ def _forward(
156
+ self,
157
+ model_inputs: torch.Tensor,
158
+ num_steps: int = 50,
159
+ allow_edits: bool = True,
160
+ use_confidence: bool = False,
161
+ stop_token: None = None
162
+ ) -> dict[str, Any]:
163
+ if self.tokenizer is None:
164
+ raise ValueError("Tokenizer was not passed to the pipeline!")
165
+
166
+ input_ids = model_inputs["input_ids"]
167
+ all_states = list(self.diffusion_generator(input_ids=input_ids, num_steps=num_steps, allow_edits=allow_edits, use_confidence=use_confidence))
168
+ final_state = all_states[-1]
169
+
170
+ return {"final_state": final_state, "history": all_states}
171
+
172
+ @torch.no_grad()
173
+ def stream_generation(
174
+ self,
175
+ input_text: str,
176
+ num_steps: int = 50,
177
+ allow_edits: bool = True,
178
+ use_confidence: bool = False,
179
+ max_length: int | None = None,
180
+ stop_token: str | None = None
181
+ ) -> Generator[str, None, None]:
182
+ """
183
+ Public method to stream text generation step-by-step.
184
+ """
185
+ # 1. Preprocess
186
+ inputs = self.preprocess(input_text, max_length)
187
+ input_ids = inputs["input_ids"].to(self.model.device) # type: ignore
188
+
189
+ # 2. Iterate over generator
190
+ for step_tensor in self.diffusion_generator(input_ids=input_ids, num_steps=num_steps, allow_edits=allow_edits, use_confidence=use_confidence):
191
+ # Decode current state
192
+ text = self.tokenizer.decode(step_tensor[0], skip_special_tokens=False) # type: ignore
193
+ yield text
194
+
195
+ if stop_token is not None and stop_token in text[len(input_text):]:
196
+ text = input_text + text[len(input_text):].split(stop_token)[0]
197
+ yield text
198
+
199
+ def postprocess(self, model_outputs) -> list[str] | Any:
200
+ if self.tokenizer is None:
201
+ raise ValueError("Tokenizer was not passed to the pipeline!")
202
+
203
+ # Convert final tensor to image/text
204
+ final_ids = model_outputs["final_state"]
205
+ return {
206
+ "decoded_texts": self.tokenizer.batch_decode(final_ids, skip_special_tokens=False),
207
+ "history": model_outputs["history"],
208
+ "final_ids": final_ids
209
+ }
210
+
211
    @torch.no_grad()
    def block_diffusion_generator(
        self, input_ids: torch.Tensor,
        block_size: int,
        max_length: int,
        num_steps: int,
        allow_edits: bool = True,
        use_confidence: bool = False,
        stop_token: str | None = None
    ) -> Generator[torch.Tensor, None, None]:
        """
        Generator that yields the diffusion states block-by-block.
        Args:
            input_ids (torch.Tensor): Initial input IDs with context.
            block_size (int): Number of tokens to generate in each block.
            max_length (int): Max length of the generated text.
            num_steps (int): Number of diffusion steps per block.
            allow_edits (bool): Whether to allow edits to existing tokens.
            use_confidence (bool): Whether to use confidence-based unmasking.
            stop_token (str | None): Token at which to stop generation early.
        Yields:
            torch.Tensor: The current state of the full sequence after each diffusion step.
        """
        assert num_steps > 0, "num_steps must be greater than 0"
        if self.tokenizer is None:
            raise ValueError("Tokenizer was not passed to the pipeline!")

        # Model context window; default 512 when the config has no seq_length.
        max_seq_length = self.model.config.seq_length if hasattr(self.model.config, "seq_length") else 512
        stop_token_id = self.tokenizer.convert_tokens_to_ids(stop_token) if stop_token is not None else None

        assert block_size > 0 and block_size <= max_seq_length, f"block_size must be in (0, {max_seq_length}]"

        full_sequence = input_ids.clone()
        current_length = input_ids.shape[1]
        while current_length < max_length:
            # Last block may be shorter than block_size.
            remaining = max_length - current_length
            this_block_len = min(block_size, remaining)
            if this_block_len <= 0: break

            # Append MASK tokens for the new block
            mask_block = torch.full(
                (1, this_block_len),
                self.tokenizer.mask_token_id,  # type: ignore
                dtype=torch.long,
                device=self.model.device
            )

            # Combine Context + New Masks: keep only the most recent context
            # so context + block fits in max_seq_length.
            # NOTE(review): when this_block_len == max_seq_length the slice is
            # [:, -0:], which keeps the WHOLE sequence and can exceed the
            # window — confirm block_size < max_seq_length in practice.
            input_ids = torch.cat([full_sequence[:, -(max_seq_length - this_block_len):], mask_block], dim=1)

            for step_tensor in self.diffusion_generator(
                input_ids,
                num_steps=num_steps,
                allow_edits=allow_edits,
                use_confidence=use_confidence
            ):
                # Only the trailing this_block_len tokens are newly generated.
                current_generated_tokens = step_tensor[:, -this_block_len:]
                yield torch.cat([full_sequence, current_generated_tokens], dim=1)


            if stop_token_id is not None and stop_token_id in current_generated_tokens:
                # Stop if EOS is generated: truncate the block at the first
                # occurrence of the stop token (batch size 1 assumed here).
                eos_index = (current_generated_tokens == stop_token_id).nonzero(as_tuple=True)[1]  # type: ignore
                current_generated_tokens = current_generated_tokens[:, :eos_index[0]]
                yield torch.cat([full_sequence, current_generated_tokens], dim=1)
                break

            # Update full sequence and current length
            full_sequence = torch.cat([full_sequence, current_generated_tokens], dim=1)
            current_length = full_sequence.shape[1]
281
+
282
+
283
+ @torch.no_grad()
284
+ def semi_autoregressive_generate(
285
+ self,
286
+ input_text: str,
287
+ block_size: int = 64,
288
+ max_length: int = 256,
289
+ num_steps: int = 50,
290
+ allow_edits: bool = True,
291
+ use_confidence: bool = False
292
+ ) -> dict[str, Any]:
293
+ """
294
+ Semi-Autoregressive Generation:
295
+ Generates text in blocks using the diffusion model.
296
+ Each block is generated by appending MASK tokens to the current context
297
+ and running the diffusion process on the combined sequence.
298
+ Args:
299
+ input_text (str): The initial prompt text.
300
+ block_size (int): Number of tokens to generate in each block.
301
+ max_length (int): Max length of the generated text.
302
+ num_steps (int): Number of diffusion steps per block.
303
+ allow_edits (bool): Whether to allow edits to existing tokens.
304
+ use_confidence (bool): Whether to use confidence-based unmasking.
305
+ Returns:
306
+ dict[str, Any]: A dictionary containing the decoded texts, generation history, and final token IDs.
307
+ """
308
+ if self.tokenizer is None: raise ValueError("No tokenizer")
309
+
310
+ input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device) # type: ignore
311
+ all_states = list(self.block_diffusion_generator(input_ids, block_size, max_length, num_steps, allow_edits, use_confidence=use_confidence))
312
+ final_state = all_states[-1]
313
+ return {
314
+ "decoded_texts": self.tokenizer.batch_decode(final_state, skip_special_tokens=False),
315
+ "history": all_states,
316
+ "final_ids": final_state
317
+ }
318
+
319
+ @torch.no_grad()
320
+ def stream_semi_autoregressive_generate(
321
+ self,
322
+ input_text: str,
323
+ block_size: int = 64,
324
+ max_length: int = 256,
325
+ num_steps: int = 50,
326
+ allow_edits: bool = True,
327
+ use_confidence: bool = False,
328
+ stop_token: str | None = None
329
+ ) -> Generator[str, None, None]:
330
+ """
331
+ Streams the generation process block-by-block.
332
+ Yields the full decoded text at every diffusion step of every block.
333
+ Args:
334
+ input_text (str): The initial prompt text.
335
+ block_size (int): Number of tokens to generate in each block.
336
+ max_length (int): Max length of the generated text.
337
+ num_steps (int): Number of diffusion steps per block.
338
+ allow_edits (bool): Whether to allow edits to existing tokens.
339
+ use_confidence (bool): Whether to use confidence-based unmasking.
340
+ stop_token (None): Token at which to stop generation early.
341
+ Yields:
342
+ str: The current generated text after each diffusion step.
343
+ """
344
+ if self.tokenizer is None: raise ValueError("No tokenizer")
345
+
346
+ input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device) # type: ignore
347
+
348
+ for step_tensor in self.block_diffusion_generator(input_ids, block_size, max_length, num_steps, allow_edits, use_confidence=use_confidence, stop_token=stop_token):
349
+ # Decode current state
350
+ yield self.tokenizer.decode(step_tensor[0], skip_special_tokens=False) # type: ignore
special_tokens_map.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "mask_token": {
21
+ "content": "<mask>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "pad_token": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "unk_token": {
35
+ "content": "<|endoftext|>",
36
+ "lstrip": false,
37
+ "normalized": true,
38
+ "rstrip": false,
39
+ "single_word": false
40
+ }
41
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "50257": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "50258": {
21
+ "content": "<mask>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "50259": {
29
+ "content": "<eos>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50260": {
37
+ "content": "<|delete|>",
38
+ "lstrip": false,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": false
43
+ },
44
+ "50261": {
45
+ "content": "<|im_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "50262": {
53
+ "content": "<|im_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ }
60
+ },
61
+ "additional_special_tokens": [
62
+ "<|im_start|>",
63
+ "<|im_end|>"
64
+ ],
65
+ "bos_token": "<|endoftext|>",
66
+ "clean_up_tokenization_spaces": false,
67
+ "eos_token": "<eos>",
68
+ "extra_special_tokens": {},
69
+ "mask_token": "<mask>",
70
+ "model_max_length": 1024,
71
+ "pad_token": "<pad>",
72
+ "tokenizer_class": "GPT2Tokenizer",
73
+ "unk_token": "<|endoftext|>"
74
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff