SultanR committed on
Commit
f62d881
·
verified ·
1 Parent(s): da71b02

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "QualityClassifierModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "merged_model.QualityClassifierConfig",
7
+ "AutoModel": "merged_model.QualityClassifierModel"
8
+ },
9
+ "base_model_name": "FacebookAI/xlm-roberta-base",
10
+ "dropout": 0.2,
11
+ "dtype": "float32",
12
+ "hidden_dim": 256,
13
+ "id2label": {
14
+ "0": "LABEL_0"
15
+ },
16
+ "label2id": {
17
+ "LABEL_0": 0
18
+ },
19
+ "model_type": "quality_classifier",
20
+ "transformers_version": "4.57.3"
21
+ }
config.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Configuration for HQ document quality classifiers.

Collects the encoder catalogue, the per-language dataset sources, and the
training/sampling hyperparameters used by the FineWeb2-HQ methodology.
"""
from pathlib import Path

# --- Paths --------------------------------------------------------------
# Resolved relative to this file so the package works from any checkout.
HQ_DIR = Path(__file__).parent
SRC_DIR = HQ_DIR.parent
BASE_DIR = SRC_DIR.parent

# --- Available encoder models -------------------------------------------
# Each entry: HF model ID, tokenizer max length, embedding width, blurb.
ENCODER_MODELS = {
    "mmbert-small": {
        "model_name": "jhu-clsp/mmBERT-small",
        "max_length": 512,
        "embedding_dim": 384,
        "description": "mmBERT-small: Modern multilingual encoder (1800+ languages)",
    },
    "mmbert-base": {
        "model_name": "jhu-clsp/mmBERT-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "mmBERT-base: Larger multilingual encoder (1800+ languages)",
    },
    "xlm-roberta-base": {
        "model_name": "FacebookAI/xlm-roberta-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
    },
    "xlm-roberta-large": {
        "model_name": "FacebookAI/xlm-roberta-large",
        "max_length": 512,
        "embedding_dim": 1024,
        "description": "XLM-RoBERTa-large: Larger classic multilingual encoder",
    },
}

# Encoder used when no explicit choice is made.
DEFAULT_ENCODER = "mmbert-small"

# --- Embedding model configuration (default) ----------------------------
# Independent copy so callers may mutate it without touching the catalogue.
EMBEDDING_CONFIG = dict(ENCODER_MODELS[DEFAULT_ENCODER])

# --- Classifier training configuration ----------------------------------
TRAINING_CONFIG = {
    "epochs": 6,
    "learning_rate": 0.0003,
    "batch_size": 256,
    "hidden_dim": 256,
    "dropout": 0.2,
    "embedding_batch_size": 32,
}

# --- Language-specific configuration ------------------------------------
# For each language: display name, localized answer label, the positive
# (high-quality) dataset sources, and the negative (web-crawl) source.
LANGUAGE_CONFIG = {
    "ara_Arab": {
        "name": "Arabic",
        "answer_label": "الإجابة:",
        "positive_datasets": [
            {
                "dataset_id": "MBZUAI/ArabicMMLU",
                "subset": "All",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,  # Use formatter
            },
            {
                "dataset_id": "openai/MMMLU",
                "subset": "AR_XY",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Arabic",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "ar",
            "split": "train",
            "text_field": "text",
        },
    },
    "hin_Deva": {
        "name": "Hindi",
        "answer_label": "उत्तर:",
        "positive_datasets": [
            {
                "dataset_id": "openai/MMMLU",
                "subset": "HI_IN",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Hindi",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "hi",
            "split": "train",
            "text_field": "text",
        },
    },
    "tur_Latn": {
        "name": "Turkish",
        "answer_label": "Cevap:",
        "positive_datasets": [
            {
                "dataset_id": "AYueksel/TurkishMMLU",
                "subset": "All",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            # Note: openai/MMMLU does not have Turkish
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Turkish",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "tr",
            "split": "train",
            "text_field": "text",
        },
    },
}

# --- Supported languages -------------------------------------------------
# Derived from the config keys so the two can never drift apart.
SUPPORTED_LANGUAGES = [*LANGUAGE_CONFIG]

# --- Default sampling configuration --------------------------------------
SAMPLING_CONFIG = {
    "max_positive_samples": 80000,
    "max_negative_samples": 80000,
    "min_text_length": 50,
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "test_ratio": 0.1,
    "random_seed": 42,
}
merged_model.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified HuggingFace-compatible quality classifier model.
3
+
4
+ Merges mmBERT encoder with trained MLP classifier head into a single
5
+ PreTrainedModel that can be saved/loaded using standard HuggingFace methods
6
+ and used with vLLM for efficient inference.
7
+
8
+ Example:
9
+ # Merge trained classifier into unified model
10
+ from src.hq.merged_model import merge_and_save
11
+ merge_and_save(
12
+ base_model_name="jhu-clsp/mmBERT-small",
13
+ classifier_weights_path="./output/models/ara_Arab.pt",
14
+ output_dir="./release/arabic-quality-classifier"
15
+ )
16
+
17
+ # Load and use
18
+ model = QualityClassifierModel.from_pretrained("./release/arabic-quality-classifier")
19
+ tokenizer = AutoTokenizer.from_pretrained("./release/arabic-quality-classifier")
20
+ """
21
+ import os
22
+ from pathlib import Path
23
+ from typing import Optional, Union
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PretrainedConfig
28
+ from transformers.modeling_outputs import SequenceClassifierOutput
29
+
30
+ from .config import EMBEDDING_CONFIG, TRAINING_CONFIG
31
+
32
+
33
class QualityClassifierConfig(PretrainedConfig):
    """Configuration for the unified quality classifier model.

    Stores the encoder identity and the MLP-head hyperparameters so a merged
    checkpoint can be reloaded without the original training code.
    """

    model_type = "quality_classifier"

    def __init__(
        self,
        base_model_name: Optional[str] = None,
        hidden_dim: Optional[int] = None,
        dropout: Optional[float] = None,
        num_labels: int = 1,
        **kwargs
    ):
        """
        Initialize configuration.

        Args:
            base_model_name: HuggingFace model ID for the encoder
                (defaults to EMBEDDING_CONFIG["model_name"])
            hidden_dim: Hidden dimension of the MLP classifier
                (defaults to TRAINING_CONFIG["hidden_dim"])
            dropout: Dropout probability
                (defaults to TRAINING_CONFIG["dropout"])
            num_labels: Number of output labels (1 for binary)
        """
        super().__init__(**kwargs)
        # Use explicit `is None` checks rather than `or`: with `or`, a
        # legitimate falsy value such as dropout=0.0 would be silently
        # replaced by the project default.
        self.base_model_name = (
            base_model_name if base_model_name is not None
            else EMBEDDING_CONFIG["model_name"]
        )
        self.hidden_dim = (
            hidden_dim if hidden_dim is not None else TRAINING_CONFIG["hidden_dim"]
        )
        self.dropout = dropout if dropout is not None else TRAINING_CONFIG["dropout"]
        self.num_labels = num_labels
60
+
61
+
62
class QualityClassifierModel(PreTrainedModel):
    """
    Unified quality classifier combining an encoder with an MLP head.

    This model can be saved and loaded using standard HuggingFace methods:
        model.save_pretrained("path/to/model")
        model = QualityClassifierModel.from_pretrained("path/to/model")

    It can also be used with vLLM for efficient inference since mmBERT
    is supported.

    Architecture:
        - Encoder: the transformer named by ``config.base_model_name``
        - Pooling: attention-mask-aware mean pooling over the sequence
        - Classifier: Linear(hidden->hidden_dim) -> ReLU -> Dropout ->
          Linear(hidden_dim->num_labels) -> Sigmoid
    """

    config_class = QualityClassifierConfig

    def __init__(self, config: QualityClassifierConfig):
        """
        Initialize the unified model.

        Args:
            config: QualityClassifierConfig instance
        """
        super().__init__(config)

        # Load base encoder with eager attention to avoid flash_attn issues
        self.encoder = AutoModel.from_pretrained(
            config.base_model_name,
            attn_implementation="eager",
        )
        hidden_size = self.encoder.config.hidden_size

        # Classification head (matches standalone training architecture)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, config.hidden_dim),
            nn.ReLU(),
            nn.Dropout(config.dropout),
            nn.Linear(config.hidden_dim, config.num_labels),
            nn.Sigmoid()
        )

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> SequenceClassifierOutput:
        """
        Forward pass with optional loss computation.

        Args:
            input_ids: Token IDs of shape (batch_size, seq_length)
            attention_mask: Attention mask of shape (batch_size, seq_length)
            token_type_ids: Token type IDs (accepted for API compatibility; unused)
            labels: Ground truth labels for loss computation
            return_dict: Whether to return a SequenceClassifierOutput

        Returns:
            SequenceClassifierOutput with loss, logits, and hidden states.
            Note: the head ends in Sigmoid, so "logits" here are already
            probabilities in [0, 1].
        """
        # Encode
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Mask-aware mean pooling: padding positions contribute nothing.
        token_embeddings = outputs.last_hidden_state
        if attention_mask is not None:
            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
            # Clamp guards against division by zero for an all-padding row.
            sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
            pooled = sum_embeddings / sum_mask
        else:
            pooled = token_embeddings.mean(dim=1)

        # Classify
        logits = self.classifier(pooled)

        # Compute loss if labels provided
        loss = None
        if labels is not None:
            loss_fn = nn.BCELoss()
            # squeeze(-1) rather than squeeze(): with a batch of size 1,
            # squeeze() collapses (1, 1) to a 0-d tensor, and BCELoss
            # rejects a 0-d input against a (1,)-shaped target.
            loss = loss_fn(logits.squeeze(-1), labels.float())

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Convenience method for inference.

        Args:
            input_ids: Token IDs
            attention_mask: Attention mask

        Returns:
            Quality scores in range [0, 1]. NOTE(review): for a batch of
            size 1 the full squeeze() yields a 0-d tensor; kept as-is for
            backward compatibility with existing callers.
        """
        self.eval()
        with torch.no_grad():
            outputs = self.forward(input_ids=input_ids, attention_mask=attention_mask)
            return outputs.logits.squeeze()

    def score_texts(
        self,
        texts: list,
        tokenizer: AutoTokenizer,
        batch_size: int = 32,
        max_length: int = 512,
        device: str = None,
    ) -> list:
        """
        Score a list of texts.

        Args:
            texts: List of text strings to score
            tokenizer: Tokenizer for the model
            batch_size: Batch size for processing
            max_length: Maximum sequence length
            device: Device to use for inference (auto-detects CUDA if None)

        Returns:
            List of quality scores in range [0, 1]
        """
        device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)
        self.eval()

        scores = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                max_length=max_length,
                truncation=True,
                padding=True,
            ).to(device)

            with torch.no_grad():
                outputs = self.forward(**inputs)
                batch_scores = outputs.logits.squeeze().cpu().tolist()

            # A single-item batch squeezes to a 0-d tensor, whose tolist()
            # is a bare float — normalize back to a list.
            if isinstance(batch_scores, float):
                batch_scores = [batch_scores]

            scores.extend(batch_scores)

        return scores
232
+
233
+
234
def merge_and_save(
    base_model_name: str,
    classifier_weights_path: Union[str, Path],
    output_dir: Union[str, Path],
    hidden_dim: Optional[int] = None,
    dropout: Optional[float] = None,
) -> QualityClassifierModel:
    """
    Merge encoder and trained classifier head, then save as unified model.

    The resulting model can be loaded with:
        model = QualityClassifierModel.from_pretrained(output_dir)

    Args:
        base_model_name: HuggingFace model ID for the encoder
        classifier_weights_path: Path to trained MLP weights (.pt file)
        output_dir: Directory to save the merged model
        hidden_dim: Hidden dimension of the MLP (must match training;
            defaults to TRAINING_CONFIG["hidden_dim"])
        dropout: Dropout rate (must match training;
            defaults to TRAINING_CONFIG["dropout"])

    Returns:
        The merged QualityClassifierModel
    """
    # `is None` checks (not `or`) so an explicit dropout=0.0 is honored
    # instead of being silently replaced by the project default.
    hidden_dim = hidden_dim if hidden_dim is not None else TRAINING_CONFIG["hidden_dim"]
    dropout = dropout if dropout is not None else TRAINING_CONFIG["dropout"]
    output_dir = Path(output_dir)

    print("Merging model...")
    print(f"  Encoder: {base_model_name}")
    print(f"  Classifier: {classifier_weights_path}")

    # Create config
    config = QualityClassifierConfig(
        base_model_name=base_model_name,
        hidden_dim=hidden_dim,
        dropout=dropout,
        num_labels=1
    )

    # Initialize model (loads encoder from HuggingFace)
    model = QualityClassifierModel(config)

    # Load trained classifier weights.
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from trusted sources (or pass weights_only=True if the
    # checkpoint format allows it).
    checkpoint = torch.load(classifier_weights_path, map_location="cpu")

    # Handle both new format (dict with state_dict) and old format (just state_dict)
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        trained_weights = checkpoint["state_dict"]
    else:
        trained_weights = checkpoint

    # Map weights from standalone MLP to integrated classifier.
    # The standalone model saves with "classifier." prefix, strip it.
    stripped_weights = {}
    for key, value in trained_weights.items():
        new_key = key.replace("classifier.", "") if key.startswith("classifier.") else key
        stripped_weights[new_key] = value

    model.classifier.load_state_dict(stripped_weights)

    # Save everything
    output_dir.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(output_dir)

    # Also save tokenizer for convenience
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.save_pretrained(output_dir)

    print(f"Model saved to {output_dir}")
    print(f"Contents: {list(output_dir.iterdir())}")

    return model
306
+
307
+
308
def merge_all_classifiers(
    models_dir: Union[str, Path],
    output_base_dir: Union[str, Path],
    base_model_name: str = None,
) -> dict:
    """
    Merge every trained classifier found in *models_dir* into unified models.

    Each ``<lang>.pt`` weights file produces a
    ``<lang>-quality-classifier`` directory under *output_base_dir*.

    Args:
        models_dir: Directory containing trained .pt files
        output_base_dir: Base directory for output models
        base_model_name: HuggingFace model ID for the encoder

    Returns:
        Dictionary mapping language codes to output directories
    """
    encoder_id = base_model_name or EMBEDDING_CONFIG["model_name"]
    src_root = Path(models_dir)
    dst_root = Path(output_base_dir)

    banner = "=" * 50
    merged = {}

    for weights_file in src_root.glob("*.pt"):
        # File stem is the language code, e.g. "ara_Arab".
        lang = weights_file.stem
        target_dir = dst_root / f"{lang}-quality-classifier"

        print(f"\n{banner}")
        print(f"Processing: {lang}")
        print(banner)

        merge_and_save(
            base_model_name=encoder_id,
            classifier_weights_path=weights_file,
            output_dir=target_dir,
        )

        merged[lang] = str(target_dir)

    return merged
347
+
348
+
349
# Register config and model for HuggingFace auto-class loading. This is what
# produces the "auto_map" entries in the saved config.json, so downstream
# users can do AutoModel.from_pretrained(path) without importing this module
# first (with trust_remote_code enabled on their side).
QualityClassifierConfig.register_for_auto_class()
QualityClassifierModel.register_for_auto_class("AutoModel")
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e27f1fb9c9a19e2fdd4eb046a3f99e349bfa0ac7f43f22c1e6968be2898cd96c
3
+ size 1112987508
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
3
+ size 17082734
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }