Commit ·
347dec7
1
Parent(s): 13bb0e3
Upload tokenizer
Browse files
- tokenizer.json +8 -2
- tokenizer_config.json +1 -1
tokenizer.json
CHANGED
|
@@ -265,7 +265,9 @@
|
|
| 265 |
"special": true
|
| 266 |
}
|
| 267 |
],
|
| 268 |
-
"normalizer": null,
|
|
|
|
|
|
|
| 269 |
"pre_tokenizer": {
|
| 270 |
"type": "Whitespace"
|
| 271 |
},
|
|
@@ -344,7 +346,11 @@
|
|
| 344 |
}
|
| 345 |
}
|
| 346 |
},
|
| 347 |
-
"decoder": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
"model": {
|
| 349 |
"type": "WordPiece",
|
| 350 |
"unk_token": "[UNK]",
|
|
|
|
| 265 |
"special": true
|
| 266 |
}
|
| 267 |
],
|
| 268 |
+
"normalizer": {
|
| 269 |
+
"type": "NFKD"
|
| 270 |
+
},
|
| 271 |
"pre_tokenizer": {
|
| 272 |
"type": "Whitespace"
|
| 273 |
},
|
|
|
|
| 346 |
}
|
| 347 |
}
|
| 348 |
},
|
| 349 |
+
"decoder": {
|
| 350 |
+
"type": "WordPiece",
|
| 351 |
+
"prefix": "##",
|
| 352 |
+
"cleanup": true
|
| 353 |
+
},
|
| 354 |
"model": {
|
| 355 |
"type": "WordPiece",
|
| 356 |
"unk_token": "[UNK]",
|
tokenizer_config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
"clean_up_tokenization_spaces": true,
|
| 3 |
-
"model_max_length":
|
| 4 |
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"clean_up_tokenization_spaces": true,
|
| 3 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 4 |
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 5 |
}
|