tokenizer-fineweb_de / tokenizer.json
suchirsalhan's picture
Register tokenizer as HF SentencePiece tokenizer
2d9aed2 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "<extra_id_99>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "<extra_id_98>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "<extra_id_97>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "<extra_id_96>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "<extra_id_95>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "<extra_id_94>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 10,
"content": "<extra_id_93>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 11,
"content": "<extra_id_92>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 12,
"content": "<extra_id_91>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 13,
"content": "<extra_id_90>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 14,
"content": "<extra_id_89>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 15,
"content": "<extra_id_88>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 16,
"content": "<extra_id_87>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 17,
"content": "<extra_id_86>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 18,
"content": "<extra_id_85>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 19,
"content": "<extra_id_84>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 20,
"content": "<extra_id_83>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 21,
"content": "<extra_id_82>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 22,
"content": "<extra_id_81>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 23,
"content": "<extra_id_80>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 24,
"content": "<extra_id_79>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 25,
"content": "<extra_id_78>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 26,
"content": "<extra_id_77>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 27,
"content": "<extra_id_76>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 28,
"content": "<extra_id_75>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 29,
"content": "<extra_id_74>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 30,
"content": "<extra_id_73>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 31,
"content": "<extra_id_72>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 32,
"content": "<extra_id_71>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 33,
"content": "<extra_id_70>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 34,
"content": "<extra_id_69>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 35,
"content": "<extra_id_68>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 36,
"content": "<extra_id_67>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 37,
"content": "<extra_id_66>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 38,
"content": "<extra_id_65>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 39,
"content": "<extra_id_64>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 40,
"content": "<extra_id_63>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 41,
"content": "<extra_id_62>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 42,
"content": "<extra_id_61>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 43,
"content": "<extra_id_60>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 44,
"content": "<extra_id_59>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 45,
"content": "<extra_id_58>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 46,
"content": "<extra_id_57>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 47,
"content": "<extra_id_56>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 48,
"content": "<extra_id_55>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 49,
"content": "<extra_id_54>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 50,
"content": "<extra_id_53>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 51,
"content": "<extra_id_52>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 52,
"content": "<extra_id_51>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 53,
"content": "<extra_id_50>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 54,
"content": "<extra_id_49>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 55,
"content": "<extra_id_48>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 56,
"content": "<extra_id_47>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 57,
"content": "<extra_id_46>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 58,
"content": "<extra_id_45>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 59,
"content": "<extra_id_44>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 60,
"content": "<extra_id_43>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 61,
"content": "<extra_id_42>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 62,
"content": "<extra_id_41>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 63,
"content": "<extra_id_40>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 64,
"content": "<extra_id_39>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 65,
"content": "<extra_id_38>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 66,
"content": "<extra_id_37>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 67,
"content": "<extra_id_36>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 68,
"content": "<extra_id_35>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 69,
"content": "<extra_id_34>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 70,
"content": "<extra_id_33>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 71,
"content": "<extra_id_32>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 72,
"content": "<extra_id_31>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 73,
"content": "<extra_id_30>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 74,
"content": "<extra_id_29>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 75,
"content": "<extra_id_28>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 76,
"content": "<extra_id_27>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 77,
"content": "<extra_id_26>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 78,
"content": "<extra_id_25>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 79,
"content": "<extra_id_24>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 80,
"content": "<extra_id_23>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 81,
"content": "<extra_id_22>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 82,
"content": "<extra_id_21>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 83,
"content": "<extra_id_20>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 84,
"content": "<extra_id_19>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 85,
"content": "<extra_id_18>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 86,
"content": "<extra_id_17>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 87,
"content": "<extra_id_16>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 88,
"content": "<extra_id_15>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 89,
"content": "<extra_id_14>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 90,
"content": "<extra_id_13>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 91,
"content": "<extra_id_12>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 92,
"content": "<extra_id_11>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 93,
"content": "<extra_id_10>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 94,
"content": "<extra_id_9>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 95,
"content": "<extra_id_8>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 96,
"content": "<extra_id_7>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 97,
"content": "<extra_id_6>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 98,
"content": "<extra_id_5>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 99,
"content": "<extra_id_4>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 100,
"content": "<extra_id_3>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 101,
"content": "<extra_id_2>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 102,
"content": "<extra_id_1>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 103,
"content": "<extra_id_0>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 104,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "WhitespaceSplit"
},
{
"type": "Metaspace",
"replacement": "▁",
"prepend_scheme": "always",
"split": true
}
]
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "</s>",
"type_id": 0
}
}
],
"special_tokens": {
"</s>": {
"id": "</s>",
"ids": [
1
],
"tokens": [
"</s>"
]
}
}
},
"decoder": {
"type": "Metaspace",
"replacement": "▁",
"prepend_scheme": "always",
"split": true
},
"model": {
"type": "Unigram",
"unk_id": 2,
"vocab": [
[
"<pad>",
0.0
],
[
"</s>",
0.0
],
[
"<unk>",
0.0
],
[
"▁",
-2.0
],
[
"<extra_id_99>",
0.0
],
[
"<extra_id_98>",
0.0
],
[
"<extra_id_97>",
0.0
],
[
"<extra_id_96>",
0.0
],
[
"<extra_id_95>",
0.0
],
[
"<extra_id_94>",
0.0
],
[
"<extra_id_93>",
0.0
],
[
"<extra_id_92>",
0.0
],
[
"<extra_id_91>",
0.0
],
[
"<extra_id_90>",
0.0
],
[
"<extra_id_89>",
0.0
],
[
"<extra_id_88>",
0.0
],
[
"<extra_id_87>",
0.0
],
[
"<extra_id_86>",
0.0
],
[
"<extra_id_85>",
0.0
],
[
"<extra_id_84>",
0.0
],
[
"<extra_id_83>",
0.0
],
[
"<extra_id_82>",
0.0
],
[
"<extra_id_81>",
0.0
],
[
"<extra_id_80>",
0.0
],
[
"<extra_id_79>",
0.0
],
[
"<extra_id_78>",
0.0
],
[
"<extra_id_77>",
0.0
],
[
"<extra_id_76>",
0.0
],
[
"<extra_id_75>",
0.0
],
[
"<extra_id_74>",
0.0
],
[
"<extra_id_73>",
0.0
],
[
"<extra_id_72>",
0.0
],
[
"<extra_id_71>",
0.0
],
[
"<extra_id_70>",
0.0
],
[
"<extra_id_69>",
0.0
],
[
"<extra_id_68>",
0.0
],
[
"<extra_id_67>",
0.0
],
[
"<extra_id_66>",
0.0
],
[
"<extra_id_65>",
0.0
],
[
"<extra_id_64>",
0.0
],
[
"<extra_id_63>",
0.0
],
[
"<extra_id_62>",
0.0
],
[
"<extra_id_61>",
0.0
],
[
"<extra_id_60>",
0.0
],
[
"<extra_id_59>",
0.0
],
[
"<extra_id_58>",
0.0
],
[
"<extra_id_57>",
0.0
],
[
"<extra_id_56>",
0.0
],
[
"<extra_id_55>",
0.0
],
[
"<extra_id_54>",
0.0
],
[
"<extra_id_53>",
0.0
],
[
"<extra_id_52>",
0.0
],
[
"<extra_id_51>",
0.0
],
[
"<extra_id_50>",
0.0
],
[
"<extra_id_49>",
0.0
],
[
"<extra_id_48>",
0.0
],
[
"<extra_id_47>",
0.0
],
[
"<extra_id_46>",
0.0
],
[
"<extra_id_45>",
0.0
],
[
"<extra_id_44>",
0.0
],
[
"<extra_id_43>",
0.0
],
[
"<extra_id_42>",
0.0
],
[
"<extra_id_41>",
0.0
],
[
"<extra_id_40>",
0.0
],
[
"<extra_id_39>",
0.0
],
[
"<extra_id_38>",
0.0
],
[
"<extra_id_37>",
0.0
],
[
"<extra_id_36>",
0.0
],
[
"<extra_id_35>",
0.0
],
[
"<extra_id_34>",
0.0
],
[
"<extra_id_33>",
0.0
],
[
"<extra_id_32>",
0.0
],
[
"<extra_id_31>",
0.0
],
[
"<extra_id_30>",
0.0
],
[
"<extra_id_29>",
0.0
],
[
"<extra_id_28>",
0.0
],
[
"<extra_id_27>",
0.0
],
[
"<extra_id_26>",
0.0
],
[
"<extra_id_25>",
0.0
],
[
"<extra_id_24>",
0.0
],
[
"<extra_id_23>",
0.0
],
[
"<extra_id_22>",
0.0
],
[
"<extra_id_21>",
0.0
],
[
"<extra_id_20>",
0.0
],
[
"<extra_id_19>",
0.0
],
[
"<extra_id_18>",
0.0
],
[
"<extra_id_17>",
0.0
],
[
"<extra_id_16>",
0.0
],
[
"<extra_id_15>",
0.0
],
[
"<extra_id_14>",
0.0
],
[
"<extra_id_13>",
0.0
],
[
"<extra_id_12>",
0.0
],
[
"<extra_id_11>",
0.0
],
[
"<extra_id_10>",
0.0
],
[
"<extra_id_9>",
0.0
],
[
"<extra_id_8>",
0.0
],
[
"<extra_id_7>",
0.0
],
[
"<extra_id_6>",
0.0
],
[
"<extra_id_5>",
0.0
],
[
"<extra_id_4>",
0.0
],
[
"<extra_id_3>",
0.0
],
[
"<extra_id_2>",
0.0
],
[
"<extra_id_1>",
0.0
],
[
"<extra_id_0>",
0.0
]
],
"byte_fallback": false
}
}