Kashmiri_Char_Tokenizer / tokenizer.json
Omarrran's picture
Initial commit: KashTok tokenizer
d0a9fe5 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "NFC"
},
"pre_tokenizer": {
"type": "Split",
"pattern": {
"String": ""
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"[BOS]": 5,
"[EOS]": 6,
"\n": 7,
" ": 8,
"!": 9,
"\"": 10,
"'": 11,
"-": 12,
".": 13,
"«": 14,
"»": 15,
"؁": 16,
"،": 17,
"؎": 18,
"ؐ": 19,
"ؑ": 20,
"ؒ": 21,
"ؓ": 22,
"ؔ": 23,
"؛": 24,
"؟": 25,
"ؠ": 26,
"ء": 27,
"آ": 28,
"أ": 29,
"ؤ": 30,
"إ": 31,
"ئ": 32,
"ا": 33,
"ب": 34,
"ت": 35,
"ث": 36,
"ج": 37,
"ح": 38,
"خ": 39,
"د": 40,
"ذ": 41,
"ر": 42,
"ز": 43,
"س": 44,
"ش": 45,
"ص": 46,
"ض": 47,
"ط": 48,
"ظ": 49,
"ع": 50,
"غ": 51,
"ف": 52,
"ق": 53,
"ك": 54,
"ل": 55,
"م": 56,
"ن": 57,
"ه": 58,
"و": 59,
"ً": 60,
"ٍ": 61,
"َ": 62,
"ُ": 63,
"ِ": 64,
"ّ": 65,
"ْ": 66,
"ٓ": 67,
"ٔ": 68,
"ٕ": 69,
"ٖ": 70,
"ٗ": 71,
"٘": 72,
"ٚ": 73,
"ٛ": 74,
"ٟ": 75,
"٠": 76,
"١": 77,
"٢": 78,
"٣": 79,
"٤": 80,
"٥": 81,
"٦": 82,
"٧": 83,
"٨": 84,
"٩": 85,
"٪": 86,
"٭": 87,
"ٮ": 88,
"ٮ۪": 89,
"ٰ": 90,
"ٲ": 91,
"ٳ": 92,
"ٹ": 93,
"پ": 94,
"ٿ": 95,
"ڀ": 96,
"چ": 97,
"ڈ": 98,
"ڑ": 99,
"ژ": 100,
"ڙ": 101,
"ک": 102,
"ڪ": 103,
"ڮ": 104,
"گ": 105,
"ں": 106,
"ھ": 107,
"ہ": 108,
"ۂ": 109,
"ۃ": 110,
"ۄ": 111,
"ۅ": 112,
"ۆ": 113,
"ی": 114,
"ۍ": 115,
"ے": 116,
"ۓ": 117,
"۔": 118,
"۪": 119,
"ۭ": 120,
"۰": 121,
"۱": 122,
"۲": 123,
"۳": 124,
"۴": 125,
"۵": 126,
"۶": 127,
"۷": 128,
"۸": 129,
"۹": 130,
"﴾": 131,
"﴿": 132
},
"unk_token": "[UNK]"
}
}