Omarrran
/

Kashmiri_Char_Tokenizer

Model card Files Files and versions

Kashmiri_Char_Tokenizer / tokenizer.json

Omarrran's picture

Initial commit: KashTok tokenizer

d0a9fe5 verified about 1 month ago

history blame contribute delete

3.84 kB

	{
	"version": "1.0",
	"truncation": null,
	"padding": null,
	"added_tokens": [
	{
	"id": 0,
	"content": "[PAD]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "[UNK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "[CLS]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "[SEP]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 4,
	"content": "[MASK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": {
	"type": "NFC"
	},
	"pre_tokenizer": {
	"type": "Split",
	"pattern": {
	"String": ""
	},
	"behavior": "Isolated",
	"invert": false
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 1
	}
	}
	],
	"special_tokens": {}
	},
	"decoder": null,
	"model": {
	"type": "WordLevel",
	"vocab": {
	"[PAD]": 0,
	"[UNK]": 1,
	"[CLS]": 2,
	"[SEP]": 3,
	"[MASK]": 4,
	"[BOS]": 5,
	"[EOS]": 6,
	"\n": 7,
	" ": 8,
	"!": 9,
	"\"": 10,
	"'": 11,
	"-": 12,
	".": 13,
	"«": 14,
	"»": 15,
	"؁": 16,
	"،": 17,
	"؎": 18,
	"ؐ": 19,
	"ؑ": 20,
	"ؒ": 21,
	"ؓ": 22,
	"ؔ": 23,
	"؛": 24,
	"؟": 25,
	"ؠ": 26,
	"ء": 27,
	"آ": 28,
	"أ": 29,
	"ؤ": 30,
	"إ": 31,
	"ئ": 32,
	"ا": 33,
	"ب": 34,
	"ت": 35,
	"ث": 36,
	"ج": 37,
	"ح": 38,
	"خ": 39,
	"د": 40,
	"ذ": 41,
	"ر": 42,
	"ز": 43,
	"س": 44,
	"ش": 45,
	"ص": 46,
	"ض": 47,
	"ط": 48,
	"ظ": 49,
	"ع": 50,
	"غ": 51,
	"ف": 52,
	"ق": 53,
	"ك": 54,
	"ل": 55,
	"م": 56,
	"ن": 57,
	"ه": 58,
	"و": 59,
	"ً": 60,
	"ٍ": 61,
	"َ": 62,
	"ُ": 63,
	"ِ": 64,
	"ّ": 65,
	"ْ": 66,
	"ٓ": 67,
	"ٔ": 68,
	"ٕ": 69,
	"ٖ": 70,
	"ٗ": 71,
	"٘": 72,
	"ٚ": 73,
	"ٛ": 74,
	"ٟ": 75,
	"٠": 76,
	"١": 77,
	"٢": 78,
	"٣": 79,
	"٤": 80,
	"٥": 81,
	"٦": 82,
	"٧": 83,
	"٨": 84,
	"٩": 85,
	"٪": 86,
	"٭": 87,
	"ٮ": 88,
	"ٮ۪": 89,
	"ٰ": 90,
	"ٲ": 91,
	"ٳ": 92,
	"ٹ": 93,
	"پ": 94,
	"ٿ": 95,
	"ڀ": 96,
	"چ": 97,
	"ڈ": 98,
	"ڑ": 99,
	"ژ": 100,
	"ڙ": 101,
	"ک": 102,
	"ڪ": 103,
	"ڮ": 104,
	"گ": 105,
	"ں": 106,
	"ھ": 107,
	"ہ": 108,
	"ۂ": 109,
	"ۃ": 110,
	"ۄ": 111,
	"ۅ": 112,
	"ۆ": 113,
	"ی": 114,
	"ۍ": 115,
	"ے": 116,
	"ۓ": 117,
	"۔": 118,
	"۪": 119,
	"ۭ": 120,
	"۰": 121,
	"۱": 122,
	"۲": 123,
	"۳": 124,
	"۴": 125,
	"۵": 126,
	"۶": 127,
	"۷": 128,
	"۸": 129,
	"۹": 130,
	"﴾": 131,
	"﴿": 132
	},
	"unk_token": "[UNK]"
	}
	}