mishkala / tokenizer.json
flokymind's picture
Upload tokenizer.json
8a9a2f7
{
"char_to_id": {
"[PAD]": 0,
"[UNK]": 1,
"[BOS]": 2,
"[EOS]": 3,
"[MASK]": 4,
" ": 5,
"ا": 6,
"ل": 7,
"ي": 8,
"م": 9,
"و": 10,
"ن": 11,
"ه": 12,
"ب": 13,
"ر": 14,
"ع": 15,
"ف": 16,
"أ": 17,
"ق": 18,
"ت": 19,
"د": 20,
"ك": 21,
"ح": 22,
"ة": 23,
"س": 24,
"ج": 25,
"إ": 26,
"ص": 27,
"ذ": 28,
"ى": 29,
"خ": 30,
"ش": 31,
"ث": 32,
"ض": 33,
"ط": 34,
"ز": 35,
":": 36,
"غ": 37,
"ء": 38,
"ئ": 39,
"ظ": 40,
"؛": 41,
"آ": 42,
"-": 43,
"ؤ": 44,
",": 45,
"ٰ": 46,
"ۚ": 47,
"ۖ": 48,
"ۗ": 49
},
"vocab_size": 50,
"special_tokens": {
"PAD": 0,
"UNK": 1,
"BOS": 2,
"EOS": 3,
"MASK": 4,
"SPACE": 5
},
"diacritic_classes": [
"NO_DIACRITIC",
"FATHA",
"FATHATAN",
"DAMMA",
"DAMMATAN",
"KASRA",
"KASRATAN",
"SUKUN",
"SHADDA",
"SHADDA_FATHA",
"SHADDA_FATHATAN",
"SHADDA_DAMMA",
"SHADDA_DAMMATAN",
"SHADDA_KASRA",
"SHADDA_KASRATAN"
],
"num_labels": 15
}