chatterbox / grapheme_mtl_with_tamil_v2.json
madhavantechie's picture
Add Tamil character support to tokenizer - 62 Tamil characters including vowels, consonants, Grantha characters, vowel markers, and digits
19d5c9a verified
raw
history blame
2.56 kB
{
"version": "1.0",
"model": {
"type": "WordLevel",
"vocab": {
"[UNK]": 0,
"[START]": 1,
"[STOP]": 2,
"[SPACE]": 3,
"[PAD]": 4,
"[CLS]": 5,
"[SEP]": 6,
"a": 7,
"b": 8,
"c": 9,
"d": 10,
"e": 11,
"f": 12,
"g": 13,
"h": 14,
"i": 15,
"j": 16,
"k": 17,
"l": 18,
"m": 19,
"n": 20,
"o": 21,
"p": 22,
"q": 23,
"r": 24,
"s": 25,
"t": 26,
"u": 27,
"v": 28,
"w": 29,
"x": 30,
"y": 31,
"z": 32,
"A": 33,
"B": 34,
"C": 35,
"D": 36,
"E": 37,
"F": 38,
"G": 39,
"H": 40,
"I": 41,
"J": 42,
"K": 43,
"L": 44,
"M": 45,
"N": 46,
"O": 47,
"P": 48,
"Q": 49,
"R": 50,
"S": 51,
"T": 52,
"U": 53,
"V": 54,
"W": 55,
"X": 56,
"Y": 57,
"Z": 58,
"0": 59,
"1": 60,
"2": 61,
"3": 62,
"4": 63,
"5": 64,
"6": 65,
"7": 66,
"8": 67,
"9": 68,
".": 69,
",": 70,
"!": 71,
"?": 72,
";": 73,
":": 74,
"'": 75,
"\"": 76,
"(": 77,
")": 78,
"-": 79,
" ": 80,
"அ": 81,
"ஆ": 82,
"இ": 83,
"ஈ": 84,
"உ": 85,
"ஊ": 86,
"எ": 87,
"ஏ": 88,
"ஐ": 89,
"ஒ": 90,
"ஓ": 91,
"ஔ": 92,
"க": 93,
"ங": 94,
"ச": 95,
"ஞ": 96,
"ட": 97,
"ண": 98,
"த": 99,
"ந": 100,
"ப": 101,
"ம": 102,
"ய": 103,
"ர": 104,
"ல": 105,
"வ": 106,
"ழ": 107,
"ள": 108,
"ற": 109,
"ன": 110,
"ஜ": 111,
"ஷ": 112,
"ஸ": 113,
"ஹ": 114,
"க்ஷ": 115,
"ஸ்ரீ": 116,
"ா": 117,
"ி": 118,
"ீ": 119,
"ு": 120,
"ூ": 121,
"ெ": 122,
"ே": 123,
"ை": 124,
"ொ": 125,
"ோ": 126,
"ௌ": 127,
"்": 128,
"௦": 129,
"௧": 130,
"௨": 131,
"௩": 132,
"௪": 133,
"௫": 134,
"௬": 135,
"௭": 136,
"௮": 137,
"௯": 138,
"ௗ": 139,
"௰": 140,
"௱": 141,
"௲": 142
},
"unk_token": "[UNK]"
},
"pre_tokenizer": {
"type": "Whitespace"
},
"normalizer": {
"type": "NFKC"
}
}