sanchit-gandhi committed on
Commit
23031a6
·
1 Parent(s): ff21ebe

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -0
  2. tokenizer_config.json +12 -0
  3. vocab.json +146 -0
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "|",
3
+ "unk_token": "<unk>"
4
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "kxc",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "|",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
vocab.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 143,
3
+ "-": 135,
4
+ "|": 0,
5
+ "ሀ": 34,
6
+ "ሁ": 84,
7
+ "ሂ": 129,
8
+ "ሃ": 71,
9
+ "ሄ": 68,
10
+ "ህ": 100,
11
+ "ሆ": 106,
12
+ "ለ": 16,
13
+ "ሉ": 101,
14
+ "ሊ": 130,
15
+ "ላ": 73,
16
+ "ሌ": 41,
17
+ "ል": 29,
18
+ "ሎ": 52,
19
+ "መ": 6,
20
+ "ሙ": 83,
21
+ "ሚ": 108,
22
+ "ማ": 60,
23
+ "ሜ": 55,
24
+ "ም": 31,
25
+ "ሞ": 53,
26
+ "ረ": 8,
27
+ "ሩ": 105,
28
+ "ሪ": 127,
29
+ "ራ": 107,
30
+ "ሬ": 54,
31
+ "ር": 20,
32
+ "ሮ": 58,
33
+ "ሰ": 12,
34
+ "ሱ": 48,
35
+ "ሲ": 117,
36
+ "ሳ": 96,
37
+ "ሴ": 10,
38
+ "ስ": 19,
39
+ "ሶ": 57,
40
+ "ሸ": 36,
41
+ "ሹ": 116,
42
+ "ሺ": 120,
43
+ "ሻ": 123,
44
+ "ሼ": 49,
45
+ "ሽ": 35,
46
+ "ሾ": 33,
47
+ "ቀ": 14,
48
+ "ቁ": 47,
49
+ "ቂ": 133,
50
+ "ቃ": 86,
51
+ "ቄ": 72,
52
+ "ቅ": 76,
53
+ "ቆ": 56,
54
+ "ተ": 3,
55
+ "ቱ": 66,
56
+ "ቲ": 103,
57
+ "ታ": 51,
58
+ "ቴ": 27,
59
+ "ት": 26,
60
+ "ቶ": 24,
61
+ "ቸ": 98,
62
+ "ቹ": 124,
63
+ "ቺ": 113,
64
+ "ቻ": 90,
65
+ "ች": 109,
66
+ "ቾ": 91,
67
+ "ነ": 9,
68
+ "ኑ": 93,
69
+ "ኒ": 115,
70
+ "ና": 63,
71
+ "ኔ": 18,
72
+ "ን": 1,
73
+ "ኖ": 30,
74
+ "ኘ": 111,
75
+ "ኙ": 131,
76
+ "ኛ": 134,
77
+ "ኜ": 110,
78
+ "ኝ": 118,
79
+ "ኞ": 138,
80
+ "አ": 4,
81
+ "ኡ": 46,
82
+ "ኢ": 132,
83
+ "ኣ": 69,
84
+ "ኤ": 22,
85
+ "እ": 5,
86
+ "ኦ": 11,
87
+ "ከ": 2,
88
+ "ኩ": 44,
89
+ "ኪ": 67,
90
+ "ካ": 61,
91
+ "ኬ": 43,
92
+ "ክ": 17,
93
+ "ኮ": 28,
94
+ "ኸ": 45,
95
+ "ኹ": 104,
96
+ "ኺ": 141,
97
+ "ኻ": 75,
98
+ "ኼ": 99,
99
+ "ኽ": 119,
100
+ "ኾ": 74,
101
+ "ወ": 42,
102
+ "ዊ": 142,
103
+ "ዋ": 40,
104
+ "ዌ": 78,
105
+ "ው": 39,
106
+ "ዎ": 114,
107
+ "የ": 38,
108
+ "ዩ": 126,
109
+ "ያ": 81,
110
+ "ዬ": 23,
111
+ "ይ": 13,
112
+ "ዮ": 65,
113
+ "ደ": 21,
114
+ "ዱ": 87,
115
+ "ዲ": 82,
116
+ "ዳ": 62,
117
+ "ዴ": 15,
118
+ "ድ": 32,
119
+ "ዶ": 80,
120
+ "ጨ": 70,
121
+ "ጩ": 95,
122
+ "ጪ": 125,
123
+ "ጫ": 92,
124
+ "ጬ": 102,
125
+ "ጭ": 112,
126
+ "ጮ": 128,
127
+ "ጰ": 122,
128
+ "ጳ": 137,
129
+ "ጴ": 136,
130
+ "ጵ": 121,
131
+ "ጶ": 139,
132
+ "ፈ": 59,
133
+ "ፉ": 89,
134
+ "ፊ": 140,
135
+ "ፋ": 94,
136
+ "ፌ": 88,
137
+ "ፍ": 97,
138
+ "ፎ": 50,
139
+ "ፐ": 7,
140
+ "ፑ": 79,
141
+ "ፒ": 64,
142
+ "ፓ": 85,
143
+ "ፔ": 77,
144
+ "ፕ": 25,
145
+ "ፖ": 37
146
+ }