sanchit-gandhi commited on
Commit
773377a
·
1 Parent(s): de6e1f1

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -0
  2. tokenizer_config.json +12 -0
  3. vocab.json +180 -0
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "|",
3
+ "unk_token": "<unk>"
4
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "wal-script_ethiopic",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "|",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
vocab.json ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 177,
3
+ "-": 166,
4
+ "|": 0,
5
+ "ሀ": 31,
6
+ "ሁ": 67,
7
+ "ሂ": 151,
8
+ "ሃ": 74,
9
+ "ሄ": 40,
10
+ "ህ": 91,
11
+ "ሆ": 132,
12
+ "ለ": 36,
13
+ "ሉ": 108,
14
+ "ሊ": 128,
15
+ "ላ": 63,
16
+ "ሌ": 82,
17
+ "ል": 52,
18
+ "ሎ": 77,
19
+ "መ": 18,
20
+ "ሙ": 92,
21
+ "ሚ": 105,
22
+ "ማ": 62,
23
+ "ሜ": 68,
24
+ "ም": 47,
25
+ "ሞ": 83,
26
+ "ረ": 29,
27
+ "ሩ": 123,
28
+ "ሪ": 139,
29
+ "ራ": 80,
30
+ "ሬ": 70,
31
+ "ር": 17,
32
+ "ሮ": 60,
33
+ "ሰ": 9,
34
+ "ሱ": 49,
35
+ "ሲ": 94,
36
+ "ሳ": 42,
37
+ "ሴ": 69,
38
+ "ስ": 2,
39
+ "ሶ": 51,
40
+ "ሸ": 50,
41
+ "ሹ": 117,
42
+ "ሺ": 124,
43
+ "ሻ": 99,
44
+ "ሼ": 103,
45
+ "ሽ": 44,
46
+ "ሾ": 125,
47
+ "ቀ": 43,
48
+ "ቁ": 100,
49
+ "ቂ": 143,
50
+ "ቃ": 81,
51
+ "ቄ": 98,
52
+ "ቅ": 73,
53
+ "ቆ": 71,
54
+ "በ": 21,
55
+ "ቡ": 144,
56
+ "ቢ": 90,
57
+ "ባ": 46,
58
+ "ቤ": 56,
59
+ "ብ": 72,
60
+ "ቦ": 65,
61
+ "ተ": 6,
62
+ "ቱ": 54,
63
+ "ቲ": 79,
64
+ "ታ": 32,
65
+ "ቴ": 13,
66
+ "ት": 8,
67
+ "ቶ": 53,
68
+ "ቸ": 88,
69
+ "ቹ": 134,
70
+ "ቺ": 146,
71
+ "ቻ": 136,
72
+ "ቼ": 148,
73
+ "ች": 102,
74
+ "ቾ": 141,
75
+ "ነ": 4,
76
+ "ኑ": 55,
77
+ "ኒ": 150,
78
+ "ና": 37,
79
+ "ኔ": 16,
80
+ "ን": 1,
81
+ "ኖ": 119,
82
+ "ኙ": 174,
83
+ "አ": 5,
84
+ "ኡ": 35,
85
+ "ኢ": 107,
86
+ "ኣ": 57,
87
+ "ኤ": 22,
88
+ "እ": 7,
89
+ "ኦ": 28,
90
+ "ከ": 26,
91
+ "ኩ": 101,
92
+ "ኪ": 87,
93
+ "ካ": 112,
94
+ "ኬ": 59,
95
+ "ክ": 58,
96
+ "ኮ": 33,
97
+ "ወ": 66,
98
+ "ዉ": 126,
99
+ "ዋ": 39,
100
+ "ዌ": 145,
101
+ "ው": 23,
102
+ "ዎ": 27,
103
+ "ዘ": 76,
104
+ "ዙ": 137,
105
+ "ዚ": 156,
106
+ "ዛ": 122,
107
+ "ዜ": 120,
108
+ "ዝ": 115,
109
+ "ዞ": 131,
110
+ "የ": 30,
111
+ "ዩ": 138,
112
+ "ዪ": 93,
113
+ "ያ": 10,
114
+ "ዬ": 38,
115
+ "ይ": 3,
116
+ "ዮ": 12,
117
+ "ደ": 11,
118
+ "ዱ": 97,
119
+ "ዲ": 133,
120
+ "ዳ": 61,
121
+ "ዴ": 20,
122
+ "ድ": 14,
123
+ "ዶ": 19,
124
+ "ዸ": 158,
125
+ "ዹ": 176,
126
+ "ዺ": 159,
127
+ "ዻ": 175,
128
+ "ዼ": 164,
129
+ "ዽ": 110,
130
+ "ዾ": 172,
131
+ "ጀ": 163,
132
+ "ጁ": 162,
133
+ "ጂ": 171,
134
+ "ጃ": 170,
135
+ "ጄ": 147,
136
+ "ጅ": 160,
137
+ "ጆ": 168,
138
+ "ገ": 45,
139
+ "ጉ": 84,
140
+ "ጊ": 48,
141
+ "ጋ": 24,
142
+ "ጌ": 25,
143
+ "ግ": 15,
144
+ "ጎ": 64,
145
+ "ጠ": 75,
146
+ "ጡ": 135,
147
+ "ጢ": 153,
148
+ "ጣ": 95,
149
+ "ጤ": 96,
150
+ "ጥ": 78,
151
+ "ጦ": 41,
152
+ "ጨ": 116,
153
+ "ጩ": 149,
154
+ "ጪ": 155,
155
+ "ጫ": 121,
156
+ "ጬ": 127,
157
+ "ጭ": 109,
158
+ "ጮ": 152,
159
+ "ጰ": 118,
160
+ "ጱ": 165,
161
+ "ጲ": 173,
162
+ "ጳ": 169,
163
+ "ጴ": 104,
164
+ "ጵ": 111,
165
+ "ጶ": 157,
166
+ "ፈ": 86,
167
+ "ፉ": 142,
168
+ "ፊ": 154,
169
+ "ፋ": 130,
170
+ "ፌ": 129,
171
+ "ፍ": 114,
172
+ "ፎ": 161,
173
+ "ፐ": 89,
174
+ "ፑ": 113,
175
+ "ፒ": 167,
176
+ "ፓ": 140,
177
+ "ፔ": 34,
178
+ "ፕ": 85,
179
+ "ፖ": 106
180
+ }