Commit
·
dc05f7d
1
Parent(s):
feb6842
Upload 4 files
Browse files
- alphabet.json +1 -0
- special_tokens_map.json +1 -6
- tokenizer_config.json +1 -49
- vocab.json +1 -69
alphabet.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"labels": ["<s>", "", "</s>", "\u2047", " ", "\u0901", "\u0902", "\u0903", "\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090a", "\u090b", "\u090f", "\u0910", "\u0911", "\u0913", "\u0914", "\u0915", "\u0916", "\u0917", "\u0918", "\u0919", "\u091a", "\u091b", "\u091c", "\u091d", "\u091e", "\u091f", "\u0920", "\u0921", "\u0922", "\u0923", "\u0924", "\u0925", "\u0926", "\u0927", "\u0928", "\u092a", "\u092b", "\u092c", "\u092d", "\u092e", "\u092f", "\u0930", "\u0932", "\u0935", "\u0936", "\u0937", "\u0938", "\u0939", "\u093c", "\u093e", "\u093f", "\u0940", "\u0941", "\u0942", "\u0943", "\u0945", "\u0947", "\u0948", "\u0949", "\u094b", "\u094c", "\u094d"], "is_bpe": false}
|
special_tokens_map.json
CHANGED
|
@@ -1,6 +1 @@
|
|
| 1 |
-
{
|
| 2 |
-
"bos_token": "<s>",
|
| 3 |
-
"eos_token": "</s>",
|
| 4 |
-
"pad_token": "<pad>",
|
| 5 |
-
"unk_token": "<unk>"
|
| 6 |
-
}
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer_config.json
CHANGED
|
@@ -1,49 +1 @@
|
|
| 1 |
-
{
|
| 2 |
-
"added_tokens_decoder": {
|
| 3 |
-
"0": {
|
| 4 |
-
"content": "<s>",
|
| 5 |
-
"lstrip": true,
|
| 6 |
-
"normalized": false,
|
| 7 |
-
"rstrip": true,
|
| 8 |
-
"single_word": false,
|
| 9 |
-
"special": false
|
| 10 |
-
},
|
| 11 |
-
"1": {
|
| 12 |
-
"content": "<pad>",
|
| 13 |
-
"lstrip": true,
|
| 14 |
-
"normalized": false,
|
| 15 |
-
"rstrip": true,
|
| 16 |
-
"single_word": false,
|
| 17 |
-
"special": false
|
| 18 |
-
},
|
| 19 |
-
"2": {
|
| 20 |
-
"content": "</s>",
|
| 21 |
-
"lstrip": true,
|
| 22 |
-
"normalized": false,
|
| 23 |
-
"rstrip": true,
|
| 24 |
-
"single_word": false,
|
| 25 |
-
"special": false
|
| 26 |
-
},
|
| 27 |
-
"3": {
|
| 28 |
-
"content": "<unk>",
|
| 29 |
-
"lstrip": true,
|
| 30 |
-
"normalized": false,
|
| 31 |
-
"rstrip": true,
|
| 32 |
-
"single_word": false,
|
| 33 |
-
"special": false
|
| 34 |
-
}
|
| 35 |
-
},
|
| 36 |
-
"additional_special_tokens": [],
|
| 37 |
-
"bos_token": "<s>",
|
| 38 |
-
"clean_up_tokenization_spaces": true,
|
| 39 |
-
"do_lower_case": false,
|
| 40 |
-
"eos_token": "</s>",
|
| 41 |
-
"model_max_length": 1000000000000000019884624838656,
|
| 42 |
-
"pad_token": "<pad>",
|
| 43 |
-
"replace_word_delimiter_char": " ",
|
| 44 |
-
"target_lang": null,
|
| 45 |
-
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
| 46 |
-
"tokenizer_file": null,
|
| 47 |
-
"unk_token": "<unk>",
|
| 48 |
-
"word_delimiter_token": "|"
|
| 49 |
-
}
|
|
|
|
| 1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab.json
CHANGED
|
@@ -1,69 +1 @@
|
|
| 1 |
-
{
|
| 2 |
-
"</s>": 2,
|
| 3 |
-
"<pad>": 1,
|
| 4 |
-
"<s>": 0,
|
| 5 |
-
"<unk>": 3,
|
| 6 |
-
"|": 4,
|
| 7 |
-
"ँ": 5,
|
| 8 |
-
"ं": 6,
|
| 9 |
-
"ः": 7,
|
| 10 |
-
"अ": 8,
|
| 11 |
-
"आ": 9,
|
| 12 |
-
"इ": 10,
|
| 13 |
-
"ई": 11,
|
| 14 |
-
"उ": 12,
|
| 15 |
-
"ऊ": 13,
|
| 16 |
-
"ऋ": 14,
|
| 17 |
-
"ए": 15,
|
| 18 |
-
"ऐ": 16,
|
| 19 |
-
"ऑ": 17,
|
| 20 |
-
"ओ": 18,
|
| 21 |
-
"औ": 19,
|
| 22 |
-
"क": 20,
|
| 23 |
-
"ख": 21,
|
| 24 |
-
"ग": 22,
|
| 25 |
-
"घ": 23,
|
| 26 |
-
"ङ": 24,
|
| 27 |
-
"च": 25,
|
| 28 |
-
"छ": 26,
|
| 29 |
-
"ज": 27,
|
| 30 |
-
"झ": 28,
|
| 31 |
-
"ञ": 29,
|
| 32 |
-
"ट": 30,
|
| 33 |
-
"ठ": 31,
|
| 34 |
-
"ड": 32,
|
| 35 |
-
"ढ": 33,
|
| 36 |
-
"ण": 34,
|
| 37 |
-
"त": 35,
|
| 38 |
-
"थ": 36,
|
| 39 |
-
"द": 37,
|
| 40 |
-
"ध": 38,
|
| 41 |
-
"न": 39,
|
| 42 |
-
"प": 40,
|
| 43 |
-
"फ": 41,
|
| 44 |
-
"ब": 42,
|
| 45 |
-
"भ": 43,
|
| 46 |
-
"म": 44,
|
| 47 |
-
"य": 45,
|
| 48 |
-
"र": 46,
|
| 49 |
-
"ल": 47,
|
| 50 |
-
"व": 48,
|
| 51 |
-
"श": 49,
|
| 52 |
-
"ष": 50,
|
| 53 |
-
"स": 51,
|
| 54 |
-
"ह": 52,
|
| 55 |
-
"़": 53,
|
| 56 |
-
"ा": 54,
|
| 57 |
-
"ि": 55,
|
| 58 |
-
"ी": 56,
|
| 59 |
-
"ु": 57,
|
| 60 |
-
"ू": 58,
|
| 61 |
-
"ृ": 59,
|
| 62 |
-
"ॅ": 60,
|
| 63 |
-
"े": 61,
|
| 64 |
-
"ै": 62,
|
| 65 |
-
"ॉ": 63,
|
| 66 |
-
"ो": 64,
|
| 67 |
-
"ौ": 65,
|
| 68 |
-
"्": 66
|
| 69 |
-
}
|
|
|
|
| 1 |
+
{"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "|":4, "ँ": 5, "ं": 6, "ः": 7, "अ": 8, "आ": 9, "इ": 10, "ई": 11, "उ": 12, "ऊ": 13, "ऋ": 14, "ए": 15, "ऐ": 16, "ऑ": 17, "ओ": 18, "औ": 19, "क": 20, "ख": 21, "ग": 22, "घ": 23, "ङ": 24, "च": 25, "छ": 26, "ज": 27, "झ": 28, "ञ": 29, "ट": 30, "ठ": 31, "ड": 32, "ढ": 33, "ण": 34, "त": 35, "थ": 36, "द": 37, "ध": 38, "न": 39, "प": 40, "फ": 41, "ब": 42, "भ": 43, "म": 44, "य": 45, "र": 46, "ल": 47, "व": 48, "श": 49, "ष": 50, "स": 51, "ह": 52, "़": 53, "ा": 54, "ि": 55, "ी": 56, "ु": 57, "ू": 58, "ृ": 59, "ॅ": 60, "े": 61, "ै": 62, "ॉ": 63, "ो": 64, "ौ": 65, "्": 66}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|