add tokenizer
Browse files
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
- vocab.json +1 -1
special_tokens_map.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
|
tokenizer_config.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "
|
|
|
|
| 1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
vocab.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"
|
|
|
|
| 1 |
+
{"ô": 0, "ổ": 1, "l": 2, "é": 3, "p": 4, "ứ": 5, "ờ": 6, "n": 7, "ỵ": 8, "ơ": 9, "e": 10, "ỡ": 11, "ă": 12, "â": 13, "ừ": 14, "ễ": 15, "ử": 16, "ồ": 17, "ỳ": 18, "ằ": 19, "ý": 20, "ầ": 21, "à": 22, "g": 23, "ế": 24, "ủ": 25, "ỉ": 26, "ỏ": 27, "a": 28, "ụ": 29, "è": 30, "b": 31, "k": 32, "r": 33, "o": 34, "v": 35, "ỗ": 36, "ỷ": 37, "q": 38, "ặ": 39, "ớ": 40, "ũ": 41, "á": 42, "ợ": 43, "ắ": 44, "ẫ": 45, "ó": 46, "ĩ": 47, "c": 48, "m": 49, "ể": 50, "ậ": 52, "ấ": 53, "ù": 54, "ê": 55, "x": 56, "ữ": 57, "ạ": 58, "ự": 59, "ẩ": 60, "ẹ": 61, "s": 62, "d": 63, "ọ": 64, "ề": 65, "í": 66, "ẳ": 67, "ì": 68, "ộ": 69, "ỹ": 70, "ẵ": 71, "h": 72, "u": 73, "ò": 74, "ệ": 75, "ú": 76, "i": 77, "ị": 78, "õ": 79, "t": 80, "ở": 81, "ã": 82, "4": 83, "ẽ": 84, "đ": 85, "y": 86, "ư": 87, "ẻ": 88, "ả": 89, "ố": 90, "|": 51, "[UNK]": 91, "[PAD]": 92}
|