tclong committed on
Commit
e8adcdd
·
1 Parent(s): f868f35

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 93, "</s>": 94}
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json CHANGED
@@ -1 +1 @@
1
- {"": 0, "s": 1, "ê": 2, "": 3, "ò": 4, "": 5, "h": 6, "": 7, "ĩ": 8, "": 9, "ù": 10, "": 11, "t": 12, "": 13, "k": 14, "": 15, "m": 16, "": 17, "x": 18, "á": 19, "ư": 20, "": 21, "ũ": 22, "": 23, "ú": 24, "ơ": 25, "": 26, "": 27, "": 28, "": 29, "": 30, "g": 31, "": 32, "i": 33, "": 34, "q": 35, "a": 36, "b": 37, "p": 38, "è": 39, "đ": 40, "": 41, "ý": 42, "ă": 43, "": 44, "4": 45, "": 46, "l": 47, "": 48, "": 49, "r": 50, "": 51, "d": 52, "ã": 53, "": 54, "": 56, "ô": 57, "í": 58, "": 59, "õ": 60, "": 61, "": 62, "ế": 63, "ó": 64, "â": 65, "": 66, "": 67, "": 68, "": 69, "c": 70, "ì": 71, "o": 72, "": 73, "n": 74, "": 75, "u": 76, "à": 77, "": 78, "": 79, "": 80, "": 81, "e": 82, "v": 83, "": 84, "": 85, "y": 86, "": 87, "é": 88, "": 89, "": 90, "|": 55, "[UNK]": 91, "[PAD]": 92}
 
1
+ {"": 0, "": 1, "": 2, "n": 3, "": 4, "ũ": 5, "": 6, "": 7, "": 8, "": 9, "ư": 10, "à": 11, "": 12, "ĩ": 13, "r": 14, "": 15, "ó": 16, "d": 17, "": 18, "ý": 19, "": 20, "õ": 21, "u": 22, "": 23, "": 24, "a": 25, "": 26, "": 27, "": 28, "": 29, "": 30, "y": 31, "": 32, "ơ": 33, "t": 34, "è": 35, "": 36, "đ": 37, "x": 38, "": 39, "é": 40, "": 41, "ù": 43, "": 44, "": 45, "": 46, "p": 47, "": 48, "â": 49, "": 50, "": 51, "ì": 52, "c": 53, "q": 54, "": 55, "l": 56, "": 57, "": 58, "": 59, "4": 60, "ò": 61, "á": 62, "e": 63, "í": 64, "v": 65, "ú": 66, "ă": 67, "ê": 68, "": 69, "": 70, "": 71, "m": 72, "h": 73, "b": 74, "": 75, "": 76, "ế": 77, "o": 78, "": 79, "s": 80, "g": 81, "": 82, "": 83, "ã": 84, "i": 85, "k": 86, "": 87, "": 88, "": 89, "ô": 90, "|": 42, "[UNK]": 91, "[PAD]": 92}