kylelovesllms committed on
Commit
ffa3a26
·
verified ·
1 Parent(s): 8a4413d

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1 +1,4 @@
1
- {"<s>": 35, "</s>": 36}
 
 
 
 
1
+ {
2
+ "</s>": 36,
3
+ "<s>": 35
4
+ }
special_tokens_map.json CHANGED
@@ -1 +1,6 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer_config.json CHANGED
@@ -1 +1,95 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "23": {
4
+ "content": "ɛ˞u",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "27": {
12
+ "content": "eɪ",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "28": {
20
+ "content": "aɪ",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "29": {
28
+ "content": "aʊ",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "30": {
36
+ "content": "ɔɪ",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": true,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "31": {
44
+ "content": "oʊ",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": true,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "32": {
52
+ "content": "[UNK]",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": true,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "33": {
60
+ "content": "[PAD]",
61
+ "lstrip": true,
62
+ "normalized": false,
63
+ "rstrip": true,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "35": {
68
+ "content": "<s>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "36": {
76
+ "content": "</s>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ }
83
+ },
84
+ "bos_token": "<s>",
85
+ "clean_up_tokenization_spaces": false,
86
+ "do_lower_case": false,
87
+ "eos_token": "</s>",
88
+ "model_max_length": 1000000000000000019884624838656,
89
+ "pad_token": "[PAD]",
90
+ "replace_word_delimiter_char": " ",
91
+ "target_lang": null,
92
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
93
+ "unk_token": "[UNK]",
94
+ "word_delimiter_token": "|"
95
+ }
vocab.json CHANGED
@@ -1 +1,37 @@
1
- {"p": 0, "t": 1, "k": 2, "b": 3, "d": 4, "g": 5, "ɾ": 6, "ʔ": 7, "h": 8, "ʃ": 9, "ʒ": 10, "θ": 11, "ð": 12, "ŋ": 13, "j": 14, "w": 15, "i": 16, "ɪ": 17, "ɛ": 18, "æ": 19, "ʌ": 20, "ə": 21, "ɚ": 22, "ɛ˞u": 23, "ʊ": 24, "ɔ": 25, "ɑ": 26, "eɪ": 27, "aɪ": 28, "aʊ": 29, "ɔɪ": 30, "oʊ": 31, "[UNK]": 32, "[PAD]": 33, "|": 34}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 33,
3
+ "[UNK]": 32,
4
+ "aɪ": 28,
5
+ "aʊ": 29,
6
+ "b": 3,
7
+ "d": 4,
8
+ "eɪ": 27,
9
+ "g": 5,
10
+ "h": 8,
11
+ "i": 16,
12
+ "j": 14,
13
+ "k": 2,
14
+ "oʊ": 31,
15
+ "p": 0,
16
+ "t": 1,
17
+ "w": 15,
18
+ "|": 34,
19
+ "æ": 19,
20
+ "ð": 12,
21
+ "ŋ": 13,
22
+ "ɑ": 26,
23
+ "ɔ": 25,
24
+ "ɔɪ": 30,
25
+ "ə": 21,
26
+ "ɚ": 22,
27
+ "ɛ": 18,
28
+ "ɛ˞u": 23,
29
+ "ɪ": 17,
30
+ "ɾ": 6,
31
+ "ʃ": 9,
32
+ "ʊ": 24,
33
+ "ʌ": 20,
34
+ "ʒ": 10,
35
+ "ʔ": 7,
36
+ "θ": 11
37
+ }