slplab committed on
Commit
df06297
·
1 Parent(s): 7b72b0d

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +4 -0
  2. special_tokens_map.json +22 -0
  3. tokenizer_config.json +13 -0
  4. vocab.json +119 -0
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 118,
3
+ "<s>": 117
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<s>",
19
+ "eos_token": "</s>",
20
+ "pad_token": "[PAD]",
21
+ "unk_token": "[UNK]"
22
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "./",
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "special_tokens_map_file": null,
10
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
11
+ "unk_token": "[UNK]",
12
+ "word_delimiter_token": "|"
13
+ }
vocab.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 116,
3
+ "[UNK]": 115,
4
+ "|": 0,
5
+ "가": 1,
6
+ "갑": 2,
7
+ "거": 3,
8
+ "경": 4,
9
+ "고": 5,
10
+ "국": 6,
11
+ "굴": 7,
12
+ "그": 8,
13
+ "기": 9,
14
+ "께": 10,
15
+ "꽃": 11,
16
+ "끼": 12,
17
+ "나": 13,
18
+ "났": 14,
19
+ "네": 15,
20
+ "눈": 16,
21
+ "는": 17,
22
+ "늘": 18,
23
+ "니": 19,
24
+ "다": 20,
25
+ "단": 21,
26
+ "대": 22,
27
+ "더": 23,
28
+ "도": 24,
29
+ "동": 25,
30
+ "든": 26,
31
+ "딸": 27,
32
+ "라": 28,
33
+ "람": 29,
34
+ "랑": 30,
35
+ "래": 31,
36
+ "리": 32,
37
+ "마": 33,
38
+ "만": 34,
39
+ "말": 35,
40
+ "머": 36,
41
+ "모": 37,
42
+ "무": 38,
43
+ "물": 39,
44
+ "미": 40,
45
+ "바": 41,
46
+ "뱀": 42,
47
+ "버": 43,
48
+ "병": 44,
49
+ "보": 45,
50
+ "북": 46,
51
+ "비": 47,
52
+ "빗": 48,
53
+ "빠": 49,
54
+ "빨": 50,
55
+ "사": 51,
56
+ "산": 52,
57
+ "색": 53,
58
+ "생": 54,
59
+ "서": 55,
60
+ "석": 56,
61
+ "세": 57,
62
+ "소": 58,
63
+ "송": 59,
64
+ "수": 60,
65
+ "시": 61,
66
+ "실": 62,
67
+ "싶": 63,
68
+ "싸": 64,
69
+ "쏟": 65,
70
+ "아": 66,
71
+ "안": 67,
72
+ "양": 68,
73
+ "어": 69,
74
+ "얼": 70,
75
+ "엄": 71,
76
+ "없": 72,
77
+ "에": 73,
78
+ "오": 74,
79
+ "옥": 75,
80
+ "온": 76,
81
+ "올": 77,
82
+ "요": 78,
83
+ "우": 79,
84
+ "워": 80,
85
+ "원": 81,
86
+ "을": 82,
87
+ "이": 83,
88
+ "있": 84,
89
+ "자": 85,
90
+ "잔": 86,
91
+ "장": 87,
92
+ "제": 88,
93
+ "졌": 89,
94
+ "족": 90,
95
+ "종": 91,
96
+ "주": 92,
97
+ "지": 93,
98
+ "찢": 94,
99
+ "책": 95,
100
+ "추": 96,
101
+ "침": 97,
102
+ "컴": 98,
103
+ "컵": 99,
104
+ "퀴": 100,
105
+ "탕": 101,
106
+ "테": 102,
107
+ "토": 103,
108
+ "파": 104,
109
+ "편": 105,
110
+ "포": 106,
111
+ "하": 107,
112
+ "한": 108,
113
+ "함": 109,
114
+ "해": 110,
115
+ "햄": 111,
116
+ "호": 112,
117
+ "혼": 113,
118
+ "화": 114
119
+ }