Thatphum committed on
Commit
4f1defa
·
verified ·
1 Parent(s): e147d5e

Initial upload of character-level tokenizer

Browse files
Files changed (4) hide show
  1. merges.txt +1 -0
  2. special_tokens_map.json +6 -0
  3. tokenizer_config.json +9 -0
  4. vocab.json +148 -0
merges.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ # This is a character-level tokenizer. There are no merges.
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "unk_token": "<|unk|>",
3
+ "pad_token": "<|pad|>",
4
+ "bos_token": "<|bos|>",
5
+ "eos_token": "<|eos|>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "gpt2",
3
+ "tokenizer_class": "GPT2Tokenizer",
4
+ "unk_token": "<|unk|>",
5
+ "pad_token": "<|pad|>",
6
+ "bos_token": "<|bos|>",
7
+ "eos_token": "<|eos|>",
8
+ "add_prefix_space": false
9
+ }
vocab.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|unk|>": 0,
3
+ "<|pad|>": 1,
4
+ "<|bos|>": 2,
5
+ "<|eos|>": 3,
6
+ "!": 4,
7
+ "\"": 5,
8
+ "#": 6,
9
+ "$": 7,
10
+ "%": 8,
11
+ "&": 9,
12
+ "'": 10,
13
+ "(": 11,
14
+ ")": 12,
15
+ "*": 13,
16
+ "+": 14,
17
+ ",": 15,
18
+ "-": 16,
19
+ ".": 17,
20
+ "/": 18,
21
+ "0": 19,
22
+ "1": 20,
23
+ "2": 21,
24
+ "3": 22,
25
+ "4": 23,
26
+ "5": 24,
27
+ "6": 25,
28
+ "7": 26,
29
+ "8": 27,
30
+ "9": 28,
31
+ ":": 29,
32
+ ";": 30,
33
+ "<": 31,
34
+ "=": 32,
35
+ ">": 33,
36
+ "?": 34,
37
+ "@": 35,
38
+ "[": 36,
39
+ "\\": 37,
40
+ "]": 38,
41
+ "^": 39,
42
+ "_": 40,
43
+ "`": 41,
44
+ "a": 42,
45
+ "b": 43,
46
+ "c": 44,
47
+ "d": 45,
48
+ "e": 46,
49
+ "f": 47,
50
+ "g": 48,
51
+ "h": 49,
52
+ "i": 50,
53
+ "j": 51,
54
+ "k": 52,
55
+ "l": 53,
56
+ "m": 54,
57
+ "n": 55,
58
+ "o": 56,
59
+ "p": 57,
60
+ "q": 58,
61
+ "r": 59,
62
+ "s": 60,
63
+ "t": 61,
64
+ "u": 62,
65
+ "v": 63,
66
+ "w": 64,
67
+ "x": 65,
68
+ "y": 66,
69
+ "z": 67,
70
+ "{": 68,
71
+ "|": 69,
72
+ "}": 70,
73
+ "~": 71,
74
+ "ก": 72,
75
+ "ข": 73,
76
+ "ฃ": 74,
77
+ "ค": 75,
78
+ "ฅ": 76,
79
+ "ฆ": 77,
80
+ "ง": 78,
81
+ "จ": 79,
82
+ "ฉ": 80,
83
+ "ช": 81,
84
+ "ซ": 82,
85
+ "ฌ": 83,
86
+ "ญ": 84,
87
+ "ฎ": 85,
88
+ "ฏ": 86,
89
+ "ฐ": 87,
90
+ "ฑ": 88,
91
+ "ฒ": 89,
92
+ "ณ": 90,
93
+ "ด": 91,
94
+ "ต": 92,
95
+ "ถ": 93,
96
+ "ท": 94,
97
+ "ธ": 95,
98
+ "น": 96,
99
+ "บ": 97,
100
+ "ป": 98,
101
+ "ผ": 99,
102
+ "ฝ": 100,
103
+ "พ": 101,
104
+ "ฟ": 102,
105
+ "ภ": 103,
106
+ "ม": 104,
107
+ "ย": 105,
108
+ "ร": 106,
109
+ "ฤ": 107,
110
+ "ล": 108,
111
+ "ฦ": 109,
112
+ "ว": 110,
113
+ "ศ": 111,
114
+ "ษ": 112,
115
+ "ส": 113,
116
+ "ห": 114,
117
+ "ฬ": 115,
118
+ "อ": 116,
119
+ "ฮ": 117,
120
+ "ฯ": 118,
121
+ "ะ": 119,
122
+ "ั": 120,
123
+ "า": 121,
124
+ "ำ": 122,
125
+ "ิ": 123,
126
+ "ี": 124,
127
+ "ึ": 125,
128
+ "ื": 126,
129
+ "ุ": 127,
130
+ "ู": 128,
131
+ "ฺ": 129,
132
+ "฿": 130,
133
+ "เ": 131,
134
+ "แ": 132,
135
+ "โ": 133,
136
+ "ใ": 134,
137
+ "ไ": 135,
138
+ "ๅ": 136,
139
+ "ๆ": 137,
140
+ "็": 138,
141
+ "่": 139,
142
+ "้": 140,
143
+ "๊": 141,
144
+ "๋": 142,
145
+ "์": 143,
146
+ "ํ": 144,
147
+ "•": 145
148
+ }