MarkGG committed on
Commit
b6ffe89
·
1 Parent(s): 9a77cc1

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,106 +1,208 @@
1
  {
2
- "[ex0]": 31972,
3
- "[ex10]": 31982,
4
- "[ex11]": 31983,
5
- "[ex12]": 31984,
6
- "[ex13]": 31985,
7
- "[ex14]": 31986,
8
- "[ex15]": 31987,
9
- "[ex16]": 31988,
10
- "[ex17]": 31989,
11
- "[ex18]": 31990,
12
- "[ex19]": 31991,
13
- "[ex1]": 31973,
14
- "[ex20]": 31992,
15
- "[ex21]": 31993,
16
- "[ex22]": 31994,
17
- "[ex23]": 31995,
18
- "[ex24]": 31996,
19
- "[ex25]": 31997,
20
- "[ex26]": 31998,
21
- "[ex27]": 31999,
22
- "[ex28]": 32000,
23
- "[ex29]": 32001,
24
- "[ex2]": 31974,
25
- "[ex30]": 32002,
26
- "[ex31]": 32003,
27
- "[ex32]": 32004,
28
- "[ex33]": 32005,
29
- "[ex34]": 32006,
30
- "[ex35]": 32007,
31
- "[ex36]": 32008,
32
- "[ex37]": 32009,
33
- "[ex38]": 32010,
34
- "[ex39]": 32011,
35
- "[ex3]": 31975,
36
- "[ex40]": 32012,
37
- "[ex41]": 32013,
38
- "[ex42]": 32014,
39
- "[ex43]": 32015,
40
- "[ex44]": 32016,
41
- "[ex45]": 32017,
42
- "[ex46]": 32018,
43
- "[ex47]": 32019,
44
- "[ex48]": 32020,
45
- "[ex49]": 32021,
46
- "[ex4]": 31976,
47
- "[ex50]": 32022,
48
- "[ex51]": 32023,
49
- "[ex52]": 32024,
50
- "[ex53]": 32025,
51
- "[ex54]": 32026,
52
- "[ex55]": 32027,
53
- "[ex56]": 32028,
54
- "[ex57]": 32029,
55
- "[ex58]": 32030,
56
- "[ex59]": 32031,
57
- "[ex5]": 31977,
58
- "[ex60]": 32032,
59
- "[ex61]": 32033,
60
- "[ex62]": 32034,
61
- "[ex63]": 32035,
62
- "[ex64]": 32036,
63
- "[ex65]": 32037,
64
- "[ex66]": 32038,
65
- "[ex67]": 32039,
66
- "[ex68]": 32040,
67
- "[ex69]": 32041,
68
- "[ex6]": 31978,
69
- "[ex70]": 32042,
70
- "[ex71]": 32043,
71
- "[ex72]": 32044,
72
- "[ex73]": 32045,
73
- "[ex74]": 32046,
74
- "[ex75]": 32047,
75
- "[ex76]": 32048,
76
- "[ex77]": 32049,
77
- "[ex78]": 32050,
78
- "[ex79]": 32051,
79
- "[ex7]": 31979,
80
- "[ex80]": 32052,
81
- "[ex81]": 32053,
82
- "[ex82]": 32054,
83
- "[ex83]": 32055,
84
- "[ex84]": 32056,
85
- "[ex85]": 32057,
86
- "[ex86]": 32058,
87
- "[ex87]": 32059,
88
- "[ex88]": 32060,
89
- "[ex89]": 32061,
90
- "[ex8]": 31980,
91
- "[ex90]": 32062,
92
- "[ex91]": 32063,
93
- "[ex92]": 32064,
94
- "[ex93]": 32065,
95
- "[ex94]": 32066,
96
- "[ex95]": 32067,
97
- "[ex96]": 32068,
98
- "[ex97]": 32069,
99
- "[ex98]": 32070,
100
- "[ex99]": 32071,
101
- "[ex9]": 31981,
102
- "[frl]": 31970,
103
- "[mrl]": 31971,
104
- "ext.": 32072,
105
- "int.": 32073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  }
 
1
  {
2
+ "EXT.": 41061,
3
+ "INT.": 41062,
4
+ "[EX0]": 41065,
5
+ "[EX0]:": 41167,
6
+ "[EX10]": 41075,
7
+ "[EX10]:": 41177,
8
+ "[EX11]": 41076,
9
+ "[EX11]:": 41178,
10
+ "[EX12]": 41077,
11
+ "[EX12]:": 41179,
12
+ "[EX13]": 41078,
13
+ "[EX13]:": 41180,
14
+ "[EX14]": 41079,
15
+ "[EX14]:": 41181,
16
+ "[EX15]": 41080,
17
+ "[EX15]:": 41182,
18
+ "[EX16]": 41081,
19
+ "[EX16]:": 41183,
20
+ "[EX17]": 41082,
21
+ "[EX17]:": 41184,
22
+ "[EX18]": 41083,
23
+ "[EX18]:": 41185,
24
+ "[EX19]": 41084,
25
+ "[EX19]:": 41186,
26
+ "[EX1]": 41066,
27
+ "[EX1]:": 41168,
28
+ "[EX20]": 41085,
29
+ "[EX20]:": 41187,
30
+ "[EX21]": 41086,
31
+ "[EX21]:": 41188,
32
+ "[EX22]": 41087,
33
+ "[EX22]:": 41189,
34
+ "[EX23]": 41088,
35
+ "[EX23]:": 41190,
36
+ "[EX24]": 41089,
37
+ "[EX24]:": 41191,
38
+ "[EX25]": 41090,
39
+ "[EX25]:": 41192,
40
+ "[EX26]": 41091,
41
+ "[EX26]:": 41193,
42
+ "[EX27]": 41092,
43
+ "[EX27]:": 41194,
44
+ "[EX28]": 41093,
45
+ "[EX28]:": 41195,
46
+ "[EX29]": 41094,
47
+ "[EX29]:": 41196,
48
+ "[EX2]": 41067,
49
+ "[EX2]:": 41169,
50
+ "[EX30]": 41095,
51
+ "[EX30]:": 41197,
52
+ "[EX31]": 41096,
53
+ "[EX31]:": 41198,
54
+ "[EX32]": 41097,
55
+ "[EX32]:": 41199,
56
+ "[EX33]": 41098,
57
+ "[EX33]:": 41200,
58
+ "[EX34]": 41099,
59
+ "[EX34]:": 41201,
60
+ "[EX35]": 41100,
61
+ "[EX35]:": 41202,
62
+ "[EX36]": 41101,
63
+ "[EX36]:": 41203,
64
+ "[EX37]": 41102,
65
+ "[EX37]:": 41204,
66
+ "[EX38]": 41103,
67
+ "[EX38]:": 41205,
68
+ "[EX39]": 41104,
69
+ "[EX39]:": 41206,
70
+ "[EX3]": 41068,
71
+ "[EX3]:": 41170,
72
+ "[EX40]": 41105,
73
+ "[EX40]:": 41207,
74
+ "[EX41]": 41106,
75
+ "[EX41]:": 41208,
76
+ "[EX42]": 41107,
77
+ "[EX42]:": 41209,
78
+ "[EX43]": 41108,
79
+ "[EX43]:": 41210,
80
+ "[EX44]": 41109,
81
+ "[EX44]:": 41211,
82
+ "[EX45]": 41110,
83
+ "[EX45]:": 41212,
84
+ "[EX46]": 41111,
85
+ "[EX46]:": 41213,
86
+ "[EX47]": 41112,
87
+ "[EX47]:": 41214,
88
+ "[EX48]": 41113,
89
+ "[EX48]:": 41215,
90
+ "[EX49]": 41114,
91
+ "[EX49]:": 41216,
92
+ "[EX4]": 41069,
93
+ "[EX4]:": 41171,
94
+ "[EX50]": 41115,
95
+ "[EX50]:": 41217,
96
+ "[EX51]": 41116,
97
+ "[EX51]:": 41218,
98
+ "[EX52]": 41117,
99
+ "[EX52]:": 41219,
100
+ "[EX53]": 41118,
101
+ "[EX53]:": 41220,
102
+ "[EX54]": 41119,
103
+ "[EX54]:": 41221,
104
+ "[EX55]": 41120,
105
+ "[EX55]:": 41222,
106
+ "[EX56]": 41121,
107
+ "[EX56]:": 41223,
108
+ "[EX57]": 41122,
109
+ "[EX57]:": 41224,
110
+ "[EX58]": 41123,
111
+ "[EX58]:": 41225,
112
+ "[EX59]": 41124,
113
+ "[EX59]:": 41226,
114
+ "[EX5]": 41070,
115
+ "[EX5]:": 41172,
116
+ "[EX60]": 41125,
117
+ "[EX60]:": 41227,
118
+ "[EX61]": 41126,
119
+ "[EX61]:": 41228,
120
+ "[EX62]": 41127,
121
+ "[EX62]:": 41229,
122
+ "[EX63]": 41128,
123
+ "[EX63]:": 41230,
124
+ "[EX64]": 41129,
125
+ "[EX64]:": 41231,
126
+ "[EX65]": 41130,
127
+ "[EX65]:": 41232,
128
+ "[EX66]": 41131,
129
+ "[EX66]:": 41233,
130
+ "[EX67]": 41132,
131
+ "[EX67]:": 41234,
132
+ "[EX68]": 41133,
133
+ "[EX68]:": 41235,
134
+ "[EX69]": 41134,
135
+ "[EX69]:": 41236,
136
+ "[EX6]": 41071,
137
+ "[EX6]:": 41173,
138
+ "[EX70]": 41135,
139
+ "[EX70]:": 41237,
140
+ "[EX71]": 41136,
141
+ "[EX71]:": 41238,
142
+ "[EX72]": 41137,
143
+ "[EX72]:": 41239,
144
+ "[EX73]": 41138,
145
+ "[EX73]:": 41240,
146
+ "[EX74]": 41139,
147
+ "[EX74]:": 41241,
148
+ "[EX75]": 41140,
149
+ "[EX75]:": 41242,
150
+ "[EX76]": 41141,
151
+ "[EX76]:": 41243,
152
+ "[EX77]": 41142,
153
+ "[EX77]:": 41244,
154
+ "[EX78]": 41143,
155
+ "[EX78]:": 41245,
156
+ "[EX79]": 41144,
157
+ "[EX79]:": 41246,
158
+ "[EX7]": 41072,
159
+ "[EX7]:": 41174,
160
+ "[EX80]": 41145,
161
+ "[EX80]:": 41247,
162
+ "[EX81]": 41146,
163
+ "[EX81]:": 41248,
164
+ "[EX82]": 41147,
165
+ "[EX82]:": 41249,
166
+ "[EX83]": 41148,
167
+ "[EX83]:": 41250,
168
+ "[EX84]": 41149,
169
+ "[EX84]:": 41251,
170
+ "[EX85]": 41150,
171
+ "[EX85]:": 41252,
172
+ "[EX86]": 41151,
173
+ "[EX86]:": 41253,
174
+ "[EX87]": 41152,
175
+ "[EX87]:": 41254,
176
+ "[EX88]": 41153,
177
+ "[EX88]:": 41255,
178
+ "[EX89]": 41154,
179
+ "[EX89]:": 41256,
180
+ "[EX8]": 41073,
181
+ "[EX8]:": 41175,
182
+ "[EX90]": 41155,
183
+ "[EX90]:": 41257,
184
+ "[EX91]": 41156,
185
+ "[EX91]:": 41258,
186
+ "[EX92]": 41157,
187
+ "[EX92]:": 41259,
188
+ "[EX93]": 41158,
189
+ "[EX93]:": 41260,
190
+ "[EX94]": 41159,
191
+ "[EX94]:": 41261,
192
+ "[EX95]": 41160,
193
+ "[EX95]:": 41262,
194
+ "[EX96]": 41161,
195
+ "[EX96]:": 41263,
196
+ "[EX97]": 41162,
197
+ "[EX97]:": 41264,
198
+ "[EX98]": 41163,
199
+ "[EX98]:": 41265,
200
+ "[EX99]": 41164,
201
+ "[EX99]:": 41266,
202
+ "[EX9]": 41074,
203
+ "[EX9]:": 41176,
204
+ "[FRL]": 41063,
205
+ "[FRL]:": 41165,
206
+ "[MRL]": 41064,
207
+ "[MRL]:": 41166
208
  }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,6 +1,5 @@
1
  {
2
  "bos_token": "<|endoftext|>",
3
  "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
  "unk_token": "<|endoftext|>"
6
  }
 
1
  {
2
  "bos_token": "<|endoftext|>",
3
  "eos_token": "<|endoftext|>",
 
4
  "unk_token": "<|endoftext|>"
5
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "bos_token": "<|endoftext|>",
4
  "eos_token": "<|endoftext|>",
5
  "model_max_length": 1024,
6
- "name_or_path": "MarkGG/Romance-cleaned-3",
7
  "special_tokens_map_file": null,
8
  "tokenizer_class": "GPT2Tokenizer",
9
  "unk_token": "<|endoftext|>"
 
3
  "bos_token": "<|endoftext|>",
4
  "eos_token": "<|endoftext|>",
5
  "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
  "special_tokens_map_file": null,
8
  "tokenizer_class": "GPT2Tokenizer",
9
  "unk_token": "<|endoftext|>"
vocab.json CHANGED
The diff for this file is too large to render. See raw diff