burak committed on
Commit
bf11b9d
·
verified ·
1 Parent(s): b1748da

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. config.json +23 -0
  2. model.safetensors +3 -0
  3. tokenizer.json +395 -0
  4. tokenizer_config.json +13 -0
  5. vocab.json +96 -0
config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_layer": 6,
3
+ "n_head": 4,
4
+ "n_embd": 256,
5
+ "max_position_embeddings": 512,
6
+ "vocab_size": 94,
7
+ "model_type": "gpt2",
8
+ "architectures": [
9
+ "GPT2LMHeadModel"
10
+ ],
11
+ "attn_pdrop": 0.1,
12
+ "embd_pdrop": 0.1,
13
+ "resid_pdrop": 0.1,
14
+ "initializer_range": 0.02,
15
+ "layer_norm_epsilon": 1e-05,
16
+ "n_positions": 512,
17
+ "scale_attn_weights": true,
18
+ "use_cache": true,
19
+ "bos_token_id": 1,
20
+ "eos_token_id": 2,
21
+ "pad_token_id": 0,
22
+ "unk_token_id": 3
23
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d74df2a09fe381c9e7bbb9c14fd2defe8ceb3f622a31e98bc1a93ae7ac740d
3
+ size 25876288
tokenizer.json ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<bos>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<eos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<unk>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Sequence",
45
+ "normalizers": [
46
+ {
47
+ "type": "Lowercase"
48
+ },
49
+ {
50
+ "type": "NFKC"
51
+ }
52
+ ]
53
+ },
54
+ "pre_tokenizer": {
55
+ "type": "Whitespace"
56
+ },
57
+ "post_processor": null,
58
+ "decoder": null,
59
+ "model": {
60
+ "type": "BPE",
61
+ "dropout": null,
62
+ "unk_token": "<unk>",
63
+ "continuing_subword_prefix": null,
64
+ "end_of_word_suffix": null,
65
+ "fuse_unk": false,
66
+ "byte_fallback": false,
67
+ "ignore_merges": false,
68
+ "vocab": {
69
+ "<pad>": 0,
70
+ "<bos>": 1,
71
+ "<eos>": 2,
72
+ "<unk>": 3,
73
+ "(": 4,
74
+ ")": 5,
75
+ ",": 6,
76
+ ".": 7,
77
+ "a": 8,
78
+ "b": 9,
79
+ "c": 10,
80
+ "d": 11,
81
+ "e": 12,
82
+ "f": 13,
83
+ "g": 14,
84
+ "h": 15,
85
+ "i": 16,
86
+ "j": 17,
87
+ "k": 18,
88
+ "l": 19,
89
+ "m": 20,
90
+ "n": 21,
91
+ "o": 22,
92
+ "p": 23,
93
+ "r": 24,
94
+ "s": 25,
95
+ "t": 26,
96
+ "u": 27,
97
+ "v": 28,
98
+ "y": 29,
99
+ "z": 30,
100
+ "ç": 31,
101
+ "ö": 32,
102
+ "ü": 33,
103
+ "ğ": 34,
104
+ "ı": 35,
105
+ "ş": 36,
106
+ "el": 37,
107
+ "in": 38,
108
+ "il": 39,
109
+ "er": 40,
110
+ "ir": 41,
111
+ "en": 42,
112
+ "la": 43,
113
+ "me": 44,
114
+ "bir": 45,
115
+ "da": 46,
116
+ "dil": 47,
117
+ "ok": 48,
118
+ "tü": 49,
119
+ "eli": 50,
120
+ "eni": 51,
121
+ "bu": 52,
122
+ "eğ": 53,
123
+ "gü": 54,
124
+ "ha": 55,
125
+ "kç": 56,
126
+ "le": 57,
127
+ "ma": 58,
128
+ "or": 59,
129
+ "yor": 60,
130
+ "ır": 61,
131
+ "eği": 62,
132
+ "ar": 63,
133
+ "ay": 64,
134
+ "ala": 65,
135
+ "du": 66,
136
+ "dır": 67,
137
+ "ek": 68,
138
+ "gin": 69,
139
+ "iç": 70,
140
+ "ldu": 71,
141
+ "mo": 72,
142
+ "ne": 73,
143
+ "oldu": 74,
144
+ "rkç": 75,
145
+ "tir": 76,
146
+ "tok": 77,
147
+ "ve": 78,
148
+ "ya": 79,
149
+ "zel": 80,
150
+ "zer": 81,
151
+ "çok": 82,
152
+ "ştir": 83,
153
+ "türkç": 84,
154
+ "enizer": 85,
155
+ "lem": 86,
156
+ "mak": 87,
157
+ "eğit": 88,
158
+ "alan": 89,
159
+ "için": 90,
160
+ "mod": 91,
161
+ "tokenizer": 92,
162
+ "türkçe": 93
163
+ },
164
+ "merges": [
165
+ [
166
+ "e",
167
+ "l"
168
+ ],
169
+ [
170
+ "i",
171
+ "n"
172
+ ],
173
+ [
174
+ "i",
175
+ "l"
176
+ ],
177
+ [
178
+ "e",
179
+ "r"
180
+ ],
181
+ [
182
+ "i",
183
+ "r"
184
+ ],
185
+ [
186
+ "e",
187
+ "n"
188
+ ],
189
+ [
190
+ "l",
191
+ "a"
192
+ ],
193
+ [
194
+ "m",
195
+ "e"
196
+ ],
197
+ [
198
+ "b",
199
+ "ir"
200
+ ],
201
+ [
202
+ "d",
203
+ "a"
204
+ ],
205
+ [
206
+ "d",
207
+ "il"
208
+ ],
209
+ [
210
+ "o",
211
+ "k"
212
+ ],
213
+ [
214
+ "t",
215
+ "ü"
216
+ ],
217
+ [
218
+ "el",
219
+ "i"
220
+ ],
221
+ [
222
+ "en",
223
+ "i"
224
+ ],
225
+ [
226
+ "b",
227
+ "u"
228
+ ],
229
+ [
230
+ "e",
231
+ "ğ"
232
+ ],
233
+ [
234
+ "g",
235
+ "ü"
236
+ ],
237
+ [
238
+ "h",
239
+ "a"
240
+ ],
241
+ [
242
+ "k",
243
+ "ç"
244
+ ],
245
+ [
246
+ "l",
247
+ "e"
248
+ ],
249
+ [
250
+ "m",
251
+ "a"
252
+ ],
253
+ [
254
+ "o",
255
+ "r"
256
+ ],
257
+ [
258
+ "y",
259
+ "or"
260
+ ],
261
+ [
262
+ "ı",
263
+ "r"
264
+ ],
265
+ [
266
+ "eğ",
267
+ "i"
268
+ ],
269
+ [
270
+ "a",
271
+ "r"
272
+ ],
273
+ [
274
+ "a",
275
+ "y"
276
+ ],
277
+ [
278
+ "a",
279
+ "la"
280
+ ],
281
+ [
282
+ "d",
283
+ "u"
284
+ ],
285
+ [
286
+ "d",
287
+ "ır"
288
+ ],
289
+ [
290
+ "e",
291
+ "k"
292
+ ],
293
+ [
294
+ "g",
295
+ "in"
296
+ ],
297
+ [
298
+ "i",
299
+ "ç"
300
+ ],
301
+ [
302
+ "l",
303
+ "du"
304
+ ],
305
+ [
306
+ "m",
307
+ "o"
308
+ ],
309
+ [
310
+ "n",
311
+ "e"
312
+ ],
313
+ [
314
+ "o",
315
+ "ldu"
316
+ ],
317
+ [
318
+ "r",
319
+ "kç"
320
+ ],
321
+ [
322
+ "t",
323
+ "ir"
324
+ ],
325
+ [
326
+ "t",
327
+ "ok"
328
+ ],
329
+ [
330
+ "v",
331
+ "e"
332
+ ],
333
+ [
334
+ "y",
335
+ "a"
336
+ ],
337
+ [
338
+ "z",
339
+ "el"
340
+ ],
341
+ [
342
+ "z",
343
+ "er"
344
+ ],
345
+ [
346
+ "ç",
347
+ "ok"
348
+ ],
349
+ [
350
+ "ş",
351
+ "tir"
352
+ ],
353
+ [
354
+ "tü",
355
+ "rkç"
356
+ ],
357
+ [
358
+ "eni",
359
+ "zer"
360
+ ],
361
+ [
362
+ "le",
363
+ "m"
364
+ ],
365
+ [
366
+ "ma",
367
+ "k"
368
+ ],
369
+ [
370
+ "eği",
371
+ "t"
372
+ ],
373
+ [
374
+ "ala",
375
+ "n"
376
+ ],
377
+ [
378
+ "iç",
379
+ "in"
380
+ ],
381
+ [
382
+ "mo",
383
+ "d"
384
+ ],
385
+ [
386
+ "tok",
387
+ "enizer"
388
+ ],
389
+ [
390
+ "türkç",
391
+ "e"
392
+ ]
393
+ ]
394
+ }
395
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_max_length": 512,
3
+ "add_prefix_space": false,
4
+ "bos_token": "<bos>",
5
+ "eos_token": "<eos>",
6
+ "unk_token": "<unk>",
7
+ "pad_token": "<pad>",
8
+ "errors": "replace",
9
+ "tokenizer_class": "PreTrainedTokenizerFast",
10
+ "name_or_path": "SykoLLM-0.1B-HF",
11
+ "do_lower_case": true,
12
+ "special_tokens_map_file": null
13
+ }
vocab.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ldu": 71,
3
+ "b": 9,
4
+ "i\u00e7": 70,
5
+ "t\u00fcrk\u00e7": 84,
6
+ "g\u00fc": 54,
7
+ "ne": 73,
8
+ "el": 37,
9
+ "tok": 77,
10
+ "ek": 68,
11
+ "ala": 65,
12
+ "mod": 91,
13
+ "enizer": 85,
14
+ "in": 38,
15
+ "ya": 79,
16
+ "<bos>": 1,
17
+ "n": 21,
18
+ "l": 19,
19
+ "\u00e7": 31,
20
+ "tir": 76,
21
+ "ha": 55,
22
+ "ar": 63,
23
+ "le": 57,
24
+ "<eos>": 2,
25
+ "d": 11,
26
+ "ma": 58,
27
+ "bu": 52,
28
+ "p": 23,
29
+ "r": 24,
30
+ "e": 12,
31
+ ",": 6,
32
+ "eli": 50,
33
+ "(": 4,
34
+ "\u00e7ok": 82,
35
+ "y": 29,
36
+ "<unk>": 3,
37
+ "la": 43,
38
+ "k\u00e7": 56,
39
+ ")": 5,
40
+ "lem": 86,
41
+ "j": 17,
42
+ "tokenizer": 92,
43
+ "t\u00fcrk\u00e7e": 93,
44
+ "e\u011f": 53,
45
+ "s": 25,
46
+ "gin": 69,
47
+ "ir": 41,
48
+ "g": 14,
49
+ "k": 18,
50
+ "\u00fc": 33,
51
+ "d\u0131r": 67,
52
+ "me": 44,
53
+ "eni": 51,
54
+ ".": 7,
55
+ "z": 30,
56
+ "e\u011fit": 88,
57
+ "h": 15,
58
+ "du": 66,
59
+ "yor": 60,
60
+ "u": 27,
61
+ "mak": 87,
62
+ "mo": 72,
63
+ "er": 40,
64
+ "i\u00e7in": 90,
65
+ "a": 8,
66
+ "da": 46,
67
+ "\u015f": 36,
68
+ "t": 26,
69
+ "ve": 78,
70
+ "t\u00fc": 49,
71
+ "\u0131": 35,
72
+ "dil": 47,
73
+ "c": 10,
74
+ "ok": 48,
75
+ "\u015ftir": 83,
76
+ "rk\u00e7": 75,
77
+ "bir": 45,
78
+ "alan": 89,
79
+ "il": 39,
80
+ "\u0131r": 61,
81
+ "\u00f6": 32,
82
+ "m": 20,
83
+ "oldu": 74,
84
+ "\u011f": 34,
85
+ "zel": 80,
86
+ "v": 28,
87
+ "zer": 81,
88
+ "i": 16,
89
+ "e\u011fi": 62,
90
+ "f": 13,
91
+ "<pad>": 0,
92
+ "ay": 64,
93
+ "en": 42,
94
+ "or": 59,
95
+ "o": 22
96
+ }