iko-01 commited on
Commit
e1e9081
·
verified ·
1 Parent(s): 4176dda

رفع النموذج النهائي مع جميع الملفات للتجربة أو إعادة التدريب

Browse files
config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 0,
11
+ "gradient_checkpointing": false,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": 0,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50,
35
+ "no_repeat_ngram_size": 3,
36
+ "num_beams": 5,
37
+ "repetition_penalty": 3.0,
38
+ "top_p": 0.95
39
+ }
40
+ },
41
+ "transformers_version": "4.56.0",
42
+ "use_cache": true,
43
+ "vocab_size": 64000
44
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "4.56.0"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5def632837c285d909161e542fd2a3c41b3ecafe1e77712bea28e7a66daae047
3
+ size 539992704
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933e0c20e1d62997ca58ab98e5ca94ebc0ce7b0c622256bd729eb3b75c20f714
3
+ size 1080081803
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26a9274ad13b7d2e725ff1896667490eda6c083e720a3018f1a5b085f5a75a1e
3
+ size 14645
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2d3396f942f473413beff20277555965d2cc6e97c03e3c93bf435f2f369868d
3
+ size 1383
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82aa95ff07aa3cbc8cc5a6f7944f2fd087c326b3a5b8f66cc4c1f7fc711fafd4
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "</s>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ }
36
+ },
37
+ "bos_token": "<|endoftext|>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "<|endoftext|>",
40
+ "extra_special_tokens": {},
41
+ "max_length": 256,
42
+ "model_max_length": 1000000000000000019884624838656,
43
+ "pad_to_multiple_of": null,
44
+ "pad_token": "<|endoftext|>",
45
+ "pad_token_type_id": 0,
46
+ "padding_side": "right",
47
+ "stride": 0,
48
+ "tokenizer_class": "GPT2Tokenizer",
49
+ "truncation_side": "right",
50
+ "truncation_strategy": "longest_first",
51
+ "unk_token": "<|endoftext|>"
52
+ }
trainer_state.json ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 6000,
3
+ "best_metric": 1.4575997591018677,
4
+ "best_model_checkpoint": "/content/marocAI-finetuned/checkpoint-2821/checkpoint-6000",
5
+ "epoch": 7.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6062,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.4618937644341801,
14
+ "grad_norm": 1.0584869384765625,
15
+ "learning_rate": 1.695852534562212e-05,
16
+ "loss": 2.9443,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.9237875288683602,
21
+ "grad_norm": 1.051153302192688,
22
+ "learning_rate": 1.3886328725038403e-05,
23
+ "loss": 2.3079,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 1.3833718244803694,
28
+ "grad_norm": 0.9572842121124268,
29
+ "learning_rate": 1.0814132104454686e-05,
30
+ "loss": 2.2004,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 1.8452655889145495,
35
+ "grad_norm": 0.7911381125450134,
36
+ "learning_rate": 7.741935483870968e-06,
37
+ "loss": 2.0987,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 2.304849884526559,
42
+ "grad_norm": 1.0212643146514893,
43
+ "learning_rate": 4.669738863287251e-06,
44
+ "loss": 2.0724,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 2.304849884526559,
49
+ "eval_loss": 1.7146347761154175,
50
+ "eval_runtime": 3.6485,
51
+ "eval_samples_per_second": 50.157,
52
+ "eval_steps_per_second": 12.608,
53
+ "step": 500
54
+ },
55
+ {
56
+ "epoch": 2.766743648960739,
57
+ "grad_norm": 1.0736603736877441,
58
+ "learning_rate": 1.5975422427035332e-06,
59
+ "loss": 2.0376,
60
+ "step": 600
61
+ },
62
+ {
63
+ "epoch": 3.2263279445727484,
64
+ "grad_norm": 0.960295557975769,
65
+ "learning_rate": 9.262672811059909e-06,
66
+ "loss": 2.0108,
67
+ "step": 700
68
+ },
69
+ {
70
+ "epoch": 3.6882217090069283,
71
+ "grad_norm": 0.8508874773979187,
72
+ "learning_rate": 7.726574500768049e-06,
73
+ "loss": 2.0039,
74
+ "step": 800
75
+ },
76
+ {
77
+ "epoch": 4.147806004618937,
78
+ "grad_norm": 0.9244301319122314,
79
+ "learning_rate": 6.1904761904761914e-06,
80
+ "loss": 2.0071,
81
+ "step": 900
82
+ },
83
+ {
84
+ "epoch": 4.609699769053118,
85
+ "grad_norm": 0.8824007511138916,
86
+ "learning_rate": 4.654377880184332e-06,
87
+ "loss": 1.9621,
88
+ "step": 1000
89
+ },
90
+ {
91
+ "epoch": 4.609699769053118,
92
+ "eval_loss": 1.6397035121917725,
93
+ "eval_runtime": 3.6176,
94
+ "eval_samples_per_second": 50.586,
95
+ "eval_steps_per_second": 12.716,
96
+ "step": 1000
97
+ },
98
+ {
99
+ "epoch": 5.069284064665127,
100
+ "grad_norm": 0.8873424530029297,
101
+ "learning_rate": 3.1182795698924735e-06,
102
+ "loss": 1.9262,
103
+ "step": 1100
104
+ },
105
+ {
106
+ "epoch": 5.531177829099307,
107
+ "grad_norm": 0.8406194448471069,
108
+ "learning_rate": 1.5821812596006145e-06,
109
+ "loss": 1.9235,
110
+ "step": 1200
111
+ },
112
+ {
113
+ "epoch": 5.993071593533488,
114
+ "grad_norm": 0.8757996559143066,
115
+ "learning_rate": 4.608294930875576e-08,
116
+ "loss": 1.9402,
117
+ "step": 1300
118
+ },
119
+ {
120
+ "epoch": 6.452655889145497,
121
+ "grad_norm": 0.8397168517112732,
122
+ "learning_rate": 1.0081531371853954e-05,
123
+ "loss": 1.9086,
124
+ "step": 1400
125
+ },
126
+ {
127
+ "epoch": 6.914549653579677,
128
+ "grad_norm": 0.8500417470932007,
129
+ "learning_rate": 9.372562920950018e-06,
130
+ "loss": 1.9123,
131
+ "step": 1500
132
+ },
133
+ {
134
+ "epoch": 6.914549653579677,
135
+ "eval_loss": 1.5952860116958618,
136
+ "eval_runtime": 3.6287,
137
+ "eval_samples_per_second": 50.432,
138
+ "eval_steps_per_second": 12.677,
139
+ "step": 1500
140
+ },
141
+ {
142
+ "epoch": 7.374133949191686,
143
+ "grad_norm": 0.8373268246650696,
144
+ "learning_rate": 8.663594470046084e-06,
145
+ "loss": 1.8666,
146
+ "step": 1600
147
+ },
148
+ {
149
+ "epoch": 7.836027713625866,
150
+ "grad_norm": 0.8652852773666382,
151
+ "learning_rate": 7.954626019142148e-06,
152
+ "loss": 1.8377,
153
+ "step": 1700
154
+ },
155
+ {
156
+ "epoch": 8.295612009237875,
157
+ "grad_norm": 0.8815492391586304,
158
+ "learning_rate": 7.245657568238214e-06,
159
+ "loss": 1.8395,
160
+ "step": 1800
161
+ },
162
+ {
163
+ "epoch": 8.757505773672055,
164
+ "grad_norm": 0.8464174270629883,
165
+ "learning_rate": 6.53668911733428e-06,
166
+ "loss": 1.8285,
167
+ "step": 1900
168
+ },
169
+ {
170
+ "epoch": 9.217090069284065,
171
+ "grad_norm": 0.818134069442749,
172
+ "learning_rate": 5.827720666430344e-06,
173
+ "loss": 1.8244,
174
+ "step": 2000
175
+ },
176
+ {
177
+ "epoch": 9.217090069284065,
178
+ "eval_loss": 1.5474414825439453,
179
+ "eval_runtime": 3.7454,
180
+ "eval_samples_per_second": 48.86,
181
+ "eval_steps_per_second": 12.282,
182
+ "step": 2000
183
+ },
184
+ {
185
+ "epoch": 9.678983833718245,
186
+ "grad_norm": 0.8570533394813538,
187
+ "learning_rate": 5.118752215526409e-06,
188
+ "loss": 1.8044,
189
+ "step": 2100
190
+ },
191
+ {
192
+ "epoch": 10.138568129330254,
193
+ "grad_norm": 0.8370910286903381,
194
+ "learning_rate": 4.409783764622475e-06,
195
+ "loss": 1.7769,
196
+ "step": 2200
197
+ },
198
+ {
199
+ "epoch": 10.600461893764434,
200
+ "grad_norm": 0.8950196504592896,
201
+ "learning_rate": 3.70081531371854e-06,
202
+ "loss": 1.7914,
203
+ "step": 2300
204
+ },
205
+ {
206
+ "epoch": 11.060046189376443,
207
+ "grad_norm": 0.7839242815971375,
208
+ "learning_rate": 2.9918468628146054e-06,
209
+ "loss": 1.7767,
210
+ "step": 2400
211
+ },
212
+ {
213
+ "epoch": 11.521939953810623,
214
+ "grad_norm": 0.8934968709945679,
215
+ "learning_rate": 2.28287841191067e-06,
216
+ "loss": 1.7807,
217
+ "step": 2500
218
+ },
219
+ {
220
+ "epoch": 11.521939953810623,
221
+ "eval_loss": 1.5255564451217651,
222
+ "eval_runtime": 3.6222,
223
+ "eval_samples_per_second": 50.522,
224
+ "eval_steps_per_second": 12.7,
225
+ "step": 2500
226
+ },
227
+ {
228
+ "epoch": 11.983833718244803,
229
+ "grad_norm": 0.9384580254554749,
230
+ "learning_rate": 1.5739099610067355e-06,
231
+ "loss": 1.7571,
232
+ "step": 2600
233
+ },
234
+ {
235
+ "epoch": 12.443418013856814,
236
+ "grad_norm": 0.838097333908081,
237
+ "learning_rate": 8.649415101028006e-07,
238
+ "loss": 1.7684,
239
+ "step": 2700
240
+ },
241
+ {
242
+ "epoch": 12.905311778290994,
243
+ "grad_norm": 0.807694673538208,
244
+ "learning_rate": 1.5597305919886567e-07,
245
+ "loss": 1.7643,
246
+ "step": 2800
247
+ },
248
+ {
249
+ "epoch": 3.348729792147806,
250
+ "grad_norm": 1.560190200805664,
251
+ "learning_rate": 1.0610533378061055e-05,
252
+ "loss": 1.7522,
253
+ "step": 2900
254
+ },
255
+ {
256
+ "epoch": 3.464203233256351,
257
+ "grad_norm": 1.5953682661056519,
258
+ "learning_rate": 1.0275075478027507e-05,
259
+ "loss": 1.7661,
260
+ "step": 3000
261
+ },
262
+ {
263
+ "epoch": 3.464203233256351,
264
+ "eval_loss": 1.5233831405639648,
265
+ "eval_runtime": 3.6827,
266
+ "eval_samples_per_second": 49.692,
267
+ "eval_steps_per_second": 12.491,
268
+ "step": 3000
269
+ },
270
+ {
271
+ "epoch": 3.579676674364896,
272
+ "grad_norm": 1.5370293855667114,
273
+ "learning_rate": 9.939617577993964e-06,
274
+ "loss": 1.7759,
275
+ "step": 3100
276
+ },
277
+ {
278
+ "epoch": 3.695150115473441,
279
+ "grad_norm": 1.470035195350647,
280
+ "learning_rate": 9.604159677960416e-06,
281
+ "loss": 1.7367,
282
+ "step": 3200
283
+ },
284
+ {
285
+ "epoch": 3.8106235565819864,
286
+ "grad_norm": 1.5553827285766602,
287
+ "learning_rate": 9.26870177792687e-06,
288
+ "loss": 1.7614,
289
+ "step": 3300
290
+ },
291
+ {
292
+ "epoch": 3.9260969976905313,
293
+ "grad_norm": 1.7353712320327759,
294
+ "learning_rate": 8.933243877893324e-06,
295
+ "loss": 1.7301,
296
+ "step": 3400
297
+ },
298
+ {
299
+ "epoch": 4.041570438799076,
300
+ "grad_norm": 1.8023557662963867,
301
+ "learning_rate": 8.59778597785978e-06,
302
+ "loss": 1.7797,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 4.041570438799076,
307
+ "eval_loss": 1.5006413459777832,
308
+ "eval_runtime": 3.6376,
309
+ "eval_samples_per_second": 50.308,
310
+ "eval_steps_per_second": 12.646,
311
+ "step": 3500
312
+ },
313
+ {
314
+ "epoch": 4.157043879907621,
315
+ "grad_norm": 1.4707646369934082,
316
+ "learning_rate": 8.262328077826235e-06,
317
+ "loss": 1.7436,
318
+ "step": 3600
319
+ },
320
+ {
321
+ "epoch": 4.272517321016166,
322
+ "grad_norm": 1.8790034055709839,
323
+ "learning_rate": 7.926870177792688e-06,
324
+ "loss": 1.7703,
325
+ "step": 3700
326
+ },
327
+ {
328
+ "epoch": 4.387990762124711,
329
+ "grad_norm": 1.7127020359039307,
330
+ "learning_rate": 7.591412277759142e-06,
331
+ "loss": 1.7297,
332
+ "step": 3800
333
+ },
334
+ {
335
+ "epoch": 4.503464203233256,
336
+ "grad_norm": 1.575723648071289,
337
+ "learning_rate": 7.255954377725596e-06,
338
+ "loss": 1.6714,
339
+ "step": 3900
340
+ },
341
+ {
342
+ "epoch": 4.618937644341801,
343
+ "grad_norm": 1.6164535284042358,
344
+ "learning_rate": 6.92049647769205e-06,
345
+ "loss": 1.6929,
346
+ "step": 4000
347
+ },
348
+ {
349
+ "epoch": 4.618937644341801,
350
+ "eval_loss": 1.4842592477798462,
351
+ "eval_runtime": 3.769,
352
+ "eval_samples_per_second": 48.554,
353
+ "eval_steps_per_second": 12.205,
354
+ "step": 4000
355
+ },
356
+ {
357
+ "epoch": 4.734411085450346,
358
+ "grad_norm": 1.4100682735443115,
359
+ "learning_rate": 6.585038577658505e-06,
360
+ "loss": 1.7042,
361
+ "step": 4100
362
+ },
363
+ {
364
+ "epoch": 4.849884526558892,
365
+ "grad_norm": 1.533423900604248,
366
+ "learning_rate": 6.249580677624959e-06,
367
+ "loss": 1.6724,
368
+ "step": 4200
369
+ },
370
+ {
371
+ "epoch": 4.965357967667437,
372
+ "grad_norm": 1.7051324844360352,
373
+ "learning_rate": 5.9141227775914126e-06,
374
+ "loss": 1.6685,
375
+ "step": 4300
376
+ },
377
+ {
378
+ "epoch": 5.080831408775982,
379
+ "grad_norm": 1.5413386821746826,
380
+ "learning_rate": 5.578664877557867e-06,
381
+ "loss": 1.6784,
382
+ "step": 4400
383
+ },
384
+ {
385
+ "epoch": 5.196304849884527,
386
+ "grad_norm": 1.7100459337234497,
387
+ "learning_rate": 5.243206977524321e-06,
388
+ "loss": 1.6237,
389
+ "step": 4500
390
+ },
391
+ {
392
+ "epoch": 5.196304849884527,
393
+ "eval_loss": 1.4744157791137695,
394
+ "eval_runtime": 3.6566,
395
+ "eval_samples_per_second": 50.047,
396
+ "eval_steps_per_second": 12.58,
397
+ "step": 4500
398
+ },
399
+ {
400
+ "epoch": 5.311778290993072,
401
+ "grad_norm": 1.8523712158203125,
402
+ "learning_rate": 4.907749077490776e-06,
403
+ "loss": 1.7114,
404
+ "step": 4600
405
+ },
406
+ {
407
+ "epoch": 5.427251732101617,
408
+ "grad_norm": 1.578623652458191,
409
+ "learning_rate": 4.572291177457229e-06,
410
+ "loss": 1.6396,
411
+ "step": 4700
412
+ },
413
+ {
414
+ "epoch": 5.542725173210162,
415
+ "grad_norm": 1.6069693565368652,
416
+ "learning_rate": 4.2368332774236835e-06,
417
+ "loss": 1.6942,
418
+ "step": 4800
419
+ },
420
+ {
421
+ "epoch": 5.658198614318707,
422
+ "grad_norm": 1.2671387195587158,
423
+ "learning_rate": 3.901375377390138e-06,
424
+ "loss": 1.6999,
425
+ "step": 4900
426
+ },
427
+ {
428
+ "epoch": 5.773672055427252,
429
+ "grad_norm": 1.5692400932312012,
430
+ "learning_rate": 3.5659174773565918e-06,
431
+ "loss": 1.6395,
432
+ "step": 5000
433
+ },
434
+ {
435
+ "epoch": 5.773672055427252,
436
+ "eval_loss": 1.464585542678833,
437
+ "eval_runtime": 3.6703,
438
+ "eval_samples_per_second": 49.859,
439
+ "eval_steps_per_second": 12.533,
440
+ "step": 5000
441
+ },
442
+ {
443
+ "epoch": 5.8891454965357966,
444
+ "grad_norm": 1.6615720987319946,
445
+ "learning_rate": 3.230459577323046e-06,
446
+ "loss": 1.6839,
447
+ "step": 5100
448
+ },
449
+ {
450
+ "epoch": 6.0046189376443415,
451
+ "grad_norm": 1.6811989545822144,
452
+ "learning_rate": 2.8950016772895005e-06,
453
+ "loss": 1.7058,
454
+ "step": 5200
455
+ },
456
+ {
457
+ "epoch": 6.1200923787528865,
458
+ "grad_norm": 1.569676399230957,
459
+ "learning_rate": 2.5595437772559544e-06,
460
+ "loss": 1.6426,
461
+ "step": 5300
462
+ },
463
+ {
464
+ "epoch": 6.235565819861431,
465
+ "grad_norm": 1.753227949142456,
466
+ "learning_rate": 2.2240858772224088e-06,
467
+ "loss": 1.646,
468
+ "step": 5400
469
+ },
470
+ {
471
+ "epoch": 6.351039260969977,
472
+ "grad_norm": 1.3525090217590332,
473
+ "learning_rate": 1.888627977188863e-06,
474
+ "loss": 1.6184,
475
+ "step": 5500
476
+ },
477
+ {
478
+ "epoch": 6.351039260969977,
479
+ "eval_loss": 1.459729790687561,
480
+ "eval_runtime": 3.6443,
481
+ "eval_samples_per_second": 50.216,
482
+ "eval_steps_per_second": 12.623,
483
+ "step": 5500
484
+ },
485
+ {
486
+ "epoch": 6.466512702078522,
487
+ "grad_norm": 1.6190038919448853,
488
+ "learning_rate": 1.553170077155317e-06,
489
+ "loss": 1.6743,
490
+ "step": 5600
491
+ },
492
+ {
493
+ "epoch": 6.581986143187067,
494
+ "grad_norm": 1.5956363677978516,
495
+ "learning_rate": 1.2177121771217714e-06,
496
+ "loss": 1.6474,
497
+ "step": 5700
498
+ },
499
+ {
500
+ "epoch": 6.697459584295612,
501
+ "grad_norm": 1.6586838960647583,
502
+ "learning_rate": 8.822542770882254e-07,
503
+ "loss": 1.7052,
504
+ "step": 5800
505
+ },
506
+ {
507
+ "epoch": 6.812933025404157,
508
+ "grad_norm": 1.3021780252456665,
509
+ "learning_rate": 5.467963770546797e-07,
510
+ "loss": 1.6333,
511
+ "step": 5900
512
+ },
513
+ {
514
+ "epoch": 6.928406466512702,
515
+ "grad_norm": 1.7689718008041382,
516
+ "learning_rate": 2.1133847702113386e-07,
517
+ "loss": 1.6687,
518
+ "step": 6000
519
+ },
520
+ {
521
+ "epoch": 6.928406466512702,
522
+ "eval_loss": 1.4575997591018677,
523
+ "eval_runtime": 3.7418,
524
+ "eval_samples_per_second": 48.907,
525
+ "eval_steps_per_second": 12.294,
526
+ "step": 6000
527
+ }
528
+ ],
529
+ "logging_steps": 100,
530
+ "max_steps": 6062,
531
+ "num_input_tokens_seen": 0,
532
+ "num_train_epochs": 7,
533
+ "save_steps": 500,
534
+ "stateful_callbacks": {
535
+ "TrainerControl": {
536
+ "args": {
537
+ "should_epoch_stop": false,
538
+ "should_evaluate": false,
539
+ "should_log": false,
540
+ "should_save": true,
541
+ "should_training_stop": true
542
+ },
543
+ "attributes": {}
544
+ }
545
+ },
546
+ "total_flos": 1.5149450723328e+16,
547
+ "train_batch_size": 4,
548
+ "trial_name": null,
549
+ "trial_params": null
550
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c17d2228b577266c47e480c4ec0c7788cd60eeeee97cfb09f976efef00ac030
3
+ size 5777
vocab.json ADDED
The diff for this file is too large to render. See raw diff