LegrandFrederic commited on
Commit
18238fa
·
verified ·
1 Parent(s): 2196feb

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +792 -0
trainer_state.json ADDED
@@ -0,0 +1,792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1070,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09345794392523364,
14
+ "grad_norm": 2.035820722579956,
15
+ "learning_rate": 1.6666666666666667e-05,
16
+ "loss": 1.0122,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.18691588785046728,
21
+ "grad_norm": 2.3610706329345703,
22
+ "learning_rate": 3.518518518518519e-05,
23
+ "loss": 0.3526,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.2803738317757009,
28
+ "grad_norm": 1.186436653137207,
29
+ "learning_rate": 5.370370370370371e-05,
30
+ "loss": 0.2888,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.37383177570093457,
35
+ "grad_norm": 2.2752561569213867,
36
+ "learning_rate": 7.222222222222222e-05,
37
+ "loss": 0.2957,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.4672897196261682,
42
+ "grad_norm": 1.9183247089385986,
43
+ "learning_rate": 9.074074074074075e-05,
44
+ "loss": 0.2282,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.5607476635514018,
49
+ "grad_norm": 1.8604793548583984,
50
+ "learning_rate": 9.999402437003975e-05,
51
+ "loss": 0.169,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.6542056074766355,
56
+ "grad_norm": 1.4165388345718384,
57
+ "learning_rate": 9.99462278999732e-05,
58
+ "loss": 0.1587,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.7476635514018691,
63
+ "grad_norm": 1.3570247888565063,
64
+ "learning_rate": 9.985068065535225e-05,
65
+ "loss": 0.1176,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.8411214953271028,
70
+ "grad_norm": 0.8269003033638,
71
+ "learning_rate": 9.970747398351445e-05,
72
+ "loss": 0.1031,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.9345794392523364,
77
+ "grad_norm": 1.9058301448822021,
78
+ "learning_rate": 9.951674479629056e-05,
79
+ "loss": 0.0991,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 1.02803738317757,
84
+ "grad_norm": 1.9390472173690796,
85
+ "learning_rate": 9.927867543911091e-05,
86
+ "loss": 0.0873,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 1.1214953271028036,
91
+ "grad_norm": 0.9300522208213806,
92
+ "learning_rate": 9.899349351667522e-05,
93
+ "loss": 0.0946,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 1.2149532710280373,
98
+ "grad_norm": 0.9239804148674011,
99
+ "learning_rate": 9.866147167535254e-05,
100
+ "loss": 0.0794,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 1.308411214953271,
105
+ "grad_norm": 1.4692777395248413,
106
+ "learning_rate": 9.828292734251944e-05,
107
+ "loss": 0.0898,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 1.4018691588785046,
112
+ "grad_norm": 0.8343071937561035,
113
+ "learning_rate": 9.785822242308562e-05,
114
+ "loss": 0.0819,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 1.4953271028037383,
119
+ "grad_norm": 0.7223255634307861,
120
+ "learning_rate": 9.738776295349687e-05,
121
+ "loss": 0.0803,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 1.588785046728972,
126
+ "grad_norm": 0.7587630748748779,
127
+ "learning_rate": 9.687199871354669e-05,
128
+ "loss": 0.0745,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 1.6822429906542056,
133
+ "grad_norm": 1.022656798362732,
134
+ "learning_rate": 9.631142279636706e-05,
135
+ "loss": 0.0894,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 1.7757009345794392,
140
+ "grad_norm": 0.8869289755821228,
141
+ "learning_rate": 9.570657113700985e-05,
142
+ "loss": 0.0859,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 1.8691588785046729,
147
+ "grad_norm": 0.9170877933502197,
148
+ "learning_rate": 9.50580220000696e-05,
149
+ "loss": 0.0737,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 1.9626168224299065,
154
+ "grad_norm": 1.0185060501098633,
155
+ "learning_rate": 9.436639542683727e-05,
156
+ "loss": 0.0696,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 2.05607476635514,
161
+ "grad_norm": 1.1690869331359863,
162
+ "learning_rate": 9.363235264251369e-05,
163
+ "loss": 0.0635,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 2.149532710280374,
168
+ "grad_norm": 0.940513551235199,
169
+ "learning_rate": 9.285659542404941e-05,
170
+ "loss": 0.072,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 2.2429906542056073,
175
+ "grad_norm": 0.5256121158599854,
176
+ "learning_rate": 9.203986542921532e-05,
177
+ "loss": 0.065,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 2.336448598130841,
182
+ "grad_norm": 0.7171943783760071,
183
+ "learning_rate": 9.11829434875454e-05,
184
+ "loss": 0.0594,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 2.4299065420560746,
189
+ "grad_norm": 0.6012755036354065,
190
+ "learning_rate": 9.02866488538296e-05,
191
+ "loss": 0.0657,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 2.5233644859813085,
196
+ "grad_norm": 0.496242493391037,
197
+ "learning_rate": 8.93518384248705e-05,
198
+ "loss": 0.0712,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 2.616822429906542,
203
+ "grad_norm": 0.7729496955871582,
204
+ "learning_rate": 8.837940592025257e-05,
205
+ "loss": 0.0581,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 2.710280373831776,
210
+ "grad_norm": 0.6967052817344666,
211
+ "learning_rate": 8.737028102790723e-05,
212
+ "loss": 0.065,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 2.803738317757009,
217
+ "grad_norm": 1.2209193706512451,
218
+ "learning_rate": 8.632542851529051e-05,
219
+ "loss": 0.0673,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 2.897196261682243,
224
+ "grad_norm": 0.8657803535461426,
225
+ "learning_rate": 8.524584730702339e-05,
226
+ "loss": 0.0778,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 2.9906542056074765,
231
+ "grad_norm": 0.6701989769935608,
232
+ "learning_rate": 8.413256952987611e-05,
233
+ "loss": 0.0617,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 3.0841121495327104,
238
+ "grad_norm": 0.46317973732948303,
239
+ "learning_rate": 8.298665952600999e-05,
240
+ "loss": 0.061,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 3.177570093457944,
245
+ "grad_norm": 0.549863338470459,
246
+ "learning_rate": 8.180921283541986e-05,
247
+ "loss": 0.0608,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 3.2710280373831777,
252
+ "grad_norm": 0.5789127349853516,
253
+ "learning_rate": 8.060135514854994e-05,
254
+ "loss": 0.0588,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 3.364485981308411,
259
+ "grad_norm": 0.5358679294586182,
260
+ "learning_rate": 7.936424123008464e-05,
261
+ "loss": 0.0584,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 3.457943925233645,
266
+ "grad_norm": 0.682597815990448,
267
+ "learning_rate": 7.809905381494316e-05,
268
+ "loss": 0.0585,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 3.5514018691588785,
273
+ "grad_norm": 0.7718446254730225,
274
+ "learning_rate": 7.68070024775332e-05,
275
+ "loss": 0.0525,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 3.6448598130841123,
280
+ "grad_norm": 0.5267370343208313,
281
+ "learning_rate": 7.548932247534506e-05,
282
+ "loss": 0.0537,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 3.7383177570093458,
287
+ "grad_norm": 0.8042123913764954,
288
+ "learning_rate": 7.414727356799154e-05,
289
+ "loss": 0.0613,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 3.831775700934579,
294
+ "grad_norm": 0.5799658298492432,
295
+ "learning_rate": 7.27821388128227e-05,
296
+ "loss": 0.0535,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 3.925233644859813,
301
+ "grad_norm": 0.8124668598175049,
302
+ "learning_rate": 7.139522333826707e-05,
303
+ "loss": 0.0493,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 4.018691588785047,
308
+ "grad_norm": 0.8771295547485352,
309
+ "learning_rate": 6.99878530960719e-05,
310
+ "loss": 0.0615,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 4.11214953271028,
315
+ "grad_norm": 0.7655673027038574,
316
+ "learning_rate": 6.856137359363533e-05,
317
+ "loss": 0.0605,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 4.205607476635514,
322
+ "grad_norm": 0.6487105488777161,
323
+ "learning_rate": 6.711714860764266e-05,
324
+ "loss": 0.0559,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 4.299065420560748,
329
+ "grad_norm": 0.7439901232719421,
330
+ "learning_rate": 6.565655888023618e-05,
331
+ "loss": 0.0562,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 4.392523364485982,
336
+ "grad_norm": 0.566832423210144,
337
+ "learning_rate": 6.418100079896556e-05,
338
+ "loss": 0.0535,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 4.485981308411215,
343
+ "grad_norm": 0.5109242796897888,
344
+ "learning_rate": 6.269188506178019e-05,
345
+ "loss": 0.0558,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 4.579439252336448,
350
+ "grad_norm": 0.5645363926887512,
351
+ "learning_rate": 6.11906353283405e-05,
352
+ "loss": 0.0445,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 4.672897196261682,
357
+ "grad_norm": 0.7012343406677246,
358
+ "learning_rate": 5.967868685893715e-05,
359
+ "loss": 0.0496,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 4.766355140186916,
364
+ "grad_norm": 0.47865885496139526,
365
+ "learning_rate": 5.815748514231944e-05,
366
+ "loss": 0.0486,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 4.859813084112149,
371
+ "grad_norm": 0.38202741742134094,
372
+ "learning_rate": 5.6628484513745e-05,
373
+ "loss": 0.0412,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 4.953271028037383,
378
+ "grad_norm": 0.7451114058494568,
379
+ "learning_rate": 5.5093146764571866e-05,
380
+ "loss": 0.0561,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 5.046728971962617,
385
+ "grad_norm": 0.29692840576171875,
386
+ "learning_rate": 5.355293974472197e-05,
387
+ "loss": 0.0432,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 5.140186915887851,
392
+ "grad_norm": 0.5470758080482483,
393
+ "learning_rate": 5.2009335959352666e-05,
394
+ "loss": 0.0456,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 5.233644859813084,
399
+ "grad_norm": 0.3530406057834625,
400
+ "learning_rate": 5.046381116107742e-05,
401
+ "loss": 0.05,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 5.327102803738318,
406
+ "grad_norm": 0.5904394388198853,
407
+ "learning_rate": 4.891784293908192e-05,
408
+ "loss": 0.0479,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 5.420560747663552,
413
+ "grad_norm": 0.6387467384338379,
414
+ "learning_rate": 4.7372909306484276e-05,
415
+ "loss": 0.0419,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 5.5140186915887845,
420
+ "grad_norm": 0.3460151255130768,
421
+ "learning_rate": 4.5830487287289966e-05,
422
+ "loss": 0.0457,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 5.607476635514018,
427
+ "grad_norm": 0.6098641157150269,
428
+ "learning_rate": 4.429205150429241e-05,
429
+ "loss": 0.038,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 5.700934579439252,
434
+ "grad_norm": 0.5131182670593262,
435
+ "learning_rate": 4.275907276926918e-05,
436
+ "loss": 0.0518,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 5.794392523364486,
441
+ "grad_norm": 0.5069635510444641,
442
+ "learning_rate": 4.123301667682171e-05,
443
+ "loss": 0.0447,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 5.88785046728972,
448
+ "grad_norm": 0.4461626708507538,
449
+ "learning_rate": 3.971534220320291e-05,
450
+ "loss": 0.0465,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 5.981308411214953,
455
+ "grad_norm": 0.5642176866531372,
456
+ "learning_rate": 3.820750031147211e-05,
457
+ "loss": 0.0415,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 6.074766355140187,
462
+ "grad_norm": 0.525589108467102,
463
+ "learning_rate": 3.67109325643111e-05,
464
+ "loss": 0.0398,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 6.168224299065421,
469
+ "grad_norm": 0.5913786292076111,
470
+ "learning_rate": 3.522706974582717e-05,
471
+ "loss": 0.0421,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 6.261682242990654,
476
+ "grad_norm": 0.44272592663764954,
477
+ "learning_rate": 3.375733049366115e-05,
478
+ "loss": 0.0377,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 6.355140186915888,
483
+ "grad_norm": 0.5991699695587158,
484
+ "learning_rate": 3.2303119942707796e-05,
485
+ "loss": 0.0401,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 6.4485981308411215,
490
+ "grad_norm": 0.4788666069507599,
491
+ "learning_rate": 3.086582838174551e-05,
492
+ "loss": 0.0344,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 6.542056074766355,
497
+ "grad_norm": 0.2347191423177719,
498
+ "learning_rate": 2.944682992425959e-05,
499
+ "loss": 0.0431,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 6.635514018691588,
504
+ "grad_norm": 0.4982701241970062,
505
+ "learning_rate": 2.804748119472969e-05,
506
+ "loss": 0.0431,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 6.728971962616822,
511
+ "grad_norm": 0.3474225103855133,
512
+ "learning_rate": 2.6669120031637663e-05,
513
+ "loss": 0.0312,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 6.822429906542056,
518
+ "grad_norm": 0.484762042760849,
519
+ "learning_rate": 2.5313064208435423e-05,
520
+ "loss": 0.0402,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 6.91588785046729,
525
+ "grad_norm": 0.3601996898651123,
526
+ "learning_rate": 2.3980610173696255e-05,
527
+ "loss": 0.0353,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 7.009345794392523,
532
+ "grad_norm": 0.4373137056827545,
533
+ "learning_rate": 2.2673031811653034e-05,
534
+ "loss": 0.0515,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 7.102803738317757,
539
+ "grad_norm": 0.4686773121356964,
540
+ "learning_rate": 2.139157922430956e-05,
541
+ "loss": 0.042,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 7.196261682242991,
546
+ "grad_norm": 0.4924180507659912,
547
+ "learning_rate": 2.01374775362883e-05,
548
+ "loss": 0.0396,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 7.289719626168225,
553
+ "grad_norm": 0.3904656767845154,
554
+ "learning_rate": 1.8911925723557806e-05,
555
+ "loss": 0.0399,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 7.383177570093458,
560
+ "grad_norm": 0.283329039812088,
561
+ "learning_rate": 1.7716095467159393e-05,
562
+ "loss": 0.0402,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 7.4766355140186915,
567
+ "grad_norm": 0.4036600887775421,
568
+ "learning_rate": 1.6551130033028827e-05,
569
+ "loss": 0.0373,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 7.570093457943925,
574
+ "grad_norm": 0.3786483407020569,
575
+ "learning_rate": 1.541814317898425e-05,
576
+ "loss": 0.0399,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 7.663551401869158,
581
+ "grad_norm": 0.34531161189079285,
582
+ "learning_rate": 1.4318218089924962e-05,
583
+ "loss": 0.04,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 7.757009345794392,
588
+ "grad_norm": 0.3333149254322052,
589
+ "learning_rate": 1.3252406342259527e-05,
590
+ "loss": 0.0328,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 7.850467289719626,
595
+ "grad_norm": 0.35911786556243896,
596
+ "learning_rate": 1.2221726898552665e-05,
597
+ "loss": 0.0359,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 7.94392523364486,
602
+ "grad_norm": 0.37050867080688477,
603
+ "learning_rate": 1.122716513335262e-05,
604
+ "loss": 0.0468,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 8.037383177570094,
609
+ "grad_norm": 0.27867504954338074,
610
+ "learning_rate": 1.0269671891130123e-05,
611
+ "loss": 0.0308,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 8.130841121495328,
616
+ "grad_norm": 0.36876147985458374,
617
+ "learning_rate": 9.350162577229432e-06,
618
+ "loss": 0.0379,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 8.22429906542056,
623
+ "grad_norm": 0.3114546835422516,
624
+ "learning_rate": 8.46951628270098e-06,
625
+ "loss": 0.0354,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 8.317757009345794,
630
+ "grad_norm": 0.26373207569122314,
631
+ "learning_rate": 7.628574943851852e-06,
632
+ "loss": 0.0296,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 8.411214953271028,
637
+ "grad_norm": 0.3879351019859314,
638
+ "learning_rate": 6.82814253731801e-06,
639
+ "loss": 0.0288,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 8.504672897196262,
644
+ "grad_norm": 0.3110659718513489,
645
+ "learning_rate": 6.06898431142745e-06,
646
+ "loss": 0.0293,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 8.598130841121495,
651
+ "grad_norm": 0.21370220184326172,
652
+ "learning_rate": 5.351826054589393e-06,
653
+ "loss": 0.0294,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 8.69158878504673,
658
+ "grad_norm": 0.304156094789505,
659
+ "learning_rate": 4.677353401408974e-06,
660
+ "loss": 0.0323,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 8.785046728971963,
665
+ "grad_norm": 0.30655941367149353,
666
+ "learning_rate": 4.04621117719049e-06,
667
+ "loss": 0.0356,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 8.878504672897197,
672
+ "grad_norm": 0.43658676743507385,
673
+ "learning_rate": 3.459002781456344e-06,
674
+ "loss": 0.0332,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 8.97196261682243,
679
+ "grad_norm": 0.486379474401474,
680
+ "learning_rate": 2.9162896110707163e-06,
681
+ "loss": 0.0336,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 9.065420560747663,
686
+ "grad_norm": 0.35680046677589417,
687
+ "learning_rate": 2.418590523519687e-06,
688
+ "loss": 0.0313,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 9.158878504672897,
693
+ "grad_norm": 0.2002888023853302,
694
+ "learning_rate": 1.9663813408607845e-06,
695
+ "loss": 0.0357,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 9.25233644859813,
700
+ "grad_norm": 0.27867501974105835,
701
+ "learning_rate": 1.5600943948163527e-06,
702
+ "loss": 0.0336,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 9.345794392523365,
707
+ "grad_norm": 0.2867244780063629,
708
+ "learning_rate": 1.2001181134455475e-06,
709
+ "loss": 0.0325,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 9.439252336448599,
714
+ "grad_norm": 0.284397691488266,
715
+ "learning_rate": 8.867966497901282e-07,
716
+ "loss": 0.0278,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 9.532710280373832,
721
+ "grad_norm": 0.24502161145210266,
722
+ "learning_rate": 6.204295528491555e-07,
723
+ "loss": 0.0377,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 9.626168224299064,
728
+ "grad_norm": 0.3403957784175873,
729
+ "learning_rate": 4.012714811970464e-07,
730
+ "loss": 0.0294,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 9.719626168224298,
735
+ "grad_norm": 0.311471551656723,
736
+ "learning_rate": 2.295319595188805e-07,
737
+ "loss": 0.0277,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 9.813084112149532,
742
+ "grad_norm": 0.26186737418174744,
743
+ "learning_rate": 1.0537517829562472e-07,
744
+ "loss": 0.0281,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 9.906542056074766,
749
+ "grad_norm": 0.20402012765407562,
750
+ "learning_rate": 2.8919836830887392e-08,
751
+ "loss": 0.0305,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 10.0,
756
+ "grad_norm": 0.8117493391036987,
757
+ "learning_rate": 2.3902976920009423e-10,
758
+ "loss": 0.0289,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 10.0,
763
+ "step": 1070,
764
+ "total_flos": 0.0,
765
+ "train_loss": 0.07138736554395372,
766
+ "train_runtime": 1117.3117,
767
+ "train_samples_per_second": 46.513,
768
+ "train_steps_per_second": 0.958
769
+ }
770
+ ],
771
+ "logging_steps": 10,
772
+ "max_steps": 1070,
773
+ "num_input_tokens_seen": 0,
774
+ "num_train_epochs": 10,
775
+ "save_steps": 1000,
776
+ "stateful_callbacks": {
777
+ "TrainerControl": {
778
+ "args": {
779
+ "should_epoch_stop": false,
780
+ "should_evaluate": false,
781
+ "should_log": false,
782
+ "should_save": true,
783
+ "should_training_stop": true
784
+ },
785
+ "attributes": {}
786
+ }
787
+ },
788
+ "total_flos": 0.0,
789
+ "train_batch_size": 49,
790
+ "trial_name": null,
791
+ "trial_params": null
792
+ }