MrMoeeee commited on
Commit
dda69c6
·
verified ·
1 Parent(s): e254718

Upload logs/checkpoint-800/trainer_state.json with huggingface_hub

Browse files
logs/checkpoint-800/trainer_state.json ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 550,
3
+ "best_metric": 0.027798546478152275,
4
+ "best_model_checkpoint": "/workspace/lampAI/finetuning/outputs/lamp-qwen-1.5b-full/checkpoint-550",
5
+ "epoch": 5.634920634920634,
6
+ "eval_steps": 50,
7
+ "global_step": 800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.07054673721340388,
14
+ "grad_norm": 4.5,
15
+ "learning_rate": 1.267605633802817e-05,
16
+ "loss": 1.1558,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.14109347442680775,
21
+ "grad_norm": 1.1953125,
22
+ "learning_rate": 2.676056338028169e-05,
23
+ "loss": 0.4274,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.21164021164021163,
28
+ "grad_norm": 0.302734375,
29
+ "learning_rate": 4.0845070422535214e-05,
30
+ "loss": 0.0549,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.2821869488536155,
35
+ "grad_norm": 1.21875,
36
+ "learning_rate": 5.492957746478874e-05,
37
+ "loss": 0.0471,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.3527336860670194,
42
+ "grad_norm": 0.2119140625,
43
+ "learning_rate": 6.901408450704226e-05,
44
+ "loss": 0.0402,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.3527336860670194,
49
+ "eval_loss": 0.03969533368945122,
50
+ "eval_runtime": 6.3012,
51
+ "eval_samples_per_second": 40.151,
52
+ "eval_steps_per_second": 10.157,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 0.42328042328042326,
57
+ "grad_norm": 0.193359375,
58
+ "learning_rate": 8.309859154929578e-05,
59
+ "loss": 0.0377,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 0.49382716049382713,
64
+ "grad_norm": 0.1787109375,
65
+ "learning_rate": 9.718309859154931e-05,
66
+ "loss": 0.0343,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 0.564373897707231,
71
+ "grad_norm": 0.19921875,
72
+ "learning_rate": 0.00011126760563380282,
73
+ "loss": 0.0356,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.6349206349206349,
78
+ "grad_norm": 3.1875,
79
+ "learning_rate": 0.00012535211267605635,
80
+ "loss": 0.0855,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 0.7054673721340388,
85
+ "grad_norm": 0.66796875,
86
+ "learning_rate": 0.00013943661971830987,
87
+ "loss": 0.0868,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.7054673721340388,
92
+ "eval_loss": 0.05627487599849701,
93
+ "eval_runtime": 6.311,
94
+ "eval_samples_per_second": 40.089,
95
+ "eval_steps_per_second": 10.141,
96
+ "step": 100
97
+ },
98
+ {
99
+ "epoch": 0.7760141093474426,
100
+ "grad_norm": 0.8046875,
101
+ "learning_rate": 0.00015352112676056339,
102
+ "loss": 0.0538,
103
+ "step": 110
104
+ },
105
+ {
106
+ "epoch": 0.8465608465608465,
107
+ "grad_norm": 0.5234375,
108
+ "learning_rate": 0.0001676056338028169,
109
+ "loss": 0.0561,
110
+ "step": 120
111
+ },
112
+ {
113
+ "epoch": 0.9171075837742504,
114
+ "grad_norm": 1.8828125,
115
+ "learning_rate": 0.00018169014084507045,
116
+ "loss": 0.0641,
117
+ "step": 130
118
+ },
119
+ {
120
+ "epoch": 0.9876543209876543,
121
+ "grad_norm": 37.0,
122
+ "learning_rate": 0.00019577464788732396,
123
+ "loss": 0.0863,
124
+ "step": 140
125
+ },
126
+ {
127
+ "epoch": 1.056437389770723,
128
+ "grad_norm": 1.0546875,
129
+ "learning_rate": 0.00019999667815369528,
130
+ "loss": 0.3147,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 1.056437389770723,
135
+ "eval_loss": 0.10409189015626907,
136
+ "eval_runtime": 6.0488,
137
+ "eval_samples_per_second": 41.826,
138
+ "eval_steps_per_second": 10.581,
139
+ "step": 150
140
+ },
141
+ {
142
+ "epoch": 1.126984126984127,
143
+ "grad_norm": 0.8671875,
144
+ "learning_rate": 0.00019998040841735952,
145
+ "loss": 0.0868,
146
+ "step": 160
147
+ },
148
+ {
149
+ "epoch": 1.1975308641975309,
150
+ "grad_norm": 0.2109375,
151
+ "learning_rate": 0.00019995058285912832,
152
+ "loss": 0.0527,
153
+ "step": 170
154
+ },
155
+ {
156
+ "epoch": 1.2680776014109347,
157
+ "grad_norm": 0.1435546875,
158
+ "learning_rate": 0.00019990720552289347,
159
+ "loss": 0.0418,
160
+ "step": 180
161
+ },
162
+ {
163
+ "epoch": 1.3386243386243386,
164
+ "grad_norm": 0.09912109375,
165
+ "learning_rate": 0.00019985028228996173,
166
+ "loss": 0.0328,
167
+ "step": 190
168
+ },
169
+ {
170
+ "epoch": 1.4091710758377425,
171
+ "grad_norm": 0.12890625,
172
+ "learning_rate": 0.00019977982087825713,
173
+ "loss": 0.0389,
174
+ "step": 200
175
+ },
176
+ {
177
+ "epoch": 1.4091710758377425,
178
+ "eval_loss": 0.03491974622011185,
179
+ "eval_runtime": 6.042,
180
+ "eval_samples_per_second": 41.874,
181
+ "eval_steps_per_second": 10.593,
182
+ "step": 200
183
+ },
184
+ {
185
+ "epoch": 1.4797178130511464,
186
+ "grad_norm": 0.10498046875,
187
+ "learning_rate": 0.00019969583084127485,
188
+ "loss": 0.0338,
189
+ "step": 210
190
+ },
191
+ {
192
+ "epoch": 1.5502645502645502,
193
+ "grad_norm": 0.0859375,
194
+ "learning_rate": 0.00019959832356678583,
195
+ "loss": 0.033,
196
+ "step": 220
197
+ },
198
+ {
199
+ "epoch": 1.620811287477954,
200
+ "grad_norm": 0.12255859375,
201
+ "learning_rate": 0.00019948731227529258,
202
+ "loss": 0.0313,
203
+ "step": 230
204
+ },
205
+ {
206
+ "epoch": 1.691358024691358,
207
+ "grad_norm": 0.10400390625,
208
+ "learning_rate": 0.00019936281201823688,
209
+ "loss": 0.0288,
210
+ "step": 240
211
+ },
212
+ {
213
+ "epoch": 1.7619047619047619,
214
+ "grad_norm": 0.1396484375,
215
+ "learning_rate": 0.00019922483967595893,
216
+ "loss": 0.0322,
217
+ "step": 250
218
+ },
219
+ {
220
+ "epoch": 1.7619047619047619,
221
+ "eval_loss": 0.03154641017317772,
222
+ "eval_runtime": 6.0077,
223
+ "eval_samples_per_second": 42.113,
224
+ "eval_steps_per_second": 10.653,
225
+ "step": 250
226
+ },
227
+ {
228
+ "epoch": 1.8324514991181657,
229
+ "grad_norm": 0.095703125,
230
+ "learning_rate": 0.00019907341395540877,
231
+ "loss": 0.0279,
232
+ "step": 260
233
+ },
234
+ {
235
+ "epoch": 1.9029982363315696,
236
+ "grad_norm": 0.10107421875,
237
+ "learning_rate": 0.00019890855538760974,
238
+ "loss": 0.0293,
239
+ "step": 270
240
+ },
241
+ {
242
+ "epoch": 1.9735449735449735,
243
+ "grad_norm": 0.1015625,
244
+ "learning_rate": 0.00019873028632487474,
245
+ "loss": 0.029,
246
+ "step": 280
247
+ },
248
+ {
249
+ "epoch": 2.0423280423280423,
250
+ "grad_norm": 0.1025390625,
251
+ "learning_rate": 0.0001985386309377759,
252
+ "loss": 0.0265,
253
+ "step": 290
254
+ },
255
+ {
256
+ "epoch": 2.112874779541446,
257
+ "grad_norm": 0.11962890625,
258
+ "learning_rate": 0.0001983336152118671,
259
+ "loss": 0.0256,
260
+ "step": 300
261
+ },
262
+ {
263
+ "epoch": 2.112874779541446,
264
+ "eval_loss": 0.029803331941366196,
265
+ "eval_runtime": 5.9968,
266
+ "eval_samples_per_second": 42.189,
267
+ "eval_steps_per_second": 10.672,
268
+ "step": 300
269
+ },
270
+ {
271
+ "epoch": 2.18342151675485,
272
+ "grad_norm": 0.09130859375,
273
+ "learning_rate": 0.0001981152669441609,
274
+ "loss": 0.0247,
275
+ "step": 310
276
+ },
277
+ {
278
+ "epoch": 2.253968253968254,
279
+ "grad_norm": 0.10009765625,
280
+ "learning_rate": 0.00019788361573935958,
281
+ "loss": 0.0257,
282
+ "step": 320
283
+ },
284
+ {
285
+ "epoch": 2.324514991181658,
286
+ "grad_norm": 0.10498046875,
287
+ "learning_rate": 0.00019763869300584128,
288
+ "loss": 0.0254,
289
+ "step": 330
290
+ },
291
+ {
292
+ "epoch": 2.3950617283950617,
293
+ "grad_norm": 0.0986328125,
294
+ "learning_rate": 0.00019738053195140148,
295
+ "loss": 0.023,
296
+ "step": 340
297
+ },
298
+ {
299
+ "epoch": 2.4656084656084656,
300
+ "grad_norm": 0.0888671875,
301
+ "learning_rate": 0.00019710916757875052,
302
+ "loss": 0.0243,
303
+ "step": 350
304
+ },
305
+ {
306
+ "epoch": 2.4656084656084656,
307
+ "eval_loss": 0.029007520526647568,
308
+ "eval_runtime": 6.0663,
309
+ "eval_samples_per_second": 41.706,
310
+ "eval_steps_per_second": 10.55,
311
+ "step": 350
312
+ },
313
+ {
314
+ "epoch": 2.5361552028218695,
315
+ "grad_norm": 0.099609375,
316
+ "learning_rate": 0.0001968246366807677,
317
+ "loss": 0.0232,
318
+ "step": 360
319
+ },
320
+ {
321
+ "epoch": 2.6067019400352733,
322
+ "grad_norm": 0.10595703125,
323
+ "learning_rate": 0.0001965269778355129,
324
+ "loss": 0.0253,
325
+ "step": 370
326
+ },
327
+ {
328
+ "epoch": 2.677248677248677,
329
+ "grad_norm": 0.0927734375,
330
+ "learning_rate": 0.00019621623140099578,
331
+ "loss": 0.0253,
332
+ "step": 380
333
+ },
334
+ {
335
+ "epoch": 2.747795414462081,
336
+ "grad_norm": 0.099609375,
337
+ "learning_rate": 0.00019589243950970402,
338
+ "loss": 0.0231,
339
+ "step": 390
340
+ },
341
+ {
342
+ "epoch": 2.818342151675485,
343
+ "grad_norm": 0.08984375,
344
+ "learning_rate": 0.0001955556460628906,
345
+ "loss": 0.0237,
346
+ "step": 400
347
+ },
348
+ {
349
+ "epoch": 2.818342151675485,
350
+ "eval_loss": 0.027823707088828087,
351
+ "eval_runtime": 6.0126,
352
+ "eval_samples_per_second": 42.078,
353
+ "eval_steps_per_second": 10.644,
354
+ "step": 400
355
+ },
356
+ {
357
+ "epoch": 2.888888888888889,
358
+ "grad_norm": 0.10009765625,
359
+ "learning_rate": 0.0001952058967246217,
360
+ "loss": 0.025,
361
+ "step": 410
362
+ },
363
+ {
364
+ "epoch": 2.9594356261022927,
365
+ "grad_norm": 0.11572265625,
366
+ "learning_rate": 0.00019484323891558506,
367
+ "loss": 0.0238,
368
+ "step": 420
369
+ },
370
+ {
371
+ "epoch": 3.0282186948853616,
372
+ "grad_norm": 0.083984375,
373
+ "learning_rate": 0.00019446772180666084,
374
+ "loss": 0.0206,
375
+ "step": 430
376
+ },
377
+ {
378
+ "epoch": 3.0987654320987654,
379
+ "grad_norm": 0.0986328125,
380
+ "learning_rate": 0.00019407939631225439,
381
+ "loss": 0.019,
382
+ "step": 440
383
+ },
384
+ {
385
+ "epoch": 3.1693121693121693,
386
+ "grad_norm": 0.10400390625,
387
+ "learning_rate": 0.00019367831508339327,
388
+ "loss": 0.0186,
389
+ "step": 450
390
+ },
391
+ {
392
+ "epoch": 3.1693121693121693,
393
+ "eval_loss": 0.029222065582871437,
394
+ "eval_runtime": 5.9948,
395
+ "eval_samples_per_second": 42.203,
396
+ "eval_steps_per_second": 10.676,
397
+ "step": 450
398
+ },
399
+ {
400
+ "epoch": 3.239858906525573,
401
+ "grad_norm": 0.09130859375,
402
+ "learning_rate": 0.0001932645325005885,
403
+ "loss": 0.0167,
404
+ "step": 460
405
+ },
406
+ {
407
+ "epoch": 3.310405643738977,
408
+ "grad_norm": 0.10107421875,
409
+ "learning_rate": 0.0001928381046664615,
410
+ "loss": 0.0208,
411
+ "step": 470
412
+ },
413
+ {
414
+ "epoch": 3.380952380952381,
415
+ "grad_norm": 0.119140625,
416
+ "learning_rate": 0.00019239908939813722,
417
+ "loss": 0.0197,
418
+ "step": 480
419
+ },
420
+ {
421
+ "epoch": 3.451499118165785,
422
+ "grad_norm": 0.10009765625,
423
+ "learning_rate": 0.0001919475462194052,
424
+ "loss": 0.0199,
425
+ "step": 490
426
+ },
427
+ {
428
+ "epoch": 3.5220458553791887,
429
+ "grad_norm": 0.0908203125,
430
+ "learning_rate": 0.00019148353635264895,
431
+ "loss": 0.0209,
432
+ "step": 500
433
+ },
434
+ {
435
+ "epoch": 3.5220458553791887,
436
+ "eval_loss": 0.028565241023898125,
437
+ "eval_runtime": 6.5453,
438
+ "eval_samples_per_second": 38.654,
439
+ "eval_steps_per_second": 9.778,
440
+ "step": 500
441
+ },
442
+ {
443
+ "epoch": 3.5925925925925926,
444
+ "grad_norm": 0.1103515625,
445
+ "learning_rate": 0.00019100712271054516,
446
+ "loss": 0.0215,
447
+ "step": 510
448
+ },
449
+ {
450
+ "epoch": 3.6631393298059964,
451
+ "grad_norm": 0.10400390625,
452
+ "learning_rate": 0.00019051836988753372,
453
+ "loss": 0.022,
454
+ "step": 520
455
+ },
456
+ {
457
+ "epoch": 3.7336860670194003,
458
+ "grad_norm": 0.10791015625,
459
+ "learning_rate": 0.0001900173441510597,
460
+ "loss": 0.02,
461
+ "step": 530
462
+ },
463
+ {
464
+ "epoch": 3.804232804232804,
465
+ "grad_norm": 0.0732421875,
466
+ "learning_rate": 0.00018950411343258842,
467
+ "loss": 0.0201,
468
+ "step": 540
469
+ },
470
+ {
471
+ "epoch": 3.874779541446208,
472
+ "grad_norm": 0.0849609375,
473
+ "learning_rate": 0.00018897874731839504,
474
+ "loss": 0.0208,
475
+ "step": 550
476
+ },
477
+ {
478
+ "epoch": 3.874779541446208,
479
+ "eval_loss": 0.027798546478152275,
480
+ "eval_runtime": 6.1662,
481
+ "eval_samples_per_second": 41.03,
482
+ "eval_steps_per_second": 10.379,
483
+ "step": 550
484
+ },
485
+ {
486
+ "epoch": 3.945326278659612,
487
+ "grad_norm": 0.10302734375,
488
+ "learning_rate": 0.00018844131704012968,
489
+ "loss": 0.0202,
490
+ "step": 560
491
+ },
492
+ {
493
+ "epoch": 4.01410934744268,
494
+ "grad_norm": 0.08984375,
495
+ "learning_rate": 0.00018789189546515958,
496
+ "loss": 0.0199,
497
+ "step": 570
498
+ },
499
+ {
500
+ "epoch": 4.084656084656085,
501
+ "grad_norm": 0.09912109375,
502
+ "learning_rate": 0.00018733055708668926,
503
+ "loss": 0.0143,
504
+ "step": 580
505
+ },
506
+ {
507
+ "epoch": 4.155202821869489,
508
+ "grad_norm": 0.0927734375,
509
+ "learning_rate": 0.00018675737801366056,
510
+ "loss": 0.0141,
511
+ "step": 590
512
+ },
513
+ {
514
+ "epoch": 4.225749559082892,
515
+ "grad_norm": 0.09912109375,
516
+ "learning_rate": 0.00018617243596043314,
517
+ "loss": 0.0158,
518
+ "step": 600
519
+ },
520
+ {
521
+ "epoch": 4.225749559082892,
522
+ "eval_loss": 0.02914673648774624,
523
+ "eval_runtime": 6.1782,
524
+ "eval_samples_per_second": 40.95,
525
+ "eval_steps_per_second": 10.359,
526
+ "step": 600
527
+ },
528
+ {
529
+ "epoch": 4.296296296296296,
530
+ "grad_norm": 0.11279296875,
531
+ "learning_rate": 0.00018557581023624788,
532
+ "loss": 0.0158,
533
+ "step": 610
534
+ },
535
+ {
536
+ "epoch": 4.3668430335097,
537
+ "grad_norm": 0.11083984375,
538
+ "learning_rate": 0.00018496758173447368,
539
+ "loss": 0.0166,
540
+ "step": 620
541
+ },
542
+ {
543
+ "epoch": 4.4373897707231045,
544
+ "grad_norm": 0.09521484375,
545
+ "learning_rate": 0.0001843478329216394,
546
+ "loss": 0.0171,
547
+ "step": 630
548
+ },
549
+ {
550
+ "epoch": 4.507936507936508,
551
+ "grad_norm": 0.0859375,
552
+ "learning_rate": 0.00018371664782625287,
553
+ "loss": 0.0164,
554
+ "step": 640
555
+ },
556
+ {
557
+ "epoch": 4.578483245149911,
558
+ "grad_norm": 0.09130859375,
559
+ "learning_rate": 0.00018307411202740773,
560
+ "loss": 0.0175,
561
+ "step": 650
562
+ },
563
+ {
564
+ "epoch": 4.578483245149911,
565
+ "eval_loss": 0.02929055318236351,
566
+ "eval_runtime": 6.0557,
567
+ "eval_samples_per_second": 41.779,
568
+ "eval_steps_per_second": 10.568,
569
+ "step": 650
570
+ },
571
+ {
572
+ "epoch": 4.649029982363316,
573
+ "grad_norm": 0.1083984375,
574
+ "learning_rate": 0.00018242031264318026,
575
+ "loss": 0.0179,
576
+ "step": 660
577
+ },
578
+ {
579
+ "epoch": 4.71957671957672,
580
+ "grad_norm": 0.1015625,
581
+ "learning_rate": 0.00018175533831881757,
582
+ "loss": 0.0175,
583
+ "step": 670
584
+ },
585
+ {
586
+ "epoch": 4.790123456790123,
587
+ "grad_norm": 0.1015625,
588
+ "learning_rate": 0.0001810792792147186,
589
+ "loss": 0.0177,
590
+ "step": 680
591
+ },
592
+ {
593
+ "epoch": 4.860670194003527,
594
+ "grad_norm": 0.09033203125,
595
+ "learning_rate": 0.00018039222699420965,
596
+ "loss": 0.0177,
597
+ "step": 690
598
+ },
599
+ {
600
+ "epoch": 4.931216931216931,
601
+ "grad_norm": 0.11328125,
602
+ "learning_rate": 0.00017969427481111643,
603
+ "loss": 0.0184,
604
+ "step": 700
605
+ },
606
+ {
607
+ "epoch": 4.931216931216931,
608
+ "eval_loss": 0.028933366760611534,
609
+ "eval_runtime": 6.028,
610
+ "eval_samples_per_second": 41.971,
611
+ "eval_steps_per_second": 10.617,
612
+ "step": 700
613
+ },
614
+ {
615
+ "epoch": 5.0,
616
+ "grad_norm": 0.126953125,
617
+ "learning_rate": 0.00017898551729713362,
618
+ "loss": 0.0166,
619
+ "step": 710
620
+ },
621
+ {
622
+ "epoch": 5.070546737213403,
623
+ "grad_norm": 0.10302734375,
624
+ "learning_rate": 0.00017826605054899433,
625
+ "loss": 0.0125,
626
+ "step": 720
627
+ },
628
+ {
629
+ "epoch": 5.141093474426808,
630
+ "grad_norm": 0.09228515625,
631
+ "learning_rate": 0.00017753597211544092,
632
+ "loss": 0.0139,
633
+ "step": 730
634
+ },
635
+ {
636
+ "epoch": 5.211640211640212,
637
+ "grad_norm": 0.09521484375,
638
+ "learning_rate": 0.0001767953809839987,
639
+ "loss": 0.0136,
640
+ "step": 740
641
+ },
642
+ {
643
+ "epoch": 5.2821869488536155,
644
+ "grad_norm": 0.103515625,
645
+ "learning_rate": 0.00017604437756755498,
646
+ "loss": 0.014,
647
+ "step": 750
648
+ },
649
+ {
650
+ "epoch": 5.2821869488536155,
651
+ "eval_loss": 0.030887732282280922,
652
+ "eval_runtime": 6.0316,
653
+ "eval_samples_per_second": 41.946,
654
+ "eval_steps_per_second": 10.611,
655
+ "step": 750
656
+ },
657
+ {
658
+ "epoch": 5.352733686067019,
659
+ "grad_norm": 0.087890625,
660
+ "learning_rate": 0.0001752830636907443,
661
+ "loss": 0.0139,
662
+ "step": 760
663
+ },
664
+ {
665
+ "epoch": 5.423280423280423,
666
+ "grad_norm": 0.1201171875,
667
+ "learning_rate": 0.00017451154257614287,
668
+ "loss": 0.0142,
669
+ "step": 770
670
+ },
671
+ {
672
+ "epoch": 5.493827160493828,
673
+ "grad_norm": 0.1025390625,
674
+ "learning_rate": 0.00017372991883027287,
675
+ "loss": 0.0142,
676
+ "step": 780
677
+ },
678
+ {
679
+ "epoch": 5.564373897707231,
680
+ "grad_norm": 0.0947265625,
681
+ "learning_rate": 0.00017293829842941972,
682
+ "loss": 0.0146,
683
+ "step": 790
684
+ },
685
+ {
686
+ "epoch": 5.634920634920634,
687
+ "grad_norm": 0.1025390625,
688
+ "learning_rate": 0.00017213678870526292,
689
+ "loss": 0.0147,
690
+ "step": 800
691
+ },
692
+ {
693
+ "epoch": 5.634920634920634,
694
+ "eval_loss": 0.02992367185652256,
695
+ "eval_runtime": 6.0048,
696
+ "eval_samples_per_second": 42.133,
697
+ "eval_steps_per_second": 10.658,
698
+ "step": 800
699
+ }
700
+ ],
701
+ "logging_steps": 10,
702
+ "max_steps": 2840,
703
+ "num_input_tokens_seen": 0,
704
+ "num_train_epochs": 20,
705
+ "save_steps": 50,
706
+ "stateful_callbacks": {
707
+ "EarlyStoppingCallback": {
708
+ "args": {
709
+ "early_stopping_patience": 5,
710
+ "early_stopping_threshold": 0.0
711
+ },
712
+ "attributes": {
713
+ "early_stopping_patience_counter": 5
714
+ }
715
+ },
716
+ "TrainerControl": {
717
+ "args": {
718
+ "should_epoch_stop": false,
719
+ "should_evaluate": false,
720
+ "should_log": false,
721
+ "should_save": true,
722
+ "should_training_stop": true
723
+ },
724
+ "attributes": {}
725
+ }
726
+ },
727
+ "total_flos": 1.8482800577851085e+17,
728
+ "train_batch_size": 4,
729
+ "trial_name": null,
730
+ "trial_params": null
731
+ }