PuxAI commited on
Commit
6c0362e
·
verified ·
1 Parent(s): be6b1bc

Upload logs/train_log_T5Gemma-2B_seed456.json with huggingface_hub

Browse files
logs/train_log_T5Gemma-2B_seed456.json ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss": 1.6751,
4
+ "grad_norm": 35.5,
5
+ "learning_rate": 1.8e-05,
6
+ "epoch": 0.06369426751592357,
7
+ "step": 10
8
+ },
9
+ {
10
+ "loss": 0.753,
11
+ "grad_norm": 52.5,
12
+ "learning_rate": 1.9993345687502566e-05,
13
+ "epoch": 0.12738853503184713,
14
+ "step": 20
15
+ },
16
+ {
17
+ "loss": 0.5992,
18
+ "grad_norm": 30.25,
19
+ "learning_rate": 1.9970354495287275e-05,
20
+ "epoch": 0.1910828025477707,
21
+ "step": 30
22
+ },
23
+ {
24
+ "loss": 0.5004,
25
+ "grad_norm": 28.625,
26
+ "learning_rate": 1.993098203681623e-05,
27
+ "epoch": 0.25477707006369427,
28
+ "step": 40
29
+ },
30
+ {
31
+ "loss": 0.4852,
32
+ "grad_norm": 30.0,
33
+ "learning_rate": 1.987529300093532e-05,
34
+ "epoch": 0.3184713375796178,
35
+ "step": 50
36
+ },
37
+ {
38
+ "loss": 0.5795,
39
+ "grad_norm": 18.875,
40
+ "learning_rate": 1.9803378884584266e-05,
41
+ "epoch": 0.3821656050955414,
42
+ "step": 60
43
+ },
44
+ {
45
+ "loss": 0.5235,
46
+ "grad_norm": 30.375,
47
+ "learning_rate": 1.97153578424674e-05,
48
+ "epoch": 0.445859872611465,
49
+ "step": 70
50
+ },
51
+ {
52
+ "loss": 0.4305,
53
+ "grad_norm": 21.375,
54
+ "learning_rate": 1.96113744929258e-05,
55
+ "epoch": 0.5095541401273885,
56
+ "step": 80
57
+ },
58
+ {
59
+ "loss": 0.5519,
60
+ "grad_norm": 29.375,
61
+ "learning_rate": 1.949159968032972e-05,
62
+ "epoch": 0.5732484076433121,
63
+ "step": 90
64
+ },
65
+ {
66
+ "loss": 0.4587,
67
+ "grad_norm": 24.0,
68
+ "learning_rate": 1.935623019438176e-05,
69
+ "epoch": 0.6369426751592356,
70
+ "step": 100
71
+ },
72
+ {
73
+ "loss": 0.5376,
74
+ "grad_norm": 116.5,
75
+ "learning_rate": 1.9205488446791873e-05,
76
+ "epoch": 0.7006369426751592,
77
+ "step": 110
78
+ },
79
+ {
80
+ "loss": 0.476,
81
+ "grad_norm": 16.875,
82
+ "learning_rate": 1.903962210585554e-05,
83
+ "epoch": 0.7643312101910829,
84
+ "step": 120
85
+ },
86
+ {
87
+ "loss": 0.4523,
88
+ "grad_norm": 12.8125,
89
+ "learning_rate": 1.885890368953539e-05,
90
+ "epoch": 0.8280254777070064,
91
+ "step": 130
92
+ },
93
+ {
94
+ "loss": 0.4212,
95
+ "grad_norm": 32.5,
96
+ "learning_rate": 1.8663630117714904e-05,
97
+ "epoch": 0.89171974522293,
98
+ "step": 140
99
+ },
100
+ {
101
+ "loss": 0.4637,
102
+ "grad_norm": 14.625,
103
+ "learning_rate": 1.845412222435984e-05,
104
+ "epoch": 0.9554140127388535,
105
+ "step": 150
106
+ },
107
+ {
108
+ "loss": 0.3916,
109
+ "grad_norm": 53.25,
110
+ "learning_rate": 1.823072423038886e-05,
111
+ "epoch": 1.019108280254777,
112
+ "step": 160
113
+ },
114
+ {
115
+ "loss": 0.3953,
116
+ "grad_norm": 14.875,
117
+ "learning_rate": 1.7993803178119457e-05,
118
+ "epoch": 1.0828025477707006,
119
+ "step": 170
120
+ },
121
+ {
122
+ "loss": 0.3769,
123
+ "grad_norm": 17.125,
124
+ "learning_rate": 1.7743748328218412e-05,
125
+ "epoch": 1.1464968152866242,
126
+ "step": 180
127
+ },
128
+ {
129
+ "loss": 0.3165,
130
+ "grad_norm": 12.5,
131
+ "learning_rate": 1.7480970520147522e-05,
132
+ "epoch": 1.2101910828025477,
133
+ "step": 190
134
+ },
135
+ {
136
+ "loss": 0.3242,
137
+ "grad_norm": 110.0,
138
+ "learning_rate": 1.7205901497155445e-05,
139
+ "epoch": 1.2738853503184713,
140
+ "step": 200
141
+ },
142
+ {
143
+ "loss": 0.387,
144
+ "grad_norm": 45.0,
145
+ "learning_rate": 1.691899319692469e-05,
146
+ "epoch": 1.3375796178343948,
147
+ "step": 210
148
+ },
149
+ {
150
+ "loss": 0.3033,
151
+ "grad_norm": 5.0625,
152
+ "learning_rate": 1.6620717009039175e-05,
153
+ "epoch": 1.4012738853503186,
154
+ "step": 220
155
+ },
156
+ {
157
+ "loss": 0.3157,
158
+ "grad_norm": 29.0,
159
+ "learning_rate": 1.631156300049238e-05,
160
+ "epoch": 1.4649681528662422,
161
+ "step": 230
162
+ },
163
+ {
164
+ "loss": 0.254,
165
+ "grad_norm": 13.4375,
166
+ "learning_rate": 1.5992039110508586e-05,
167
+ "epoch": 1.5286624203821657,
168
+ "step": 240
169
+ },
170
+ {
171
+ "loss": 0.3212,
172
+ "grad_norm": 25.625,
173
+ "learning_rate": 1.566267031600003e-05,
174
+ "epoch": 1.5923566878980893,
175
+ "step": 250
176
+ },
177
+ {
178
+ "loss": 0.3061,
179
+ "grad_norm": 9.6875,
180
+ "learning_rate": 1.532399776903124e-05,
181
+ "epoch": 1.6560509554140128,
182
+ "step": 260
183
+ },
184
+ {
185
+ "loss": 0.2912,
186
+ "grad_norm": 28.125,
187
+ "learning_rate": 1.4976577907707603e-05,
188
+ "epoch": 1.7197452229299364,
189
+ "step": 270
190
+ },
191
+ {
192
+ "loss": 0.3149,
193
+ "grad_norm": 21.75,
194
+ "learning_rate": 1.4620981541949024e-05,
195
+ "epoch": 1.78343949044586,
196
+ "step": 280
197
+ },
198
+ {
199
+ "loss": 0.2663,
200
+ "grad_norm": 8.625,
201
+ "learning_rate": 1.4257792915650728e-05,
202
+ "epoch": 1.8471337579617835,
203
+ "step": 290
204
+ },
205
+ {
206
+ "loss": 0.272,
207
+ "grad_norm": 40.25,
208
+ "learning_rate": 1.3887608746772064e-05,
209
+ "epoch": 1.910828025477707,
210
+ "step": 300
211
+ },
212
+ {
213
+ "loss": 0.2492,
214
+ "grad_norm": 12.6875,
215
+ "learning_rate": 1.3511037246930476e-05,
216
+ "epoch": 1.9745222929936306,
217
+ "step": 310
218
+ },
219
+ {
220
+ "loss": 0.2056,
221
+ "grad_norm": 11.1875,
222
+ "learning_rate": 1.312869712211141e-05,
223
+ "epoch": 2.038216560509554,
224
+ "step": 320
225
+ },
226
+ {
227
+ "loss": 0.1938,
228
+ "grad_norm": 18.25,
229
+ "learning_rate": 1.2741216556136e-05,
230
+ "epoch": 2.1019108280254777,
231
+ "step": 330
232
+ },
233
+ {
234
+ "loss": 0.1736,
235
+ "grad_norm": 8.8125,
236
+ "learning_rate": 1.2349232178556703e-05,
237
+ "epoch": 2.1656050955414012,
238
+ "step": 340
239
+ },
240
+ {
241
+ "loss": 0.1291,
242
+ "grad_norm": 9.875,
243
+ "learning_rate": 1.1953388018676618e-05,
244
+ "epoch": 2.229299363057325,
245
+ "step": 350
246
+ },
247
+ {
248
+ "loss": 0.2206,
249
+ "grad_norm": 7.8125,
250
+ "learning_rate": 1.1554334447411019e-05,
251
+ "epoch": 2.2929936305732483,
252
+ "step": 360
253
+ },
254
+ {
255
+ "loss": 0.1709,
256
+ "grad_norm": 5.0,
257
+ "learning_rate": 1.1152727108729675e-05,
258
+ "epoch": 2.356687898089172,
259
+ "step": 370
260
+ },
261
+ {
262
+ "loss": 0.1411,
263
+ "grad_norm": 8.25,
264
+ "learning_rate": 1.0749225842435498e-05,
265
+ "epoch": 2.4203821656050954,
266
+ "step": 380
267
+ },
268
+ {
269
+ "loss": 0.1655,
270
+ "grad_norm": 4.8125,
271
+ "learning_rate": 1.0344493600049509e-05,
272
+ "epoch": 2.484076433121019,
273
+ "step": 390
274
+ },
275
+ {
276
+ "loss": 0.1753,
277
+ "grad_norm": 8.4375,
278
+ "learning_rate": 9.939195355583199e-06,
279
+ "epoch": 2.5477707006369426,
280
+ "step": 400
281
+ },
282
+ {
283
+ "loss": 0.113,
284
+ "grad_norm": 3.15625,
285
+ "learning_rate": 9.53399701298801e-06,
286
+ "epoch": 2.611464968152866,
287
+ "step": 410
288
+ },
289
+ {
290
+ "loss": 0.1698,
291
+ "grad_norm": 8.3125,
292
+ "learning_rate": 9.129564312076861e-06,
293
+ "epoch": 2.6751592356687897,
294
+ "step": 420
295
+ },
296
+ {
297
+ "loss": 0.1449,
298
+ "grad_norm": 8.125,
299
+ "learning_rate": 8.726561734715388e-06,
300
+ "epoch": 2.738853503184713,
301
+ "step": 430
302
+ },
303
+ {
304
+ "loss": 0.1286,
305
+ "grad_norm": 5.9375,
306
+ "learning_rate": 8.325651413080003e-06,
307
+ "epoch": 2.802547770700637,
308
+ "step": 440
309
+ },
310
+ {
311
+ "loss": 0.1644,
312
+ "grad_norm": 18.75,
313
+ "learning_rate": 7.927492041776452e-06,
314
+ "epoch": 2.8662420382165603,
315
+ "step": 450
316
+ },
317
+ {
318
+ "loss": 0.1375,
319
+ "grad_norm": 12.5,
320
+ "learning_rate": 7.532737795606299e-06,
321
+ "epoch": 2.9299363057324843,
322
+ "step": 460
323
+ },
324
+ {
325
+ "loss": 0.1238,
326
+ "grad_norm": 5.9375,
327
+ "learning_rate": 7.142037254759469e-06,
328
+ "epoch": 2.9936305732484074,
329
+ "step": 470
330
+ },
331
+ {
332
+ "loss": 0.0463,
333
+ "grad_norm": 1.390625,
334
+ "learning_rate": 6.756032339198675e-06,
335
+ "epoch": 3.0573248407643314,
336
+ "step": 480
337
+ },
338
+ {
339
+ "loss": 0.0913,
340
+ "grad_norm": 9.75,
341
+ "learning_rate": 6.375357253986614e-06,
342
+ "epoch": 3.121019108280255,
343
+ "step": 490
344
+ },
345
+ {
346
+ "loss": 0.0608,
347
+ "grad_norm": 4.375,
348
+ "learning_rate": 6.000637447288719e-06,
349
+ "epoch": 3.1847133757961785,
350
+ "step": 500
351
+ },
352
+ {
353
+ "loss": 0.0731,
354
+ "grad_norm": 3.203125,
355
+ "learning_rate": 5.6324885827634665e-06,
356
+ "epoch": 3.248407643312102,
357
+ "step": 510
358
+ },
359
+ {
360
+ "loss": 0.0545,
361
+ "grad_norm": 2.140625,
362
+ "learning_rate": 5.271515528028592e-06,
363
+ "epoch": 3.3121019108280256,
364
+ "step": 520
365
+ },
366
+ {
367
+ "loss": 0.0846,
368
+ "grad_norm": 2.765625,
369
+ "learning_rate": 4.91831136086518e-06,
370
+ "epoch": 3.375796178343949,
371
+ "step": 530
372
+ },
373
+ {
374
+ "loss": 0.0621,
375
+ "grad_norm": 3.296875,
376
+ "learning_rate": 4.573456394792446e-06,
377
+ "epoch": 3.4394904458598727,
378
+ "step": 540
379
+ },
380
+ {
381
+ "loss": 0.0646,
382
+ "grad_norm": 6.75,
383
+ "learning_rate": 4.237517225614138e-06,
384
+ "epoch": 3.5031847133757963,
385
+ "step": 550
386
+ },
387
+ {
388
+ "loss": 0.0716,
389
+ "grad_norm": 8.625,
390
+ "learning_rate": 3.911045800503101e-06,
391
+ "epoch": 3.56687898089172,
392
+ "step": 560
393
+ },
394
+ {
395
+ "loss": 0.0643,
396
+ "grad_norm": 7.40625,
397
+ "learning_rate": 3.5945785111535547e-06,
398
+ "epoch": 3.6305732484076434,
399
+ "step": 570
400
+ },
401
+ {
402
+ "loss": 0.0596,
403
+ "grad_norm": 2.953125,
404
+ "learning_rate": 3.2886353124909267e-06,
405
+ "epoch": 3.694267515923567,
406
+ "step": 580
407
+ },
408
+ {
409
+ "loss": 0.0705,
410
+ "grad_norm": 9.375,
411
+ "learning_rate": 2.9937188683872796e-06,
412
+ "epoch": 3.7579617834394905,
413
+ "step": 590
414
+ },
415
+ {
416
+ "loss": 0.0563,
417
+ "grad_norm": 3.359375,
418
+ "learning_rate": 2.7103137257858867e-06,
419
+ "epoch": 3.821656050955414,
420
+ "step": 600
421
+ },
422
+ {
423
+ "loss": 0.0605,
424
+ "grad_norm": 2.390625,
425
+ "learning_rate": 2.438885518591878e-06,
426
+ "epoch": 3.8853503184713376,
427
+ "step": 610
428
+ },
429
+ {
430
+ "loss": 0.053,
431
+ "grad_norm": 1.7421875,
432
+ "learning_rate": 2.179880202636966e-06,
433
+ "epoch": 3.949044585987261,
434
+ "step": 620
435
+ },
436
+ {
437
+ "loss": 0.0379,
438
+ "grad_norm": 0.462890625,
439
+ "learning_rate": 1.9337233229751686e-06,
440
+ "epoch": 4.012738853503185,
441
+ "step": 630
442
+ },
443
+ {
444
+ "loss": 0.0183,
445
+ "grad_norm": 1.46875,
446
+ "learning_rate": 1.7008193147134167e-06,
447
+ "epoch": 4.076433121019108,
448
+ "step": 640
449
+ },
450
+ {
451
+ "loss": 0.0349,
452
+ "grad_norm": 4.9375,
453
+ "learning_rate": 1.4815508385257316e-06,
454
+ "epoch": 4.140127388535032,
455
+ "step": 650
456
+ },
457
+ {
458
+ "loss": 0.0266,
459
+ "grad_norm": 0.6640625,
460
+ "learning_rate": 1.2762781519427324e-06,
461
+ "epoch": 4.203821656050955,
462
+ "step": 660
463
+ },
464
+ {
465
+ "loss": 0.0173,
466
+ "grad_norm": 1.9765625,
467
+ "learning_rate": 1.0853385174494758e-06,
468
+ "epoch": 4.267515923566879,
469
+ "step": 670
470
+ },
471
+ {
472
+ "loss": 0.022,
473
+ "grad_norm": 5.21875,
474
+ "learning_rate": 9.090456483640686e-07,
475
+ "epoch": 4.3312101910828025,
476
+ "step": 680
477
+ },
478
+ {
479
+ "loss": 0.0218,
480
+ "grad_norm": 0.291015625,
481
+ "learning_rate": 7.476891934075126e-07,
482
+ "epoch": 4.3949044585987265,
483
+ "step": 690
484
+ },
485
+ {
486
+ "loss": 0.0392,
487
+ "grad_norm": 0.2236328125,
488
+ "learning_rate": 6.015342608116092e-07,
489
+ "epoch": 4.45859872611465,
490
+ "step": 700
491
+ },
492
+ {
493
+ "loss": 0.0322,
494
+ "grad_norm": 3.90625,
495
+ "learning_rate": 4.708209827468457e-07,
496
+ "epoch": 4.522292993630574,
497
+ "step": 710
498
+ },
499
+ {
500
+ "loss": 0.0242,
501
+ "grad_norm": 0.130859375,
502
+ "learning_rate": 3.5576412078585755e-07,
503
+ "epoch": 4.585987261146497,
504
+ "step": 720
505
+ },
506
+ {
507
+ "loss": 0.0169,
508
+ "grad_norm": 2.515625,
509
+ "learning_rate": 2.565527130507295e-07,
510
+ "epoch": 4.649681528662421,
511
+ "step": 730
512
+ },
513
+ {
514
+ "loss": 0.0183,
515
+ "grad_norm": 5.9375,
516
+ "learning_rate": 1.7334976362386458e-07,
517
+ "epoch": 4.713375796178344,
518
+ "step": 740
519
+ },
520
+ {
521
+ "loss": 0.0237,
522
+ "grad_norm": 11.4375,
523
+ "learning_rate": 1.0629197473269736e-07,
524
+ "epoch": 4.777070063694268,
525
+ "step": 750
526
+ },
527
+ {
528
+ "loss": 0.03,
529
+ "grad_norm": 6.1875,
530
+ "learning_rate": 5.548952214829762e-08,
531
+ "epoch": 4.840764331210191,
532
+ "step": 760
533
+ },
534
+ {
535
+ "loss": 0.0222,
536
+ "grad_norm": 1.15625,
537
+ "learning_rate": 2.1025874166864523e-08,
538
+ "epoch": 4.904458598726115,
539
+ "step": 770
540
+ },
541
+ {
542
+ "loss": 0.0305,
543
+ "grad_norm": 1.0859375,
544
+ "learning_rate": 2.9576544715370636e-09,
545
+ "epoch": 4.968152866242038,
546
+ "step": 780
547
+ },
548
+ {
549
+ "train_runtime": 1208.3302,
550
+ "train_samples_per_second": 5.197,
551
+ "train_steps_per_second": 0.65,
552
+ "total_flos": 5599444378429440.0,
553
+ "train_loss": 0.22870425369329514,
554
+ "epoch": 5.0,
555
+ "step": 785
556
+ }
557
+ ]