PuxAI commited on
Commit
4d15c16
·
verified ·
1 Parent(s): fb8f383

Upload logs/train_log_T5Gemma-2B_seed42.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. logs/train_log_T5Gemma-2B_seed42.json +46 -522
logs/train_log_T5Gemma-2B_seed42.json CHANGED
@@ -1,557 +1,81 @@
1
  [
2
  {
3
- "loss": 1.6577,
4
- "grad_norm": 43.25,
5
- "learning_rate": 1.8e-05,
6
- "epoch": 0.06369426751592357,
7
  "step": 10
8
  },
9
  {
10
- "loss": 0.5764,
11
- "grad_norm": 43.5,
12
- "learning_rate": 1.9993345687502566e-05,
13
- "epoch": 0.12738853503184713,
14
  "step": 20
15
  },
16
  {
17
- "loss": 0.598,
18
- "grad_norm": 45.5,
19
- "learning_rate": 1.9970354495287275e-05,
20
- "epoch": 0.1910828025477707,
21
  "step": 30
22
  },
23
  {
24
- "loss": 0.621,
25
- "grad_norm": 62.75,
26
- "learning_rate": 1.993098203681623e-05,
27
- "epoch": 0.25477707006369427,
28
  "step": 40
29
  },
30
  {
31
- "loss": 0.5267,
32
- "grad_norm": 37.25,
33
- "learning_rate": 1.987529300093532e-05,
34
- "epoch": 0.3184713375796178,
35
  "step": 50
36
  },
37
  {
38
- "loss": 0.6321,
39
- "grad_norm": 11.0,
40
- "learning_rate": 1.9803378884584266e-05,
41
- "epoch": 0.3821656050955414,
42
  "step": 60
43
  },
44
  {
45
- "loss": 0.5512,
46
- "grad_norm": 11.375,
47
- "learning_rate": 1.97153578424674e-05,
48
- "epoch": 0.445859872611465,
49
  "step": 70
50
  },
51
  {
52
- "loss": 0.5613,
53
- "grad_norm": 34.25,
54
- "learning_rate": 1.96113744929258e-05,
55
- "epoch": 0.5095541401273885,
56
  "step": 80
57
  },
58
  {
59
- "loss": 0.4809,
60
- "grad_norm": 17.125,
61
- "learning_rate": 1.949159968032972e-05,
62
- "epoch": 0.5732484076433121,
63
  "step": 90
64
  },
65
  {
66
- "loss": 0.47,
67
- "grad_norm": 56.25,
68
- "learning_rate": 1.935623019438176e-05,
69
- "epoch": 0.6369426751592356,
70
  "step": 100
71
  },
72
  {
73
- "loss": 0.4501,
74
- "grad_norm": 39.25,
75
- "learning_rate": 1.9205488446791873e-05,
76
- "epoch": 0.7006369426751592,
77
- "step": 110
78
- },
79
- {
80
- "loss": 0.5815,
81
- "grad_norm": 8.375,
82
- "learning_rate": 1.903962210585554e-05,
83
- "epoch": 0.7643312101910829,
84
- "step": 120
85
- },
86
- {
87
- "loss": 0.4434,
88
- "grad_norm": 24.125,
89
- "learning_rate": 1.885890368953539e-05,
90
- "epoch": 0.8280254777070064,
91
- "step": 130
92
- },
93
- {
94
- "loss": 0.3825,
95
- "grad_norm": 34.75,
96
- "learning_rate": 1.8663630117714904e-05,
97
- "epoch": 0.89171974522293,
98
- "step": 140
99
- },
100
- {
101
- "loss": 0.4359,
102
- "grad_norm": 5.1875,
103
- "learning_rate": 1.845412222435984e-05,
104
- "epoch": 0.9554140127388535,
105
- "step": 150
106
- },
107
- {
108
- "loss": 0.3706,
109
- "grad_norm": 18.625,
110
- "learning_rate": 1.823072423038886e-05,
111
- "epoch": 1.019108280254777,
112
- "step": 160
113
- },
114
- {
115
- "loss": 0.3528,
116
- "grad_norm": 13.25,
117
- "learning_rate": 1.7993803178119457e-05,
118
- "epoch": 1.0828025477707006,
119
- "step": 170
120
- },
121
- {
122
- "loss": 0.3916,
123
- "grad_norm": 11.625,
124
- "learning_rate": 1.7743748328218412e-05,
125
- "epoch": 1.1464968152866242,
126
- "step": 180
127
- },
128
- {
129
- "loss": 0.4234,
130
- "grad_norm": 5.53125,
131
- "learning_rate": 1.7480970520147522e-05,
132
- "epoch": 1.2101910828025477,
133
- "step": 190
134
- },
135
- {
136
- "loss": 0.2799,
137
- "grad_norm": 18.875,
138
- "learning_rate": 1.7205901497155445e-05,
139
- "epoch": 1.2738853503184713,
140
- "step": 200
141
- },
142
- {
143
- "loss": 0.3871,
144
- "grad_norm": 66.0,
145
- "learning_rate": 1.691899319692469e-05,
146
- "epoch": 1.3375796178343948,
147
- "step": 210
148
- },
149
- {
150
- "loss": 0.3145,
151
- "grad_norm": 5.34375,
152
- "learning_rate": 1.6620717009039175e-05,
153
- "epoch": 1.4012738853503186,
154
- "step": 220
155
- },
156
- {
157
- "loss": 0.2831,
158
- "grad_norm": 23.0,
159
- "learning_rate": 1.631156300049238e-05,
160
- "epoch": 1.4649681528662422,
161
- "step": 230
162
- },
163
- {
164
- "loss": 0.3198,
165
- "grad_norm": 11.875,
166
- "learning_rate": 1.5992039110508586e-05,
167
- "epoch": 1.5286624203821657,
168
- "step": 240
169
- },
170
- {
171
- "loss": 0.2805,
172
- "grad_norm": 7.875,
173
- "learning_rate": 1.566267031600003e-05,
174
- "epoch": 1.5923566878980893,
175
- "step": 250
176
- },
177
- {
178
- "loss": 0.2194,
179
- "grad_norm": 9.1875,
180
- "learning_rate": 1.532399776903124e-05,
181
- "epoch": 1.6560509554140128,
182
- "step": 260
183
- },
184
- {
185
- "loss": 0.3149,
186
- "grad_norm": 7.3125,
187
- "learning_rate": 1.4976577907707603e-05,
188
- "epoch": 1.7197452229299364,
189
- "step": 270
190
- },
191
- {
192
- "loss": 0.3434,
193
- "grad_norm": 5.375,
194
- "learning_rate": 1.4620981541949024e-05,
195
- "epoch": 1.78343949044586,
196
- "step": 280
197
- },
198
- {
199
- "loss": 0.2845,
200
- "grad_norm": 10.5625,
201
- "learning_rate": 1.4257792915650728e-05,
202
- "epoch": 1.8471337579617835,
203
- "step": 290
204
- },
205
- {
206
- "loss": 0.3298,
207
- "grad_norm": 67.5,
208
- "learning_rate": 1.3887608746772064e-05,
209
- "epoch": 1.910828025477707,
210
- "step": 300
211
- },
212
- {
213
- "loss": 0.2368,
214
- "grad_norm": 19.625,
215
- "learning_rate": 1.3511037246930476e-05,
216
- "epoch": 1.9745222929936306,
217
- "step": 310
218
- },
219
- {
220
- "loss": 0.2667,
221
- "grad_norm": 6.15625,
222
- "learning_rate": 1.312869712211141e-05,
223
- "epoch": 2.038216560509554,
224
- "step": 320
225
- },
226
- {
227
- "loss": 0.1912,
228
- "grad_norm": 10.0625,
229
- "learning_rate": 1.2741216556136e-05,
230
- "epoch": 2.1019108280254777,
231
- "step": 330
232
- },
233
- {
234
- "loss": 0.1935,
235
- "grad_norm": 4.5,
236
- "learning_rate": 1.2349232178556703e-05,
237
- "epoch": 2.1656050955414012,
238
- "step": 340
239
- },
240
- {
241
- "loss": 0.1927,
242
- "grad_norm": 12.9375,
243
- "learning_rate": 1.1953388018676618e-05,
244
- "epoch": 2.229299363057325,
245
- "step": 350
246
- },
247
- {
248
- "loss": 0.2156,
249
- "grad_norm": 13.125,
250
- "learning_rate": 1.1554334447411019e-05,
251
- "epoch": 2.2929936305732483,
252
- "step": 360
253
- },
254
- {
255
- "loss": 0.2342,
256
- "grad_norm": 10.1875,
257
- "learning_rate": 1.1152727108729675e-05,
258
- "epoch": 2.356687898089172,
259
- "step": 370
260
- },
261
- {
262
- "loss": 0.2362,
263
- "grad_norm": 8.1875,
264
- "learning_rate": 1.0749225842435498e-05,
265
- "epoch": 2.4203821656050954,
266
- "step": 380
267
- },
268
- {
269
- "loss": 0.1355,
270
- "grad_norm": 8.0,
271
- "learning_rate": 1.0344493600049509e-05,
272
- "epoch": 2.484076433121019,
273
- "step": 390
274
- },
275
- {
276
- "loss": 0.1495,
277
- "grad_norm": 9.5,
278
- "learning_rate": 9.939195355583199e-06,
279
- "epoch": 2.5477707006369426,
280
- "step": 400
281
- },
282
- {
283
- "loss": 0.1439,
284
- "grad_norm": 12.1875,
285
- "learning_rate": 9.53399701298801e-06,
286
- "epoch": 2.611464968152866,
287
- "step": 410
288
- },
289
- {
290
- "loss": 0.1585,
291
- "grad_norm": 9.1875,
292
- "learning_rate": 9.129564312076861e-06,
293
- "epoch": 2.6751592356687897,
294
- "step": 420
295
- },
296
- {
297
- "loss": 0.1568,
298
- "grad_norm": 14.875,
299
- "learning_rate": 8.726561734715388e-06,
300
- "epoch": 2.738853503184713,
301
- "step": 430
302
- },
303
- {
304
- "loss": 0.134,
305
- "grad_norm": 24.25,
306
- "learning_rate": 8.325651413080003e-06,
307
- "epoch": 2.802547770700637,
308
- "step": 440
309
- },
310
- {
311
- "loss": 0.1672,
312
- "grad_norm": 16.25,
313
- "learning_rate": 7.927492041776452e-06,
314
- "epoch": 2.8662420382165603,
315
- "step": 450
316
- },
317
- {
318
- "loss": 0.1253,
319
- "grad_norm": 2.765625,
320
- "learning_rate": 7.532737795606299e-06,
321
- "epoch": 2.9299363057324843,
322
- "step": 460
323
- },
324
- {
325
- "loss": 0.1352,
326
- "grad_norm": 8.75,
327
- "learning_rate": 7.142037254759469e-06,
328
- "epoch": 2.9936305732484074,
329
- "step": 470
330
- },
331
- {
332
- "loss": 0.0673,
333
- "grad_norm": 3.578125,
334
- "learning_rate": 6.756032339198675e-06,
335
- "epoch": 3.0573248407643314,
336
- "step": 480
337
- },
338
- {
339
- "loss": 0.0829,
340
- "grad_norm": 10.5625,
341
- "learning_rate": 6.375357253986614e-06,
342
- "epoch": 3.121019108280255,
343
- "step": 490
344
- },
345
- {
346
- "loss": 0.0451,
347
- "grad_norm": 5.15625,
348
- "learning_rate": 6.000637447288719e-06,
349
- "epoch": 3.1847133757961785,
350
- "step": 500
351
- },
352
- {
353
- "loss": 0.0943,
354
- "grad_norm": 11.3125,
355
- "learning_rate": 5.6324885827634665e-06,
356
- "epoch": 3.248407643312102,
357
- "step": 510
358
- },
359
- {
360
- "loss": 0.0633,
361
- "grad_norm": 9.5,
362
- "learning_rate": 5.271515528028592e-06,
363
- "epoch": 3.3121019108280256,
364
- "step": 520
365
- },
366
- {
367
- "loss": 0.0833,
368
- "grad_norm": 8.125,
369
- "learning_rate": 4.91831136086518e-06,
370
- "epoch": 3.375796178343949,
371
- "step": 530
372
- },
373
- {
374
- "loss": 0.0534,
375
- "grad_norm": 5.65625,
376
- "learning_rate": 4.573456394792446e-06,
377
- "epoch": 3.4394904458598727,
378
- "step": 540
379
- },
380
- {
381
- "loss": 0.0794,
382
- "grad_norm": 6.65625,
383
- "learning_rate": 4.237517225614138e-06,
384
- "epoch": 3.5031847133757963,
385
- "step": 550
386
- },
387
- {
388
- "loss": 0.0782,
389
- "grad_norm": 3.390625,
390
- "learning_rate": 3.911045800503101e-06,
391
- "epoch": 3.56687898089172,
392
- "step": 560
393
- },
394
- {
395
- "loss": 0.0612,
396
- "grad_norm": 5.15625,
397
- "learning_rate": 3.5945785111535547e-06,
398
- "epoch": 3.6305732484076434,
399
- "step": 570
400
- },
401
- {
402
- "loss": 0.0867,
403
- "grad_norm": 9.3125,
404
- "learning_rate": 3.2886353124909267e-06,
405
- "epoch": 3.694267515923567,
406
- "step": 580
407
- },
408
- {
409
- "loss": 0.0612,
410
- "grad_norm": 7.03125,
411
- "learning_rate": 2.9937188683872796e-06,
412
- "epoch": 3.7579617834394905,
413
- "step": 590
414
- },
415
- {
416
- "loss": 0.0693,
417
- "grad_norm": 2.609375,
418
- "learning_rate": 2.7103137257858867e-06,
419
- "epoch": 3.821656050955414,
420
- "step": 600
421
- },
422
- {
423
- "loss": 0.0616,
424
- "grad_norm": 4.84375,
425
- "learning_rate": 2.438885518591878e-06,
426
- "epoch": 3.8853503184713376,
427
- "step": 610
428
- },
429
- {
430
- "loss": 0.0422,
431
- "grad_norm": 5.125,
432
- "learning_rate": 2.179880202636966e-06,
433
- "epoch": 3.949044585987261,
434
- "step": 620
435
- },
436
- {
437
- "loss": 0.0407,
438
- "grad_norm": 0.9453125,
439
- "learning_rate": 1.9337233229751686e-06,
440
- "epoch": 4.012738853503185,
441
- "step": 630
442
- },
443
- {
444
- "loss": 0.0324,
445
- "grad_norm": 3.578125,
446
- "learning_rate": 1.7008193147134167e-06,
447
- "epoch": 4.076433121019108,
448
- "step": 640
449
- },
450
- {
451
- "loss": 0.0213,
452
- "grad_norm": 1.9296875,
453
- "learning_rate": 1.4815508385257316e-06,
454
- "epoch": 4.140127388535032,
455
- "step": 650
456
- },
457
- {
458
- "loss": 0.0197,
459
- "grad_norm": 1.296875,
460
- "learning_rate": 1.2762781519427324e-06,
461
- "epoch": 4.203821656050955,
462
- "step": 660
463
- },
464
- {
465
- "loss": 0.0523,
466
- "grad_norm": 2.4375,
467
- "learning_rate": 1.0853385174494758e-06,
468
- "epoch": 4.267515923566879,
469
- "step": 670
470
- },
471
- {
472
- "loss": 0.0374,
473
- "grad_norm": 2.640625,
474
- "learning_rate": 9.090456483640686e-07,
475
- "epoch": 4.3312101910828025,
476
- "step": 680
477
- },
478
- {
479
- "loss": 0.0303,
480
- "grad_norm": 2.171875,
481
- "learning_rate": 7.476891934075126e-07,
482
- "epoch": 4.3949044585987265,
483
- "step": 690
484
- },
485
- {
486
- "loss": 0.0226,
487
- "grad_norm": 1.8984375,
488
- "learning_rate": 6.015342608116092e-07,
489
- "epoch": 4.45859872611465,
490
- "step": 700
491
- },
492
- {
493
- "loss": 0.024,
494
- "grad_norm": 2.765625,
495
- "learning_rate": 4.708209827468457e-07,
496
- "epoch": 4.522292993630574,
497
- "step": 710
498
- },
499
- {
500
- "loss": 0.0338,
501
- "grad_norm": 6.53125,
502
- "learning_rate": 3.5576412078585755e-07,
503
- "epoch": 4.585987261146497,
504
- "step": 720
505
- },
506
- {
507
- "loss": 0.036,
508
- "grad_norm": 4.4375,
509
- "learning_rate": 2.565527130507295e-07,
510
- "epoch": 4.649681528662421,
511
- "step": 730
512
- },
513
- {
514
- "loss": 0.0216,
515
- "grad_norm": 1.40625,
516
- "learning_rate": 1.7334976362386458e-07,
517
- "epoch": 4.713375796178344,
518
- "step": 740
519
- },
520
- {
521
- "loss": 0.0276,
522
- "grad_norm": 2.53125,
523
- "learning_rate": 1.0629197473269736e-07,
524
- "epoch": 4.777070063694268,
525
- "step": 750
526
- },
527
- {
528
- "loss": 0.0158,
529
- "grad_norm": 1.125,
530
- "learning_rate": 5.548952214829762e-08,
531
- "epoch": 4.840764331210191,
532
- "step": 760
533
- },
534
- {
535
- "loss": 0.0193,
536
- "grad_norm": 3.171875,
537
- "learning_rate": 2.1025874166864523e-08,
538
- "epoch": 4.904458598726115,
539
- "step": 770
540
- },
541
- {
542
- "loss": 0.0239,
543
- "grad_norm": 0.58203125,
544
- "learning_rate": 2.9576544715370636e-09,
545
- "epoch": 4.968152866242038,
546
- "step": 780
547
- },
548
- {
549
- "train_runtime": 2126.4883,
550
- "train_samples_per_second": 2.953,
551
- "train_steps_per_second": 0.369,
552
- "total_flos": 5619745856102400.0,
553
- "train_loss": 0.23484725686395244,
554
  "epoch": 5.0,
555
- "step": 785
556
  }
557
  ]
 
1
  [
2
  {
3
+ "loss": 2.0402,
4
+ "grad_norm": 5.5213093757629395,
5
+ "learning_rate": 0.00018,
6
+ "epoch": 0.5,
7
  "step": 10
8
  },
9
  {
10
+ "loss": 0.2818,
11
+ "grad_norm": 1.7319334745407104,
12
+ "learning_rate": 0.00019510565162951537,
13
+ "epoch": 1.0,
14
  "step": 20
15
  },
16
  {
17
+ "loss": 0.1396,
18
+ "grad_norm": 1.2271606922149658,
19
+ "learning_rate": 0.00017880107536067218,
20
+ "epoch": 1.5,
21
  "step": 30
22
  },
23
  {
24
+ "loss": 0.1376,
25
+ "grad_norm": 1.5510157346725464,
26
+ "learning_rate": 0.0001529919264233205,
27
+ "epoch": 2.0,
28
  "step": 40
29
  },
30
  {
31
+ "loss": 0.0882,
32
+ "grad_norm": 0.7988260388374329,
33
+ "learning_rate": 0.00012079116908177593,
34
+ "epoch": 2.5,
35
  "step": 50
36
  },
37
  {
38
+ "loss": 0.0721,
39
+ "grad_norm": 1.129199743270874,
40
+ "learning_rate": 8.608268990399349e-05,
41
+ "epoch": 3.0,
42
  "step": 60
43
  },
44
  {
45
+ "loss": 0.0443,
46
+ "grad_norm": 0.45621684193611145,
47
+ "learning_rate": 5.305284372141095e-05,
48
+ "epoch": 3.5,
49
  "step": 70
50
  },
51
  {
52
+ "loss": 0.0349,
53
+ "grad_norm": 0.8962133526802063,
54
+ "learning_rate": 2.5685517452260567e-05,
55
+ "epoch": 4.0,
56
  "step": 80
57
  },
58
  {
59
+ "loss": 0.0205,
60
+ "grad_norm": 0.45593538880348206,
61
+ "learning_rate": 7.281614543321269e-06,
62
+ "epoch": 4.5,
63
  "step": 90
64
  },
65
  {
66
+ "loss": 0.0177,
67
+ "grad_norm": 0.4147992432117462,
68
+ "learning_rate": 6.09172980904238e-08,
69
+ "epoch": 5.0,
70
  "step": 100
71
  },
72
  {
73
+ "train_runtime": 187.8489,
74
+ "train_samples_per_second": 33.431,
75
+ "train_steps_per_second": 0.532,
76
+ "total_flos": 9748966353960960.0,
77
+ "train_loss": 0.2876847678422928,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  "epoch": 5.0,
79
+ "step": 100
80
  }
81
  ]