irodkin commited on
Commit
b975d0b
·
verified ·
1 Parent(s): bd023c8

Training checkpoint at step 4500

Browse files
Files changed (1) hide show
  1. trainer_state.json +1266 -186
trainer_state.json CHANGED
@@ -1,553 +1,1633 @@
1
  {
2
- "best_global_step": 1500,
3
- "best_metric": 2.725661039352417,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-1500",
5
- "epoch": 0.03,
6
  "eval_steps": 100,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 82.5841699095815,
15
  "learning_rate": 4.8e-08,
16
- "loss": 3.4393,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 72.33553691687935,
22
  "learning_rate": 9.8e-08,
23
- "loss": 3.401,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 55.00272386424627,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.3077,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 24.67154822301572,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.1946,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 3.11328125,
43
- "eval_runtime": 39.4175,
44
- "eval_samples_per_second": 2.638,
45
- "eval_steps_per_second": 1.319,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 8.681721490029314,
51
  "learning_rate": 2.48e-07,
52
- "loss": 3.0709,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 7.238581078870377,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 3.0046,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 5.931774986901269,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.954,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 4.891205112515998,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.9365,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.922476053237915,
79
- "eval_runtime": 39.6232,
80
- "eval_samples_per_second": 2.625,
81
- "eval_steps_per_second": 1.312,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 4.53174674502475,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.9198,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 4.642185238219915,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.9004,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 5.838003634607987,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.8935,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 4.535236579882751,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.8857,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.874248743057251,
115
- "eval_runtime": 39.4088,
116
- "eval_samples_per_second": 2.639,
117
- "eval_steps_per_second": 1.32,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 4.767460098400186,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.8672,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 6.424432953613615,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.8663,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 4.530337576253928,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.8574,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 5.094043051124328,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.8534,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.846604585647583,
151
- "eval_runtime": 45.9565,
152
- "eval_samples_per_second": 2.263,
153
- "eval_steps_per_second": 1.132,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 4.737628412447718,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.8303,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 4.176119045358587,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.8403,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 4.0039940059315065,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.8339,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 4.202142407976928,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.831,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.826472282409668,
187
- "eval_runtime": 45.8283,
188
- "eval_samples_per_second": 2.269,
189
- "eval_steps_per_second": 1.135,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
- "grad_norm": 4.067695724388097,
195
  "learning_rate": 1.0480000000000002e-06,
196
- "loss": 2.8361,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
- "grad_norm": 4.589090257665776,
202
  "learning_rate": 1.0980000000000001e-06,
203
- "loss": 2.8258,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
- "grad_norm": 4.085702217820906,
209
  "learning_rate": 1.148e-06,
210
- "loss": 2.808,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
- "grad_norm": 4.961422622812396,
216
  "learning_rate": 1.1980000000000002e-06,
217
- "loss": 2.8186,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
- "eval_loss": 2.809044361114502,
223
- "eval_runtime": 45.9695,
224
- "eval_samples_per_second": 2.262,
225
- "eval_steps_per_second": 1.131,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
- "grad_norm": 5.123166529014019,
231
  "learning_rate": 1.248e-06,
232
- "loss": 2.811,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
- "grad_norm": 3.986488261530633,
238
  "learning_rate": 1.2980000000000001e-06,
239
- "loss": 2.8075,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
- "grad_norm": 4.785348989779224,
245
  "learning_rate": 1.348e-06,
246
- "loss": 2.8003,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
- "grad_norm": 6.71907732056903,
252
  "learning_rate": 1.3980000000000002e-06,
253
- "loss": 2.7974,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
- "eval_loss": 2.797025203704834,
259
- "eval_runtime": 45.9216,
260
- "eval_samples_per_second": 2.265,
261
- "eval_steps_per_second": 1.132,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
- "grad_norm": 5.157726668042884,
267
  "learning_rate": 1.4480000000000002e-06,
268
- "loss": 2.7907,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
- "grad_norm": 4.679964588741966,
274
  "learning_rate": 1.498e-06,
275
- "loss": 2.7794,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
- "grad_norm": 6.251667739120528,
281
  "learning_rate": 1.548e-06,
282
- "loss": 2.7925,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
- "grad_norm": 3.474471667792908,
288
  "learning_rate": 1.5980000000000002e-06,
289
- "loss": 2.7797,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
- "eval_loss": 2.786358118057251,
295
- "eval_runtime": 45.8769,
296
- "eval_samples_per_second": 2.267,
297
- "eval_steps_per_second": 1.133,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
- "grad_norm": 3.913235352599255,
303
  "learning_rate": 1.6480000000000001e-06,
304
- "loss": 2.7893,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
- "grad_norm": 3.5977863132825423,
310
  "learning_rate": 1.6980000000000003e-06,
311
- "loss": 2.7691,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
- "grad_norm": 3.6730462454873467,
317
  "learning_rate": 1.7480000000000002e-06,
318
- "loss": 2.7811,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
- "grad_norm": 4.349147110334662,
324
  "learning_rate": 1.798e-06,
325
- "loss": 2.7748,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
- "eval_loss": 2.775090217590332,
331
- "eval_runtime": 46.3008,
332
- "eval_samples_per_second": 2.246,
333
- "eval_steps_per_second": 1.123,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
- "grad_norm": 3.480345675498338,
339
  "learning_rate": 1.8480000000000001e-06,
340
- "loss": 2.762,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
- "grad_norm": 3.833328576431189,
346
  "learning_rate": 1.898e-06,
347
- "loss": 2.7757,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
- "grad_norm": 5.036413282568569,
353
  "learning_rate": 1.9480000000000002e-06,
354
- "loss": 2.7786,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
- "grad_norm": 3.127047897844809,
360
  "learning_rate": 1.998e-06,
361
- "loss": 2.7645,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
- "eval_loss": 2.767127513885498,
367
- "eval_runtime": 46.2475,
368
- "eval_samples_per_second": 2.249,
369
- "eval_steps_per_second": 1.124,
370
  "step": 1000
371
  },
372
  {
373
  "epoch": 0.0205,
374
- "grad_norm": 4.140879887496416,
375
  "learning_rate": 2.048e-06,
376
- "loss": 2.7662,
377
  "step": 1025
378
  },
379
  {
380
  "epoch": 0.021,
381
- "grad_norm": 3.4541052280858118,
382
  "learning_rate": 2.098e-06,
383
- "loss": 2.7791,
384
  "step": 1050
385
  },
386
  {
387
  "epoch": 0.0215,
388
- "grad_norm": 3.5083703383302365,
389
  "learning_rate": 2.148e-06,
390
- "loss": 2.7536,
391
  "step": 1075
392
  },
393
  {
394
  "epoch": 0.022,
395
- "grad_norm": 3.461079244241643,
396
  "learning_rate": 2.198e-06,
397
- "loss": 2.7553,
398
  "step": 1100
399
  },
400
  {
401
  "epoch": 0.022,
402
- "eval_loss": 2.7578125,
403
- "eval_runtime": 46.0318,
404
- "eval_samples_per_second": 2.259,
405
- "eval_steps_per_second": 1.13,
406
  "step": 1100
407
  },
408
  {
409
  "epoch": 0.0225,
410
- "grad_norm": 4.001530983295657,
411
  "learning_rate": 2.2480000000000003e-06,
412
- "loss": 2.7457,
413
  "step": 1125
414
  },
415
  {
416
  "epoch": 0.023,
417
- "grad_norm": 3.8943030577897852,
418
  "learning_rate": 2.2980000000000003e-06,
419
- "loss": 2.7512,
420
  "step": 1150
421
  },
422
  {
423
  "epoch": 0.0235,
424
- "grad_norm": 3.552019500851815,
425
  "learning_rate": 2.3480000000000002e-06,
426
- "loss": 2.7503,
427
  "step": 1175
428
  },
429
  {
430
  "epoch": 0.024,
431
- "grad_norm": 3.591141406301505,
432
  "learning_rate": 2.398e-06,
433
- "loss": 2.7597,
434
  "step": 1200
435
  },
436
  {
437
  "epoch": 0.024,
438
- "eval_loss": 2.750450611114502,
439
- "eval_runtime": 46.1485,
440
- "eval_samples_per_second": 2.254,
441
- "eval_steps_per_second": 1.127,
442
  "step": 1200
443
  },
444
  {
445
  "epoch": 0.0245,
446
- "grad_norm": 3.4973831873828556,
447
  "learning_rate": 2.448e-06,
448
- "loss": 2.7466,
449
  "step": 1225
450
  },
451
  {
452
  "epoch": 0.025,
453
- "grad_norm": 4.1984008078108515,
454
  "learning_rate": 2.498e-06,
455
- "loss": 2.7519,
456
  "step": 1250
457
  },
458
  {
459
  "epoch": 0.0255,
460
- "grad_norm": 4.081479454612426,
461
  "learning_rate": 2.5480000000000004e-06,
462
- "loss": 2.7439,
463
  "step": 1275
464
  },
465
  {
466
  "epoch": 0.026,
467
- "grad_norm": 3.9049654172962645,
468
  "learning_rate": 2.598e-06,
469
- "loss": 2.7343,
470
  "step": 1300
471
  },
472
  {
473
  "epoch": 0.026,
474
- "eval_loss": 2.740234375,
475
- "eval_runtime": 46.5866,
476
- "eval_samples_per_second": 2.232,
477
- "eval_steps_per_second": 1.116,
478
  "step": 1300
479
  },
480
  {
481
  "epoch": 0.0265,
482
- "grad_norm": 3.4381975298718377,
483
  "learning_rate": 2.648e-06,
484
- "loss": 2.7559,
485
  "step": 1325
486
  },
487
  {
488
  "epoch": 0.027,
489
- "grad_norm": 5.94066510561298,
490
  "learning_rate": 2.6980000000000003e-06,
491
- "loss": 2.7445,
492
  "step": 1350
493
  },
494
  {
495
  "epoch": 0.0275,
496
- "grad_norm": 4.0184303787905815,
497
  "learning_rate": 2.748e-06,
498
- "loss": 2.7335,
499
  "step": 1375
500
  },
501
  {
502
  "epoch": 0.028,
503
- "grad_norm": 4.557792596110642,
504
  "learning_rate": 2.798e-06,
505
- "loss": 2.7302,
506
  "step": 1400
507
  },
508
  {
509
  "epoch": 0.028,
510
- "eval_loss": 2.732872486114502,
511
- "eval_runtime": 46.2434,
512
- "eval_samples_per_second": 2.249,
513
- "eval_steps_per_second": 1.124,
514
  "step": 1400
515
  },
516
  {
517
  "epoch": 0.0285,
518
- "grad_norm": 3.6573921602611583,
519
  "learning_rate": 2.848e-06,
520
- "loss": 2.7279,
521
  "step": 1425
522
  },
523
  {
524
  "epoch": 0.029,
525
- "grad_norm": 4.85188341778116,
526
  "learning_rate": 2.8980000000000005e-06,
527
- "loss": 2.726,
528
  "step": 1450
529
  },
530
  {
531
  "epoch": 0.0295,
532
- "grad_norm": 6.36916983874441,
533
  "learning_rate": 2.9480000000000004e-06,
534
- "loss": 2.7122,
535
  "step": 1475
536
  },
537
  {
538
  "epoch": 0.03,
539
- "grad_norm": 4.96577286763615,
540
  "learning_rate": 2.9980000000000003e-06,
541
- "loss": 2.7339,
542
  "step": 1500
543
  },
544
  {
545
  "epoch": 0.03,
546
- "eval_loss": 2.725661039352417,
547
- "eval_runtime": 45.3832,
548
- "eval_samples_per_second": 2.292,
549
- "eval_steps_per_second": 1.146,
550
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  }
552
  ],
553
  "logging_steps": 25,
@@ -567,7 +1647,7 @@
567
  "attributes": {}
568
  }
569
  },
570
- "total_flos": 3.3663288633311887e+18,
571
  "train_batch_size": 1,
572
  "trial_name": null,
573
  "trial_params": null
 
1
  {
2
+ "best_global_step": 4500,
3
+ "best_metric": 2.620342493057251,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-4500",
5
+ "epoch": 0.09,
6
  "eval_steps": 100,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 96.04050869121504,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 3.4391,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 78.95958818615539,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 3.397,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 61.45018428703237,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.297,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 22.353651858428393,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.1733,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 3.09375,
43
+ "eval_runtime": 42.6579,
44
+ "eval_samples_per_second": 2.438,
45
+ "eval_steps_per_second": 1.219,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 9.835689068347888,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 3.0557,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 8.293191220823632,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.9954,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 6.660135091710579,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.9504,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 13.605532098937575,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.9363,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.924128532409668,
79
+ "eval_runtime": 42.5415,
80
+ "eval_samples_per_second": 2.445,
81
+ "eval_steps_per_second": 1.222,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 7.7985826788732435,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.9223,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 7.257382344220691,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.9043,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 9.049674458422025,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.8984,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 5.766079229639856,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.8898,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.877253532409668,
115
+ "eval_runtime": 42.642,
116
+ "eval_samples_per_second": 2.439,
117
+ "eval_steps_per_second": 1.219,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 5.448754520618337,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.871,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 6.866471472157179,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.8693,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 6.115788528016365,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.8601,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 5.871468919197367,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.8555,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.848106861114502,
151
+ "eval_runtime": 42.3632,
152
+ "eval_samples_per_second": 2.455,
153
+ "eval_steps_per_second": 1.227,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 6.050804087803095,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.832,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 4.634127162302958,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.8418,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 5.700549652048682,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.8351,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 5.462019159507559,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.8319,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.828125,
187
+ "eval_runtime": 42.4078,
188
+ "eval_samples_per_second": 2.452,
189
+ "eval_steps_per_second": 1.226,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
+ "grad_norm": 5.100237356575638,
195
  "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.8368,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
+ "grad_norm": 5.8591675831655134,
202
  "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.8262,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
+ "grad_norm": 4.582188259829454,
209
  "learning_rate": 1.148e-06,
210
+ "loss": 2.8083,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
+ "grad_norm": 4.853482247652135,
216
  "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.8187,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
+ "eval_loss": 2.810246467590332,
223
+ "eval_runtime": 42.429,
224
+ "eval_samples_per_second": 2.451,
225
+ "eval_steps_per_second": 1.226,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
+ "grad_norm": 4.813324366644894,
231
  "learning_rate": 1.248e-06,
232
+ "loss": 2.8109,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
+ "grad_norm": 4.680021008982155,
238
  "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.8071,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
+ "grad_norm": 4.232572917961915,
245
  "learning_rate": 1.348e-06,
246
+ "loss": 2.7996,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
+ "grad_norm": 4.140300235345937,
252
  "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.7965,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
+ "eval_loss": 2.795973539352417,
259
+ "eval_runtime": 42.2781,
260
+ "eval_samples_per_second": 2.46,
261
+ "eval_steps_per_second": 1.23,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
+ "grad_norm": 4.066322921244863,
267
  "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.7892,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
+ "grad_norm": 4.790524346969656,
274
  "learning_rate": 1.498e-06,
275
+ "loss": 2.7776,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
+ "grad_norm": 4.814208015592297,
281
  "learning_rate": 1.548e-06,
282
+ "loss": 2.7904,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
+ "grad_norm": 3.495397019361677,
288
  "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.7771,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
+ "eval_loss": 2.783353328704834,
295
+ "eval_runtime": 45.2475,
296
+ "eval_samples_per_second": 2.298,
297
+ "eval_steps_per_second": 1.149,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
+ "grad_norm": 4.509827964168959,
303
  "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.7864,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
+ "grad_norm": 3.396755590212729,
310
  "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.7665,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
+ "grad_norm": 3.6908600934389364,
317
  "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.7784,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
+ "grad_norm": 4.517092572588064,
324
  "learning_rate": 1.798e-06,
325
+ "loss": 2.7718,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
+ "eval_loss": 2.772385835647583,
331
+ "eval_runtime": 42.1503,
332
+ "eval_samples_per_second": 2.467,
333
+ "eval_steps_per_second": 1.234,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
+ "grad_norm": 4.1527970820269635,
339
  "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.7592,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
+ "grad_norm": 4.093946260210414,
346
  "learning_rate": 1.898e-06,
347
+ "loss": 2.7728,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
+ "grad_norm": 3.794409923219389,
353
  "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.7757,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
+ "grad_norm": 3.128018180220031,
360
  "learning_rate": 1.998e-06,
361
+ "loss": 2.7614,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
+ "eval_loss": 2.764573335647583,
367
+ "eval_runtime": 42.2226,
368
+ "eval_samples_per_second": 2.463,
369
+ "eval_steps_per_second": 1.232,
370
  "step": 1000
371
  },
372
  {
373
  "epoch": 0.0205,
374
+ "grad_norm": 3.8078874128993667,
375
  "learning_rate": 2.048e-06,
376
+ "loss": 2.7629,
377
  "step": 1025
378
  },
379
  {
380
  "epoch": 0.021,
381
+ "grad_norm": 3.50724949935112,
382
  "learning_rate": 2.098e-06,
383
+ "loss": 2.776,
384
  "step": 1050
385
  },
386
  {
387
  "epoch": 0.0215,
388
+ "grad_norm": 3.600343997799952,
389
  "learning_rate": 2.148e-06,
390
+ "loss": 2.7503,
391
  "step": 1075
392
  },
393
  {
394
  "epoch": 0.022,
395
+ "grad_norm": 3.4227590286591667,
396
  "learning_rate": 2.198e-06,
397
+ "loss": 2.7522,
398
  "step": 1100
399
  },
400
  {
401
  "epoch": 0.022,
402
+ "eval_loss": 2.754957914352417,
403
+ "eval_runtime": 42.1456,
404
+ "eval_samples_per_second": 2.468,
405
+ "eval_steps_per_second": 1.234,
406
  "step": 1100
407
  },
408
  {
409
  "epoch": 0.0225,
410
+ "grad_norm": 3.6214573340756178,
411
  "learning_rate": 2.2480000000000003e-06,
412
+ "loss": 2.7423,
413
  "step": 1125
414
  },
415
  {
416
  "epoch": 0.023,
417
+ "grad_norm": 4.963456774283441,
418
  "learning_rate": 2.2980000000000003e-06,
419
+ "loss": 2.7473,
420
  "step": 1150
421
  },
422
  {
423
  "epoch": 0.0235,
424
+ "grad_norm": 4.417511515875024,
425
  "learning_rate": 2.3480000000000002e-06,
426
+ "loss": 2.7458,
427
  "step": 1175
428
  },
429
  {
430
  "epoch": 0.024,
431
+ "grad_norm": 3.4640266757488054,
432
  "learning_rate": 2.398e-06,
433
+ "loss": 2.755,
434
  "step": 1200
435
  },
436
  {
437
  "epoch": 0.024,
438
+ "eval_loss": 2.744741678237915,
439
+ "eval_runtime": 42.2958,
440
+ "eval_samples_per_second": 2.459,
441
+ "eval_steps_per_second": 1.229,
442
  "step": 1200
443
  },
444
  {
445
  "epoch": 0.0245,
446
+ "grad_norm": 3.8906187945336637,
447
  "learning_rate": 2.448e-06,
448
+ "loss": 2.7413,
449
  "step": 1225
450
  },
451
  {
452
  "epoch": 0.025,
453
+ "grad_norm": 4.103531427287993,
454
  "learning_rate": 2.498e-06,
455
+ "loss": 2.7464,
456
  "step": 1250
457
  },
458
  {
459
  "epoch": 0.0255,
460
+ "grad_norm": 3.7381187683762565,
461
  "learning_rate": 2.5480000000000004e-06,
462
+ "loss": 2.7383,
463
  "step": 1275
464
  },
465
  {
466
  "epoch": 0.026,
467
+ "grad_norm": 4.019695597142381,
468
  "learning_rate": 2.598e-06,
469
+ "loss": 2.7286,
470
  "step": 1300
471
  },
472
  {
473
  "epoch": 0.026,
474
+ "eval_loss": 2.735727071762085,
475
+ "eval_runtime": 42.1778,
476
+ "eval_samples_per_second": 2.466,
477
+ "eval_steps_per_second": 1.233,
478
  "step": 1300
479
  },
480
  {
481
  "epoch": 0.0265,
482
+ "grad_norm": 3.761754015207239,
483
  "learning_rate": 2.648e-06,
484
+ "loss": 2.7508,
485
  "step": 1325
486
  },
487
  {
488
  "epoch": 0.027,
489
+ "grad_norm": 3.5172792845513023,
490
  "learning_rate": 2.6980000000000003e-06,
491
+ "loss": 2.7396,
492
  "step": 1350
493
  },
494
  {
495
  "epoch": 0.0275,
496
+ "grad_norm": 3.6926838130981556,
497
  "learning_rate": 2.748e-06,
498
+ "loss": 2.7286,
499
  "step": 1375
500
  },
501
  {
502
  "epoch": 0.028,
503
+ "grad_norm": 3.5018547073145,
504
  "learning_rate": 2.798e-06,
505
+ "loss": 2.7247,
506
  "step": 1400
507
  },
508
  {
509
  "epoch": 0.028,
510
+ "eval_loss": 2.728515625,
511
+ "eval_runtime": 42.129,
512
+ "eval_samples_per_second": 2.469,
513
+ "eval_steps_per_second": 1.234,
514
  "step": 1400
515
  },
516
  {
517
  "epoch": 0.0285,
518
+ "grad_norm": 3.575054037567428,
519
  "learning_rate": 2.848e-06,
520
+ "loss": 2.7229,
521
  "step": 1425
522
  },
523
  {
524
  "epoch": 0.029,
525
+ "grad_norm": 4.062924067051664,
526
  "learning_rate": 2.8980000000000005e-06,
527
+ "loss": 2.7208,
528
  "step": 1450
529
  },
530
  {
531
  "epoch": 0.0295,
532
+ "grad_norm": 3.5741121733868573,
533
  "learning_rate": 2.9480000000000004e-06,
534
+ "loss": 2.7071,
535
  "step": 1475
536
  },
537
  {
538
  "epoch": 0.03,
539
+ "grad_norm": 3.9813713940318864,
540
  "learning_rate": 2.9980000000000003e-06,
541
+ "loss": 2.729,
542
  "step": 1500
543
  },
544
  {
545
  "epoch": 0.03,
546
+ "eval_loss": 2.721153736114502,
547
+ "eval_runtime": 42.058,
548
+ "eval_samples_per_second": 2.473,
549
+ "eval_steps_per_second": 1.236,
550
  "step": 1500
551
+ },
552
+ {
553
+ "epoch": 0.0305,
554
+ "grad_norm": 4.465898046671721,
555
+ "learning_rate": 3.0480000000000003e-06,
556
+ "loss": 2.7239,
557
+ "step": 1525
558
+ },
559
+ {
560
+ "epoch": 0.031,
561
+ "grad_norm": 4.083780430751083,
562
+ "learning_rate": 3.0980000000000007e-06,
563
+ "loss": 2.7177,
564
+ "step": 1550
565
+ },
566
+ {
567
+ "epoch": 0.0315,
568
+ "grad_norm": 3.259296223054617,
569
+ "learning_rate": 3.1480000000000006e-06,
570
+ "loss": 2.7149,
571
+ "step": 1575
572
+ },
573
+ {
574
+ "epoch": 0.032,
575
+ "grad_norm": 4.118900376683919,
576
+ "learning_rate": 3.198e-06,
577
+ "loss": 2.7157,
578
+ "step": 1600
579
+ },
580
+ {
581
+ "epoch": 0.032,
582
+ "eval_loss": 2.714693546295166,
583
+ "eval_runtime": 42.155,
584
+ "eval_samples_per_second": 2.467,
585
+ "eval_steps_per_second": 1.234,
586
+ "step": 1600
587
+ },
588
+ {
589
+ "epoch": 0.0325,
590
+ "grad_norm": 3.7685203077928335,
591
+ "learning_rate": 3.248e-06,
592
+ "loss": 2.7185,
593
+ "step": 1625
594
+ },
595
+ {
596
+ "epoch": 0.033,
597
+ "grad_norm": 3.786239665874637,
598
+ "learning_rate": 3.298e-06,
599
+ "loss": 2.694,
600
+ "step": 1650
601
+ },
602
+ {
603
+ "epoch": 0.0335,
604
+ "grad_norm": 4.0202339796786095,
605
+ "learning_rate": 3.348e-06,
606
+ "loss": 2.7076,
607
+ "step": 1675
608
+ },
609
+ {
610
+ "epoch": 0.034,
611
+ "grad_norm": 3.220912468646897,
612
+ "learning_rate": 3.3980000000000003e-06,
613
+ "loss": 2.7086,
614
+ "step": 1700
615
+ },
616
+ {
617
+ "epoch": 0.034,
618
+ "eval_loss": 2.708683967590332,
619
+ "eval_runtime": 42.1812,
620
+ "eval_samples_per_second": 2.466,
621
+ "eval_steps_per_second": 1.233,
622
+ "step": 1700
623
+ },
624
+ {
625
+ "epoch": 0.0345,
626
+ "grad_norm": 3.4236457763643964,
627
+ "learning_rate": 3.4480000000000003e-06,
628
+ "loss": 2.7107,
629
+ "step": 1725
630
+ },
631
+ {
632
+ "epoch": 0.035,
633
+ "grad_norm": 3.428424878937346,
634
+ "learning_rate": 3.4980000000000002e-06,
635
+ "loss": 2.7033,
636
+ "step": 1750
637
+ },
638
+ {
639
+ "epoch": 0.0355,
640
+ "grad_norm": 3.7064590041354597,
641
+ "learning_rate": 3.548e-06,
642
+ "loss": 2.7135,
643
+ "step": 1775
644
+ },
645
+ {
646
+ "epoch": 0.036,
647
+ "grad_norm": 2.6935868617559127,
648
+ "learning_rate": 3.5980000000000005e-06,
649
+ "loss": 2.6977,
650
+ "step": 1800
651
+ },
652
+ {
653
+ "epoch": 0.036,
654
+ "eval_loss": 2.702373743057251,
655
+ "eval_runtime": 42.099,
656
+ "eval_samples_per_second": 2.47,
657
+ "eval_steps_per_second": 1.235,
658
+ "step": 1800
659
+ },
660
+ {
661
+ "epoch": 0.0365,
662
+ "grad_norm": 3.1724624305272577,
663
+ "learning_rate": 3.6480000000000005e-06,
664
+ "loss": 2.6941,
665
+ "step": 1825
666
+ },
667
+ {
668
+ "epoch": 0.037,
669
+ "grad_norm": 3.3947291376692967,
670
+ "learning_rate": 3.6980000000000004e-06,
671
+ "loss": 2.705,
672
+ "step": 1850
673
+ },
674
+ {
675
+ "epoch": 0.0375,
676
+ "grad_norm": 3.2739522130247454,
677
+ "learning_rate": 3.7480000000000004e-06,
678
+ "loss": 2.6971,
679
+ "step": 1875
680
+ },
681
+ {
682
+ "epoch": 0.038,
683
+ "grad_norm": 2.886346941239111,
684
+ "learning_rate": 3.7980000000000007e-06,
685
+ "loss": 2.6878,
686
+ "step": 1900
687
+ },
688
+ {
689
+ "epoch": 0.038,
690
+ "eval_loss": 2.698768138885498,
691
+ "eval_runtime": 42.2524,
692
+ "eval_samples_per_second": 2.461,
693
+ "eval_steps_per_second": 1.231,
694
+ "step": 1900
695
+ },
696
+ {
697
+ "epoch": 0.0385,
698
+ "grad_norm": 2.961130539695273,
699
+ "learning_rate": 3.848e-06,
700
+ "loss": 2.6936,
701
+ "step": 1925
702
+ },
703
+ {
704
+ "epoch": 0.039,
705
+ "grad_norm": 3.2300245788196884,
706
+ "learning_rate": 3.898e-06,
707
+ "loss": 2.6989,
708
+ "step": 1950
709
+ },
710
+ {
711
+ "epoch": 0.0395,
712
+ "grad_norm": 3.2952386418656823,
713
+ "learning_rate": 3.948e-06,
714
+ "loss": 2.6937,
715
+ "step": 1975
716
+ },
717
+ {
718
+ "epoch": 0.04,
719
+ "grad_norm": 2.556435159379079,
720
+ "learning_rate": 3.9980000000000005e-06,
721
+ "loss": 2.6991,
722
+ "step": 2000
723
+ },
724
+ {
725
+ "epoch": 0.04,
726
+ "eval_loss": 2.693058967590332,
727
+ "eval_runtime": 42.2004,
728
+ "eval_samples_per_second": 2.464,
729
+ "eval_steps_per_second": 1.232,
730
+ "step": 2000
731
+ },
732
+ {
733
+ "epoch": 0.0405,
734
+ "grad_norm": 2.975198340671437,
735
+ "learning_rate": 4.048e-06,
736
+ "loss": 2.6896,
737
+ "step": 2025
738
+ },
739
+ {
740
+ "epoch": 0.041,
741
+ "grad_norm": 2.366572300776235,
742
+ "learning_rate": 4.098e-06,
743
+ "loss": 2.6903,
744
+ "step": 2050
745
+ },
746
+ {
747
+ "epoch": 0.0415,
748
+ "grad_norm": 2.650575110326075,
749
+ "learning_rate": 4.148000000000001e-06,
750
+ "loss": 2.6974,
751
+ "step": 2075
752
+ },
753
+ {
754
+ "epoch": 0.042,
755
+ "grad_norm": 2.844363978567716,
756
+ "learning_rate": 4.198e-06,
757
+ "loss": 2.6833,
758
+ "step": 2100
759
+ },
760
+ {
761
+ "epoch": 0.042,
762
+ "eval_loss": 2.687650203704834,
763
+ "eval_runtime": 42.1236,
764
+ "eval_samples_per_second": 2.469,
765
+ "eval_steps_per_second": 1.234,
766
+ "step": 2100
767
+ },
768
+ {
769
+ "epoch": 0.0425,
770
+ "grad_norm": 2.5043519810203425,
771
+ "learning_rate": 4.248000000000001e-06,
772
+ "loss": 2.6848,
773
+ "step": 2125
774
+ },
775
+ {
776
+ "epoch": 0.043,
777
+ "grad_norm": 2.442865859341675,
778
+ "learning_rate": 4.298e-06,
779
+ "loss": 2.6834,
780
+ "step": 2150
781
+ },
782
+ {
783
+ "epoch": 0.0435,
784
+ "grad_norm": 2.396444505850839,
785
+ "learning_rate": 4.3480000000000006e-06,
786
+ "loss": 2.6842,
787
+ "step": 2175
788
+ },
789
+ {
790
+ "epoch": 0.044,
791
+ "grad_norm": 2.467830621762353,
792
+ "learning_rate": 4.398000000000001e-06,
793
+ "loss": 2.6849,
794
+ "step": 2200
795
+ },
796
+ {
797
+ "epoch": 0.044,
798
+ "eval_loss": 2.684495210647583,
799
+ "eval_runtime": 42.337,
800
+ "eval_samples_per_second": 2.456,
801
+ "eval_steps_per_second": 1.228,
802
+ "step": 2200
803
+ },
804
+ {
805
+ "epoch": 0.0445,
806
+ "grad_norm": 2.331183246577976,
807
+ "learning_rate": 4.4480000000000004e-06,
808
+ "loss": 2.6933,
809
+ "step": 2225
810
+ },
811
+ {
812
+ "epoch": 0.045,
813
+ "grad_norm": 2.7108879126095995,
814
+ "learning_rate": 4.498e-06,
815
+ "loss": 2.6756,
816
+ "step": 2250
817
+ },
818
+ {
819
+ "epoch": 0.0455,
820
+ "grad_norm": 2.297487473050839,
821
+ "learning_rate": 4.548e-06,
822
+ "loss": 2.6773,
823
+ "step": 2275
824
+ },
825
+ {
826
+ "epoch": 0.046,
827
+ "grad_norm": 2.260013609826266,
828
+ "learning_rate": 4.598e-06,
829
+ "loss": 2.6869,
830
+ "step": 2300
831
+ },
832
+ {
833
+ "epoch": 0.046,
834
+ "eval_loss": 2.680889368057251,
835
+ "eval_runtime": 42.2308,
836
+ "eval_samples_per_second": 2.463,
837
+ "eval_steps_per_second": 1.231,
838
+ "step": 2300
839
+ },
840
+ {
841
+ "epoch": 0.0465,
842
+ "grad_norm": 2.1362621908829964,
843
+ "learning_rate": 4.648e-06,
844
+ "loss": 2.674,
845
+ "step": 2325
846
+ },
847
+ {
848
+ "epoch": 0.047,
849
+ "grad_norm": 2.530250306266186,
850
+ "learning_rate": 4.698000000000001e-06,
851
+ "loss": 2.6682,
852
+ "step": 2350
853
+ },
854
+ {
855
+ "epoch": 0.0475,
856
+ "grad_norm": 2.284376818082532,
857
+ "learning_rate": 4.748e-06,
858
+ "loss": 2.6741,
859
+ "step": 2375
860
+ },
861
+ {
862
+ "epoch": 0.048,
863
+ "grad_norm": 2.9431781004579403,
864
+ "learning_rate": 4.7980000000000005e-06,
865
+ "loss": 2.6793,
866
+ "step": 2400
867
+ },
868
+ {
869
+ "epoch": 0.048,
870
+ "eval_loss": 2.676382303237915,
871
+ "eval_runtime": 42.1755,
872
+ "eval_samples_per_second": 2.466,
873
+ "eval_steps_per_second": 1.233,
874
+ "step": 2400
875
+ },
876
+ {
877
+ "epoch": 0.0485,
878
+ "grad_norm": 2.2501714313646,
879
+ "learning_rate": 4.848000000000001e-06,
880
+ "loss": 2.6836,
881
+ "step": 2425
882
+ },
883
+ {
884
+ "epoch": 0.049,
885
+ "grad_norm": 2.520507270374293,
886
+ "learning_rate": 4.898e-06,
887
+ "loss": 2.6793,
888
+ "step": 2450
889
+ },
890
+ {
891
+ "epoch": 0.0495,
892
+ "grad_norm": 2.3001609851463156,
893
+ "learning_rate": 4.948000000000001e-06,
894
+ "loss": 2.6825,
895
+ "step": 2475
896
+ },
897
+ {
898
+ "epoch": 0.05,
899
+ "grad_norm": 2.0060268631347973,
900
+ "learning_rate": 4.998e-06,
901
+ "loss": 2.6736,
902
+ "step": 2500
903
+ },
904
+ {
905
+ "epoch": 0.05,
906
+ "eval_loss": 2.671875,
907
+ "eval_runtime": 42.1697,
908
+ "eval_samples_per_second": 2.466,
909
+ "eval_steps_per_second": 1.233,
910
+ "step": 2500
911
+ },
912
+ {
913
+ "epoch": 0.0505,
914
+ "grad_norm": 2.1769919372211564,
915
+ "learning_rate": 5.048000000000001e-06,
916
+ "loss": 2.6741,
917
+ "step": 2525
918
+ },
919
+ {
920
+ "epoch": 0.051,
921
+ "grad_norm": 2.1133782069189366,
922
+ "learning_rate": 5.098000000000001e-06,
923
+ "loss": 2.67,
924
+ "step": 2550
925
+ },
926
+ {
927
+ "epoch": 0.0515,
928
+ "grad_norm": 2.242586565950932,
929
+ "learning_rate": 5.1480000000000005e-06,
930
+ "loss": 2.6835,
931
+ "step": 2575
932
+ },
933
+ {
934
+ "epoch": 0.052,
935
+ "grad_norm": 2.4130154185332615,
936
+ "learning_rate": 5.198000000000001e-06,
937
+ "loss": 2.6752,
938
+ "step": 2600
939
+ },
940
+ {
941
+ "epoch": 0.052,
942
+ "eval_loss": 2.669621467590332,
943
+ "eval_runtime": 42.1123,
944
+ "eval_samples_per_second": 2.47,
945
+ "eval_steps_per_second": 1.235,
946
+ "step": 2600
947
+ },
948
+ {
949
+ "epoch": 0.0525,
950
+ "grad_norm": 2.243339931731786,
951
+ "learning_rate": 5.248000000000001e-06,
952
+ "loss": 2.6631,
953
+ "step": 2625
954
+ },
955
+ {
956
+ "epoch": 0.053,
957
+ "grad_norm": 2.1652170787894964,
958
+ "learning_rate": 5.298000000000001e-06,
959
+ "loss": 2.6653,
960
+ "step": 2650
961
+ },
962
+ {
963
+ "epoch": 0.0535,
964
+ "grad_norm": 2.3514042691010077,
965
+ "learning_rate": 5.348000000000001e-06,
966
+ "loss": 2.6704,
967
+ "step": 2675
968
+ },
969
+ {
970
+ "epoch": 0.054,
971
+ "grad_norm": 2.0555358311645104,
972
+ "learning_rate": 5.398e-06,
973
+ "loss": 2.6744,
974
+ "step": 2700
975
+ },
976
+ {
977
+ "epoch": 0.054,
978
+ "eval_loss": 2.668419361114502,
979
+ "eval_runtime": 42.1636,
980
+ "eval_samples_per_second": 2.467,
981
+ "eval_steps_per_second": 1.233,
982
+ "step": 2700
983
+ },
984
+ {
985
+ "epoch": 0.0545,
986
+ "grad_norm": 2.504233096197935,
987
+ "learning_rate": 5.448e-06,
988
+ "loss": 2.6686,
989
+ "step": 2725
990
+ },
991
+ {
992
+ "epoch": 0.055,
993
+ "grad_norm": 2.1966446495255014,
994
+ "learning_rate": 5.498e-06,
995
+ "loss": 2.6575,
996
+ "step": 2750
997
+ },
998
+ {
999
+ "epoch": 0.0555,
1000
+ "grad_norm": 3.4129666421130738,
1001
+ "learning_rate": 5.548e-06,
1002
+ "loss": 2.6624,
1003
+ "step": 2775
1004
+ },
1005
+ {
1006
+ "epoch": 0.056,
1007
+ "grad_norm": 2.5402178685422028,
1008
+ "learning_rate": 5.5980000000000004e-06,
1009
+ "loss": 2.6615,
1010
+ "step": 2800
1011
+ },
1012
+ {
1013
+ "epoch": 0.056,
1014
+ "eval_loss": 2.666015625,
1015
+ "eval_runtime": 42.1094,
1016
+ "eval_samples_per_second": 2.47,
1017
+ "eval_steps_per_second": 1.235,
1018
+ "step": 2800
1019
+ },
1020
+ {
1021
+ "epoch": 0.0565,
1022
+ "grad_norm": 2.5169534616209215,
1023
+ "learning_rate": 5.648e-06,
1024
+ "loss": 2.6745,
1025
+ "step": 2825
1026
+ },
1027
+ {
1028
+ "epoch": 0.057,
1029
+ "grad_norm": 2.4269096679582347,
1030
+ "learning_rate": 5.698e-06,
1031
+ "loss": 2.658,
1032
+ "step": 2850
1033
+ },
1034
+ {
1035
+ "epoch": 0.0575,
1036
+ "grad_norm": 2.2819396814928763,
1037
+ "learning_rate": 5.748e-06,
1038
+ "loss": 2.6694,
1039
+ "step": 2875
1040
+ },
1041
+ {
1042
+ "epoch": 0.058,
1043
+ "grad_norm": 3.0448163445232512,
1044
+ "learning_rate": 5.798e-06,
1045
+ "loss": 2.6587,
1046
+ "step": 2900
1047
+ },
1048
+ {
1049
+ "epoch": 0.058,
1050
+ "eval_loss": 2.662710428237915,
1051
+ "eval_runtime": 42.173,
1052
+ "eval_samples_per_second": 2.466,
1053
+ "eval_steps_per_second": 1.233,
1054
+ "step": 2900
1055
+ },
1056
+ {
1057
+ "epoch": 0.0585,
1058
+ "grad_norm": 3.2390472506289343,
1059
+ "learning_rate": 5.848000000000001e-06,
1060
+ "loss": 2.661,
1061
+ "step": 2925
1062
+ },
1063
+ {
1064
+ "epoch": 0.059,
1065
+ "grad_norm": 2.5836929915418194,
1066
+ "learning_rate": 5.898e-06,
1067
+ "loss": 2.6514,
1068
+ "step": 2950
1069
+ },
1070
+ {
1071
+ "epoch": 0.0595,
1072
+ "grad_norm": 2.5766876152500227,
1073
+ "learning_rate": 5.9480000000000005e-06,
1074
+ "loss": 2.6673,
1075
+ "step": 2975
1076
+ },
1077
+ {
1078
+ "epoch": 0.06,
1079
+ "grad_norm": 2.507842811667469,
1080
+ "learning_rate": 5.998000000000001e-06,
1081
+ "loss": 2.6658,
1082
+ "step": 3000
1083
+ },
1084
+ {
1085
+ "epoch": 0.06,
1086
+ "eval_loss": 2.659705638885498,
1087
+ "eval_runtime": 42.0906,
1088
+ "eval_samples_per_second": 2.471,
1089
+ "eval_steps_per_second": 1.235,
1090
+ "step": 3000
1091
+ },
1092
+ {
1093
+ "epoch": 0.0605,
1094
+ "grad_norm": 2.291724100817165,
1095
+ "learning_rate": 6.048e-06,
1096
+ "loss": 2.6588,
1097
+ "step": 3025
1098
+ },
1099
+ {
1100
+ "epoch": 0.061,
1101
+ "grad_norm": 2.356775687250912,
1102
+ "learning_rate": 6.098000000000001e-06,
1103
+ "loss": 2.6519,
1104
+ "step": 3050
1105
+ },
1106
+ {
1107
+ "epoch": 0.0615,
1108
+ "grad_norm": 3.6009374683805553,
1109
+ "learning_rate": 6.148e-06,
1110
+ "loss": 2.6581,
1111
+ "step": 3075
1112
+ },
1113
+ {
1114
+ "epoch": 0.062,
1115
+ "grad_norm": 3.2760170273305724,
1116
+ "learning_rate": 6.198000000000001e-06,
1117
+ "loss": 2.6588,
1118
+ "step": 3100
1119
+ },
1120
+ {
1121
+ "epoch": 0.062,
1122
+ "eval_loss": 2.656700611114502,
1123
+ "eval_runtime": 42.0325,
1124
+ "eval_samples_per_second": 2.474,
1125
+ "eval_steps_per_second": 1.237,
1126
+ "step": 3100
1127
+ },
1128
+ {
1129
+ "epoch": 0.0625,
1130
+ "grad_norm": 2.5849236998041825,
1131
+ "learning_rate": 6.248000000000001e-06,
1132
+ "loss": 2.6548,
1133
+ "step": 3125
1134
+ },
1135
+ {
1136
+ "epoch": 0.063,
1137
+ "grad_norm": 2.3095505880624474,
1138
+ "learning_rate": 6.2980000000000005e-06,
1139
+ "loss": 2.6511,
1140
+ "step": 3150
1141
+ },
1142
+ {
1143
+ "epoch": 0.0635,
1144
+ "grad_norm": 2.5258255422234996,
1145
+ "learning_rate": 6.348000000000001e-06,
1146
+ "loss": 2.6589,
1147
+ "step": 3175
1148
+ },
1149
+ {
1150
+ "epoch": 0.064,
1151
+ "grad_norm": 2.3520030773681335,
1152
+ "learning_rate": 6.398000000000001e-06,
1153
+ "loss": 2.6462,
1154
+ "step": 3200
1155
+ },
1156
+ {
1157
+ "epoch": 0.064,
1158
+ "eval_loss": 2.652644157409668,
1159
+ "eval_runtime": 42.2271,
1160
+ "eval_samples_per_second": 2.463,
1161
+ "eval_steps_per_second": 1.231,
1162
+ "step": 3200
1163
+ },
1164
+ {
1165
+ "epoch": 0.0645,
1166
+ "grad_norm": 2.457532178302885,
1167
+ "learning_rate": 6.448000000000001e-06,
1168
+ "loss": 2.6495,
1169
+ "step": 3225
1170
+ },
1171
+ {
1172
+ "epoch": 0.065,
1173
+ "grad_norm": 2.3328730844475833,
1174
+ "learning_rate": 6.498000000000001e-06,
1175
+ "loss": 2.6384,
1176
+ "step": 3250
1177
+ },
1178
+ {
1179
+ "epoch": 0.0655,
1180
+ "grad_norm": 2.382459769400574,
1181
+ "learning_rate": 6.548000000000001e-06,
1182
+ "loss": 2.652,
1183
+ "step": 3275
1184
+ },
1185
+ {
1186
+ "epoch": 0.066,
1187
+ "grad_norm": 2.4287460984943707,
1188
+ "learning_rate": 6.598000000000001e-06,
1189
+ "loss": 2.655,
1190
+ "step": 3300
1191
+ },
1192
+ {
1193
+ "epoch": 0.066,
1194
+ "eval_loss": 2.650841236114502,
1195
+ "eval_runtime": 42.1822,
1196
+ "eval_samples_per_second": 2.465,
1197
+ "eval_steps_per_second": 1.233,
1198
+ "step": 3300
1199
+ },
1200
+ {
1201
+ "epoch": 0.0665,
1202
+ "grad_norm": 3.0374923212376963,
1203
+ "learning_rate": 6.648e-06,
1204
+ "loss": 2.6623,
1205
+ "step": 3325
1206
+ },
1207
+ {
1208
+ "epoch": 0.067,
1209
+ "grad_norm": 2.3072135476674127,
1210
+ "learning_rate": 6.698e-06,
1211
+ "loss": 2.6484,
1212
+ "step": 3350
1213
+ },
1214
+ {
1215
+ "epoch": 0.0675,
1216
+ "grad_norm": 2.3676328206176778,
1217
+ "learning_rate": 6.7480000000000004e-06,
1218
+ "loss": 2.6569,
1219
+ "step": 3375
1220
+ },
1221
+ {
1222
+ "epoch": 0.068,
1223
+ "grad_norm": 2.313390296186245,
1224
+ "learning_rate": 6.798e-06,
1225
+ "loss": 2.6393,
1226
+ "step": 3400
1227
+ },
1228
+ {
1229
+ "epoch": 0.068,
1230
+ "eval_loss": 2.648888111114502,
1231
+ "eval_runtime": 44.6877,
1232
+ "eval_samples_per_second": 2.327,
1233
+ "eval_steps_per_second": 1.164,
1234
+ "step": 3400
1235
+ },
1236
+ {
1237
+ "epoch": 0.0685,
1238
+ "grad_norm": 2.9181668179248033,
1239
+ "learning_rate": 6.848e-06,
1240
+ "loss": 2.6521,
1241
+ "step": 3425
1242
+ },
1243
+ {
1244
+ "epoch": 0.069,
1245
+ "grad_norm": 2.1972242976901457,
1246
+ "learning_rate": 6.898e-06,
1247
+ "loss": 2.6605,
1248
+ "step": 3450
1249
+ },
1250
+ {
1251
+ "epoch": 0.0695,
1252
+ "grad_norm": 2.514104559780915,
1253
+ "learning_rate": 6.948e-06,
1254
+ "loss": 2.6444,
1255
+ "step": 3475
1256
+ },
1257
+ {
1258
+ "epoch": 0.07,
1259
+ "grad_norm": 2.463879404265904,
1260
+ "learning_rate": 6.998000000000001e-06,
1261
+ "loss": 2.6586,
1262
+ "step": 3500
1263
+ },
1264
+ {
1265
+ "epoch": 0.07,
1266
+ "eval_loss": 2.644831657409668,
1267
+ "eval_runtime": 45.1164,
1268
+ "eval_samples_per_second": 2.305,
1269
+ "eval_steps_per_second": 1.153,
1270
+ "step": 3500
1271
+ },
1272
+ {
1273
+ "epoch": 0.0705,
1274
+ "grad_norm": 2.4337078135824126,
1275
+ "learning_rate": 7.048e-06,
1276
+ "loss": 2.6463,
1277
+ "step": 3525
1278
+ },
1279
+ {
1280
+ "epoch": 0.071,
1281
+ "grad_norm": 2.2908199130690257,
1282
+ "learning_rate": 7.0980000000000005e-06,
1283
+ "loss": 2.655,
1284
+ "step": 3550
1285
+ },
1286
+ {
1287
+ "epoch": 0.0715,
1288
+ "grad_norm": 2.4093156448180713,
1289
+ "learning_rate": 7.148000000000001e-06,
1290
+ "loss": 2.6479,
1291
+ "step": 3575
1292
+ },
1293
+ {
1294
+ "epoch": 0.072,
1295
+ "grad_norm": 2.3128290328516172,
1296
+ "learning_rate": 7.198e-06,
1297
+ "loss": 2.6342,
1298
+ "step": 3600
1299
+ },
1300
+ {
1301
+ "epoch": 0.072,
1302
+ "eval_loss": 2.643179178237915,
1303
+ "eval_runtime": 43.1012,
1304
+ "eval_samples_per_second": 2.413,
1305
+ "eval_steps_per_second": 1.206,
1306
+ "step": 3600
1307
+ },
1308
+ {
1309
+ "epoch": 0.0725,
1310
+ "grad_norm": 2.7714344541916165,
1311
+ "learning_rate": 7.248000000000001e-06,
1312
+ "loss": 2.6337,
1313
+ "step": 3625
1314
+ },
1315
+ {
1316
+ "epoch": 0.073,
1317
+ "grad_norm": 2.8399095157670486,
1318
+ "learning_rate": 7.298e-06,
1319
+ "loss": 2.6413,
1320
+ "step": 3650
1321
+ },
1322
+ {
1323
+ "epoch": 0.0735,
1324
+ "grad_norm": 2.6867409675260747,
1325
+ "learning_rate": 7.348000000000001e-06,
1326
+ "loss": 2.6314,
1327
+ "step": 3675
1328
+ },
1329
+ {
1330
+ "epoch": 0.074,
1331
+ "grad_norm": 2.853697365081861,
1332
+ "learning_rate": 7.398000000000001e-06,
1333
+ "loss": 2.6372,
1334
+ "step": 3700
1335
+ },
1336
+ {
1337
+ "epoch": 0.074,
1338
+ "eval_loss": 2.639573335647583,
1339
+ "eval_runtime": 45.0291,
1340
+ "eval_samples_per_second": 2.31,
1341
+ "eval_steps_per_second": 1.155,
1342
+ "step": 3700
1343
+ },
1344
+ {
1345
+ "epoch": 0.0745,
1346
+ "grad_norm": 1.998706410316405,
1347
+ "learning_rate": 7.4480000000000005e-06,
1348
+ "loss": 2.637,
1349
+ "step": 3725
1350
+ },
1351
+ {
1352
+ "epoch": 0.075,
1353
+ "grad_norm": 2.3172883792227417,
1354
+ "learning_rate": 7.498000000000001e-06,
1355
+ "loss": 2.6386,
1356
+ "step": 3750
1357
+ },
1358
+ {
1359
+ "epoch": 0.0755,
1360
+ "grad_norm": 2.2578618031758793,
1361
+ "learning_rate": 7.548000000000001e-06,
1362
+ "loss": 2.637,
1363
+ "step": 3775
1364
+ },
1365
+ {
1366
+ "epoch": 0.076,
1367
+ "grad_norm": 2.022866842989095,
1368
+ "learning_rate": 7.598000000000001e-06,
1369
+ "loss": 2.6303,
1370
+ "step": 3800
1371
+ },
1372
+ {
1373
+ "epoch": 0.076,
1374
+ "eval_loss": 2.63671875,
1375
+ "eval_runtime": 45.1006,
1376
+ "eval_samples_per_second": 2.306,
1377
+ "eval_steps_per_second": 1.153,
1378
+ "step": 3800
1379
+ },
1380
+ {
1381
+ "epoch": 0.0765,
1382
+ "grad_norm": 2.6019929572001987,
1383
+ "learning_rate": 7.648e-06,
1384
+ "loss": 2.6359,
1385
+ "step": 3825
1386
+ },
1387
+ {
1388
+ "epoch": 0.077,
1389
+ "grad_norm": 2.1777094054397343,
1390
+ "learning_rate": 7.698000000000002e-06,
1391
+ "loss": 2.6397,
1392
+ "step": 3850
1393
+ },
1394
+ {
1395
+ "epoch": 0.0775,
1396
+ "grad_norm": 2.0323537115489474,
1397
+ "learning_rate": 7.748000000000001e-06,
1398
+ "loss": 2.6321,
1399
+ "step": 3875
1400
+ },
1401
+ {
1402
+ "epoch": 0.078,
1403
+ "grad_norm": 2.1502944909614037,
1404
+ "learning_rate": 7.798e-06,
1405
+ "loss": 2.6373,
1406
+ "step": 3900
1407
+ },
1408
+ {
1409
+ "epoch": 0.078,
1410
+ "eval_loss": 2.634765625,
1411
+ "eval_runtime": 44.8775,
1412
+ "eval_samples_per_second": 2.317,
1413
+ "eval_steps_per_second": 1.159,
1414
+ "step": 3900
1415
+ },
1416
+ {
1417
+ "epoch": 0.0785,
1418
+ "grad_norm": 2.2895713962575748,
1419
+ "learning_rate": 7.848000000000002e-06,
1420
+ "loss": 2.6325,
1421
+ "step": 3925
1422
+ },
1423
+ {
1424
+ "epoch": 0.079,
1425
+ "grad_norm": 2.473180320397106,
1426
+ "learning_rate": 7.898e-06,
1427
+ "loss": 2.6306,
1428
+ "step": 3950
1429
+ },
1430
+ {
1431
+ "epoch": 0.0795,
1432
+ "grad_norm": 2.5774486324856865,
1433
+ "learning_rate": 7.948e-06,
1434
+ "loss": 2.6345,
1435
+ "step": 3975
1436
+ },
1437
+ {
1438
+ "epoch": 0.08,
1439
+ "grad_norm": 2.282553852536701,
1440
+ "learning_rate": 7.998e-06,
1441
+ "loss": 2.641,
1442
+ "step": 4000
1443
+ },
1444
+ {
1445
+ "epoch": 0.08,
1446
+ "eval_loss": 2.630859375,
1447
+ "eval_runtime": 44.8187,
1448
+ "eval_samples_per_second": 2.32,
1449
+ "eval_steps_per_second": 1.16,
1450
+ "step": 4000
1451
+ },
1452
+ {
1453
+ "epoch": 0.0805,
1454
+ "grad_norm": 2.500864236641362,
1455
+ "learning_rate": 8.048e-06,
1456
+ "loss": 2.6309,
1457
+ "step": 4025
1458
+ },
1459
+ {
1460
+ "epoch": 0.081,
1461
+ "grad_norm": 2.5639376009370674,
1462
+ "learning_rate": 8.098000000000001e-06,
1463
+ "loss": 2.6211,
1464
+ "step": 4050
1465
+ },
1466
+ {
1467
+ "epoch": 0.0815,
1468
+ "grad_norm": 3.0035728334967926,
1469
+ "learning_rate": 8.148e-06,
1470
+ "loss": 2.6317,
1471
+ "step": 4075
1472
+ },
1473
+ {
1474
+ "epoch": 0.082,
1475
+ "grad_norm": 2.804391077504498,
1476
+ "learning_rate": 8.198e-06,
1477
+ "loss": 2.6273,
1478
+ "step": 4100
1479
+ },
1480
+ {
1481
+ "epoch": 0.082,
1482
+ "eval_loss": 2.627704381942749,
1483
+ "eval_runtime": 45.0778,
1484
+ "eval_samples_per_second": 2.307,
1485
+ "eval_steps_per_second": 1.154,
1486
+ "step": 4100
1487
+ },
1488
+ {
1489
+ "epoch": 0.0825,
1490
+ "grad_norm": 2.8025033751566975,
1491
+ "learning_rate": 8.248e-06,
1492
+ "loss": 2.6224,
1493
+ "step": 4125
1494
+ },
1495
+ {
1496
+ "epoch": 0.083,
1497
+ "grad_norm": 4.307364832973918,
1498
+ "learning_rate": 8.298000000000001e-06,
1499
+ "loss": 2.6217,
1500
+ "step": 4150
1501
+ },
1502
+ {
1503
+ "epoch": 0.0835,
1504
+ "grad_norm": 2.510945545421516,
1505
+ "learning_rate": 8.348e-06,
1506
+ "loss": 2.6158,
1507
+ "step": 4175
1508
+ },
1509
+ {
1510
+ "epoch": 0.084,
1511
+ "grad_norm": 2.874475964746802,
1512
+ "learning_rate": 8.398e-06,
1513
+ "loss": 2.6284,
1514
+ "step": 4200
1515
+ },
1516
+ {
1517
+ "epoch": 0.084,
1518
+ "eval_loss": 2.626352071762085,
1519
+ "eval_runtime": 44.9685,
1520
+ "eval_samples_per_second": 2.313,
1521
+ "eval_steps_per_second": 1.156,
1522
+ "step": 4200
1523
+ },
1524
+ {
1525
+ "epoch": 0.0845,
1526
+ "grad_norm": 2.687782456648974,
1527
+ "learning_rate": 8.448000000000001e-06,
1528
+ "loss": 2.613,
1529
+ "step": 4225
1530
+ },
1531
+ {
1532
+ "epoch": 0.085,
1533
+ "grad_norm": 2.290237147776631,
1534
+ "learning_rate": 8.498e-06,
1535
+ "loss": 2.6295,
1536
+ "step": 4250
1537
+ },
1538
+ {
1539
+ "epoch": 0.0855,
1540
+ "grad_norm": 2.5217231224578196,
1541
+ "learning_rate": 8.548e-06,
1542
+ "loss": 2.6194,
1543
+ "step": 4275
1544
+ },
1545
+ {
1546
+ "epoch": 0.086,
1547
+ "grad_norm": 2.478088396853028,
1548
+ "learning_rate": 8.598000000000001e-06,
1549
+ "loss": 2.6269,
1550
+ "step": 4300
1551
+ },
1552
+ {
1553
+ "epoch": 0.086,
1554
+ "eval_loss": 2.624098539352417,
1555
+ "eval_runtime": 45.0092,
1556
+ "eval_samples_per_second": 2.311,
1557
+ "eval_steps_per_second": 1.155,
1558
+ "step": 4300
1559
+ },
1560
+ {
1561
+ "epoch": 0.0865,
1562
+ "grad_norm": 3.160637138604565,
1563
+ "learning_rate": 8.648000000000001e-06,
1564
+ "loss": 2.6179,
1565
+ "step": 4325
1566
+ },
1567
+ {
1568
+ "epoch": 0.087,
1569
+ "grad_norm": 3.2730443987396787,
1570
+ "learning_rate": 8.698e-06,
1571
+ "loss": 2.6128,
1572
+ "step": 4350
1573
+ },
1574
+ {
1575
+ "epoch": 0.0875,
1576
+ "grad_norm": 2.1924980955006257,
1577
+ "learning_rate": 8.748000000000002e-06,
1578
+ "loss": 2.6237,
1579
+ "step": 4375
1580
+ },
1581
+ {
1582
+ "epoch": 0.088,
1583
+ "grad_norm": 2.2909495673616735,
1584
+ "learning_rate": 8.798000000000001e-06,
1585
+ "loss": 2.6183,
1586
+ "step": 4400
1587
+ },
1588
+ {
1589
+ "epoch": 0.088,
1590
+ "eval_loss": 2.622445821762085,
1591
+ "eval_runtime": 44.9844,
1592
+ "eval_samples_per_second": 2.312,
1593
+ "eval_steps_per_second": 1.156,
1594
+ "step": 4400
1595
+ },
1596
+ {
1597
+ "epoch": 0.0885,
1598
+ "grad_norm": 2.3275380340868543,
1599
+ "learning_rate": 8.848e-06,
1600
+ "loss": 2.6198,
1601
+ "step": 4425
1602
+ },
1603
+ {
1604
+ "epoch": 0.089,
1605
+ "grad_norm": 2.5451157769858135,
1606
+ "learning_rate": 8.898000000000002e-06,
1607
+ "loss": 2.6122,
1608
+ "step": 4450
1609
+ },
1610
+ {
1611
+ "epoch": 0.0895,
1612
+ "grad_norm": 2.626975380348867,
1613
+ "learning_rate": 8.948000000000001e-06,
1614
+ "loss": 2.6053,
1615
+ "step": 4475
1616
+ },
1617
+ {
1618
+ "epoch": 0.09,
1619
+ "grad_norm": 3.163525010125433,
1620
+ "learning_rate": 8.998000000000001e-06,
1621
+ "loss": 2.616,
1622
+ "step": 4500
1623
+ },
1624
+ {
1625
+ "epoch": 0.09,
1626
+ "eval_loss": 2.620342493057251,
1627
+ "eval_runtime": 45.1428,
1628
+ "eval_samples_per_second": 2.304,
1629
+ "eval_steps_per_second": 1.152,
1630
+ "step": 4500
1631
  }
1632
  ],
1633
  "logging_steps": 25,
 
1647
  "attributes": {}
1648
  }
1649
  },
1650
+ "total_flos": 1.0098986967950688e+19,
1651
  "train_batch_size": 1,
1652
  "trial_name": null,
1653
  "trial_params": null