sedrickkeh committed on
Commit
02681e3
·
verified ·
1 Parent(s): f21a992

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +3 -3
  3. train_results.json +3 -3
  4. trainer_state.json +195 -195
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: multiple_samples_none_numina_aime
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # multiple_samples_none_numina_aime
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: multiple_samples_none_numina_aime
 
16
 
17
  # multiple_samples_none_numina_aime
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/multiple_samples_none_numina_aime dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9725490196078432,
3
  "total_flos": 121055548211200.0,
4
- "train_loss": 0.7961960165273576,
5
- "train_runtime": 4088.556,
6
- "train_samples_per_second": 2.986,
7
  "train_steps_per_second": 0.031
8
  }
 
1
  {
2
  "epoch": 2.9725490196078432,
3
  "total_flos": 121055548211200.0,
4
+ "train_loss": 0.7961941848671625,
5
+ "train_runtime": 4090.1751,
6
+ "train_samples_per_second": 2.985,
7
  "train_steps_per_second": 0.031
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9725490196078432,
3
  "total_flos": 121055548211200.0,
4
- "train_loss": 0.7961960165273576,
5
- "train_runtime": 4088.556,
6
- "train_samples_per_second": 2.986,
7
  "train_steps_per_second": 0.031
8
  }
 
1
  {
2
  "epoch": 2.9725490196078432,
3
  "total_flos": 121055548211200.0,
4
+ "train_loss": 0.7961941848671625,
5
+ "train_runtime": 4090.1751,
6
+ "train_samples_per_second": 2.985,
7
  "train_steps_per_second": 0.031
8
  }
trainer_state.json CHANGED
@@ -10,882 +10,882 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.023529411764705882,
13
- "grad_norm": 6.901826858520508,
14
  "learning_rate": 7.692307692307694e-07,
15
  "loss": 1.1119,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.047058823529411764,
20
- "grad_norm": 7.422886371612549,
21
  "learning_rate": 1.5384615384615387e-06,
22
  "loss": 1.1537,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.07058823529411765,
27
- "grad_norm": 6.920129299163818,
28
  "learning_rate": 2.307692307692308e-06,
29
- "loss": 1.1205,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.09411764705882353,
34
- "grad_norm": 6.285982608795166,
35
  "learning_rate": 3.0769230769230774e-06,
36
- "loss": 1.0476,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.11764705882352941,
41
- "grad_norm": 5.431225299835205,
42
  "learning_rate": 3.846153846153847e-06,
43
  "loss": 1.0753,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.1411764705882353,
48
- "grad_norm": 3.6898839473724365,
49
  "learning_rate": 4.615384615384616e-06,
50
  "loss": 0.9761,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.16470588235294117,
55
- "grad_norm": 2.8562192916870117,
56
  "learning_rate": 5.384615384615385e-06,
57
- "loss": 0.9581,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.18823529411764706,
62
- "grad_norm": 4.019009590148926,
63
  "learning_rate": 6.153846153846155e-06,
64
- "loss": 1.0193,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.21176470588235294,
69
- "grad_norm": 4.8232526779174805,
70
  "learning_rate": 6.923076923076923e-06,
71
- "loss": 0.951,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.23529411764705882,
76
- "grad_norm": 4.59989595413208,
77
  "learning_rate": 7.692307692307694e-06,
78
- "loss": 1.0139,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.25882352941176473,
83
- "grad_norm": 3.852893829345703,
84
  "learning_rate": 8.461538461538462e-06,
85
- "loss": 0.9087,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.2823529411764706,
90
- "grad_norm": 3.7579197883605957,
91
  "learning_rate": 9.230769230769232e-06,
92
- "loss": 1.0375,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.3058823529411765,
97
- "grad_norm": 2.5401360988616943,
98
  "learning_rate": 1e-05,
99
  "loss": 0.9651,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.32941176470588235,
104
- "grad_norm": 2.095151424407959,
105
  "learning_rate": 9.998067787472772e-06,
106
  "loss": 0.9098,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.35294117647058826,
111
- "grad_norm": 2.1526248455047607,
112
  "learning_rate": 9.992272643269181e-06,
113
  "loss": 0.8308,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.3764705882352941,
118
- "grad_norm": 1.9617197513580322,
119
  "learning_rate": 9.982619046369321e-06,
120
  "loss": 0.9148,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.4,
125
- "grad_norm": 1.5976275205612183,
126
  "learning_rate": 9.96911445789354e-06,
127
- "loss": 0.8948,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.4235294117647059,
132
- "grad_norm": 1.484428882598877,
133
  "learning_rate": 9.951769315335843e-06,
134
- "loss": 0.8592,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.4470588235294118,
139
- "grad_norm": 1.4591351747512817,
140
  "learning_rate": 9.930597024496933e-06,
141
- "loss": 0.8315,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.47058823529411764,
146
- "grad_norm": 1.1599817276000977,
147
  "learning_rate": 9.905613949123036e-06,
148
- "loss": 0.808,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.49411764705882355,
153
- "grad_norm": 1.2381192445755005,
154
  "learning_rate": 9.87683939825864e-06,
155
  "loss": 0.8833,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.5176470588235295,
160
- "grad_norm": 1.25748872756958,
161
  "learning_rate": 9.844295611322804e-06,
162
- "loss": 0.873,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.5411764705882353,
167
- "grad_norm": 1.0262051820755005,
168
  "learning_rate": 9.808007740920647e-06,
169
- "loss": 0.7802,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.5647058823529412,
174
- "grad_norm": 0.9352391362190247,
175
  "learning_rate": 9.768003833403278e-06,
176
  "loss": 0.8134,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.5882352941176471,
181
- "grad_norm": 1.0994199514389038,
182
  "learning_rate": 9.724314807191197e-06,
183
- "loss": 0.8358,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.611764705882353,
188
- "grad_norm": 0.9108858704566956,
189
  "learning_rate": 9.6769744288779e-06,
190
  "loss": 0.8229,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.6352941176470588,
195
- "grad_norm": 0.7783969044685364,
196
  "learning_rate": 9.626019287132202e-06,
197
- "loss": 0.7928,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.6588235294117647,
202
- "grad_norm": 0.8140386343002319,
203
  "learning_rate": 9.571488764419381e-06,
204
  "loss": 0.8129,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.6823529411764706,
209
- "grad_norm": 0.7752570509910583,
210
  "learning_rate": 9.51342500656308e-06,
211
  "loss": 0.8572,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.7058823529411765,
216
- "grad_norm": 0.678372323513031,
217
  "learning_rate": 9.451872890171419e-06,
218
  "loss": 0.8103,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.7294117647058823,
223
- "grad_norm": 0.6800512075424194,
224
  "learning_rate": 9.386879987952549e-06,
225
- "loss": 0.897,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.7529411764705882,
230
- "grad_norm": 0.6305904388427734,
231
  "learning_rate": 9.318496531946411e-06,
232
  "loss": 0.818,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.7764705882352941,
237
- "grad_norm": 0.7293695211410522,
238
  "learning_rate": 9.246775374701139e-06,
239
  "loss": 0.8332,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.8,
244
- "grad_norm": 0.7706238031387329,
245
  "learning_rate": 9.171771948424138e-06,
246
- "loss": 0.8584,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.8235294117647058,
251
- "grad_norm": 0.6435885429382324,
252
  "learning_rate": 9.093544222139338e-06,
253
- "loss": 0.8726,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.8470588235294118,
258
- "grad_norm": 0.7461095452308655,
259
  "learning_rate": 9.012152656883824e-06,
260
- "loss": 0.7851,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.8705882352941177,
265
- "grad_norm": 0.8690148591995239,
266
  "learning_rate": 8.927660158978392e-06,
267
- "loss": 0.8347,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.8941176470588236,
272
- "grad_norm": 0.6226567625999451,
273
  "learning_rate": 8.84013203140821e-06,
274
- "loss": 0.8419,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.9176470588235294,
279
- "grad_norm": 0.603524923324585,
280
  "learning_rate": 8.749635923351108e-06,
281
  "loss": 0.776,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.9411764705882353,
286
- "grad_norm": 0.6303524374961853,
287
  "learning_rate": 8.656241777892544e-06,
288
  "loss": 0.7207,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.9647058823529412,
293
- "grad_norm": 0.5369915962219238,
294
  "learning_rate": 8.56002177796765e-06,
295
  "loss": 0.7694,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.9882352941176471,
300
- "grad_norm": 0.5721177458763123,
301
  "learning_rate": 8.461050290572114e-06,
302
- "loss": 0.7732,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.0156862745098039,
307
- "grad_norm": 1.0953840017318726,
308
  "learning_rate": 8.359403809285054e-06,
309
- "loss": 1.3119,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.0392156862745099,
314
- "grad_norm": 0.5467516183853149,
315
  "learning_rate": 8.255160895148263e-06,
316
- "loss": 0.7665,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 1.0627450980392157,
321
- "grad_norm": 0.6043545007705688,
322
  "learning_rate": 8.14840211594757e-06,
323
  "loss": 0.7368,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 1.0862745098039215,
328
- "grad_norm": 0.6263077259063721,
329
  "learning_rate": 8.039209983943201e-06,
330
  "loss": 0.7976,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 1.1098039215686275,
335
- "grad_norm": 0.5699981451034546,
336
  "learning_rate": 7.927668892097288e-06,
337
  "loss": 0.7109,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 1.1333333333333333,
342
- "grad_norm": 0.5554935336112976,
343
  "learning_rate": 7.81386504884782e-06,
344
  "loss": 0.738,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 1.156862745098039,
349
- "grad_norm": 0.7168362736701965,
350
  "learning_rate": 7.697886411479422e-06,
351
- "loss": 0.8269,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 1.1803921568627451,
356
- "grad_norm": 0.5106287002563477,
357
  "learning_rate": 7.579822618142505e-06,
358
  "loss": 0.7993,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 1.203921568627451,
363
- "grad_norm": 0.7224318385124207,
364
  "learning_rate": 7.459764918573264e-06,
365
- "loss": 0.8325,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 1.227450980392157,
370
- "grad_norm": 0.6035781502723694,
371
  "learning_rate": 7.3378061035681415e-06,
372
  "loss": 0.73,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 1.2509803921568627,
377
- "grad_norm": 0.5344979166984558,
378
  "learning_rate": 7.2140404332671986e-06,
379
- "loss": 0.7398,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 1.2745098039215685,
384
- "grad_norm": 0.6349881291389465,
385
  "learning_rate": 7.088563564301874e-06,
386
  "loss": 0.8289,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 1.2980392156862746,
391
- "grad_norm": 0.5782693028450012,
392
  "learning_rate": 6.961472475863406e-06,
393
- "loss": 0.7454,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 1.3215686274509804,
398
- "grad_norm": 0.4461568295955658,
399
  "learning_rate": 6.832865394749065e-06,
400
  "loss": 0.6694,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 1.3450980392156864,
405
- "grad_norm": 0.5781851410865784,
406
  "learning_rate": 6.702841719444141e-06,
407
  "loss": 0.8093,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 1.3686274509803922,
412
- "grad_norm": 0.48329582810401917,
413
  "learning_rate": 6.571501943298335e-06,
414
  "loss": 0.7096,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 1.392156862745098,
419
- "grad_norm": 0.6234760284423828,
420
  "learning_rate": 6.4389475768559675e-06,
421
- "loss": 0.8138,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 1.415686274509804,
426
- "grad_norm": 0.49070653319358826,
427
  "learning_rate": 6.305281069399989e-06,
428
- "loss": 0.6619,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 1.4392156862745098,
433
- "grad_norm": 0.49748146533966064,
434
  "learning_rate": 6.17060572977047e-06,
435
- "loss": 0.693,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 1.4627450980392158,
440
- "grad_norm": 0.5554483532905579,
441
  "learning_rate": 6.035025646518747e-06,
442
  "loss": 0.7561,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 1.4862745098039216,
447
- "grad_norm": 0.6127786040306091,
448
  "learning_rate": 5.898645607458941e-06,
449
- "loss": 0.7799,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 1.5098039215686274,
454
- "grad_norm": 0.5526847839355469,
455
  "learning_rate": 5.761571018679025e-06,
456
  "loss": 0.7374,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 1.5333333333333332,
461
- "grad_norm": 0.5685780644416809,
462
  "learning_rate": 5.623907823074044e-06,
463
- "loss": 0.8134,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 1.5568627450980392,
468
- "grad_norm": 0.4792926013469696,
469
  "learning_rate": 5.48576241846443e-06,
470
  "loss": 0.7933,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 1.5803921568627453,
475
- "grad_norm": 0.4758462607860565,
476
  "learning_rate": 5.347241575362729e-06,
477
  "loss": 0.7209,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 1.603921568627451,
482
- "grad_norm": 0.5107057690620422,
483
  "learning_rate": 5.208452354452275e-06,
484
- "loss": 0.7746,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 1.6274509803921569,
489
- "grad_norm": 0.4799031913280487,
490
  "learning_rate": 5.069502023841576e-06,
491
  "loss": 0.7635,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 1.6509803921568627,
496
- "grad_norm": 0.5203085541725159,
497
  "learning_rate": 4.9304979761584256e-06,
498
- "loss": 0.7708,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 1.6745098039215687,
503
- "grad_norm": 0.44460946321487427,
504
  "learning_rate": 4.791547645547727e-06,
505
  "loss": 0.6827,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 1.6980392156862745,
510
- "grad_norm": 0.5535275340080261,
511
  "learning_rate": 4.652758424637271e-06,
512
- "loss": 0.794,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 1.7215686274509805,
517
- "grad_norm": 0.4878956377506256,
518
  "learning_rate": 4.514237581535571e-06,
519
- "loss": 0.7368,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 1.7450980392156863,
524
- "grad_norm": 0.5016121864318848,
525
  "learning_rate": 4.3760921769259585e-06,
526
- "loss": 0.6936,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 1.768627450980392,
531
- "grad_norm": 0.5011301040649414,
532
  "learning_rate": 4.2384289813209754e-06,
533
- "loss": 0.7475,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 1.792156862745098,
538
- "grad_norm": 0.4553963243961334,
539
  "learning_rate": 4.101354392541061e-06,
540
- "loss": 0.7358,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 1.815686274509804,
545
- "grad_norm": 0.4620165228843689,
546
  "learning_rate": 3.964974353481254e-06,
547
- "loss": 0.7331,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 1.83921568627451,
552
- "grad_norm": 0.4453507363796234,
553
  "learning_rate": 3.829394270229531e-06,
554
- "loss": 0.7295,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 1.8627450980392157,
559
- "grad_norm": 0.402537077665329,
560
  "learning_rate": 3.694718930600012e-06,
561
- "loss": 0.642,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 1.8862745098039215,
566
- "grad_norm": 0.4362320601940155,
567
  "learning_rate": 3.5610524231440324e-06,
568
- "loss": 0.7889,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 1.9098039215686273,
573
- "grad_norm": 0.43875452876091003,
574
  "learning_rate": 3.428498056701665e-06,
575
  "loss": 0.7499,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 1.9333333333333333,
580
- "grad_norm": 0.43475160002708435,
581
  "learning_rate": 3.2971582805558622e-06,
582
- "loss": 0.7663,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 1.9568627450980394,
587
- "grad_norm": 0.46115896105766296,
588
  "learning_rate": 3.167134605250938e-06,
589
- "loss": 0.7652,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 1.9803921568627452,
594
- "grad_norm": 0.4670518934726715,
595
  "learning_rate": 3.0385275241365965e-06,
596
- "loss": 0.7709,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 2.007843137254902,
601
- "grad_norm": 0.8610158562660217,
602
  "learning_rate": 2.9114364356981274e-06,
603
  "loss": 1.2373,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 2.0313725490196077,
608
- "grad_norm": 0.3888493478298187,
609
  "learning_rate": 2.7859595667328027e-06,
610
  "loss": 0.7255,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 2.0549019607843135,
615
- "grad_norm": 0.42477184534072876,
616
  "learning_rate": 2.6621938964318593e-06,
617
  "loss": 0.6407,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 2.0784313725490198,
622
- "grad_norm": 0.4506017863750458,
623
  "learning_rate": 2.5402350814267364e-06,
624
- "loss": 0.6873,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 2.1019607843137256,
629
- "grad_norm": 0.48074784874916077,
630
  "learning_rate": 2.4201773818574956e-06,
631
- "loss": 0.6542,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 2.1254901960784314,
636
- "grad_norm": 0.45760810375213623,
637
  "learning_rate": 2.302113588520578e-06,
638
  "loss": 0.6809,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 2.149019607843137,
643
- "grad_norm": 0.3958469033241272,
644
  "learning_rate": 2.1861349511521817e-06,
645
- "loss": 0.6087,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 2.172549019607843,
650
- "grad_norm": 0.479245662689209,
651
  "learning_rate": 2.072331107902713e-06,
652
- "loss": 0.9135,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 2.196078431372549,
657
- "grad_norm": 0.4283719062805176,
658
  "learning_rate": 1.960790016056801e-06,
659
- "loss": 0.6736,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 2.219607843137255,
664
- "grad_norm": 0.4299345314502716,
665
  "learning_rate": 1.8515978840524302e-06,
666
  "loss": 0.6972,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 2.243137254901961,
671
- "grad_norm": 0.45547375082969666,
672
  "learning_rate": 1.7448391048517378e-06,
673
- "loss": 0.7224,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 2.2666666666666666,
678
- "grad_norm": 0.40750595927238464,
679
  "learning_rate": 1.640596190714947e-06,
680
  "loss": 0.7225,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 2.2901960784313724,
685
- "grad_norm": 0.43789613246917725,
686
  "learning_rate": 1.5389497094278861e-06,
687
  "loss": 0.7208,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 2.313725490196078,
692
- "grad_norm": 0.4415332078933716,
693
  "learning_rate": 1.4399782220323515e-06,
694
- "loss": 0.6706,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 2.3372549019607844,
699
- "grad_norm": 0.49981608986854553,
700
  "learning_rate": 1.3437582221074574e-06,
701
- "loss": 0.7925,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 2.3607843137254902,
706
- "grad_norm": 0.48716047406196594,
707
  "learning_rate": 1.250364076648894e-06,
708
- "loss": 0.7385,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 2.384313725490196,
713
- "grad_norm": 0.3869420886039734,
714
  "learning_rate": 1.1598679685917901e-06,
715
- "loss": 0.6665,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 2.407843137254902,
720
- "grad_norm": 0.4081011116504669,
721
  "learning_rate": 1.0723398410216085e-06,
722
- "loss": 0.8291,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 2.431372549019608,
727
- "grad_norm": 0.4056829512119293,
728
  "learning_rate": 9.878473431161767e-07,
729
- "loss": 0.6668,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 2.454901960784314,
734
- "grad_norm": 0.36377865076065063,
735
  "learning_rate": 9.064557778606631e-07,
736
  "loss": 0.6017,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 2.4784313725490197,
741
- "grad_norm": 0.43392133712768555,
742
  "learning_rate": 8.282280515758639e-07,
743
- "loss": 0.7824,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 2.5019607843137255,
748
- "grad_norm": 0.47024935483932495,
749
  "learning_rate": 7.532246252988617e-07,
750
  "loss": 0.7446,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 2.5254901960784313,
755
- "grad_norm": 0.425630658864975,
756
  "learning_rate": 6.815034680535915e-07,
757
  "loss": 0.713,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 2.549019607843137,
762
- "grad_norm": 0.40889060497283936,
763
  "learning_rate": 6.131200120474512e-07,
764
  "loss": 0.7409,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 2.572549019607843,
769
- "grad_norm": 0.40218353271484375,
770
  "learning_rate": 5.481271098285818e-07,
771
  "loss": 0.7501,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 2.596078431372549,
776
- "grad_norm": 0.35820406675338745,
777
  "learning_rate": 4.865749934369224e-07,
778
  "loss": 0.6082,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 2.619607843137255,
783
- "grad_norm": 0.42070600390434265,
784
  "learning_rate": 4.2851123558061927e-07,
785
- "loss": 0.7516,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 2.6431372549019607,
790
- "grad_norm": 0.36287838220596313,
791
  "learning_rate": 3.739807128677986e-07,
792
  "loss": 0.6589,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 2.6666666666666665,
797
- "grad_norm": 0.3751106560230255,
798
  "learning_rate": 3.230255711220992e-07,
799
- "loss": 0.7008,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 2.6901960784313728,
804
- "grad_norm": 0.39373522996902466,
805
  "learning_rate": 2.756851928088056e-07,
806
  "loss": 0.7579,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 2.7137254901960786,
811
- "grad_norm": 0.3703792989253998,
812
  "learning_rate": 2.3199616659672352e-07,
813
- "loss": 0.8005,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 2.7372549019607844,
818
- "grad_norm": 0.3416251242160797,
819
  "learning_rate": 1.9199225907935492e-07,
820
  "loss": 0.6913,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 2.76078431372549,
825
- "grad_norm": 0.3666976988315582,
826
  "learning_rate": 1.5570438867719695e-07,
827
  "loss": 0.6749,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 2.784313725490196,
832
- "grad_norm": 0.41929343342781067,
833
  "learning_rate": 1.2316060174136e-07,
834
  "loss": 0.93,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 2.8078431372549018,
839
- "grad_norm": 0.3298085033893585,
840
  "learning_rate": 9.43860508769645e-08,
841
  "loss": 0.5853,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 2.831372549019608,
846
- "grad_norm": 0.4238205552101135,
847
  "learning_rate": 6.940297550306895e-08,
848
  "loss": 0.7548,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 2.854901960784314,
853
- "grad_norm": 0.37445592880249023,
854
  "learning_rate": 4.823068466415615e-08,
855
- "loss": 0.7453,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 2.8784313725490196,
860
- "grad_norm": 0.42801082134246826,
861
  "learning_rate": 3.088554210646133e-08,
862
- "loss": 0.8001,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 2.9019607843137254,
867
- "grad_norm": 0.3497636616230011,
868
  "learning_rate": 1.7380953630678488e-08,
869
  "loss": 0.7289,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 2.9254901960784316,
874
- "grad_norm": 0.4116727411746979,
875
  "learning_rate": 7.727356730820035e-09,
876
  "loss": 0.6974,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 2.9490196078431374,
881
- "grad_norm": 0.3742615282535553,
882
  "learning_rate": 1.9322125272297488e-09,
883
  "loss": 0.765,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 2.9725490196078432,
888
- "grad_norm": 0.36835694313049316,
889
  "learning_rate": 0.0,
890
  "loss": 0.6713,
891
  "step": 126
@@ -894,9 +894,9 @@
894
  "epoch": 2.9725490196078432,
895
  "step": 126,
896
  "total_flos": 121055548211200.0,
897
- "train_loss": 0.7961960165273576,
898
- "train_runtime": 4088.556,
899
- "train_samples_per_second": 2.986,
900
  "train_steps_per_second": 0.031
901
  }
902
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.023529411764705882,
13
+ "grad_norm": 6.901778221130371,
14
  "learning_rate": 7.692307692307694e-07,
15
  "loss": 1.1119,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.047058823529411764,
20
+ "grad_norm": 7.4228010177612305,
21
  "learning_rate": 1.5384615384615387e-06,
22
  "loss": 1.1537,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.07058823529411765,
27
+ "grad_norm": 6.921288967132568,
28
  "learning_rate": 2.307692307692308e-06,
29
+ "loss": 1.1207,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.09411764705882353,
34
+ "grad_norm": 6.287109851837158,
35
  "learning_rate": 3.0769230769230774e-06,
36
+ "loss": 1.0477,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.11764705882352941,
41
+ "grad_norm": 5.439420700073242,
42
  "learning_rate": 3.846153846153847e-06,
43
  "loss": 1.0753,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.1411764705882353,
48
+ "grad_norm": 3.6856906414031982,
49
  "learning_rate": 4.615384615384616e-06,
50
  "loss": 0.9761,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.16470588235294117,
55
+ "grad_norm": 2.859872341156006,
56
  "learning_rate": 5.384615384615385e-06,
57
+ "loss": 0.958,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.18823529411764706,
62
+ "grad_norm": 4.026933670043945,
63
  "learning_rate": 6.153846153846155e-06,
64
+ "loss": 1.0194,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.21176470588235294,
69
+ "grad_norm": 4.821041107177734,
70
  "learning_rate": 6.923076923076923e-06,
71
+ "loss": 0.9508,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.23529411764705882,
76
+ "grad_norm": 4.604616641998291,
77
  "learning_rate": 7.692307692307694e-06,
78
+ "loss": 1.0138,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.25882352941176473,
83
+ "grad_norm": 3.8580994606018066,
84
  "learning_rate": 8.461538461538462e-06,
85
+ "loss": 0.9089,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.2823529411764706,
90
+ "grad_norm": 3.7659692764282227,
91
  "learning_rate": 9.230769230769232e-06,
92
+ "loss": 1.0376,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.3058823529411765,
97
+ "grad_norm": 2.5426435470581055,
98
  "learning_rate": 1e-05,
99
  "loss": 0.9651,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.32941176470588235,
104
+ "grad_norm": 2.092756986618042,
105
  "learning_rate": 9.998067787472772e-06,
106
  "loss": 0.9098,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.35294117647058826,
111
+ "grad_norm": 2.1536059379577637,
112
  "learning_rate": 9.992272643269181e-06,
113
  "loss": 0.8308,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.3764705882352941,
118
+ "grad_norm": 1.9728121757507324,
119
  "learning_rate": 9.982619046369321e-06,
120
  "loss": 0.9148,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.4,
125
+ "grad_norm": 1.6017658710479736,
126
  "learning_rate": 9.96911445789354e-06,
127
+ "loss": 0.8949,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.4235294117647059,
132
+ "grad_norm": 1.4864757061004639,
133
  "learning_rate": 9.951769315335843e-06,
134
+ "loss": 0.8593,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.4470588235294118,
139
+ "grad_norm": 1.447627305984497,
140
  "learning_rate": 9.930597024496933e-06,
141
+ "loss": 0.8316,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.47058823529411764,
146
+ "grad_norm": 1.1549137830734253,
147
  "learning_rate": 9.905613949123036e-06,
148
+ "loss": 0.8079,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.49411764705882355,
153
+ "grad_norm": 1.2319858074188232,
154
  "learning_rate": 9.87683939825864e-06,
155
  "loss": 0.8833,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.5176470588235295,
160
+ "grad_norm": 1.2558043003082275,
161
  "learning_rate": 9.844295611322804e-06,
162
+ "loss": 0.8729,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.5411764705882353,
167
+ "grad_norm": 1.0202245712280273,
168
  "learning_rate": 9.808007740920647e-06,
169
+ "loss": 0.7801,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.5647058823529412,
174
+ "grad_norm": 0.9334889054298401,
175
  "learning_rate": 9.768003833403278e-06,
176
  "loss": 0.8134,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.5882352941176471,
181
+ "grad_norm": 1.1106406450271606,
182
  "learning_rate": 9.724314807191197e-06,
183
+ "loss": 0.8359,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.611764705882353,
188
+ "grad_norm": 0.9234170317649841,
189
  "learning_rate": 9.6769744288779e-06,
190
  "loss": 0.8229,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.6352941176470588,
195
+ "grad_norm": 0.7843503355979919,
196
  "learning_rate": 9.626019287132202e-06,
197
+ "loss": 0.7927,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.6588235294117647,
202
+ "grad_norm": 0.8040333390235901,
203
  "learning_rate": 9.571488764419381e-06,
204
  "loss": 0.8129,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.6823529411764706,
209
+ "grad_norm": 0.7696279287338257,
210
  "learning_rate": 9.51342500656308e-06,
211
  "loss": 0.8572,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.7058823529411765,
216
+ "grad_norm": 0.6732362508773804,
217
  "learning_rate": 9.451872890171419e-06,
218
  "loss": 0.8103,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.7294117647058823,
223
+ "grad_norm": 0.6914750337600708,
224
  "learning_rate": 9.386879987952549e-06,
225
+ "loss": 0.8969,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.7529411764705882,
230
+ "grad_norm": 0.6257173418998718,
231
  "learning_rate": 9.318496531946411e-06,
232
  "loss": 0.818,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.7764705882352941,
237
+ "grad_norm": 0.7246304750442505,
238
  "learning_rate": 9.246775374701139e-06,
239
  "loss": 0.8332,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.8,
244
+ "grad_norm": 0.7830196022987366,
245
  "learning_rate": 9.171771948424138e-06,
246
+ "loss": 0.8585,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.8235294117647058,
251
+ "grad_norm": 0.653729259967804,
252
  "learning_rate": 9.093544222139338e-06,
253
+ "loss": 0.8725,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.8470588235294118,
258
+ "grad_norm": 0.742987871170044,
259
  "learning_rate": 9.012152656883824e-06,
260
+ "loss": 0.785,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.8705882352941177,
265
+ "grad_norm": 0.8727246522903442,
266
  "learning_rate": 8.927660158978392e-06,
267
+ "loss": 0.8348,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.8941176470588236,
272
+ "grad_norm": 0.6196137070655823,
273
  "learning_rate": 8.84013203140821e-06,
274
+ "loss": 0.8418,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.9176470588235294,
279
+ "grad_norm": 0.610687255859375,
280
  "learning_rate": 8.749635923351108e-06,
281
  "loss": 0.776,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.9411764705882353,
286
+ "grad_norm": 0.6278626322746277,
287
  "learning_rate": 8.656241777892544e-06,
288
  "loss": 0.7207,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.9647058823529412,
293
+ "grad_norm": 0.5417529344558716,
294
  "learning_rate": 8.56002177796765e-06,
295
  "loss": 0.7694,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.9882352941176471,
300
+ "grad_norm": 0.5784769654273987,
301
  "learning_rate": 8.461050290572114e-06,
302
+ "loss": 0.7733,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 1.0156862745098039,
307
+ "grad_norm": 1.1008639335632324,
308
  "learning_rate": 8.359403809285054e-06,
309
+ "loss": 1.312,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 1.0392156862745099,
314
+ "grad_norm": 0.5411049723625183,
315
  "learning_rate": 8.255160895148263e-06,
316
+ "loss": 0.7666,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 1.0627450980392157,
321
+ "grad_norm": 0.602947473526001,
322
  "learning_rate": 8.14840211594757e-06,
323
  "loss": 0.7368,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 1.0862745098039215,
328
+ "grad_norm": 0.6235800385475159,
329
  "learning_rate": 8.039209983943201e-06,
330
  "loss": 0.7976,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 1.1098039215686275,
335
+ "grad_norm": 0.569098174571991,
336
  "learning_rate": 7.927668892097288e-06,
337
  "loss": 0.7109,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 1.1333333333333333,
342
+ "grad_norm": 0.5677081942558289,
343
  "learning_rate": 7.81386504884782e-06,
344
  "loss": 0.738,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 1.156862745098039,
349
+ "grad_norm": 0.7025531530380249,
350
  "learning_rate": 7.697886411479422e-06,
351
+ "loss": 0.8267,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 1.1803921568627451,
356
+ "grad_norm": 0.5158477425575256,
357
  "learning_rate": 7.579822618142505e-06,
358
  "loss": 0.7993,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 1.203921568627451,
363
+ "grad_norm": 0.7206972241401672,
364
  "learning_rate": 7.459764918573264e-06,
365
+ "loss": 0.8324,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 1.227450980392157,
370
+ "grad_norm": 0.5990767478942871,
371
  "learning_rate": 7.3378061035681415e-06,
372
  "loss": 0.73,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 1.2509803921568627,
377
+ "grad_norm": 0.5361316204071045,
378
  "learning_rate": 7.2140404332671986e-06,
379
+ "loss": 0.7399,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 1.2745098039215685,
384
+ "grad_norm": 0.6394158601760864,
385
  "learning_rate": 7.088563564301874e-06,
386
  "loss": 0.8289,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 1.2980392156862746,
391
+ "grad_norm": 0.5778906345367432,
392
  "learning_rate": 6.961472475863406e-06,
393
+ "loss": 0.7455,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 1.3215686274509804,
398
+ "grad_norm": 0.453545480966568,
399
  "learning_rate": 6.832865394749065e-06,
400
  "loss": 0.6694,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 1.3450980392156864,
405
+ "grad_norm": 0.5790985226631165,
406
  "learning_rate": 6.702841719444141e-06,
407
  "loss": 0.8093,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 1.3686274509803922,
412
+ "grad_norm": 0.48918935656547546,
413
  "learning_rate": 6.571501943298335e-06,
414
  "loss": 0.7096,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 1.392156862745098,
419
+ "grad_norm": 0.6267134547233582,
420
  "learning_rate": 6.4389475768559675e-06,
421
+ "loss": 0.814,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 1.415686274509804,
426
+ "grad_norm": 0.4953418970108032,
427
  "learning_rate": 6.305281069399989e-06,
428
+ "loss": 0.6618,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 1.4392156862745098,
433
+ "grad_norm": 0.5042392611503601,
434
  "learning_rate": 6.17060572977047e-06,
435
+ "loss": 0.6929,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 1.4627450980392158,
440
+ "grad_norm": 0.5538609027862549,
441
  "learning_rate": 6.035025646518747e-06,
442
  "loss": 0.7561,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 1.4862745098039216,
447
+ "grad_norm": 0.6071920394897461,
448
  "learning_rate": 5.898645607458941e-06,
449
+ "loss": 0.7797,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 1.5098039215686274,
454
+ "grad_norm": 0.5658022165298462,
455
  "learning_rate": 5.761571018679025e-06,
456
  "loss": 0.7374,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 1.5333333333333332,
461
+ "grad_norm": 0.5699365139007568,
462
  "learning_rate": 5.623907823074044e-06,
463
+ "loss": 0.8136,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 1.5568627450980392,
468
+ "grad_norm": 0.47883421182632446,
469
  "learning_rate": 5.48576241846443e-06,
470
  "loss": 0.7933,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 1.5803921568627453,
475
+ "grad_norm": 0.476575642824173,
476
  "learning_rate": 5.347241575362729e-06,
477
  "loss": 0.7209,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 1.603921568627451,
482
+ "grad_norm": 0.5210950970649719,
483
  "learning_rate": 5.208452354452275e-06,
484
+ "loss": 0.7747,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 1.6274509803921569,
489
+ "grad_norm": 0.47601795196533203,
490
  "learning_rate": 5.069502023841576e-06,
491
  "loss": 0.7635,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 1.6509803921568627,
496
+ "grad_norm": 0.521950364112854,
497
  "learning_rate": 4.9304979761584256e-06,
498
+ "loss": 0.7707,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 1.6745098039215687,
503
+ "grad_norm": 0.4452243745326996,
504
  "learning_rate": 4.791547645547727e-06,
505
  "loss": 0.6827,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 1.6980392156862745,
510
+ "grad_norm": 0.5476846098899841,
511
  "learning_rate": 4.652758424637271e-06,
512
+ "loss": 0.7939,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 1.7215686274509805,
517
+ "grad_norm": 0.480186402797699,
518
  "learning_rate": 4.514237581535571e-06,
519
+ "loss": 0.7367,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 1.7450980392156863,
524
+ "grad_norm": 0.5076435804367065,
525
  "learning_rate": 4.3760921769259585e-06,
526
+ "loss": 0.6935,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 1.768627450980392,
531
+ "grad_norm": 0.5008230805397034,
532
  "learning_rate": 4.2384289813209754e-06,
533
+ "loss": 0.7474,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 1.792156862745098,
538
+ "grad_norm": 0.44636473059654236,
539
  "learning_rate": 4.101354392541061e-06,
540
+ "loss": 0.7357,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 1.815686274509804,
545
+ "grad_norm": 0.4537220597267151,
546
  "learning_rate": 3.964974353481254e-06,
547
+ "loss": 0.7329,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 1.83921568627451,
552
+ "grad_norm": 0.4459396302700043,
553
  "learning_rate": 3.829394270229531e-06,
554
+ "loss": 0.7294,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 1.8627450980392157,
559
+ "grad_norm": 0.3966139256954193,
560
  "learning_rate": 3.694718930600012e-06,
561
+ "loss": 0.6419,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 1.8862745098039215,
566
+ "grad_norm": 0.4331508278846741,
567
  "learning_rate": 3.5610524231440324e-06,
568
+ "loss": 0.789,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 1.9098039215686273,
573
+ "grad_norm": 0.44077518582344055,
574
  "learning_rate": 3.428498056701665e-06,
575
  "loss": 0.7499,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 1.9333333333333333,
580
+ "grad_norm": 0.428218275308609,
581
  "learning_rate": 3.2971582805558622e-06,
582
+ "loss": 0.7664,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 1.9568627450980394,
587
+ "grad_norm": 0.45457276701927185,
588
  "learning_rate": 3.167134605250938e-06,
589
+ "loss": 0.7651,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 1.9803921568627452,
594
+ "grad_norm": 0.4581441283226013,
595
  "learning_rate": 3.0385275241365965e-06,
596
+ "loss": 0.771,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 2.007843137254902,
601
+ "grad_norm": 0.8593717813491821,
602
  "learning_rate": 2.9114364356981274e-06,
603
  "loss": 1.2373,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 2.0313725490196077,
608
+ "grad_norm": 0.38274502754211426,
609
  "learning_rate": 2.7859595667328027e-06,
610
  "loss": 0.7255,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 2.0549019607843135,
615
+ "grad_norm": 0.4255603551864624,
616
  "learning_rate": 2.6621938964318593e-06,
617
  "loss": 0.6407,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 2.0784313725490198,
622
+ "grad_norm": 0.4372337758541107,
623
  "learning_rate": 2.5402350814267364e-06,
624
+ "loss": 0.6874,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 2.1019607843137256,
629
+ "grad_norm": 0.4747924506664276,
630
  "learning_rate": 2.4201773818574956e-06,
631
+ "loss": 0.654,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 2.1254901960784314,
636
+ "grad_norm": 0.45802468061447144,
637
  "learning_rate": 2.302113588520578e-06,
638
  "loss": 0.6809,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 2.149019607843137,
643
+ "grad_norm": 0.39464399218559265,
644
  "learning_rate": 2.1861349511521817e-06,
645
+ "loss": 0.6086,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 2.172549019607843,
650
+ "grad_norm": 0.4720733165740967,
651
  "learning_rate": 2.072331107902713e-06,
652
+ "loss": 0.9134,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 2.196078431372549,
657
+ "grad_norm": 0.4298717677593231,
658
  "learning_rate": 1.960790016056801e-06,
659
+ "loss": 0.6735,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 2.219607843137255,
664
+ "grad_norm": 0.4402537941932678,
665
  "learning_rate": 1.8515978840524302e-06,
666
  "loss": 0.6972,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 2.243137254901961,
671
+ "grad_norm": 0.45702147483825684,
672
  "learning_rate": 1.7448391048517378e-06,
673
+ "loss": 0.7225,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 2.2666666666666666,
678
+ "grad_norm": 0.4096600413322449,
679
  "learning_rate": 1.640596190714947e-06,
680
  "loss": 0.7225,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 2.2901960784313724,
685
+ "grad_norm": 0.43652820587158203,
686
  "learning_rate": 1.5389497094278861e-06,
687
  "loss": 0.7208,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 2.313725490196078,
692
+ "grad_norm": 0.44025924801826477,
693
  "learning_rate": 1.4399782220323515e-06,
694
+ "loss": 0.6707,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 2.3372549019607844,
699
+ "grad_norm": 0.5045623779296875,
700
  "learning_rate": 1.3437582221074574e-06,
701
+ "loss": 0.7924,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 2.3607843137254902,
706
+ "grad_norm": 0.48436567187309265,
707
  "learning_rate": 1.250364076648894e-06,
708
+ "loss": 0.7384,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 2.384313725490196,
713
+ "grad_norm": 0.3896447718143463,
714
  "learning_rate": 1.1598679685917901e-06,
715
+ "loss": 0.6666,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 2.407843137254902,
720
+ "grad_norm": 0.40689510107040405,
721
  "learning_rate": 1.0723398410216085e-06,
722
+ "loss": 0.8292,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 2.431372549019608,
727
+ "grad_norm": 0.40202853083610535,
728
  "learning_rate": 9.878473431161767e-07,
729
+ "loss": 0.6669,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 2.454901960784314,
734
+ "grad_norm": 0.3605956733226776,
735
  "learning_rate": 9.064557778606631e-07,
736
  "loss": 0.6017,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 2.4784313725490197,
741
+ "grad_norm": 0.4316107928752899,
742
  "learning_rate": 8.282280515758639e-07,
743
+ "loss": 0.7825,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 2.5019607843137255,
748
+ "grad_norm": 0.46594148874282837,
749
  "learning_rate": 7.532246252988617e-07,
750
  "loss": 0.7446,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 2.5254901960784313,
755
+ "grad_norm": 0.42160096764564514,
756
  "learning_rate": 6.815034680535915e-07,
757
  "loss": 0.713,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 2.549019607843137,
762
+ "grad_norm": 0.4095713794231415,
763
  "learning_rate": 6.131200120474512e-07,
764
  "loss": 0.7409,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 2.572549019607843,
769
+ "grad_norm": 0.40359240770339966,
770
  "learning_rate": 5.481271098285818e-07,
771
  "loss": 0.7501,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 2.596078431372549,
776
+ "grad_norm": 0.3566288352012634,
777
  "learning_rate": 4.865749934369224e-07,
778
  "loss": 0.6082,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 2.619607843137255,
783
+ "grad_norm": 0.417287141084671,
784
  "learning_rate": 4.2851123558061927e-07,
785
+ "loss": 0.7517,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 2.6431372549019607,
790
+ "grad_norm": 0.36074298620224,
791
  "learning_rate": 3.739807128677986e-07,
792
  "loss": 0.6589,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 2.6666666666666665,
797
+ "grad_norm": 0.37484005093574524,
798
  "learning_rate": 3.230255711220992e-07,
799
+ "loss": 0.7009,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 2.6901960784313728,
804
+ "grad_norm": 0.3959641754627228,
805
  "learning_rate": 2.756851928088056e-07,
806
  "loss": 0.7579,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 2.7137254901960786,
811
+ "grad_norm": 0.3696475625038147,
812
  "learning_rate": 2.3199616659672352e-07,
813
+ "loss": 0.8004,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 2.7372549019607844,
818
+ "grad_norm": 0.3429110646247864,
819
  "learning_rate": 1.9199225907935492e-07,
820
  "loss": 0.6913,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 2.76078431372549,
825
+ "grad_norm": 0.36563417315483093,
826
  "learning_rate": 1.5570438867719695e-07,
827
  "loss": 0.6749,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 2.784313725490196,
832
+ "grad_norm": 0.41839203238487244,
833
  "learning_rate": 1.2316060174136e-07,
834
  "loss": 0.93,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 2.8078431372549018,
839
+ "grad_norm": 0.329913854598999,
840
  "learning_rate": 9.43860508769645e-08,
841
  "loss": 0.5853,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 2.831372549019608,
846
+ "grad_norm": 0.44181394577026367,
847
  "learning_rate": 6.940297550306895e-08,
848
  "loss": 0.7548,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 2.854901960784314,
853
+ "grad_norm": 0.3787192702293396,
854
  "learning_rate": 4.823068466415615e-08,
855
+ "loss": 0.7454,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 2.8784313725490196,
860
+ "grad_norm": 0.4256850481033325,
861
  "learning_rate": 3.088554210646133e-08,
862
+ "loss": 0.8,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 2.9019607843137254,
867
+ "grad_norm": 0.3509824573993683,
868
  "learning_rate": 1.7380953630678488e-08,
869
  "loss": 0.7289,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 2.9254901960784316,
874
+ "grad_norm": 0.4128064513206482,
875
  "learning_rate": 7.727356730820035e-09,
876
  "loss": 0.6974,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 2.9490196078431374,
881
+ "grad_norm": 0.36814892292022705,
882
  "learning_rate": 1.9322125272297488e-09,
883
  "loss": 0.765,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 2.9725490196078432,
888
+ "grad_norm": 0.3678930401802063,
889
  "learning_rate": 0.0,
890
  "loss": 0.6713,
891
  "step": 126
 
894
  "epoch": 2.9725490196078432,
895
  "step": 126,
896
  "total_flos": 121055548211200.0,
897
+ "train_loss": 0.7961941848671625,
898
+ "train_runtime": 4090.1751,
899
+ "train_samples_per_second": 2.985,
900
  "train_steps_per_second": 0.031
901
  }
902
  ],
training_loss.png CHANGED