rbelanec committed on
Commit
224790f
verified
1 Parent(s): 4d51061

End of training

Browse files
README.md CHANGED
@@ -17,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # test
19
 
20
- This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.5010
23
- - Num Input Tokens Seen: 43600
24
 
25
  ## Model description
26
 
 
17
 
18
  # test
19
 
20
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the wsc dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.4947
23
+ - Num Input Tokens Seen: 43904
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34907668828964233,
4
- "eval_runtime": 0.5932,
5
- "eval_samples_per_second": 94.404,
6
- "eval_steps_per_second": 47.202,
7
  "num_input_tokens_seen": 43904,
8
- "total_flos": 278458437992448.0,
9
- "train_loss": 0.3984213411568638,
10
- "train_runtime": 80.7936,
11
- "train_samples_per_second": 6.164,
12
- "train_steps_per_second": 3.082
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.4947091042995453,
4
+ "eval_runtime": 1.2645,
5
+ "eval_samples_per_second": 44.285,
6
+ "eval_steps_per_second": 22.142,
7
  "num_input_tokens_seen": 43904,
8
+ "total_flos": 256382402519040.0,
9
+ "train_loss": 0.6638821186310795,
10
+ "train_runtime": 70.9038,
11
+ "train_samples_per_second": 7.024,
12
+ "train_steps_per_second": 3.512
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.34907668828964233,
4
- "eval_runtime": 0.5932,
5
- "eval_samples_per_second": 94.404,
6
- "eval_steps_per_second": 47.202,
7
  "num_input_tokens_seen": 43904
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 0.4947091042995453,
4
+ "eval_runtime": 1.2645,
5
+ "eval_samples_per_second": 44.285,
6
+ "eval_steps_per_second": 22.142,
7
  "num_input_tokens_seen": 43904
8
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "num_input_tokens_seen": 43904,
4
- "total_flos": 278458437992448.0,
5
- "train_loss": 0.3984213411568638,
6
- "train_runtime": 80.7936,
7
- "train_samples_per_second": 6.164,
8
- "train_steps_per_second": 3.082
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "num_input_tokens_seen": 43904,
4
+ "total_flos": 256382402519040.0,
5
+ "train_loss": 0.6638821186310795,
6
+ "train_runtime": 70.9038,
7
+ "train_samples_per_second": 7.024,
8
+ "train_steps_per_second": 3.512
9
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 182,
3
- "best_metric": 0.34907668828964233,
4
- "best_model_checkpoint": "saves/test/checkpoint-182",
5
  "epoch": 1.0,
6
  "eval_steps": 13,
7
  "global_step": 249,
@@ -11,564 +11,564 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.020080321285140562,
14
- "grad_norm": 29.701719284057617,
15
  "learning_rate": 8.000000000000001e-06,
16
- "loss": 0.8323,
17
  "num_input_tokens_seen": 832,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.040160642570281124,
22
- "grad_norm": 19.538766860961914,
23
  "learning_rate": 1.8e-05,
24
- "loss": 0.7462,
25
  "num_input_tokens_seen": 1760,
26
  "step": 10
27
  },
28
  {
29
  "epoch": 0.05220883534136546,
30
- "eval_loss": 0.6849029660224915,
31
- "eval_runtime": 0.5644,
32
- "eval_samples_per_second": 99.229,
33
- "eval_steps_per_second": 49.614,
34
  "num_input_tokens_seen": 2288,
35
  "step": 13
36
  },
37
  {
38
  "epoch": 0.060240963855421686,
39
- "grad_norm": 9.36767292022705,
40
  "learning_rate": 2.8000000000000003e-05,
41
- "loss": 0.71,
42
  "num_input_tokens_seen": 2608,
43
  "step": 15
44
  },
45
  {
46
  "epoch": 0.08032128514056225,
47
- "grad_norm": 17.907136917114258,
48
  "learning_rate": 3.8e-05,
49
- "loss": 0.5466,
50
  "num_input_tokens_seen": 3536,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.10040160642570281,
55
- "grad_norm": 11.546435356140137,
56
  "learning_rate": 4.8e-05,
57
- "loss": 0.6639,
58
  "num_input_tokens_seen": 4496,
59
  "step": 25
60
  },
61
  {
62
  "epoch": 0.10441767068273092,
63
- "eval_loss": 0.45566946268081665,
64
- "eval_runtime": 0.5583,
65
- "eval_samples_per_second": 100.311,
66
- "eval_steps_per_second": 50.156,
67
  "num_input_tokens_seen": 4656,
68
  "step": 26
69
  },
70
  {
71
  "epoch": 0.12048192771084337,
72
- "grad_norm": 8.822799682617188,
73
  "learning_rate": 4.996067037544542e-05,
74
- "loss": 0.5107,
75
  "num_input_tokens_seen": 5424,
76
  "step": 30
77
  },
78
  {
79
  "epoch": 0.14056224899598393,
80
- "grad_norm": 7.542176246643066,
81
  "learning_rate": 4.980110583549062e-05,
82
- "loss": 0.3742,
83
  "num_input_tokens_seen": 6304,
84
  "step": 35
85
  },
86
  {
87
  "epoch": 0.1566265060240964,
88
- "eval_loss": 0.3848888874053955,
89
- "eval_runtime": 0.5676,
90
- "eval_samples_per_second": 98.664,
91
- "eval_steps_per_second": 49.332,
92
  "num_input_tokens_seen": 6944,
93
  "step": 39
94
  },
95
  {
96
  "epoch": 0.1606425702811245,
97
- "grad_norm": 1.9545893669128418,
98
  "learning_rate": 4.951963201008076e-05,
99
- "loss": 0.3994,
100
  "num_input_tokens_seen": 7072,
101
  "step": 40
102
  },
103
  {
104
  "epoch": 0.18072289156626506,
105
- "grad_norm": 6.747119903564453,
106
  "learning_rate": 4.91176324775594e-05,
107
- "loss": 0.3929,
108
  "num_input_tokens_seen": 7856,
109
  "step": 45
110
  },
111
  {
112
  "epoch": 0.20080321285140562,
113
- "grad_norm": 9.478217124938965,
114
  "learning_rate": 4.8597083257709194e-05,
115
- "loss": 0.3565,
116
  "num_input_tokens_seen": 8880,
117
  "step": 50
118
  },
119
  {
120
  "epoch": 0.20883534136546184,
121
- "eval_loss": 0.3768366277217865,
122
- "eval_runtime": 0.5874,
123
- "eval_samples_per_second": 95.338,
124
- "eval_steps_per_second": 47.669,
125
  "num_input_tokens_seen": 9232,
126
  "step": 52
127
  },
128
  {
129
  "epoch": 0.22088353413654618,
130
- "grad_norm": 2.1806461811065674,
131
  "learning_rate": 4.796054309867053e-05,
132
- "loss": 0.4015,
133
  "num_input_tokens_seen": 9680,
134
  "step": 55
135
  },
136
  {
137
  "epoch": 0.24096385542168675,
138
- "grad_norm": 7.034917831420898,
139
  "learning_rate": 4.721114089947181e-05,
140
- "loss": 0.3437,
141
  "num_input_tokens_seen": 10576,
142
  "step": 60
143
  },
144
  {
145
  "epoch": 0.26104417670682734,
146
- "grad_norm": 2.6137804985046387,
147
  "learning_rate": 4.6352560329995686e-05,
148
- "loss": 0.3087,
149
  "num_input_tokens_seen": 11424,
150
  "step": 65
151
  },
152
  {
153
  "epoch": 0.26104417670682734,
154
- "eval_loss": 0.3713006377220154,
155
- "eval_runtime": 0.5783,
156
- "eval_samples_per_second": 96.834,
157
- "eval_steps_per_second": 48.417,
158
  "num_input_tokens_seen": 11424,
159
  "step": 65
160
  },
161
  {
162
  "epoch": 0.28112449799196787,
163
- "grad_norm": 2.673372507095337,
164
  "learning_rate": 4.538902172398151e-05,
165
- "loss": 0.3702,
166
  "num_input_tokens_seen": 12224,
167
  "step": 70
168
  },
169
  {
170
  "epoch": 0.30120481927710846,
171
- "grad_norm": 1.5868593454360962,
172
  "learning_rate": 4.4325261334068426e-05,
173
- "loss": 0.3607,
174
  "num_input_tokens_seen": 13168,
175
  "step": 75
176
  },
177
  {
178
  "epoch": 0.3132530120481928,
179
- "eval_loss": 0.3614208996295929,
180
- "eval_runtime": 0.6054,
181
- "eval_samples_per_second": 92.508,
182
- "eval_steps_per_second": 46.254,
183
  "num_input_tokens_seen": 13760,
184
  "step": 78
185
  },
186
  {
187
  "epoch": 0.321285140562249,
188
- "grad_norm": 7.262303829193115,
189
  "learning_rate": 4.316650805085068e-05,
190
- "loss": 0.3766,
191
  "num_input_tokens_seen": 14080,
192
  "step": 80
193
  },
194
  {
195
  "epoch": 0.3413654618473896,
196
- "grad_norm": 6.2033772468566895,
197
  "learning_rate": 4.1918457700381855e-05,
198
- "loss": 0.3639,
199
  "num_input_tokens_seen": 15056,
200
  "step": 85
201
  },
202
  {
203
  "epoch": 0.3614457831325301,
204
- "grad_norm": 6.97931432723999,
205
  "learning_rate": 4.058724504646834e-05,
206
- "loss": 0.3589,
207
  "num_input_tokens_seen": 15904,
208
  "step": 90
209
  },
210
  {
211
  "epoch": 0.3654618473895582,
212
- "eval_loss": 0.36092114448547363,
213
- "eval_runtime": 0.5824,
214
- "eval_samples_per_second": 96.155,
215
- "eval_steps_per_second": 48.078,
216
  "num_input_tokens_seen": 16048,
217
  "step": 91
218
  },
219
  {
220
  "epoch": 0.3815261044176707,
221
- "grad_norm": 6.607943058013916,
222
  "learning_rate": 3.9179413635373897e-05,
223
- "loss": 0.3395,
224
  "num_input_tokens_seen": 16688,
225
  "step": 95
226
  },
227
  {
228
  "epoch": 0.40160642570281124,
229
- "grad_norm": 6.295155048370361,
230
  "learning_rate": 3.770188363116324e-05,
231
- "loss": 0.2898,
232
  "num_input_tokens_seen": 17552,
233
  "step": 100
234
  },
235
  {
236
  "epoch": 0.41767068273092367,
237
- "eval_loss": 0.37226182222366333,
238
- "eval_runtime": 0.5933,
239
- "eval_samples_per_second": 94.389,
240
- "eval_steps_per_second": 47.194,
241
  "num_input_tokens_seen": 18272,
242
  "step": 104
243
  },
244
  {
245
  "epoch": 0.42168674698795183,
246
- "grad_norm": 1.7356857061386108,
247
  "learning_rate": 3.616191779978907e-05,
248
- "loss": 0.3169,
249
  "num_input_tokens_seen": 18400,
250
  "step": 105
251
  },
252
  {
253
  "epoch": 0.44176706827309237,
254
- "grad_norm": 5.197076797485352,
255
  "learning_rate": 3.456708580912725e-05,
256
- "loss": 0.3631,
257
  "num_input_tokens_seen": 19456,
258
  "step": 110
259
  },
260
  {
261
  "epoch": 0.46184738955823296,
262
- "grad_norm": 8.22790241241455,
263
  "learning_rate": 3.292522702044221e-05,
264
- "loss": 0.4246,
265
  "num_input_tokens_seen": 20288,
266
  "step": 115
267
  },
268
  {
269
  "epoch": 0.46987951807228917,
270
- "eval_loss": 0.36986905336380005,
271
- "eval_runtime": 0.5858,
272
- "eval_samples_per_second": 95.59,
273
- "eval_steps_per_second": 47.795,
274
  "num_input_tokens_seen": 20656,
275
  "step": 117
276
  },
277
  {
278
  "epoch": 0.4819277108433735,
279
- "grad_norm": 2.152425527572632,
280
  "learning_rate": 3.1244411954180676e-05,
281
- "loss": 0.3885,
282
  "num_input_tokens_seen": 21328,
283
  "step": 120
284
  },
285
  {
286
  "epoch": 0.5020080321285141,
287
- "grad_norm": 1.8676035404205322,
288
  "learning_rate": 2.9532902619507462e-05,
289
- "loss": 0.3539,
290
  "num_input_tokens_seen": 22304,
291
  "step": 125
292
  },
293
  {
294
  "epoch": 0.5220883534136547,
295
- "grad_norm": 6.877042293548584,
296
  "learning_rate": 2.7799111902582696e-05,
297
- "loss": 0.3657,
298
  "num_input_tokens_seen": 23056,
299
  "step": 130
300
  },
301
  {
302
  "epoch": 0.5220883534136547,
303
- "eval_loss": 0.35231098532676697,
304
- "eval_runtime": 0.579,
305
- "eval_samples_per_second": 96.724,
306
- "eval_steps_per_second": 48.362,
307
  "num_input_tokens_seen": 23056,
308
  "step": 130
309
  },
310
  {
311
  "epoch": 0.5421686746987951,
312
- "grad_norm": 8.028996467590332,
313
  "learning_rate": 2.6051562213206632e-05,
314
- "loss": 0.3499,
315
  "num_input_tokens_seen": 23840,
316
  "step": 135
317
  },
318
  {
319
  "epoch": 0.5622489959839357,
320
- "grad_norm": 2.0267858505249023,
321
  "learning_rate": 2.429884359310328e-05,
322
- "loss": 0.3637,
323
  "num_input_tokens_seen": 24832,
324
  "step": 140
325
  },
326
  {
327
  "epoch": 0.5742971887550201,
328
- "eval_loss": 0.3550644516944885,
329
- "eval_runtime": 0.5754,
330
- "eval_samples_per_second": 97.321,
331
- "eval_steps_per_second": 48.66,
332
  "num_input_tokens_seen": 25312,
333
  "step": 143
334
  },
335
  {
336
  "epoch": 0.5823293172690763,
337
- "grad_norm": 1.7024149894714355,
338
  "learning_rate": 2.2549571491760986e-05,
339
- "loss": 0.3785,
340
  "num_input_tokens_seen": 25648,
341
  "step": 145
342
  },
343
  {
344
  "epoch": 0.6024096385542169,
345
- "grad_norm": 7.411402225494385,
346
  "learning_rate": 2.0812344417381595e-05,
347
- "loss": 0.3394,
348
  "num_input_tokens_seen": 26496,
349
  "step": 150
350
  },
351
  {
352
  "epoch": 0.6224899598393574,
353
- "grad_norm": 6.996516227722168,
354
  "learning_rate": 1.909570167110415e-05,
355
- "loss": 0.3938,
356
  "num_input_tokens_seen": 27392,
357
  "step": 155
358
  },
359
  {
360
  "epoch": 0.6265060240963856,
361
- "eval_loss": 0.3516700565814972,
362
- "eval_runtime": 0.5784,
363
- "eval_samples_per_second": 96.823,
364
- "eval_steps_per_second": 48.411,
365
  "num_input_tokens_seen": 27552,
366
  "step": 156
367
  },
368
  {
369
  "epoch": 0.642570281124498,
370
- "grad_norm": 1.795516848564148,
371
  "learning_rate": 1.7408081372259632e-05,
372
- "loss": 0.3667,
373
  "num_input_tokens_seen": 28272,
374
  "step": 160
375
  },
376
  {
377
  "epoch": 0.6626506024096386,
378
- "grad_norm": 5.605747222900391,
379
  "learning_rate": 1.5757778980982626e-05,
380
- "loss": 0.3198,
381
  "num_input_tokens_seen": 29184,
382
  "step": 165
383
  },
384
  {
385
  "epoch": 0.678714859437751,
386
- "eval_loss": 0.354565292596817,
387
- "eval_runtime": 0.578,
388
- "eval_samples_per_second": 96.89,
389
- "eval_steps_per_second": 48.445,
390
  "num_input_tokens_seen": 29984,
391
  "step": 169
392
  },
393
  {
394
  "epoch": 0.6827309236947792,
395
- "grad_norm": 2.0162057876586914,
396
  "learning_rate": 1.4152906522061048e-05,
397
- "loss": 0.3366,
398
  "num_input_tokens_seen": 30128,
399
  "step": 170
400
  },
401
  {
402
  "epoch": 0.7028112449799196,
403
- "grad_norm": 2.3657188415527344,
404
  "learning_rate": 1.2601352710458313e-05,
405
- "loss": 0.3291,
406
  "num_input_tokens_seen": 30976,
407
  "step": 175
408
  },
409
  {
410
  "epoch": 0.7228915662650602,
411
- "grad_norm": 4.72028923034668,
412
  "learning_rate": 1.1110744174509952e-05,
413
- "loss": 0.369,
414
  "num_input_tokens_seen": 31776,
415
  "step": 180
416
  },
417
  {
418
  "epoch": 0.7309236947791165,
419
- "eval_loss": 0.34907668828964233,
420
- "eval_runtime": 0.5768,
421
- "eval_samples_per_second": 97.087,
422
- "eval_steps_per_second": 48.544,
423
  "num_input_tokens_seen": 32080,
424
  "step": 182
425
  },
426
  {
427
  "epoch": 0.7429718875502008,
428
- "grad_norm": 7.622625827789307,
429
  "learning_rate": 9.688407967401248e-06,
430
- "loss": 0.3852,
431
  "num_input_tokens_seen": 32608,
432
  "step": 185
433
  },
434
  {
435
  "epoch": 0.7630522088353414,
436
- "grad_norm": 6.026548862457275,
437
  "learning_rate": 8.341335551199902e-06,
438
- "loss": 0.4115,
439
  "num_input_tokens_seen": 33360,
440
  "step": 190
441
  },
442
  {
443
  "epoch": 0.7831325301204819,
444
- "grad_norm": 7.148702621459961,
445
  "learning_rate": 7.076148430479321e-06,
446
- "loss": 0.3673,
447
  "num_input_tokens_seen": 34176,
448
  "step": 195
449
  },
450
  {
451
  "epoch": 0.7831325301204819,
452
- "eval_loss": 0.3541497588157654,
453
- "eval_runtime": 0.58,
454
- "eval_samples_per_second": 96.553,
455
- "eval_steps_per_second": 48.277,
456
  "num_input_tokens_seen": 34176,
457
  "step": 195
458
  },
459
  {
460
  "epoch": 0.8032128514056225,
461
- "grad_norm": 1.6944422721862793,
462
  "learning_rate": 5.899065604459814e-06,
463
- "loss": 0.3583,
464
  "num_input_tokens_seen": 34992,
465
  "step": 200
466
  },
467
  {
468
  "epoch": 0.8232931726907631,
469
- "grad_norm": 1.7302725315093994,
470
  "learning_rate": 4.81587299765594e-06,
471
- "loss": 0.3675,
472
  "num_input_tokens_seen": 35888,
473
  "step": 205
474
  },
475
  {
476
  "epoch": 0.8353413654618473,
477
- "eval_loss": 0.3513210713863373,
478
- "eval_runtime": 0.5991,
479
- "eval_samples_per_second": 93.474,
480
- "eval_steps_per_second": 46.737,
481
  "num_input_tokens_seen": 36512,
482
  "step": 208
483
  },
484
  {
485
  "epoch": 0.8433734939759037,
486
- "grad_norm": 1.571621060371399,
487
  "learning_rate": 3.831895019292897e-06,
488
- "loss": 0.3717,
489
  "num_input_tokens_seen": 36848,
490
  "step": 210
491
  },
492
  {
493
  "epoch": 0.8634538152610441,
494
- "grad_norm": 2.1745762825012207,
495
  "learning_rate": 2.9519683912911266e-06,
496
- "loss": 0.3723,
497
  "num_input_tokens_seen": 37888,
498
  "step": 215
499
  },
500
  {
501
  "epoch": 0.8835341365461847,
502
- "grad_norm": 1.900101900100708,
503
  "learning_rate": 2.1804183734670277e-06,
504
- "loss": 0.3634,
505
  "num_input_tokens_seen": 38768,
506
  "step": 220
507
  },
508
  {
509
  "epoch": 0.8875502008032129,
510
- "eval_loss": 0.3546585738658905,
511
- "eval_runtime": 0.6095,
512
- "eval_samples_per_second": 91.885,
513
- "eval_steps_per_second": 45.943,
514
  "num_input_tokens_seen": 38912,
515
  "step": 221
516
  },
517
  {
518
  "epoch": 0.9036144578313253,
519
- "grad_norm": 1.1839439868927002,
520
  "learning_rate": 1.5210375028143097e-06,
521
- "loss": 0.3656,
522
  "num_input_tokens_seen": 39488,
523
  "step": 225
524
  },
525
  {
526
  "epoch": 0.9236947791164659,
527
- "grad_norm": 1.8801380395889282,
528
  "learning_rate": 9.770669513725128e-07,
529
- "loss": 0.3446,
530
  "num_input_tokens_seen": 40336,
531
  "step": 230
532
  },
533
  {
534
  "epoch": 0.9397590361445783,
535
- "eval_loss": 0.35187554359436035,
536
- "eval_runtime": 0.5983,
537
- "eval_samples_per_second": 93.594,
538
- "eval_steps_per_second": 46.797,
539
  "num_input_tokens_seen": 41120,
540
  "step": 234
541
  },
542
  {
543
  "epoch": 0.9437751004016064,
544
- "grad_norm": 1.5287421941757202,
545
  "learning_rate": 5.5118059431781e-07,
546
- "loss": 0.3724,
547
  "num_input_tokens_seen": 41328,
548
  "step": 235
549
  },
550
  {
551
  "epoch": 0.963855421686747,
552
- "grad_norm": 1.8900800943374634,
553
  "learning_rate": 2.454718665888589e-07,
554
- "loss": 0.3493,
555
  "num_input_tokens_seen": 42176,
556
  "step": 240
557
  },
558
  {
559
  "epoch": 0.9839357429718876,
560
- "grad_norm": 1.2923225164413452,
561
  "learning_rate": 6.14434726538493e-08,
562
- "loss": 0.3364,
563
  "num_input_tokens_seen": 43312,
564
  "step": 245
565
  },
566
  {
567
  "epoch": 0.9919678714859438,
568
- "eval_loss": 0.3515866696834564,
569
- "eval_runtime": 0.62,
570
- "eval_samples_per_second": 90.326,
571
- "eval_steps_per_second": 45.163,
572
  "num_input_tokens_seen": 43600,
573
  "step": 247
574
  },
@@ -576,11 +576,11 @@
576
  "epoch": 1.0,
577
  "num_input_tokens_seen": 43904,
578
  "step": 249,
579
- "total_flos": 278458437992448.0,
580
- "train_loss": 0.3984213411568638,
581
- "train_runtime": 80.7936,
582
- "train_samples_per_second": 6.164,
583
- "train_steps_per_second": 3.082
584
  }
585
  ],
586
  "logging_steps": 5,
@@ -600,7 +600,7 @@
600
  "attributes": {}
601
  }
602
  },
603
- "total_flos": 278458437992448.0,
604
  "train_batch_size": 2,
605
  "trial_name": null,
606
  "trial_params": null
 
1
  {
2
+ "best_global_step": 221,
3
+ "best_metric": 0.4947091042995453,
4
+ "best_model_checkpoint": "saves/test/checkpoint-221",
5
  "epoch": 1.0,
6
  "eval_steps": 13,
7
  "global_step": 249,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.020080321285140562,
14
+ "grad_norm": 1.1343824863433838,
15
  "learning_rate": 8.000000000000001e-06,
16
+ "loss": 0.9403,
17
  "num_input_tokens_seen": 832,
18
  "step": 5
19
  },
20
  {
21
  "epoch": 0.040160642570281124,
22
+ "grad_norm": 0.9066250920295715,
23
  "learning_rate": 1.8e-05,
24
+ "loss": 0.9316,
25
  "num_input_tokens_seen": 1760,
26
  "step": 10
27
  },
28
  {
29
  "epoch": 0.05220883534136546,
30
+ "eval_loss": 0.9549338221549988,
31
+ "eval_runtime": 1.0153,
32
+ "eval_samples_per_second": 55.156,
33
+ "eval_steps_per_second": 27.578,
34
  "num_input_tokens_seen": 2288,
35
  "step": 13
36
  },
37
  {
38
  "epoch": 0.060240963855421686,
39
+ "grad_norm": 0.845765233039856,
40
  "learning_rate": 2.8000000000000003e-05,
41
+ "loss": 0.9858,
42
  "num_input_tokens_seen": 2608,
43
  "step": 15
44
  },
45
  {
46
  "epoch": 0.08032128514056225,
47
+ "grad_norm": 1.3779510259628296,
48
  "learning_rate": 3.8e-05,
49
+ "loss": 0.8997,
50
  "num_input_tokens_seen": 3536,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.10040160642570281,
55
+ "grad_norm": 1.0899865627288818,
56
  "learning_rate": 4.8e-05,
57
+ "loss": 1.1199,
58
  "num_input_tokens_seen": 4496,
59
  "step": 25
60
  },
61
  {
62
  "epoch": 0.10441767068273092,
63
+ "eval_loss": 0.8821852803230286,
64
+ "eval_runtime": 0.9542,
65
+ "eval_samples_per_second": 58.685,
66
+ "eval_steps_per_second": 29.342,
67
  "num_input_tokens_seen": 4656,
68
  "step": 26
69
  },
70
  {
71
  "epoch": 0.12048192771084337,
72
+ "grad_norm": 1.0002409219741821,
73
  "learning_rate": 4.996067037544542e-05,
74
+ "loss": 1.0716,
75
  "num_input_tokens_seen": 5424,
76
  "step": 30
77
  },
78
  {
79
  "epoch": 0.14056224899598393,
80
+ "grad_norm": 1.0406622886657715,
81
  "learning_rate": 4.980110583549062e-05,
82
+ "loss": 0.8317,
83
  "num_input_tokens_seen": 6304,
84
  "step": 35
85
  },
86
  {
87
  "epoch": 0.1566265060240964,
88
+ "eval_loss": 0.8176223039627075,
89
+ "eval_runtime": 1.015,
90
+ "eval_samples_per_second": 55.175,
91
+ "eval_steps_per_second": 27.587,
92
  "num_input_tokens_seen": 6944,
93
  "step": 39
94
  },
95
  {
96
  "epoch": 0.1606425702811245,
97
+ "grad_norm": 0.8227869272232056,
98
  "learning_rate": 4.951963201008076e-05,
99
+ "loss": 0.9978,
100
  "num_input_tokens_seen": 7072,
101
  "step": 40
102
  },
103
  {
104
  "epoch": 0.18072289156626506,
105
+ "grad_norm": 0.9921690821647644,
106
  "learning_rate": 4.91176324775594e-05,
107
+ "loss": 0.7925,
108
  "num_input_tokens_seen": 7856,
109
  "step": 45
110
  },
111
  {
112
  "epoch": 0.20080321285140562,
113
+ "grad_norm": 0.5807920098304749,
114
  "learning_rate": 4.8597083257709194e-05,
115
+ "loss": 0.7882,
116
  "num_input_tokens_seen": 8880,
117
  "step": 50
118
  },
119
  {
120
  "epoch": 0.20883534136546184,
121
+ "eval_loss": 0.7668408155441284,
122
+ "eval_runtime": 1.1336,
123
+ "eval_samples_per_second": 49.4,
124
+ "eval_steps_per_second": 24.7,
125
  "num_input_tokens_seen": 9232,
126
  "step": 52
127
  },
128
  {
129
  "epoch": 0.22088353413654618,
130
+ "grad_norm": 0.7548654079437256,
131
  "learning_rate": 4.796054309867053e-05,
132
+ "loss": 0.7209,
133
  "num_input_tokens_seen": 9680,
134
  "step": 55
135
  },
136
  {
137
  "epoch": 0.24096385542168675,
138
+ "grad_norm": 0.8454243540763855,
139
  "learning_rate": 4.721114089947181e-05,
140
+ "loss": 0.8405,
141
  "num_input_tokens_seen": 10576,
142
  "step": 60
143
  },
144
  {
145
  "epoch": 0.26104417670682734,
146
+ "grad_norm": 0.6962847113609314,
147
  "learning_rate": 4.6352560329995686e-05,
148
+ "loss": 0.7909,
149
  "num_input_tokens_seen": 11424,
150
  "step": 65
151
  },
152
  {
153
  "epoch": 0.26104417670682734,
154
+ "eval_loss": 0.6973205208778381,
155
+ "eval_runtime": 1.0994,
156
+ "eval_samples_per_second": 50.939,
157
+ "eval_steps_per_second": 25.469,
158
  "num_input_tokens_seen": 11424,
159
  "step": 65
160
  },
161
  {
162
  "epoch": 0.28112449799196787,
163
+ "grad_norm": 0.88432377576828,
164
  "learning_rate": 4.538902172398151e-05,
165
+ "loss": 0.7974,
166
  "num_input_tokens_seen": 12224,
167
  "step": 70
168
  },
169
  {
170
  "epoch": 0.30120481927710846,
171
+ "grad_norm": 0.6160923838615417,
172
  "learning_rate": 4.4325261334068426e-05,
173
+ "loss": 0.7007,
174
  "num_input_tokens_seen": 13168,
175
  "step": 75
176
  },
177
  {
178
  "epoch": 0.3132530120481928,
179
+ "eval_loss": 0.6643471121788025,
180
+ "eval_runtime": 1.1689,
181
+ "eval_samples_per_second": 47.909,
182
+ "eval_steps_per_second": 23.954,
183
  "num_input_tokens_seen": 13760,
184
  "step": 78
185
  },
186
  {
187
  "epoch": 0.321285140562249,
188
+ "grad_norm": 0.7029849886894226,
189
  "learning_rate": 4.316650805085068e-05,
190
+ "loss": 0.6931,
191
  "num_input_tokens_seen": 14080,
192
  "step": 80
193
  },
194
  {
195
  "epoch": 0.3413654618473896,
196
+ "grad_norm": 0.7781380414962769,
197
  "learning_rate": 4.1918457700381855e-05,
198
+ "loss": 0.6361,
199
  "num_input_tokens_seen": 15056,
200
  "step": 85
201
  },
202
  {
203
  "epoch": 0.3614457831325301,
204
+ "grad_norm": 0.9208192229270935,
205
  "learning_rate": 4.058724504646834e-05,
206
+ "loss": 0.7416,
207
  "num_input_tokens_seen": 15904,
208
  "step": 90
209
  },
210
  {
211
  "epoch": 0.3654618473895582,
212
+ "eval_loss": 0.6244128346443176,
213
+ "eval_runtime": 1.1184,
214
+ "eval_samples_per_second": 50.073,
215
+ "eval_steps_per_second": 25.037,
216
  "num_input_tokens_seen": 16048,
217
  "step": 91
218
  },
219
  {
220
  "epoch": 0.3815261044176707,
221
+ "grad_norm": 0.6942310333251953,
222
  "learning_rate": 3.9179413635373897e-05,
223
+ "loss": 0.7075,
224
  "num_input_tokens_seen": 16688,
225
  "step": 95
226
  },
227
  {
228
  "epoch": 0.40160642570281124,
229
+ "grad_norm": 0.7927543520927429,
230
  "learning_rate": 3.770188363116324e-05,
231
+ "loss": 0.8212,
232
  "num_input_tokens_seen": 17552,
233
  "step": 100
234
  },
235
  {
236
  "epoch": 0.41767068273092367,
237
+ "eval_loss": 0.5989917516708374,
238
+ "eval_runtime": 1.0356,
239
+ "eval_samples_per_second": 54.073,
240
+ "eval_steps_per_second": 27.037,
241
  "num_input_tokens_seen": 18272,
242
  "step": 104
243
  },
244
  {
245
  "epoch": 0.42168674698795183,
246
+ "grad_norm": 0.5144712328910828,
247
  "learning_rate": 3.616191779978907e-05,
248
+ "loss": 0.71,
249
  "num_input_tokens_seen": 18400,
250
  "step": 105
251
  },
252
  {
253
  "epoch": 0.44176706827309237,
254
+ "grad_norm": 0.5579636693000793,
255
  "learning_rate": 3.456708580912725e-05,
256
+ "loss": 0.5662,
257
  "num_input_tokens_seen": 19456,
258
  "step": 110
259
  },
260
  {
261
  "epoch": 0.46184738955823296,
262
+ "grad_norm": 0.6035380959510803,
263
  "learning_rate": 3.292522702044221e-05,
264
+ "loss": 0.4927,
265
  "num_input_tokens_seen": 20288,
266
  "step": 115
267
  },
268
  {
269
  "epoch": 0.46987951807228917,
270
+ "eval_loss": 0.5652104020118713,
271
+ "eval_runtime": 1.0011,
272
+ "eval_samples_per_second": 55.939,
273
+ "eval_steps_per_second": 27.97,
274
  "num_input_tokens_seen": 20656,
275
  "step": 117
276
  },
277
  {
278
  "epoch": 0.4819277108433735,
279
+ "grad_norm": 0.4272744059562683,
280
  "learning_rate": 3.1244411954180676e-05,
281
+ "loss": 0.5256,
282
  "num_input_tokens_seen": 21328,
283
  "step": 120
284
  },
285
  {
286
  "epoch": 0.5020080321285141,
287
+ "grad_norm": 0.5280653238296509,
288
  "learning_rate": 2.9532902619507462e-05,
289
+ "loss": 0.4829,
290
  "num_input_tokens_seen": 22304,
291
  "step": 125
292
  },
293
  {
294
  "epoch": 0.5220883534136547,
295
+ "grad_norm": 0.6218326687812805,
296
  "learning_rate": 2.7799111902582696e-05,
297
+ "loss": 0.5708,
298
  "num_input_tokens_seen": 23056,
299
  "step": 130
300
  },
301
  {
302
  "epoch": 0.5220883534136547,
303
+ "eval_loss": 0.5374971032142639,
304
+ "eval_runtime": 0.9697,
305
+ "eval_samples_per_second": 57.749,
306
+ "eval_steps_per_second": 28.875,
307
  "num_input_tokens_seen": 23056,
308
  "step": 130
309
  },
310
  {
311
  "epoch": 0.5421686746987951,
312
+ "grad_norm": 0.435881644487381,
313
  "learning_rate": 2.6051562213206632e-05,
314
+ "loss": 0.4772,
315
  "num_input_tokens_seen": 23840,
316
  "step": 135
317
  },
318
  {
319
  "epoch": 0.5622489959839357,
320
+ "grad_norm": 0.49277955293655396,
321
  "learning_rate": 2.429884359310328e-05,
322
+ "loss": 0.4855,
323
  "num_input_tokens_seen": 24832,
324
  "step": 140
325
  },
326
  {
327
  "epoch": 0.5742971887550201,
328
+ "eval_loss": 0.5332380533218384,
329
+ "eval_runtime": 1.0793,
330
+ "eval_samples_per_second": 51.887,
331
+ "eval_steps_per_second": 25.943,
332
  "num_input_tokens_seen": 25312,
333
  "step": 143
334
  },
335
  {
336
  "epoch": 0.5823293172690763,
337
+ "grad_norm": 0.45027071237564087,
338
  "learning_rate": 2.2549571491760986e-05,
339
+ "loss": 0.5654,
340
  "num_input_tokens_seen": 25648,
341
  "step": 145
342
  },
343
  {
344
  "epoch": 0.6024096385542169,
345
+ "grad_norm": 0.6077672243118286,
346
  "learning_rate": 2.0812344417381595e-05,
347
+ "loss": 0.5382,
348
  "num_input_tokens_seen": 26496,
349
  "step": 150
350
  },
351
  {
352
  "epoch": 0.6224899598393574,
353
+ "grad_norm": 0.26088064908981323,
354
  "learning_rate": 1.909570167110415e-05,
355
+ "loss": 0.5239,
356
  "num_input_tokens_seen": 27392,
357
  "step": 155
358
  },
359
  {
360
  "epoch": 0.6265060240963856,
361
+ "eval_loss": 0.5172758102416992,
362
+ "eval_runtime": 1.6299,
363
+ "eval_samples_per_second": 34.358,
364
+ "eval_steps_per_second": 17.179,
365
  "num_input_tokens_seen": 27552,
366
  "step": 156
367
  },
368
  {
369
  "epoch": 0.642570281124498,
370
+ "grad_norm": 0.32266005873680115,
371
  "learning_rate": 1.7408081372259632e-05,
372
+ "loss": 0.52,
373
  "num_input_tokens_seen": 28272,
374
  "step": 160
375
  },
376
  {
377
  "epoch": 0.6626506024096386,
378
+ "grad_norm": 0.33992618322372437,
379
  "learning_rate": 1.5757778980982626e-05,
380
+ "loss": 0.4772,
381
  "num_input_tokens_seen": 29184,
382
  "step": 165
383
  },
384
  {
385
  "epoch": 0.678714859437751,
386
+ "eval_loss": 0.5134099721908569,
387
+ "eval_runtime": 0.9826,
388
+ "eval_samples_per_second": 56.991,
389
+ "eval_steps_per_second": 28.496,
390
  "num_input_tokens_seen": 29984,
391
  "step": 169
392
  },
393
  {
394
  "epoch": 0.6827309236947792,
395
+ "grad_norm": 0.40361297130584717,
396
  "learning_rate": 1.4152906522061048e-05,
397
+ "loss": 0.4832,
398
  "num_input_tokens_seen": 30128,
399
  "step": 170
400
  },
401
  {
402
  "epoch": 0.7028112449799196,
403
+ "grad_norm": 0.35581162571907043,
404
  "learning_rate": 1.2601352710458313e-05,
405
+ "loss": 0.494,
406
  "num_input_tokens_seen": 30976,
407
  "step": 175
408
  },
409
  {
410
  "epoch": 0.7228915662650602,
411
+ "grad_norm": 0.30878543853759766,
412
  "learning_rate": 1.1110744174509952e-05,
413
+ "loss": 0.4958,
414
  "num_input_tokens_seen": 31776,
415
  "step": 180
416
  },
417
  {
418
  "epoch": 0.7309236947791165,
419
+ "eval_loss": 0.5050697922706604,
420
+ "eval_runtime": 1.0024,
421
+ "eval_samples_per_second": 55.865,
422
+ "eval_steps_per_second": 27.932,
423
  "num_input_tokens_seen": 32080,
424
  "step": 182
425
  },
426
  {
427
  "epoch": 0.7429718875502008,
428
+ "grad_norm": 0.6177117228507996,
429
  "learning_rate": 9.688407967401248e-06,
430
+ "loss": 0.6199,
431
  "num_input_tokens_seen": 32608,
432
  "step": 185
433
  },
434
  {
435
  "epoch": 0.7630522088353414,
436
+ "grad_norm": 0.39982450008392334,
437
  "learning_rate": 8.341335551199902e-06,
438
+ "loss": 0.6049,
439
  "num_input_tokens_seen": 33360,
440
  "step": 190
441
  },
442
  {
443
  "epoch": 0.7831325301204819,
444
+ "grad_norm": 0.559433102607727,
445
  "learning_rate": 7.076148430479321e-06,
446
+ "loss": 0.6547,
447
  "num_input_tokens_seen": 34176,
448
  "step": 195
449
  },
450
  {
451
  "epoch": 0.7831325301204819,
452
+ "eval_loss": 0.5061560273170471,
453
+ "eval_runtime": 0.9678,
454
+ "eval_samples_per_second": 57.864,
455
+ "eval_steps_per_second": 28.932,
456
  "num_input_tokens_seen": 34176,
457
  "step": 195
458
  },
459
  {
460
  "epoch": 0.8032128514056225,
461
+ "grad_norm": 0.3360762298107147,
462
  "learning_rate": 5.899065604459814e-06,
463
+ "loss": 0.4622,
464
  "num_input_tokens_seen": 34992,
465
  "step": 200
466
  },
467
  {
468
  "epoch": 0.8232931726907631,
469
+ "grad_norm": 0.3794894516468048,
470
  "learning_rate": 4.81587299765594e-06,
471
+ "loss": 0.6246,
472
  "num_input_tokens_seen": 35888,
473
  "step": 205
474
  },
475
  {
476
  "epoch": 0.8353413654618473,
477
+ "eval_loss": 0.5011698007583618,
478
+ "eval_runtime": 1.0745,
479
+ "eval_samples_per_second": 52.116,
480
+ "eval_steps_per_second": 26.058,
481
  "num_input_tokens_seen": 36512,
482
  "step": 208
483
  },
484
  {
485
  "epoch": 0.8433734939759037,
486
+ "grad_norm": 0.2998324930667877,
487
  "learning_rate": 3.831895019292897e-06,
488
+ "loss": 0.553,
489
  "num_input_tokens_seen": 36848,
490
  "step": 210
491
  },
492
  {
493
  "epoch": 0.8634538152610441,
494
+ "grad_norm": 0.384755939245224,
495
  "learning_rate": 2.9519683912911266e-06,
496
+ "loss": 0.5892,
497
  "num_input_tokens_seen": 37888,
498
  "step": 215
499
  },
500
  {
501
  "epoch": 0.8835341365461847,
502
+ "grad_norm": 0.2608492970466614,
503
  "learning_rate": 2.1804183734670277e-06,
504
+ "loss": 0.5174,
505
  "num_input_tokens_seen": 38768,
506
  "step": 220
507
  },
508
  {
509
  "epoch": 0.8875502008032129,
510
+ "eval_loss": 0.4947091042995453,
511
+ "eval_runtime": 1.0001,
512
+ "eval_samples_per_second": 55.993,
513
+ "eval_steps_per_second": 27.996,
514
  "num_input_tokens_seen": 38912,
515
  "step": 221
516
  },
517
  {
518
  "epoch": 0.9036144578313253,
519
+ "grad_norm": 0.3358154296875,
520
  "learning_rate": 1.5210375028143097e-06,
521
+ "loss": 0.6159,
522
  "num_input_tokens_seen": 39488,
523
  "step": 225
524
  },
525
  {
526
  "epoch": 0.9236947791164659,
527
+ "grad_norm": 0.34916290640830994,
528
  "learning_rate": 9.770669513725128e-07,
529
+ "loss": 0.5318,
530
  "num_input_tokens_seen": 40336,
531
  "step": 230
532
  },
533
  {
534
  "epoch": 0.9397590361445783,
535
+ "eval_loss": 0.49773862957954407,
536
+ "eval_runtime": 0.9441,
537
+ "eval_samples_per_second": 59.319,
538
+ "eval_steps_per_second": 29.659,
539
  "num_input_tokens_seen": 41120,
540
  "step": 234
541
  },
542
  {
543
  "epoch": 0.9437751004016064,
544
+ "grad_norm": 0.3469204604625702,
545
  "learning_rate": 5.5118059431781e-07,
546
+ "loss": 0.6135,
547
  "num_input_tokens_seen": 41328,
548
  "step": 235
549
  },
550
  {
551
  "epoch": 0.963855421686747,
552
+ "grad_norm": 0.28425994515419006,
553
  "learning_rate": 2.454718665888589e-07,
554
+ "loss": 0.6099,
555
  "num_input_tokens_seen": 42176,
556
  "step": 240
557
  },
558
  {
559
  "epoch": 0.9839357429718876,
560
+ "grad_norm": 0.3837321698665619,
561
  "learning_rate": 6.14434726538493e-08,
562
+ "loss": 0.445,
563
  "num_input_tokens_seen": 43312,
564
  "step": 245
565
  },
566
  {
567
  "epoch": 0.9919678714859438,
568
+ "eval_loss": 0.5010460019111633,
569
+ "eval_runtime": 1.0639,
570
+ "eval_samples_per_second": 52.635,
571
+ "eval_steps_per_second": 26.317,
572
  "num_input_tokens_seen": 43600,
573
  "step": 247
574
  },
 
576
  "epoch": 1.0,
577
  "num_input_tokens_seen": 43904,
578
  "step": 249,
579
+ "total_flos": 256382402519040.0,
580
+ "train_loss": 0.6638821186310795,
581
+ "train_runtime": 70.9038,
582
+ "train_samples_per_second": 7.024,
583
+ "train_steps_per_second": 3.512
584
  }
585
  ],
586
  "logging_steps": 5,
 
600
  "attributes": {}
601
  }
602
  },
603
+ "total_flos": 256382402519040.0,
604
  "train_batch_size": 2,
605
  "trial_name": null,
606
  "trial_params": null
training_eval_loss.png CHANGED
training_loss.png CHANGED