sedrickkeh commited on
Commit
3d61891
·
verified ·
1 Parent(s): 72e286f

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_original_wo_airoboros
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_original_wo_airoboros
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6065
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_original_wo_airoboros
 
16
 
17
  # OH_original_wo_airoboros
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_original_wo_airoboros dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6065
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.993050193050193,
3
- "eval_loss": 0.611685574054718,
4
- "eval_runtime": 175.2261,
5
- "eval_samples_per_second": 49.764,
6
- "eval_steps_per_second": 0.394,
7
  "total_flos": 1622692331520000.0,
8
- "train_loss": 0.5660572312810719,
9
- "train_runtime": 29124.935,
10
- "train_samples_per_second": 17.065,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.993050193050193,
3
+ "eval_loss": 0.6065478920936584,
4
+ "eval_runtime": 174.4122,
5
+ "eval_samples_per_second": 49.997,
6
+ "eval_steps_per_second": 0.396,
7
  "total_flos": 1622692331520000.0,
8
+ "train_loss": 0.5687421354839061,
9
+ "train_runtime": 29217.0698,
10
+ "train_samples_per_second": 17.012,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.993050193050193,
3
- "eval_loss": 0.611685574054718,
4
- "eval_runtime": 175.2261,
5
- "eval_samples_per_second": 49.764,
6
- "eval_steps_per_second": 0.394
7
  }
 
1
  {
2
  "epoch": 2.993050193050193,
3
+ "eval_loss": 0.6065478920936584,
4
+ "eval_runtime": 174.4122,
5
+ "eval_samples_per_second": 49.997,
6
+ "eval_steps_per_second": 0.396
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.993050193050193,
3
  "total_flos": 1622692331520000.0,
4
- "train_loss": 0.5660572312810719,
5
- "train_runtime": 29124.935,
6
- "train_samples_per_second": 17.065,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.993050193050193,
3
  "total_flos": 1622692331520000.0,
4
+ "train_loss": 0.5687421354839061,
5
+ "train_runtime": 29217.0698,
6
+ "train_samples_per_second": 17.012,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,697 +10,697 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03088803088803089,
13
- "grad_norm": 109.2129922948738,
14
  "learning_rate": 5e-06,
15
- "loss": 0.8218,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06177606177606178,
20
- "grad_norm": 1.5509789288186628,
21
  "learning_rate": 5e-06,
22
- "loss": 0.7505,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.09266409266409266,
27
- "grad_norm": 1.0237535334293637,
28
  "learning_rate": 5e-06,
29
- "loss": 0.7022,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.12355212355212356,
34
- "grad_norm": 0.9555198347715455,
35
  "learning_rate": 5e-06,
36
- "loss": 0.6862,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.15444015444015444,
41
- "grad_norm": 0.9962768534382307,
42
  "learning_rate": 5e-06,
43
- "loss": 0.6721,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.18532818532818532,
48
- "grad_norm": 1.7530907194148484,
49
  "learning_rate": 5e-06,
50
- "loss": 0.6509,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.21621621621621623,
55
- "grad_norm": 0.7959383217983834,
56
  "learning_rate": 5e-06,
57
- "loss": 0.6472,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2471042471042471,
62
- "grad_norm": 0.7944327039168986,
63
  "learning_rate": 5e-06,
64
- "loss": 0.65,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.277992277992278,
69
- "grad_norm": 0.6922868471685667,
70
  "learning_rate": 5e-06,
71
- "loss": 0.639,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.3088803088803089,
76
- "grad_norm": 0.6117044147831933,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6398,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.33976833976833976,
83
- "grad_norm": 0.6349747322397297,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6335,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.37065637065637064,
90
- "grad_norm": 0.6236670890266053,
91
  "learning_rate": 5e-06,
92
- "loss": 0.6337,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.4015444015444015,
97
- "grad_norm": 0.6724570938621872,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6255,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.43243243243243246,
104
- "grad_norm": 0.8157933757633341,
105
  "learning_rate": 5e-06,
106
- "loss": 0.6283,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.46332046332046334,
111
- "grad_norm": 0.5839495663310666,
112
  "learning_rate": 5e-06,
113
- "loss": 0.6214,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4942084942084942,
118
- "grad_norm": 0.5048909710838132,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6243,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.525096525096525,
125
- "grad_norm": 0.8095059876811945,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6251,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.555984555984556,
132
- "grad_norm": 0.5136815801780633,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6145,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5868725868725869,
139
- "grad_norm": 0.6364035663779027,
140
  "learning_rate": 5e-06,
141
- "loss": 0.6298,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.6177606177606177,
146
- "grad_norm": 0.6844817393850995,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6197,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6486486486486487,
153
- "grad_norm": 0.7483950060039181,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6269,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6795366795366795,
160
- "grad_norm": 0.7350511698635498,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6175,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.7104247104247104,
167
- "grad_norm": 0.6246522556489467,
168
  "learning_rate": 5e-06,
169
- "loss": 0.6172,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.7413127413127413,
174
- "grad_norm": 0.5845884993818068,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6119,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7722007722007722,
181
- "grad_norm": 0.6246906999878916,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6191,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.803088803088803,
188
- "grad_norm": 0.5725768617907299,
189
  "learning_rate": 5e-06,
190
- "loss": 0.6111,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.833976833976834,
195
- "grad_norm": 0.7575125169415168,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6078,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8648648648648649,
202
- "grad_norm": 0.6292537869493213,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6119,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8957528957528957,
209
- "grad_norm": 0.6613666650568315,
210
  "learning_rate": 5e-06,
211
- "loss": 0.6219,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.9266409266409267,
216
- "grad_norm": 0.482392770012809,
217
  "learning_rate": 5e-06,
218
- "loss": 0.6032,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.9575289575289575,
223
- "grad_norm": 0.5691595473530817,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6137,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9884169884169884,
230
- "grad_norm": 0.5605696956503327,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6059,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9976833976833976,
237
- "eval_loss": 0.611504316329956,
238
- "eval_runtime": 174.6046,
239
- "eval_samples_per_second": 49.941,
240
- "eval_steps_per_second": 0.395,
241
  "step": 323
242
  },
243
  {
244
  "epoch": 1.0193050193050193,
245
- "grad_norm": 0.7653552774499555,
246
  "learning_rate": 5e-06,
247
- "loss": 0.577,
248
  "step": 330
249
  },
250
  {
251
  "epoch": 1.05019305019305,
252
- "grad_norm": 0.6717959962039358,
253
  "learning_rate": 5e-06,
254
- "loss": 0.5516,
255
  "step": 340
256
  },
257
  {
258
  "epoch": 1.0810810810810811,
259
- "grad_norm": 0.7810736772266839,
260
  "learning_rate": 5e-06,
261
- "loss": 0.5527,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.111969111969112,
266
- "grad_norm": 0.5222987434687074,
267
  "learning_rate": 5e-06,
268
- "loss": 0.5672,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.1428571428571428,
273
- "grad_norm": 0.6086528109996991,
274
  "learning_rate": 5e-06,
275
- "loss": 0.559,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.1737451737451738,
280
- "grad_norm": 0.5603581613184462,
281
  "learning_rate": 5e-06,
282
- "loss": 0.5557,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.2046332046332047,
287
- "grad_norm": 0.5733679682136087,
288
  "learning_rate": 5e-06,
289
- "loss": 0.5523,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.2355212355212355,
294
- "grad_norm": 0.5892263714928079,
295
  "learning_rate": 5e-06,
296
- "loss": 0.5581,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.2664092664092665,
301
- "grad_norm": 0.5383912423961117,
302
  "learning_rate": 5e-06,
303
- "loss": 0.5587,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2972972972972974,
308
- "grad_norm": 0.5266690487444351,
309
  "learning_rate": 5e-06,
310
- "loss": 0.5538,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.3281853281853282,
315
- "grad_norm": 0.6329004292969694,
316
  "learning_rate": 5e-06,
317
- "loss": 0.5621,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.359073359073359,
322
- "grad_norm": 0.7361457930766753,
323
  "learning_rate": 5e-06,
324
- "loss": 0.5507,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.3899613899613898,
329
- "grad_norm": 0.5514849953088692,
330
  "learning_rate": 5e-06,
331
- "loss": 0.5458,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.420849420849421,
336
- "grad_norm": 0.6157873717956057,
337
  "learning_rate": 5e-06,
338
- "loss": 0.5576,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.4517374517374517,
343
- "grad_norm": 0.6133558219166387,
344
  "learning_rate": 5e-06,
345
- "loss": 0.5549,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.4826254826254825,
350
- "grad_norm": 0.8739126560777261,
351
  "learning_rate": 5e-06,
352
- "loss": 0.5544,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.5135135135135136,
357
- "grad_norm": 0.6361501246308614,
358
  "learning_rate": 5e-06,
359
- "loss": 0.5593,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.5444015444015444,
364
- "grad_norm": 0.5962865704485671,
365
  "learning_rate": 5e-06,
366
- "loss": 0.5536,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.5752895752895753,
371
- "grad_norm": 0.5717477466618819,
372
  "learning_rate": 5e-06,
373
- "loss": 0.5613,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.606177606177606,
378
- "grad_norm": 0.66303577521287,
379
  "learning_rate": 5e-06,
380
- "loss": 0.5515,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.637065637065637,
385
- "grad_norm": 0.567568871578932,
386
  "learning_rate": 5e-06,
387
- "loss": 0.552,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.667953667953668,
392
- "grad_norm": 0.6260947334671597,
393
  "learning_rate": 5e-06,
394
- "loss": 0.5523,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.698841698841699,
399
- "grad_norm": 0.5443908156986677,
400
  "learning_rate": 5e-06,
401
- "loss": 0.5597,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.7297297297297298,
406
- "grad_norm": 0.5875127617844319,
407
  "learning_rate": 5e-06,
408
- "loss": 0.5577,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.7606177606177607,
413
- "grad_norm": 0.5839768971343271,
414
  "learning_rate": 5e-06,
415
- "loss": 0.5556,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.7915057915057915,
420
- "grad_norm": 0.5415996097268374,
421
  "learning_rate": 5e-06,
422
- "loss": 0.5541,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.8223938223938223,
427
- "grad_norm": 0.5047758992113026,
428
  "learning_rate": 5e-06,
429
- "loss": 0.5498,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.8532818532818531,
434
- "grad_norm": 0.6095158836516334,
435
  "learning_rate": 5e-06,
436
- "loss": 0.5577,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.8841698841698842,
441
- "grad_norm": 0.6179499758866378,
442
  "learning_rate": 5e-06,
443
- "loss": 0.5561,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.915057915057915,
448
- "grad_norm": 0.6168326581011625,
449
  "learning_rate": 5e-06,
450
- "loss": 0.5623,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.945945945945946,
455
- "grad_norm": 0.5503294354981652,
456
  "learning_rate": 5e-06,
457
- "loss": 0.5513,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.9768339768339769,
462
- "grad_norm": 0.5514628691685661,
463
  "learning_rate": 5e-06,
464
- "loss": 0.5543,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.9984555984555985,
469
- "eval_loss": 0.6036180853843689,
470
- "eval_runtime": 175.7287,
471
- "eval_samples_per_second": 49.622,
472
  "eval_steps_per_second": 0.393,
473
  "step": 647
474
  },
475
  {
476
  "epoch": 2.0077220077220077,
477
- "grad_norm": 1.1966714312564959,
478
  "learning_rate": 5e-06,
479
- "loss": 0.5432,
480
  "step": 650
481
  },
482
  {
483
  "epoch": 2.0386100386100385,
484
- "grad_norm": 0.6689403789809841,
485
  "learning_rate": 5e-06,
486
- "loss": 0.5094,
487
  "step": 660
488
  },
489
  {
490
  "epoch": 2.0694980694980694,
491
- "grad_norm": 0.61180285844113,
492
  "learning_rate": 5e-06,
493
- "loss": 0.5053,
494
  "step": 670
495
  },
496
  {
497
  "epoch": 2.1003861003861,
498
- "grad_norm": 0.6066089470745547,
499
  "learning_rate": 5e-06,
500
- "loss": 0.5038,
501
  "step": 680
502
  },
503
  {
504
  "epoch": 2.1312741312741315,
505
- "grad_norm": 0.7339359470891843,
506
  "learning_rate": 5e-06,
507
- "loss": 0.4866,
508
  "step": 690
509
  },
510
  {
511
  "epoch": 2.1621621621621623,
512
- "grad_norm": 0.6280327917691826,
513
  "learning_rate": 5e-06,
514
- "loss": 0.4925,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.193050193050193,
519
- "grad_norm": 0.6228681526915294,
520
  "learning_rate": 5e-06,
521
- "loss": 0.4964,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.223938223938224,
526
- "grad_norm": 0.7409827598226171,
527
  "learning_rate": 5e-06,
528
- "loss": 0.4991,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.2548262548262548,
533
- "grad_norm": 0.5789864024622833,
534
  "learning_rate": 5e-06,
535
- "loss": 0.5047,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.2857142857142856,
540
- "grad_norm": 0.6184054511523953,
541
  "learning_rate": 5e-06,
542
- "loss": 0.4983,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.3166023166023164,
547
- "grad_norm": 0.7218033338604144,
548
  "learning_rate": 5e-06,
549
- "loss": 0.5006,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.3474903474903477,
554
- "grad_norm": 0.5488090893995611,
555
  "learning_rate": 5e-06,
556
- "loss": 0.4982,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.3783783783783785,
561
- "grad_norm": 0.7369531886408642,
562
  "learning_rate": 5e-06,
563
- "loss": 0.4967,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.4092664092664093,
568
- "grad_norm": 0.5519469047632208,
569
  "learning_rate": 5e-06,
570
- "loss": 0.4965,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.44015444015444,
575
- "grad_norm": 0.6584775748338472,
576
  "learning_rate": 5e-06,
577
- "loss": 0.501,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.471042471042471,
582
- "grad_norm": 0.6589409942831427,
583
  "learning_rate": 5e-06,
584
- "loss": 0.5006,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.501930501930502,
589
- "grad_norm": 0.6035656966090169,
590
  "learning_rate": 5e-06,
591
- "loss": 0.504,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.532818532818533,
596
- "grad_norm": 0.5939046793612662,
597
  "learning_rate": 5e-06,
598
- "loss": 0.4962,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.563706563706564,
603
- "grad_norm": 0.6332349300931522,
604
  "learning_rate": 5e-06,
605
- "loss": 0.5184,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.5945945945945947,
610
- "grad_norm": 0.5980197287456133,
611
  "learning_rate": 5e-06,
612
- "loss": 0.4989,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.6254826254826256,
617
- "grad_norm": 0.5239673559847541,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5013,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.6563706563706564,
624
- "grad_norm": 0.6161550811061723,
625
  "learning_rate": 5e-06,
626
- "loss": 0.4979,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.687258687258687,
631
- "grad_norm": 0.5658973017035395,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5069,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.718146718146718,
638
- "grad_norm": 0.5709137558646814,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5065,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.749034749034749,
645
- "grad_norm": 0.6389113703934911,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5037,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.7799227799227797,
652
- "grad_norm": 0.6445028760804165,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5064,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.810810810810811,
659
- "grad_norm": 0.6013274832800054,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5032,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.841698841698842,
666
- "grad_norm": 0.5664869441381479,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5068,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.8725868725868726,
673
- "grad_norm": 0.6735218011480151,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5076,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.9034749034749034,
680
- "grad_norm": 1.6394857976813213,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5032,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.9343629343629343,
687
- "grad_norm": 0.5576873331550094,
688
  "learning_rate": 5e-06,
689
- "loss": 0.504,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.965250965250965,
694
- "grad_norm": 0.5637202117054834,
695
  "learning_rate": 5e-06,
696
- "loss": 0.5012,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.993050193050193,
701
- "eval_loss": 0.611685574054718,
702
- "eval_runtime": 175.5767,
703
- "eval_samples_per_second": 49.665,
704
  "eval_steps_per_second": 0.393,
705
  "step": 969
706
  },
@@ -708,9 +708,9 @@
708
  "epoch": 2.993050193050193,
709
  "step": 969,
710
  "total_flos": 1622692331520000.0,
711
- "train_loss": 0.5660572312810719,
712
- "train_runtime": 29124.935,
713
- "train_samples_per_second": 17.065,
714
  "train_steps_per_second": 0.033
715
  }
716
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03088803088803089,
13
+ "grad_norm": 1.9920737405210878,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.8015,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06177606177606178,
20
+ "grad_norm": 2.6204521466523314,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7199,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.09266409266409266,
27
+ "grad_norm": 1.3723889792780992,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.6903,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.12355212355212356,
34
+ "grad_norm": 0.8864238330359895,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.6793,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.15444015444015444,
41
+ "grad_norm": 13.397151069895909,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.6675,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.18532818532818532,
48
+ "grad_norm": 2.1374873857216876,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.6488,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.21621621621621623,
55
+ "grad_norm": 0.7775938162830551,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.6457,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2471042471042471,
62
+ "grad_norm": 0.8956285245200575,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6479,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.277992277992278,
69
+ "grad_norm": 0.7136145785303137,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6365,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.3088803088803089,
76
+ "grad_norm": 0.5386743245333256,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.6366,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.33976833976833976,
83
+ "grad_norm": 0.5331290951707172,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6305,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.37065637065637064,
90
+ "grad_norm": 0.5589168290698243,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.6305,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.4015444015444015,
97
+ "grad_norm": 0.6427317993478433,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.6221,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.43243243243243246,
104
+ "grad_norm": 0.5345254165110678,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.6251,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.46332046332046334,
111
+ "grad_norm": 0.5523122802530471,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6183,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4942084942084942,
118
+ "grad_norm": 0.6345828990779662,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6211,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.525096525096525,
125
+ "grad_norm": 0.5635690118220865,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6222,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.555984555984556,
132
+ "grad_norm": 0.5674431409857721,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6121,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5868725868725869,
139
+ "grad_norm": 0.9846408481503562,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.627,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.6177606177606177,
146
+ "grad_norm": 0.5946873633912808,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6171,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.6486486486486487,
153
+ "grad_norm": 1.1118602099699486,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6244,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.6795366795366795,
160
+ "grad_norm": 0.6560189254340667,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.615,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.7104247104247104,
167
+ "grad_norm": 1.108952229503608,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6147,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.7413127413127413,
174
+ "grad_norm": 0.6239111214970273,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6097,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.7722007722007722,
181
+ "grad_norm": 1.0310044782453138,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6167,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.803088803088803,
188
+ "grad_norm": 0.5083008941812818,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6088,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.833976833976834,
195
+ "grad_norm": 0.5393819554919369,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6054,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.8648648648648649,
202
+ "grad_norm": 0.501525790082182,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6095,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.8957528957528957,
209
+ "grad_norm": 0.5189867584989447,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6199,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.9266409266409267,
216
+ "grad_norm": 0.4557994704814137,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6008,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.9575289575289575,
223
+ "grad_norm": 0.5100750571195128,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6118,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9884169884169884,
230
+ "grad_norm": 0.5597550458388434,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.604,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.9976833976833976,
237
+ "eval_loss": 0.6096363663673401,
238
+ "eval_runtime": 174.3952,
239
+ "eval_samples_per_second": 50.001,
240
+ "eval_steps_per_second": 0.396,
241
  "step": 323
242
  },
243
  {
244
  "epoch": 1.0193050193050193,
245
+ "grad_norm": 1.0059049758489014,
246
  "learning_rate": 5e-06,
247
+ "loss": 0.5788,
248
  "step": 330
249
  },
250
  {
251
  "epoch": 1.05019305019305,
252
+ "grad_norm": 0.8266294447843203,
253
  "learning_rate": 5e-06,
254
+ "loss": 0.5555,
255
  "step": 340
256
  },
257
  {
258
  "epoch": 1.0810810810810811,
259
+ "grad_norm": 0.6817740146761346,
260
  "learning_rate": 5e-06,
261
+ "loss": 0.5568,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.111969111969112,
266
+ "grad_norm": 0.5212640526638012,
267
  "learning_rate": 5e-06,
268
+ "loss": 0.5711,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.1428571428571428,
273
+ "grad_norm": 0.6082293357332792,
274
  "learning_rate": 5e-06,
275
+ "loss": 0.5631,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.1737451737451738,
280
+ "grad_norm": 0.4984378073822691,
281
  "learning_rate": 5e-06,
282
+ "loss": 0.5597,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.2046332046332047,
287
+ "grad_norm": 0.5400232508156533,
288
  "learning_rate": 5e-06,
289
+ "loss": 0.5563,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.2355212355212355,
294
+ "grad_norm": 0.4863797369607136,
295
  "learning_rate": 5e-06,
296
+ "loss": 0.563,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.2664092664092665,
301
+ "grad_norm": 0.5275923409775821,
302
  "learning_rate": 5e-06,
303
+ "loss": 0.5625,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.2972972972972974,
308
+ "grad_norm": 0.527704404092815,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.5577,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.3281853281853282,
315
+ "grad_norm": 0.6166415284725348,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.5658,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.359073359073359,
322
+ "grad_norm": 0.6544026128628749,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.5544,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.3899613899613898,
329
+ "grad_norm": 0.5151291141077943,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.5497,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.420849420849421,
336
+ "grad_norm": 0.5534465037275664,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.5614,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.4517374517374517,
343
+ "grad_norm": 0.5300403375821853,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.5583,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.4826254826254825,
350
+ "grad_norm": 0.7438189920291365,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.558,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.5135135135135136,
357
+ "grad_norm": 0.531280818371624,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.5629,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.5444015444015444,
364
+ "grad_norm": 0.5680517069700968,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.557,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.5752895752895753,
371
+ "grad_norm": 0.5612219075386209,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.5653,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.606177606177606,
378
+ "grad_norm": 0.649010796286653,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.5553,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.637065637065637,
385
+ "grad_norm": 0.5724021305893356,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.5554,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.667953667953668,
392
+ "grad_norm": 0.5266418342750984,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.556,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.698841698841699,
399
+ "grad_norm": 0.4764947467562162,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.5637,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.7297297297297298,
406
+ "grad_norm": 0.49367950454054643,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.5616,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.7606177606177607,
413
+ "grad_norm": 0.48282246019994013,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.559,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.7915057915057915,
420
+ "grad_norm": 0.47502284833211744,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.5575,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.8223938223938223,
427
+ "grad_norm": 0.45633738284447206,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.5532,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.8532818532818531,
434
+ "grad_norm": 0.5759433055884126,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.5615,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.8841698841698842,
441
+ "grad_norm": 0.5858108727658949,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.5593,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.915057915057915,
448
+ "grad_norm": 0.5395422279987274,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.566,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.945945945945946,
455
+ "grad_norm": 0.544007523438176,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.5546,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.9768339768339769,
462
+ "grad_norm": 0.4756136721421921,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.5577,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.9984555984555985,
469
+ "eval_loss": 0.6016931533813477,
470
+ "eval_runtime": 175.4937,
471
+ "eval_samples_per_second": 49.688,
472
  "eval_steps_per_second": 0.393,
473
  "step": 647
474
  },
475
  {
476
  "epoch": 2.0077220077220077,
477
+ "grad_norm": 0.9647134139465964,
478
  "learning_rate": 5e-06,
479
+ "loss": 0.5481,
480
  "step": 650
481
  },
482
  {
483
  "epoch": 2.0386100386100385,
484
+ "grad_norm": 0.6126014654142371,
485
  "learning_rate": 5e-06,
486
+ "loss": 0.5195,
487
  "step": 660
488
  },
489
  {
490
  "epoch": 2.0694980694980694,
491
+ "grad_norm": 0.642576302083791,
492
  "learning_rate": 5e-06,
493
+ "loss": 0.5154,
494
  "step": 670
495
  },
496
  {
497
  "epoch": 2.1003861003861,
498
+ "grad_norm": 0.6053831547491819,
499
  "learning_rate": 5e-06,
500
+ "loss": 0.5141,
501
  "step": 680
502
  },
503
  {
504
  "epoch": 2.1312741312741315,
505
+ "grad_norm": 0.570602965823511,
506
  "learning_rate": 5e-06,
507
+ "loss": 0.4963,
508
  "step": 690
509
  },
510
  {
511
  "epoch": 2.1621621621621623,
512
+ "grad_norm": 0.5789932913521146,
513
  "learning_rate": 5e-06,
514
+ "loss": 0.5023,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.193050193050193,
519
+ "grad_norm": 0.49902888045361504,
520
  "learning_rate": 5e-06,
521
+ "loss": 0.5053,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.223938223938224,
526
+ "grad_norm": 0.5200818247457489,
527
  "learning_rate": 5e-06,
528
+ "loss": 0.5086,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.2548262548262548,
533
+ "grad_norm": 0.5147752817966699,
534
  "learning_rate": 5e-06,
535
+ "loss": 0.5144,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.2857142857142856,
540
+ "grad_norm": 0.5202904589332674,
541
  "learning_rate": 5e-06,
542
+ "loss": 0.5075,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.3166023166023164,
547
+ "grad_norm": 0.6419271084211798,
548
  "learning_rate": 5e-06,
549
+ "loss": 0.5095,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.3474903474903477,
554
+ "grad_norm": 0.48888244575595774,
555
  "learning_rate": 5e-06,
556
+ "loss": 0.5074,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.3783783783783785,
561
+ "grad_norm": 0.4949992881722656,
562
  "learning_rate": 5e-06,
563
+ "loss": 0.5058,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.4092664092664093,
568
+ "grad_norm": 0.6977527327479829,
569
  "learning_rate": 5e-06,
570
+ "loss": 0.5055,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.44015444015444,
575
+ "grad_norm": 0.5281157241550238,
576
  "learning_rate": 5e-06,
577
+ "loss": 0.5101,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.471042471042471,
582
+ "grad_norm": 0.5317168799948615,
583
  "learning_rate": 5e-06,
584
+ "loss": 0.5092,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.501930501930502,
589
+ "grad_norm": 0.5665091196485048,
590
  "learning_rate": 5e-06,
591
+ "loss": 0.5123,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.532818532818533,
596
+ "grad_norm": 0.5733443367920803,
597
  "learning_rate": 5e-06,
598
+ "loss": 0.5045,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.563706563706564,
603
+ "grad_norm": 0.4895951656550531,
604
  "learning_rate": 5e-06,
605
+ "loss": 0.5245,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.5945945945945947,
610
+ "grad_norm": 0.5509555644837258,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.5074,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.6254826254826256,
617
+ "grad_norm": 0.5242119378254794,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5102,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.6563706563706564,
624
+ "grad_norm": 0.6137964577932209,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5066,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.687258687258687,
631
+ "grad_norm": 0.517696061441138,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5156,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.718146718146718,
638
+ "grad_norm": 0.5449458713675641,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5153,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.749034749034749,
645
+ "grad_norm": 0.5127690582430875,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5121,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.7799227799227797,
652
+ "grad_norm": 0.5989390375727383,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5149,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.810810810810811,
659
+ "grad_norm": 0.5398603643342522,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5116,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.841698841698842,
666
+ "grad_norm": 0.5257472179121192,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5154,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.8725868725868726,
673
+ "grad_norm": 0.5919523831497919,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.516,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.9034749034749034,
680
+ "grad_norm": 0.5425769518104601,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.5114,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.9343629343629343,
687
+ "grad_norm": 0.49233460424864756,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5126,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.965250965250965,
694
+ "grad_norm": 0.5105330314981293,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.5096,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.993050193050193,
701
+ "eval_loss": 0.6065478920936584,
702
+ "eval_runtime": 175.4708,
703
+ "eval_samples_per_second": 49.695,
704
  "eval_steps_per_second": 0.393,
705
  "step": 969
706
  },
 
708
  "epoch": 2.993050193050193,
709
  "step": 969,
710
  "total_flos": 1622692331520000.0,
711
+ "train_loss": 0.5687421354839061,
712
+ "train_runtime": 29217.0698,
713
+ "train_samples_per_second": 17.012,
714
  "train_steps_per_second": 0.033
715
  }
716
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED