irishprancer committed on
Commit
edbc27f
·
verified ·
1 Parent(s): d583167

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -22,8 +22,8 @@
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
- "q_proj",
26
- "k_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
+ "k_proj",
26
+ "q_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e509195daf848fcb63ff142e78b630faccf6e51fad01b6e3b473f7c9b81e1da0
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de9c7983d81c83a25d9f5af8ab2f633d7e6d24eec5b255ecd10994cf1de3e68
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44ecbc32e205dad7c9ef52f7e61a7eb02340bf5159e891c7a802a3da51f2db68
3
- size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9b36b33638ac27ad81d5b5cd5fa2e56673b3a7acb5c226ff0baa60d122dfdc7
3
+ size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d591cd4a87db6ff7862986fcbd71a7ea08ac34a6c4ca00eb88fbc6e4ccf1c5bd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afdcd158786234085082ce38b1824c51dd8c72881220443fc2d1c6f4e031a983
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f47df519d3e34f85833ffe9513be298918979811657719c019fec7ab68351e14
3
- size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11a68d93d91bb3affb76e357047e1fc6d0e0e337baf5ccbf0315ac19ab68b7ca
3
+ size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,583 +1,131 @@
1
  {
2
- "best_metric": 0.7210280299186707,
3
- "best_model_checkpoint": "./output/checkpoint-750",
4
- "epoch": 16.666666666666668,
5
  "eval_steps": 150,
6
- "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.2222222222222222,
13
- "grad_norm": 1.9091473817825317,
14
- "learning_rate": 2.9999999999999984e-06,
15
- "loss": 0.6619,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.4444444444444444,
20
- "grad_norm": 1.8889063596725464,
21
- "learning_rate": 5.999999999999997e-06,
22
- "loss": 0.7254,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.6666666666666666,
27
- "grad_norm": 1.6717636585235596,
28
- "learning_rate": 8.999999999999993e-06,
29
- "loss": 0.7335,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.8888888888888888,
34
- "grad_norm": 1.5766067504882812,
35
- "learning_rate": 1.1999999999999994e-05,
36
- "loss": 0.7377,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 1.1111111111111112,
41
- "grad_norm": 1.2490640878677368,
42
- "learning_rate": 1.499999999999999e-05,
43
- "loss": 0.8942,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 1.3333333333333333,
48
- "grad_norm": 1.1447882652282715,
49
- "learning_rate": 1.7999999999999987e-05,
50
- "loss": 0.7146,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 1.5555555555555556,
55
- "grad_norm": 1.5109171867370605,
56
- "learning_rate": 2.0999999999999985e-05,
57
- "loss": 0.7579,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 1.7777777777777777,
62
- "grad_norm": 1.8730992078781128,
63
- "learning_rate": 2.3999999999999987e-05,
64
- "loss": 0.707,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 2.0,
69
- "grad_norm": 2.063655138015747,
70
- "learning_rate": 2.6999999999999982e-05,
71
- "loss": 0.7705,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 2.2222222222222223,
76
- "grad_norm": 1.565353512763977,
77
- "learning_rate": 2.999999999999998e-05,
78
- "loss": 0.7273,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 2.4444444444444446,
83
- "grad_norm": 1.701326608657837,
84
- "learning_rate": 2.999999702723961e-05,
85
- "loss": 0.7474,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 2.6666666666666665,
90
- "grad_norm": 2.7937393188476562,
91
- "learning_rate": 2.9999988108959667e-05,
92
- "loss": 0.5701,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 2.888888888888889,
97
- "grad_norm": 2.1285998821258545,
98
- "learning_rate": 2.9999973245163695e-05,
99
- "loss": 0.7044,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 3.111111111111111,
104
- "grad_norm": 2.2441246509552,
105
- "learning_rate": 2.999995243585758e-05,
106
- "loss": 0.717,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 3.3333333333333335,
111
- "grad_norm": 1.9379878044128418,
112
- "learning_rate": 2.9999925681049573e-05,
113
- "loss": 0.664,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 3.3333333333333335,
118
- "eval_loss": 0.7682406902313232,
119
- "eval_runtime": 0.4514,
120
- "eval_samples_per_second": 22.155,
121
- "eval_steps_per_second": 22.155,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 3.5555555555555554,
126
- "grad_norm": 2.575423002243042,
127
- "learning_rate": 2.9999892980750276e-05,
128
- "loss": 0.6943,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 3.7777777777777777,
133
- "grad_norm": 2.392334222793579,
134
- "learning_rate": 2.9999854334972655e-05,
135
- "loss": 0.6922,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 4.0,
140
- "grad_norm": 2.5393826961517334,
141
- "learning_rate": 2.999980974373202e-05,
142
- "loss": 0.6677,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 4.222222222222222,
147
- "grad_norm": 1.6046398878097534,
148
- "learning_rate": 2.9999759207046055e-05,
149
- "loss": 0.5898,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 4.444444444444445,
154
- "grad_norm": 1.681414246559143,
155
- "learning_rate": 2.9999702724934783e-05,
156
- "loss": 0.7117,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 4.666666666666667,
161
- "grad_norm": 3.416290044784546,
162
- "learning_rate": 2.99996402974206e-05,
163
- "loss": 0.6278,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 4.888888888888889,
168
- "grad_norm": 2.0781354904174805,
169
- "learning_rate": 2.9999571924528243e-05,
170
- "loss": 0.6732,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 5.111111111111111,
175
- "grad_norm": 2.630134105682373,
176
- "learning_rate": 2.9999497606284816e-05,
177
- "loss": 0.6029,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 5.333333333333333,
182
- "grad_norm": 1.482037901878357,
183
- "learning_rate": 2.9999417342719775e-05,
184
- "loss": 0.6941,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 5.555555555555555,
189
- "grad_norm": 2.217900514602661,
190
- "learning_rate": 2.9999331133864935e-05,
191
- "loss": 0.6478,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 5.777777777777778,
196
- "grad_norm": 1.7131129503250122,
197
- "learning_rate": 2.9999238979754465e-05,
198
- "loss": 0.6095,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 6.0,
203
- "grad_norm": 1.908470869064331,
204
- "learning_rate": 2.99991408804249e-05,
205
- "loss": 0.5758,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 6.222222222222222,
210
- "grad_norm": 2.141641616821289,
211
- "learning_rate": 2.999903683591511e-05,
212
- "loss": 0.574,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 6.444444444444445,
217
- "grad_norm": 1.3931849002838135,
218
- "learning_rate": 2.9998926846266345e-05,
219
- "loss": 0.6139,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 6.666666666666667,
224
- "grad_norm": 2.278519868850708,
225
- "learning_rate": 2.9998810911522193e-05,
226
- "loss": 0.6227,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 6.666666666666667,
231
- "eval_loss": 0.7388573884963989,
232
- "eval_runtime": 0.4661,
233
- "eval_samples_per_second": 21.453,
234
- "eval_steps_per_second": 21.453,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 6.888888888888889,
239
- "grad_norm": 2.712602138519287,
240
- "learning_rate": 2.9998689031728615e-05,
241
- "loss": 0.6479,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 7.111111111111111,
246
- "grad_norm": 1.7634906768798828,
247
- "learning_rate": 2.9998561206933918e-05,
248
- "loss": 0.5863,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 7.333333333333333,
253
- "grad_norm": 1.772024154663086,
254
- "learning_rate": 2.9998427437188766e-05,
255
- "loss": 0.5795,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 7.555555555555555,
260
- "grad_norm": 2.34784197807312,
261
- "learning_rate": 2.999828772254618e-05,
262
- "loss": 0.6034,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 7.777777777777778,
267
- "grad_norm": 2.455519199371338,
268
- "learning_rate": 2.9998142063061544e-05,
269
- "loss": 0.6625,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 8.0,
274
- "grad_norm": 1.3227782249450684,
275
- "learning_rate": 2.9997990458792583e-05,
276
- "loss": 0.6041,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 8.222222222222221,
281
- "grad_norm": 1.9034490585327148,
282
- "learning_rate": 2.9997832909799397e-05,
283
- "loss": 0.5491,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 8.444444444444445,
288
- "grad_norm": 1.9352225065231323,
289
- "learning_rate": 2.9997669416144432e-05,
290
- "loss": 0.6406,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 8.666666666666666,
295
- "grad_norm": 1.0488076210021973,
296
- "learning_rate": 2.999749997789249e-05,
297
- "loss": 0.5398,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 8.88888888888889,
302
- "grad_norm": 1.5216209888458252,
303
- "learning_rate": 2.9997324595110723e-05,
304
- "loss": 0.6545,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 9.11111111111111,
309
- "grad_norm": 1.3843863010406494,
310
- "learning_rate": 2.9997143267868663e-05,
311
- "loss": 0.5948,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 9.333333333333334,
316
- "grad_norm": 2.4701507091522217,
317
- "learning_rate": 2.999695599623817e-05,
318
- "loss": 0.6224,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 9.555555555555555,
323
- "grad_norm": 2.598496198654175,
324
- "learning_rate": 2.9996762780293483e-05,
325
- "loss": 0.575,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 9.777777777777779,
330
- "grad_norm": 1.5249278545379639,
331
- "learning_rate": 2.9996563620111176e-05,
332
- "loss": 0.5294,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 10.0,
337
- "grad_norm": 1.3830034732818604,
338
- "learning_rate": 2.9996358515770198e-05,
339
- "loss": 0.5417,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 10.0,
344
- "eval_loss": 0.726381778717041,
345
- "eval_runtime": 0.4056,
346
- "eval_samples_per_second": 24.653,
347
- "eval_steps_per_second": 24.653,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 10.222222222222221,
352
- "grad_norm": 2.226635456085205,
353
- "learning_rate": 2.9996147467351836e-05,
354
- "loss": 0.506,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 10.444444444444445,
359
- "grad_norm": 1.4145069122314453,
360
- "learning_rate": 2.9995930474939753e-05,
361
- "loss": 0.4908,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 10.666666666666666,
366
- "grad_norm": 1.9364039897918701,
367
- "learning_rate": 2.9995707538619954e-05,
368
- "loss": 0.6364,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 10.88888888888889,
373
- "grad_norm": 2.14816951751709,
374
- "learning_rate": 2.9995478658480802e-05,
375
- "loss": 0.5532,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 11.11111111111111,
380
- "grad_norm": 1.8744515180587769,
381
- "learning_rate": 2.9995243834613023e-05,
382
- "loss": 0.5234,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 11.333333333333334,
387
- "grad_norm": 1.6773265600204468,
388
- "learning_rate": 2.9995003067109687e-05,
389
- "loss": 0.5392,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 11.555555555555555,
394
- "grad_norm": 2.6416842937469482,
395
- "learning_rate": 2.9994756356066226e-05,
396
- "loss": 0.5848,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 11.777777777777779,
401
- "grad_norm": 2.257610321044922,
402
- "learning_rate": 2.999450370158044e-05,
403
- "loss": 0.5336,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 12.0,
408
- "grad_norm": 1.5329607725143433,
409
- "learning_rate": 2.9994245103752457e-05,
410
- "loss": 0.5242,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 12.222222222222221,
415
- "grad_norm": 1.2473564147949219,
416
- "learning_rate": 2.999398056268479e-05,
417
- "loss": 0.5356,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 12.444444444444445,
422
- "grad_norm": 1.4698841571807861,
423
- "learning_rate": 2.9993710078482286e-05,
424
- "loss": 0.4155,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 12.666666666666666,
429
- "grad_norm": 3.38484525680542,
430
- "learning_rate": 2.9993433651252164e-05,
431
- "loss": 0.6201,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 12.88888888888889,
436
- "grad_norm": 1.4733773469924927,
437
- "learning_rate": 2.9993151281103986e-05,
438
- "loss": 0.5349,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 13.11111111111111,
443
- "grad_norm": 2.4409337043762207,
444
- "learning_rate": 2.9992862968149675e-05,
445
- "loss": 0.4179,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 13.333333333333334,
450
- "grad_norm": 2.4499781131744385,
451
- "learning_rate": 2.9992568712503513e-05,
452
- "loss": 0.5321,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 13.333333333333334,
457
- "eval_loss": 0.7211434841156006,
458
- "eval_runtime": 0.4323,
459
- "eval_samples_per_second": 23.134,
460
- "eval_steps_per_second": 23.134,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 13.555555555555555,
465
- "grad_norm": 2.347456455230713,
466
- "learning_rate": 2.9992268514282122e-05,
467
- "loss": 0.5653,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 13.777777777777779,
472
- "grad_norm": 2.2229528427124023,
473
- "learning_rate": 2.99919623736045e-05,
474
- "loss": 0.4408,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 14.0,
479
- "grad_norm": 2.275893449783325,
480
- "learning_rate": 2.9991650290591996e-05,
481
- "loss": 0.6037,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 14.222222222222221,
486
- "grad_norm": 2.250699520111084,
487
- "learning_rate": 2.99913322653683e-05,
488
- "loss": 0.4922,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 14.444444444444445,
493
- "grad_norm": 1.8398470878601074,
494
- "learning_rate": 2.9991008298059473e-05,
495
- "loss": 0.5012,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 14.666666666666666,
500
- "grad_norm": 1.539143443107605,
501
- "learning_rate": 2.9990678388793924e-05,
502
- "loss": 0.5321,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 14.88888888888889,
507
- "grad_norm": 1.2890745401382446,
508
- "learning_rate": 2.999034253770242e-05,
509
- "loss": 0.4581,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 15.11111111111111,
514
- "grad_norm": 2.539614200592041,
515
- "learning_rate": 2.9990000744918076e-05,
516
- "loss": 0.4486,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 15.333333333333334,
521
- "grad_norm": 1.6030837297439575,
522
- "learning_rate": 2.9989653010576372e-05,
523
- "loss": 0.4528,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 15.555555555555555,
528
- "grad_norm": 1.46444571018219,
529
- "learning_rate": 2.9989299334815138e-05,
530
- "loss": 0.4805,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 15.777777777777779,
535
- "grad_norm": 2.233593702316284,
536
- "learning_rate": 2.9988939717774558e-05,
537
- "loss": 0.5241,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 16.0,
542
- "grad_norm": 1.3459173440933228,
543
- "learning_rate": 2.9988574159597174e-05,
544
- "loss": 0.5107,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 16.22222222222222,
549
- "grad_norm": 2.139714241027832,
550
- "learning_rate": 2.9988202660427887e-05,
551
- "loss": 0.4647,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 16.444444444444443,
556
- "grad_norm": 3.066899061203003,
557
- "learning_rate": 2.9987825220413937e-05,
558
- "loss": 0.5351,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 16.666666666666668,
563
- "grad_norm": 1.896189570426941,
564
- "learning_rate": 2.998744183970494e-05,
565
- "loss": 0.4269,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 16.666666666666668,
570
- "eval_loss": 0.7210280299186707,
571
- "eval_runtime": 0.4276,
572
- "eval_samples_per_second": 23.389,
573
- "eval_steps_per_second": 23.389,
574
- "step": 750
575
  }
576
  ],
577
  "logging_steps": 10,
578
  "max_steps": 50000,
579
  "num_input_tokens_seen": 0,
580
- "num_train_epochs": 1112,
581
  "save_steps": 150,
582
  "stateful_callbacks": {
583
  "TrainerControl": {
@@ -591,8 +139,8 @@
591
  "attributes": {}
592
  }
593
  },
594
- "total_flos": 7981049240027136.0,
595
- "train_batch_size": 2,
596
  "trial_name": null,
597
  "trial_params": null
598
  }
 
1
  {
2
+ "best_metric": 0.7480350136756897,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 6.521739130434782,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.43478260869565216,
13
+ "grad_norm": 1.5022108554840088,
14
+ "learning_rate": 5e-06,
15
+ "loss": 0.9063,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.8695652173913043,
20
+ "grad_norm": 1.679484248161316,
21
+ "learning_rate": 1e-05,
22
+ "loss": 0.9018,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 1.3043478260869565,
27
+ "grad_norm": 1.7071681022644043,
28
+ "learning_rate": 1.5e-05,
29
+ "loss": 0.8972,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 1.7391304347826086,
34
+ "grad_norm": 1.4155817031860352,
35
+ "learning_rate": 2e-05,
36
+ "loss": 0.9019,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 2.1739130434782608,
41
+ "grad_norm": 1.2699992656707764,
42
+ "learning_rate": 2.5e-05,
43
+ "loss": 0.8208,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 2.608695652173913,
48
+ "grad_norm": 1.902714729309082,
49
+ "learning_rate": 3e-05,
50
+ "loss": 0.865,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 3.0434782608695654,
55
+ "grad_norm": 1.344117283821106,
56
+ "learning_rate": 3.5e-05,
57
+ "loss": 0.853,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 3.4782608695652173,
62
+ "grad_norm": 1.6205377578735352,
63
+ "learning_rate": 4e-05,
64
+ "loss": 0.774,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 3.9130434782608696,
69
+ "grad_norm": 1.364487886428833,
70
+ "learning_rate": 4.5e-05,
71
+ "loss": 0.8004,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 4.3478260869565215,
76
+ "grad_norm": 1.2991905212402344,
77
+ "learning_rate": 5e-05,
78
+ "loss": 0.7915,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 4.782608695652174,
83
+ "grad_norm": 2.0769360065460205,
84
+ "learning_rate": 4.999999504539938e-05,
85
+ "loss": 0.7423,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 5.217391304347826,
90
+ "grad_norm": 1.0391877889633179,
91
+ "learning_rate": 4.999998018159948e-05,
92
+ "loss": 0.6799,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 5.6521739130434785,
97
+ "grad_norm": 1.4947184324264526,
98
+ "learning_rate": 4.999995540860619e-05,
99
+ "loss": 0.6506,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 6.086956521739131,
104
+ "grad_norm": 1.8294117450714111,
105
+ "learning_rate": 4.999992072642933e-05,
106
+ "loss": 0.6741,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 6.521739130434782,
111
+ "grad_norm": 1.1097073554992676,
112
+ "learning_rate": 4.9999876135082656e-05,
113
+ "loss": 0.6923,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 6.521739130434782,
118
+ "eval_loss": 0.7480350136756897,
119
+ "eval_runtime": 0.5268,
120
+ "eval_samples_per_second": 18.983,
121
+ "eval_steps_per_second": 18.983,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
126
  "max_steps": 50000,
127
  "num_input_tokens_seen": 0,
128
+ "num_train_epochs": 2174,
129
  "save_steps": 150,
130
  "stateful_callbacks": {
131
  "TrainerControl": {
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 3894839614291968.0,
143
+ "train_batch_size": 4,
144
  "trial_name": null,
145
  "trial_params": null
146
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68671d3a40b312e8c641605c2dc5233644a3f074e034d0d6789a3fdd173db03a
3
- size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c534cfc3e2dd38ea6fb24a33807c3d741b1e0c48890f1270e8024ad58bfd114d
3
+ size 5368