irishprancer commited on
Commit
3f527d8
·
verified ·
1 Parent(s): 5cd6f04

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -22,8 +22,8 @@
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
- "k_proj",
26
- "q_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
+ "q_proj",
26
+ "k_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:652d63c68dfe1e35e6942371c8f9f4a9f19ab99ae5f4ddbcdfca63ce319f3780
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b0f779026ec70664afe700822ef58aa8ad2099293e8a97ff37d57139227678
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd325e085efecfd52fda63e3ae981a902519e53e5102f41867a1ca760f848916
3
  size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:883c0d10e8ceb16e5ba20eebd16f134a05b14cf22161289dc343eceb41e829a7
3
  size 1054136250
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d591cd4a87db6ff7862986fcbd71a7ea08ac34a6c4ca00eb88fbc6e4ccf1c5bd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226f394c3a9826cc7f74d0799aa02f643f1ee6b891784f44c588787dbc9c0cb3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f47df519d3e34f85833ffe9513be298918979811657719c019fec7ab68351e14
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2673d78ac7304a2a7678ae71ed65422fa2295f07aca63cf23ca76e0b5c92da69
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,577 +1,125 @@
1
  {
2
- "best_metric": 0.7210294008255005,
3
- "best_model_checkpoint": "./output/checkpoint-750",
4
- "epoch": 16.666666666666668,
5
  "eval_steps": 150,
6
- "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2222222222222222,
13
- "grad_norm": 1.9085510969161987,
14
  "learning_rate": 2.9999999999999984e-06,
15
  "loss": 0.6619,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.4444444444444444,
20
- "grad_norm": 1.8888217210769653,
21
  "learning_rate": 5.999999999999997e-06,
22
- "loss": 0.7258,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6666666666666666,
27
- "grad_norm": 1.6715161800384521,
28
  "learning_rate": 8.999999999999993e-06,
29
- "loss": 0.7336,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.8888888888888888,
34
- "grad_norm": 1.576881766319275,
35
  "learning_rate": 1.1999999999999994e-05,
36
- "loss": 0.737,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.1111111111111112,
41
- "grad_norm": 1.2493071556091309,
42
  "learning_rate": 1.499999999999999e-05,
43
- "loss": 0.8943,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 1.3333333333333333,
48
- "grad_norm": 1.1351404190063477,
49
  "learning_rate": 1.7999999999999987e-05,
50
- "loss": 0.7145,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.5555555555555556,
55
- "grad_norm": 1.5109528303146362,
56
  "learning_rate": 2.0999999999999985e-05,
57
- "loss": 0.7582,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.7777777777777777,
62
- "grad_norm": 1.872719645500183,
63
  "learning_rate": 2.3999999999999987e-05,
64
- "loss": 0.7074,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 2.0,
69
- "grad_norm": 2.0645689964294434,
70
  "learning_rate": 2.6999999999999982e-05,
71
- "loss": 0.7704,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 2.2222222222222223,
76
- "grad_norm": 1.5649832487106323,
77
  "learning_rate": 2.999999999999998e-05,
78
- "loss": 0.727,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 2.4444444444444446,
83
- "grad_norm": 1.7015666961669922,
84
  "learning_rate": 2.999999702723961e-05,
85
- "loss": 0.747,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 2.6666666666666665,
90
- "grad_norm": 2.7937145233154297,
91
  "learning_rate": 2.9999988108959667e-05,
92
- "loss": 0.5702,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 2.888888888888889,
97
- "grad_norm": 2.12908673286438,
98
  "learning_rate": 2.9999973245163695e-05,
99
- "loss": 0.7045,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 3.111111111111111,
104
- "grad_norm": 2.259050130844116,
105
  "learning_rate": 2.999995243585758e-05,
106
- "loss": 0.7171,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 3.3333333333333335,
111
- "grad_norm": 1.9302667379379272,
112
  "learning_rate": 2.9999925681049573e-05,
113
- "loss": 0.6647,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 3.3333333333333335,
118
- "eval_loss": 0.7681264281272888,
119
- "eval_runtime": 0.4717,
120
- "eval_samples_per_second": 21.2,
121
- "eval_steps_per_second": 21.2,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 3.5555555555555554,
126
- "grad_norm": 2.575085401535034,
127
- "learning_rate": 2.9999892980750276e-05,
128
- "loss": 0.6946,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 3.7777777777777777,
133
- "grad_norm": 2.4067776203155518,
134
- "learning_rate": 2.9999854334972655e-05,
135
- "loss": 0.6932,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 4.0,
140
- "grad_norm": 2.5401134490966797,
141
- "learning_rate": 2.999980974373202e-05,
142
- "loss": 0.6686,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 4.222222222222222,
147
- "grad_norm": 1.5957380533218384,
148
- "learning_rate": 2.9999759207046055e-05,
149
- "loss": 0.5898,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 4.444444444444445,
154
- "grad_norm": 1.6820542812347412,
155
- "learning_rate": 2.9999702724934783e-05,
156
- "loss": 0.7104,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 4.666666666666667,
161
- "grad_norm": 3.4158143997192383,
162
- "learning_rate": 2.99996402974206e-05,
163
- "loss": 0.628,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 4.888888888888889,
168
- "grad_norm": 2.0864531993865967,
169
- "learning_rate": 2.9999571924528243e-05,
170
- "loss": 0.6736,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 5.111111111111111,
175
- "grad_norm": 2.6299381256103516,
176
- "learning_rate": 2.9999497606284816e-05,
177
- "loss": 0.6027,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 5.333333333333333,
182
- "grad_norm": 1.4852367639541626,
183
- "learning_rate": 2.9999417342719775e-05,
184
- "loss": 0.6946,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 5.555555555555555,
189
- "grad_norm": 2.2180473804473877,
190
- "learning_rate": 2.9999331133864935e-05,
191
- "loss": 0.6481,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 5.777777777777778,
196
- "grad_norm": 1.7053271532058716,
197
- "learning_rate": 2.9999238979754465e-05,
198
- "loss": 0.6095,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 6.0,
203
- "grad_norm": 1.8998974561691284,
204
- "learning_rate": 2.99991408804249e-05,
205
- "loss": 0.5759,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 6.222222222222222,
210
- "grad_norm": 2.1407132148742676,
211
- "learning_rate": 2.999903683591511e-05,
212
- "loss": 0.5739,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 6.444444444444445,
217
- "grad_norm": 1.3923563957214355,
218
- "learning_rate": 2.9998926846266345e-05,
219
- "loss": 0.6132,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 6.666666666666667,
224
- "grad_norm": 2.2752490043640137,
225
- "learning_rate": 2.9998810911522193e-05,
226
- "loss": 0.6228,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 6.666666666666667,
231
- "eval_loss": 0.7380812168121338,
232
- "eval_runtime": 0.4678,
233
- "eval_samples_per_second": 21.377,
234
- "eval_steps_per_second": 21.377,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 6.888888888888889,
239
- "grad_norm": 2.714766025543213,
240
- "learning_rate": 2.9998689031728615e-05,
241
- "loss": 0.6482,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 7.111111111111111,
246
- "grad_norm": 1.7628875970840454,
247
- "learning_rate": 2.9998561206933918e-05,
248
- "loss": 0.5861,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 7.333333333333333,
253
- "grad_norm": 1.7664484977722168,
254
- "learning_rate": 2.9998427437188766e-05,
255
- "loss": 0.5797,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 7.555555555555555,
260
- "grad_norm": 2.3483684062957764,
261
- "learning_rate": 2.999828772254618e-05,
262
- "loss": 0.6032,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 7.777777777777778,
267
- "grad_norm": 2.471663236618042,
268
- "learning_rate": 2.9998142063061544e-05,
269
- "loss": 0.6629,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 8.0,
274
- "grad_norm": 1.3223252296447754,
275
- "learning_rate": 2.9997990458792583e-05,
276
- "loss": 0.6038,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 8.222222222222221,
281
- "grad_norm": 1.9036260843276978,
282
- "learning_rate": 2.9997832909799397e-05,
283
- "loss": 0.5487,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 8.444444444444445,
288
- "grad_norm": 1.9370992183685303,
289
- "learning_rate": 2.9997669416144432e-05,
290
- "loss": 0.6407,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 8.666666666666666,
295
- "grad_norm": 1.0507700443267822,
296
- "learning_rate": 2.999749997789249e-05,
297
- "loss": 0.54,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 8.88888888888889,
302
- "grad_norm": 1.5254780054092407,
303
- "learning_rate": 2.9997324595110723e-05,
304
- "loss": 0.6542,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 9.11111111111111,
309
- "grad_norm": 1.3856695890426636,
310
- "learning_rate": 2.9997143267868663e-05,
311
- "loss": 0.595,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 9.333333333333334,
316
- "grad_norm": 2.4855446815490723,
317
- "learning_rate": 2.999695599623817e-05,
318
- "loss": 0.6228,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 9.555555555555555,
323
- "grad_norm": 2.5905134677886963,
324
- "learning_rate": 2.9996762780293483e-05,
325
- "loss": 0.5749,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 9.777777777777779,
330
- "grad_norm": 1.533772349357605,
331
- "learning_rate": 2.9996563620111176e-05,
332
- "loss": 0.5296,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 10.0,
337
- "grad_norm": 1.3861850500106812,
338
- "learning_rate": 2.9996358515770198e-05,
339
- "loss": 0.5418,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 10.0,
344
- "eval_loss": 0.7268816828727722,
345
- "eval_runtime": 0.425,
346
- "eval_samples_per_second": 23.528,
347
- "eval_steps_per_second": 23.528,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 10.222222222222221,
352
- "grad_norm": 2.2311553955078125,
353
- "learning_rate": 2.9996147467351836e-05,
354
- "loss": 0.5062,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 10.444444444444445,
359
- "grad_norm": 1.4143763780593872,
360
- "learning_rate": 2.9995930474939753e-05,
361
- "loss": 0.4908,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 10.666666666666666,
366
- "grad_norm": 1.9285305738449097,
367
- "learning_rate": 2.9995707538619954e-05,
368
- "loss": 0.6355,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 10.88888888888889,
373
- "grad_norm": 2.1390364170074463,
374
- "learning_rate": 2.9995478658480802e-05,
375
- "loss": 0.5531,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 11.11111111111111,
380
- "grad_norm": 1.8684849739074707,
381
- "learning_rate": 2.9995243834613023e-05,
382
- "loss": 0.5237,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 11.333333333333334,
387
- "grad_norm": 1.6750541925430298,
388
- "learning_rate": 2.9995003067109687e-05,
389
- "loss": 0.5385,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 11.555555555555555,
394
- "grad_norm": 2.6402785778045654,
395
- "learning_rate": 2.9994756356066226e-05,
396
- "loss": 0.5847,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 11.777777777777779,
401
- "grad_norm": 2.2594568729400635,
402
- "learning_rate": 2.999450370158044e-05,
403
- "loss": 0.5341,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 12.0,
408
- "grad_norm": 1.5283032655715942,
409
- "learning_rate": 2.9994245103752457e-05,
410
- "loss": 0.5243,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 12.222222222222221,
415
- "grad_norm": 1.2468581199645996,
416
- "learning_rate": 2.999398056268479e-05,
417
- "loss": 0.5354,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 12.444444444444445,
422
- "grad_norm": 1.4768636226654053,
423
- "learning_rate": 2.9993710078482286e-05,
424
- "loss": 0.4149,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 12.666666666666666,
429
- "grad_norm": 3.385610580444336,
430
- "learning_rate": 2.9993433651252164e-05,
431
- "loss": 0.6199,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 12.88888888888889,
436
- "grad_norm": 1.4733967781066895,
437
- "learning_rate": 2.9993151281103986e-05,
438
- "loss": 0.5354,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 13.11111111111111,
443
- "grad_norm": 2.435899019241333,
444
- "learning_rate": 2.9992862968149675e-05,
445
- "loss": 0.4177,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 13.333333333333334,
450
- "grad_norm": 2.4628303050994873,
451
- "learning_rate": 2.9992568712503513e-05,
452
- "loss": 0.5327,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 13.333333333333334,
457
- "eval_loss": 0.7213956117630005,
458
- "eval_runtime": 0.4476,
459
- "eval_samples_per_second": 22.343,
460
- "eval_steps_per_second": 22.343,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 13.555555555555555,
465
- "grad_norm": 2.3349859714508057,
466
- "learning_rate": 2.9992268514282122e-05,
467
- "loss": 0.5655,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 13.777777777777779,
472
- "grad_norm": 2.215526580810547,
473
- "learning_rate": 2.99919623736045e-05,
474
- "loss": 0.4406,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 14.0,
479
- "grad_norm": 2.2699978351593018,
480
- "learning_rate": 2.9991650290591996e-05,
481
- "loss": 0.6034,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 14.222222222222221,
486
- "grad_norm": 2.2520437240600586,
487
- "learning_rate": 2.99913322653683e-05,
488
- "loss": 0.492,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 14.444444444444445,
493
- "grad_norm": 1.8439189195632935,
494
- "learning_rate": 2.9991008298059473e-05,
495
- "loss": 0.5013,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 14.666666666666666,
500
- "grad_norm": 1.5318201780319214,
501
- "learning_rate": 2.9990678388793924e-05,
502
- "loss": 0.5316,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 14.88888888888889,
507
- "grad_norm": 1.2852894067764282,
508
- "learning_rate": 2.999034253770242e-05,
509
- "loss": 0.4578,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 15.11111111111111,
514
- "grad_norm": 2.532104015350342,
515
- "learning_rate": 2.9990000744918076e-05,
516
- "loss": 0.4492,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 15.333333333333334,
521
- "grad_norm": 1.6051527261734009,
522
- "learning_rate": 2.9989653010576372e-05,
523
- "loss": 0.4531,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 15.555555555555555,
528
- "grad_norm": 1.4627478122711182,
529
- "learning_rate": 2.9989299334815138e-05,
530
- "loss": 0.4804,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 15.777777777777779,
535
- "grad_norm": 2.238284111022949,
536
- "learning_rate": 2.9988939717774558e-05,
537
- "loss": 0.524,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 16.0,
542
- "grad_norm": 1.3457872867584229,
543
- "learning_rate": 2.9988574159597174e-05,
544
- "loss": 0.5107,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 16.22222222222222,
549
- "grad_norm": 2.1465327739715576,
550
- "learning_rate": 2.9988202660427887e-05,
551
- "loss": 0.4642,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 16.444444444444443,
556
- "grad_norm": 3.066289186477661,
557
- "learning_rate": 2.9987825220413937e-05,
558
- "loss": 0.5349,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 16.666666666666668,
563
- "grad_norm": 1.9061943292617798,
564
- "learning_rate": 2.998744183970494e-05,
565
- "loss": 0.4266,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 16.666666666666668,
570
- "eval_loss": 0.7210294008255005,
571
- "eval_runtime": 0.405,
572
- "eval_samples_per_second": 24.693,
573
- "eval_steps_per_second": 24.693,
574
- "step": 750
575
  }
576
  ],
577
  "logging_steps": 10,
@@ -591,7 +139,7 @@
591
  "attributes": {}
592
  }
593
  },
594
- "total_flos": 7981049240027136.0,
595
  "train_batch_size": 2,
596
  "trial_name": null,
597
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7682406902313232,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 3.3333333333333335,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2222222222222222,
13
+ "grad_norm": 1.9091473817825317,
14
  "learning_rate": 2.9999999999999984e-06,
15
  "loss": 0.6619,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.4444444444444444,
20
+ "grad_norm": 1.8889063596725464,
21
  "learning_rate": 5.999999999999997e-06,
22
+ "loss": 0.7254,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.6666666666666666,
27
+ "grad_norm": 1.6717636585235596,
28
  "learning_rate": 8.999999999999993e-06,
29
+ "loss": 0.7335,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.8888888888888888,
34
+ "grad_norm": 1.5766067504882812,
35
  "learning_rate": 1.1999999999999994e-05,
36
+ "loss": 0.7377,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 1.1111111111111112,
41
+ "grad_norm": 1.2490640878677368,
42
  "learning_rate": 1.499999999999999e-05,
43
+ "loss": 0.8942,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 1.3333333333333333,
48
+ "grad_norm": 1.1447882652282715,
49
  "learning_rate": 1.7999999999999987e-05,
50
+ "loss": 0.7146,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.5555555555555556,
55
+ "grad_norm": 1.5109171867370605,
56
  "learning_rate": 2.0999999999999985e-05,
57
+ "loss": 0.7579,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.7777777777777777,
62
+ "grad_norm": 1.8730992078781128,
63
  "learning_rate": 2.3999999999999987e-05,
64
+ "loss": 0.707,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 2.0,
69
+ "grad_norm": 2.063655138015747,
70
  "learning_rate": 2.6999999999999982e-05,
71
+ "loss": 0.7705,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 2.2222222222222223,
76
+ "grad_norm": 1.565353512763977,
77
  "learning_rate": 2.999999999999998e-05,
78
+ "loss": 0.7273,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 2.4444444444444446,
83
+ "grad_norm": 1.701326608657837,
84
  "learning_rate": 2.999999702723961e-05,
85
+ "loss": 0.7474,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 2.6666666666666665,
90
+ "grad_norm": 2.7937393188476562,
91
  "learning_rate": 2.9999988108959667e-05,
92
+ "loss": 0.5701,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 2.888888888888889,
97
+ "grad_norm": 2.1285998821258545,
98
  "learning_rate": 2.9999973245163695e-05,
99
+ "loss": 0.7044,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 3.111111111111111,
104
+ "grad_norm": 2.2441246509552,
105
  "learning_rate": 2.999995243585758e-05,
106
+ "loss": 0.717,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 3.3333333333333335,
111
+ "grad_norm": 1.9379878044128418,
112
  "learning_rate": 2.9999925681049573e-05,
113
+ "loss": 0.664,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 3.3333333333333335,
118
+ "eval_loss": 0.7682406902313232,
119
+ "eval_runtime": 0.4514,
120
+ "eval_samples_per_second": 22.155,
121
+ "eval_steps_per_second": 22.155,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 1615667282657280.0,
143
  "train_batch_size": 2,
144
  "trial_name": null,
145
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc378f68851406ff2cbb4bd474c2caf3a38d9a74ccb912ae832b6d6e36628285
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68671d3a40b312e8c641605c2dc5233644a3f074e034d0d6789a3fdd173db03a
3
  size 5496