irishprancer commited on
Commit
789cb70
·
verified ·
1 Parent(s): 430c964

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0e8da576de1a533d44f53c0812dae8d5a26fc059f8c242d522e7ff65d9cf742
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90093abcf7bd2cc38fbd8dd4a992d4a8a84765909d6c543ac763634242b4f4e8
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0793bdd731f1c36630008341e378831fc838bd98c826a69296d2e308eb19cec9
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9519724ca8d0709fa71e288b6a83a4f4e4d6df223c2a8f35f742642724da765
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eed7e63646e60ae2bd56a0754378b43da25eff8bd39e1edda0ec4d07c731eeb8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5feb56512e955691dc9bb9a1e37b9dd590e06a961d7d94560b679e2730b03194
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e372b046187e477eabb1e557ed274ea206c76e12dfed9a7f7bcfa7ddabfb242a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf3f988e8fed2daa2e801eb1f19b681872781cf57f0fb7b896e859a12cfe2bb
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1027 +1,157 @@
1
  {
2
- "best_metric": 0.7167752981185913,
3
- "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 45.65217391304348,
5
  "eval_steps": 150,
6
- "global_step": 1050,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.43478260869565216,
13
- "grad_norm": 1.5021635293960571,
14
  "learning_rate": 3e-06,
15
  "loss": 0.906,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
- "grad_norm": 1.687072992324829,
21
  "learning_rate": 6e-06,
22
- "loss": 0.9025,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.3043478260869565,
27
- "grad_norm": 1.7296812534332275,
28
  "learning_rate": 9e-06,
29
- "loss": 0.9001,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.7391304347826086,
34
- "grad_norm": 1.4459028244018555,
35
  "learning_rate": 1.2e-05,
36
- "loss": 0.909,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 2.1739130434782608,
41
- "grad_norm": 1.351724624633789,
42
  "learning_rate": 1.5e-05,
43
- "loss": 0.8361,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 2.608695652173913,
48
- "grad_norm": 2.0466818809509277,
49
  "learning_rate": 1.8e-05,
50
- "loss": 0.8894,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 3.0434782608695654,
55
- "grad_norm": 1.4843230247497559,
56
  "learning_rate": 2.1e-05,
57
- "loss": 0.8912,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 3.4782608695652173,
62
- "grad_norm": 1.7293753623962402,
63
  "learning_rate": 2.4e-05,
64
- "loss": 0.8237,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 3.9130434782608696,
69
- "grad_norm": 1.424095869064331,
70
  "learning_rate": 2.7000000000000002e-05,
71
- "loss": 0.8527,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 4.3478260869565215,
76
- "grad_norm": 1.3656634092330933,
77
  "learning_rate": 3e-05,
78
- "loss": 0.8649,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 4.782608695652174,
83
- "grad_norm": 2.198690891265869,
84
  "learning_rate": 2.999999702723963e-05,
85
- "loss": 0.8224,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 5.217391304347826,
90
- "grad_norm": 1.0726526975631714,
91
  "learning_rate": 2.9999988108959687e-05,
92
- "loss": 0.7651,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 5.6521739130434785,
97
- "grad_norm": 1.5673664808273315,
98
  "learning_rate": 2.9999973245163716e-05,
99
  "loss": 0.7415,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 6.086956521739131,
104
- "grad_norm": 1.9072725772857666,
105
  "learning_rate": 2.99999524358576e-05,
106
- "loss": 0.7655,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 6.521739130434782,
111
- "grad_norm": 1.1216552257537842,
112
  "learning_rate": 2.9999925681049593e-05,
113
- "loss": 0.7857,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 6.521739130434782,
118
- "eval_loss": 0.7949807047843933,
119
  "eval_runtime": 0.4775,
120
- "eval_samples_per_second": 20.944,
121
- "eval_steps_per_second": 20.944,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 6.521739130434782,
126
- "eval_loss": 0.8609212040901184,
127
- "eval_runtime": 0.417,
128
- "eval_samples_per_second": 23.983,
129
- "eval_steps_per_second": 23.983,
130
  "step": 150
131
  },
132
  {
133
  "epoch": 6.521739130434782,
134
- "eval_loss": 0.7949807047843933,
135
- "eval_runtime": 0.4175,
136
- "eval_samples_per_second": 23.952,
137
- "eval_steps_per_second": 23.952,
138
  "step": 150
139
  },
140
  {
141
  "epoch": 6.521739130434782,
142
- "eval_loss": 0.8609212040901184,
143
- "eval_runtime": 0.4278,
144
- "eval_samples_per_second": 23.374,
145
- "eval_steps_per_second": 23.374,
146
  "step": 150
147
  },
148
  {
149
  "epoch": 6.521739130434782,
150
- "eval_loss": 0.8608071208000183,
151
- "eval_runtime": 0.4109,
152
- "eval_samples_per_second": 24.338,
153
- "eval_steps_per_second": 24.338,
154
  "step": 150
155
- },
156
- {
157
- "epoch": 6.956521739130435,
158
- "grad_norm": 1.533442497253418,
159
- "learning_rate": 2.9999892980750297e-05,
160
- "loss": 0.6585,
161
- "step": 160
162
- },
163
- {
164
- "epoch": 7.391304347826087,
165
- "grad_norm": 1.3458119630813599,
166
- "learning_rate": 2.9999854334972675e-05,
167
- "loss": 0.7387,
168
- "step": 170
169
- },
170
- {
171
- "epoch": 7.826086956521739,
172
- "grad_norm": 1.726345419883728,
173
- "learning_rate": 2.999980974373204e-05,
174
- "loss": 0.729,
175
- "step": 180
176
- },
177
- {
178
- "epoch": 8.26086956521739,
179
- "grad_norm": 1.540044903755188,
180
- "learning_rate": 2.9999759207046075e-05,
181
- "loss": 0.6246,
182
- "step": 190
183
- },
184
- {
185
- "epoch": 8.695652173913043,
186
- "grad_norm": 1.7418792247772217,
187
- "learning_rate": 2.9999702724934804e-05,
188
- "loss": 0.6761,
189
- "step": 200
190
- },
191
- {
192
- "epoch": 9.130434782608695,
193
- "grad_norm": 1.038259506225586,
194
- "learning_rate": 2.999964029742062e-05,
195
- "loss": 0.6525,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 9.565217391304348,
200
- "grad_norm": 1.2199773788452148,
201
- "learning_rate": 2.9999571924528263e-05,
202
- "loss": 0.5592,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 10.0,
207
- "grad_norm": 1.5277819633483887,
208
- "learning_rate": 2.9999497606284837e-05,
209
- "loss": 0.7559,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 10.434782608695652,
214
- "grad_norm": 1.4215247631072998,
215
- "learning_rate": 2.9999417342719796e-05,
216
- "loss": 0.7117,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 10.869565217391305,
221
- "grad_norm": 0.974700927734375,
222
- "learning_rate": 2.9999331133864956e-05,
223
- "loss": 0.5895,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 11.304347826086957,
228
- "grad_norm": 1.1944469213485718,
229
- "learning_rate": 2.9999238979754485e-05,
230
- "loss": 0.6546,
231
- "step": 260
232
- },
233
- {
234
- "epoch": 11.73913043478261,
235
- "grad_norm": 1.049609661102295,
236
- "learning_rate": 2.999914088042492e-05,
237
- "loss": 0.6475,
238
- "step": 270
239
- },
240
- {
241
- "epoch": 12.173913043478262,
242
- "grad_norm": 1.3114821910858154,
243
- "learning_rate": 2.9999036835915132e-05,
244
- "loss": 0.5939,
245
- "step": 280
246
- },
247
- {
248
- "epoch": 12.608695652173914,
249
- "grad_norm": 1.0786523818969727,
250
- "learning_rate": 2.9998926846266365e-05,
251
- "loss": 0.633,
252
- "step": 290
253
- },
254
- {
255
- "epoch": 13.043478260869565,
256
- "grad_norm": 1.3863022327423096,
257
- "learning_rate": 2.9998810911522213e-05,
258
- "loss": 0.5808,
259
- "step": 300
260
- },
261
- {
262
- "epoch": 13.043478260869565,
263
- "eval_loss": 0.7308415770530701,
264
- "eval_runtime": 0.5396,
265
- "eval_samples_per_second": 18.534,
266
- "eval_steps_per_second": 18.534,
267
- "step": 300
268
- },
269
- {
270
- "epoch": 13.043478260869565,
271
- "eval_loss": 0.8609212040901184,
272
- "eval_runtime": 0.5562,
273
- "eval_samples_per_second": 17.98,
274
- "eval_steps_per_second": 17.98,
275
- "step": 300
276
- },
277
- {
278
- "epoch": 13.043478260869565,
279
- "eval_loss": 0.7308415770530701,
280
- "eval_runtime": 0.4963,
281
- "eval_samples_per_second": 20.15,
282
- "eval_steps_per_second": 20.15,
283
- "step": 300
284
- },
285
- {
286
- "epoch": 13.043478260869565,
287
- "eval_loss": 0.7747370004653931,
288
- "eval_runtime": 0.5127,
289
- "eval_samples_per_second": 19.506,
290
- "eval_steps_per_second": 19.506,
291
- "step": 300
292
- },
293
- {
294
- "epoch": 13.043478260869565,
295
- "eval_loss": 0.8607426881790161,
296
- "eval_runtime": 0.5332,
297
- "eval_samples_per_second": 18.756,
298
- "eval_steps_per_second": 18.756,
299
- "step": 300
300
- },
301
- {
302
- "epoch": 13.478260869565217,
303
- "grad_norm": 1.779449224472046,
304
- "learning_rate": 2.9998689031728636e-05,
305
- "loss": 0.5144,
306
- "step": 310
307
- },
308
- {
309
- "epoch": 13.91304347826087,
310
- "grad_norm": 1.5321470499038696,
311
- "learning_rate": 2.9998561206933938e-05,
312
- "loss": 0.6497,
313
- "step": 320
314
- },
315
- {
316
- "epoch": 14.347826086956522,
317
- "grad_norm": 1.4785079956054688,
318
- "learning_rate": 2.9998427437188786e-05,
319
- "loss": 0.5745,
320
- "step": 330
321
- },
322
- {
323
- "epoch": 14.782608695652174,
324
- "grad_norm": 1.3100569248199463,
325
- "learning_rate": 2.99982877225462e-05,
326
- "loss": 0.6013,
327
- "step": 340
328
- },
329
- {
330
- "epoch": 15.217391304347826,
331
- "grad_norm": 0.9780473709106445,
332
- "learning_rate": 2.9998142063061564e-05,
333
- "loss": 0.4987,
334
- "step": 350
335
- },
336
- {
337
- "epoch": 15.652173913043478,
338
- "grad_norm": 1.6418719291687012,
339
- "learning_rate": 2.9997990458792603e-05,
340
- "loss": 0.5628,
341
- "step": 360
342
- },
343
- {
344
- "epoch": 16.08695652173913,
345
- "grad_norm": 1.6354929208755493,
346
- "learning_rate": 2.9997832909799417e-05,
347
- "loss": 0.6671,
348
- "step": 370
349
- },
350
- {
351
- "epoch": 16.52173913043478,
352
- "grad_norm": 0.9526194930076599,
353
- "learning_rate": 2.9997669416144452e-05,
354
- "loss": 0.513,
355
- "step": 380
356
- },
357
- {
358
- "epoch": 16.956521739130434,
359
- "grad_norm": 0.9340882897377014,
360
- "learning_rate": 2.999749997789251e-05,
361
- "loss": 0.5795,
362
- "step": 390
363
- },
364
- {
365
- "epoch": 17.391304347826086,
366
- "grad_norm": 1.1163101196289062,
367
- "learning_rate": 2.9997324595110743e-05,
368
- "loss": 0.518,
369
- "step": 400
370
- },
371
- {
372
- "epoch": 17.82608695652174,
373
- "grad_norm": 1.2847086191177368,
374
- "learning_rate": 2.9997143267868683e-05,
375
- "loss": 0.5882,
376
- "step": 410
377
- },
378
- {
379
- "epoch": 18.26086956521739,
380
- "grad_norm": 1.1625791788101196,
381
- "learning_rate": 2.9996955996238192e-05,
382
- "loss": 0.5061,
383
- "step": 420
384
- },
385
- {
386
- "epoch": 18.695652173913043,
387
- "grad_norm": 1.2001575231552124,
388
- "learning_rate": 2.9996762780293503e-05,
389
- "loss": 0.5314,
390
- "step": 430
391
- },
392
- {
393
- "epoch": 19.130434782608695,
394
- "grad_norm": 1.2133065462112427,
395
- "learning_rate": 2.9996563620111197e-05,
396
- "loss": 0.5337,
397
- "step": 440
398
- },
399
- {
400
- "epoch": 19.565217391304348,
401
- "grad_norm": 1.4226895570755005,
402
- "learning_rate": 2.9996358515770218e-05,
403
- "loss": 0.5677,
404
- "step": 450
405
- },
406
- {
407
- "epoch": 19.565217391304348,
408
- "eval_loss": 0.7167752981185913,
409
- "eval_runtime": 0.5068,
410
- "eval_samples_per_second": 19.73,
411
- "eval_steps_per_second": 19.73,
412
- "step": 450
413
- },
414
- {
415
- "epoch": 19.565217391304348,
416
- "eval_loss": 0.8609212040901184,
417
- "eval_runtime": 0.4062,
418
- "eval_samples_per_second": 24.616,
419
- "eval_steps_per_second": 24.616,
420
- "step": 450
421
- },
422
- {
423
- "epoch": 19.565217391304348,
424
- "eval_loss": 0.7167752981185913,
425
- "eval_runtime": 0.4329,
426
- "eval_samples_per_second": 23.098,
427
- "eval_steps_per_second": 23.098,
428
- "step": 450
429
- },
430
- {
431
- "epoch": 19.565217391304348,
432
- "eval_loss": 0.7563869953155518,
433
- "eval_runtime": 0.3953,
434
- "eval_samples_per_second": 25.297,
435
- "eval_steps_per_second": 25.297,
436
- "step": 450
437
- },
438
- {
439
- "epoch": 19.565217391304348,
440
- "eval_loss": 0.8611674308776855,
441
- "eval_runtime": 0.3948,
442
- "eval_samples_per_second": 25.331,
443
- "eval_steps_per_second": 25.331,
444
- "step": 450
445
- },
446
- {
447
- "epoch": 20.0,
448
- "grad_norm": 2.1191132068634033,
449
- "learning_rate": 2.9996147467351856e-05,
450
- "loss": 0.515,
451
- "step": 460
452
- },
453
- {
454
- "epoch": 20.434782608695652,
455
- "grad_norm": 1.2782835960388184,
456
- "learning_rate": 2.9995930474939773e-05,
457
- "loss": 0.4784,
458
- "step": 470
459
- },
460
- {
461
- "epoch": 20.869565217391305,
462
- "grad_norm": 1.4754245281219482,
463
- "learning_rate": 2.9995707538619975e-05,
464
- "loss": 0.5705,
465
- "step": 480
466
- },
467
- {
468
- "epoch": 21.304347826086957,
469
- "grad_norm": 1.322965383529663,
470
- "learning_rate": 2.9995478658480822e-05,
471
- "loss": 0.5162,
472
- "step": 490
473
- },
474
- {
475
- "epoch": 21.73913043478261,
476
- "grad_norm": 1.2421406507492065,
477
- "learning_rate": 2.9995243834613043e-05,
478
- "loss": 0.5209,
479
- "step": 500
480
- },
481
- {
482
- "epoch": 22.17391304347826,
483
- "grad_norm": 1.7905986309051514,
484
- "learning_rate": 2.9995003067109707e-05,
485
- "loss": 0.4834,
486
- "step": 510
487
- },
488
- {
489
- "epoch": 22.608695652173914,
490
- "grad_norm": 1.5309374332427979,
491
- "learning_rate": 2.9994756356066246e-05,
492
- "loss": 0.5617,
493
- "step": 520
494
- },
495
- {
496
- "epoch": 23.043478260869566,
497
- "grad_norm": 1.7404286861419678,
498
- "learning_rate": 2.999450370158046e-05,
499
- "loss": 0.4927,
500
- "step": 530
501
- },
502
- {
503
- "epoch": 23.47826086956522,
504
- "grad_norm": 1.3119419813156128,
505
- "learning_rate": 2.9994245103752478e-05,
506
- "loss": 0.4384,
507
- "step": 540
508
- },
509
- {
510
- "epoch": 23.91304347826087,
511
- "grad_norm": 1.2318671941757202,
512
- "learning_rate": 2.999398056268481e-05,
513
- "loss": 0.5268,
514
- "step": 550
515
- },
516
- {
517
- "epoch": 24.347826086956523,
518
- "grad_norm": 1.4077017307281494,
519
- "learning_rate": 2.9993710078482306e-05,
520
- "loss": 0.5206,
521
- "step": 560
522
- },
523
- {
524
- "epoch": 24.782608695652176,
525
- "grad_norm": 0.9560300707817078,
526
- "learning_rate": 2.9993433651252185e-05,
527
- "loss": 0.4429,
528
- "step": 570
529
- },
530
- {
531
- "epoch": 25.217391304347824,
532
- "grad_norm": 1.7175792455673218,
533
- "learning_rate": 2.9993151281104006e-05,
534
- "loss": 0.5326,
535
- "step": 580
536
- },
537
- {
538
- "epoch": 25.652173913043477,
539
- "grad_norm": 1.1363499164581299,
540
- "learning_rate": 2.9992862968149695e-05,
541
- "loss": 0.4734,
542
- "step": 590
543
- },
544
- {
545
- "epoch": 26.08695652173913,
546
- "grad_norm": 1.1709671020507812,
547
- "learning_rate": 2.9992568712503533e-05,
548
- "loss": 0.4608,
549
- "step": 600
550
- },
551
- {
552
- "epoch": 26.08695652173913,
553
- "eval_loss": 0.7204815149307251,
554
- "eval_runtime": 0.4403,
555
- "eval_samples_per_second": 22.71,
556
- "eval_steps_per_second": 22.71,
557
- "step": 600
558
- },
559
- {
560
- "epoch": 26.08695652173913,
561
- "eval_loss": 0.8609212040901184,
562
- "eval_runtime": 0.5019,
563
- "eval_samples_per_second": 19.926,
564
- "eval_steps_per_second": 19.926,
565
- "step": 600
566
- },
567
- {
568
- "epoch": 26.08695652173913,
569
- "eval_loss": 0.7204815149307251,
570
- "eval_runtime": 0.5015,
571
- "eval_samples_per_second": 19.942,
572
- "eval_steps_per_second": 19.942,
573
- "step": 600
574
- },
575
- {
576
- "epoch": 26.08695652173913,
577
- "eval_loss": 0.7344802618026733,
578
- "eval_runtime": 0.4734,
579
- "eval_samples_per_second": 21.126,
580
- "eval_steps_per_second": 21.126,
581
- "step": 600
582
- },
583
- {
584
- "epoch": 26.08695652173913,
585
- "eval_loss": 0.8617879748344421,
586
- "eval_runtime": 0.4638,
587
- "eval_samples_per_second": 21.559,
588
- "eval_steps_per_second": 21.559,
589
- "step": 600
590
- },
591
- {
592
- "epoch": 26.52173913043478,
593
- "grad_norm": 1.0739339590072632,
594
- "learning_rate": 2.171901642542767e-06,
595
- "loss": 0.5121,
596
- "step": 610
597
- },
598
- {
599
- "epoch": 26.956521739130434,
600
- "grad_norm": 1.3133119344711304,
601
- "learning_rate": 4.343803285085534e-06,
602
- "loss": 0.4308,
603
- "step": 620
604
- },
605
- {
606
- "epoch": 27.391304347826086,
607
- "grad_norm": 1.1951584815979004,
608
- "learning_rate": 6.5157049276283e-06,
609
- "loss": 0.4829,
610
- "step": 630
611
- },
612
- {
613
- "epoch": 27.82608695652174,
614
- "grad_norm": 1.3615652322769165,
615
- "learning_rate": 8.687606570171068e-06,
616
- "loss": 0.5071,
617
- "step": 640
618
- },
619
- {
620
- "epoch": 28.26086956521739,
621
- "grad_norm": 1.3934813737869263,
622
- "learning_rate": 1.0859508212713834e-05,
623
- "loss": 0.4195,
624
- "step": 650
625
- },
626
- {
627
- "epoch": 28.695652173913043,
628
- "grad_norm": 1.5364168882369995,
629
- "learning_rate": 1.30314098552566e-05,
630
- "loss": 0.5198,
631
- "step": 660
632
- },
633
- {
634
- "epoch": 29.130434782608695,
635
- "grad_norm": 1.291927695274353,
636
- "learning_rate": 1.5203311497799366e-05,
637
- "loss": 0.4474,
638
- "step": 670
639
- },
640
- {
641
- "epoch": 29.565217391304348,
642
- "grad_norm": 1.0491178035736084,
643
- "learning_rate": 1.7375213140342136e-05,
644
- "loss": 0.489,
645
- "step": 680
646
- },
647
- {
648
- "epoch": 30.0,
649
- "grad_norm": 2.413468360900879,
650
- "learning_rate": 1.9547114782884902e-05,
651
- "loss": 0.4297,
652
- "step": 690
653
- },
654
- {
655
- "epoch": 30.434782608695652,
656
- "grad_norm": 1.3009700775146484,
657
- "learning_rate": 2.1719016425427668e-05,
658
- "loss": 0.498,
659
- "step": 700
660
- },
661
- {
662
- "epoch": 30.869565217391305,
663
- "grad_norm": 1.5959917306900024,
664
- "learning_rate": 2.1719014273246623e-05,
665
- "loss": 0.4323,
666
- "step": 710
667
- },
668
- {
669
- "epoch": 31.304347826086957,
670
- "grad_norm": 0.9005913138389587,
671
- "learning_rate": 2.171900781670434e-05,
672
- "loss": 0.4452,
673
- "step": 720
674
- },
675
- {
676
- "epoch": 31.73913043478261,
677
- "grad_norm": 1.5817768573760986,
678
- "learning_rate": 2.1718997055803376e-05,
679
- "loss": 0.5078,
680
- "step": 730
681
- },
682
- {
683
- "epoch": 32.17391304347826,
684
- "grad_norm": 1.16550874710083,
685
- "learning_rate": 2.1718981990548e-05,
686
- "loss": 0.4618,
687
- "step": 740
688
- },
689
- {
690
- "epoch": 32.608695652173914,
691
- "grad_norm": 1.7140876054763794,
692
- "learning_rate": 2.1718962620944183e-05,
693
- "loss": 0.5004,
694
- "step": 750
695
- },
696
- {
697
- "epoch": 32.608695652173914,
698
- "eval_loss": 0.7238079309463501,
699
- "eval_runtime": 0.3947,
700
- "eval_samples_per_second": 25.334,
701
- "eval_steps_per_second": 25.334,
702
- "step": 750
703
- },
704
- {
705
- "epoch": 32.608695652173914,
706
- "eval_loss": 0.8609212040901184,
707
- "eval_runtime": 0.3993,
708
- "eval_samples_per_second": 25.042,
709
- "eval_steps_per_second": 25.042,
710
- "step": 750
711
- },
712
- {
713
- "epoch": 32.608695652173914,
714
- "eval_loss": 0.7238079309463501,
715
- "eval_runtime": 0.389,
716
- "eval_samples_per_second": 25.705,
717
- "eval_steps_per_second": 25.705,
718
- "step": 750
719
- },
720
- {
721
- "epoch": 32.608695652173914,
722
- "eval_loss": 0.729554295539856,
723
- "eval_runtime": 0.3906,
724
- "eval_samples_per_second": 25.605,
725
- "eval_steps_per_second": 25.605,
726
- "step": 750
727
- },
728
- {
729
- "epoch": 32.608695652173914,
730
- "eval_loss": 0.8608381152153015,
731
- "eval_runtime": 0.391,
732
- "eval_samples_per_second": 25.574,
733
- "eval_steps_per_second": 25.574,
734
- "step": 750
735
- },
736
- {
737
- "epoch": 33.04347826086956,
738
- "grad_norm": 1.426628589630127,
739
- "learning_rate": 1.5787846818616295e-06,
740
- "loss": 0.4544,
741
- "step": 760
742
- },
743
- {
744
- "epoch": 33.47826086956522,
745
- "grad_norm": 1.549428105354309,
746
- "learning_rate": 3.157569363723259e-06,
747
- "loss": 0.461,
748
- "step": 770
749
- },
750
- {
751
- "epoch": 33.91304347826087,
752
- "grad_norm": 1.2880816459655762,
753
- "learning_rate": 4.736354045584888e-06,
754
- "loss": 0.4361,
755
- "step": 780
756
- },
757
- {
758
- "epoch": 34.34782608695652,
759
- "grad_norm": 1.3904523849487305,
760
- "learning_rate": 6.315138727446518e-06,
761
- "loss": 0.4367,
762
- "step": 790
763
- },
764
- {
765
- "epoch": 34.78260869565217,
766
- "grad_norm": 1.4110171794891357,
767
- "learning_rate": 7.893923409308147e-06,
768
- "loss": 0.4146,
769
- "step": 800
770
- },
771
- {
772
- "epoch": 35.21739130434783,
773
- "grad_norm": 1.2572520971298218,
774
- "learning_rate": 9.472708091169777e-06,
775
- "loss": 0.515,
776
- "step": 810
777
- },
778
- {
779
- "epoch": 35.65217391304348,
780
- "grad_norm": 1.2433037757873535,
781
- "learning_rate": 1.1051492773031405e-05,
782
- "loss": 0.4429,
783
- "step": 820
784
- },
785
- {
786
- "epoch": 36.08695652173913,
787
- "grad_norm": 1.068438172340393,
788
- "learning_rate": 1.2630277454893036e-05,
789
- "loss": 0.4585,
790
- "step": 830
791
- },
792
- {
793
- "epoch": 36.52173913043478,
794
- "grad_norm": 1.1094129085540771,
795
- "learning_rate": 1.4209062136754664e-05,
796
- "loss": 0.4247,
797
- "step": 840
798
- },
799
- {
800
- "epoch": 36.95652173913044,
801
- "grad_norm": 1.3411928415298462,
802
- "learning_rate": 1.5787846818616294e-05,
803
- "loss": 0.4956,
804
- "step": 850
805
- },
806
- {
807
- "epoch": 37.391304347826086,
808
- "grad_norm": 1.8016853332519531,
809
- "learning_rate": 1.5787845254166782e-05,
810
- "loss": 0.4679,
811
- "step": 860
812
- },
813
- {
814
- "epoch": 37.82608695652174,
815
- "grad_norm": 1.2095096111297607,
816
- "learning_rate": 1.5787840560818862e-05,
817
- "loss": 0.4155,
818
- "step": 870
819
- },
820
- {
821
- "epoch": 38.26086956521739,
822
- "grad_norm": 1.1868607997894287,
823
- "learning_rate": 1.5787832738574396e-05,
824
- "loss": 0.4514,
825
- "step": 880
826
- },
827
- {
828
- "epoch": 38.69565217391305,
829
- "grad_norm": 1.377031922340393,
830
- "learning_rate": 1.5787821787436484e-05,
831
- "loss": 0.4023,
832
- "step": 890
833
- },
834
- {
835
- "epoch": 39.130434782608695,
836
- "grad_norm": 1.88169527053833,
837
- "learning_rate": 1.578780770740947e-05,
838
- "loss": 0.5221,
839
- "step": 900
840
- },
841
- {
842
- "epoch": 39.130434782608695,
843
- "eval_loss": 0.7288335561752319,
844
- "eval_runtime": 0.4014,
845
- "eval_samples_per_second": 24.912,
846
- "eval_steps_per_second": 24.912,
847
- "step": 900
848
- },
849
- {
850
- "epoch": 39.130434782608695,
851
- "eval_loss": 0.8609212040901184,
852
- "eval_runtime": 0.5203,
853
- "eval_samples_per_second": 19.219,
854
- "eval_steps_per_second": 19.219,
855
- "step": 900
856
- },
857
- {
858
- "epoch": 39.130434782608695,
859
- "eval_loss": 0.7288335561752319,
860
- "eval_runtime": 0.5688,
861
- "eval_samples_per_second": 17.58,
862
- "eval_steps_per_second": 17.58,
863
- "step": 900
864
- },
865
- {
866
- "epoch": 39.130434782608695,
867
- "eval_loss": 0.7238188982009888,
868
- "eval_runtime": 0.4689,
869
- "eval_samples_per_second": 21.324,
870
- "eval_steps_per_second": 21.324,
871
- "step": 900
872
- },
873
- {
874
- "epoch": 39.130434782608695,
875
- "eval_loss": 0.861186146736145,
876
- "eval_runtime": 0.3946,
877
- "eval_samples_per_second": 25.341,
878
- "eval_steps_per_second": 25.341,
879
- "step": 900
880
- },
881
- {
882
- "epoch": 39.56521739130435,
883
- "grad_norm": 1.403578758239746,
884
- "learning_rate": 1.129275730490657e-06,
885
- "loss": 0.4353,
886
- "step": 910
887
- },
888
- {
889
- "epoch": 40.0,
890
- "grad_norm": 2.7664737701416016,
891
- "learning_rate": 2.258551460981314e-06,
892
- "loss": 0.4364,
893
- "step": 920
894
- },
895
- {
896
- "epoch": 40.43478260869565,
897
- "grad_norm": 1.4606038331985474,
898
- "learning_rate": 3.3878271914719706e-06,
899
- "loss": 0.5149,
900
- "step": 930
901
- },
902
- {
903
- "epoch": 40.869565217391305,
904
- "grad_norm": 1.4345122575759888,
905
- "learning_rate": 4.517102921962628e-06,
906
- "loss": 0.3915,
907
- "step": 940
908
- },
909
- {
910
- "epoch": 41.30434782608695,
911
- "grad_norm": 1.9893536567687988,
912
- "learning_rate": 5.6463786524532845e-06,
913
- "loss": 0.4036,
914
- "step": 950
915
- },
916
- {
917
- "epoch": 41.73913043478261,
918
- "grad_norm": 1.3536087274551392,
919
- "learning_rate": 6.775654382943941e-06,
920
- "loss": 0.4227,
921
- "step": 960
922
- },
923
- {
924
- "epoch": 42.17391304347826,
925
- "grad_norm": 1.3940178155899048,
926
- "learning_rate": 7.904930113434598e-06,
927
- "loss": 0.5038,
928
- "step": 970
929
- },
930
- {
931
- "epoch": 42.608695652173914,
932
- "grad_norm": 1.3348439931869507,
933
- "learning_rate": 9.034205843925256e-06,
934
- "loss": 0.4424,
935
- "step": 980
936
- },
937
- {
938
- "epoch": 43.04347826086956,
939
- "grad_norm": 1.4961334466934204,
940
- "learning_rate": 1.0163481574415913e-05,
941
- "loss": 0.4336,
942
- "step": 990
943
- },
944
- {
945
- "epoch": 43.47826086956522,
946
- "grad_norm": 1.2589102983474731,
947
- "learning_rate": 1.1292757304906569e-05,
948
- "loss": 0.4641,
949
- "step": 1000
950
- },
951
- {
952
- "epoch": 43.91304347826087,
953
- "grad_norm": 1.1131060123443604,
954
- "learning_rate": 1.1292756185884522e-05,
955
- "loss": 0.4143,
956
- "step": 1010
957
- },
958
- {
959
- "epoch": 44.34782608695652,
960
- "grad_norm": 1.8933826684951782,
961
- "learning_rate": 1.1292752828818823e-05,
962
- "loss": 0.4697,
963
- "step": 1020
964
- },
965
- {
966
- "epoch": 44.78260869565217,
967
- "grad_norm": 1.4995206594467163,
968
- "learning_rate": 1.1292747233710805e-05,
969
- "loss": 0.4582,
970
- "step": 1030
971
- },
972
- {
973
- "epoch": 45.21739130434783,
974
- "grad_norm": 1.1720516681671143,
975
- "learning_rate": 1.1292739400562683e-05,
976
- "loss": 0.3298,
977
- "step": 1040
978
- },
979
- {
980
- "epoch": 45.65217391304348,
981
- "grad_norm": 1.5783675909042358,
982
- "learning_rate": 1.1292729329377566e-05,
983
- "loss": 0.4415,
984
- "step": 1050
985
- },
986
- {
987
- "epoch": 45.65217391304348,
988
- "eval_loss": 0.7320327758789062,
989
- "eval_runtime": 0.5007,
990
- "eval_samples_per_second": 19.973,
991
- "eval_steps_per_second": 19.973,
992
- "step": 1050
993
- },
994
- {
995
- "epoch": 45.65217391304348,
996
- "eval_loss": 0.8609212040901184,
997
- "eval_runtime": 0.425,
998
- "eval_samples_per_second": 23.532,
999
- "eval_steps_per_second": 23.532,
1000
- "step": 1050
1001
- },
1002
- {
1003
- "epoch": 45.65217391304348,
1004
- "eval_loss": 0.7320327758789062,
1005
- "eval_runtime": 0.4019,
1006
- "eval_samples_per_second": 24.883,
1007
- "eval_steps_per_second": 24.883,
1008
- "step": 1050
1009
- },
1010
- {
1011
- "epoch": 45.65217391304348,
1012
- "eval_loss": 0.7223752737045288,
1013
- "eval_runtime": 0.4083,
1014
- "eval_samples_per_second": 24.492,
1015
- "eval_steps_per_second": 24.492,
1016
- "step": 1050
1017
- },
1018
- {
1019
- "epoch": 45.65217391304348,
1020
- "eval_loss": 0.8609784245491028,
1021
- "eval_runtime": 0.4101,
1022
- "eval_samples_per_second": 24.385,
1023
- "eval_steps_per_second": 24.385,
1024
- "step": 1050
1025
  }
1026
  ],
1027
  "logging_steps": 10,
@@ -1041,7 +171,7 @@
1041
  "attributes": {}
1042
  }
1043
  },
1044
- "total_flos": 2.696323352857805e+16,
1045
  "train_batch_size": 4,
1046
  "trial_name": null,
1047
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7960879802703857,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 6.521739130434782,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.43478260869565216,
13
+ "grad_norm": 1.502160668373108,
14
  "learning_rate": 3e-06,
15
  "loss": 0.906,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.8695652173913043,
20
+ "grad_norm": 1.6870523691177368,
21
  "learning_rate": 6e-06,
22
+ "loss": 0.9023,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.3043478260869565,
27
+ "grad_norm": 1.7296977043151855,
28
  "learning_rate": 9e-06,
29
+ "loss": 0.9002,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.7391304347826086,
34
+ "grad_norm": 1.4458173513412476,
35
  "learning_rate": 1.2e-05,
36
+ "loss": 0.9095,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 2.1739130434782608,
41
+ "grad_norm": 1.351696252822876,
42
  "learning_rate": 1.5e-05,
43
+ "loss": 0.8362,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 2.608695652173913,
48
+ "grad_norm": 2.046602964401245,
49
  "learning_rate": 1.8e-05,
50
+ "loss": 0.8897,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 3.0434782608695654,
55
+ "grad_norm": 1.4842596054077148,
56
  "learning_rate": 2.1e-05,
57
+ "loss": 0.8916,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 3.4782608695652173,
62
+ "grad_norm": 1.7293957471847534,
63
  "learning_rate": 2.4e-05,
64
+ "loss": 0.8233,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 3.9130434782608696,
69
+ "grad_norm": 1.4240052700042725,
70
  "learning_rate": 2.7000000000000002e-05,
71
+ "loss": 0.8529,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 4.3478260869565215,
76
+ "grad_norm": 1.3658534288406372,
77
  "learning_rate": 3e-05,
78
+ "loss": 0.8646,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 4.782608695652174,
83
+ "grad_norm": 2.2143099308013916,
84
  "learning_rate": 2.999999702723963e-05,
85
+ "loss": 0.8225,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 5.217391304347826,
90
+ "grad_norm": 1.0725128650665283,
91
  "learning_rate": 2.9999988108959687e-05,
92
+ "loss": 0.7653,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 5.6521739130434785,
97
+ "grad_norm": 1.5600417852401733,
98
  "learning_rate": 2.9999973245163716e-05,
99
  "loss": 0.7415,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 6.086956521739131,
104
+ "grad_norm": 1.907906413078308,
105
  "learning_rate": 2.99999524358576e-05,
106
+ "loss": 0.7656,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 6.521739130434782,
111
+ "grad_norm": 1.121804118156433,
112
  "learning_rate": 2.9999925681049593e-05,
113
+ "loss": 0.7858,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 6.521739130434782,
118
+ "eval_loss": 0.7960879802703857,
119
  "eval_runtime": 0.4775,
120
+ "eval_samples_per_second": 20.942,
121
+ "eval_steps_per_second": 20.942,
122
  "step": 150
123
  },
124
  {
125
  "epoch": 6.521739130434782,
126
+ "eval_loss": 0.8616224527359009,
127
+ "eval_runtime": 0.3994,
128
+ "eval_samples_per_second": 25.038,
129
+ "eval_steps_per_second": 25.038,
130
  "step": 150
131
  },
132
  {
133
  "epoch": 6.521739130434782,
134
+ "eval_loss": 0.7960879802703857,
135
+ "eval_runtime": 0.3973,
136
+ "eval_samples_per_second": 25.173,
137
+ "eval_steps_per_second": 25.173,
138
  "step": 150
139
  },
140
  {
141
  "epoch": 6.521739130434782,
142
+ "eval_loss": 0.8616224527359009,
143
+ "eval_runtime": 0.4031,
144
+ "eval_samples_per_second": 24.805,
145
+ "eval_steps_per_second": 24.805,
146
  "step": 150
147
  },
148
  {
149
  "epoch": 6.521739130434782,
150
+ "eval_loss": 0.8619683384895325,
151
+ "eval_runtime": 0.4079,
152
+ "eval_samples_per_second": 24.517,
153
+ "eval_steps_per_second": 24.517,
154
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  }
156
  ],
157
  "logging_steps": 10,
 
171
  "attributes": {}
172
  }
173
  },
174
+ "total_flos": 3894839614291968.0,
175
  "train_batch_size": 4,
176
  "trial_name": null,
177
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1943b73dd12e0d46a3b26b62916dc0f34ed8d7e7a1a2985c54b619124519181f
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c27b0feffa55e9783adeb1945da4877c05b7d99c3eb25293fa4481c312fbc7a4
3
  size 5368