augustocsc commited on
Commit
4492e48
·
1 Parent(s): 40db8d8

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:029e9c8894a0c96fd6ca8ba665ad5db158d348035368b515dc3449fb070a1938
3
  size 995654149
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc0123cbba4f38d5cdfff68f2a0e41ff0ec9e6991bb31ff96c249568367da7b
3
  size 995654149
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24f36f4b3e1bdd52cefc9ab7244f3b0b3e4f8b337303c7710a78bb1832f67065
3
  size 497813341
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dacf0a88a737dae9876a70e69ef790c67063ab7f11fc9897f2c2b60fd4134c6
3
  size 497813341
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f36ca5cda8e8172589bec87116d5ef3a453d4e3cf3bb38ad7f51df24bf09b612
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a207e92cacead3ab8860e639b0b71073ac9b7006ea3c6bb5a174842a663984e
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:382a13ddd5c3186fd4cdb4e299711ecb1645e8173bbba27863f35186c104aed9
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:889074ce496d0433f5d3aaa744ef53c746b1953cf83f080c44d924a85d012261
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,1319 +1,71 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 200,
6
- "global_step": 25195,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
- "eval_loss": 0.2690996825695038,
14
- "eval_runtime": 151.5792,
15
- "eval_samples_per_second": 945.618,
16
- "eval_steps_per_second": 3.694,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.08,
21
- "eval_loss": 1.5648741722106934,
22
- "eval_runtime": 151.1541,
23
- "eval_samples_per_second": 948.277,
24
- "eval_steps_per_second": 3.705,
25
  "step": 400
26
  },
27
  {
28
  "epoch": 0.1,
29
- "learning_rate": 4.9007739630879147e-05,
30
- "loss": 0.7894,
31
  "step": 500
32
  },
33
  {
34
  "epoch": 0.12,
35
- "eval_loss": 0.40906086564064026,
36
- "eval_runtime": 151.1687,
37
- "eval_samples_per_second": 948.185,
38
- "eval_steps_per_second": 3.704,
39
  "step": 600
40
  },
41
  {
42
  "epoch": 0.16,
43
- "eval_loss": 0.4954818785190582,
44
- "eval_runtime": 151.1164,
45
- "eval_samples_per_second": 948.514,
46
- "eval_steps_per_second": 3.706,
47
  "step": 800
48
  },
49
  {
50
  "epoch": 0.2,
51
- "learning_rate": 4.801547926175829e-05,
52
- "loss": 0.7504,
53
  "step": 1000
54
  },
55
  {
56
  "epoch": 0.2,
57
- "eval_loss": 0.5256636738777161,
58
- "eval_runtime": 151.0868,
59
- "eval_samples_per_second": 948.7,
60
- "eval_steps_per_second": 3.706,
61
- "step": 1000
62
- },
63
- {
64
- "epoch": 0.24,
65
- "eval_loss": 0.6123775243759155,
66
- "eval_runtime": 151.1133,
67
- "eval_samples_per_second": 948.533,
68
- "eval_steps_per_second": 3.706,
69
- "step": 1200
70
- },
71
- {
72
- "epoch": 0.28,
73
- "eval_loss": 0.6223951578140259,
74
- "eval_runtime": 151.141,
75
- "eval_samples_per_second": 948.36,
76
- "eval_steps_per_second": 3.705,
77
- "step": 1400
78
- },
79
- {
80
- "epoch": 0.3,
81
- "learning_rate": 4.7023218892637435e-05,
82
- "loss": 0.7901,
83
- "step": 1500
84
- },
85
- {
86
- "epoch": 0.32,
87
- "eval_loss": 0.5156298279762268,
88
- "eval_runtime": 151.1176,
89
- "eval_samples_per_second": 948.506,
90
- "eval_steps_per_second": 3.706,
91
- "step": 1600
92
- },
93
- {
94
- "epoch": 0.36,
95
- "eval_loss": 0.7702064514160156,
96
- "eval_runtime": 151.1572,
97
- "eval_samples_per_second": 948.258,
98
- "eval_steps_per_second": 3.705,
99
- "step": 1800
100
- },
101
- {
102
- "epoch": 0.4,
103
- "learning_rate": 4.603095852351657e-05,
104
- "loss": 0.7633,
105
- "step": 2000
106
- },
107
- {
108
- "epoch": 0.4,
109
- "eval_loss": 0.8383969664573669,
110
- "eval_runtime": 151.2044,
111
- "eval_samples_per_second": 947.962,
112
- "eval_steps_per_second": 3.704,
113
- "step": 2000
114
- },
115
- {
116
- "epoch": 0.44,
117
- "eval_loss": 0.7715063691139221,
118
- "eval_runtime": 151.1515,
119
- "eval_samples_per_second": 948.294,
120
- "eval_steps_per_second": 3.705,
121
- "step": 2200
122
- },
123
- {
124
- "epoch": 0.48,
125
- "eval_loss": 0.8304912447929382,
126
- "eval_runtime": 151.1644,
127
- "eval_samples_per_second": 948.213,
128
- "eval_steps_per_second": 3.705,
129
- "step": 2400
130
- },
131
- {
132
- "epoch": 0.5,
133
- "learning_rate": 4.5038698154395716e-05,
134
- "loss": 0.829,
135
- "step": 2500
136
- },
137
- {
138
- "epoch": 0.52,
139
- "eval_loss": 0.8085972666740417,
140
- "eval_runtime": 151.1877,
141
- "eval_samples_per_second": 948.066,
142
- "eval_steps_per_second": 3.704,
143
- "step": 2600
144
- },
145
- {
146
- "epoch": 0.56,
147
- "eval_loss": 0.9329496026039124,
148
- "eval_runtime": 151.1826,
149
- "eval_samples_per_second": 948.098,
150
- "eval_steps_per_second": 3.704,
151
- "step": 2800
152
- },
153
- {
154
- "epoch": 0.6,
155
- "learning_rate": 4.404643778527486e-05,
156
- "loss": 0.829,
157
- "step": 3000
158
- },
159
- {
160
- "epoch": 0.6,
161
- "eval_loss": 0.8683816194534302,
162
- "eval_runtime": 151.1074,
163
- "eval_samples_per_second": 948.571,
164
- "eval_steps_per_second": 3.706,
165
- "step": 3000
166
- },
167
- {
168
- "epoch": 0.64,
169
- "eval_loss": 0.9232978820800781,
170
- "eval_runtime": 151.3223,
171
- "eval_samples_per_second": 947.223,
172
- "eval_steps_per_second": 3.701,
173
- "step": 3200
174
- },
175
- {
176
- "epoch": 0.67,
177
- "eval_loss": 0.8544695973396301,
178
- "eval_runtime": 151.1372,
179
- "eval_samples_per_second": 948.383,
180
- "eval_steps_per_second": 3.705,
181
- "step": 3400
182
- },
183
- {
184
- "epoch": 0.69,
185
- "learning_rate": 4.3054177416154005e-05,
186
- "loss": 0.9221,
187
- "step": 3500
188
- },
189
- {
190
- "epoch": 0.71,
191
- "eval_loss": 0.9146988987922668,
192
- "eval_runtime": 151.1788,
193
- "eval_samples_per_second": 948.122,
194
- "eval_steps_per_second": 3.704,
195
- "step": 3600
196
- },
197
- {
198
- "epoch": 0.75,
199
- "eval_loss": 0.8903905153274536,
200
- "eval_runtime": 151.1333,
201
- "eval_samples_per_second": 948.408,
202
- "eval_steps_per_second": 3.705,
203
- "step": 3800
204
- },
205
- {
206
- "epoch": 0.79,
207
- "learning_rate": 4.206191704703314e-05,
208
- "loss": 0.9301,
209
- "step": 4000
210
- },
211
- {
212
- "epoch": 0.79,
213
- "eval_loss": 0.7672401666641235,
214
- "eval_runtime": 151.0676,
215
- "eval_samples_per_second": 948.821,
216
- "eval_steps_per_second": 3.707,
217
- "step": 4000
218
- },
219
- {
220
- "epoch": 0.83,
221
- "eval_loss": 0.9532115459442139,
222
- "eval_runtime": 151.0924,
223
- "eval_samples_per_second": 948.665,
224
- "eval_steps_per_second": 3.706,
225
- "step": 4200
226
- },
227
- {
228
- "epoch": 0.87,
229
- "eval_loss": 0.25937971472740173,
230
- "eval_runtime": 151.2151,
231
- "eval_samples_per_second": 947.895,
232
- "eval_steps_per_second": 3.703,
233
- "step": 4400
234
- },
235
- {
236
- "epoch": 0.89,
237
- "learning_rate": 4.1069656677912286e-05,
238
- "loss": 1.5584,
239
- "step": 4500
240
- },
241
- {
242
- "epoch": 0.91,
243
- "eval_loss": 0.13394613564014435,
244
- "eval_runtime": 151.3943,
245
- "eval_samples_per_second": 946.773,
246
- "eval_steps_per_second": 3.699,
247
- "step": 4600
248
- },
249
- {
250
- "epoch": 0.95,
251
- "eval_loss": 0.1273290514945984,
252
- "eval_runtime": 150.9105,
253
- "eval_samples_per_second": 949.808,
254
- "eval_steps_per_second": 3.711,
255
- "step": 4800
256
- },
257
- {
258
- "epoch": 0.99,
259
- "learning_rate": 4.0077396308791423e-05,
260
- "loss": 0.135,
261
- "step": 5000
262
- },
263
- {
264
- "epoch": 0.99,
265
- "eval_loss": 0.11838869005441666,
266
- "eval_runtime": 150.9534,
267
- "eval_samples_per_second": 949.538,
268
- "eval_steps_per_second": 3.71,
269
- "step": 5000
270
- },
271
- {
272
- "epoch": 1.03,
273
- "eval_loss": 0.11631189286708832,
274
- "eval_runtime": 150.9499,
275
- "eval_samples_per_second": 949.56,
276
- "eval_steps_per_second": 3.71,
277
- "step": 5200
278
- },
279
- {
280
- "epoch": 1.07,
281
- "eval_loss": 0.11566050350666046,
282
- "eval_runtime": 151.0546,
283
- "eval_samples_per_second": 948.902,
284
- "eval_steps_per_second": 3.707,
285
- "step": 5400
286
- },
287
- {
288
- "epoch": 1.09,
289
- "learning_rate": 3.908513593967057e-05,
290
- "loss": 0.1189,
291
- "step": 5500
292
- },
293
- {
294
- "epoch": 1.11,
295
- "eval_loss": 0.1144029051065445,
296
- "eval_runtime": 151.022,
297
- "eval_samples_per_second": 949.107,
298
- "eval_steps_per_second": 3.708,
299
- "step": 5600
300
- },
301
- {
302
- "epoch": 1.15,
303
- "eval_loss": 0.11364156752824783,
304
- "eval_runtime": 151.0116,
305
- "eval_samples_per_second": 949.172,
306
- "eval_steps_per_second": 3.708,
307
- "step": 5800
308
- },
309
- {
310
- "epoch": 1.19,
311
- "learning_rate": 3.809287557054971e-05,
312
- "loss": 0.1162,
313
- "step": 6000
314
- },
315
- {
316
- "epoch": 1.19,
317
- "eval_loss": 0.113379567861557,
318
- "eval_runtime": 151.0775,
319
- "eval_samples_per_second": 948.758,
320
- "eval_steps_per_second": 3.707,
321
- "step": 6000
322
- },
323
- {
324
- "epoch": 1.23,
325
- "eval_loss": 0.11304181814193726,
326
- "eval_runtime": 151.1881,
327
- "eval_samples_per_second": 948.064,
328
- "eval_steps_per_second": 3.704,
329
- "step": 6200
330
- },
331
- {
332
- "epoch": 1.27,
333
- "eval_loss": 0.11305490881204605,
334
- "eval_runtime": 151.0078,
335
- "eval_samples_per_second": 949.196,
336
- "eval_steps_per_second": 3.708,
337
- "step": 6400
338
- },
339
- {
340
- "epoch": 1.29,
341
- "learning_rate": 3.7100615201428856e-05,
342
- "loss": 0.1165,
343
- "step": 6500
344
- },
345
- {
346
- "epoch": 1.31,
347
- "eval_loss": 0.11227019131183624,
348
- "eval_runtime": 151.0464,
349
- "eval_samples_per_second": 948.953,
350
- "eval_steps_per_second": 3.707,
351
- "step": 6600
352
- },
353
- {
354
- "epoch": 1.35,
355
- "eval_loss": 0.11202485859394073,
356
- "eval_runtime": 150.9714,
357
- "eval_samples_per_second": 949.425,
358
  "eval_steps_per_second": 3.709,
359
- "step": 6800
360
- },
361
- {
362
- "epoch": 1.39,
363
- "learning_rate": 3.6108354832308e-05,
364
- "loss": 0.1136,
365
- "step": 7000
366
- },
367
- {
368
- "epoch": 1.39,
369
- "eval_loss": 0.11163973808288574,
370
- "eval_runtime": 151.0638,
371
- "eval_samples_per_second": 948.844,
372
- "eval_steps_per_second": 3.707,
373
- "step": 7000
374
- },
375
- {
376
- "epoch": 1.43,
377
- "eval_loss": 0.11151301115751266,
378
- "eval_runtime": 151.052,
379
- "eval_samples_per_second": 948.918,
380
- "eval_steps_per_second": 3.707,
381
- "step": 7200
382
- },
383
- {
384
- "epoch": 1.47,
385
- "eval_loss": 0.11108649522066116,
386
- "eval_runtime": 151.035,
387
- "eval_samples_per_second": 949.025,
388
- "eval_steps_per_second": 3.708,
389
- "step": 7400
390
- },
391
- {
392
- "epoch": 1.49,
393
- "learning_rate": 3.5116094463187144e-05,
394
- "loss": 0.1124,
395
- "step": 7500
396
- },
397
- {
398
- "epoch": 1.51,
399
- "eval_loss": 0.11082353442907333,
400
- "eval_runtime": 150.9886,
401
- "eval_samples_per_second": 949.317,
402
- "eval_steps_per_second": 3.709,
403
- "step": 7600
404
- },
405
- {
406
- "epoch": 1.55,
407
- "eval_loss": 0.11104652285575867,
408
- "eval_runtime": 150.9566,
409
- "eval_samples_per_second": 949.518,
410
- "eval_steps_per_second": 3.71,
411
- "step": 7800
412
- },
413
- {
414
- "epoch": 1.59,
415
- "learning_rate": 3.412383409406629e-05,
416
- "loss": 0.1128,
417
- "step": 8000
418
- },
419
- {
420
- "epoch": 1.59,
421
- "eval_loss": 0.11066918820142746,
422
- "eval_runtime": 150.949,
423
- "eval_samples_per_second": 949.566,
424
- "eval_steps_per_second": 3.71,
425
- "step": 8000
426
- },
427
- {
428
- "epoch": 1.63,
429
- "eval_loss": 0.11038321256637573,
430
- "eval_runtime": 151.0411,
431
- "eval_samples_per_second": 948.987,
432
- "eval_steps_per_second": 3.708,
433
- "step": 8200
434
- },
435
- {
436
- "epoch": 1.67,
437
- "eval_loss": 0.11011925339698792,
438
- "eval_runtime": 151.052,
439
- "eval_samples_per_second": 948.918,
440
- "eval_steps_per_second": 3.707,
441
- "step": 8400
442
- },
443
- {
444
- "epoch": 1.69,
445
- "learning_rate": 3.3131573724945426e-05,
446
- "loss": 0.1114,
447
- "step": 8500
448
- },
449
- {
450
- "epoch": 1.71,
451
- "eval_loss": 0.11026579141616821,
452
- "eval_runtime": 151.2041,
453
- "eval_samples_per_second": 947.963,
454
- "eval_steps_per_second": 3.704,
455
- "step": 8600
456
- },
457
- {
458
- "epoch": 1.75,
459
- "eval_loss": 0.10981647670269012,
460
- "eval_runtime": 151.0892,
461
- "eval_samples_per_second": 948.685,
462
- "eval_steps_per_second": 3.706,
463
- "step": 8800
464
- },
465
- {
466
- "epoch": 1.79,
467
- "learning_rate": 3.213931335582457e-05,
468
- "loss": 0.1094,
469
- "step": 9000
470
- },
471
- {
472
- "epoch": 1.79,
473
- "eval_loss": 0.11008855700492859,
474
- "eval_runtime": 150.9863,
475
- "eval_samples_per_second": 949.331,
476
- "eval_steps_per_second": 3.709,
477
- "step": 9000
478
- },
479
- {
480
- "epoch": 1.83,
481
- "eval_loss": 0.10983282327651978,
482
- "eval_runtime": 150.9699,
483
- "eval_samples_per_second": 949.434,
484
- "eval_steps_per_second": 3.709,
485
- "step": 9200
486
- },
487
- {
488
- "epoch": 1.87,
489
- "eval_loss": 0.10982002317905426,
490
- "eval_runtime": 151.0614,
491
- "eval_samples_per_second": 948.859,
492
- "eval_steps_per_second": 3.707,
493
- "step": 9400
494
- },
495
- {
496
- "epoch": 1.89,
497
- "learning_rate": 3.1147052986703714e-05,
498
- "loss": 0.111,
499
- "step": 9500
500
- },
501
- {
502
- "epoch": 1.91,
503
- "eval_loss": 0.11063655465841293,
504
- "eval_runtime": 151.0974,
505
- "eval_samples_per_second": 948.633,
506
- "eval_steps_per_second": 3.706,
507
- "step": 9600
508
- },
509
- {
510
- "epoch": 1.94,
511
- "eval_loss": 0.10998239368200302,
512
- "eval_runtime": 150.9779,
513
- "eval_samples_per_second": 949.384,
514
- "eval_steps_per_second": 3.709,
515
- "step": 9800
516
- },
517
- {
518
- "epoch": 1.98,
519
- "learning_rate": 3.0154792617582855e-05,
520
- "loss": 0.1118,
521
- "step": 10000
522
- },
523
- {
524
- "epoch": 1.98,
525
- "eval_loss": 0.10957028716802597,
526
- "eval_runtime": 151.0796,
527
- "eval_samples_per_second": 948.745,
528
- "eval_steps_per_second": 3.707,
529
- "step": 10000
530
- },
531
- {
532
- "epoch": 2.02,
533
- "eval_loss": 0.10959649085998535,
534
- "eval_runtime": 150.9465,
535
- "eval_samples_per_second": 949.582,
536
- "eval_steps_per_second": 3.71,
537
- "step": 10200
538
- },
539
- {
540
- "epoch": 2.06,
541
- "eval_loss": 0.10964351147413254,
542
- "eval_runtime": 151.0871,
543
- "eval_samples_per_second": 948.698,
544
- "eval_steps_per_second": 3.706,
545
- "step": 10400
546
- },
547
- {
548
- "epoch": 2.08,
549
- "learning_rate": 2.9162532248462e-05,
550
- "loss": 0.1102,
551
- "step": 10500
552
- },
553
- {
554
- "epoch": 2.1,
555
- "eval_loss": 0.1093754768371582,
556
- "eval_runtime": 150.92,
557
- "eval_samples_per_second": 949.748,
558
- "eval_steps_per_second": 3.711,
559
- "step": 10600
560
- },
561
- {
562
- "epoch": 2.14,
563
- "eval_loss": 0.10980819910764694,
564
- "eval_runtime": 151.0665,
565
- "eval_samples_per_second": 948.827,
566
- "eval_steps_per_second": 3.707,
567
- "step": 10800
568
- },
569
- {
570
- "epoch": 2.18,
571
- "learning_rate": 2.8170271879341143e-05,
572
- "loss": 0.1083,
573
- "step": 11000
574
- },
575
- {
576
- "epoch": 2.18,
577
- "eval_loss": 0.10928498953580856,
578
- "eval_runtime": 151.0595,
579
- "eval_samples_per_second": 948.871,
580
- "eval_steps_per_second": 3.707,
581
- "step": 11000
582
- },
583
- {
584
- "epoch": 2.22,
585
- "eval_loss": 0.10924935340881348,
586
- "eval_runtime": 151.0906,
587
- "eval_samples_per_second": 948.676,
588
- "eval_steps_per_second": 3.706,
589
- "step": 11200
590
- },
591
- {
592
- "epoch": 2.26,
593
- "eval_loss": 0.10928213596343994,
594
- "eval_runtime": 151.0496,
595
- "eval_samples_per_second": 948.933,
596
- "eval_steps_per_second": 3.707,
597
- "step": 11400
598
- },
599
- {
600
- "epoch": 2.28,
601
- "learning_rate": 2.717801151022028e-05,
602
- "loss": 0.1113,
603
- "step": 11500
604
- },
605
- {
606
- "epoch": 2.3,
607
- "eval_loss": 0.10921794176101685,
608
- "eval_runtime": 151.0887,
609
- "eval_samples_per_second": 948.688,
610
- "eval_steps_per_second": 3.706,
611
- "step": 11600
612
- },
613
- {
614
- "epoch": 2.34,
615
- "eval_loss": 0.1091688945889473,
616
- "eval_runtime": 151.1277,
617
- "eval_samples_per_second": 948.443,
618
- "eval_steps_per_second": 3.705,
619
- "step": 11800
620
- },
621
- {
622
- "epoch": 2.38,
623
- "learning_rate": 2.6185751141099424e-05,
624
- "loss": 0.1102,
625
- "step": 12000
626
- },
627
- {
628
- "epoch": 2.38,
629
- "eval_loss": 0.10950697958469391,
630
- "eval_runtime": 151.1121,
631
- "eval_samples_per_second": 948.541,
632
- "eval_steps_per_second": 3.706,
633
- "step": 12000
634
- },
635
- {
636
- "epoch": 2.42,
637
- "eval_loss": 0.10911945253610611,
638
- "eval_runtime": 151.0456,
639
- "eval_samples_per_second": 948.958,
640
- "eval_steps_per_second": 3.707,
641
- "step": 12200
642
- },
643
- {
644
- "epoch": 2.46,
645
- "eval_loss": 0.1094910129904747,
646
- "eval_runtime": 150.9865,
647
- "eval_samples_per_second": 949.33,
648
- "eval_steps_per_second": 3.709,
649
- "step": 12400
650
- },
651
- {
652
- "epoch": 2.48,
653
- "learning_rate": 2.519349077197857e-05,
654
- "loss": 0.1093,
655
- "step": 12500
656
- },
657
- {
658
- "epoch": 2.5,
659
- "eval_loss": 0.10911854356527328,
660
- "eval_runtime": 150.8255,
661
- "eval_samples_per_second": 950.343,
662
- "eval_steps_per_second": 3.713,
663
- "step": 12600
664
- },
665
- {
666
- "epoch": 2.54,
667
- "eval_loss": 0.1091616079211235,
668
- "eval_runtime": 150.8284,
669
- "eval_samples_per_second": 950.325,
670
- "eval_steps_per_second": 3.713,
671
- "step": 12800
672
- },
673
- {
674
- "epoch": 2.58,
675
- "learning_rate": 2.420123040285771e-05,
676
- "loss": 0.1105,
677
- "step": 13000
678
- },
679
- {
680
- "epoch": 2.58,
681
- "eval_loss": 0.10907712578773499,
682
- "eval_runtime": 151.0685,
683
- "eval_samples_per_second": 948.815,
684
- "eval_steps_per_second": 3.707,
685
- "step": 13000
686
- },
687
- {
688
- "epoch": 2.62,
689
- "eval_loss": 0.10917963832616806,
690
- "eval_runtime": 150.9953,
691
- "eval_samples_per_second": 949.274,
692
- "eval_steps_per_second": 3.709,
693
- "step": 13200
694
- },
695
- {
696
- "epoch": 2.66,
697
- "eval_loss": 0.10913284868001938,
698
- "eval_runtime": 150.8911,
699
- "eval_samples_per_second": 949.93,
700
- "eval_steps_per_second": 3.711,
701
- "step": 13400
702
- },
703
- {
704
- "epoch": 2.68,
705
- "learning_rate": 2.3208970033736854e-05,
706
- "loss": 0.1094,
707
- "step": 13500
708
- },
709
- {
710
- "epoch": 2.7,
711
- "eval_loss": 0.10903245210647583,
712
- "eval_runtime": 150.9858,
713
- "eval_samples_per_second": 949.334,
714
- "eval_steps_per_second": 3.709,
715
- "step": 13600
716
- },
717
- {
718
- "epoch": 2.74,
719
- "eval_loss": 0.10889765620231628,
720
- "eval_runtime": 150.9473,
721
- "eval_samples_per_second": 949.577,
722
- "eval_steps_per_second": 3.71,
723
- "step": 13800
724
- },
725
- {
726
- "epoch": 2.78,
727
- "learning_rate": 2.2216709664615994e-05,
728
- "loss": 0.1104,
729
- "step": 14000
730
- },
731
- {
732
- "epoch": 2.78,
733
- "eval_loss": 0.1090923622250557,
734
- "eval_runtime": 151.128,
735
- "eval_samples_per_second": 948.441,
736
- "eval_steps_per_second": 3.705,
737
- "step": 14000
738
- },
739
- {
740
- "epoch": 2.82,
741
- "eval_loss": 0.10899552702903748,
742
- "eval_runtime": 151.5342,
743
- "eval_samples_per_second": 945.899,
744
- "eval_steps_per_second": 3.696,
745
- "step": 14200
746
- },
747
- {
748
- "epoch": 2.86,
749
- "eval_loss": 0.10895191133022308,
750
- "eval_runtime": 150.9811,
751
- "eval_samples_per_second": 949.364,
752
- "eval_steps_per_second": 3.709,
753
- "step": 14400
754
- },
755
- {
756
- "epoch": 2.88,
757
- "learning_rate": 2.122444929549514e-05,
758
- "loss": 0.1117,
759
- "step": 14500
760
- },
761
- {
762
- "epoch": 2.9,
763
- "eval_loss": 0.10904989391565323,
764
- "eval_runtime": 150.9339,
765
- "eval_samples_per_second": 949.661,
766
- "eval_steps_per_second": 3.71,
767
- "step": 14600
768
- },
769
- {
770
- "epoch": 2.94,
771
- "eval_loss": 0.10929518938064575,
772
- "eval_runtime": 150.8669,
773
- "eval_samples_per_second": 950.082,
774
- "eval_steps_per_second": 3.712,
775
- "step": 14800
776
- },
777
- {
778
- "epoch": 2.98,
779
- "learning_rate": 2.0232188926374283e-05,
780
- "loss": 0.1093,
781
- "step": 15000
782
- },
783
- {
784
- "epoch": 2.98,
785
- "eval_loss": 0.10902168601751328,
786
- "eval_runtime": 151.0426,
787
- "eval_samples_per_second": 948.977,
788
- "eval_steps_per_second": 3.708,
789
- "step": 15000
790
- },
791
- {
792
- "epoch": 3.02,
793
- "eval_loss": 0.10888814926147461,
794
- "eval_runtime": 150.8829,
795
- "eval_samples_per_second": 949.982,
796
- "eval_steps_per_second": 3.711,
797
- "step": 15200
798
- },
799
- {
800
- "epoch": 3.06,
801
- "eval_loss": 0.1088201105594635,
802
- "eval_runtime": 150.8765,
803
- "eval_samples_per_second": 950.022,
804
- "eval_steps_per_second": 3.712,
805
- "step": 15400
806
- },
807
- {
808
- "epoch": 3.08,
809
- "learning_rate": 1.9239928557253427e-05,
810
- "loss": 0.1098,
811
- "step": 15500
812
- },
813
- {
814
- "epoch": 3.1,
815
- "eval_loss": 0.10886531323194504,
816
- "eval_runtime": 150.9989,
817
- "eval_samples_per_second": 949.252,
818
- "eval_steps_per_second": 3.709,
819
- "step": 15600
820
- },
821
- {
822
- "epoch": 3.14,
823
- "eval_loss": 0.1089038997888565,
824
- "eval_runtime": 150.8728,
825
- "eval_samples_per_second": 950.045,
826
- "eval_steps_per_second": 3.712,
827
- "step": 15800
828
- },
829
- {
830
- "epoch": 3.18,
831
- "learning_rate": 1.8247668188132567e-05,
832
- "loss": 0.1102,
833
- "step": 16000
834
- },
835
- {
836
- "epoch": 3.18,
837
- "eval_loss": 0.10882215201854706,
838
- "eval_runtime": 151.4626,
839
- "eval_samples_per_second": 946.346,
840
- "eval_steps_per_second": 3.697,
841
- "step": 16000
842
- },
843
- {
844
- "epoch": 3.21,
845
- "eval_loss": 0.10881613940000534,
846
- "eval_runtime": 151.4856,
847
- "eval_samples_per_second": 946.202,
848
- "eval_steps_per_second": 3.697,
849
- "step": 16200
850
- },
851
- {
852
- "epoch": 3.25,
853
- "eval_loss": 0.10882557183504105,
854
- "eval_runtime": 150.9956,
855
- "eval_samples_per_second": 949.273,
856
- "eval_steps_per_second": 3.709,
857
- "step": 16400
858
- },
859
- {
860
- "epoch": 3.27,
861
- "learning_rate": 1.7255407819011708e-05,
862
- "loss": 0.1087,
863
- "step": 16500
864
- },
865
- {
866
- "epoch": 3.29,
867
- "eval_loss": 0.10881206393241882,
868
- "eval_runtime": 150.911,
869
- "eval_samples_per_second": 949.805,
870
- "eval_steps_per_second": 3.711,
871
- "step": 16600
872
- },
873
- {
874
- "epoch": 3.33,
875
- "eval_loss": 0.10894536972045898,
876
- "eval_runtime": 150.8614,
877
- "eval_samples_per_second": 950.117,
878
- "eval_steps_per_second": 3.712,
879
- "step": 16800
880
- },
881
- {
882
- "epoch": 3.37,
883
- "learning_rate": 1.6263147449890852e-05,
884
- "loss": 0.1082,
885
- "step": 17000
886
- },
887
- {
888
- "epoch": 3.37,
889
- "eval_loss": 0.10886503010988235,
890
- "eval_runtime": 151.0504,
891
- "eval_samples_per_second": 948.928,
892
- "eval_steps_per_second": 3.707,
893
- "step": 17000
894
- },
895
- {
896
- "epoch": 3.41,
897
- "eval_loss": 0.10877899825572968,
898
- "eval_runtime": 150.8388,
899
- "eval_samples_per_second": 950.259,
900
- "eval_steps_per_second": 3.713,
901
- "step": 17200
902
- },
903
- {
904
- "epoch": 3.45,
905
- "eval_loss": 0.10876929759979248,
906
- "eval_runtime": 150.9938,
907
- "eval_samples_per_second": 949.284,
908
- "eval_steps_per_second": 3.709,
909
- "step": 17400
910
- },
911
- {
912
- "epoch": 3.47,
913
- "learning_rate": 1.5270887080769993e-05,
914
- "loss": 0.1097,
915
- "step": 17500
916
- },
917
- {
918
- "epoch": 3.49,
919
- "eval_loss": 0.1089547798037529,
920
- "eval_runtime": 151.0328,
921
- "eval_samples_per_second": 949.039,
922
- "eval_steps_per_second": 3.708,
923
- "step": 17600
924
- },
925
- {
926
- "epoch": 3.53,
927
- "eval_loss": 0.10884077101945877,
928
- "eval_runtime": 151.0214,
929
- "eval_samples_per_second": 949.111,
930
- "eval_steps_per_second": 3.708,
931
- "step": 17800
932
- },
933
- {
934
- "epoch": 3.57,
935
- "learning_rate": 1.4278626711649137e-05,
936
- "loss": 0.1105,
937
- "step": 18000
938
- },
939
- {
940
- "epoch": 3.57,
941
- "eval_loss": 0.1087304875254631,
942
- "eval_runtime": 150.7922,
943
- "eval_samples_per_second": 950.553,
944
- "eval_steps_per_second": 3.714,
945
- "step": 18000
946
- },
947
- {
948
- "epoch": 3.61,
949
- "eval_loss": 0.10879357904195786,
950
- "eval_runtime": 150.8914,
951
- "eval_samples_per_second": 949.929,
952
- "eval_steps_per_second": 3.711,
953
- "step": 18200
954
- },
955
- {
956
- "epoch": 3.65,
957
- "eval_loss": 0.10872866213321686,
958
- "eval_runtime": 150.992,
959
- "eval_samples_per_second": 949.295,
960
- "eval_steps_per_second": 3.709,
961
- "step": 18400
962
- },
963
- {
964
- "epoch": 3.67,
965
- "learning_rate": 1.3286366342528281e-05,
966
- "loss": 0.1089,
967
- "step": 18500
968
- },
969
- {
970
- "epoch": 3.69,
971
- "eval_loss": 0.10874517261981964,
972
- "eval_runtime": 151.2717,
973
- "eval_samples_per_second": 947.54,
974
- "eval_steps_per_second": 3.702,
975
- "step": 18600
976
- },
977
- {
978
- "epoch": 3.73,
979
- "eval_loss": 0.10880085825920105,
980
- "eval_runtime": 150.8035,
981
- "eval_samples_per_second": 950.482,
982
- "eval_steps_per_second": 3.713,
983
- "step": 18800
984
- },
985
- {
986
- "epoch": 3.77,
987
- "learning_rate": 1.2294105973407422e-05,
988
- "loss": 0.1101,
989
- "step": 19000
990
- },
991
- {
992
- "epoch": 3.77,
993
- "eval_loss": 0.10869602859020233,
994
- "eval_runtime": 150.9258,
995
- "eval_samples_per_second": 949.712,
996
- "eval_steps_per_second": 3.71,
997
- "step": 19000
998
- },
999
- {
1000
- "epoch": 3.81,
1001
- "eval_loss": 0.10874003916978836,
1002
- "eval_runtime": 150.8809,
1003
- "eval_samples_per_second": 949.994,
1004
- "eval_steps_per_second": 3.712,
1005
- "step": 19200
1006
- },
1007
- {
1008
- "epoch": 3.85,
1009
- "eval_loss": 0.10871053487062454,
1010
- "eval_runtime": 151.4705,
1011
- "eval_samples_per_second": 946.296,
1012
- "eval_steps_per_second": 3.697,
1013
- "step": 19400
1014
- },
1015
- {
1016
- "epoch": 3.87,
1017
- "learning_rate": 1.1301845604286565e-05,
1018
- "loss": 0.1095,
1019
- "step": 19500
1020
- },
1021
- {
1022
- "epoch": 3.89,
1023
- "eval_loss": 0.10870121419429779,
1024
- "eval_runtime": 151.2796,
1025
- "eval_samples_per_second": 947.491,
1026
- "eval_steps_per_second": 3.702,
1027
- "step": 19600
1028
- },
1029
- {
1030
- "epoch": 3.93,
1031
- "eval_loss": 0.1087031215429306,
1032
- "eval_runtime": 151.3836,
1033
- "eval_samples_per_second": 946.839,
1034
- "eval_steps_per_second": 3.699,
1035
- "step": 19800
1036
- },
1037
- {
1038
- "epoch": 3.97,
1039
- "learning_rate": 1.0309585235165709e-05,
1040
- "loss": 0.1089,
1041
- "step": 20000
1042
- },
1043
- {
1044
- "epoch": 3.97,
1045
- "eval_loss": 0.10877121239900589,
1046
- "eval_runtime": 150.9637,
1047
- "eval_samples_per_second": 949.473,
1048
- "eval_steps_per_second": 3.71,
1049
- "step": 20000
1050
- },
1051
- {
1052
- "epoch": 4.01,
1053
- "eval_loss": 0.10867509990930557,
1054
- "eval_runtime": 151.1135,
1055
- "eval_samples_per_second": 948.532,
1056
- "eval_steps_per_second": 3.706,
1057
- "step": 20200
1058
- },
1059
- {
1060
- "epoch": 4.05,
1061
- "eval_loss": 0.10896100848913193,
1062
- "eval_runtime": 151.009,
1063
- "eval_samples_per_second": 949.189,
1064
- "eval_steps_per_second": 3.708,
1065
- "step": 20400
1066
- },
1067
- {
1068
- "epoch": 4.07,
1069
- "learning_rate": 9.317324866044851e-06,
1070
- "loss": 0.1093,
1071
- "step": 20500
1072
- },
1073
- {
1074
- "epoch": 4.09,
1075
- "eval_loss": 0.10866597294807434,
1076
- "eval_runtime": 151.1827,
1077
- "eval_samples_per_second": 948.098,
1078
- "eval_steps_per_second": 3.704,
1079
- "step": 20600
1080
- },
1081
- {
1082
- "epoch": 4.13,
1083
- "eval_loss": 0.1086614802479744,
1084
- "eval_runtime": 150.9777,
1085
- "eval_samples_per_second": 949.385,
1086
- "eval_steps_per_second": 3.709,
1087
- "step": 20800
1088
- },
1089
- {
1090
- "epoch": 4.17,
1091
- "learning_rate": 8.325064496923992e-06,
1092
- "loss": 0.1091,
1093
- "step": 21000
1094
- },
1095
- {
1096
- "epoch": 4.17,
1097
- "eval_loss": 0.1086558997631073,
1098
- "eval_runtime": 150.9361,
1099
- "eval_samples_per_second": 949.647,
1100
- "eval_steps_per_second": 3.71,
1101
- "step": 21000
1102
- },
1103
- {
1104
- "epoch": 4.21,
1105
- "eval_loss": 0.10874707996845245,
1106
- "eval_runtime": 150.8938,
1107
- "eval_samples_per_second": 949.913,
1108
- "eval_steps_per_second": 3.711,
1109
- "step": 21200
1110
- },
1111
- {
1112
- "epoch": 4.25,
1113
- "eval_loss": 0.10870933532714844,
1114
- "eval_runtime": 150.9348,
1115
- "eval_samples_per_second": 949.655,
1116
- "eval_steps_per_second": 3.71,
1117
- "step": 21400
1118
- },
1119
- {
1120
- "epoch": 4.27,
1121
- "learning_rate": 7.332804127803136e-06,
1122
- "loss": 0.11,
1123
- "step": 21500
1124
- },
1125
- {
1126
- "epoch": 4.29,
1127
- "eval_loss": 0.10863141715526581,
1128
- "eval_runtime": 151.0321,
1129
- "eval_samples_per_second": 949.043,
1130
- "eval_steps_per_second": 3.708,
1131
- "step": 21600
1132
- },
1133
- {
1134
- "epoch": 4.33,
1135
- "eval_loss": 0.1086675301194191,
1136
- "eval_runtime": 150.8629,
1137
- "eval_samples_per_second": 950.108,
1138
- "eval_steps_per_second": 3.712,
1139
- "step": 21800
1140
- },
1141
- {
1142
- "epoch": 4.37,
1143
- "learning_rate": 6.3405437586822785e-06,
1144
- "loss": 0.11,
1145
- "step": 22000
1146
- },
1147
- {
1148
- "epoch": 4.37,
1149
- "eval_loss": 0.10875381529331207,
1150
- "eval_runtime": 150.9082,
1151
- "eval_samples_per_second": 949.822,
1152
- "eval_steps_per_second": 3.711,
1153
- "step": 22000
1154
- },
1155
- {
1156
- "epoch": 4.41,
1157
- "eval_loss": 0.10868263989686966,
1158
- "eval_runtime": 150.9029,
1159
- "eval_samples_per_second": 949.856,
1160
- "eval_steps_per_second": 3.711,
1161
- "step": 22200
1162
- },
1163
- {
1164
- "epoch": 4.45,
1165
- "eval_loss": 0.10862214863300323,
1166
- "eval_runtime": 150.9465,
1167
- "eval_samples_per_second": 949.582,
1168
- "eval_steps_per_second": 3.71,
1169
- "step": 22400
1170
- },
1171
- {
1172
- "epoch": 4.47,
1173
- "learning_rate": 5.348283389561421e-06,
1174
- "loss": 0.1094,
1175
- "step": 22500
1176
- },
1177
- {
1178
- "epoch": 4.49,
1179
- "eval_loss": 0.10863359272480011,
1180
- "eval_runtime": 150.949,
1181
- "eval_samples_per_second": 949.566,
1182
- "eval_steps_per_second": 3.71,
1183
- "step": 22600
1184
- },
1185
- {
1186
- "epoch": 4.52,
1187
- "eval_loss": 0.10862518846988678,
1188
- "eval_runtime": 150.9951,
1189
- "eval_samples_per_second": 949.276,
1190
- "eval_steps_per_second": 3.709,
1191
- "step": 22800
1192
- },
1193
- {
1194
- "epoch": 4.56,
1195
- "learning_rate": 4.356023020440564e-06,
1196
- "loss": 0.1086,
1197
- "step": 23000
1198
- },
1199
- {
1200
- "epoch": 4.56,
1201
- "eval_loss": 0.10860537737607956,
1202
- "eval_runtime": 150.9863,
1203
- "eval_samples_per_second": 949.331,
1204
- "eval_steps_per_second": 3.709,
1205
- "step": 23000
1206
- },
1207
- {
1208
- "epoch": 4.6,
1209
- "eval_loss": 0.10861887782812119,
1210
- "eval_runtime": 150.9451,
1211
- "eval_samples_per_second": 949.59,
1212
- "eval_steps_per_second": 3.71,
1213
- "step": 23200
1214
- },
1215
- {
1216
- "epoch": 4.64,
1217
- "eval_loss": 0.10863388329744339,
1218
- "eval_runtime": 150.9571,
1219
- "eval_samples_per_second": 949.515,
1220
- "eval_steps_per_second": 3.71,
1221
- "step": 23400
1222
- },
1223
- {
1224
- "epoch": 4.66,
1225
- "learning_rate": 3.3637626513197067e-06,
1226
- "loss": 0.1087,
1227
- "step": 23500
1228
- },
1229
- {
1230
- "epoch": 4.68,
1231
- "eval_loss": 0.10861673206090927,
1232
- "eval_runtime": 150.8,
1233
- "eval_samples_per_second": 950.504,
1234
- "eval_steps_per_second": 3.714,
1235
- "step": 23600
1236
- },
1237
- {
1238
- "epoch": 4.72,
1239
- "eval_loss": 0.10860313475131989,
1240
- "eval_runtime": 150.9239,
1241
- "eval_samples_per_second": 949.724,
1242
- "eval_steps_per_second": 3.71,
1243
- "step": 23800
1244
- },
1245
- {
1246
- "epoch": 4.76,
1247
- "learning_rate": 2.371502282198849e-06,
1248
- "loss": 0.1105,
1249
- "step": 24000
1250
- },
1251
- {
1252
- "epoch": 4.76,
1253
- "eval_loss": 0.10860700160264969,
1254
- "eval_runtime": 151.02,
1255
- "eval_samples_per_second": 949.119,
1256
- "eval_steps_per_second": 3.708,
1257
- "step": 24000
1258
- },
1259
- {
1260
- "epoch": 4.8,
1261
- "eval_loss": 0.1085989773273468,
1262
- "eval_runtime": 151.0214,
1263
- "eval_samples_per_second": 949.111,
1264
- "eval_steps_per_second": 3.708,
1265
- "step": 24200
1266
- },
1267
- {
1268
- "epoch": 4.84,
1269
- "eval_loss": 0.10860409587621689,
1270
- "eval_runtime": 151.0306,
1271
- "eval_samples_per_second": 949.053,
1272
- "eval_steps_per_second": 3.708,
1273
- "step": 24400
1274
- },
1275
- {
1276
- "epoch": 4.86,
1277
- "learning_rate": 1.3792419130779918e-06,
1278
- "loss": 0.1092,
1279
- "step": 24500
1280
- },
1281
- {
1282
- "epoch": 4.88,
1283
- "eval_loss": 0.10858762264251709,
1284
- "eval_runtime": 150.7929,
1285
- "eval_samples_per_second": 950.549,
1286
- "eval_steps_per_second": 3.714,
1287
- "step": 24600
1288
- },
1289
- {
1290
- "epoch": 4.92,
1291
- "eval_loss": 0.10861339420080185,
1292
- "eval_runtime": 150.9395,
1293
- "eval_samples_per_second": 949.626,
1294
- "eval_steps_per_second": 3.71,
1295
- "step": 24800
1296
- },
1297
- {
1298
- "epoch": 4.96,
1299
- "learning_rate": 3.869815439571344e-07,
1300
- "loss": 0.108,
1301
- "step": 25000
1302
- },
1303
- {
1304
- "epoch": 4.96,
1305
- "eval_loss": 0.10859096795320511,
1306
- "eval_runtime": 150.9591,
1307
- "eval_samples_per_second": 949.502,
1308
- "eval_steps_per_second": 3.71,
1309
- "step": 25000
1310
  }
1311
  ],
1312
  "logging_steps": 500,
1313
- "max_steps": 25195,
1314
- "num_train_epochs": 5,
1315
  "save_steps": 1000,
1316
- "total_flos": 4.2132817575936e+17,
1317
  "trial_name": null,
1318
  "trial_params": null
1319
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.19845207382417146,
5
  "eval_steps": 200,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
+ "eval_loss": 0.24929340183734894,
14
+ "eval_runtime": 152.0524,
15
+ "eval_samples_per_second": 942.675,
16
+ "eval_steps_per_second": 3.683,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.08,
21
+ "eval_loss": 0.397050142288208,
22
+ "eval_runtime": 151.7752,
23
+ "eval_samples_per_second": 944.397,
24
+ "eval_steps_per_second": 3.69,
25
  "step": 400
26
  },
27
  {
28
  "epoch": 0.1,
29
+ "learning_rate": 4.751934907719786e-05,
30
+ "loss": 0.4919,
31
  "step": 500
32
  },
33
  {
34
  "epoch": 0.12,
35
+ "eval_loss": 0.619749903678894,
36
+ "eval_runtime": 153.0348,
37
+ "eval_samples_per_second": 936.624,
38
+ "eval_steps_per_second": 3.659,
39
  "step": 600
40
  },
41
  {
42
  "epoch": 0.16,
43
+ "eval_loss": 0.5482326149940491,
44
+ "eval_runtime": 151.3209,
45
+ "eval_samples_per_second": 947.232,
46
+ "eval_steps_per_second": 3.701,
47
  "step": 800
48
  },
49
  {
50
  "epoch": 0.2,
51
+ "learning_rate": 4.5038698154395716e-05,
52
+ "loss": 0.9307,
53
  "step": 1000
54
  },
55
  {
56
  "epoch": 0.2,
57
+ "eval_loss": 0.861940324306488,
58
+ "eval_runtime": 150.9686,
59
+ "eval_samples_per_second": 949.442,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "eval_steps_per_second": 3.709,
61
+ "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
  ],
64
  "logging_steps": 500,
65
+ "max_steps": 10078,
66
+ "num_train_epochs": 2,
67
  "save_steps": 1000,
68
+ "total_flos": 1.6722690048e+16,
69
  "trial_name": null,
70
  "trial_params": null
71
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:184ef5bf0b041b29b3a59cd7d7702c8cac57e3d54b18c7d1465b02e76d4d8643
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d4324537606b6fc0897b8a9dc3e771514e3581e4386565dc3d3bd2f4e1e518
3
  size 4027