mgh6 commited on
Commit
2fbed23
·
verified ·
1 Parent(s): 6df8684

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa0822fc409472b4df347ce0c6dbb5bf1d445c794af75086f172bf6c2394a18c
3
  size 2708729576
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a4c6f5313c5413e947c3b9ae8fbdcd3db5cfb17ae9d38f3845a0d90a0a99de4
3
  size 2708729576
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf8fcb46ff34fb6b3c884e340b62e5087660faddc671c8d46dc2f75938bf059c
3
  size 52499200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9bcb9d383cd758daed3e9bdac436feaaa8ace4983afd451ce084e4618839ef
3
  size 52499200
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a45b784475c6c47021a6ff08c5d6a553172a50192f5de3c134961024cdbbaf9f
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b35b4cd4104958a3e34a87e84eee852763c80157fefbeae8abd4d6ed6ea94168
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b514faa73b6e320d8ae19d93f3da594146e59f1072af645ee09b9ce747afd0a1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca4dbe650e04bc8012dd3f1938dfb2a637329721abd75c3bd59d28a64007b54
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,911 +1,43 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
- "eval_steps": 10,
6
- "global_step": 645,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.07808687164470474,
13
- "grad_norm": 562671.25,
14
- "learning_rate": 9.921875000000001e-05,
15
- "loss": 41909.175,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.07808687164470474,
20
- "eval_runtime": 19.6603,
21
- "eval_samples_per_second": 21.973,
22
- "eval_steps_per_second": 5.493,
23
- "step": 10
24
- },
25
- {
26
- "epoch": 0.15617374328940947,
27
- "grad_norm": 344896.03125,
28
- "learning_rate": 9.84375e-05,
29
- "loss": 13890.8641,
30
- "step": 20
31
- },
32
- {
33
- "epoch": 0.15617374328940947,
34
- "eval_runtime": 19.7074,
35
- "eval_samples_per_second": 21.921,
36
- "eval_steps_per_second": 5.48,
37
- "step": 20
38
- },
39
- {
40
- "epoch": 0.2342606149341142,
41
- "grad_norm": 515524.21875,
42
- "learning_rate": 9.765625e-05,
43
- "loss": 11401.4898,
44
- "step": 30
45
- },
46
- {
47
- "epoch": 0.2342606149341142,
48
- "eval_runtime": 19.7344,
49
- "eval_samples_per_second": 21.891,
50
- "eval_steps_per_second": 5.473,
51
- "step": 30
52
- },
53
- {
54
- "epoch": 0.31234748657881894,
55
- "grad_norm": 388208.53125,
56
- "learning_rate": 9.687500000000001e-05,
57
- "loss": 6596.7703,
58
- "step": 40
59
- },
60
- {
61
- "epoch": 0.31234748657881894,
62
- "eval_runtime": 19.7515,
63
- "eval_samples_per_second": 21.872,
64
- "eval_steps_per_second": 5.468,
65
- "step": 40
66
- },
67
  {
68
  "epoch": 0.3904343582235237,
69
- "grad_norm": 236512.34375,
70
  "learning_rate": 9.609375e-05,
71
- "loss": 3515.6215,
72
  "step": 50
73
  },
74
  {
75
  "epoch": 0.3904343582235237,
76
- "eval_runtime": 19.738,
77
- "eval_samples_per_second": 21.887,
78
- "eval_steps_per_second": 5.472,
79
  "step": 50
80
  },
81
- {
82
- "epoch": 0.4685212298682284,
83
- "grad_norm": 694649.5625,
84
- "learning_rate": 9.53125e-05,
85
- "loss": 4275.273,
86
- "step": 60
87
- },
88
- {
89
- "epoch": 0.4685212298682284,
90
- "eval_runtime": 19.7487,
91
- "eval_samples_per_second": 21.875,
92
- "eval_steps_per_second": 5.469,
93
- "step": 60
94
- },
95
- {
96
- "epoch": 0.5466081015129332,
97
- "grad_norm": 375526.53125,
98
- "learning_rate": 9.453125000000001e-05,
99
- "loss": 5022.9281,
100
- "step": 70
101
- },
102
- {
103
- "epoch": 0.5466081015129332,
104
- "eval_runtime": 19.7307,
105
- "eval_samples_per_second": 21.895,
106
- "eval_steps_per_second": 5.474,
107
- "step": 70
108
- },
109
- {
110
- "epoch": 0.6246949731576379,
111
- "grad_norm": 603116.25,
112
- "learning_rate": 9.375e-05,
113
- "loss": 3630.4668,
114
- "step": 80
115
- },
116
- {
117
- "epoch": 0.6246949731576379,
118
- "eval_runtime": 19.7405,
119
- "eval_samples_per_second": 21.884,
120
- "eval_steps_per_second": 5.471,
121
- "step": 80
122
- },
123
- {
124
- "epoch": 0.7027818448023426,
125
- "grad_norm": 440137.71875,
126
- "learning_rate": 9.296875e-05,
127
- "loss": 3183.9186,
128
- "step": 90
129
- },
130
- {
131
- "epoch": 0.7027818448023426,
132
- "eval_runtime": 19.7964,
133
- "eval_samples_per_second": 21.822,
134
- "eval_steps_per_second": 5.456,
135
- "step": 90
136
- },
137
  {
138
  "epoch": 0.7808687164470474,
139
- "grad_norm": 574069.4375,
140
  "learning_rate": 9.21875e-05,
141
- "loss": 3536.15,
142
  "step": 100
143
  },
144
  {
145
  "epoch": 0.7808687164470474,
146
- "eval_runtime": 19.7759,
147
- "eval_samples_per_second": 21.845,
148
- "eval_steps_per_second": 5.461,
149
- "step": 100
150
- },
151
- {
152
- "epoch": 0.8589555880917521,
153
- "grad_norm": 639365.375,
154
- "learning_rate": 9.140625e-05,
155
- "loss": 4252.8562,
156
- "step": 110
157
- },
158
- {
159
- "epoch": 0.8589555880917521,
160
- "eval_runtime": 19.7724,
161
- "eval_samples_per_second": 21.849,
162
  "eval_steps_per_second": 5.462,
163
- "step": 110
164
- },
165
- {
166
- "epoch": 0.9370424597364568,
167
- "grad_norm": 398296.34375,
168
- "learning_rate": 9.062500000000001e-05,
169
- "loss": 3483.168,
170
- "step": 120
171
- },
172
- {
173
- "epoch": 0.9370424597364568,
174
- "eval_runtime": 19.7473,
175
- "eval_samples_per_second": 21.876,
176
- "eval_steps_per_second": 5.469,
177
- "step": 120
178
- },
179
- {
180
- "epoch": 1.0078086871644705,
181
- "grad_norm": 307601.53125,
182
- "learning_rate": 8.984375e-05,
183
- "loss": 3748.8004,
184
- "step": 130
185
- },
186
- {
187
- "epoch": 1.0078086871644705,
188
- "eval_runtime": 19.6206,
189
- "eval_samples_per_second": 22.018,
190
- "eval_steps_per_second": 5.504,
191
- "step": 130
192
- },
193
- {
194
- "epoch": 1.0858955588091752,
195
- "grad_norm": 535453.9375,
196
- "learning_rate": 8.90625e-05,
197
- "loss": 3701.0125,
198
- "step": 140
199
- },
200
- {
201
- "epoch": 1.0858955588091752,
202
- "eval_runtime": 19.714,
203
- "eval_samples_per_second": 21.913,
204
- "eval_steps_per_second": 5.478,
205
- "step": 140
206
- },
207
- {
208
- "epoch": 1.16398243045388,
209
- "grad_norm": 578524.75,
210
- "learning_rate": 8.828125000000001e-05,
211
- "loss": 3222.8713,
212
- "step": 150
213
- },
214
- {
215
- "epoch": 1.16398243045388,
216
- "eval_runtime": 19.7417,
217
- "eval_samples_per_second": 21.883,
218
- "eval_steps_per_second": 5.471,
219
- "step": 150
220
- },
221
- {
222
- "epoch": 1.2420693020985847,
223
- "grad_norm": 413461.9375,
224
- "learning_rate": 8.75e-05,
225
- "loss": 3225.8557,
226
- "step": 160
227
- },
228
- {
229
- "epoch": 1.2420693020985847,
230
- "eval_runtime": 19.7549,
231
- "eval_samples_per_second": 21.868,
232
- "eval_steps_per_second": 5.467,
233
- "step": 160
234
- },
235
- {
236
- "epoch": 1.3201561737432894,
237
- "grad_norm": 473611.9375,
238
- "learning_rate": 8.671875e-05,
239
- "loss": 2951.6514,
240
- "step": 170
241
- },
242
- {
243
- "epoch": 1.3201561737432894,
244
- "eval_runtime": 19.7505,
245
- "eval_samples_per_second": 21.873,
246
- "eval_steps_per_second": 5.468,
247
- "step": 170
248
- },
249
- {
250
- "epoch": 1.3982430453879942,
251
- "grad_norm": 477603.59375,
252
- "learning_rate": 8.593750000000001e-05,
253
- "loss": 3479.666,
254
- "step": 180
255
- },
256
- {
257
- "epoch": 1.3982430453879942,
258
- "eval_runtime": 19.7691,
259
- "eval_samples_per_second": 21.852,
260
- "eval_steps_per_second": 5.463,
261
- "step": 180
262
- },
263
- {
264
- "epoch": 1.476329917032699,
265
- "grad_norm": 284746.375,
266
- "learning_rate": 8.515625e-05,
267
- "loss": 2754.5521,
268
- "step": 190
269
- },
270
- {
271
- "epoch": 1.476329917032699,
272
- "eval_runtime": 19.7613,
273
- "eval_samples_per_second": 21.861,
274
- "eval_steps_per_second": 5.465,
275
- "step": 190
276
- },
277
- {
278
- "epoch": 1.5544167886774036,
279
- "grad_norm": 583813.5625,
280
- "learning_rate": 8.4375e-05,
281
- "loss": 1812.6566,
282
- "step": 200
283
- },
284
- {
285
- "epoch": 1.5544167886774036,
286
- "eval_runtime": 19.7512,
287
- "eval_samples_per_second": 21.872,
288
- "eval_steps_per_second": 5.468,
289
- "step": 200
290
- },
291
- {
292
- "epoch": 1.6325036603221084,
293
- "grad_norm": 277886.28125,
294
- "learning_rate": 8.359375000000001e-05,
295
- "loss": 2136.5822,
296
- "step": 210
297
- },
298
- {
299
- "epoch": 1.6325036603221084,
300
- "eval_runtime": 19.746,
301
- "eval_samples_per_second": 21.878,
302
- "eval_steps_per_second": 5.469,
303
- "step": 210
304
- },
305
- {
306
- "epoch": 1.710590531966813,
307
- "grad_norm": 322902.34375,
308
- "learning_rate": 8.28125e-05,
309
- "loss": 2078.6055,
310
- "step": 220
311
- },
312
- {
313
- "epoch": 1.710590531966813,
314
- "eval_runtime": 19.7299,
315
- "eval_samples_per_second": 21.896,
316
- "eval_steps_per_second": 5.474,
317
- "step": 220
318
- },
319
- {
320
- "epoch": 1.7886774036115178,
321
- "grad_norm": 453730.34375,
322
- "learning_rate": 8.203125e-05,
323
- "loss": 2769.8313,
324
- "step": 230
325
- },
326
- {
327
- "epoch": 1.7886774036115178,
328
- "eval_runtime": 19.7557,
329
- "eval_samples_per_second": 21.867,
330
- "eval_steps_per_second": 5.467,
331
- "step": 230
332
- },
333
- {
334
- "epoch": 1.8667642752562226,
335
- "grad_norm": 267086.625,
336
- "learning_rate": 8.125000000000001e-05,
337
- "loss": 1964.433,
338
- "step": 240
339
- },
340
- {
341
- "epoch": 1.8667642752562226,
342
- "eval_runtime": 19.7438,
343
- "eval_samples_per_second": 21.88,
344
- "eval_steps_per_second": 5.47,
345
- "step": 240
346
- },
347
- {
348
- "epoch": 1.9448511469009273,
349
- "grad_norm": 587362.4375,
350
- "learning_rate": 8.046875e-05,
351
- "loss": 1856.9432,
352
- "step": 250
353
- },
354
- {
355
- "epoch": 1.9448511469009273,
356
- "eval_runtime": 19.8026,
357
- "eval_samples_per_second": 21.815,
358
- "eval_steps_per_second": 5.454,
359
- "step": 250
360
- },
361
- {
362
- "epoch": 2.015617374328941,
363
- "grad_norm": 502681.75,
364
- "learning_rate": 7.96875e-05,
365
- "loss": 3382.1129,
366
- "step": 260
367
- },
368
- {
369
- "epoch": 2.015617374328941,
370
- "eval_runtime": 19.6846,
371
- "eval_samples_per_second": 21.946,
372
- "eval_steps_per_second": 5.487,
373
- "step": 260
374
- },
375
- {
376
- "epoch": 2.0937042459736457,
377
- "grad_norm": 191029.828125,
378
- "learning_rate": 7.890625000000001e-05,
379
- "loss": 2202.6236,
380
- "step": 270
381
- },
382
- {
383
- "epoch": 2.0937042459736457,
384
- "eval_runtime": 19.7242,
385
- "eval_samples_per_second": 21.902,
386
- "eval_steps_per_second": 5.476,
387
- "step": 270
388
- },
389
- {
390
- "epoch": 2.1717911176183504,
391
- "grad_norm": 262116.109375,
392
- "learning_rate": 7.8125e-05,
393
- "loss": 1782.3984,
394
- "step": 280
395
- },
396
- {
397
- "epoch": 2.1717911176183504,
398
- "eval_runtime": 19.7531,
399
- "eval_samples_per_second": 21.87,
400
- "eval_steps_per_second": 5.467,
401
- "step": 280
402
- },
403
- {
404
- "epoch": 2.249877989263055,
405
- "grad_norm": 389979.4375,
406
- "learning_rate": 7.734375e-05,
407
- "loss": 1784.0506,
408
- "step": 290
409
- },
410
- {
411
- "epoch": 2.249877989263055,
412
- "eval_runtime": 19.7431,
413
- "eval_samples_per_second": 21.881,
414
- "eval_steps_per_second": 5.47,
415
- "step": 290
416
- },
417
- {
418
- "epoch": 2.32796486090776,
419
- "grad_norm": 392025.96875,
420
- "learning_rate": 7.65625e-05,
421
- "loss": 1743.6416,
422
- "step": 300
423
- },
424
- {
425
- "epoch": 2.32796486090776,
426
- "eval_runtime": 19.7493,
427
- "eval_samples_per_second": 21.874,
428
- "eval_steps_per_second": 5.469,
429
- "step": 300
430
- },
431
- {
432
- "epoch": 2.4060517325524646,
433
- "grad_norm": 424892.03125,
434
- "learning_rate": 7.578125e-05,
435
- "loss": 1824.1576,
436
- "step": 310
437
- },
438
- {
439
- "epoch": 2.4060517325524646,
440
- "eval_runtime": 19.7454,
441
- "eval_samples_per_second": 21.879,
442
- "eval_steps_per_second": 5.47,
443
- "step": 310
444
- },
445
- {
446
- "epoch": 2.4841386041971694,
447
- "grad_norm": 319042.1875,
448
- "learning_rate": 7.500000000000001e-05,
449
- "loss": 1743.5293,
450
- "step": 320
451
- },
452
- {
453
- "epoch": 2.4841386041971694,
454
- "eval_runtime": 19.756,
455
- "eval_samples_per_second": 21.867,
456
- "eval_steps_per_second": 5.467,
457
- "step": 320
458
- },
459
- {
460
- "epoch": 2.562225475841874,
461
- "grad_norm": 519256.40625,
462
- "learning_rate": 7.421875e-05,
463
- "loss": 1764.0594,
464
- "step": 330
465
- },
466
- {
467
- "epoch": 2.562225475841874,
468
- "eval_runtime": 19.7543,
469
- "eval_samples_per_second": 21.869,
470
- "eval_steps_per_second": 5.467,
471
- "step": 330
472
- },
473
- {
474
- "epoch": 2.640312347486579,
475
- "grad_norm": 210543.75,
476
- "learning_rate": 7.34375e-05,
477
- "loss": 1172.6759,
478
- "step": 340
479
- },
480
- {
481
- "epoch": 2.640312347486579,
482
- "eval_runtime": 19.7374,
483
- "eval_samples_per_second": 21.887,
484
- "eval_steps_per_second": 5.472,
485
- "step": 340
486
- },
487
- {
488
- "epoch": 2.7183992191312836,
489
- "grad_norm": 259726.9375,
490
- "learning_rate": 7.265625000000001e-05,
491
- "loss": 1670.5373,
492
- "step": 350
493
- },
494
- {
495
- "epoch": 2.7183992191312836,
496
- "eval_runtime": 19.7387,
497
- "eval_samples_per_second": 21.886,
498
- "eval_steps_per_second": 5.471,
499
- "step": 350
500
- },
501
- {
502
- "epoch": 2.7964860907759883,
503
- "grad_norm": 575974.4375,
504
- "learning_rate": 7.1875e-05,
505
- "loss": 2305.6137,
506
- "step": 360
507
- },
508
- {
509
- "epoch": 2.7964860907759883,
510
- "eval_runtime": 19.7355,
511
- "eval_samples_per_second": 21.889,
512
- "eval_steps_per_second": 5.472,
513
- "step": 360
514
- },
515
- {
516
- "epoch": 2.874572962420693,
517
- "grad_norm": 529809.125,
518
- "learning_rate": 7.109375e-05,
519
- "loss": 2236.2371,
520
- "step": 370
521
- },
522
- {
523
- "epoch": 2.874572962420693,
524
- "eval_runtime": 19.7607,
525
- "eval_samples_per_second": 21.862,
526
- "eval_steps_per_second": 5.465,
527
- "step": 370
528
- },
529
- {
530
- "epoch": 2.952659834065398,
531
- "grad_norm": 649306.875,
532
- "learning_rate": 7.031250000000001e-05,
533
- "loss": 1477.9074,
534
- "step": 380
535
- },
536
- {
537
- "epoch": 2.952659834065398,
538
- "eval_runtime": 19.7461,
539
- "eval_samples_per_second": 21.878,
540
- "eval_steps_per_second": 5.469,
541
- "step": 380
542
- },
543
- {
544
- "epoch": 3.0234260614934114,
545
- "grad_norm": 468387.1875,
546
- "learning_rate": 6.953125e-05,
547
- "loss": 1111.9008,
548
- "step": 390
549
- },
550
- {
551
- "epoch": 3.0234260614934114,
552
- "eval_runtime": 19.6555,
553
- "eval_samples_per_second": 21.979,
554
- "eval_steps_per_second": 5.495,
555
- "step": 390
556
- },
557
- {
558
- "epoch": 3.101512933138116,
559
- "grad_norm": 396892.71875,
560
- "learning_rate": 6.875e-05,
561
- "loss": 1161.8062,
562
- "step": 400
563
- },
564
- {
565
- "epoch": 3.101512933138116,
566
- "eval_runtime": 19.6988,
567
- "eval_samples_per_second": 21.93,
568
- "eval_steps_per_second": 5.483,
569
- "step": 400
570
- },
571
- {
572
- "epoch": 3.179599804782821,
573
- "grad_norm": 267550.6875,
574
- "learning_rate": 6.796875000000001e-05,
575
- "loss": 1332.8658,
576
- "step": 410
577
- },
578
- {
579
- "epoch": 3.179599804782821,
580
- "eval_runtime": 19.7594,
581
- "eval_samples_per_second": 21.863,
582
- "eval_steps_per_second": 5.466,
583
- "step": 410
584
- },
585
- {
586
- "epoch": 3.2576866764275256,
587
- "grad_norm": 383417.75,
588
- "learning_rate": 6.71875e-05,
589
- "loss": 1149.9299,
590
- "step": 420
591
- },
592
- {
593
- "epoch": 3.2576866764275256,
594
- "eval_runtime": 19.7802,
595
- "eval_samples_per_second": 21.84,
596
- "eval_steps_per_second": 5.46,
597
- "step": 420
598
- },
599
- {
600
- "epoch": 3.3357735480722304,
601
- "grad_norm": 197298.359375,
602
- "learning_rate": 6.640625e-05,
603
- "loss": 1427.1258,
604
- "step": 430
605
- },
606
- {
607
- "epoch": 3.3357735480722304,
608
- "eval_runtime": 19.7807,
609
- "eval_samples_per_second": 21.84,
610
- "eval_steps_per_second": 5.46,
611
- "step": 430
612
- },
613
- {
614
- "epoch": 3.413860419716935,
615
- "grad_norm": 196738.484375,
616
- "learning_rate": 6.562500000000001e-05,
617
- "loss": 1029.203,
618
- "step": 440
619
- },
620
- {
621
- "epoch": 3.413860419716935,
622
- "eval_runtime": 19.7502,
623
- "eval_samples_per_second": 21.873,
624
- "eval_steps_per_second": 5.468,
625
- "step": 440
626
- },
627
- {
628
- "epoch": 3.49194729136164,
629
- "grad_norm": 415209.96875,
630
- "learning_rate": 6.484375e-05,
631
- "loss": 1040.8042,
632
- "step": 450
633
- },
634
- {
635
- "epoch": 3.49194729136164,
636
- "eval_runtime": 19.7611,
637
- "eval_samples_per_second": 21.861,
638
- "eval_steps_per_second": 5.465,
639
- "step": 450
640
- },
641
- {
642
- "epoch": 3.5700341630063446,
643
- "grad_norm": 328070.78125,
644
- "learning_rate": 6.40625e-05,
645
- "loss": 977.6191,
646
- "step": 460
647
- },
648
- {
649
- "epoch": 3.5700341630063446,
650
- "eval_runtime": 19.7543,
651
- "eval_samples_per_second": 21.869,
652
- "eval_steps_per_second": 5.467,
653
- "step": 460
654
- },
655
- {
656
- "epoch": 3.6481210346510493,
657
- "grad_norm": 327954.34375,
658
- "learning_rate": 6.328125e-05,
659
- "loss": 893.5069,
660
- "step": 470
661
- },
662
- {
663
- "epoch": 3.6481210346510493,
664
- "eval_runtime": 19.7609,
665
- "eval_samples_per_second": 21.861,
666
- "eval_steps_per_second": 5.465,
667
- "step": 470
668
- },
669
- {
670
- "epoch": 3.726207906295754,
671
- "grad_norm": 429292.5625,
672
- "learning_rate": 6.25e-05,
673
- "loss": 982.0832,
674
- "step": 480
675
- },
676
- {
677
- "epoch": 3.726207906295754,
678
- "eval_runtime": 19.7484,
679
- "eval_samples_per_second": 21.875,
680
- "eval_steps_per_second": 5.469,
681
- "step": 480
682
- },
683
- {
684
- "epoch": 3.804294777940459,
685
- "grad_norm": 388092.28125,
686
- "learning_rate": 6.171875e-05,
687
- "loss": 849.4025,
688
- "step": 490
689
- },
690
- {
691
- "epoch": 3.804294777940459,
692
- "eval_runtime": 19.7469,
693
- "eval_samples_per_second": 21.877,
694
- "eval_steps_per_second": 5.469,
695
- "step": 490
696
- },
697
- {
698
- "epoch": 3.8823816495851635,
699
- "grad_norm": 240416.734375,
700
- "learning_rate": 6.0937500000000004e-05,
701
- "loss": 1007.4567,
702
- "step": 500
703
- },
704
- {
705
- "epoch": 3.8823816495851635,
706
- "eval_runtime": 19.7422,
707
- "eval_samples_per_second": 21.882,
708
- "eval_steps_per_second": 5.471,
709
- "step": 500
710
- },
711
- {
712
- "epoch": 3.9604685212298683,
713
- "grad_norm": 309579.21875,
714
- "learning_rate": 6.015625e-05,
715
- "loss": 960.0591,
716
- "step": 510
717
- },
718
- {
719
- "epoch": 3.9604685212298683,
720
- "eval_runtime": 19.7662,
721
- "eval_samples_per_second": 21.855,
722
- "eval_steps_per_second": 5.464,
723
- "step": 510
724
- },
725
- {
726
- "epoch": 4.031234748657882,
727
- "grad_norm": 484968.46875,
728
- "learning_rate": 5.9375e-05,
729
- "loss": 1164.3749,
730
- "step": 520
731
- },
732
- {
733
- "epoch": 4.031234748657882,
734
- "eval_runtime": 19.6732,
735
- "eval_samples_per_second": 21.959,
736
- "eval_steps_per_second": 5.49,
737
- "step": 520
738
- },
739
- {
740
- "epoch": 4.109321620302587,
741
- "grad_norm": 329296.75,
742
- "learning_rate": 5.8593750000000005e-05,
743
- "loss": 1631.9979,
744
- "step": 530
745
- },
746
- {
747
- "epoch": 4.109321620302587,
748
- "eval_runtime": 19.7244,
749
- "eval_samples_per_second": 21.902,
750
- "eval_steps_per_second": 5.475,
751
- "step": 530
752
- },
753
- {
754
- "epoch": 4.187408491947291,
755
- "grad_norm": 554537.125,
756
- "learning_rate": 5.78125e-05,
757
- "loss": 1570.2684,
758
- "step": 540
759
- },
760
- {
761
- "epoch": 4.187408491947291,
762
- "eval_runtime": 19.7495,
763
- "eval_samples_per_second": 21.874,
764
- "eval_steps_per_second": 5.468,
765
- "step": 540
766
- },
767
- {
768
- "epoch": 4.265495363591996,
769
- "grad_norm": 417775.03125,
770
- "learning_rate": 5.703125e-05,
771
- "loss": 1599.58,
772
- "step": 550
773
- },
774
- {
775
- "epoch": 4.265495363591996,
776
- "eval_runtime": 19.7438,
777
- "eval_samples_per_second": 21.88,
778
- "eval_steps_per_second": 5.47,
779
- "step": 550
780
- },
781
- {
782
- "epoch": 4.343582235236701,
783
- "grad_norm": 308738.78125,
784
- "learning_rate": 5.6250000000000005e-05,
785
- "loss": 1184.2381,
786
- "step": 560
787
- },
788
- {
789
- "epoch": 4.343582235236701,
790
- "eval_runtime": 19.7443,
791
- "eval_samples_per_second": 21.88,
792
- "eval_steps_per_second": 5.47,
793
- "step": 560
794
- },
795
- {
796
- "epoch": 4.421669106881406,
797
- "grad_norm": 281127.5625,
798
- "learning_rate": 5.546875e-05,
799
- "loss": 969.7674,
800
- "step": 570
801
- },
802
- {
803
- "epoch": 4.421669106881406,
804
- "eval_runtime": 19.7635,
805
- "eval_samples_per_second": 21.858,
806
- "eval_steps_per_second": 5.465,
807
- "step": 570
808
- },
809
- {
810
- "epoch": 4.49975597852611,
811
- "grad_norm": 343310.84375,
812
- "learning_rate": 5.46875e-05,
813
- "loss": 938.8365,
814
- "step": 580
815
- },
816
- {
817
- "epoch": 4.49975597852611,
818
- "eval_runtime": 19.7819,
819
- "eval_samples_per_second": 21.838,
820
- "eval_steps_per_second": 5.46,
821
- "step": 580
822
- },
823
- {
824
- "epoch": 4.577842850170815,
825
- "grad_norm": 274241.34375,
826
- "learning_rate": 5.3906250000000006e-05,
827
- "loss": 1338.7786,
828
- "step": 590
829
- },
830
- {
831
- "epoch": 4.577842850170815,
832
- "eval_runtime": 19.7579,
833
- "eval_samples_per_second": 21.865,
834
- "eval_steps_per_second": 5.466,
835
- "step": 590
836
- },
837
- {
838
- "epoch": 4.65592972181552,
839
- "grad_norm": 416818.875,
840
- "learning_rate": 5.3125000000000004e-05,
841
- "loss": 1180.1237,
842
- "step": 600
843
- },
844
- {
845
- "epoch": 4.65592972181552,
846
- "eval_runtime": 19.7618,
847
- "eval_samples_per_second": 21.86,
848
- "eval_steps_per_second": 5.465,
849
- "step": 600
850
- },
851
- {
852
- "epoch": 4.7340165934602245,
853
- "grad_norm": 292501.71875,
854
- "learning_rate": 5.234375e-05,
855
- "loss": 1117.2909,
856
- "step": 610
857
- },
858
- {
859
- "epoch": 4.7340165934602245,
860
- "eval_runtime": 19.7565,
861
- "eval_samples_per_second": 21.866,
862
- "eval_steps_per_second": 5.467,
863
- "step": 610
864
- },
865
- {
866
- "epoch": 4.812103465104929,
867
- "grad_norm": 415090.75,
868
- "learning_rate": 5.15625e-05,
869
- "loss": 1297.7303,
870
- "step": 620
871
- },
872
- {
873
- "epoch": 4.812103465104929,
874
- "eval_runtime": 19.7543,
875
- "eval_samples_per_second": 21.869,
876
- "eval_steps_per_second": 5.467,
877
- "step": 620
878
- },
879
- {
880
- "epoch": 4.890190336749634,
881
- "grad_norm": 481957.6875,
882
- "learning_rate": 5.0781250000000004e-05,
883
- "loss": 1024.6031,
884
- "step": 630
885
- },
886
- {
887
- "epoch": 4.890190336749634,
888
- "eval_runtime": 19.7456,
889
- "eval_samples_per_second": 21.878,
890
- "eval_steps_per_second": 5.47,
891
- "step": 630
892
- },
893
- {
894
- "epoch": 4.968277208394339,
895
- "grad_norm": 760376.8125,
896
- "learning_rate": 5e-05,
897
- "loss": 1106.1675,
898
- "step": 640
899
- },
900
- {
901
- "epoch": 4.968277208394339,
902
- "eval_runtime": 19.7415,
903
- "eval_samples_per_second": 21.883,
904
- "eval_steps_per_second": 5.471,
905
- "step": 640
906
  }
907
  ],
908
- "logging_steps": 10,
909
  "max_steps": 1280,
910
  "num_input_tokens_seen": 0,
911
  "num_train_epochs": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 50,
6
+ "global_step": 129,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.3904343582235237,
13
+ "grad_norm": 316011.5625,
14
  "learning_rate": 9.609375e-05,
15
+ "loss": 14505.3388,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.3904343582235237,
20
+ "eval_runtime": 19.78,
21
+ "eval_samples_per_second": 21.84,
22
+ "eval_steps_per_second": 5.46,
23
  "step": 50
24
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  {
26
  "epoch": 0.7808687164470474,
27
+ "grad_norm": 484251.53125,
28
  "learning_rate": 9.21875e-05,
29
+ "loss": 4453.3172,
30
  "step": 100
31
  },
32
  {
33
  "epoch": 0.7808687164470474,
34
+ "eval_runtime": 19.7737,
35
+ "eval_samples_per_second": 21.847,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "eval_steps_per_second": 5.462,
37
+ "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  }
39
  ],
40
+ "logging_steps": 50,
41
  "max_steps": 1280,
42
  "num_input_tokens_seen": 0,
43
  "num_train_epochs": 10,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00e1815a75e87f99681c46c6d470e12a833e128941d3a562bbcb63c47c459934
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92b5f8d0521197ff491c9441a0c4a83340523a4da4d855bb0373622873120eb0
3
  size 5368