mgh6 commited on
Commit
deefae8
·
verified ·
1 Parent(s): 3acb744

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -16,6 +16,6 @@
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
- "transformers_version": "4.46.3",
20
  "vocab_size": 37
21
  }
 
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
+ "transformers_version": "4.47.0",
20
  "vocab_size": 37
21
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a664102d1b3ac5fecaadaaf9968da194c74be4d99b0e5648fc5ba6e5edbbd53
3
  size 2682482800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:359f03565ffde72ca2ba79491c705fe79c6910e9bd98dc4da3c2d72dd984114c
3
  size 2682482800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d77595fb689b5c663dc6d92d024119a8b254d933443aff4cf5d2c4e893f1277
3
  size 5365108834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9048fafeb2aa676460e8ea091a11b1525206d19f4ef4961093c98af204315634
3
  size 5365108834
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecdbbc81f1577c58564d520539f0ecd1e3c63b150d117eedae0016db0ec8a85c
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb357d1fd873b2e9e783702e673b2d6f1de3e1b1f85efe7ccee99b69a6a7abc9
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6602c18a1ebe894c1d51ce5c9cea3744db091c466423f123d4fa8b7754d9378a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fec4fd6bb533b427a0a9db7b7fd25ac543921c09d5d0a518d007b072a567e94
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,828 +1,108 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.998851234922459,
5
  "eval_steps": 50,
6
- "global_step": 2720,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.18380241240666284,
13
- "grad_norm": 96.79701232910156,
14
- "learning_rate": 9.816176470588235e-05,
15
- "loss": 1.4053,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.18380241240666284,
20
- "eval_loss": 1.3573765754699707,
21
- "eval_runtime": 116.0267,
22
- "eval_samples_per_second": 41.594,
23
- "eval_steps_per_second": 20.797,
24
  "step": 50
25
  },
26
  {
27
- "epoch": 0.3676048248133257,
28
- "grad_norm": 61.9161491394043,
29
- "learning_rate": 9.632352941176472e-05,
30
- "loss": 1.3129,
31
  "step": 100
32
  },
33
  {
34
- "epoch": 0.3676048248133257,
35
- "eval_loss": 1.3341882228851318,
36
- "eval_runtime": 115.8791,
37
- "eval_samples_per_second": 41.647,
38
- "eval_steps_per_second": 20.823,
39
  "step": 100
40
  },
41
  {
42
- "epoch": 0.5514072372199885,
43
- "grad_norm": 96.54769134521484,
44
- "learning_rate": 9.448529411764707e-05,
45
- "loss": 1.2777,
46
  "step": 150
47
  },
48
  {
49
- "epoch": 0.5514072372199885,
50
- "eval_loss": 1.3152097463607788,
51
- "eval_runtime": 115.9906,
52
- "eval_samples_per_second": 41.607,
53
- "eval_steps_per_second": 20.803,
54
  "step": 150
55
  },
56
  {
57
- "epoch": 0.7352096496266514,
58
- "grad_norm": 106.12042999267578,
59
- "learning_rate": 9.264705882352942e-05,
60
- "loss": 1.2403,
61
  "step": 200
62
  },
63
  {
64
- "epoch": 0.7352096496266514,
65
- "eval_loss": 1.3023375272750854,
66
- "eval_runtime": 116.029,
67
- "eval_samples_per_second": 41.593,
68
- "eval_steps_per_second": 20.797,
69
  "step": 200
70
  },
71
  {
72
- "epoch": 0.9190120620333142,
73
- "grad_norm": 67.17801666259766,
74
- "learning_rate": 9.080882352941177e-05,
75
- "loss": 1.2155,
76
  "step": 250
77
  },
78
  {
79
- "epoch": 0.9190120620333142,
80
- "eval_loss": 1.2906817197799683,
81
- "eval_runtime": 115.9297,
82
- "eval_samples_per_second": 41.629,
83
- "eval_steps_per_second": 20.814,
84
  "step": 250
85
  },
86
  {
87
- "epoch": 1.102814474439977,
88
- "grad_norm": 57.83748245239258,
89
- "learning_rate": 8.897058823529412e-05,
90
- "loss": 1.1858,
91
  "step": 300
92
  },
93
  {
94
- "epoch": 1.102814474439977,
95
- "eval_loss": 1.2834607362747192,
96
- "eval_runtime": 115.7904,
97
- "eval_samples_per_second": 41.679,
98
- "eval_steps_per_second": 20.839,
99
  "step": 300
100
- },
101
- {
102
- "epoch": 1.2866168868466399,
103
- "grad_norm": 104.54215240478516,
104
- "learning_rate": 8.713235294117648e-05,
105
- "loss": 1.1591,
106
- "step": 350
107
- },
108
- {
109
- "epoch": 1.2866168868466399,
110
- "eval_loss": 1.274283766746521,
111
- "eval_runtime": 116.0184,
112
- "eval_samples_per_second": 41.597,
113
- "eval_steps_per_second": 20.798,
114
- "step": 350
115
- },
116
- {
117
- "epoch": 1.4704192992533027,
118
- "grad_norm": 85.3061294555664,
119
- "learning_rate": 8.529411764705883e-05,
120
- "loss": 1.1408,
121
- "step": 400
122
- },
123
- {
124
- "epoch": 1.4704192992533027,
125
- "eval_loss": 1.2692538499832153,
126
- "eval_runtime": 116.0932,
127
- "eval_samples_per_second": 41.57,
128
- "eval_steps_per_second": 20.785,
129
- "step": 400
130
- },
131
- {
132
- "epoch": 1.6542217116599656,
133
- "grad_norm": 72.23489379882812,
134
- "learning_rate": 8.345588235294118e-05,
135
- "loss": 1.1256,
136
- "step": 450
137
- },
138
- {
139
- "epoch": 1.6542217116599656,
140
- "eval_loss": 1.2617342472076416,
141
- "eval_runtime": 116.0634,
142
- "eval_samples_per_second": 41.581,
143
- "eval_steps_per_second": 20.79,
144
- "step": 450
145
- },
146
- {
147
- "epoch": 1.8380241240666284,
148
- "grad_norm": 65.63114929199219,
149
- "learning_rate": 8.161764705882353e-05,
150
- "loss": 1.1098,
151
- "step": 500
152
- },
153
- {
154
- "epoch": 1.8380241240666284,
155
- "eval_loss": 1.2621153593063354,
156
- "eval_runtime": 115.9613,
157
- "eval_samples_per_second": 41.617,
158
- "eval_steps_per_second": 20.809,
159
- "step": 500
160
- },
161
- {
162
- "epoch": 2.021826536473291,
163
- "grad_norm": 45.61464309692383,
164
- "learning_rate": 7.977941176470589e-05,
165
- "loss": 1.0919,
166
- "step": 550
167
- },
168
- {
169
- "epoch": 2.021826536473291,
170
- "eval_loss": 1.2552005052566528,
171
- "eval_runtime": 117.239,
172
- "eval_samples_per_second": 41.164,
173
- "eval_steps_per_second": 20.582,
174
- "step": 550
175
- },
176
- {
177
- "epoch": 2.205628948879954,
178
- "grad_norm": 59.019195556640625,
179
- "learning_rate": 7.794117647058824e-05,
180
- "loss": 1.0693,
181
- "step": 600
182
- },
183
- {
184
- "epoch": 2.205628948879954,
185
- "eval_loss": 1.2541062831878662,
186
- "eval_runtime": 115.9595,
187
- "eval_samples_per_second": 41.618,
188
- "eval_steps_per_second": 20.809,
189
- "step": 600
190
- },
191
- {
192
- "epoch": 2.3894313612866167,
193
- "grad_norm": 43.13253402709961,
194
- "learning_rate": 7.610294117647059e-05,
195
- "loss": 1.0562,
196
- "step": 650
197
- },
198
- {
199
- "epoch": 2.3894313612866167,
200
- "eval_loss": 1.2531682252883911,
201
- "eval_runtime": 116.101,
202
- "eval_samples_per_second": 41.567,
203
- "eval_steps_per_second": 20.784,
204
- "step": 650
205
- },
206
- {
207
- "epoch": 2.5732337736932798,
208
- "grad_norm": 59.197181701660156,
209
- "learning_rate": 7.426470588235294e-05,
210
- "loss": 1.048,
211
- "step": 700
212
- },
213
- {
214
- "epoch": 2.5732337736932798,
215
- "eval_loss": 1.2469751834869385,
216
- "eval_runtime": 115.9924,
217
- "eval_samples_per_second": 41.606,
218
- "eval_steps_per_second": 20.803,
219
- "step": 700
220
- },
221
- {
222
- "epoch": 2.757036186099943,
223
- "grad_norm": 75.58405303955078,
224
- "learning_rate": 7.242647058823529e-05,
225
- "loss": 1.0436,
226
- "step": 750
227
- },
228
- {
229
- "epoch": 2.757036186099943,
230
- "eval_loss": 1.2442747354507446,
231
- "eval_runtime": 116.1229,
232
- "eval_samples_per_second": 41.559,
233
- "eval_steps_per_second": 20.78,
234
- "step": 750
235
- },
236
- {
237
- "epoch": 2.9408385985066055,
238
- "grad_norm": 80.94259643554688,
239
- "learning_rate": 7.058823529411765e-05,
240
- "loss": 1.0327,
241
- "step": 800
242
- },
243
- {
244
- "epoch": 2.9408385985066055,
245
- "eval_loss": 1.2350915670394897,
246
- "eval_runtime": 116.1113,
247
- "eval_samples_per_second": 41.564,
248
- "eval_steps_per_second": 20.782,
249
- "step": 800
250
- },
251
- {
252
- "epoch": 3.124641010913268,
253
- "grad_norm": 48.83946990966797,
254
- "learning_rate": 6.875e-05,
255
- "loss": 1.0081,
256
- "step": 850
257
- },
258
- {
259
- "epoch": 3.124641010913268,
260
- "eval_loss": 1.2360025644302368,
261
- "eval_runtime": 116.1076,
262
- "eval_samples_per_second": 41.565,
263
- "eval_steps_per_second": 20.782,
264
- "step": 850
265
- },
266
- {
267
- "epoch": 3.308443423319931,
268
- "grad_norm": 59.597232818603516,
269
- "learning_rate": 6.691176470588235e-05,
270
- "loss": 0.9992,
271
- "step": 900
272
- },
273
- {
274
- "epoch": 3.308443423319931,
275
- "eval_loss": 1.2394022941589355,
276
- "eval_runtime": 115.8965,
277
- "eval_samples_per_second": 41.641,
278
- "eval_steps_per_second": 20.82,
279
- "step": 900
280
- },
281
- {
282
- "epoch": 3.4922458357265938,
283
- "grad_norm": 49.224666595458984,
284
- "learning_rate": 6.507352941176472e-05,
285
- "loss": 0.9943,
286
- "step": 950
287
- },
288
- {
289
- "epoch": 3.4922458357265938,
290
- "eval_loss": 1.235238790512085,
291
- "eval_runtime": 116.2683,
292
- "eval_samples_per_second": 41.507,
293
- "eval_steps_per_second": 20.754,
294
- "step": 950
295
- },
296
- {
297
- "epoch": 3.676048248133257,
298
- "grad_norm": 65.07023620605469,
299
- "learning_rate": 6.323529411764705e-05,
300
- "loss": 0.9868,
301
- "step": 1000
302
- },
303
- {
304
- "epoch": 3.676048248133257,
305
- "eval_loss": 1.23443603515625,
306
- "eval_runtime": 116.2562,
307
- "eval_samples_per_second": 41.512,
308
- "eval_steps_per_second": 20.756,
309
- "step": 1000
310
- },
311
- {
312
- "epoch": 3.8598506605399194,
313
- "grad_norm": 49.963409423828125,
314
- "learning_rate": 6.139705882352942e-05,
315
- "loss": 0.9781,
316
- "step": 1050
317
- },
318
- {
319
- "epoch": 3.8598506605399194,
320
- "eval_loss": 1.232906460762024,
321
- "eval_runtime": 116.0309,
322
- "eval_samples_per_second": 41.592,
323
- "eval_steps_per_second": 20.796,
324
- "step": 1050
325
- },
326
- {
327
- "epoch": 4.043653072946582,
328
- "grad_norm": 57.1251335144043,
329
- "learning_rate": 5.9558823529411766e-05,
330
- "loss": 0.9697,
331
- "step": 1100
332
- },
333
- {
334
- "epoch": 4.043653072946582,
335
- "eval_loss": 1.2329473495483398,
336
- "eval_runtime": 117.8798,
337
- "eval_samples_per_second": 40.94,
338
- "eval_steps_per_second": 20.47,
339
- "step": 1100
340
- },
341
- {
342
- "epoch": 4.227455485353246,
343
- "grad_norm": 47.962928771972656,
344
- "learning_rate": 5.7720588235294116e-05,
345
- "loss": 0.9499,
346
- "step": 1150
347
- },
348
- {
349
- "epoch": 4.227455485353246,
350
- "eval_loss": 1.2345472574234009,
351
- "eval_runtime": 115.8584,
352
- "eval_samples_per_second": 41.654,
353
- "eval_steps_per_second": 20.827,
354
- "step": 1150
355
- },
356
- {
357
- "epoch": 4.411257897759908,
358
- "grad_norm": 43.172767639160156,
359
- "learning_rate": 5.588235294117647e-05,
360
- "loss": 0.9447,
361
- "step": 1200
362
- },
363
- {
364
- "epoch": 4.411257897759908,
365
- "eval_loss": 1.2309461832046509,
366
- "eval_runtime": 116.0943,
367
- "eval_samples_per_second": 41.57,
368
- "eval_steps_per_second": 20.785,
369
- "step": 1200
370
- },
371
- {
372
- "epoch": 4.595060310166571,
373
- "grad_norm": 89.4483413696289,
374
- "learning_rate": 5.404411764705882e-05,
375
- "loss": 0.9384,
376
- "step": 1250
377
- },
378
- {
379
- "epoch": 4.595060310166571,
380
- "eval_loss": 1.2274333238601685,
381
- "eval_runtime": 116.0025,
382
- "eval_samples_per_second": 41.603,
383
- "eval_steps_per_second": 20.801,
384
- "step": 1250
385
- },
386
- {
387
- "epoch": 4.778862722573233,
388
- "grad_norm": 61.61293029785156,
389
- "learning_rate": 5.2205882352941185e-05,
390
- "loss": 0.9369,
391
- "step": 1300
392
- },
393
- {
394
- "epoch": 4.778862722573233,
395
- "eval_loss": 1.2256300449371338,
396
- "eval_runtime": 115.8928,
397
- "eval_samples_per_second": 41.642,
398
- "eval_steps_per_second": 20.821,
399
- "step": 1300
400
- },
401
- {
402
- "epoch": 4.962665134979897,
403
- "grad_norm": 73.70500946044922,
404
- "learning_rate": 5.036764705882353e-05,
405
- "loss": 0.9301,
406
- "step": 1350
407
- },
408
- {
409
- "epoch": 4.962665134979897,
410
- "eval_loss": 1.2280672788619995,
411
- "eval_runtime": 116.1193,
412
- "eval_samples_per_second": 41.561,
413
- "eval_steps_per_second": 20.78,
414
- "step": 1350
415
- },
416
- {
417
- "epoch": 5.1464675473865595,
418
- "grad_norm": 71.07781219482422,
419
- "learning_rate": 4.8529411764705885e-05,
420
- "loss": 0.9172,
421
- "step": 1400
422
- },
423
- {
424
- "epoch": 5.1464675473865595,
425
- "eval_loss": 1.2280040979385376,
426
- "eval_runtime": 116.2535,
427
- "eval_samples_per_second": 41.513,
428
- "eval_steps_per_second": 20.756,
429
- "step": 1400
430
- },
431
- {
432
- "epoch": 5.330269959793222,
433
- "grad_norm": 59.39795684814453,
434
- "learning_rate": 4.669117647058824e-05,
435
- "loss": 0.9067,
436
- "step": 1450
437
- },
438
- {
439
- "epoch": 5.330269959793222,
440
- "eval_loss": 1.2288336753845215,
441
- "eval_runtime": 115.8973,
442
- "eval_samples_per_second": 41.64,
443
- "eval_steps_per_second": 20.82,
444
- "step": 1450
445
- },
446
- {
447
- "epoch": 5.514072372199885,
448
- "grad_norm": 55.501617431640625,
449
- "learning_rate": 4.485294117647059e-05,
450
- "loss": 0.9004,
451
- "step": 1500
452
- },
453
- {
454
- "epoch": 5.514072372199885,
455
- "eval_loss": 1.2262146472930908,
456
- "eval_runtime": 115.9478,
457
- "eval_samples_per_second": 41.622,
458
- "eval_steps_per_second": 20.811,
459
- "step": 1500
460
- },
461
- {
462
- "epoch": 5.697874784606548,
463
- "grad_norm": 69.02213287353516,
464
- "learning_rate": 4.301470588235295e-05,
465
- "loss": 0.9035,
466
- "step": 1550
467
- },
468
- {
469
- "epoch": 5.697874784606548,
470
- "eval_loss": 1.2236130237579346,
471
- "eval_runtime": 116.1221,
472
- "eval_samples_per_second": 41.56,
473
- "eval_steps_per_second": 20.78,
474
- "step": 1550
475
- },
476
- {
477
- "epoch": 5.881677197013211,
478
- "grad_norm": 45.09730529785156,
479
- "learning_rate": 4.11764705882353e-05,
480
- "loss": 0.8962,
481
- "step": 1600
482
- },
483
- {
484
- "epoch": 5.881677197013211,
485
- "eval_loss": 1.2278504371643066,
486
- "eval_runtime": 115.9916,
487
- "eval_samples_per_second": 41.606,
488
- "eval_steps_per_second": 20.803,
489
- "step": 1600
490
- },
491
- {
492
- "epoch": 6.0654796094198735,
493
- "grad_norm": 47.59389877319336,
494
- "learning_rate": 3.933823529411765e-05,
495
- "loss": 0.8925,
496
- "step": 1650
497
- },
498
- {
499
- "epoch": 6.0654796094198735,
500
- "eval_loss": 1.2326780557632446,
501
- "eval_runtime": 116.7248,
502
- "eval_samples_per_second": 41.345,
503
- "eval_steps_per_second": 20.673,
504
- "step": 1650
505
- },
506
- {
507
- "epoch": 6.249282021826536,
508
- "grad_norm": 45.18083190917969,
509
- "learning_rate": 3.7500000000000003e-05,
510
- "loss": 0.8771,
511
- "step": 1700
512
- },
513
- {
514
- "epoch": 6.249282021826536,
515
- "eval_loss": 1.2302526235580444,
516
- "eval_runtime": 115.8769,
517
- "eval_samples_per_second": 41.648,
518
- "eval_steps_per_second": 20.824,
519
- "step": 1700
520
- },
521
- {
522
- "epoch": 6.4330844342332,
523
- "grad_norm": 40.455318450927734,
524
- "learning_rate": 3.566176470588235e-05,
525
- "loss": 0.8743,
526
- "step": 1750
527
- },
528
- {
529
- "epoch": 6.4330844342332,
530
- "eval_loss": 1.2299398183822632,
531
- "eval_runtime": 115.9106,
532
- "eval_samples_per_second": 41.636,
533
- "eval_steps_per_second": 20.818,
534
- "step": 1750
535
- },
536
- {
537
- "epoch": 6.616886846639862,
538
- "grad_norm": 61.713111877441406,
539
- "learning_rate": 3.382352941176471e-05,
540
- "loss": 0.8735,
541
- "step": 1800
542
- },
543
- {
544
- "epoch": 6.616886846639862,
545
- "eval_loss": 1.2240906953811646,
546
- "eval_runtime": 116.0411,
547
- "eval_samples_per_second": 41.589,
548
- "eval_steps_per_second": 20.794,
549
- "step": 1800
550
- },
551
- {
552
- "epoch": 6.800689259046525,
553
- "grad_norm": 69.22649383544922,
554
- "learning_rate": 3.198529411764706e-05,
555
- "loss": 0.8648,
556
- "step": 1850
557
- },
558
- {
559
- "epoch": 6.800689259046525,
560
- "eval_loss": 1.2253305912017822,
561
- "eval_runtime": 115.8996,
562
- "eval_samples_per_second": 41.639,
563
- "eval_steps_per_second": 20.82,
564
- "step": 1850
565
- },
566
- {
567
- "epoch": 6.9844916714531875,
568
- "grad_norm": 65.4384994506836,
569
- "learning_rate": 3.0147058823529413e-05,
570
- "loss": 0.8649,
571
- "step": 1900
572
- },
573
- {
574
- "epoch": 6.9844916714531875,
575
- "eval_loss": 1.2292358875274658,
576
- "eval_runtime": 116.0285,
577
- "eval_samples_per_second": 41.593,
578
- "eval_steps_per_second": 20.797,
579
- "step": 1900
580
- },
581
- {
582
- "epoch": 7.168294083859851,
583
- "grad_norm": 46.392173767089844,
584
- "learning_rate": 2.8308823529411766e-05,
585
- "loss": 0.8475,
586
- "step": 1950
587
- },
588
- {
589
- "epoch": 7.168294083859851,
590
- "eval_loss": 1.2355010509490967,
591
- "eval_runtime": 116.0581,
592
- "eval_samples_per_second": 41.583,
593
- "eval_steps_per_second": 20.791,
594
- "step": 1950
595
- },
596
- {
597
- "epoch": 7.352096496266514,
598
- "grad_norm": 64.82035827636719,
599
- "learning_rate": 2.647058823529412e-05,
600
- "loss": 0.8496,
601
- "step": 2000
602
- },
603
- {
604
- "epoch": 7.352096496266514,
605
- "eval_loss": 1.2320975065231323,
606
- "eval_runtime": 115.9151,
607
- "eval_samples_per_second": 41.634,
608
- "eval_steps_per_second": 20.817,
609
- "step": 2000
610
- },
611
- {
612
- "epoch": 7.535898908673176,
613
- "grad_norm": 46.27527618408203,
614
- "learning_rate": 2.4632352941176472e-05,
615
- "loss": 0.8452,
616
- "step": 2050
617
- },
618
- {
619
- "epoch": 7.535898908673176,
620
- "eval_loss": 1.2323057651519775,
621
- "eval_runtime": 115.981,
622
- "eval_samples_per_second": 41.61,
623
- "eval_steps_per_second": 20.805,
624
- "step": 2050
625
- },
626
- {
627
- "epoch": 7.719701321079839,
628
- "grad_norm": 52.665435791015625,
629
- "learning_rate": 2.2794117647058825e-05,
630
- "loss": 0.8423,
631
- "step": 2100
632
- },
633
- {
634
- "epoch": 7.719701321079839,
635
- "eval_loss": 1.2322068214416504,
636
- "eval_runtime": 115.9188,
637
- "eval_samples_per_second": 41.633,
638
- "eval_steps_per_second": 20.816,
639
- "step": 2100
640
- },
641
- {
642
- "epoch": 7.903503733486502,
643
- "grad_norm": 74.63914489746094,
644
- "learning_rate": 2.0955882352941178e-05,
645
- "loss": 0.8421,
646
- "step": 2150
647
- },
648
- {
649
- "epoch": 7.903503733486502,
650
- "eval_loss": 1.2284280061721802,
651
- "eval_runtime": 115.8808,
652
- "eval_samples_per_second": 41.646,
653
- "eval_steps_per_second": 20.823,
654
- "step": 2150
655
- },
656
- {
657
- "epoch": 8.087306145893164,
658
- "grad_norm": 45.93680191040039,
659
- "learning_rate": 1.9117647058823528e-05,
660
- "loss": 0.8363,
661
- "step": 2200
662
- },
663
- {
664
- "epoch": 8.087306145893164,
665
- "eval_loss": 1.2366794347763062,
666
- "eval_runtime": 115.7882,
667
- "eval_samples_per_second": 41.68,
668
- "eval_steps_per_second": 20.84,
669
- "step": 2200
670
- },
671
- {
672
- "epoch": 8.271108558299828,
673
- "grad_norm": 73.48126983642578,
674
- "learning_rate": 1.7279411764705884e-05,
675
- "loss": 0.8333,
676
- "step": 2250
677
- },
678
- {
679
- "epoch": 8.271108558299828,
680
- "eval_loss": 1.2330245971679688,
681
- "eval_runtime": 115.848,
682
- "eval_samples_per_second": 41.658,
683
- "eval_steps_per_second": 20.829,
684
- "step": 2250
685
- },
686
- {
687
- "epoch": 8.454910970706491,
688
- "grad_norm": 76.98050689697266,
689
- "learning_rate": 1.5441176470588237e-05,
690
- "loss": 0.8267,
691
- "step": 2300
692
- },
693
- {
694
- "epoch": 8.454910970706491,
695
- "eval_loss": 1.2376160621643066,
696
- "eval_runtime": 115.9174,
697
- "eval_samples_per_second": 41.633,
698
- "eval_steps_per_second": 20.817,
699
- "step": 2300
700
- },
701
- {
702
- "epoch": 8.638713383113153,
703
- "grad_norm": 128.97714233398438,
704
- "learning_rate": 1.3602941176470587e-05,
705
- "loss": 0.8217,
706
- "step": 2350
707
- },
708
- {
709
- "epoch": 8.638713383113153,
710
- "eval_loss": 1.2340155839920044,
711
- "eval_runtime": 115.9099,
712
- "eval_samples_per_second": 41.636,
713
- "eval_steps_per_second": 20.818,
714
- "step": 2350
715
- },
716
- {
717
- "epoch": 8.822515795519816,
718
- "grad_norm": 55.4945182800293,
719
- "learning_rate": 1.1764705882352942e-05,
720
- "loss": 0.8221,
721
- "step": 2400
722
- },
723
- {
724
- "epoch": 8.822515795519816,
725
- "eval_loss": 1.2404063940048218,
726
- "eval_runtime": 116.0058,
727
- "eval_samples_per_second": 41.601,
728
- "eval_steps_per_second": 20.801,
729
- "step": 2400
730
- },
731
- {
732
- "epoch": 9.00631820792648,
733
- "grad_norm": 69.84994506835938,
734
- "learning_rate": 9.926470588235293e-06,
735
- "loss": 0.8202,
736
- "step": 2450
737
- },
738
- {
739
- "epoch": 9.00631820792648,
740
- "eval_loss": 1.2383702993392944,
741
- "eval_runtime": 116.7345,
742
- "eval_samples_per_second": 41.342,
743
- "eval_steps_per_second": 20.671,
744
- "step": 2450
745
- },
746
- {
747
- "epoch": 9.190120620333142,
748
- "grad_norm": 75.23961639404297,
749
- "learning_rate": 8.088235294117648e-06,
750
- "loss": 0.8143,
751
- "step": 2500
752
- },
753
- {
754
- "epoch": 9.190120620333142,
755
- "eval_loss": 1.236066460609436,
756
- "eval_runtime": 115.996,
757
- "eval_samples_per_second": 41.605,
758
- "eval_steps_per_second": 20.802,
759
- "step": 2500
760
- },
761
- {
762
- "epoch": 9.373923032739805,
763
- "grad_norm": 62.9267692565918,
764
- "learning_rate": 6.25e-06,
765
- "loss": 0.8106,
766
- "step": 2550
767
- },
768
- {
769
- "epoch": 9.373923032739805,
770
- "eval_loss": 1.23640775680542,
771
- "eval_runtime": 115.848,
772
- "eval_samples_per_second": 41.658,
773
- "eval_steps_per_second": 20.829,
774
- "step": 2550
775
- },
776
- {
777
- "epoch": 9.557725445146467,
778
- "grad_norm": 54.76566696166992,
779
- "learning_rate": 4.411764705882353e-06,
780
- "loss": 0.8144,
781
- "step": 2600
782
- },
783
- {
784
- "epoch": 9.557725445146467,
785
- "eval_loss": 1.2419943809509277,
786
- "eval_runtime": 116.0232,
787
- "eval_samples_per_second": 41.595,
788
- "eval_steps_per_second": 20.798,
789
- "step": 2600
790
- },
791
- {
792
- "epoch": 9.74152785755313,
793
- "grad_norm": 51.20401382446289,
794
- "learning_rate": 2.573529411764706e-06,
795
- "loss": 0.8061,
796
- "step": 2650
797
- },
798
- {
799
- "epoch": 9.74152785755313,
800
- "eval_loss": 1.2368206977844238,
801
- "eval_runtime": 116.069,
802
- "eval_samples_per_second": 41.579,
803
- "eval_steps_per_second": 20.789,
804
- "step": 2650
805
- },
806
- {
807
- "epoch": 9.925330269959794,
808
- "grad_norm": 52.51292419433594,
809
- "learning_rate": 7.352941176470589e-07,
810
- "loss": 0.8122,
811
- "step": 2700
812
- },
813
- {
814
- "epoch": 9.925330269959794,
815
- "eval_loss": 1.2355531454086304,
816
- "eval_runtime": 116.1057,
817
- "eval_samples_per_second": 41.566,
818
- "eval_steps_per_second": 20.783,
819
- "step": 2700
820
  }
821
  ],
822
  "logging_steps": 50,
823
- "max_steps": 2720,
824
  "num_input_tokens_seen": 0,
825
- "num_train_epochs": 10,
826
  "save_steps": 500,
827
  "stateful_callbacks": {
828
  "TrainerControl": {
@@ -831,12 +111,12 @@
831
  "should_evaluate": false,
832
  "should_log": false,
833
  "should_save": true,
834
- "should_training_stop": true
835
  },
836
  "attributes": {}
837
  }
838
  },
839
- "total_flos": 7.018175725001769e+17,
840
  "train_batch_size": 2,
841
  "trial_name": null,
842
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9979996362975087,
5
  "eval_steps": 50,
6
+ "global_step": 343,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.14548099654482632,
13
+ "grad_norm": 92.38280487060547,
14
+ "learning_rate": 9.927113702623908e-05,
15
+ "loss": 1.3958,
16
  "step": 50
17
  },
18
  {
19
+ "epoch": 0.14548099654482632,
20
+ "eval_loss": 1.3268945217132568,
21
+ "eval_runtime": 116.5998,
22
+ "eval_samples_per_second": 41.389,
23
+ "eval_steps_per_second": 20.695,
24
  "step": 50
25
  },
26
  {
27
+ "epoch": 0.29096199308965265,
28
+ "grad_norm": 75.30126190185547,
29
+ "learning_rate": 9.854227405247813e-05,
30
+ "loss": 1.3228,
31
  "step": 100
32
  },
33
  {
34
+ "epoch": 0.29096199308965265,
35
+ "eval_loss": 1.2796554565429688,
36
+ "eval_runtime": 116.9514,
37
+ "eval_samples_per_second": 41.265,
38
+ "eval_steps_per_second": 20.633,
39
  "step": 100
40
  },
41
  {
42
+ "epoch": 0.436442989634479,
43
+ "grad_norm": 113.90583038330078,
44
+ "learning_rate": 9.781341107871722e-05,
45
+ "loss": 1.2818,
46
  "step": 150
47
  },
48
  {
49
+ "epoch": 0.436442989634479,
50
+ "eval_loss": 1.2467392683029175,
51
+ "eval_runtime": 116.7421,
52
+ "eval_samples_per_second": 41.339,
53
+ "eval_steps_per_second": 20.669,
54
  "step": 150
55
  },
56
  {
57
+ "epoch": 0.5819239861793053,
58
+ "grad_norm": 75.85360717773438,
59
+ "learning_rate": 9.708454810495627e-05,
60
+ "loss": 1.255,
61
  "step": 200
62
  },
63
  {
64
+ "epoch": 0.5819239861793053,
65
+ "eval_loss": 1.2072206735610962,
66
+ "eval_runtime": 116.5054,
67
+ "eval_samples_per_second": 41.423,
68
+ "eval_steps_per_second": 20.711,
69
  "step": 200
70
  },
71
  {
72
+ "epoch": 0.7274049827241317,
73
+ "grad_norm": 62.863895416259766,
74
+ "learning_rate": 9.635568513119534e-05,
75
+ "loss": 1.2329,
76
  "step": 250
77
  },
78
  {
79
+ "epoch": 0.7274049827241317,
80
+ "eval_loss": 1.1830443143844604,
81
+ "eval_runtime": 116.5694,
82
+ "eval_samples_per_second": 41.4,
83
+ "eval_steps_per_second": 20.7,
84
  "step": 250
85
  },
86
  {
87
+ "epoch": 0.872885979268958,
88
+ "grad_norm": 67.02438354492188,
89
+ "learning_rate": 9.56268221574344e-05,
90
+ "loss": 1.2113,
91
  "step": 300
92
  },
93
  {
94
+ "epoch": 0.872885979268958,
95
+ "eval_loss": 1.1557544469833374,
96
+ "eval_runtime": 116.6632,
97
+ "eval_samples_per_second": 41.367,
98
+ "eval_steps_per_second": 20.683,
99
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }
101
  ],
102
  "logging_steps": 50,
103
+ "max_steps": 6860,
104
  "num_input_tokens_seen": 0,
105
+ "num_train_epochs": 20,
106
  "save_steps": 500,
107
  "stateful_callbacks": {
108
  "TrainerControl": {
 
111
  "should_evaluate": false,
112
  "should_log": false,
113
  "should_save": true,
114
+ "should_training_stop": false
115
  },
116
  "attributes": {}
117
  }
118
  },
119
+ "total_flos": 8.9337717422293e+16,
120
  "train_batch_size": 2,
121
  "trial_name": null,
122
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd647ab4c4e535f0fc2ef28d563af1b5f95fbaaecd78a15f29102b61aeb873fd
3
- size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7fd3757bfae30126ccb3f467f9722223f1ea2ad8678404c8edb0c2dfc443523
3
+ size 5368