mgh6 commited on
Commit
d16603a
·
verified ·
1 Parent(s): 182e744

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -16,6 +16,6 @@
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
- "transformers_version": "4.47.0",
20
  "vocab_size": 37
21
  }
 
16
  "norm_eps": 1e-05,
17
  "swiglu_multiple_of": 256,
18
  "torch_dtype": "float32",
19
+ "transformers_version": "4.49.0",
20
  "vocab_size": 37
21
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99e82e82f374c2673ef04a502ab788d5b3699ba02ae9cbb612822d23f1048aaa
3
  size 2682482800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d8e1016a9b1bb6828135669bed415800d12ec729a55e57ed78cb1ee7ab1bbc2
3
  size 2682482800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11a72e36fd2dbdba8586fcbf48397a69a66a780b5217bacb7a4c129bba516b9e
3
  size 5365108834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd973e6941dbfc054a82a8fc125c7daff458020cf56c2f24a45ed7a61ec17d74
3
  size 5365108834
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2168e6be520a803e330b14854aa047c840fbbc36e1cd7f9a8956c981c5afc55f
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08baa7817becd0350efcb73cd20230a3510c2a4c44eabb35442e644589d91b4e
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6602c18a1ebe894c1d51ce5c9cea3744db091c466423f123d4fa8b7754d9378a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8196dfb311939b1d2d31afb1d2e2c773958196863e2aef7e228bd8c79258297b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,826 +1,91 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.966800689259047,
5
  "eval_steps": 50,
6
- "global_step": 2720,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.18380241240666284,
13
- "grad_norm": 96.95903778076172,
14
- "learning_rate": 9.816176470588235e-05,
15
- "loss": 1.4053,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.18380241240666284,
20
- "eval_loss": 1.357384204864502,
21
- "eval_runtime": 116.0223,
22
- "eval_samples_per_second": 41.595,
23
- "eval_steps_per_second": 20.798,
24
  "step": 50
25
  },
26
  {
27
- "epoch": 0.3676048248133257,
28
- "grad_norm": 62.13881301879883,
29
- "learning_rate": 9.632352941176472e-05,
30
- "loss": 1.3129,
31
  "step": 100
32
  },
33
  {
34
- "epoch": 0.3676048248133257,
35
- "eval_loss": 1.3342629671096802,
36
- "eval_runtime": 116.0756,
37
- "eval_samples_per_second": 41.576,
38
- "eval_steps_per_second": 20.788,
39
  "step": 100
40
  },
41
  {
42
- "epoch": 0.5514072372199885,
43
- "grad_norm": 94.01299285888672,
44
- "learning_rate": 9.448529411764707e-05,
45
- "loss": 1.2775,
46
  "step": 150
47
  },
48
  {
49
- "epoch": 0.5514072372199885,
50
- "eval_loss": 1.315340280532837,
51
- "eval_runtime": 116.2655,
52
- "eval_samples_per_second": 41.508,
53
- "eval_steps_per_second": 20.754,
54
  "step": 150
55
  },
56
  {
57
- "epoch": 0.7352096496266514,
58
- "grad_norm": 106.19926452636719,
59
- "learning_rate": 9.264705882352942e-05,
60
- "loss": 1.2405,
61
  "step": 200
62
  },
63
  {
64
- "epoch": 0.7352096496266514,
65
- "eval_loss": 1.3019272089004517,
66
- "eval_runtime": 116.2502,
67
- "eval_samples_per_second": 41.514,
68
- "eval_steps_per_second": 20.757,
69
  "step": 200
70
  },
71
  {
72
- "epoch": 0.9190120620333142,
73
- "grad_norm": 65.5792465209961,
74
- "learning_rate": 9.080882352941177e-05,
75
- "loss": 1.2157,
76
  "step": 250
77
  },
78
  {
79
- "epoch": 0.9190120620333142,
80
- "eval_loss": 1.290924310684204,
81
- "eval_runtime": 116.1975,
82
- "eval_samples_per_second": 41.533,
83
- "eval_steps_per_second": 20.766,
84
  "step": 250
85
- },
86
- {
87
- "epoch": 1.0992533026995979,
88
- "grad_norm": 54.570987701416016,
89
- "learning_rate": 8.897058823529412e-05,
90
- "loss": 1.1632,
91
- "step": 300
92
- },
93
- {
94
- "epoch": 1.0992533026995979,
95
- "eval_loss": 1.2854100465774536,
96
- "eval_runtime": 116.058,
97
- "eval_samples_per_second": 41.583,
98
- "eval_steps_per_second": 20.791,
99
- "step": 300
100
- },
101
- {
102
- "epoch": 1.2830557151062607,
103
- "grad_norm": 62.69035720825195,
104
- "learning_rate": 8.713235294117648e-05,
105
- "loss": 1.1598,
106
- "step": 350
107
- },
108
- {
109
- "epoch": 1.2830557151062607,
110
- "eval_loss": 1.2804261445999146,
111
- "eval_runtime": 116.1289,
112
- "eval_samples_per_second": 41.557,
113
- "eval_steps_per_second": 20.779,
114
- "step": 350
115
- },
116
- {
117
- "epoch": 1.4668581275129235,
118
- "grad_norm": 66.37086486816406,
119
- "learning_rate": 8.529411764705883e-05,
120
- "loss": 1.1419,
121
- "step": 400
122
- },
123
- {
124
- "epoch": 1.4668581275129235,
125
- "eval_loss": 1.272080898284912,
126
- "eval_runtime": 116.1097,
127
- "eval_samples_per_second": 41.564,
128
- "eval_steps_per_second": 20.782,
129
- "step": 400
130
- },
131
- {
132
- "epoch": 1.6506605399195864,
133
- "grad_norm": 55.69127655029297,
134
- "learning_rate": 8.345588235294118e-05,
135
- "loss": 1.1256,
136
- "step": 450
137
- },
138
- {
139
- "epoch": 1.6506605399195864,
140
- "eval_loss": 1.2628560066223145,
141
- "eval_runtime": 116.0668,
142
- "eval_samples_per_second": 41.58,
143
- "eval_steps_per_second": 20.79,
144
- "step": 450
145
- },
146
- {
147
- "epoch": 1.8344629523262492,
148
- "grad_norm": 84.45687103271484,
149
- "learning_rate": 8.161764705882353e-05,
150
- "loss": 1.1115,
151
- "step": 500
152
- },
153
- {
154
- "epoch": 1.8344629523262492,
155
- "eval_loss": 1.2613238096237183,
156
- "eval_runtime": 116.2349,
157
- "eval_samples_per_second": 41.519,
158
- "eval_steps_per_second": 20.76,
159
- "step": 500
160
- },
161
- {
162
- "epoch": 2.014704192992533,
163
- "grad_norm": 66.61148834228516,
164
- "learning_rate": 7.977941176470589e-05,
165
- "loss": 1.0706,
166
- "step": 550
167
- },
168
- {
169
- "epoch": 2.014704192992533,
170
- "eval_loss": 1.2579104900360107,
171
- "eval_runtime": 117.3176,
172
- "eval_samples_per_second": 41.136,
173
- "eval_steps_per_second": 20.568,
174
- "step": 550
175
- },
176
- {
177
- "epoch": 2.1985066053991957,
178
- "grad_norm": 83.05467987060547,
179
- "learning_rate": 7.794117647058824e-05,
180
- "loss": 1.0691,
181
- "step": 600
182
- },
183
- {
184
- "epoch": 2.1985066053991957,
185
- "eval_loss": 1.2582261562347412,
186
- "eval_runtime": 116.1323,
187
- "eval_samples_per_second": 41.556,
188
- "eval_steps_per_second": 20.778,
189
- "step": 600
190
- },
191
- {
192
- "epoch": 2.382309017805859,
193
- "grad_norm": 71.0223617553711,
194
- "learning_rate": 7.610294117647059e-05,
195
- "loss": 1.0573,
196
- "step": 650
197
- },
198
- {
199
- "epoch": 2.382309017805859,
200
- "eval_loss": 1.2502797842025757,
201
- "eval_runtime": 116.1247,
202
- "eval_samples_per_second": 41.559,
203
- "eval_steps_per_second": 20.779,
204
- "step": 650
205
- },
206
- {
207
- "epoch": 2.5661114302125214,
208
- "grad_norm": 64.11682891845703,
209
- "learning_rate": 7.426470588235294e-05,
210
- "loss": 1.0459,
211
- "step": 700
212
- },
213
- {
214
- "epoch": 2.5661114302125214,
215
- "eval_loss": 1.2464367151260376,
216
- "eval_runtime": 116.0733,
217
- "eval_samples_per_second": 41.577,
218
- "eval_steps_per_second": 20.789,
219
- "step": 700
220
- },
221
- {
222
- "epoch": 2.7499138426191845,
223
- "grad_norm": 90.718505859375,
224
- "learning_rate": 7.242647058823529e-05,
225
- "loss": 1.0444,
226
- "step": 750
227
- },
228
- {
229
- "epoch": 2.7499138426191845,
230
- "eval_loss": 1.238742470741272,
231
- "eval_runtime": 116.1005,
232
- "eval_samples_per_second": 41.567,
233
- "eval_steps_per_second": 20.784,
234
- "step": 750
235
- },
236
- {
237
- "epoch": 2.933716255025847,
238
- "grad_norm": 52.93131637573242,
239
- "learning_rate": 7.058823529411765e-05,
240
- "loss": 1.033,
241
- "step": 800
242
- },
243
- {
244
- "epoch": 2.933716255025847,
245
- "eval_loss": 1.2367525100708008,
246
- "eval_runtime": 115.9357,
247
- "eval_samples_per_second": 41.627,
248
- "eval_steps_per_second": 20.813,
249
- "step": 800
250
- },
251
- {
252
- "epoch": 3.113957495692131,
253
- "grad_norm": 47.737281799316406,
254
- "learning_rate": 6.875e-05,
255
- "loss": 0.9902,
256
- "step": 850
257
- },
258
- {
259
- "epoch": 3.113957495692131,
260
- "eval_loss": 1.2378348112106323,
261
- "eval_runtime": 115.9587,
262
- "eval_samples_per_second": 41.618,
263
- "eval_steps_per_second": 20.809,
264
- "step": 850
265
- },
266
- {
267
- "epoch": 3.2977599080987936,
268
- "grad_norm": 64.8248519897461,
269
- "learning_rate": 6.691176470588235e-05,
270
- "loss": 0.9991,
271
- "step": 900
272
- },
273
- {
274
- "epoch": 3.2977599080987936,
275
- "eval_loss": 1.2380481958389282,
276
- "eval_runtime": 116.1261,
277
- "eval_samples_per_second": 41.558,
278
- "eval_steps_per_second": 20.779,
279
- "step": 900
280
- },
281
- {
282
- "epoch": 3.4815623205054567,
283
- "grad_norm": 44.85393524169922,
284
- "learning_rate": 6.507352941176472e-05,
285
- "loss": 0.9954,
286
- "step": 950
287
- },
288
- {
289
- "epoch": 3.4815623205054567,
290
- "eval_loss": 1.2317270040512085,
291
- "eval_runtime": 116.1731,
292
- "eval_samples_per_second": 41.541,
293
- "eval_steps_per_second": 20.771,
294
- "step": 950
295
- },
296
- {
297
- "epoch": 3.6653647329121197,
298
- "grad_norm": 95.18327331542969,
299
- "learning_rate": 6.323529411764705e-05,
300
- "loss": 0.9896,
301
- "step": 1000
302
- },
303
- {
304
- "epoch": 3.6653647329121197,
305
- "eval_loss": 1.2345850467681885,
306
- "eval_runtime": 116.2127,
307
- "eval_samples_per_second": 41.527,
308
- "eval_steps_per_second": 20.764,
309
- "step": 1000
310
- },
311
- {
312
- "epoch": 3.8491671453187823,
313
- "grad_norm": 53.003849029541016,
314
- "learning_rate": 6.139705882352942e-05,
315
- "loss": 0.9786,
316
- "step": 1050
317
- },
318
- {
319
- "epoch": 3.8491671453187823,
320
- "eval_loss": 1.235013723373413,
321
- "eval_runtime": 116.1383,
322
- "eval_samples_per_second": 41.554,
323
- "eval_steps_per_second": 20.777,
324
- "step": 1050
325
- },
326
- {
327
- "epoch": 4.029408385985066,
328
- "grad_norm": 60.95920181274414,
329
- "learning_rate": 5.9558823529411766e-05,
330
- "loss": 0.953,
331
- "step": 1100
332
- },
333
- {
334
- "epoch": 4.029408385985066,
335
- "eval_loss": 1.2303814888000488,
336
- "eval_runtime": 116.9283,
337
- "eval_samples_per_second": 41.273,
338
- "eval_steps_per_second": 20.637,
339
- "step": 1100
340
- },
341
- {
342
- "epoch": 4.213210798391729,
343
- "grad_norm": 94.7210922241211,
344
- "learning_rate": 5.7720588235294116e-05,
345
- "loss": 0.9526,
346
- "step": 1150
347
- },
348
- {
349
- "epoch": 4.213210798391729,
350
- "eval_loss": 1.2337546348571777,
351
- "eval_runtime": 116.1668,
352
- "eval_samples_per_second": 41.544,
353
- "eval_steps_per_second": 20.772,
354
- "step": 1150
355
- },
356
- {
357
- "epoch": 4.3970132107983915,
358
- "grad_norm": 42.170989990234375,
359
- "learning_rate": 5.588235294117647e-05,
360
- "loss": 0.9448,
361
- "step": 1200
362
- },
363
- {
364
- "epoch": 4.3970132107983915,
365
- "eval_loss": 1.228541612625122,
366
- "eval_runtime": 116.2415,
367
- "eval_samples_per_second": 41.517,
368
- "eval_steps_per_second": 20.759,
369
- "step": 1200
370
- },
371
- {
372
- "epoch": 4.580815623205055,
373
- "grad_norm": 60.85408401489258,
374
- "learning_rate": 5.404411764705882e-05,
375
- "loss": 0.9408,
376
- "step": 1250
377
- },
378
- {
379
- "epoch": 4.580815623205055,
380
- "eval_loss": 1.2270257472991943,
381
- "eval_runtime": 116.2096,
382
- "eval_samples_per_second": 41.528,
383
- "eval_steps_per_second": 20.764,
384
- "step": 1250
385
- },
386
- {
387
- "epoch": 4.764618035611718,
388
- "grad_norm": 59.16058349609375,
389
- "learning_rate": 5.2205882352941185e-05,
390
- "loss": 0.9364,
391
- "step": 1300
392
- },
393
- {
394
- "epoch": 4.764618035611718,
395
- "eval_loss": 1.2274161577224731,
396
- "eval_runtime": 116.2519,
397
- "eval_samples_per_second": 41.513,
398
- "eval_steps_per_second": 20.757,
399
- "step": 1300
400
- },
401
- {
402
- "epoch": 4.94842044801838,
403
- "grad_norm": 94.78569030761719,
404
- "learning_rate": 5.036764705882353e-05,
405
- "loss": 0.9319,
406
- "step": 1350
407
- },
408
- {
409
- "epoch": 4.94842044801838,
410
- "eval_loss": 1.228610873222351,
411
- "eval_runtime": 116.0677,
412
- "eval_samples_per_second": 41.579,
413
- "eval_steps_per_second": 20.79,
414
- "step": 1350
415
- },
416
- {
417
- "epoch": 5.128661688684664,
418
- "grad_norm": 56.210121154785156,
419
- "learning_rate": 4.8529411764705885e-05,
420
- "loss": 0.9023,
421
- "step": 1400
422
- },
423
- {
424
- "epoch": 5.128661688684664,
425
- "eval_loss": 1.2338696718215942,
426
- "eval_runtime": 116.0945,
427
- "eval_samples_per_second": 41.57,
428
- "eval_steps_per_second": 20.785,
429
- "step": 1400
430
- },
431
- {
432
- "epoch": 5.312464101091327,
433
- "grad_norm": 54.96003723144531,
434
- "learning_rate": 4.669117647058824e-05,
435
- "loss": 0.9067,
436
- "step": 1450
437
- },
438
- {
439
- "epoch": 5.312464101091327,
440
- "eval_loss": 1.231054663658142,
441
- "eval_runtime": 115.9656,
442
- "eval_samples_per_second": 41.616,
443
- "eval_steps_per_second": 20.808,
444
- "step": 1450
445
- },
446
- {
447
- "epoch": 5.496266513497989,
448
- "grad_norm": 45.37810516357422,
449
- "learning_rate": 4.485294117647059e-05,
450
- "loss": 0.9032,
451
- "step": 1500
452
- },
453
- {
454
- "epoch": 5.496266513497989,
455
- "eval_loss": 1.2278664112091064,
456
- "eval_runtime": 116.0475,
457
- "eval_samples_per_second": 41.586,
458
- "eval_steps_per_second": 20.793,
459
- "step": 1500
460
- },
461
- {
462
- "epoch": 5.680068925904653,
463
- "grad_norm": 51.236480712890625,
464
- "learning_rate": 4.301470588235295e-05,
465
- "loss": 0.9032,
466
- "step": 1550
467
- },
468
- {
469
- "epoch": 5.680068925904653,
470
- "eval_loss": 1.2236429452896118,
471
- "eval_runtime": 116.3342,
472
- "eval_samples_per_second": 41.484,
473
- "eval_steps_per_second": 20.742,
474
- "step": 1550
475
- },
476
- {
477
- "epoch": 5.8638713383113155,
478
- "grad_norm": 40.550933837890625,
479
- "learning_rate": 4.11764705882353e-05,
480
- "loss": 0.896,
481
- "step": 1600
482
- },
483
- {
484
- "epoch": 5.8638713383113155,
485
- "eval_loss": 1.2268708944320679,
486
- "eval_runtime": 116.1038,
487
- "eval_samples_per_second": 41.566,
488
- "eval_steps_per_second": 20.783,
489
- "step": 1600
490
- },
491
- {
492
- "epoch": 6.044112578977599,
493
- "grad_norm": 59.42768859863281,
494
- "learning_rate": 3.933823529411765e-05,
495
- "loss": 0.8781,
496
- "step": 1650
497
- },
498
- {
499
- "epoch": 6.044112578977599,
500
- "eval_loss": 1.233597993850708,
501
- "eval_runtime": 117.1571,
502
- "eval_samples_per_second": 41.193,
503
- "eval_steps_per_second": 20.596,
504
- "step": 1650
505
- },
506
- {
507
- "epoch": 6.227914991384262,
508
- "grad_norm": 68.26610565185547,
509
- "learning_rate": 3.7500000000000003e-05,
510
- "loss": 0.8804,
511
- "step": 1700
512
- },
513
- {
514
- "epoch": 6.227914991384262,
515
- "eval_loss": 1.2279460430145264,
516
- "eval_runtime": 116.1011,
517
- "eval_samples_per_second": 41.567,
518
- "eval_steps_per_second": 20.784,
519
- "step": 1700
520
- },
521
- {
522
- "epoch": 6.411717403790925,
523
- "grad_norm": 77.21823120117188,
524
- "learning_rate": 3.566176470588235e-05,
525
- "loss": 0.8733,
526
- "step": 1750
527
- },
528
- {
529
- "epoch": 6.411717403790925,
530
- "eval_loss": 1.2353451251983643,
531
- "eval_runtime": 116.0518,
532
- "eval_samples_per_second": 41.585,
533
- "eval_steps_per_second": 20.792,
534
- "step": 1750
535
- },
536
- {
537
- "epoch": 6.595519816197587,
538
- "grad_norm": 49.22051239013672,
539
- "learning_rate": 3.382352941176471e-05,
540
- "loss": 0.875,
541
- "step": 1800
542
- },
543
- {
544
- "epoch": 6.595519816197587,
545
- "eval_loss": 1.2324572801589966,
546
- "eval_runtime": 116.0982,
547
- "eval_samples_per_second": 41.568,
548
- "eval_steps_per_second": 20.784,
549
- "step": 1800
550
- },
551
- {
552
- "epoch": 6.779322228604251,
553
- "grad_norm": 61.27114486694336,
554
- "learning_rate": 3.198529411764706e-05,
555
- "loss": 0.8634,
556
- "step": 1850
557
- },
558
- {
559
- "epoch": 6.779322228604251,
560
- "eval_loss": 1.2263100147247314,
561
- "eval_runtime": 116.0582,
562
- "eval_samples_per_second": 41.583,
563
- "eval_steps_per_second": 20.791,
564
- "step": 1850
565
- },
566
- {
567
- "epoch": 6.963124641010913,
568
- "grad_norm": 53.27342224121094,
569
- "learning_rate": 3.0147058823529413e-05,
570
- "loss": 0.8647,
571
- "step": 1900
572
- },
573
- {
574
- "epoch": 6.963124641010913,
575
- "eval_loss": 1.2306259870529175,
576
- "eval_runtime": 116.21,
577
- "eval_samples_per_second": 41.528,
578
- "eval_steps_per_second": 20.764,
579
- "step": 1900
580
- },
581
- {
582
- "epoch": 7.143365881677197,
583
- "grad_norm": 56.99700927734375,
584
- "learning_rate": 2.8308823529411766e-05,
585
- "loss": 0.8335,
586
- "step": 1950
587
- },
588
- {
589
- "epoch": 7.143365881677197,
590
- "eval_loss": 1.2323832511901855,
591
- "eval_runtime": 116.0282,
592
- "eval_samples_per_second": 41.593,
593
- "eval_steps_per_second": 20.797,
594
- "step": 1950
595
- },
596
- {
597
- "epoch": 7.32716829408386,
598
- "grad_norm": 111.48177337646484,
599
- "learning_rate": 2.647058823529412e-05,
600
- "loss": 0.8489,
601
- "step": 2000
602
- },
603
- {
604
- "epoch": 7.32716829408386,
605
- "eval_loss": 1.2314597368240356,
606
- "eval_runtime": 116.1391,
607
- "eval_samples_per_second": 41.554,
608
- "eval_steps_per_second": 20.777,
609
- "step": 2000
610
- },
611
- {
612
- "epoch": 7.5109707064905225,
613
- "grad_norm": 44.07224655151367,
614
- "learning_rate": 2.4632352941176472e-05,
615
- "loss": 0.8473,
616
- "step": 2050
617
- },
618
- {
619
- "epoch": 7.5109707064905225,
620
- "eval_loss": 1.2360129356384277,
621
- "eval_runtime": 116.3906,
622
- "eval_samples_per_second": 41.464,
623
- "eval_steps_per_second": 20.732,
624
- "step": 2050
625
- },
626
- {
627
- "epoch": 7.694773118897185,
628
- "grad_norm": 58.74856948852539,
629
- "learning_rate": 2.2794117647058825e-05,
630
- "loss": 0.8422,
631
- "step": 2100
632
- },
633
- {
634
- "epoch": 7.694773118897185,
635
- "eval_loss": 1.23045015335083,
636
- "eval_runtime": 116.4238,
637
- "eval_samples_per_second": 41.452,
638
- "eval_steps_per_second": 20.726,
639
- "step": 2100
640
- },
641
- {
642
- "epoch": 7.878575531303849,
643
- "grad_norm": 42.6165771484375,
644
- "learning_rate": 2.0955882352941178e-05,
645
- "loss": 0.8414,
646
- "step": 2150
647
- },
648
- {
649
- "epoch": 7.878575531303849,
650
- "eval_loss": 1.232067346572876,
651
- "eval_runtime": 115.9906,
652
- "eval_samples_per_second": 41.607,
653
- "eval_steps_per_second": 20.803,
654
- "step": 2150
655
- },
656
- {
657
- "epoch": 8.058816771970132,
658
- "grad_norm": 80.5433349609375,
659
- "learning_rate": 1.9117647058823528e-05,
660
- "loss": 0.8227,
661
- "step": 2200
662
- },
663
- {
664
- "epoch": 8.058816771970132,
665
- "eval_loss": 1.2389429807662964,
666
- "eval_runtime": 117.23,
667
- "eval_samples_per_second": 41.167,
668
- "eval_steps_per_second": 20.583,
669
- "step": 2200
670
- },
671
- {
672
- "epoch": 8.242619184376794,
673
- "grad_norm": 45.97893142700195,
674
- "learning_rate": 1.7279411764705884e-05,
675
- "loss": 0.8316,
676
- "step": 2250
677
- },
678
- {
679
- "epoch": 8.242619184376794,
680
- "eval_loss": 1.2351105213165283,
681
- "eval_runtime": 116.2345,
682
- "eval_samples_per_second": 41.52,
683
- "eval_steps_per_second": 20.76,
684
- "step": 2250
685
- },
686
- {
687
- "epoch": 8.426421596783458,
688
- "grad_norm": 68.8030014038086,
689
- "learning_rate": 1.5441176470588237e-05,
690
- "loss": 0.8299,
691
- "step": 2300
692
- },
693
- {
694
- "epoch": 8.426421596783458,
695
- "eval_loss": 1.2383313179016113,
696
- "eval_runtime": 115.9203,
697
- "eval_samples_per_second": 41.632,
698
- "eval_steps_per_second": 20.816,
699
- "step": 2300
700
- },
701
- {
702
- "epoch": 8.610224009190121,
703
- "grad_norm": 48.16875076293945,
704
- "learning_rate": 1.3602941176470587e-05,
705
- "loss": 0.822,
706
- "step": 2350
707
- },
708
- {
709
- "epoch": 8.610224009190121,
710
- "eval_loss": 1.2370705604553223,
711
- "eval_runtime": 116.2999,
712
- "eval_samples_per_second": 41.496,
713
- "eval_steps_per_second": 20.748,
714
- "step": 2350
715
- },
716
- {
717
- "epoch": 8.794026421596783,
718
- "grad_norm": 49.53213119506836,
719
- "learning_rate": 1.1764705882352942e-05,
720
- "loss": 0.8251,
721
- "step": 2400
722
- },
723
- {
724
- "epoch": 8.794026421596783,
725
- "eval_loss": 1.2367668151855469,
726
- "eval_runtime": 116.3102,
727
- "eval_samples_per_second": 41.493,
728
- "eval_steps_per_second": 20.746,
729
- "step": 2400
730
- },
731
- {
732
- "epoch": 8.977828834003446,
733
- "grad_norm": 42.136714935302734,
734
- "learning_rate": 9.926470588235293e-06,
735
- "loss": 0.8225,
736
- "step": 2450
737
- },
738
- {
739
- "epoch": 8.977828834003446,
740
- "eval_loss": 1.2319527864456177,
741
- "eval_runtime": 116.086,
742
- "eval_samples_per_second": 41.573,
743
- "eval_steps_per_second": 20.786,
744
- "step": 2450
745
- },
746
- {
747
- "epoch": 9.15807007466973,
748
- "grad_norm": 55.69232940673828,
749
- "learning_rate": 8.088235294117648e-06,
750
- "loss": 0.7995,
751
- "step": 2500
752
- },
753
- {
754
- "epoch": 9.15807007466973,
755
- "eval_loss": 1.2360199689865112,
756
- "eval_runtime": 116.0847,
757
- "eval_samples_per_second": 41.573,
758
- "eval_steps_per_second": 20.787,
759
- "step": 2500
760
- },
761
- {
762
- "epoch": 9.341872487076392,
763
- "grad_norm": 62.24937438964844,
764
- "learning_rate": 6.25e-06,
765
- "loss": 0.8149,
766
- "step": 2550
767
- },
768
- {
769
- "epoch": 9.341872487076392,
770
- "eval_loss": 1.2363650798797607,
771
- "eval_runtime": 116.0508,
772
- "eval_samples_per_second": 41.585,
773
- "eval_steps_per_second": 20.793,
774
- "step": 2550
775
- },
776
- {
777
- "epoch": 9.525674899483056,
778
- "grad_norm": 50.01460266113281,
779
- "learning_rate": 4.411764705882353e-06,
780
- "loss": 0.8146,
781
- "step": 2600
782
- },
783
- {
784
- "epoch": 9.525674899483056,
785
- "eval_loss": 1.2402119636535645,
786
- "eval_runtime": 116.1442,
787
- "eval_samples_per_second": 41.552,
788
- "eval_steps_per_second": 20.776,
789
- "step": 2600
790
- },
791
- {
792
- "epoch": 9.709477311889719,
793
- "grad_norm": 60.61581802368164,
794
- "learning_rate": 2.573529411764706e-06,
795
- "loss": 0.8075,
796
- "step": 2650
797
- },
798
- {
799
- "epoch": 9.709477311889719,
800
- "eval_loss": 1.2341493368148804,
801
- "eval_runtime": 116.3546,
802
- "eval_samples_per_second": 41.477,
803
- "eval_steps_per_second": 20.738,
804
- "step": 2650
805
- },
806
- {
807
- "epoch": 9.89327972429638,
808
- "grad_norm": 54.73764419555664,
809
- "learning_rate": 7.352941176470589e-07,
810
- "loss": 0.8108,
811
- "step": 2700
812
- },
813
- {
814
- "epoch": 9.89327972429638,
815
- "eval_loss": 1.2355538606643677,
816
- "eval_runtime": 116.1282,
817
- "eval_samples_per_second": 41.558,
818
- "eval_steps_per_second": 20.779,
819
- "step": 2700
820
  }
821
  ],
822
  "logging_steps": 50,
823
- "max_steps": 2720,
824
  "num_input_tokens_seen": 0,
825
  "num_train_epochs": 10,
826
  "save_steps": 500,
@@ -831,12 +96,12 @@
831
  "should_evaluate": false,
832
  "should_log": false,
833
  "should_save": true,
834
- "should_training_stop": true
835
  },
836
  "attributes": {}
837
  }
838
  },
839
- "total_flos": 6.99559007609684e+17,
840
  "train_batch_size": 2,
841
  "trial_name": null,
842
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9972065305108946,
5
  "eval_steps": 50,
6
+ "global_step": 251,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.1986467192252778,
13
+ "grad_norm": 1.6958907842636108,
14
+ "learning_rate": 9.800796812749005e-05,
15
+ "loss": 1.6362,
16
  "step": 50
17
  },
18
  {
19
+ "epoch": 0.1986467192252778,
20
+ "eval_loss": 1.5567175149917603,
21
+ "eval_runtime": 14.8041,
22
+ "eval_samples_per_second": 57.281,
23
+ "eval_steps_per_second": 28.641,
24
  "step": 50
25
  },
26
  {
27
+ "epoch": 0.3972934384505556,
28
+ "grad_norm": 1.6160608530044556,
29
+ "learning_rate": 9.601593625498009e-05,
30
+ "loss": 1.5283,
31
  "step": 100
32
  },
33
  {
34
+ "epoch": 0.3972934384505556,
35
+ "eval_loss": 1.5002830028533936,
36
+ "eval_runtime": 14.7266,
37
+ "eval_samples_per_second": 57.583,
38
+ "eval_steps_per_second": 28.792,
39
  "step": 100
40
  },
41
  {
42
+ "epoch": 0.5959401576758334,
43
+ "grad_norm": 1.500954270362854,
44
+ "learning_rate": 9.402390438247013e-05,
45
+ "loss": 1.4825,
46
  "step": 150
47
  },
48
  {
49
+ "epoch": 0.5959401576758334,
50
+ "eval_loss": 1.4542008638381958,
51
+ "eval_runtime": 14.5669,
52
+ "eval_samples_per_second": 58.214,
53
+ "eval_steps_per_second": 29.107,
54
  "step": 150
55
  },
56
  {
57
+ "epoch": 0.7945868769011112,
58
+ "grad_norm": 0.8321912884712219,
59
+ "learning_rate": 9.203187250996016e-05,
60
+ "loss": 1.4431,
61
  "step": 200
62
  },
63
  {
64
+ "epoch": 0.7945868769011112,
65
+ "eval_loss": 1.4306951761245728,
66
+ "eval_runtime": 14.563,
67
+ "eval_samples_per_second": 58.23,
68
+ "eval_steps_per_second": 29.115,
69
  "step": 200
70
  },
71
  {
72
+ "epoch": 0.993233596126389,
73
+ "grad_norm": 1.2672511339187622,
74
+ "learning_rate": 9.00398406374502e-05,
75
+ "loss": 1.4083,
76
  "step": 250
77
  },
78
  {
79
+ "epoch": 0.993233596126389,
80
+ "eval_loss": 1.3854182958602905,
81
+ "eval_runtime": 14.6247,
82
+ "eval_samples_per_second": 57.984,
83
+ "eval_steps_per_second": 28.992,
84
  "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  }
86
  ],
87
  "logging_steps": 50,
88
+ "max_steps": 2510,
89
  "num_input_tokens_seen": 0,
90
  "num_train_epochs": 10,
91
  "save_steps": 500,
 
96
  "should_evaluate": false,
97
  "should_log": false,
98
  "should_save": true,
99
+ "should_training_stop": false
100
  },
101
  "attributes": {}
102
  }
103
  },
104
+ "total_flos": 4.004589805030605e+16,
105
  "train_batch_size": 2,
106
  "trial_name": null,
107
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba3ba4c9440deaddd402fb37b83a2039f6da0a755f6ddc398099f7184d4f6b4e
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60cd4fa81843b4806dea3364d37d3df9835095733d168d051c0b135e77b91aa
3
  size 5368