amos1088 commited on
Commit
27430f6
·
verified ·
1 Parent(s): 5768c59

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -28,10 +28,10 @@
28
  "down_proj",
29
  "gate_proj",
30
  "q_proj",
31
- "k_proj",
32
  "v_proj",
 
33
  "up_proj",
34
- "o_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
28
  "down_proj",
29
  "gate_proj",
30
  "q_proj",
 
31
  "v_proj",
32
+ "o_proj",
33
  "up_proj",
34
+ "k_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddc2f6d25f763b9de0ed5430306eb636ad7c54269a7b3b70c998dbb4d0242450
3
  size 35668592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:218ab56f7a51eb8b7fcd24a854a377779a21c223f5f3094f1c9e892485262041
3
  size 35668592
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d46dc4e7f4c38bd7d740c9f5afd10a56d0c7b90d973e3e7ceaf2a89f6ab3066a
3
- size 18257163
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:368e648350fa88e97ed24bf06f585bb7f7097580946a4a9480bef2318de437fd
3
+ size 18257035
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:532727a1ac4eb5b9846bd900afbac875d546089027ad66d97c611355ff543eb1
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f92bb13c8c261c5bbacd52e4713611a2458ef3c2d47986ab438b3233a082b5f
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20260be3fc45a3cfa8fd6a74639f50b3b33a87c97c47f472437044dfb3488bc9
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4393a84a3109995aa1202073b039b12062e3189ed89aa0b94ef0510ba843009
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02647c16a79c538141d09a2e5ec5135201f004952aab2cef2e8f97c0a0eb658e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2b5ed8d4c0db2e24674d7f125356981e2c73273d96a8f3eabaf284b99f24856
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,556 +2,106 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1805189921022941,
6
  "eval_steps": 500,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.003008649868371568,
14
- "grad_norm": 4.276698589324951,
15
- "learning_rate": 3.0060120240480964e-07,
16
- "loss": 480.0683,
17
  "mean_token_accuracy": 0.0,
18
- "num_tokens": 34261.0,
19
  "step": 10
20
  },
21
  {
22
- "epoch": 0.006017299736743136,
23
- "grad_norm": 4.8311052322387695,
24
- "learning_rate": 8.016032064128256e-07,
25
- "loss": 477.3868,
26
  "mean_token_accuracy": 0.0,
27
- "num_tokens": 69155.0,
28
  "step": 20
29
  },
30
  {
31
- "epoch": 0.009025949605114705,
32
- "grad_norm": 5.98472261428833,
33
- "learning_rate": 1.2525050100200402e-06,
34
- "loss": 1114.7921,
35
  "mean_token_accuracy": 0.0,
36
- "num_tokens": 103105.0,
37
  "step": 30
38
  },
39
  {
40
- "epoch": 0.012034599473486273,
41
- "grad_norm": 5.757265567779541,
42
- "learning_rate": 1.7535070140280561e-06,
43
- "loss": 143.3327,
44
  "mean_token_accuracy": 0.0,
45
- "num_tokens": 138287.0,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 0.01504324934185784,
50
- "grad_norm": 6.342145919799805,
51
- "learning_rate": 2.2545090180360722e-06,
52
- "loss": 1132.4002,
53
  "mean_token_accuracy": 0.0,
54
- "num_tokens": 173859.0,
55
  "step": 50
56
  },
57
  {
58
- "epoch": 0.01805189921022941,
59
- "grad_norm": 5.444029808044434,
60
- "learning_rate": 2.755511022044088e-06,
61
- "loss": 564.7068,
62
  "mean_token_accuracy": 0.0,
63
- "num_tokens": 208076.0,
64
  "step": 60
65
  },
66
  {
67
- "epoch": 0.021060549078600978,
68
- "grad_norm": 8.959345817565918,
69
- "learning_rate": 3.256513026052104e-06,
70
- "loss": 511.3984,
71
  "mean_token_accuracy": 0.0,
72
- "num_tokens": 242205.0,
73
  "step": 70
74
  },
75
  {
76
- "epoch": 0.024069198946972545,
77
- "grad_norm": 8.90196418762207,
78
- "learning_rate": 3.757515030060121e-06,
79
- "loss": 1757.1668,
80
  "mean_token_accuracy": 0.0,
81
- "num_tokens": 278838.0,
82
  "step": 80
83
  },
84
  {
85
- "epoch": 0.027077848815344113,
86
- "grad_norm": 10.707074165344238,
87
- "learning_rate": 4.258517034068137e-06,
88
- "loss": 409.6564,
89
  "mean_token_accuracy": 0.0,
90
- "num_tokens": 313816.0,
91
  "step": 90
92
  },
93
  {
94
- "epoch": 0.03008649868371568,
95
- "grad_norm": 11.27257251739502,
96
- "learning_rate": 4.759519038076152e-06,
97
- "loss": 476.8946,
98
  "mean_token_accuracy": 0.0,
99
- "num_tokens": 349435.0,
100
  "step": 100
101
- },
102
- {
103
- "epoch": 0.03309514855208725,
104
- "grad_norm": 10.067933082580566,
105
- "learning_rate": 5.2605210420841686e-06,
106
- "loss": 3136.1221,
107
- "mean_token_accuracy": 0.0,
108
- "num_tokens": 385097.0,
109
- "step": 110
110
- },
111
- {
112
- "epoch": 0.03610379842045882,
113
- "grad_norm": 12.381434440612793,
114
- "learning_rate": 5.761523046092185e-06,
115
- "loss": 152.1215,
116
- "mean_token_accuracy": 0.0,
117
- "num_tokens": 420134.0,
118
- "step": 120
119
- },
120
- {
121
- "epoch": 0.039112448288830384,
122
- "grad_norm": 16.296707153320312,
123
- "learning_rate": 6.2625250501002e-06,
124
- "loss": 544.0213,
125
- "mean_token_accuracy": 0.0,
126
- "num_tokens": 454034.0,
127
- "step": 130
128
- },
129
- {
130
- "epoch": 0.042121098157201955,
131
- "grad_norm": 24.59258270263672,
132
- "learning_rate": 6.763527054108217e-06,
133
- "loss": 548.5192,
134
- "mean_token_accuracy": 0.0,
135
- "num_tokens": 488139.0,
136
- "step": 140
137
- },
138
- {
139
- "epoch": 0.04512974802557353,
140
- "grad_norm": 23.74886131286621,
141
- "learning_rate": 7.264529058116233e-06,
142
- "loss": 346.1137,
143
- "mean_token_accuracy": 0.0,
144
- "num_tokens": 522244.0,
145
- "step": 150
146
- },
147
- {
148
- "epoch": 0.04813839789394509,
149
- "grad_norm": 20.021039962768555,
150
- "learning_rate": 7.765531062124248e-06,
151
- "loss": 166.4427,
152
- "mean_token_accuracy": 0.1875,
153
- "num_tokens": 556290.0,
154
- "step": 160
155
- },
156
- {
157
- "epoch": 0.05114704776231666,
158
- "grad_norm": 19.577783584594727,
159
- "learning_rate": 8.266533066132265e-06,
160
- "loss": 86.5497,
161
- "mean_token_accuracy": 0.525,
162
- "num_tokens": 588646.0,
163
- "step": 170
164
- },
165
- {
166
- "epoch": 0.054155697630688227,
167
- "grad_norm": 1.172348976135254,
168
- "learning_rate": 8.767535070140282e-06,
169
- "loss": 23.6608,
170
- "mean_token_accuracy": 0.6125,
171
- "num_tokens": 622280.0,
172
- "step": 180
173
- },
174
- {
175
- "epoch": 0.0571643474990598,
176
- "grad_norm": 0.18244539201259613,
177
- "learning_rate": 9.268537074148296e-06,
178
- "loss": 9.9449,
179
- "mean_token_accuracy": 0.6125,
180
- "num_tokens": 656869.0,
181
- "step": 190
182
- },
183
- {
184
- "epoch": 0.06017299736743136,
185
- "grad_norm": 0.10955705493688583,
186
- "learning_rate": 9.769539078156313e-06,
187
- "loss": 1.8071,
188
- "mean_token_accuracy": 0.575,
189
- "num_tokens": 691096.0,
190
- "step": 200
191
- },
192
- {
193
- "epoch": 0.06318164723580294,
194
- "grad_norm": 0.04723483696579933,
195
- "learning_rate": 1.027054108216433e-05,
196
- "loss": 0.0957,
197
- "mean_token_accuracy": 0.5875,
198
- "num_tokens": 726392.0,
199
- "step": 210
200
- },
201
- {
202
- "epoch": 0.0661902971041745,
203
- "grad_norm": 0.010282195173203945,
204
- "learning_rate": 1.0771543086172344e-05,
205
- "loss": 0.3321,
206
- "mean_token_accuracy": 0.55,
207
- "num_tokens": 761837.0,
208
- "step": 220
209
- },
210
- {
211
- "epoch": 0.06919894697254607,
212
- "grad_norm": 13.442811012268066,
213
- "learning_rate": 1.1272545090180361e-05,
214
- "loss": 1.0772,
215
- "mean_token_accuracy": 0.55,
216
- "num_tokens": 797225.0,
217
- "step": 230
218
- },
219
- {
220
- "epoch": 0.07220759684091764,
221
- "grad_norm": 0.011219559237360954,
222
- "learning_rate": 1.1773547094188378e-05,
223
- "loss": 0.0339,
224
- "mean_token_accuracy": 0.6125,
225
- "num_tokens": 831675.0,
226
- "step": 240
227
- },
228
- {
229
- "epoch": 0.07521624670928921,
230
- "grad_norm": 0.004255462437868118,
231
- "learning_rate": 1.2274549098196394e-05,
232
- "loss": 0.01,
233
- "mean_token_accuracy": 0.675,
234
- "num_tokens": 865688.0,
235
- "step": 250
236
- },
237
- {
238
- "epoch": 0.07822489657766077,
239
- "grad_norm": 0.017930058762431145,
240
- "learning_rate": 1.2775551102204408e-05,
241
- "loss": 0.0087,
242
- "mean_token_accuracy": 0.6,
243
- "num_tokens": 901141.0,
244
- "step": 260
245
- },
246
- {
247
- "epoch": 0.08123354644603234,
248
- "grad_norm": 0.0016301374416798353,
249
- "learning_rate": 1.3276553106212425e-05,
250
- "loss": 0.0208,
251
- "mean_token_accuracy": 0.6125,
252
- "num_tokens": 935822.0,
253
- "step": 270
254
- },
255
- {
256
- "epoch": 0.08424219631440391,
257
- "grad_norm": 0.008146238513290882,
258
- "learning_rate": 1.3777555110220442e-05,
259
- "loss": 0.0119,
260
- "mean_token_accuracy": 0.6375,
261
- "num_tokens": 970555.0,
262
- "step": 280
263
- },
264
- {
265
- "epoch": 0.08725084618277548,
266
- "grad_norm": 0.009405690245330334,
267
- "learning_rate": 1.4278557114228458e-05,
268
- "loss": 0.0484,
269
- "mean_token_accuracy": 0.5875,
270
- "num_tokens": 1005588.0,
271
- "step": 290
272
- },
273
- {
274
- "epoch": 0.09025949605114705,
275
- "grad_norm": 14.278578758239746,
276
- "learning_rate": 1.4779559118236475e-05,
277
- "loss": 0.027,
278
- "mean_token_accuracy": 0.5125,
279
- "num_tokens": 1040690.0,
280
- "step": 300
281
- },
282
- {
283
- "epoch": 0.09326814591951861,
284
- "grad_norm": 0.0022012211848050356,
285
- "learning_rate": 1.5280561122244487e-05,
286
- "loss": 0.0203,
287
- "mean_token_accuracy": 0.4875,
288
- "num_tokens": 1075623.0,
289
- "step": 310
290
- },
291
- {
292
- "epoch": 0.09627679578789018,
293
- "grad_norm": 0.004598209168761969,
294
- "learning_rate": 1.5781563126252504e-05,
295
- "loss": 0.0049,
296
- "mean_token_accuracy": 0.65,
297
- "num_tokens": 1110299.0,
298
- "step": 320
299
- },
300
- {
301
- "epoch": 0.09928544565626175,
302
- "grad_norm": 0.006529012229293585,
303
- "learning_rate": 1.628256513026052e-05,
304
- "loss": 0.0014,
305
- "mean_token_accuracy": 0.75,
306
- "num_tokens": 1143605.0,
307
- "step": 330
308
- },
309
- {
310
- "epoch": 0.10229409552463332,
311
- "grad_norm": 0.0015496546402573586,
312
- "learning_rate": 1.678356713426854e-05,
313
- "loss": 0.0088,
314
- "mean_token_accuracy": 0.6125,
315
- "num_tokens": 1178167.0,
316
- "step": 340
317
- },
318
- {
319
- "epoch": 0.1053027453930049,
320
- "grad_norm": 0.005931541323661804,
321
- "learning_rate": 1.7284569138276556e-05,
322
- "loss": 1.3814,
323
- "mean_token_accuracy": 0.6125,
324
- "num_tokens": 1212839.0,
325
- "step": 350
326
- },
327
- {
328
- "epoch": 0.10831139526137645,
329
- "grad_norm": 0.0023496279027312994,
330
- "learning_rate": 1.7785571142284573e-05,
331
- "loss": 0.0054,
332
- "mean_token_accuracy": 0.6375,
333
- "num_tokens": 1246660.0,
334
- "step": 360
335
- },
336
- {
337
- "epoch": 0.11132004512974802,
338
- "grad_norm": 0.001145790098235011,
339
- "learning_rate": 1.8286573146292587e-05,
340
- "loss": 0.0764,
341
- "mean_token_accuracy": 0.5375,
342
- "num_tokens": 1281889.0,
343
- "step": 370
344
- },
345
- {
346
- "epoch": 0.1143286949981196,
347
- "grad_norm": 0.0035560056567192078,
348
- "learning_rate": 1.87875751503006e-05,
349
- "loss": 0.0156,
350
- "mean_token_accuracy": 0.5375,
351
- "num_tokens": 1317575.0,
352
- "step": 380
353
- },
354
- {
355
- "epoch": 0.11733734486649117,
356
- "grad_norm": 0.0020822687074542046,
357
- "learning_rate": 1.9288577154308618e-05,
358
- "loss": 0.0921,
359
- "mean_token_accuracy": 0.6625,
360
- "num_tokens": 1350936.0,
361
- "step": 390
362
- },
363
- {
364
- "epoch": 0.12034599473486272,
365
- "grad_norm": 0.0026227079797536135,
366
- "learning_rate": 1.9789579158316635e-05,
367
- "loss": 0.0013,
368
- "mean_token_accuracy": 0.625,
369
- "num_tokens": 1385961.0,
370
- "step": 400
371
- },
372
- {
373
- "epoch": 0.1233546446032343,
374
- "grad_norm": 0.06864658743143082,
375
- "learning_rate": 2.0290581162324652e-05,
376
- "loss": 0.0724,
377
- "mean_token_accuracy": 0.55,
378
- "num_tokens": 1421542.0,
379
- "step": 410
380
- },
381
- {
382
- "epoch": 0.12636329447160588,
383
- "grad_norm": 0.0036715222522616386,
384
- "learning_rate": 2.079158316633267e-05,
385
- "loss": 0.007,
386
- "mean_token_accuracy": 0.55,
387
- "num_tokens": 1457078.0,
388
- "step": 420
389
- },
390
- {
391
- "epoch": 0.12937194433997742,
392
- "grad_norm": 0.00313239055685699,
393
- "learning_rate": 2.1292585170340683e-05,
394
- "loss": 0.001,
395
- "mean_token_accuracy": 0.6375,
396
- "num_tokens": 1490852.0,
397
- "step": 430
398
- },
399
- {
400
- "epoch": 0.132380594208349,
401
- "grad_norm": 0.0015605625230818987,
402
- "learning_rate": 2.1793587174348697e-05,
403
- "loss": 0.2283,
404
- "mean_token_accuracy": 0.5125,
405
- "num_tokens": 1525241.0,
406
- "step": 440
407
- },
408
- {
409
- "epoch": 0.13538924407672057,
410
- "grad_norm": 0.0009020116995088756,
411
- "learning_rate": 2.2294589178356714e-05,
412
- "loss": 0.0017,
413
- "mean_token_accuracy": 0.5875,
414
- "num_tokens": 1561253.0,
415
- "step": 450
416
- },
417
- {
418
- "epoch": 0.13839789394509214,
419
- "grad_norm": 0.0005520946579053998,
420
- "learning_rate": 2.279559118236473e-05,
421
- "loss": 0.0005,
422
- "mean_token_accuracy": 0.6375,
423
- "num_tokens": 1596990.0,
424
- "step": 460
425
- },
426
- {
427
- "epoch": 0.1414065438134637,
428
- "grad_norm": 0.0006775453221052885,
429
- "learning_rate": 2.3296593186372748e-05,
430
- "loss": 0.0085,
431
- "mean_token_accuracy": 0.5,
432
- "num_tokens": 1633419.0,
433
- "step": 470
434
- },
435
- {
436
- "epoch": 0.14441519368183528,
437
- "grad_norm": 0.0002734291192609817,
438
- "learning_rate": 2.3797595190380762e-05,
439
- "loss": 0.0014,
440
- "mean_token_accuracy": 0.575,
441
- "num_tokens": 1668732.0,
442
- "step": 480
443
- },
444
- {
445
- "epoch": 0.14742384355020685,
446
- "grad_norm": 0.0007396186119876802,
447
- "learning_rate": 2.429859719438878e-05,
448
- "loss": 0.001,
449
- "mean_token_accuracy": 0.6375,
450
- "num_tokens": 1703568.0,
451
- "step": 490
452
- },
453
- {
454
- "epoch": 0.15043249341857842,
455
- "grad_norm": 0.0009952335385605693,
456
- "learning_rate": 2.4799599198396793e-05,
457
- "loss": 0.0017,
458
- "mean_token_accuracy": 0.7375,
459
- "num_tokens": 1736202.0,
460
- "step": 500
461
- },
462
- {
463
- "epoch": 0.15344114328695,
464
- "grad_norm": 0.0044481828808784485,
465
- "learning_rate": 2.530060120240481e-05,
466
- "loss": 0.0004,
467
- "mean_token_accuracy": 0.6,
468
- "num_tokens": 1770561.0,
469
- "step": 510
470
- },
471
- {
472
- "epoch": 0.15644979315532154,
473
- "grad_norm": 0.0005716494051739573,
474
- "learning_rate": 2.5801603206412827e-05,
475
- "loss": 0.0005,
476
- "mean_token_accuracy": 0.625,
477
- "num_tokens": 1804749.0,
478
- "step": 520
479
- },
480
- {
481
- "epoch": 0.1594584430236931,
482
- "grad_norm": 0.00020559463882818818,
483
- "learning_rate": 2.6302605210420845e-05,
484
- "loss": 0.002,
485
- "mean_token_accuracy": 0.5375,
486
- "num_tokens": 1839027.0,
487
- "step": 530
488
- },
489
- {
490
- "epoch": 0.16246709289206468,
491
- "grad_norm": 0.000684644328430295,
492
- "learning_rate": 2.6803607214428862e-05,
493
- "loss": 0.0012,
494
- "mean_token_accuracy": 0.6375,
495
- "num_tokens": 1872641.0,
496
- "step": 540
497
- },
498
- {
499
- "epoch": 0.16547574276043625,
500
- "grad_norm": 0.0008667957736179233,
501
- "learning_rate": 2.730460921843688e-05,
502
- "loss": 0.0011,
503
- "mean_token_accuracy": 0.5375,
504
- "num_tokens": 1908397.0,
505
- "step": 550
506
- },
507
- {
508
- "epoch": 0.16848439262880782,
509
- "grad_norm": 0.0003756976220756769,
510
- "learning_rate": 2.780561122244489e-05,
511
- "loss": 0.0087,
512
- "mean_token_accuracy": 0.675,
513
- "num_tokens": 1942186.0,
514
- "step": 560
515
- },
516
- {
517
- "epoch": 0.1714930424971794,
518
- "grad_norm": 0.0013846313813701272,
519
- "learning_rate": 2.8306613226452906e-05,
520
- "loss": 0.0006,
521
- "mean_token_accuracy": 0.625,
522
- "num_tokens": 1976984.0,
523
- "step": 570
524
- },
525
- {
526
- "epoch": 0.17450169236555096,
527
- "grad_norm": 0.0010639706160873175,
528
- "learning_rate": 2.880761523046092e-05,
529
- "loss": 0.0003,
530
- "mean_token_accuracy": 0.6375,
531
- "num_tokens": 2011487.0,
532
- "step": 580
533
- },
534
- {
535
- "epoch": 0.17751034223392254,
536
- "grad_norm": 0.002905157394707203,
537
- "learning_rate": 2.9308617234468937e-05,
538
- "loss": 0.0004,
539
- "mean_token_accuracy": 0.5625,
540
- "num_tokens": 2047066.0,
541
- "step": 590
542
- },
543
- {
544
- "epoch": 0.1805189921022941,
545
- "grad_norm": 0.0001582392433192581,
546
- "learning_rate": 2.9809619238476955e-05,
547
- "loss": 0.0004,
548
- "mean_token_accuracy": 0.6875,
549
- "num_tokens": 2081518.0,
550
- "step": 600
551
  }
552
  ],
553
  "logging_steps": 10,
554
- "max_steps": 9972,
555
  "num_input_tokens_seen": 0,
556
  "num_train_epochs": 3,
557
  "save_steps": 100,
@@ -567,7 +117,7 @@
567
  "attributes": {}
568
  }
569
  },
570
- "total_flos": 4.660300459087872e+16,
571
  "train_batch_size": 1,
572
  "trial_name": null,
573
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.020057665789143787,
6
  "eval_steps": 500,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0020057665789143786,
14
+ "grad_norm": 0.0,
15
+ "learning_rate": 3.0080213903743316e-07,
16
+ "loss": 0.0,
17
  "mean_token_accuracy": 0.0,
18
+ "num_tokens": 36673.0,
19
  "step": 10
20
  },
21
  {
22
+ "epoch": 0.004011533157828757,
23
+ "grad_norm": 0.0,
24
+ "learning_rate": 6.350267379679145e-07,
25
+ "loss": 0.0,
26
  "mean_token_accuracy": 0.0,
27
+ "num_tokens": 73344.0,
28
  "step": 20
29
  },
30
  {
31
+ "epoch": 0.006017299736743136,
32
+ "grad_norm": 0.0,
33
+ "learning_rate": 9.692513368983958e-07,
34
+ "loss": 0.0,
35
  "mean_token_accuracy": 0.0,
36
+ "num_tokens": 110756.0,
37
  "step": 30
38
  },
39
  {
40
+ "epoch": 0.008023066315657515,
41
+ "grad_norm": 0.0,
42
+ "learning_rate": 1.3034759358288772e-06,
43
+ "loss": 0.0,
44
  "mean_token_accuracy": 0.0,
45
+ "num_tokens": 147094.0,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 0.010028832894571894,
50
+ "grad_norm": 0.0,
51
+ "learning_rate": 1.6377005347593583e-06,
52
+ "loss": 0.0,
53
  "mean_token_accuracy": 0.0,
54
+ "num_tokens": 184910.0,
55
  "step": 50
56
  },
57
  {
58
+ "epoch": 0.012034599473486273,
59
+ "grad_norm": 0.0,
60
+ "learning_rate": 1.9719251336898394e-06,
61
+ "loss": 0.0,
62
  "mean_token_accuracy": 0.0,
63
+ "num_tokens": 222083.0,
64
  "step": 60
65
  },
66
  {
67
+ "epoch": 0.014040366052400652,
68
+ "grad_norm": 0.0,
69
+ "learning_rate": 2.306149732620321e-06,
70
+ "loss": 0.0,
71
  "mean_token_accuracy": 0.0,
72
+ "num_tokens": 259469.0,
73
  "step": 70
74
  },
75
  {
76
+ "epoch": 0.01604613263131503,
77
+ "grad_norm": 0.0,
78
+ "learning_rate": 2.6403743315508026e-06,
79
+ "loss": 0.0,
80
  "mean_token_accuracy": 0.0,
81
+ "num_tokens": 296196.0,
82
  "step": 80
83
  },
84
  {
85
+ "epoch": 0.01805189921022941,
86
+ "grad_norm": 0.0,
87
+ "learning_rate": 2.9745989304812837e-06,
88
+ "loss": 0.0,
89
  "mean_token_accuracy": 0.0,
90
+ "num_tokens": 333651.0,
91
  "step": 90
92
  },
93
  {
94
+ "epoch": 0.020057665789143787,
95
+ "grad_norm": 0.0,
96
+ "learning_rate": 3.308823529411765e-06,
97
+ "loss": 0.0,
98
  "mean_token_accuracy": 0.0,
99
+ "num_tokens": 371196.0,
100
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  }
102
  ],
103
  "logging_steps": 10,
104
+ "max_steps": 14958,
105
  "num_input_tokens_seen": 0,
106
  "num_train_epochs": 3,
107
  "save_steps": 100,
 
117
  "attributes": {}
118
  }
119
  },
120
+ "total_flos": 8310689070243840.0,
121
  "train_batch_size": 1,
122
  "trial_name": null,
123
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ded845851c4b5597a29e266e94059b4ee139b0a361d4b536a00f62e6b055230
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c671635e3fdfb7dbe9bdceef520a026df917c27ba861bca94f3b3be5f375546f
3
  size 6225