finalform committed on
Commit
c09ce8b
·
verified ·
1 Parent(s): cfb75ed

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "k_proj",
29
- "down_proj",
30
- "o_proj",
31
  "q_proj",
32
  "v_proj",
 
 
33
  "up_proj",
34
- "gate_proj"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
 
 
28
  "q_proj",
29
  "v_proj",
30
+ "o_proj",
31
+ "down_proj",
32
  "up_proj",
33
+ "gate_proj",
34
+ "k_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a0c1f01af04ec0d51c229f63e9191c2baab2c2e7a1ad6795ccffe749aae29ff
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c6578e135fe1ebeda7645f7528d4a5264cb6fe59bbad0e296050b928987a8c6
3
  size 645975704
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:85a21d03c2ea98d53c52fd96d7e7982e7d6b7827185339dabc6a151f19b15814
3
  size 1292087499
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb772037ca81bcf148d2e2c2f1836a59149e28bee20036962419895038fcf9fa
3
  size 1292087499
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:994c2cba2555eef301d8087ae1484ed0e7252f44df4637c6e9af3389b996ceee
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437694cf80dc70be6a53ad52dce7f6e7f66c496ccc9712033fd53a04b6022a0e
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b1736ec6627ebf927133b64702a5b6824ab5d43b5017e4277694c355a4f042e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4361ddfd3652d9646a72d7e3e69463d6582550af453e0c12dedcaaac34a5d817
3
  size 1465
trainer_state.json CHANGED
@@ -11,473 +11,473 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
- "grad_norm": 0.292353093624115,
15
- "learning_rate": 0.00013971818181818181,
16
- "loss": 1.8075,
17
- "mean_token_accuracy": 0.6316984993219376,
18
- "num_tokens": 155458.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
- "grad_norm": 0.23519554734230042,
24
- "learning_rate": 0.00028525795454545453,
25
- "loss": 0.8594,
26
- "mean_token_accuracy": 0.7813338875770569,
27
- "num_tokens": 280957.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
- "grad_norm": 0.17745567858219147,
33
- "learning_rate": 0.0004307977272727273,
34
- "loss": 0.6189,
35
- "mean_token_accuracy": 0.8301489073038101,
36
- "num_tokens": 439583.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
- "grad_norm": 0.28904563188552856,
42
- "learning_rate": 0.0005122807260672283,
43
- "loss": 0.514,
44
- "mean_token_accuracy": 0.8567226785421371,
45
- "num_tokens": 566372.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
- "grad_norm": 0.18326736986637115,
51
- "learning_rate": 0.0005120935869832972,
52
- "loss": 0.3703,
53
- "mean_token_accuracy": 0.8942542725801468,
54
- "num_tokens": 721871.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
- "grad_norm": 0.22670747339725494,
60
- "learning_rate": 0.0005117075078651932,
61
- "loss": 0.3122,
62
- "mean_token_accuracy": 0.9121941888332367,
63
- "num_tokens": 848123.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
- "grad_norm": 0.1902594268321991,
69
- "learning_rate": 0.0005111227888047993,
70
- "loss": 0.2411,
71
- "mean_token_accuracy": 0.9314639317989349,
72
- "num_tokens": 1005664.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
- "grad_norm": 0.29753610491752625,
78
- "learning_rate": 0.0005103398842930102,
79
- "loss": 0.2266,
80
- "mean_token_accuracy": 0.9340476477146149,
81
- "num_tokens": 1132284.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
- "grad_norm": 0.1415078341960907,
87
- "learning_rate": 0.0005093594028664655,
88
- "loss": 0.1822,
89
- "mean_token_accuracy": 0.9487657606601715,
90
- "num_tokens": 1290915.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
- "grad_norm": 0.19810789823532104,
96
- "learning_rate": 0.0005081821066345455,
97
- "loss": 0.1458,
98
- "mean_token_accuracy": 0.9581668329238892,
99
- "num_tokens": 1418595.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
- "grad_norm": 0.1245037168264389,
105
- "learning_rate": 0.0005068089106869988,
106
- "loss": 0.1361,
107
- "mean_token_accuracy": 0.9611250156164169,
108
- "num_tokens": 1576137.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
- "grad_norm": 0.18405039608478546,
114
- "learning_rate": 0.0005052408823826598,
115
- "loss": 0.1393,
116
- "mean_token_accuracy": 0.9614962357282638,
117
- "num_tokens": 1701485.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
- "grad_norm": 0.18599726259708405,
123
- "learning_rate": 0.000503479240519812,
124
- "loss": 0.1137,
125
- "mean_token_accuracy": 0.9682038247585296,
126
- "num_tokens": 1859818.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
- "grad_norm": 0.1856354922056198,
132
- "learning_rate": 0.0005015253543888389,
133
- "loss": 0.0891,
134
- "mean_token_accuracy": 0.9745866447687149,
135
- "num_tokens": 1987015.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
- "grad_norm": 0.10939127951860428,
141
- "learning_rate": 0.0004993807427079012,
142
- "loss": 0.1001,
143
- "mean_token_accuracy": 0.9714098435640335,
144
- "num_tokens": 2146234.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
- "grad_norm": 0.3307282030582428,
150
- "learning_rate": 0.0004970470724424662,
151
- "loss": 0.0884,
152
- "mean_token_accuracy": 0.9754749721288681,
153
- "num_tokens": 2273585.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
- "eval_loss": 0.08956408500671387,
159
- "eval_mean_token_accuracy": 0.9750892632716411,
160
  "eval_num_tokens": 2354180.0,
161
- "eval_runtime": 16.024,
162
- "eval_samples_per_second": 23.028,
163
- "eval_steps_per_second": 11.545,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
- "grad_norm": 0.16935159265995026,
169
- "learning_rate": 0.0004945261575096078,
170
- "loss": 0.101,
171
- "mean_token_accuracy": 0.9729157378993083,
172
- "num_tokens": 2425025.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
- "grad_norm": 0.12800893187522888,
178
- "learning_rate": 0.0004918199573680834,
179
- "loss": 0.0615,
180
- "mean_token_accuracy": 0.9824073499441147,
181
- "num_tokens": 2568833.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
- "grad_norm": 0.10300774872303009,
187
- "learning_rate": 0.0004889305754952839,
188
- "loss": 0.0805,
189
- "mean_token_accuracy": 0.9773273587226867,
190
- "num_tokens": 2710895.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
- "grad_norm": 0.11092197895050049,
196
- "learning_rate": 0.0004858602577522418,
197
- "loss": 0.0588,
198
- "mean_token_accuracy": 0.9833286923170089,
199
- "num_tokens": 2853318.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
- "grad_norm": 0.0948944017291069,
205
- "learning_rate": 0.0004826113906379664,
206
- "loss": 0.0838,
207
- "mean_token_accuracy": 0.9770084321498871,
208
- "num_tokens": 2994882.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
- "grad_norm": 0.12024246156215668,
214
- "learning_rate": 0.00047918649943446345,
215
- "loss": 0.0572,
216
- "mean_token_accuracy": 0.9838370496034622,
217
- "num_tokens": 3137930.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
- "grad_norm": 0.12121743708848953,
223
- "learning_rate": 0.0004755882462438826,
224
- "loss": 0.0611,
225
- "mean_token_accuracy": 0.9828894352912902,
226
- "num_tokens": 3279894.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
- "grad_norm": 0.19103674590587616,
232
- "learning_rate": 0.000471819427919316,
233
- "loss": 0.0455,
234
- "mean_token_accuracy": 0.9865969383716583,
235
- "num_tokens": 3420462.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
- "grad_norm": 0.06473100930452347,
241
- "learning_rate": 0.0004678829738908584,
242
- "loss": 0.0647,
243
- "mean_token_accuracy": 0.9815235859155655,
244
- "num_tokens": 3561941.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
- "grad_norm": 0.09202416986227036,
250
- "learning_rate": 0.0004637819438886175,
251
- "loss": 0.0517,
252
- "mean_token_accuracy": 0.9851528346538544,
253
- "num_tokens": 3703510.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
- "grad_norm": 0.08362529426813126,
259
- "learning_rate": 0.00045951952556444426,
260
- "loss": 0.0642,
261
- "mean_token_accuracy": 0.9822886544466018,
262
- "num_tokens": 3842063.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
- "grad_norm": 0.0631742924451828,
268
- "learning_rate": 0.0004550990320142324,
269
- "loss": 0.0441,
270
- "mean_token_accuracy": 0.9875086861848831,
271
- "num_tokens": 3984506.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
- "grad_norm": 0.07872219383716583,
277
- "learning_rate": 0.00045052389920271276,
278
- "loss": 0.0569,
279
- "mean_token_accuracy": 0.9842114639282227,
280
- "num_tokens": 4127213.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
- "grad_norm": 0.08871777355670929,
286
- "learning_rate": 0.0004457976832927436,
287
- "loss": 0.0437,
288
- "mean_token_accuracy": 0.9873430663347245,
289
- "num_tokens": 4270185.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
- "grad_norm": 0.08713535219430923,
295
- "learning_rate": 0.00044092405788117396,
296
- "loss": 0.0583,
297
- "mean_token_accuracy": 0.9836823076009751,
298
- "num_tokens": 4412354.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
- "grad_norm": 0.10155721753835678,
304
- "learning_rate": 0.00043590681114342696,
305
- "loss": 0.0404,
306
- "mean_token_accuracy": 0.9879835307598114,
307
- "num_tokens": 4556520.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
- "grad_norm": 0.0794239267706871,
313
- "learning_rate": 0.0004307498428890239,
314
- "loss": 0.045,
315
- "mean_token_accuracy": 0.9872637808322906,
316
- "num_tokens": 4688903.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
- "eval_loss": 0.05410139262676239,
322
- "eval_mean_token_accuracy": 0.9851650663324305,
323
  "eval_num_tokens": 4708360.0,
324
- "eval_runtime": 16.0082,
325
- "eval_samples_per_second": 23.051,
326
- "eval_steps_per_second": 11.557,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
- "grad_norm": 0.10252567380666733,
332
- "learning_rate": 0.00042545716153033746,
333
- "loss": 0.0495,
334
- "mean_token_accuracy": 0.9853065284257082,
335
- "num_tokens": 4838468.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
- "grad_norm": 0.04914547875523567,
341
- "learning_rate": 0.0004200328809669296,
342
- "loss": 0.0313,
343
- "mean_token_accuracy": 0.9909292554855347,
344
- "num_tokens": 4972061.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
- "grad_norm": 0.06335192173719406,
350
- "learning_rate": 0.00041448121738789633,
351
- "loss": 0.0449,
352
- "mean_token_accuracy": 0.9870324164628983,
353
- "num_tokens": 5123609.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
- "grad_norm": 0.09024782478809357,
359
- "learning_rate": 0.0004088064859947051,
360
- "loss": 0.0336,
361
- "mean_token_accuracy": 0.9899903804063797,
362
- "num_tokens": 5255900.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
- "grad_norm": 0.06487799435853958,
368
- "learning_rate": 0.0004030130976470715,
369
- "loss": 0.0471,
370
- "mean_token_accuracy": 0.9861943638324737,
371
- "num_tokens": 5408377.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
- "grad_norm": 0.03881136327981949,
377
- "learning_rate": 0.00039710555543448267,
378
- "loss": 0.033,
379
- "mean_token_accuracy": 0.9898175239562989,
380
- "num_tokens": 5540979.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
- "grad_norm": 0.05819237604737282,
386
- "learning_rate": 0.0003910884511760325,
387
- "loss": 0.0428,
388
- "mean_token_accuracy": 0.9870308661460876,
389
- "num_tokens": 5693030.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
- "grad_norm": 0.06926289945840836,
395
- "learning_rate": 0.00038496646185128854,
396
- "loss": 0.0288,
397
- "mean_token_accuracy": 0.9914705574512481,
398
- "num_tokens": 5827027.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
- "grad_norm": 0.09394887089729309,
404
- "learning_rate": 0.000378744345964966,
405
- "loss": 0.0439,
406
- "mean_token_accuracy": 0.9866676324605942,
407
- "num_tokens": 5975797.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
- "grad_norm": 0.07537297159433365,
413
- "learning_rate": 0.0003724269398482333,
414
- "loss": 0.0316,
415
- "mean_token_accuracy": 0.9907770365476608,
416
- "num_tokens": 6107036.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
- "grad_norm": 0.044363752007484436,
422
- "learning_rate": 0.00036601915389952434,
423
- "loss": 0.046,
424
- "mean_token_accuracy": 0.9861960715055466,
425
- "num_tokens": 6258873.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
- "grad_norm": 0.084761843085289,
431
- "learning_rate": 0.00035952596876778076,
432
- "loss": 0.031,
433
- "mean_token_accuracy": 0.9905279046297073,
434
- "num_tokens": 6392411.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
- "grad_norm": 0.05843805894255638,
440
- "learning_rate": 0.00035295243148108894,
441
- "loss": 0.0441,
442
- "mean_token_accuracy": 0.9872340881824493,
443
- "num_tokens": 6542051.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
- "grad_norm": 0.05149897560477257,
449
- "learning_rate": 0.00034630365152372165,
450
- "loss": 0.0286,
451
- "mean_token_accuracy": 0.9911447340250015,
452
- "num_tokens": 6674951.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
- "grad_norm": 0.04212498292326927,
458
- "learning_rate": 0.00033958479686463464,
459
- "loss": 0.042,
460
- "mean_token_accuracy": 0.9873944985866546,
461
- "num_tokens": 6827063.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
- "grad_norm": 0.02952578291296959,
467
- "learning_rate": 0.00033280108994050315,
468
- "loss": 0.0288,
469
- "mean_token_accuracy": 0.9914515954256058,
470
- "num_tokens": 6960300.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
- "eval_loss": 0.045313552021980286,
476
- "eval_mean_token_accuracy": 0.9875944820610253,
477
  "eval_num_tokens": 7062540.0,
478
- "eval_runtime": 15.9787,
479
- "eval_samples_per_second": 23.093,
480
- "eval_steps_per_second": 11.578,
481
  "step": 1245
482
  }
483
  ],
@@ -498,7 +498,7 @@
498
  "attributes": {}
499
  }
500
  },
501
- "total_flos": 3.0685210928193024e+17,
502
  "train_batch_size": 2,
503
  "trial_name": null,
504
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.060350030175015085,
14
+ "grad_norm": 0.2729557752609253,
15
+ "learning_rate": 0.00013936363636363633,
16
+ "loss": 1.8006,
17
+ "mean_token_accuracy": 0.6339444428682327,
18
+ "num_tokens": 157350.0,
19
  "step": 25
20
  },
21
  {
22
  "epoch": 0.12070006035003017,
23
+ "grad_norm": 0.3110716938972473,
24
+ "learning_rate": 0.0002845340909090909,
25
+ "loss": 0.8486,
26
+ "mean_token_accuracy": 0.78412382543087,
27
+ "num_tokens": 281749.0,
28
  "step": 50
29
  },
30
  {
31
  "epoch": 0.18105009052504525,
32
+ "grad_norm": 0.1929408758878708,
33
+ "learning_rate": 0.00042970454545454545,
34
+ "loss": 0.6116,
35
+ "mean_token_accuracy": 0.8317048019170761,
36
+ "num_tokens": 439677.0,
37
  "step": 75
38
  },
39
  {
40
  "epoch": 0.24140012070006034,
41
+ "grad_norm": 0.2782052755355835,
42
+ "learning_rate": 0.0005109807749762905,
43
+ "loss": 0.489,
44
+ "mean_token_accuracy": 0.8621344155073166,
45
+ "num_tokens": 566627.0,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.30175015087507545,
50
+ "grad_norm": 0.188531756401062,
51
+ "learning_rate": 0.00051079411077194,
52
+ "loss": 0.3545,
53
+ "mean_token_accuracy": 0.8989184832572937,
54
+ "num_tokens": 724945.0,
55
  "step": 125
56
  },
57
  {
58
  "epoch": 0.3621001810500905,
59
+ "grad_norm": 0.2349298596382141,
60
+ "learning_rate": 0.0005104090113588009,
61
+ "loss": 0.325,
62
+ "mean_token_accuracy": 0.9072138226032257,
63
+ "num_tokens": 851980.0,
64
  "step": 150
65
  },
66
  {
67
  "epoch": 0.4224502112251056,
68
+ "grad_norm": 0.18686626851558685,
69
+ "learning_rate": 0.0005098257760672504,
70
+ "loss": 0.2363,
71
+ "mean_token_accuracy": 0.9327792012691498,
72
+ "num_tokens": 1007684.0,
73
  "step": 175
74
  },
75
  {
76
  "epoch": 0.4828002414001207,
77
+ "grad_norm": 0.31283071637153625,
78
+ "learning_rate": 0.0005090448582348783,
79
+ "loss": 0.2257,
80
+ "mean_token_accuracy": 0.9364256656169891,
81
+ "num_tokens": 1134676.0,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.5431502715751357,
86
+ "grad_norm": 0.2003205120563507,
87
+ "learning_rate": 0.0005080668648541163,
88
+ "loss": 0.1798,
89
+ "mean_token_accuracy": 0.9486303454637528,
90
+ "num_tokens": 1288843.0,
91
  "step": 225
92
  },
93
  {
94
  "epoch": 0.6035003017501509,
95
+ "grad_norm": 0.24583138525485992,
96
+ "learning_rate": 0.0005068925561004347,
97
+ "loss": 0.1614,
98
+ "mean_token_accuracy": 0.9540205806493759,
99
+ "num_tokens": 1413587.0,
100
  "step": 250
101
  },
102
  {
103
  "epoch": 0.663850331925166,
104
+ "grad_norm": 0.2094200849533081,
105
+ "learning_rate": 0.0005055228447414724,
106
+ "loss": 0.1345,
107
+ "mean_token_accuracy": 0.9615936678647995,
108
+ "num_tokens": 1571290.0,
109
  "step": 275
110
  },
111
  {
112
  "epoch": 0.724200362100181,
113
+ "grad_norm": 0.20863880217075348,
114
+ "learning_rate": 0.0005039587954275602,
115
+ "loss": 0.1209,
116
+ "mean_token_accuracy": 0.9658558475971222,
117
+ "num_tokens": 1697243.0,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.7845503922751962,
122
+ "grad_norm": 0.12524789571762085,
123
+ "learning_rate": 0.0005022016238641887,
124
+ "loss": 0.1104,
125
+ "mean_token_accuracy": 0.969045399427414,
126
+ "num_tokens": 1855662.0,
127
  "step": 325
128
  },
129
  {
130
  "epoch": 0.8449004224502112,
131
+ "grad_norm": 0.17329099774360657,
132
+ "learning_rate": 0.0005002526958670635,
133
+ "loss": 0.0871,
134
+ "mean_token_accuracy": 0.9757317280769349,
135
+ "num_tokens": 1983678.0,
136
  "step": 350
137
  },
138
  {
139
  "epoch": 0.9052504526252263,
140
+ "grad_norm": 0.07487187534570694,
141
+ "learning_rate": 0.000498113526300483,
142
+ "loss": 0.1073,
143
+ "mean_token_accuracy": 0.9719128596782685,
144
+ "num_tokens": 2142312.0,
145
  "step": 375
146
  },
147
  {
148
  "epoch": 0.9656004828002414,
149
+ "grad_norm": 0.18816682696342468,
150
+ "learning_rate": 0.0004957857778998638,
151
+ "loss": 0.0837,
152
+ "mean_token_accuracy": 0.9764218652248382,
153
+ "num_tokens": 2270799.0,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 1.0,
158
+ "eval_loss": 0.08532869815826416,
159
+ "eval_mean_token_accuracy": 0.9769232859482636,
160
  "eval_num_tokens": 2354180.0,
161
+ "eval_runtime": 15.9895,
162
+ "eval_samples_per_second": 23.078,
163
+ "eval_steps_per_second": 11.57,
164
  "step": 415
165
  },
166
  {
167
  "epoch": 1.024140012070006,
168
+ "grad_norm": 0.09524519741535187,
169
+ "learning_rate": 0.0004932712599793277,
170
+ "loss": 0.0927,
171
+ "mean_token_accuracy": 0.9739998161178274,
172
+ "num_tokens": 2423160.0,
173
  "step": 425
174
  },
175
  {
176
  "epoch": 1.0844900422450212,
177
+ "grad_norm": 0.09827826172113419,
178
+ "learning_rate": 0.0004905719270253573,
179
+ "loss": 0.0573,
180
+ "mean_token_accuracy": 0.9832522106170655,
181
+ "num_tokens": 2565487.0,
182
  "step": 450
183
  },
184
  {
185
  "epoch": 1.1448400724200363,
186
+ "grad_norm": 0.12874829769134521,
187
+ "learning_rate": 0.0004876898771776108,
188
+ "loss": 0.0797,
189
+ "mean_token_accuracy": 0.9777371490001678,
190
+ "num_tokens": 2706106.0,
191
  "step": 475
192
  },
193
  {
194
  "epoch": 1.2051901025950513,
195
+ "grad_norm": 0.09082050621509552,
196
+ "learning_rate": 0.00048462735059807835,
197
+ "loss": 0.053,
198
+ "mean_token_accuracy": 0.9846914553642273,
199
+ "num_tokens": 2848007.0,
200
  "step": 500
201
  },
202
  {
203
  "epoch": 1.2655401327700664,
204
+ "grad_norm": 0.0940171331167221,
205
+ "learning_rate": 0.00048138672772984735,
206
+ "loss": 0.072,
207
+ "mean_token_accuracy": 0.980380329489708,
208
+ "num_tokens": 2990068.0,
209
  "step": 525
210
  },
211
  {
212
  "epoch": 1.3258901629450814,
213
+ "grad_norm": 0.11513015627861023,
214
+ "learning_rate": 0.00047797052744682957,
215
+ "loss": 0.0522,
216
+ "mean_token_accuracy": 0.9850564336776734,
217
+ "num_tokens": 3131402.0,
218
  "step": 550
219
  },
220
  {
221
  "epoch": 1.3862401931200965,
222
+ "grad_norm": 0.0951244980096817,
223
+ "learning_rate": 0.0004743814050958891,
224
+ "loss": 0.0651,
225
+ "mean_token_accuracy": 0.9819402652978897,
226
+ "num_tokens": 3272231.0,
227
  "step": 575
228
  },
229
  {
230
  "epoch": 1.4465902232951118,
231
+ "grad_norm": 0.0790608748793602,
232
+ "learning_rate": 0.00047062215043289175,
233
+ "loss": 0.0496,
234
+ "mean_token_accuracy": 0.9858078044652939,
235
+ "num_tokens": 3415731.0,
236
  "step": 600
237
  },
238
  {
239
  "epoch": 1.5069402534701268,
240
+ "grad_norm": 0.09390847384929657,
241
+ "learning_rate": 0.00046669568545428187,
242
+ "loss": 0.0624,
243
+ "mean_token_accuracy": 0.9824297106266022,
244
+ "num_tokens": 3558206.0,
245
  "step": 625
246
  },
247
  {
248
  "epoch": 1.567290283645142,
249
+ "grad_norm": 0.10085475444793701,
250
+ "learning_rate": 0.00046260506212587063,
251
+ "loss": 0.0481,
252
+ "mean_token_accuracy": 0.9865269219875336,
253
+ "num_tokens": 3701088.0,
254
  "step": 650
255
  },
256
  {
257
  "epoch": 1.627640313820157,
258
+ "grad_norm": 0.08241600543260574,
259
+ "learning_rate": 0.00045835346001060117,
260
+ "loss": 0.0658,
261
+ "mean_token_accuracy": 0.9819342768192292,
262
+ "num_tokens": 3842946.0,
263
  "step": 675
264
  },
265
  {
266
  "epoch": 1.687990343995172,
267
+ "grad_norm": 0.05363951995968819,
268
+ "learning_rate": 0.0004539441837971359,
269
+ "loss": 0.0414,
270
+ "mean_token_accuracy": 0.9879044550657272,
271
+ "num_tokens": 3984790.0,
272
  "step": 700
273
  },
274
  {
275
  "epoch": 1.748340374170187,
276
+ "grad_norm": 0.0792899951338768,
277
+ "learning_rate": 0.00044938066073118524,
278
+ "loss": 0.057,
279
+ "mean_token_accuracy": 0.983517536520958,
280
+ "num_tokens": 4126482.0,
281
  "step": 725
282
  },
283
  {
284
  "epoch": 1.8086904043452021,
285
+ "grad_norm": 0.05479871854186058,
286
+ "learning_rate": 0.00044466643795157515,
287
+ "loss": 0.0447,
288
+ "mean_token_accuracy": 0.9873181569576264,
289
+ "num_tokens": 4269376.0,
290
  "step": 750
291
  },
292
  {
293
  "epoch": 1.8690404345202172,
294
+ "grad_norm": 0.06352550536394119,
295
+ "learning_rate": 0.00043980517973312485,
296
+ "loss": 0.0524,
297
+ "mean_token_accuracy": 0.9851584023237229,
298
+ "num_tokens": 4411323.0,
299
  "step": 775
300
  },
301
  {
302
  "epoch": 1.9293904646952322,
303
+ "grad_norm": 0.07205910980701447,
304
+ "learning_rate": 0.00043480066463847576,
305
+ "loss": 0.0387,
306
+ "mean_token_accuracy": 0.9883100253343582,
307
+ "num_tokens": 4555676.0,
308
  "step": 800
309
  },
310
  {
311
  "epoch": 1.9897404948702473,
312
+ "grad_norm": 0.12736694514751434,
313
+ "learning_rate": 0.0004296567825810876,
314
+ "loss": 0.0484,
315
+ "mean_token_accuracy": 0.9870092964172363,
316
+ "num_tokens": 4688499.0,
317
  "step": 825
318
  },
319
  {
320
  "epoch": 2.0,
321
+ "eval_loss": 0.05405727028846741,
322
+ "eval_mean_token_accuracy": 0.9849017378446218,
323
  "eval_num_tokens": 4708360.0,
324
+ "eval_runtime": 15.9245,
325
+ "eval_samples_per_second": 23.172,
326
+ "eval_steps_per_second": 11.617,
327
  "step": 830
328
  },
329
  {
330
  "epoch": 2.048280024140012,
331
+ "grad_norm": 0.07416768372058868,
332
+ "learning_rate": 0.00042437753180168345,
333
+ "loss": 0.0454,
334
+ "mean_token_accuracy": 0.9866191566604929,
335
+ "num_tokens": 4835961.0,
336
  "step": 850
337
  },
338
  {
339
  "epoch": 2.1086300543150274,
340
+ "grad_norm": 0.1151156798005104,
341
+ "learning_rate": 0.00041896701576049384,
342
+ "loss": 0.0346,
343
+ "mean_token_accuracy": 0.9898207432031632,
344
+ "num_tokens": 4967947.0,
345
  "step": 875
346
  },
347
  {
348
  "epoch": 2.1689800844900424,
349
+ "grad_norm": 0.09905433654785156,
350
+ "learning_rate": 0.00041342943994771616,
351
+ "loss": 0.0476,
352
+ "mean_token_accuracy": 0.9861520522832871,
353
+ "num_tokens": 5119123.0,
354
  "step": 900
355
  },
356
  {
357
  "epoch": 2.2293301146650575,
358
+ "grad_norm": 0.1043798103928566,
359
+ "learning_rate": 0.0004077691086146677,
360
+ "loss": 0.0337,
361
+ "mean_token_accuracy": 0.9898509311676026,
362
+ "num_tokens": 5251750.0,
363
  "step": 925
364
  },
365
  {
366
  "epoch": 2.2896801448400725,
367
+ "grad_norm": 0.08311837911605835,
368
+ "learning_rate": 0.0004019904214281739,
369
+ "loss": 0.0497,
370
+ "mean_token_accuracy": 0.985140620470047,
371
+ "num_tokens": 5403588.0,
372
  "step": 950
373
  },
374
  {
375
  "epoch": 2.3500301750150876,
376
+ "grad_norm": 0.06609100848436356,
377
+ "learning_rate": 0.00039609787005079176,
378
+ "loss": 0.0329,
379
+ "mean_token_accuracy": 0.9901149165630341,
380
+ "num_tokens": 5537023.0,
381
  "step": 975
382
  },
383
  {
384
  "epoch": 2.4103802051901027,
385
+ "grad_norm": 0.06421645730733871,
386
+ "learning_rate": 0.0003900960346495268,
387
+ "loss": 0.0526,
388
+ "mean_token_accuracy": 0.9848462229967118,
389
+ "num_tokens": 5688978.0,
390
  "step": 1000
391
  },
392
  {
393
  "epoch": 2.4707302353651177,
394
+ "grad_norm": 0.07097386568784714,
395
+ "learning_rate": 0.0003839895803357572,
396
+ "loss": 0.031,
397
+ "mean_token_accuracy": 0.9907027989625931,
398
+ "num_tokens": 5820828.0,
399
  "step": 1025
400
  },
401
  {
402
  "epoch": 2.5310802655401328,
403
+ "grad_norm": 0.06456780433654785,
404
+ "learning_rate": 0.0003777832535391326,
405
+ "loss": 0.0423,
406
+ "mean_token_accuracy": 0.9876184749603272,
407
+ "num_tokens": 5969776.0,
408
  "step": 1050
409
  },
410
  {
411
  "epoch": 2.591430295715148,
412
+ "grad_norm": 0.08945687860250473,
413
+ "learning_rate": 0.000371481878318265,
414
+ "loss": 0.031,
415
+ "mean_token_accuracy": 0.9904923564195633,
416
+ "num_tokens": 6101538.0,
417
  "step": 1075
418
  },
419
  {
420
  "epoch": 2.651780325890163,
421
+ "grad_norm": 0.059836167842149734,
422
+ "learning_rate": 0.0003650903526110812,
423
+ "loss": 0.0468,
424
+ "mean_token_accuracy": 0.9865248650312424,
425
+ "num_tokens": 6254993.0,
426
  "step": 1100
427
  },
428
  {
429
  "epoch": 2.712130356065178,
430
+ "grad_norm": 0.08414298295974731,
431
+ "learning_rate": 0.00035861364442774926,
432
+ "loss": 0.0306,
433
+ "mean_token_accuracy": 0.9909639322757721,
434
+ "num_tokens": 6389539.0,
435
  "step": 1125
436
  },
437
  {
438
  "epoch": 2.772480386240193,
439
+ "grad_norm": 0.07291322946548462,
440
+ "learning_rate": 0.00035205678798914004,
441
+ "loss": 0.0399,
442
+ "mean_token_accuracy": 0.988108462691307,
443
+ "num_tokens": 6541057.0,
444
  "step": 1150
445
  },
446
  {
447
  "epoch": 2.832830416415208,
448
+ "grad_norm": 0.05344131961464882,
449
+ "learning_rate": 0.0003454248798138234,
450
+ "loss": 0.0303,
451
+ "mean_token_accuracy": 0.9908234792947769,
452
+ "num_tokens": 6674451.0,
453
  "step": 1175
454
  },
455
  {
456
  "epoch": 2.8931804465902236,
457
+ "grad_norm": 0.08897681534290314,
458
+ "learning_rate": 0.0003387230747566431,
459
+ "loss": 0.0425,
460
+ "mean_token_accuracy": 0.9875851464271546,
461
+ "num_tokens": 6827173.0,
462
  "step": 1200
463
  },
464
  {
465
  "epoch": 2.9535304767652386,
466
+ "grad_norm": 0.043970294296741486,
467
+ "learning_rate": 0.0003319565820019463,
468
+ "loss": 0.0289,
469
+ "mean_token_accuracy": 0.9910114580392837,
470
+ "num_tokens": 6961397.0,
471
  "step": 1225
472
  },
473
  {
474
  "epoch": 3.0,
475
+ "eval_loss": 0.04635874554514885,
476
+ "eval_mean_token_accuracy": 0.9874976680085465,
477
  "eval_num_tokens": 7062540.0,
478
+ "eval_runtime": 15.9412,
479
+ "eval_samples_per_second": 23.148,
480
+ "eval_steps_per_second": 11.605,
481
  "step": 1245
482
  }
483
  ],
 
498
  "attributes": {}
499
  }
500
  },
501
+ "total_flos": 3.068133596909875e+17,
502
  "train_batch_size": 2,
503
  "trial_name": null,
504
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aca81315bde14ece69eb9f4dddd5f4b7bb5393ac99e6a78ae025523ceef1a1d
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8fddb2b708f03bb265536e876e142110e772af61dc986b3ada0247ff2f7859c
3
  size 6097