robertou2 commited on
Commit
176748c
·
verified ·
1 Parent(s): aa183aa

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -12,21 +12,21 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 112,
16
  "lora_bias": false,
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
- "r": 56,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "o_proj",
27
- "gate_up_proj",
28
  "qkv_proj",
29
- "down_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 256,
16
  "lora_bias": false,
17
  "lora_dropout": 0,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
21
  "peft_type": "LORA",
22
+ "r": 128,
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "qkv_proj",
27
+ "o_proj",
28
+ "down_proj",
29
+ "gate_up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:602f53d6959b4f46f004a4ecc196bb614c0a51b5eba000ca3d35f316695a8894
3
- size 161515608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5f2a4574a1cf3760f6f91ba60977d9722c117968695c26c25c621af59a4e41c
3
+ size 369134112
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a57114afad149c40d4f4fb59f0ff7a6cb5b55a70484d2f07b47fb05964b101fa
3
- size 323181259
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca865aab08124c6a7502014773b63e03e2db1b57e13d98db1ce833c9c645aa41
3
+ size 738417355
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:476926ce2c69ab4ccb2e05f1160789ff4fbbecc78d81f0f191944d21b8ccd89e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:580cf0c8deda9a5cdf877c15cfecec4f5a37dd72edd01f252f4b56d158b7550a
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.888888888888889,
6
  "eval_steps": 500,
7
- "global_step": 95,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12,7 +12,7 @@
12
  {
13
  "entropy": 2.2323372662067413,
14
  "epoch": 0.03065134099616858,
15
- "grad_norm": 23.625,
16
  "learning_rate": 0.0,
17
  "loss": 2.7706,
18
  "mean_token_accuracy": 0.41634324193000793,
@@ -22,7 +22,7 @@
22
  {
23
  "entropy": 2.174584299325943,
24
  "epoch": 0.06130268199233716,
25
- "grad_norm": 15.9375,
26
  "learning_rate": 2e-06,
27
  "loss": 2.4332,
28
  "mean_token_accuracy": 0.41893551871180534,
@@ -30,934 +30,974 @@
30
  "step": 2
31
  },
32
  {
33
- "entropy": 2.02590711414814,
34
  "epoch": 0.09195402298850575,
35
- "grad_norm": 13.875,
36
  "learning_rate": 4e-06,
37
- "loss": 2.3311,
38
- "mean_token_accuracy": 0.44238732755184174,
39
  "num_tokens": 5582.0,
40
  "step": 3
41
  },
42
  {
43
- "entropy": 2.293337270617485,
44
  "epoch": 0.12260536398467432,
45
- "grad_norm": 17.5,
46
  "learning_rate": 6e-06,
47
- "loss": 2.4635,
48
- "mean_token_accuracy": 0.4375271461904049,
49
  "num_tokens": 7064.0,
50
  "step": 4
51
  },
52
  {
53
- "entropy": 2.307544246315956,
54
  "epoch": 0.1532567049808429,
55
- "grad_norm": 11.875,
56
  "learning_rate": 8e-06,
57
- "loss": 2.1979,
58
- "mean_token_accuracy": 0.43325819820165634,
59
  "num_tokens": 9091.0,
60
  "step": 5
61
  },
62
  {
63
- "entropy": 2.4334808588027954,
64
  "epoch": 0.1839080459770115,
65
- "grad_norm": 12.8125,
66
  "learning_rate": 9.999999999999999e-06,
67
- "loss": 2.4082,
68
- "mean_token_accuracy": 0.46957090869545937,
69
  "num_tokens": 10556.0,
70
  "step": 6
71
  },
72
  {
73
- "entropy": 2.4482616782188416,
74
  "epoch": 0.21455938697318008,
75
- "grad_norm": 10.5625,
76
  "learning_rate": 1.2e-05,
77
- "loss": 2.2519,
78
- "mean_token_accuracy": 0.4758397154510021,
79
  "num_tokens": 12215.0,
80
  "step": 7
81
  },
82
  {
83
- "entropy": 2.358666628599167,
84
  "epoch": 0.24521072796934865,
85
- "grad_norm": 8.8125,
86
  "learning_rate": 1.4e-05,
87
- "loss": 2.0026,
88
- "mean_token_accuracy": 0.5035362914204597,
89
  "num_tokens": 13939.0,
90
  "step": 8
91
  },
92
  {
93
- "entropy": 2.3532307744026184,
94
  "epoch": 0.27586206896551724,
95
- "grad_norm": 7.1875,
96
  "learning_rate": 1.6e-05,
97
- "loss": 1.886,
98
- "mean_token_accuracy": 0.49147794023156166,
99
  "num_tokens": 15986.0,
100
  "step": 9
101
  },
102
  {
103
- "entropy": 2.328258216381073,
104
  "epoch": 0.3065134099616858,
105
- "grad_norm": 7.0625,
106
  "learning_rate": 1.8e-05,
107
- "loss": 1.8628,
108
- "mean_token_accuracy": 0.5151920653879642,
109
  "num_tokens": 18444.0,
110
  "step": 10
111
  },
112
  {
113
- "entropy": 2.0767437368631363,
114
  "epoch": 0.3371647509578544,
115
- "grad_norm": 5.4375,
116
  "learning_rate": 1.9999999999999998e-05,
117
- "loss": 1.6635,
118
- "mean_token_accuracy": 0.5702134519815445,
119
  "num_tokens": 21127.0,
120
  "step": 11
121
  },
122
  {
123
- "entropy": 2.2949997633695602,
124
  "epoch": 0.367816091954023,
125
- "grad_norm": 6.53125,
126
  "learning_rate": 2.2e-05,
127
- "loss": 1.9033,
128
- "mean_token_accuracy": 0.5190943852066994,
129
  "num_tokens": 23308.0,
130
  "step": 12
131
  },
132
  {
133
- "entropy": 2.2641966193914413,
134
  "epoch": 0.39846743295019155,
135
- "grad_norm": 6.65625,
136
  "learning_rate": 2.4e-05,
137
- "loss": 1.9755,
138
- "mean_token_accuracy": 0.5264540836215019,
139
  "num_tokens": 25072.0,
140
  "step": 13
141
  },
142
  {
143
- "entropy": 2.3164361864328384,
144
  "epoch": 0.42911877394636017,
145
- "grad_norm": 7.90625,
146
  "learning_rate": 2.6000000000000002e-05,
147
- "loss": 1.9846,
148
- "mean_token_accuracy": 0.5071298070251942,
149
  "num_tokens": 26450.0,
150
  "step": 14
151
  },
152
  {
153
- "entropy": 2.2884700149297714,
154
  "epoch": 0.45977011494252873,
155
- "grad_norm": 6.875,
156
  "learning_rate": 2.8e-05,
157
- "loss": 1.8003,
158
- "mean_token_accuracy": 0.5303994193673134,
159
  "num_tokens": 27912.0,
160
  "step": 15
161
  },
162
  {
163
- "entropy": 2.163148954510689,
164
  "epoch": 0.4904214559386973,
165
- "grad_norm": 6.53125,
166
  "learning_rate": 3e-05,
167
- "loss": 1.6794,
168
- "mean_token_accuracy": 0.5747136920690536,
169
  "num_tokens": 29392.0,
170
  "step": 16
171
  },
172
  {
173
- "entropy": 2.030226394534111,
174
  "epoch": 0.5210727969348659,
175
- "grad_norm": 4.96875,
176
  "learning_rate": 2.998951057182598e-05,
177
- "loss": 1.4781,
178
- "mean_token_accuracy": 0.5864466205239296,
179
  "num_tokens": 31417.0,
180
  "step": 17
181
  },
182
  {
183
- "entropy": 2.046441972255707,
184
  "epoch": 0.5517241379310345,
185
- "grad_norm": 6.09375,
186
  "learning_rate": 2.99580569577177e-05,
187
- "loss": 1.7775,
188
- "mean_token_accuracy": 0.531848881393671,
189
  "num_tokens": 33727.0,
190
  "step": 18
191
  },
192
  {
193
- "entropy": 1.947427824139595,
194
  "epoch": 0.5823754789272031,
195
- "grad_norm": 5.78125,
196
  "learning_rate": 2.9905683148398642e-05,
197
- "loss": 1.5924,
198
- "mean_token_accuracy": 0.5740942284464836,
199
  "num_tokens": 35836.0,
200
  "step": 19
201
  },
202
  {
203
- "entropy": 2.018353298306465,
204
  "epoch": 0.6130268199233716,
205
- "grad_norm": 5.96875,
206
  "learning_rate": 2.9832462393376926e-05,
207
- "loss": 1.7183,
208
- "mean_token_accuracy": 0.5531399250030518,
209
  "num_tokens": 37639.0,
210
  "step": 20
211
  },
212
  {
213
- "entropy": 2.198324888944626,
214
  "epoch": 0.6436781609195402,
215
- "grad_norm": 5.75,
216
  "learning_rate": 2.9738497098499325e-05,
217
- "loss": 1.7776,
218
- "mean_token_accuracy": 0.5184755437076092,
219
  "num_tokens": 39351.0,
220
  "step": 21
221
  },
222
  {
223
- "entropy": 1.9849777817726135,
224
  "epoch": 0.6743295019157088,
225
- "grad_norm": 6.09375,
226
  "learning_rate": 2.9623918682727355e-05,
227
- "loss": 1.5472,
228
- "mean_token_accuracy": 0.5781424902379513,
229
  "num_tokens": 41024.0,
230
  "step": 22
231
  },
232
  {
233
- "entropy": 2.0799703299999237,
234
  "epoch": 0.7049808429118773,
235
- "grad_norm": 6.0625,
236
  "learning_rate": 2.9488887394336025e-05,
237
- "loss": 1.7815,
238
- "mean_token_accuracy": 0.5519590191543102,
239
  "num_tokens": 42624.0,
240
  "step": 23
241
  },
242
  {
243
- "entropy": 2.188886523246765,
244
  "epoch": 0.735632183908046,
245
- "grad_norm": 7.78125,
246
  "learning_rate": 2.9333592086792113e-05,
247
- "loss": 1.9212,
248
- "mean_token_accuracy": 0.5380603447556496,
249
  "num_tokens": 43836.0,
250
  "step": 24
251
  },
252
  {
253
- "entropy": 2.061448335647583,
254
  "epoch": 0.7662835249042146,
255
- "grad_norm": 5.21875,
256
  "learning_rate": 2.9158249954625514e-05,
257
- "loss": 1.7393,
258
- "mean_token_accuracy": 0.5270493216812611,
259
  "num_tokens": 45870.0,
260
  "step": 25
261
  },
262
  {
263
- "entropy": 2.048309415578842,
264
  "epoch": 0.7969348659003831,
265
- "grad_norm": 5.40625,
266
  "learning_rate": 2.8963106229663064e-05,
267
- "loss": 1.6848,
268
- "mean_token_accuracy": 0.5622207410633564,
269
  "num_tokens": 47664.0,
270
  "step": 26
271
  },
272
  {
273
- "entropy": 2.029194623231888,
274
  "epoch": 0.8275862068965517,
275
- "grad_norm": 5.125,
276
  "learning_rate": 2.8748433838049642e-05,
277
- "loss": 1.6591,
278
- "mean_token_accuracy": 0.5621702149510384,
279
  "num_tokens": 49646.0,
280
  "step": 27
281
  },
282
  {
283
- "entropy": 2.010026901960373,
284
  "epoch": 0.8582375478927203,
285
- "grad_norm": 5.28125,
286
  "learning_rate": 2.8514533018536286e-05,
287
- "loss": 1.5141,
288
- "mean_token_accuracy": 0.5880535058677197,
289
  "num_tokens": 51235.0,
290
  "step": 28
291
  },
292
  {
293
- "entropy": 2.0415484458208084,
294
  "epoch": 0.8888888888888888,
295
- "grad_norm": 5.5,
296
  "learning_rate": 2.8261730902569146e-05,
297
- "loss": 1.6637,
298
- "mean_token_accuracy": 0.5668029375374317,
299
  "num_tokens": 53037.0,
300
  "step": 29
301
  },
302
  {
303
- "entropy": 2.0857083946466446,
304
  "epoch": 0.9195402298850575,
305
- "grad_norm": 5.0625,
306
  "learning_rate": 2.7990381056766583e-05,
307
- "loss": 1.6959,
308
- "mean_token_accuracy": 0.5493744164705276,
309
  "num_tokens": 54826.0,
310
  "step": 30
311
  },
312
  {
313
- "entropy": 2.112009719014168,
314
  "epoch": 0.9501915708812261,
315
- "grad_norm": 5.40625,
316
  "learning_rate": 2.770086298842426e-05,
317
- "loss": 1.6559,
318
- "mean_token_accuracy": 0.5552288331091404,
319
  "num_tokens": 56737.0,
320
  "step": 31
321
  },
322
  {
323
- "entropy": 2.1103257089853287,
324
  "epoch": 0.9808429118773946,
325
- "grad_norm": 6.21875,
326
  "learning_rate": 2.7393581614739924e-05,
327
- "loss": 1.6919,
328
- "mean_token_accuracy": 0.5538047850131989,
329
  "num_tokens": 58084.0,
330
  "step": 32
331
  },
332
  {
333
- "entropy": 1.8903157711029053,
334
  "epoch": 1.0,
335
- "grad_norm": 7.21875,
336
  "learning_rate": 2.7068966696500025e-05,
337
- "loss": 1.6264,
338
- "mean_token_accuracy": 0.5663587927818299,
339
  "num_tokens": 59142.0,
340
  "step": 33
341
  },
342
  {
343
- "entropy": 1.831245243549347,
344
  "epoch": 1.0306513409961686,
345
- "grad_norm": 4.5625,
346
  "learning_rate": 2.672747223702045e-05,
347
- "loss": 1.3045,
348
- "mean_token_accuracy": 0.6326133832335472,
349
  "num_tokens": 60897.0,
350
  "step": 34
351
  },
352
  {
353
- "entropy": 1.9981429725885391,
354
  "epoch": 1.0613026819923372,
355
- "grad_norm": 5.59375,
356
  "learning_rate": 2.6369575847181795e-05,
357
- "loss": 1.5418,
358
- "mean_token_accuracy": 0.6115933358669281,
359
  "num_tokens": 62325.0,
360
  "step": 35
361
  },
362
  {
363
- "entropy": 1.8222165703773499,
364
  "epoch": 1.0919540229885056,
365
- "grad_norm": 4.59375,
366
  "learning_rate": 2.5995778077447393e-05,
367
- "loss": 1.2274,
368
- "mean_token_accuracy": 0.6454463005065918,
369
  "num_tokens": 64163.0,
370
  "step": 36
371
  },
372
  {
373
- "entropy": 1.7093443274497986,
374
  "epoch": 1.1226053639846743,
375
- "grad_norm": 4.4375,
376
  "learning_rate": 2.5606601717798212e-05,
377
- "loss": 1.2805,
378
- "mean_token_accuracy": 0.6579162031412125,
379
  "num_tokens": 66168.0,
380
  "step": 37
381
  },
382
  {
383
- "entropy": 1.74972003698349,
384
  "epoch": 1.1532567049808429,
385
- "grad_norm": 4.34375,
386
  "learning_rate": 2.520259106656379e-05,
387
- "loss": 1.2566,
388
- "mean_token_accuracy": 0.6366828829050064,
389
  "num_tokens": 68398.0,
390
  "step": 38
391
  },
392
  {
393
- "entropy": 1.7153682857751846,
394
  "epoch": 1.1839080459770115,
395
- "grad_norm": 4.78125,
396
  "learning_rate": 2.4784311169171818e-05,
397
- "loss": 1.2717,
398
- "mean_token_accuracy": 0.6223325058817863,
399
  "num_tokens": 70548.0,
400
  "step": 39
401
  },
402
  {
403
- "entropy": 1.7037858068943024,
404
  "epoch": 1.21455938697318,
405
- "grad_norm": 5.15625,
406
  "learning_rate": 2.4352347027881003e-05,
407
- "loss": 1.2255,
408
- "mean_token_accuracy": 0.6531995758414268,
409
  "num_tokens": 72463.0,
410
  "step": 40
411
  },
412
  {
413
- "entropy": 1.5526579767465591,
414
  "epoch": 1.2452107279693487,
415
- "grad_norm": 5.6875,
416
  "learning_rate": 2.3907302783602522e-05,
417
- "loss": 1.1533,
418
- "mean_token_accuracy": 0.662347637116909,
419
  "num_tokens": 74061.0,
420
  "step": 41
421
  },
422
  {
423
- "entropy": 1.5910945385694504,
424
  "epoch": 1.2758620689655173,
425
- "grad_norm": 4.78125,
426
  "learning_rate": 2.344980087095433e-05,
427
- "loss": 1.2152,
428
- "mean_token_accuracy": 0.6524857208132744,
429
  "num_tokens": 76130.0,
430
  "step": 42
431
  },
432
  {
433
- "entropy": 1.5662062019109726,
434
  "epoch": 1.3065134099616857,
435
- "grad_norm": 5.46875,
436
  "learning_rate": 2.298048114773005e-05,
437
- "loss": 1.2326,
438
- "mean_token_accuracy": 0.6478348523378372,
439
  "num_tokens": 77912.0,
440
  "step": 43
441
  },
442
  {
443
- "entropy": 1.471334233880043,
444
  "epoch": 1.3371647509578544,
445
- "grad_norm": 4.875,
446
  "learning_rate": 2.25e-05,
447
- "loss": 1.1379,
448
- "mean_token_accuracy": 0.6691200658679008,
449
  "num_tokens": 79873.0,
450
  "step": 44
451
  },
452
  {
453
- "entropy": 1.586159959435463,
454
  "epoch": 1.367816091954023,
455
- "grad_norm": 5.28125,
456
  "learning_rate": 2.200902942409593e-05,
457
- "loss": 1.1442,
458
- "mean_token_accuracy": 0.6521164402365685,
459
  "num_tokens": 81708.0,
460
  "step": 45
461
  },
462
  {
463
- "entropy": 1.5281111598014832,
464
  "epoch": 1.3984674329501916,
465
- "grad_norm": 6.125,
466
  "learning_rate": 2.1508256086763372e-05,
467
- "loss": 1.2369,
468
- "mean_token_accuracy": 0.6489557102322578,
469
  "num_tokens": 83479.0,
470
  "step": 46
471
  },
472
  {
473
- "entropy": 1.5045715868473053,
474
  "epoch": 1.4291187739463602,
475
- "grad_norm": 6.15625,
476
  "learning_rate": 2.0998380364796112e-05,
477
- "loss": 1.2949,
478
- "mean_token_accuracy": 0.6465602889657021,
479
  "num_tokens": 85091.0,
480
  "step": 47
481
  },
482
  {
483
- "entropy": 1.5538268089294434,
484
  "epoch": 1.4597701149425286,
485
- "grad_norm": 5.09375,
486
  "learning_rate": 2.0480115365495928e-05,
487
- "loss": 1.1747,
488
- "mean_token_accuracy": 0.6478204801678658,
489
  "num_tokens": 87067.0,
490
  "step": 48
491
  },
492
  {
493
- "entropy": 1.492392674088478,
494
  "epoch": 1.4904214559386972,
495
- "grad_norm": 4.71875,
496
  "learning_rate": 1.995418592932751e-05,
497
- "loss": 1.0862,
498
- "mean_token_accuracy": 0.693995900452137,
499
  "num_tokens": 89257.0,
500
  "step": 49
501
  },
502
  {
503
- "entropy": 1.540455773472786,
504
  "epoch": 1.5210727969348659,
505
- "grad_norm": 5.53125,
506
  "learning_rate": 1.9421327616163564e-05,
507
- "loss": 1.2151,
508
- "mean_token_accuracy": 0.654072530567646,
509
  "num_tokens": 91129.0,
510
  "step": 50
511
  },
512
  {
513
- "entropy": 1.5558712631464005,
514
  "epoch": 1.5517241379310345,
515
- "grad_norm": 5.21875,
516
  "learning_rate": 1.888228567653781e-05,
517
- "loss": 1.1343,
518
- "mean_token_accuracy": 0.6754168346524239,
519
  "num_tokens": 93217.0,
520
  "step": 51
521
  },
522
  {
523
- "entropy": 1.5656412094831467,
524
  "epoch": 1.582375478927203,
525
- "grad_norm": 8.375,
526
  "learning_rate": 1.8337814009344716e-05,
527
- "loss": 1.1019,
528
- "mean_token_accuracy": 0.6633822396397591,
529
  "num_tokens": 94882.0,
530
  "step": 52
531
  },
532
  {
533
- "entropy": 1.6264984011650085,
534
  "epoch": 1.6130268199233715,
535
- "grad_norm": 6.5,
536
  "learning_rate": 1.778867410744372e-05,
537
- "loss": 1.1993,
538
- "mean_token_accuracy": 0.6627216190099716,
539
  "num_tokens": 96226.0,
540
  "step": 53
541
  },
542
  {
543
- "entropy": 1.5609679520130157,
544
  "epoch": 1.6436781609195403,
545
- "grad_norm": 5.71875,
546
  "learning_rate": 1.7235633992642615e-05,
547
- "loss": 1.1445,
548
- "mean_token_accuracy": 0.6396612226963043,
549
  "num_tokens": 98064.0,
550
  "step": 54
551
  },
552
  {
553
- "entropy": 1.6555797308683395,
554
  "epoch": 1.6743295019157087,
555
- "grad_norm": 5.21875,
556
  "learning_rate": 1.667946714154962e-05,
557
- "loss": 1.1914,
558
- "mean_token_accuracy": 0.630947545170784,
559
  "num_tokens": 99875.0,
560
  "step": 55
561
  },
562
  {
563
- "entropy": 1.4573774337768555,
564
  "epoch": 1.7049808429118773,
565
- "grad_norm": 4.59375,
566
  "learning_rate": 1.6120951403796367e-05,
567
- "loss": 1.1325,
568
- "mean_token_accuracy": 0.6682965606451035,
569
  "num_tokens": 102303.0,
570
  "step": 56
571
  },
572
  {
573
- "entropy": 1.559302657842636,
574
  "epoch": 1.735632183908046,
575
- "grad_norm": 5.9375,
576
  "learning_rate": 1.5560867914144887e-05,
577
- "loss": 1.1645,
578
- "mean_token_accuracy": 0.6583909243345261,
579
  "num_tokens": 103806.0,
580
  "step": 57
581
  },
582
  {
583
- "entropy": 1.6225543022155762,
584
  "epoch": 1.7662835249042146,
585
- "grad_norm": 7.1875,
586
  "learning_rate": 1.5e-05,
587
- "loss": 1.2239,
588
- "mean_token_accuracy": 0.666826568543911,
589
  "num_tokens": 105012.0,
590
  "step": 58
591
  },
592
  {
593
- "entropy": 1.6973845958709717,
594
  "epoch": 1.7969348659003832,
595
- "grad_norm": 6.3125,
596
  "learning_rate": 1.4439132085855117e-05,
597
- "loss": 1.2516,
598
- "mean_token_accuracy": 0.6574011594057083,
599
  "num_tokens": 106373.0,
600
  "step": 59
601
  },
602
  {
603
- "entropy": 1.630146011710167,
604
  "epoch": 1.8275862068965516,
605
- "grad_norm": 5.78125,
606
  "learning_rate": 1.3879048596203637e-05,
607
- "loss": 1.1719,
608
- "mean_token_accuracy": 0.6703185066580772,
609
  "num_tokens": 107938.0,
610
  "step": 60
611
  },
612
  {
613
- "entropy": 1.5937796980142593,
614
  "epoch": 1.8582375478927204,
615
- "grad_norm": 5.8125,
616
  "learning_rate": 1.3320532858450382e-05,
617
- "loss": 1.1739,
618
- "mean_token_accuracy": 0.6567527502775192,
619
  "num_tokens": 109587.0,
620
  "step": 61
621
  },
622
  {
623
- "entropy": 1.673499509692192,
624
  "epoch": 1.8888888888888888,
625
- "grad_norm": 6.625,
626
  "learning_rate": 1.2764366007357382e-05,
627
- "loss": 1.4405,
628
- "mean_token_accuracy": 0.6155448481440544,
629
  "num_tokens": 111253.0,
630
  "step": 62
631
  },
632
  {
633
- "entropy": 1.61783929169178,
634
  "epoch": 1.9195402298850575,
635
- "grad_norm": 6.125,
636
  "learning_rate": 1.2211325892556282e-05,
637
- "loss": 1.2447,
638
- "mean_token_accuracy": 0.6791554242372513,
639
  "num_tokens": 112833.0,
640
  "step": 63
641
  },
642
  {
643
- "entropy": 1.5331860035657883,
644
  "epoch": 1.950191570881226,
645
- "grad_norm": 6.25,
646
  "learning_rate": 1.1662185990655285e-05,
647
- "loss": 1.2378,
648
- "mean_token_accuracy": 0.6611459106206894,
649
  "num_tokens": 114573.0,
650
  "step": 64
651
  },
652
  {
653
- "entropy": 1.5267712771892548,
654
  "epoch": 1.9808429118773945,
655
- "grad_norm": 4.9375,
656
  "learning_rate": 1.1117714323462188e-05,
657
- "loss": 1.105,
658
- "mean_token_accuracy": 0.6428326666355133,
659
  "num_tokens": 116981.0,
660
  "step": 65
661
  },
662
  {
663
- "entropy": 1.596066379547119,
664
  "epoch": 2.0,
665
- "grad_norm": 7.21875,
666
  "learning_rate": 1.0578672383836437e-05,
667
- "loss": 1.3919,
668
- "mean_token_accuracy": 0.6277154445648193,
669
  "num_tokens": 118284.0,
670
  "step": 66
671
  },
672
  {
673
- "entropy": 1.6225826889276505,
674
  "epoch": 2.0306513409961684,
675
- "grad_norm": 5.78125,
676
  "learning_rate": 1.0045814070672498e-05,
677
- "loss": 0.934,
678
- "mean_token_accuracy": 0.7343822047114372,
679
  "num_tokens": 119663.0,
680
  "step": 67
681
  },
682
  {
683
- "entropy": 1.5247658640146255,
684
  "epoch": 2.0613026819923372,
685
- "grad_norm": 4.5625,
686
  "learning_rate": 9.519884634504074e-06,
687
- "loss": 0.8369,
688
- "mean_token_accuracy": 0.7519923225045204,
689
  "num_tokens": 121476.0,
690
  "step": 68
691
  },
692
  {
693
- "entropy": 1.5478522330522537,
694
  "epoch": 2.0919540229885056,
695
- "grad_norm": 4.1875,
696
  "learning_rate": 9.001619635203889e-06,
697
- "loss": 0.84,
698
- "mean_token_accuracy": 0.7427392601966858,
699
  "num_tokens": 123792.0,
700
  "step": 69
701
  },
702
  {
703
- "entropy": 1.5817518830299377,
704
  "epoch": 2.1226053639846745,
705
- "grad_norm": 5.25,
706
  "learning_rate": 8.491743913236629e-06,
707
- "loss": 0.8532,
708
- "mean_token_accuracy": 0.7553950697183609,
709
  "num_tokens": 125329.0,
710
  "step": 70
711
  },
712
  {
713
- "entropy": 1.5682816207408905,
714
  "epoch": 2.153256704980843,
715
- "grad_norm": 5.5,
716
  "learning_rate": 7.99097057590407e-06,
717
- "loss": 0.8284,
718
- "mean_token_accuracy": 0.7582268938422203,
719
  "num_tokens": 126654.0,
720
  "step": 71
721
  },
722
  {
723
- "entropy": 1.4128143042325974,
724
  "epoch": 2.1839080459770113,
725
- "grad_norm": 4.625,
726
  "learning_rate": 7.500000000000004e-06,
727
- "loss": 0.8072,
728
- "mean_token_accuracy": 0.766459122300148,
729
  "num_tokens": 128629.0,
730
  "step": 72
731
  },
732
  {
733
- "entropy": 1.4567322432994843,
734
  "epoch": 2.21455938697318,
735
- "grad_norm": 5.46875,
736
  "learning_rate": 7.019518852269953e-06,
737
- "loss": 1.0488,
738
- "mean_token_accuracy": 0.7072227671742439,
739
  "num_tokens": 130344.0,
740
  "step": 73
741
  },
742
  {
743
- "entropy": 1.4166576564311981,
744
  "epoch": 2.2452107279693485,
745
- "grad_norm": 4.96875,
746
  "learning_rate": 6.55019912904567e-06,
747
- "loss": 0.9092,
748
- "mean_token_accuracy": 0.7404436245560646,
749
  "num_tokens": 132152.0,
750
  "step": 74
751
  },
752
  {
753
- "entropy": 1.4273284822702408,
754
  "epoch": 2.2758620689655173,
755
- "grad_norm": 4.5,
756
  "learning_rate": 6.092697216397478e-06,
757
- "loss": 0.8233,
758
- "mean_token_accuracy": 0.7407987862825394,
759
  "num_tokens": 134144.0,
760
  "step": 75
761
  },
762
  {
763
- "entropy": 1.3403969407081604,
764
  "epoch": 2.3065134099616857,
765
- "grad_norm": 4.3125,
766
  "learning_rate": 5.647652972118998e-06,
767
- "loss": 0.924,
768
- "mean_token_accuracy": 0.7229798063635826,
769
  "num_tokens": 136715.0,
770
  "step": 76
771
  },
772
  {
773
- "entropy": 1.4036429971456528,
774
  "epoch": 2.3371647509578546,
775
- "grad_norm": 4.5625,
776
  "learning_rate": 5.2156888308281875e-06,
777
- "loss": 0.89,
778
- "mean_token_accuracy": 0.7286327704787254,
779
  "num_tokens": 138907.0,
780
  "step": 77
781
  },
782
  {
783
- "entropy": 1.397742137312889,
784
  "epoch": 2.367816091954023,
785
- "grad_norm": 4.96875,
786
  "learning_rate": 4.797408933436207e-06,
787
- "loss": 0.7942,
788
- "mean_token_accuracy": 0.7485231980681419,
789
  "num_tokens": 140536.0,
790
  "step": 78
791
  },
792
  {
793
- "entropy": 1.386961117386818,
794
  "epoch": 2.3984674329501914,
795
- "grad_norm": 5.65625,
796
  "learning_rate": 4.393398282201788e-06,
797
- "loss": 0.9061,
798
- "mean_token_accuracy": 0.7447740957140923,
799
  "num_tokens": 142205.0,
800
  "step": 79
801
  },
802
  {
803
- "entropy": 1.3736444562673569,
804
  "epoch": 2.42911877394636,
805
- "grad_norm": 5.5,
806
  "learning_rate": 4.004221922552608e-06,
807
- "loss": 0.8485,
808
- "mean_token_accuracy": 0.7546191215515137,
809
  "num_tokens": 143937.0,
810
  "step": 80
811
  },
812
  {
813
- "entropy": 1.3484344482421875,
814
  "epoch": 2.4597701149425286,
815
- "grad_norm": 5.15625,
816
  "learning_rate": 3.630424152818203e-06,
817
- "loss": 0.8728,
818
- "mean_token_accuracy": 0.7577220499515533,
819
  "num_tokens": 145867.0,
820
  "step": 81
821
  },
822
  {
823
- "entropy": 1.3517859131097794,
824
  "epoch": 2.4904214559386975,
825
- "grad_norm": 5.625,
826
  "learning_rate": 3.272527762979553e-06,
827
- "loss": 0.917,
828
- "mean_token_accuracy": 0.7402152791619301,
829
  "num_tokens": 147522.0,
830
  "step": 82
831
  },
832
  {
833
- "entropy": 1.427451640367508,
834
  "epoch": 2.521072796934866,
835
- "grad_norm": 6.0625,
836
  "learning_rate": 2.931033303499975e-06,
837
- "loss": 0.9471,
838
- "mean_token_accuracy": 0.7306742072105408,
839
  "num_tokens": 149154.0,
840
  "step": 83
841
  },
842
  {
843
- "entropy": 1.2776079773902893,
844
  "epoch": 2.5517241379310347,
845
- "grad_norm": 4.75,
846
  "learning_rate": 2.60641838526008e-06,
847
- "loss": 0.8647,
848
- "mean_token_accuracy": 0.7684408649802208,
849
  "num_tokens": 151443.0,
850
  "step": 84
851
  },
852
  {
853
- "entropy": 1.3705534487962723,
854
  "epoch": 2.582375478927203,
855
- "grad_norm": 5.21875,
856
  "learning_rate": 2.2991370115757383e-06,
857
- "loss": 0.8636,
858
- "mean_token_accuracy": 0.7471385598182678,
859
  "num_tokens": 153346.0,
860
  "step": 85
861
  },
862
  {
863
- "entropy": 1.4074051082134247,
864
  "epoch": 2.6130268199233715,
865
- "grad_norm": 5.59375,
866
  "learning_rate": 2.0096189432334194e-06,
867
- "loss": 0.874,
868
- "mean_token_accuracy": 0.7286683171987534,
869
  "num_tokens": 155041.0,
870
  "step": 86
871
  },
872
  {
873
- "entropy": 1.2744528949260712,
874
  "epoch": 2.6436781609195403,
875
- "grad_norm": 5.65625,
876
  "learning_rate": 1.7382690974308551e-06,
877
- "loss": 0.7612,
878
- "mean_token_accuracy": 0.768292061984539,
879
  "num_tokens": 156508.0,
880
  "step": 87
881
  },
882
  {
883
- "entropy": 1.322000876069069,
884
  "epoch": 2.6743295019157087,
885
- "grad_norm": 5.28125,
886
  "learning_rate": 1.4854669814637145e-06,
887
- "loss": 0.8423,
888
- "mean_token_accuracy": 0.7500675544142723,
889
  "num_tokens": 158506.0,
890
  "step": 88
891
  },
892
  {
893
- "entropy": 1.3921757936477661,
894
  "epoch": 2.704980842911877,
895
- "grad_norm": 4.90625,
896
  "learning_rate": 1.2515661619503572e-06,
897
- "loss": 0.8484,
898
- "mean_token_accuracy": 0.7282446771860123,
899
  "num_tokens": 160511.0,
900
  "step": 89
901
  },
902
  {
903
- "entropy": 1.3347049802541733,
904
  "epoch": 2.735632183908046,
905
- "grad_norm": 5.0625,
906
  "learning_rate": 1.036893770336938e-06,
907
- "loss": 0.8751,
908
- "mean_token_accuracy": 0.7406959384679794,
909
  "num_tokens": 162548.0,
910
  "step": 90
911
  },
912
  {
913
- "entropy": 1.296308308839798,
914
  "epoch": 2.766283524904215,
915
- "grad_norm": 4.78125,
916
  "learning_rate": 8.417500453744864e-07,
917
- "loss": 0.818,
918
- "mean_token_accuracy": 0.7431169748306274,
919
  "num_tokens": 164874.0,
920
  "step": 91
921
  },
922
  {
923
- "entropy": 1.296522632241249,
924
  "epoch": 2.796934865900383,
925
- "grad_norm": 8.3125,
926
  "learning_rate": 6.664079132078881e-07,
927
- "loss": 0.8333,
928
- "mean_token_accuracy": 0.7412382811307907,
929
  "num_tokens": 166614.0,
930
  "step": 92
931
  },
932
  {
933
- "entropy": 1.3617160469293594,
934
  "epoch": 2.8275862068965516,
935
- "grad_norm": 6.25,
936
  "learning_rate": 5.11112605663977e-07,
937
- "loss": 0.9403,
938
- "mean_token_accuracy": 0.728195421397686,
939
  "num_tokens": 168220.0,
940
  "step": 93
941
  },
942
  {
943
- "entropy": 1.3486039191484451,
944
  "epoch": 2.8582375478927204,
945
- "grad_norm": 6.8125,
946
  "learning_rate": 3.760813172726457e-07,
947
- "loss": 0.8643,
948
- "mean_token_accuracy": 0.7479279190301895,
949
  "num_tokens": 169540.0,
950
  "step": 94
951
  },
952
  {
953
- "entropy": 1.2711477279663086,
954
  "epoch": 2.888888888888889,
955
- "grad_norm": 4.65625,
956
  "learning_rate": 2.6150290150067593e-07,
957
- "loss": 0.766,
958
- "mean_token_accuracy": 0.7430502995848656,
959
  "num_tokens": 171709.0,
960
  "step": 95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
961
  }
962
  ],
963
  "logging_steps": 1,
@@ -972,12 +1012,12 @@
972
  "should_evaluate": false,
973
  "should_log": false,
974
  "should_save": true,
975
- "should_training_stop": false
976
  },
977
  "attributes": {}
978
  }
979
  },
980
- "total_flos": 4559405580668928.0,
981
  "train_batch_size": 2,
982
  "trial_name": null,
983
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 99,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12
  {
13
  "entropy": 2.2323372662067413,
14
  "epoch": 0.03065134099616858,
15
+ "grad_norm": 53.25,
16
  "learning_rate": 0.0,
17
  "loss": 2.7706,
18
  "mean_token_accuracy": 0.41634324193000793,
 
22
  {
23
  "entropy": 2.174584299325943,
24
  "epoch": 0.06130268199233716,
25
+ "grad_norm": 36.0,
26
  "learning_rate": 2e-06,
27
  "loss": 2.4332,
28
  "mean_token_accuracy": 0.41893551871180534,
 
30
  "step": 2
31
  },
32
  {
33
+ "entropy": 2.0810845494270325,
34
  "epoch": 0.09195402298850575,
35
+ "grad_norm": 27.75,
36
  "learning_rate": 4e-06,
37
+ "loss": 2.2604,
38
+ "mean_token_accuracy": 0.4491094872355461,
39
  "num_tokens": 5582.0,
40
  "step": 3
41
  },
42
  {
43
+ "entropy": 2.389508530497551,
44
  "epoch": 0.12260536398467432,
45
+ "grad_norm": 28.625,
46
  "learning_rate": 6e-06,
47
+ "loss": 2.224,
48
+ "mean_token_accuracy": 0.47163403779268265,
49
  "num_tokens": 7064.0,
50
  "step": 4
51
  },
52
  {
53
+ "entropy": 2.3899217396974564,
54
  "epoch": 0.1532567049808429,
55
+ "grad_norm": 17.0,
56
  "learning_rate": 8e-06,
57
+ "loss": 1.9894,
58
+ "mean_token_accuracy": 0.4873850643634796,
59
  "num_tokens": 9091.0,
60
  "step": 5
61
  },
62
  {
63
+ "entropy": 2.3988372683525085,
64
  "epoch": 0.1839080459770115,
65
+ "grad_norm": 22.375,
66
  "learning_rate": 9.999999999999999e-06,
67
+ "loss": 2.0726,
68
+ "mean_token_accuracy": 0.5061133019626141,
69
  "num_tokens": 10556.0,
70
  "step": 6
71
  },
72
  {
73
+ "entropy": 2.395625740289688,
74
  "epoch": 0.21455938697318008,
75
+ "grad_norm": 16.75,
76
  "learning_rate": 1.2e-05,
77
+ "loss": 2.0064,
78
+ "mean_token_accuracy": 0.5037284195423126,
79
  "num_tokens": 12215.0,
80
  "step": 7
81
  },
82
  {
83
+ "entropy": 2.2998499274253845,
84
  "epoch": 0.24521072796934865,
85
+ "grad_norm": 14.5625,
86
  "learning_rate": 1.4e-05,
87
+ "loss": 1.7784,
88
+ "mean_token_accuracy": 0.5325785167515278,
89
  "num_tokens": 13939.0,
90
  "step": 8
91
  },
92
  {
93
+ "entropy": 2.233474910259247,
94
  "epoch": 0.27586206896551724,
95
+ "grad_norm": 14.6875,
96
  "learning_rate": 1.6e-05,
97
+ "loss": 1.7552,
98
+ "mean_token_accuracy": 0.5224817767739296,
99
  "num_tokens": 15986.0,
100
  "step": 9
101
  },
102
  {
103
+ "entropy": 2.1560849398374557,
104
  "epoch": 0.3065134099616858,
105
+ "grad_norm": 12.125,
106
  "learning_rate": 1.8e-05,
107
+ "loss": 1.7487,
108
+ "mean_token_accuracy": 0.5436614826321602,
109
  "num_tokens": 18444.0,
110
  "step": 10
111
  },
112
  {
113
+ "entropy": 1.8782547265291214,
114
  "epoch": 0.3371647509578544,
115
+ "grad_norm": 11.1875,
116
  "learning_rate": 1.9999999999999998e-05,
117
+ "loss": 1.5774,
118
+ "mean_token_accuracy": 0.5730905011296272,
119
  "num_tokens": 21127.0,
120
  "step": 11
121
  },
122
  {
123
+ "entropy": 2.0860691219568253,
124
  "epoch": 0.367816091954023,
125
+ "grad_norm": 13.125,
126
  "learning_rate": 2.2e-05,
127
+ "loss": 1.8279,
128
+ "mean_token_accuracy": 0.5077806040644646,
129
  "num_tokens": 23308.0,
130
  "step": 12
131
  },
132
  {
133
+ "entropy": 2.0839987099170685,
134
  "epoch": 0.39846743295019155,
135
+ "grad_norm": 13.5,
136
  "learning_rate": 2.4e-05,
137
+ "loss": 1.8629,
138
+ "mean_token_accuracy": 0.5324465520679951,
139
  "num_tokens": 25072.0,
140
  "step": 13
141
  },
142
  {
143
+ "entropy": 2.211606591939926,
144
  "epoch": 0.42911877394636017,
145
+ "grad_norm": 15.3125,
146
  "learning_rate": 2.6000000000000002e-05,
147
+ "loss": 1.934,
148
+ "mean_token_accuracy": 0.513655960559845,
149
  "num_tokens": 26450.0,
150
  "step": 14
151
  },
152
  {
153
+ "entropy": 2.2505457401275635,
154
  "epoch": 0.45977011494252873,
155
+ "grad_norm": 14.8125,
156
  "learning_rate": 2.8e-05,
157
+ "loss": 1.7603,
158
+ "mean_token_accuracy": 0.5480454824864864,
159
  "num_tokens": 27912.0,
160
  "step": 15
161
  },
162
  {
163
+ "entropy": 2.187108889222145,
164
  "epoch": 0.4904214559386973,
165
+ "grad_norm": 13.125,
166
  "learning_rate": 3e-05,
167
+ "loss": 1.6138,
168
+ "mean_token_accuracy": 0.5843819156289101,
169
  "num_tokens": 29392.0,
170
  "step": 16
171
  },
172
  {
173
+ "entropy": 2.0149056166410446,
174
  "epoch": 0.5210727969348659,
175
+ "grad_norm": 9.9375,
176
  "learning_rate": 2.998951057182598e-05,
177
+ "loss": 1.4549,
178
+ "mean_token_accuracy": 0.597277820110321,
179
  "num_tokens": 31417.0,
180
  "step": 17
181
  },
182
  {
183
+ "entropy": 1.9988498389720917,
184
  "epoch": 0.5517241379310345,
185
+ "grad_norm": 11.0,
186
  "learning_rate": 2.99580569577177e-05,
187
+ "loss": 1.7097,
188
+ "mean_token_accuracy": 0.5442679524421692,
189
  "num_tokens": 33727.0,
190
  "step": 18
191
  },
192
  {
193
+ "entropy": 1.8304037749767303,
194
  "epoch": 0.5823754789272031,
195
+ "grad_norm": 10.125,
196
  "learning_rate": 2.9905683148398642e-05,
197
+ "loss": 1.5381,
198
+ "mean_token_accuracy": 0.5851795524358749,
199
  "num_tokens": 35836.0,
200
  "step": 19
201
  },
202
  {
203
+ "entropy": 1.891087457537651,
204
  "epoch": 0.6130268199233716,
205
+ "grad_norm": 12.625,
206
  "learning_rate": 2.9832462393376926e-05,
207
+ "loss": 1.6876,
208
+ "mean_token_accuracy": 0.5546146482229233,
209
  "num_tokens": 37639.0,
210
  "step": 20
211
  },
212
  {
213
+ "entropy": 1.9664306491613388,
214
  "epoch": 0.6436781609195402,
215
+ "grad_norm": 12.125,
216
  "learning_rate": 2.9738497098499325e-05,
217
+ "loss": 1.7271,
218
+ "mean_token_accuracy": 0.5344564504921436,
219
  "num_tokens": 39351.0,
220
  "step": 21
221
  },
222
  {
223
+ "entropy": 1.7850568294525146,
224
  "epoch": 0.6743295019157088,
225
+ "grad_norm": 13.375,
226
  "learning_rate": 2.9623918682727355e-05,
227
+ "loss": 1.524,
228
+ "mean_token_accuracy": 0.5623632185161114,
229
  "num_tokens": 41024.0,
230
  "step": 22
231
  },
232
  {
233
+ "entropy": 1.898742452263832,
234
  "epoch": 0.7049808429118773,
235
+ "grad_norm": 13.0,
236
  "learning_rate": 2.9488887394336025e-05,
237
+ "loss": 1.732,
238
+ "mean_token_accuracy": 0.5667595192790031,
239
  "num_tokens": 42624.0,
240
  "step": 23
241
  },
242
  {
243
+ "entropy": 2.062256097793579,
244
  "epoch": 0.735632183908046,
245
+ "grad_norm": 15.0625,
246
  "learning_rate": 2.9333592086792113e-05,
247
+ "loss": 1.8659,
248
+ "mean_token_accuracy": 0.5371430143713951,
249
  "num_tokens": 43836.0,
250
  "step": 24
251
  },
252
  {
253
+ "entropy": 1.9839176535606384,
254
  "epoch": 0.7662835249042146,
255
+ "grad_norm": 10.4375,
256
  "learning_rate": 2.9158249954625514e-05,
257
+ "loss": 1.7355,
258
+ "mean_token_accuracy": 0.548308789730072,
259
  "num_tokens": 45870.0,
260
  "step": 25
261
  },
262
  {
263
+ "entropy": 2.005643382668495,
264
  "epoch": 0.7969348659003831,
265
+ "grad_norm": 10.6875,
266
  "learning_rate": 2.8963106229663064e-05,
267
+ "loss": 1.6277,
268
+ "mean_token_accuracy": 0.577509343624115,
269
  "num_tokens": 47664.0,
270
  "step": 26
271
  },
272
  {
273
+ "entropy": 2.015763074159622,
274
  "epoch": 0.8275862068965517,
275
+ "grad_norm": 10.875,
276
  "learning_rate": 2.8748433838049642e-05,
277
+ "loss": 1.6878,
278
+ "mean_token_accuracy": 0.5588897317647934,
279
  "num_tokens": 49646.0,
280
  "step": 27
281
  },
282
  {
283
+ "entropy": 2.0416687428951263,
284
  "epoch": 0.8582375478927203,
285
+ "grad_norm": 13.0,
286
  "learning_rate": 2.8514533018536286e-05,
287
+ "loss": 1.5327,
288
+ "mean_token_accuracy": 0.5883619785308838,
289
  "num_tokens": 51235.0,
290
  "step": 28
291
  },
292
  {
293
+ "entropy": 2.029404863715172,
294
  "epoch": 0.8888888888888888,
295
+ "grad_norm": 10.8125,
296
  "learning_rate": 2.8261730902569146e-05,
297
+ "loss": 1.6362,
298
+ "mean_token_accuracy": 0.5863424465060234,
299
  "num_tokens": 53037.0,
300
  "step": 29
301
  },
302
  {
303
+ "entropy": 2.0645615607500076,
304
  "epoch": 0.9195402298850575,
305
+ "grad_norm": 10.0625,
306
  "learning_rate": 2.7990381056766583e-05,
307
+ "loss": 1.6623,
308
+ "mean_token_accuracy": 0.5610311627388,
309
  "num_tokens": 54826.0,
310
  "step": 30
311
  },
312
  {
313
+ "entropy": 2.090387746691704,
314
  "epoch": 0.9501915708812261,
315
+ "grad_norm": 12.0,
316
  "learning_rate": 2.770086298842426e-05,
317
+ "loss": 1.6578,
318
+ "mean_token_accuracy": 0.5568758621811867,
319
  "num_tokens": 56737.0,
320
  "step": 31
321
  },
322
  {
323
+ "entropy": 2.0354464948177338,
324
  "epoch": 0.9808429118773946,
325
+ "grad_norm": 12.5625,
326
  "learning_rate": 2.7393581614739924e-05,
327
+ "loss": 1.6745,
328
+ "mean_token_accuracy": 0.5604493953287601,
329
  "num_tokens": 58084.0,
330
  "step": 32
331
  },
332
  {
333
+ "entropy": 1.7894673347473145,
334
  "epoch": 1.0,
335
+ "grad_norm": 12.4375,
336
  "learning_rate": 2.7068966696500025e-05,
337
+ "loss": 1.6188,
338
+ "mean_token_accuracy": 0.5824247837066651,
339
  "num_tokens": 59142.0,
340
  "step": 33
341
  },
342
  {
343
+ "entropy": 1.63651242852211,
344
  "epoch": 1.0306513409961686,
345
+ "grad_norm": 8.0625,
346
  "learning_rate": 2.672747223702045e-05,
347
+ "loss": 0.9761,
348
+ "mean_token_accuracy": 0.7217265591025352,
349
  "num_tokens": 60897.0,
350
  "step": 34
351
  },
352
  {
353
+ "entropy": 1.7347675114870071,
354
  "epoch": 1.0613026819923372,
355
+ "grad_norm": 9.3125,
356
  "learning_rate": 2.6369575847181795e-05,
357
+ "loss": 1.1561,
358
+ "mean_token_accuracy": 0.7075180560350418,
359
  "num_tokens": 62325.0,
360
  "step": 35
361
  },
362
  {
363
+ "entropy": 1.5030861496925354,
364
  "epoch": 1.0919540229885056,
365
+ "grad_norm": 7.65625,
366
  "learning_rate": 2.5995778077447393e-05,
367
+ "loss": 0.8402,
368
+ "mean_token_accuracy": 0.7322944924235344,
369
  "num_tokens": 64163.0,
370
  "step": 36
371
  },
372
  {
373
+ "entropy": 1.3862270265817642,
374
  "epoch": 1.1226053639846743,
375
+ "grad_norm": 8.5625,
376
  "learning_rate": 2.5606601717798212e-05,
377
+ "loss": 0.9429,
378
+ "mean_token_accuracy": 0.7389034852385521,
379
  "num_tokens": 66168.0,
380
  "step": 37
381
  },
382
  {
383
+ "entropy": 1.3857311755418777,
384
  "epoch": 1.1532567049808429,
385
+ "grad_norm": 7.65625,
386
  "learning_rate": 2.520259106656379e-05,
387
+ "loss": 0.8564,
388
+ "mean_token_accuracy": 0.7321354225277901,
389
  "num_tokens": 68398.0,
390
  "step": 38
391
  },
392
  {
393
+ "entropy": 1.2590633258223534,
394
  "epoch": 1.1839080459770115,
395
+ "grad_norm": 9.75,
396
  "learning_rate": 2.4784311169171818e-05,
397
+ "loss": 0.9376,
398
+ "mean_token_accuracy": 0.7156714797019958,
399
  "num_tokens": 70548.0,
400
  "step": 39
401
  },
402
  {
403
+ "entropy": 1.2306247800588608,
404
  "epoch": 1.21455938697318,
405
+ "grad_norm": 10.9375,
406
  "learning_rate": 2.4352347027881003e-05,
407
+ "loss": 0.8899,
408
+ "mean_token_accuracy": 0.756280928850174,
409
  "num_tokens": 72463.0,
410
  "step": 40
411
  },
412
  {
413
+ "entropy": 1.110754244029522,
414
  "epoch": 1.2452107279693487,
415
+ "grad_norm": 12.125,
416
  "learning_rate": 2.3907302783602522e-05,
417
+ "loss": 0.7503,
418
+ "mean_token_accuracy": 0.7652318105101585,
419
  "num_tokens": 74061.0,
420
  "step": 41
421
  },
422
  {
423
+ "entropy": 1.1396447345614433,
424
  "epoch": 1.2758620689655173,
425
+ "grad_norm": 10.375,
426
  "learning_rate": 2.344980087095433e-05,
427
+ "loss": 0.774,
428
+ "mean_token_accuracy": 0.7681270688772202,
429
  "num_tokens": 76130.0,
430
  "step": 42
431
  },
432
  {
433
+ "entropy": 1.0957090184092522,
434
  "epoch": 1.3065134099616857,
435
+ "grad_norm": 12.4375,
436
  "learning_rate": 2.298048114773005e-05,
437
+ "loss": 0.7757,
438
+ "mean_token_accuracy": 0.767442375421524,
439
  "num_tokens": 77912.0,
440
  "step": 43
441
  },
442
  {
443
+ "entropy": 1.0323160290718079,
444
  "epoch": 1.3371647509578544,
445
+ "grad_norm": 10.625,
446
  "learning_rate": 2.25e-05,
447
+ "loss": 0.7192,
448
+ "mean_token_accuracy": 0.771703340113163,
449
  "num_tokens": 79873.0,
450
  "step": 44
451
  },
452
  {
453
+ "entropy": 1.1174012199044228,
454
  "epoch": 1.367816091954023,
455
+ "grad_norm": 13.1875,
456
  "learning_rate": 2.200902942409593e-05,
457
+ "loss": 0.7571,
458
+ "mean_token_accuracy": 0.7688822597265244,
459
  "num_tokens": 81708.0,
460
  "step": 45
461
  },
462
  {
463
+ "entropy": 1.133009672164917,
464
  "epoch": 1.3984674329501916,
465
+ "grad_norm": 11.4375,
466
  "learning_rate": 2.1508256086763372e-05,
467
+ "loss": 0.8328,
468
+ "mean_token_accuracy": 0.7457190081477165,
469
  "num_tokens": 83479.0,
470
  "step": 46
471
  },
472
  {
473
+ "entropy": 1.0821977257728577,
474
  "epoch": 1.4291187739463602,
475
+ "grad_norm": 12.25,
476
  "learning_rate": 2.0998380364796112e-05,
477
+ "loss": 0.8791,
478
+ "mean_token_accuracy": 0.7517153918743134,
479
  "num_tokens": 85091.0,
480
  "step": 47
481
  },
482
  {
483
+ "entropy": 1.160033829510212,
484
  "epoch": 1.4597701149425286,
485
+ "grad_norm": 10.25,
486
  "learning_rate": 2.0480115365495928e-05,
487
+ "loss": 0.7528,
488
+ "mean_token_accuracy": 0.7454545870423317,
489
  "num_tokens": 87067.0,
490
  "step": 48
491
  },
492
  {
493
+ "entropy": 1.09547870606184,
494
  "epoch": 1.4904214559386972,
495
+ "grad_norm": 8.1875,
496
  "learning_rate": 1.995418592932751e-05,
497
+ "loss": 0.6824,
498
+ "mean_token_accuracy": 0.8004695847630501,
499
  "num_tokens": 89257.0,
500
  "step": 49
501
  },
502
  {
503
+ "entropy": 1.1644561365246773,
504
  "epoch": 1.5210727969348659,
505
+ "grad_norm": 10.125,
506
  "learning_rate": 1.9421327616163564e-05,
507
+ "loss": 0.8229,
508
+ "mean_token_accuracy": 0.744444377720356,
509
  "num_tokens": 91129.0,
510
  "step": 50
511
  },
512
  {
513
+ "entropy": 1.1956558972597122,
514
  "epoch": 1.5517241379310345,
515
+ "grad_norm": 9.1875,
516
  "learning_rate": 1.888228567653781e-05,
517
+ "loss": 0.807,
518
+ "mean_token_accuracy": 0.7377020716667175,
519
  "num_tokens": 93217.0,
520
  "step": 51
521
  },
522
  {
523
+ "entropy": 1.2180762365460396,
524
  "epoch": 1.582375478927203,
525
+ "grad_norm": 9.125,
526
  "learning_rate": 1.8337814009344716e-05,
527
+ "loss": 0.6652,
528
+ "mean_token_accuracy": 0.7918966636061668,
529
  "num_tokens": 94882.0,
530
  "step": 52
531
  },
532
  {
533
+ "entropy": 1.2762009352445602,
534
  "epoch": 1.6130268199233715,
535
+ "grad_norm": 11.625,
536
  "learning_rate": 1.778867410744372e-05,
537
+ "loss": 0.8152,
538
+ "mean_token_accuracy": 0.7556928023695946,
539
  "num_tokens": 96226.0,
540
  "step": 53
541
  },
542
  {
543
+ "entropy": 1.2115763127803802,
544
  "epoch": 1.6436781609195403,
545
+ "grad_norm": 10.8125,
546
  "learning_rate": 1.7235633992642615e-05,
547
+ "loss": 0.7119,
548
+ "mean_token_accuracy": 0.7653274685144424,
549
  "num_tokens": 98064.0,
550
  "step": 54
551
  },
552
  {
553
+ "entropy": 1.301737241446972,
554
  "epoch": 1.6743295019157087,
555
+ "grad_norm": 8.75,
556
  "learning_rate": 1.667946714154962e-05,
557
+ "loss": 0.7362,
558
+ "mean_token_accuracy": 0.7743538916110992,
559
  "num_tokens": 99875.0,
560
  "step": 55
561
  },
562
  {
563
+ "entropy": 1.1645233482122421,
564
  "epoch": 1.7049808429118773,
565
+ "grad_norm": 8.125,
566
  "learning_rate": 1.6120951403796367e-05,
567
+ "loss": 0.7929,
568
+ "mean_token_accuracy": 0.7437388524413109,
569
  "num_tokens": 102303.0,
570
  "step": 56
571
  },
572
  {
573
+ "entropy": 1.2387544885277748,
574
  "epoch": 1.735632183908046,
575
+ "grad_norm": 10.125,
576
  "learning_rate": 1.5560867914144887e-05,
577
+ "loss": 0.7757,
578
+ "mean_token_accuracy": 0.760113924741745,
579
  "num_tokens": 103806.0,
580
  "step": 57
581
  },
582
  {
583
+ "entropy": 1.2401599884033203,
584
  "epoch": 1.7662835249042146,
585
+ "grad_norm": 12.25,
586
  "learning_rate": 1.5e-05,
587
+ "loss": 0.757,
588
+ "mean_token_accuracy": 0.7870561257004738,
589
  "num_tokens": 105012.0,
590
  "step": 58
591
  },
592
  {
593
+ "entropy": 1.3122059255838394,
594
  "epoch": 1.7969348659003832,
595
+ "grad_norm": 11.25,
596
  "learning_rate": 1.4439132085855117e-05,
597
+ "loss": 0.8231,
598
+ "mean_token_accuracy": 0.7717632800340652,
599
  "num_tokens": 106373.0,
600
  "step": 59
601
  },
602
  {
603
+ "entropy": 1.224107950925827,
604
  "epoch": 1.8275862068965516,
605
+ "grad_norm": 9.3125,
606
  "learning_rate": 1.3879048596203637e-05,
607
+ "loss": 0.6616,
608
+ "mean_token_accuracy": 0.8022700250148773,
609
  "num_tokens": 107938.0,
610
  "step": 60
611
  },
612
  {
613
+ "entropy": 1.2059504985809326,
614
  "epoch": 1.8582375478927204,
615
+ "grad_norm": 9.625,
616
  "learning_rate": 1.3320532858450382e-05,
617
+ "loss": 0.7585,
618
+ "mean_token_accuracy": 0.7686295211315155,
619
  "num_tokens": 109587.0,
620
  "step": 61
621
  },
622
  {
623
+ "entropy": 1.2734860181808472,
624
  "epoch": 1.8888888888888888,
625
+ "grad_norm": 12.4375,
626
  "learning_rate": 1.2764366007357382e-05,
627
+ "loss": 1.055,
628
+ "mean_token_accuracy": 0.707017719745636,
629
  "num_tokens": 111253.0,
630
  "step": 62
631
  },
632
  {
633
+ "entropy": 1.1893908977508545,
634
  "epoch": 1.9195402298850575,
635
+ "grad_norm": 11.1875,
636
  "learning_rate": 1.2211325892556282e-05,
637
+ "loss": 0.7912,
638
+ "mean_token_accuracy": 0.7822966873645782,
639
  "num_tokens": 112833.0,
640
  "step": 63
641
  },
642
  {
643
+ "entropy": 1.1533539071679115,
644
  "epoch": 1.950191570881226,
645
+ "grad_norm": 11.125,
646
  "learning_rate": 1.1662185990655285e-05,
647
+ "loss": 0.8553,
648
+ "mean_token_accuracy": 0.7498924359679222,
649
  "num_tokens": 114573.0,
650
  "step": 64
651
  },
652
  {
653
+ "entropy": 1.1270944774150848,
654
  "epoch": 1.9808429118773945,
655
+ "grad_norm": 8.25,
656
  "learning_rate": 1.1117714323462188e-05,
657
+ "loss": 0.7116,
658
+ "mean_token_accuracy": 0.7686784416437149,
659
  "num_tokens": 116981.0,
660
  "step": 65
661
  },
662
  {
663
+ "entropy": 1.2168401956558228,
664
  "epoch": 2.0,
665
+ "grad_norm": 18.375,
666
  "learning_rate": 1.0578672383836437e-05,
667
+ "loss": 1.1399,
668
+ "mean_token_accuracy": 0.6772964239120484,
669
  "num_tokens": 118284.0,
670
  "step": 66
671
  },
672
  {
673
+ "entropy": 1.0973209738731384,
674
  "epoch": 2.0306513409961684,
675
+ "grad_norm": 7.8125,
676
  "learning_rate": 1.0045814070672498e-05,
677
+ "loss": 0.3245,
678
+ "mean_token_accuracy": 0.9032263904809952,
679
  "num_tokens": 119663.0,
680
  "step": 67
681
  },
682
  {
683
+ "entropy": 1.053741380572319,
684
  "epoch": 2.0613026819923372,
685
+ "grad_norm": 6.0,
686
  "learning_rate": 9.519884634504074e-06,
687
+ "loss": 0.2808,
688
+ "mean_token_accuracy": 0.9356953203678131,
689
  "num_tokens": 121476.0,
690
  "step": 68
691
  },
692
  {
693
+ "entropy": 0.9946238845586777,
694
  "epoch": 2.0919540229885056,
695
+ "grad_norm": 5.375,
696
  "learning_rate": 9.001619635203889e-06,
697
+ "loss": 0.2809,
698
+ "mean_token_accuracy": 0.9175683632493019,
699
  "num_tokens": 123792.0,
700
  "step": 69
701
  },
702
  {
703
+ "entropy": 1.015475258231163,
704
  "epoch": 2.1226053639846745,
705
+ "grad_norm": 6.65625,
706
  "learning_rate": 8.491743913236629e-06,
707
+ "loss": 0.2802,
708
+ "mean_token_accuracy": 0.9311554208397865,
709
  "num_tokens": 125329.0,
710
  "step": 70
711
  },
712
  {
713
+ "entropy": 0.9921716600656509,
714
  "epoch": 2.153256704980843,
715
+ "grad_norm": 6.78125,
716
  "learning_rate": 7.99097057590407e-06,
717
+ "loss": 0.2807,
718
+ "mean_token_accuracy": 0.9192091822624207,
719
  "num_tokens": 126654.0,
720
  "step": 71
721
  },
722
  {
723
+ "entropy": 0.8778632581233978,
724
  "epoch": 2.1839080459770113,
725
+ "grad_norm": 6.09375,
726
  "learning_rate": 7.500000000000004e-06,
727
+ "loss": 0.2776,
728
+ "mean_token_accuracy": 0.9309542253613472,
729
  "num_tokens": 128629.0,
730
  "step": 72
731
  },
732
  {
733
+ "entropy": 0.953188918530941,
734
  "epoch": 2.21455938697318,
735
+ "grad_norm": 8.6875,
736
  "learning_rate": 7.019518852269953e-06,
737
+ "loss": 0.4596,
738
+ "mean_token_accuracy": 0.8634384647011757,
739
  "num_tokens": 130344.0,
740
  "step": 73
741
  },
742
  {
743
+ "entropy": 0.8518025800585747,
744
  "epoch": 2.2452107279693485,
745
+ "grad_norm": 7.46875,
746
  "learning_rate": 6.55019912904567e-06,
747
+ "loss": 0.3006,
748
+ "mean_token_accuracy": 0.9241785854101181,
749
  "num_tokens": 132152.0,
750
  "step": 74
751
  },
752
  {
753
+ "entropy": 0.8467591479420662,
754
  "epoch": 2.2758620689655173,
755
+ "grad_norm": 6.40625,
756
  "learning_rate": 6.092697216397478e-06,
757
+ "loss": 0.2682,
758
+ "mean_token_accuracy": 0.9179906323552132,
759
  "num_tokens": 134144.0,
760
  "step": 75
761
  },
762
  {
763
+ "entropy": 0.7837551906704903,
764
  "epoch": 2.3065134099616857,
765
+ "grad_norm": 7.25,
766
  "learning_rate": 5.647652972118998e-06,
767
+ "loss": 0.3422,
768
+ "mean_token_accuracy": 0.8964523077011108,
769
  "num_tokens": 136715.0,
770
  "step": 76
771
  },
772
  {
773
+ "entropy": 0.7817510291934013,
774
  "epoch": 2.3371647509578546,
775
+ "grad_norm": 7.25,
776
  "learning_rate": 5.2156888308281875e-06,
777
+ "loss": 0.2678,
778
+ "mean_token_accuracy": 0.9292137995362282,
779
  "num_tokens": 138907.0,
780
  "step": 77
781
  },
782
  {
783
+ "entropy": 0.7645558379590511,
784
  "epoch": 2.367816091954023,
785
+ "grad_norm": 7.6875,
786
  "learning_rate": 4.797408933436207e-06,
787
+ "loss": 0.2069,
788
+ "mean_token_accuracy": 0.9325998574495316,
789
  "num_tokens": 140536.0,
790
  "step": 78
791
  },
792
  {
793
+ "entropy": 0.756471686065197,
794
  "epoch": 2.3984674329501914,
795
+ "grad_norm": 8.6875,
796
  "learning_rate": 4.393398282201788e-06,
797
+ "loss": 0.2288,
798
+ "mean_token_accuracy": 0.924439363181591,
799
  "num_tokens": 142205.0,
800
  "step": 79
801
  },
802
  {
803
+ "entropy": 0.7203860953450203,
804
  "epoch": 2.42911877394636,
805
+ "grad_norm": 8.75,
806
  "learning_rate": 4.004221922552608e-06,
807
+ "loss": 0.3023,
808
+ "mean_token_accuracy": 0.9196523949503899,
809
  "num_tokens": 143937.0,
810
  "step": 80
811
  },
812
  {
813
+ "entropy": 0.7062718719244003,
814
  "epoch": 2.4597701149425286,
815
+ "grad_norm": 8.3125,
816
  "learning_rate": 3.630424152818203e-06,
817
+ "loss": 0.242,
818
+ "mean_token_accuracy": 0.9289174377918243,
819
  "num_tokens": 145867.0,
820
  "step": 81
821
  },
822
  {
823
+ "entropy": 0.7174801900982857,
824
  "epoch": 2.4904214559386975,
825
+ "grad_norm": 10.0625,
826
  "learning_rate": 3.272527762979553e-06,
827
+ "loss": 0.3277,
828
+ "mean_token_accuracy": 0.9081463739275932,
829
  "num_tokens": 147522.0,
830
  "step": 82
831
  },
832
  {
833
+ "entropy": 0.7576407790184021,
834
  "epoch": 2.521072796934866,
835
+ "grad_norm": 10.5,
836
  "learning_rate": 2.931033303499975e-06,
837
+ "loss": 0.2869,
838
+ "mean_token_accuracy": 0.9234072640538216,
839
  "num_tokens": 149154.0,
840
  "step": 83
841
  },
842
  {
843
+ "entropy": 0.6603295132517815,
844
  "epoch": 2.5517241379310347,
845
+ "grad_norm": 8.5,
846
  "learning_rate": 2.60641838526008e-06,
847
+ "loss": 0.2954,
848
+ "mean_token_accuracy": 0.9192768260836601,
849
  "num_tokens": 151443.0,
850
  "step": 84
851
  },
852
  {
853
+ "entropy": 0.7209493666887283,
854
  "epoch": 2.582375478927203,
855
+ "grad_norm": 7.625,
856
  "learning_rate": 2.2991370115757383e-06,
857
+ "loss": 0.2553,
858
+ "mean_token_accuracy": 0.9288515150547028,
859
  "num_tokens": 153346.0,
860
  "step": 85
861
  },
862
  {
863
+ "entropy": 0.7502265051007271,
864
  "epoch": 2.6130268199233715,
865
+ "grad_norm": 10.0625,
866
  "learning_rate": 2.0096189432334194e-06,
867
+ "loss": 0.2759,
868
+ "mean_token_accuracy": 0.9101333618164062,
869
  "num_tokens": 155041.0,
870
  "step": 86
871
  },
872
  {
873
+ "entropy": 0.6479271687567234,
874
  "epoch": 2.6436781609195403,
875
+ "grad_norm": 7.65625,
876
  "learning_rate": 1.7382690974308551e-06,
877
+ "loss": 0.1765,
878
+ "mean_token_accuracy": 0.9528548792004585,
879
  "num_tokens": 156508.0,
880
  "step": 87
881
  },
882
  {
883
+ "entropy": 0.686508409678936,
884
  "epoch": 2.6743295019157087,
885
+ "grad_norm": 6.5625,
886
  "learning_rate": 1.4854669814637145e-06,
887
+ "loss": 0.1907,
888
+ "mean_token_accuracy": 0.9471124485135078,
889
  "num_tokens": 158506.0,
890
  "step": 88
891
  },
892
  {
893
+ "entropy": 0.6940162889659405,
894
  "epoch": 2.704980842911877,
895
+ "grad_norm": 7.0625,
896
  "learning_rate": 1.2515661619503572e-06,
897
+ "loss": 0.2139,
898
+ "mean_token_accuracy": 0.9348281025886536,
899
  "num_tokens": 160511.0,
900
  "step": 89
901
  },
902
  {
903
+ "entropy": 0.7100252062082291,
904
  "epoch": 2.735632183908046,
905
+ "grad_norm": 9.0625,
906
  "learning_rate": 1.036893770336938e-06,
907
+ "loss": 0.2846,
908
+ "mean_token_accuracy": 0.9120082557201385,
909
  "num_tokens": 162548.0,
910
  "step": 90
911
  },
912
  {
913
+ "entropy": 0.689895510673523,
914
  "epoch": 2.766283524904215,
915
+ "grad_norm": 7.59375,
916
  "learning_rate": 8.417500453744864e-07,
917
+ "loss": 0.2794,
918
+ "mean_token_accuracy": 0.9187788665294647,
919
  "num_tokens": 164874.0,
920
  "step": 91
921
  },
922
  {
923
+ "entropy": 0.6664801873266697,
924
  "epoch": 2.796934865900383,
925
+ "grad_norm": 7.96875,
926
  "learning_rate": 6.664079132078881e-07,
927
+ "loss": 0.199,
928
+ "mean_token_accuracy": 0.94305020570755,
929
  "num_tokens": 166614.0,
930
  "step": 92
931
  },
932
  {
933
+ "entropy": 0.7356143966317177,
934
  "epoch": 2.8275862068965516,
935
+ "grad_norm": 29.25,
936
  "learning_rate": 5.11112605663977e-07,
937
+ "loss": 0.3566,
938
+ "mean_token_accuracy": 0.8869450762867928,
939
  "num_tokens": 168220.0,
940
  "step": 93
941
  },
942
  {
943
+ "entropy": 0.7260653525590897,
944
  "epoch": 2.8582375478927204,
945
+ "grad_norm": 12.0625,
946
  "learning_rate": 3.760813172726457e-07,
947
+ "loss": 0.2395,
948
+ "mean_token_accuracy": 0.9347701147198677,
949
  "num_tokens": 169540.0,
950
  "step": 94
951
  },
952
  {
953
+ "entropy": 0.6620675958693027,
954
  "epoch": 2.888888888888889,
955
+ "grad_norm": 7.3125,
956
  "learning_rate": 2.6150290150067593e-07,
957
+ "loss": 0.2358,
958
+ "mean_token_accuracy": 0.9333521574735641,
959
  "num_tokens": 171709.0,
960
  "step": 95
961
+ },
962
+ {
963
+ "entropy": 0.6657432429492474,
964
+ "epoch": 2.9195402298850572,
965
+ "grad_norm": 9.375,
966
+ "learning_rate": 1.6753760662307217e-07,
967
+ "loss": 0.2499,
968
+ "mean_token_accuracy": 0.9248412474989891,
969
+ "num_tokens": 173432.0,
970
+ "step": 96
971
+ },
972
+ {
973
+ "entropy": 0.6610175892710686,
974
+ "epoch": 2.950191570881226,
975
+ "grad_norm": 10.3125,
976
+ "learning_rate": 9.431685160136094e-08,
977
+ "loss": 0.2274,
978
+ "mean_token_accuracy": 0.9352346211671829,
979
+ "num_tokens": 174962.0,
980
+ "step": 97
981
+ },
982
+ {
983
+ "entropy": 0.6855079308152199,
984
+ "epoch": 2.9808429118773945,
985
+ "grad_norm": 9.4375,
986
+ "learning_rate": 4.194304228229806e-08,
987
+ "loss": 0.2806,
988
+ "mean_token_accuracy": 0.9201195910573006,
989
+ "num_tokens": 176611.0,
990
+ "step": 98
991
+ },
992
+ {
993
+ "entropy": 0.6942157626152039,
994
+ "epoch": 3.0,
995
+ "grad_norm": 11.0625,
996
+ "learning_rate": 1.0489428174020877e-08,
997
+ "loss": 0.1556,
998
+ "mean_token_accuracy": 0.9565272331237793,
999
+ "num_tokens": 177426.0,
1000
+ "step": 99
1001
  }
1002
  ],
1003
  "logging_steps": 1,
 
1012
  "should_evaluate": false,
1013
  "should_log": false,
1014
  "should_save": true,
1015
+ "should_training_stop": true
1016
  },
1017
  "attributes": {}
1018
  }
1019
  },
1020
+ "total_flos": 4854658109841408.0,
1021
  "train_batch_size": 2,
1022
  "trial_name": null,
1023
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcb900edd003d30a9496375af3b00e493c234370e0d58063b9ebd80fb0228298
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dc7c8092aa2b8ebf234fc84d3e707b2126e3e231f1ae373dfe72c25a33e317
3
  size 6353