my-sft-stage1 / trainer_state.json
haebo1's picture
Upload folder using huggingface_hub
a5d3bbd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 10026,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 3.0024948120117188,
"epoch": 0.005984440454817474,
"grad_norm": 7.78125,
"learning_rate": 5.994614003590664e-05,
"loss": 1.692,
"mean_token_accuracy": 0.7139140546321869,
"num_tokens": 10416.0,
"step": 10
},
{
"entropy": 2.6948025226593018,
"epoch": 0.011968880909634948,
"grad_norm": 9.0,
"learning_rate": 5.9886295631358466e-05,
"loss": 0.4662,
"mean_token_accuracy": 0.8840459525585175,
"num_tokens": 20947.0,
"step": 20
},
{
"entropy": 2.547980785369873,
"epoch": 0.017953321364452424,
"grad_norm": 8.1875,
"learning_rate": 5.98264512268103e-05,
"loss": 0.287,
"mean_token_accuracy": 0.9282953619956971,
"num_tokens": 31370.0,
"step": 30
},
{
"entropy": 2.285873460769653,
"epoch": 0.023937761819269897,
"grad_norm": 4.53125,
"learning_rate": 5.976660682226212e-05,
"loss": 0.1783,
"mean_token_accuracy": 0.9513040065765381,
"num_tokens": 41996.0,
"step": 40
},
{
"entropy": 2.4315436601638796,
"epoch": 0.029922202274087373,
"grad_norm": 6.375,
"learning_rate": 5.9706762417713944e-05,
"loss": 0.183,
"mean_token_accuracy": 0.9531135976314544,
"num_tokens": 52464.0,
"step": 50
},
{
"entropy": 2.605304408073425,
"epoch": 0.03590664272890485,
"grad_norm": 4.75,
"learning_rate": 5.9646918013165774e-05,
"loss": 0.1501,
"mean_token_accuracy": 0.9575372993946075,
"num_tokens": 63125.0,
"step": 60
},
{
"entropy": 2.5587785959243776,
"epoch": 0.041891083183722325,
"grad_norm": 5.75,
"learning_rate": 5.95870736086176e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9575134456157685,
"num_tokens": 73701.0,
"step": 70
},
{
"entropy": 2.4741239070892336,
"epoch": 0.047875523638539794,
"grad_norm": 4.1875,
"learning_rate": 5.9527229204069415e-05,
"loss": 0.1891,
"mean_token_accuracy": 0.9568629384040832,
"num_tokens": 84353.0,
"step": 80
},
{
"entropy": 2.935908842086792,
"epoch": 0.05385996409335727,
"grad_norm": 4.0,
"learning_rate": 5.9467384799521245e-05,
"loss": 0.1554,
"mean_token_accuracy": 0.9589068710803985,
"num_tokens": 94954.0,
"step": 90
},
{
"entropy": 2.6647072076797484,
"epoch": 0.059844404548174746,
"grad_norm": 8.6875,
"learning_rate": 5.940754039497307e-05,
"loss": 0.1459,
"mean_token_accuracy": 0.962292617559433,
"num_tokens": 105675.0,
"step": 100
},
{
"entropy": 2.3634035110473635,
"epoch": 0.06582884500299221,
"grad_norm": 3.65625,
"learning_rate": 5.93476959904249e-05,
"loss": 0.1093,
"mean_token_accuracy": 0.9729574143886566,
"num_tokens": 116315.0,
"step": 110
},
{
"entropy": 2.365880823135376,
"epoch": 0.0718132854578097,
"grad_norm": 3.109375,
"learning_rate": 5.928785158587672e-05,
"loss": 0.0802,
"mean_token_accuracy": 0.9765478909015656,
"num_tokens": 126940.0,
"step": 120
},
{
"entropy": 2.392621302604675,
"epoch": 0.07779772591262717,
"grad_norm": 4.0625,
"learning_rate": 5.9228007181328546e-05,
"loss": 0.0809,
"mean_token_accuracy": 0.9781265318393707,
"num_tokens": 137513.0,
"step": 130
},
{
"entropy": 2.3370444774627686,
"epoch": 0.08378216636744465,
"grad_norm": 2.78125,
"learning_rate": 5.916816277678038e-05,
"loss": 0.1093,
"mean_token_accuracy": 0.9741817772388458,
"num_tokens": 148032.0,
"step": 140
},
{
"entropy": 2.2481669664382933,
"epoch": 0.08976660682226212,
"grad_norm": 5.71875,
"learning_rate": 5.91083183722322e-05,
"loss": 0.0974,
"mean_token_accuracy": 0.9788482308387756,
"num_tokens": 158653.0,
"step": 150
},
{
"entropy": 2.246716928482056,
"epoch": 0.09575104727707959,
"grad_norm": 3.015625,
"learning_rate": 5.904847396768402e-05,
"loss": 0.1056,
"mean_token_accuracy": 0.9712280333042145,
"num_tokens": 169103.0,
"step": 160
},
{
"entropy": 2.401676082611084,
"epoch": 0.10173548773189707,
"grad_norm": 3.8125,
"learning_rate": 5.898862956313585e-05,
"loss": 0.088,
"mean_token_accuracy": 0.978470116853714,
"num_tokens": 179647.0,
"step": 170
},
{
"entropy": 2.4804759502410887,
"epoch": 0.10771992818671454,
"grad_norm": 6.9375,
"learning_rate": 5.892878515858767e-05,
"loss": 0.0924,
"mean_token_accuracy": 0.9786226749420166,
"num_tokens": 190046.0,
"step": 180
},
{
"entropy": 2.4066271781921387,
"epoch": 0.11370436864153202,
"grad_norm": 3.71875,
"learning_rate": 5.88689407540395e-05,
"loss": 0.092,
"mean_token_accuracy": 0.9766816020011901,
"num_tokens": 200450.0,
"step": 190
},
{
"entropy": 2.479763078689575,
"epoch": 0.11968880909634949,
"grad_norm": 5.65625,
"learning_rate": 5.8809096349491325e-05,
"loss": 0.0814,
"mean_token_accuracy": 0.9800179958343506,
"num_tokens": 210921.0,
"step": 200
},
{
"entropy": 2.5948761463165284,
"epoch": 0.12567324955116696,
"grad_norm": 2.53125,
"learning_rate": 5.874925194494315e-05,
"loss": 0.0965,
"mean_token_accuracy": 0.9757415950298309,
"num_tokens": 221246.0,
"step": 210
},
{
"entropy": 2.6720625877380373,
"epoch": 0.13165769000598443,
"grad_norm": 3.265625,
"learning_rate": 5.868940754039498e-05,
"loss": 0.0904,
"mean_token_accuracy": 0.9789108991622925,
"num_tokens": 231857.0,
"step": 220
},
{
"entropy": 2.39876389503479,
"epoch": 0.13764213046080193,
"grad_norm": 4.28125,
"learning_rate": 5.86295631358468e-05,
"loss": 0.0519,
"mean_token_accuracy": 0.9840085744857788,
"num_tokens": 242592.0,
"step": 230
},
{
"entropy": 2.083107316493988,
"epoch": 0.1436265709156194,
"grad_norm": 0.7734375,
"learning_rate": 5.856971873129862e-05,
"loss": 0.0667,
"mean_token_accuracy": 0.9842009723186493,
"num_tokens": 253100.0,
"step": 240
},
{
"entropy": 2.1161860466003417,
"epoch": 0.14961101137043686,
"grad_norm": 4.34375,
"learning_rate": 5.850987432675045e-05,
"loss": 0.0655,
"mean_token_accuracy": 0.9847101211547852,
"num_tokens": 263645.0,
"step": 250
},
{
"entropy": 2.261358118057251,
"epoch": 0.15559545182525433,
"grad_norm": 1.5703125,
"learning_rate": 5.845002992220227e-05,
"loss": 0.0641,
"mean_token_accuracy": 0.9861262500286102,
"num_tokens": 274021.0,
"step": 260
},
{
"entropy": 2.2319637894630433,
"epoch": 0.1615798922800718,
"grad_norm": 4.15625,
"learning_rate": 5.83901855176541e-05,
"loss": 0.0549,
"mean_token_accuracy": 0.9827041685581207,
"num_tokens": 284428.0,
"step": 270
},
{
"entropy": 2.225303065776825,
"epoch": 0.1675643327348893,
"grad_norm": 3.546875,
"learning_rate": 5.833034111310593e-05,
"loss": 0.0712,
"mean_token_accuracy": 0.9861607074737548,
"num_tokens": 295043.0,
"step": 280
},
{
"entropy": 2.5183464765548704,
"epoch": 0.17354877318970677,
"grad_norm": 5.59375,
"learning_rate": 5.827049670855775e-05,
"loss": 0.0594,
"mean_token_accuracy": 0.9836657404899597,
"num_tokens": 305618.0,
"step": 290
},
{
"entropy": 2.3645459413528442,
"epoch": 0.17953321364452424,
"grad_norm": 2.5,
"learning_rate": 5.821065230400958e-05,
"loss": 0.0537,
"mean_token_accuracy": 0.9861385583877563,
"num_tokens": 316134.0,
"step": 300
},
{
"entropy": 2.3220778465270997,
"epoch": 0.1855176540993417,
"grad_norm": 0.70703125,
"learning_rate": 5.8150807899461405e-05,
"loss": 0.0412,
"mean_token_accuracy": 0.9872022569179535,
"num_tokens": 326594.0,
"step": 310
},
{
"entropy": 2.41537606716156,
"epoch": 0.19150209455415917,
"grad_norm": 2.84375,
"learning_rate": 5.809096349491323e-05,
"loss": 0.0741,
"mean_token_accuracy": 0.9834691464900971,
"num_tokens": 337278.0,
"step": 320
},
{
"entropy": 2.437514638900757,
"epoch": 0.19748653500897667,
"grad_norm": 6.1875,
"learning_rate": 5.803111909036505e-05,
"loss": 0.077,
"mean_token_accuracy": 0.9830995500087738,
"num_tokens": 347783.0,
"step": 330
},
{
"entropy": 2.54942626953125,
"epoch": 0.20347097546379414,
"grad_norm": 3.015625,
"learning_rate": 5.7971274685816876e-05,
"loss": 0.0908,
"mean_token_accuracy": 0.9789189040660858,
"num_tokens": 358140.0,
"step": 340
},
{
"entropy": 2.6029396533966063,
"epoch": 0.2094554159186116,
"grad_norm": 6.4375,
"learning_rate": 5.79114302812687e-05,
"loss": 0.0813,
"mean_token_accuracy": 0.9807312726974488,
"num_tokens": 368649.0,
"step": 350
},
{
"entropy": 2.7818240880966187,
"epoch": 0.21543985637342908,
"grad_norm": 1.8203125,
"learning_rate": 5.785158587672053e-05,
"loss": 0.0516,
"mean_token_accuracy": 0.9896690011024475,
"num_tokens": 379399.0,
"step": 360
},
{
"entropy": 2.6641830682754515,
"epoch": 0.22142429682824655,
"grad_norm": 5.5625,
"learning_rate": 5.779174147217235e-05,
"loss": 0.0422,
"mean_token_accuracy": 0.988187575340271,
"num_tokens": 389856.0,
"step": 370
},
{
"entropy": 2.4715123176574707,
"epoch": 0.22740873728306404,
"grad_norm": 2.8125,
"learning_rate": 5.773189706762418e-05,
"loss": 0.0468,
"mean_token_accuracy": 0.9897852420806885,
"num_tokens": 400419.0,
"step": 380
},
{
"entropy": 2.1948570370674134,
"epoch": 0.2333931777378815,
"grad_norm": 2.71875,
"learning_rate": 5.767205266307601e-05,
"loss": 0.0577,
"mean_token_accuracy": 0.9862517893314362,
"num_tokens": 410980.0,
"step": 390
},
{
"entropy": 2.1625555634498594,
"epoch": 0.23937761819269898,
"grad_norm": 3.390625,
"learning_rate": 5.761220825852783e-05,
"loss": 0.0446,
"mean_token_accuracy": 0.9873133063316345,
"num_tokens": 421285.0,
"step": 400
},
{
"entropy": 2.277107524871826,
"epoch": 0.24536205864751645,
"grad_norm": 2.734375,
"learning_rate": 5.7552363853979654e-05,
"loss": 0.0663,
"mean_token_accuracy": 0.984232223033905,
"num_tokens": 431743.0,
"step": 410
},
{
"entropy": 2.079650855064392,
"epoch": 0.2513464991023339,
"grad_norm": 2.609375,
"learning_rate": 5.749251944943148e-05,
"loss": 0.042,
"mean_token_accuracy": 0.9889630198478698,
"num_tokens": 442208.0,
"step": 420
},
{
"entropy": 2.0632903933525086,
"epoch": 0.2573309395571514,
"grad_norm": 3.28125,
"learning_rate": 5.74326750448833e-05,
"loss": 0.0658,
"mean_token_accuracy": 0.9887919366359711,
"num_tokens": 452907.0,
"step": 430
},
{
"entropy": 2.0579532027244567,
"epoch": 0.26331538001196886,
"grad_norm": 5.3125,
"learning_rate": 5.737283064033513e-05,
"loss": 0.0586,
"mean_token_accuracy": 0.9878389358520507,
"num_tokens": 463376.0,
"step": 440
},
{
"entropy": 2.272001266479492,
"epoch": 0.26929982046678635,
"grad_norm": 3.671875,
"learning_rate": 5.7312986235786956e-05,
"loss": 0.0555,
"mean_token_accuracy": 0.9843586683273315,
"num_tokens": 474050.0,
"step": 450
},
{
"entropy": 2.440229368209839,
"epoch": 0.27528426092160385,
"grad_norm": 0.765625,
"learning_rate": 5.725314183123878e-05,
"loss": 0.0539,
"mean_token_accuracy": 0.9886721253395081,
"num_tokens": 484638.0,
"step": 460
},
{
"entropy": 2.22398726940155,
"epoch": 0.2812687013764213,
"grad_norm": 2.375,
"learning_rate": 5.719329742669061e-05,
"loss": 0.038,
"mean_token_accuracy": 0.9924259662628174,
"num_tokens": 495422.0,
"step": 470
},
{
"entropy": 2.2964001417160036,
"epoch": 0.2872531418312388,
"grad_norm": 1.796875,
"learning_rate": 5.713345302214243e-05,
"loss": 0.0403,
"mean_token_accuracy": 0.9900494039058685,
"num_tokens": 506057.0,
"step": 480
},
{
"entropy": 2.329486632347107,
"epoch": 0.29323758228605623,
"grad_norm": 4.4375,
"learning_rate": 5.707360861759426e-05,
"loss": 0.0429,
"mean_token_accuracy": 0.989715838432312,
"num_tokens": 516463.0,
"step": 490
},
{
"entropy": 2.3366709470748903,
"epoch": 0.2992220227408737,
"grad_norm": 1.0,
"learning_rate": 5.701376421304608e-05,
"loss": 0.0645,
"mean_token_accuracy": 0.9869139313697814,
"num_tokens": 526981.0,
"step": 500
},
{
"entropy": 2.4269517421722413,
"epoch": 0.3052064631956912,
"grad_norm": 2.4375,
"learning_rate": 5.6953919808497904e-05,
"loss": 0.0475,
"mean_token_accuracy": 0.9899543404579163,
"num_tokens": 537626.0,
"step": 510
},
{
"entropy": 2.1964712858200075,
"epoch": 0.31119090365050867,
"grad_norm": 1.984375,
"learning_rate": 5.6894075403949734e-05,
"loss": 0.0672,
"mean_token_accuracy": 0.9869764924049378,
"num_tokens": 548046.0,
"step": 520
},
{
"entropy": 2.1039986968040467,
"epoch": 0.31717534410532616,
"grad_norm": 1.4921875,
"learning_rate": 5.683423099940156e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9941431105136871,
"num_tokens": 558631.0,
"step": 530
},
{
"entropy": 2.3732316970825194,
"epoch": 0.3231597845601436,
"grad_norm": 0.796875,
"learning_rate": 5.677438659485338e-05,
"loss": 0.0666,
"mean_token_accuracy": 0.9879998922348022,
"num_tokens": 569247.0,
"step": 540
},
{
"entropy": 2.28140549659729,
"epoch": 0.3291442250149611,
"grad_norm": 2.3125,
"learning_rate": 5.671454219030521e-05,
"loss": 0.0406,
"mean_token_accuracy": 0.9887649655342102,
"num_tokens": 579833.0,
"step": 550
},
{
"entropy": 2.4391223430633544,
"epoch": 0.3351286654697786,
"grad_norm": 1.1484375,
"learning_rate": 5.6654697785757035e-05,
"loss": 0.0495,
"mean_token_accuracy": 0.9919759035110474,
"num_tokens": 590311.0,
"step": 560
},
{
"entropy": 2.4771656036376952,
"epoch": 0.34111310592459604,
"grad_norm": 2.5625,
"learning_rate": 5.659485338120885e-05,
"loss": 0.0433,
"mean_token_accuracy": 0.9914680182933807,
"num_tokens": 600739.0,
"step": 570
},
{
"entropy": 2.2438557863235475,
"epoch": 0.34709754637941354,
"grad_norm": 1.34375,
"learning_rate": 5.653500897666068e-05,
"loss": 0.0469,
"mean_token_accuracy": 0.9890537202358246,
"num_tokens": 611176.0,
"step": 580
},
{
"entropy": 2.1311206340789797,
"epoch": 0.353081986834231,
"grad_norm": 3.25,
"learning_rate": 5.6475164572112506e-05,
"loss": 0.0274,
"mean_token_accuracy": 0.9945613145828247,
"num_tokens": 621635.0,
"step": 590
},
{
"entropy": 2.1030924916267395,
"epoch": 0.3590664272890485,
"grad_norm": 0.76171875,
"learning_rate": 5.641532016756434e-05,
"loss": 0.0312,
"mean_token_accuracy": 0.9910819947719574,
"num_tokens": 632248.0,
"step": 600
},
{
"entropy": 2.344179320335388,
"epoch": 0.36505086774386597,
"grad_norm": 0.984375,
"learning_rate": 5.635547576301616e-05,
"loss": 0.0472,
"mean_token_accuracy": 0.9889743030071259,
"num_tokens": 642682.0,
"step": 610
},
{
"entropy": 2.39926335811615,
"epoch": 0.3710353081986834,
"grad_norm": 1.9765625,
"learning_rate": 5.6295631358467984e-05,
"loss": 0.0532,
"mean_token_accuracy": 0.9905676782131195,
"num_tokens": 653445.0,
"step": 620
},
{
"entropy": 2.1937442779541017,
"epoch": 0.3770197486535009,
"grad_norm": 2.203125,
"learning_rate": 5.6235786953919814e-05,
"loss": 0.0378,
"mean_token_accuracy": 0.991233092546463,
"num_tokens": 663807.0,
"step": 630
},
{
"entropy": 2.0357667565345765,
"epoch": 0.38300418910831835,
"grad_norm": 2.78125,
"learning_rate": 5.617594254937164e-05,
"loss": 0.0412,
"mean_token_accuracy": 0.9888907968997955,
"num_tokens": 674458.0,
"step": 640
},
{
"entropy": 1.9218869924545288,
"epoch": 0.38898862956313585,
"grad_norm": 1.2890625,
"learning_rate": 5.6116098144823455e-05,
"loss": 0.024,
"mean_token_accuracy": 0.9945397853851319,
"num_tokens": 684951.0,
"step": 650
},
{
"entropy": 1.7186935544013977,
"epoch": 0.39497307001795334,
"grad_norm": 2.484375,
"learning_rate": 5.6056253740275285e-05,
"loss": 0.0378,
"mean_token_accuracy": 0.9907946050167084,
"num_tokens": 695456.0,
"step": 660
},
{
"entropy": 1.7377236485481262,
"epoch": 0.4009575104727708,
"grad_norm": 1.8828125,
"learning_rate": 5.599640933572711e-05,
"loss": 0.0589,
"mean_token_accuracy": 0.988366037607193,
"num_tokens": 706028.0,
"step": 670
},
{
"entropy": 2.0943747401237487,
"epoch": 0.4069419509275883,
"grad_norm": 1.5859375,
"learning_rate": 5.593656493117893e-05,
"loss": 0.0281,
"mean_token_accuracy": 0.9936488032341003,
"num_tokens": 716721.0,
"step": 680
},
{
"entropy": 2.1617534160614014,
"epoch": 0.4129263913824057,
"grad_norm": 1.0234375,
"learning_rate": 5.587672052663076e-05,
"loss": 0.0549,
"mean_token_accuracy": 0.9885023236274719,
"num_tokens": 727257.0,
"step": 690
},
{
"entropy": 2.031095576286316,
"epoch": 0.4189108318372232,
"grad_norm": 2.34375,
"learning_rate": 5.5816876122082586e-05,
"loss": 0.0341,
"mean_token_accuracy": 0.991800045967102,
"num_tokens": 737999.0,
"step": 700
},
{
"entropy": 1.7404110312461853,
"epoch": 0.4248952722920407,
"grad_norm": 1.8828125,
"learning_rate": 5.5757031717534417e-05,
"loss": 0.0375,
"mean_token_accuracy": 0.9914985001087189,
"num_tokens": 748758.0,
"step": 710
},
{
"entropy": 1.7197586655616761,
"epoch": 0.43087971274685816,
"grad_norm": 2.078125,
"learning_rate": 5.569718731298624e-05,
"loss": 0.0367,
"mean_token_accuracy": 0.9925475895404816,
"num_tokens": 759197.0,
"step": 720
},
{
"entropy": 1.8254467248916626,
"epoch": 0.43686415320167565,
"grad_norm": 0.251953125,
"learning_rate": 5.563734290843806e-05,
"loss": 0.0231,
"mean_token_accuracy": 0.9942022025585174,
"num_tokens": 769493.0,
"step": 730
},
{
"entropy": 2.1383556604385374,
"epoch": 0.4428485936564931,
"grad_norm": 0.33984375,
"learning_rate": 5.557749850388989e-05,
"loss": 0.0438,
"mean_token_accuracy": 0.9910104990005493,
"num_tokens": 779975.0,
"step": 740
},
{
"entropy": 2.26552300453186,
"epoch": 0.4488330341113106,
"grad_norm": 0.9296875,
"learning_rate": 5.551765409934171e-05,
"loss": 0.0396,
"mean_token_accuracy": 0.992530471086502,
"num_tokens": 790633.0,
"step": 750
},
{
"entropy": 2.0240984559059143,
"epoch": 0.4548174745661281,
"grad_norm": 2.328125,
"learning_rate": 5.5457809694793535e-05,
"loss": 0.0387,
"mean_token_accuracy": 0.9894236505031586,
"num_tokens": 801174.0,
"step": 760
},
{
"entropy": 1.966257667541504,
"epoch": 0.46080191502094553,
"grad_norm": 5.375,
"learning_rate": 5.5397965290245365e-05,
"loss": 0.0468,
"mean_token_accuracy": 0.9893896758556366,
"num_tokens": 811800.0,
"step": 770
},
{
"entropy": 2.0768365025520326,
"epoch": 0.466786355475763,
"grad_norm": 5.8125,
"learning_rate": 5.533812088569719e-05,
"loss": 0.0328,
"mean_token_accuracy": 0.9934167742729187,
"num_tokens": 822464.0,
"step": 780
},
{
"entropy": 1.9794872641563415,
"epoch": 0.47277079593058047,
"grad_norm": 1.8203125,
"learning_rate": 5.527827648114902e-05,
"loss": 0.0313,
"mean_token_accuracy": 0.9931986689567566,
"num_tokens": 832938.0,
"step": 790
},
{
"entropy": 1.9216319561004638,
"epoch": 0.47875523638539796,
"grad_norm": 5.28125,
"learning_rate": 5.521843207660084e-05,
"loss": 0.053,
"mean_token_accuracy": 0.9898009598255157,
"num_tokens": 843616.0,
"step": 800
},
{
"entropy": 1.9540860176086425,
"epoch": 0.48473967684021546,
"grad_norm": 0.76953125,
"learning_rate": 5.5158587672052666e-05,
"loss": 0.0231,
"mean_token_accuracy": 0.9942503988742828,
"num_tokens": 854328.0,
"step": 810
},
{
"entropy": 2.186979651451111,
"epoch": 0.4907241172950329,
"grad_norm": 0.07568359375,
"learning_rate": 5.509874326750449e-05,
"loss": 0.0117,
"mean_token_accuracy": 0.9971015155315399,
"num_tokens": 864919.0,
"step": 820
},
{
"entropy": 2.058599293231964,
"epoch": 0.4967085577498504,
"grad_norm": 2.984375,
"learning_rate": 5.503889886295631e-05,
"loss": 0.013,
"mean_token_accuracy": 0.9951699852943421,
"num_tokens": 875488.0,
"step": 830
},
{
"entropy": 1.8453468203544616,
"epoch": 0.5026929982046678,
"grad_norm": 2.21875,
"learning_rate": 5.497905445840814e-05,
"loss": 0.0298,
"mean_token_accuracy": 0.9947227597236633,
"num_tokens": 886025.0,
"step": 840
},
{
"entropy": 1.9660922765731812,
"epoch": 0.5086774386594853,
"grad_norm": 0.294921875,
"learning_rate": 5.491921005385997e-05,
"loss": 0.0315,
"mean_token_accuracy": 0.9947958946228027,
"num_tokens": 896575.0,
"step": 850
},
{
"entropy": 2.0921285152435303,
"epoch": 0.5146618791143028,
"grad_norm": 2.1875,
"learning_rate": 5.485936564931179e-05,
"loss": 0.0192,
"mean_token_accuracy": 0.9942731440067292,
"num_tokens": 907186.0,
"step": 860
},
{
"entropy": 2.0400147795677186,
"epoch": 0.5206463195691203,
"grad_norm": 1.7578125,
"learning_rate": 5.4799521244763614e-05,
"loss": 0.0356,
"mean_token_accuracy": 0.993338668346405,
"num_tokens": 917742.0,
"step": 870
},
{
"entropy": 2.140678858757019,
"epoch": 0.5266307600239377,
"grad_norm": 1.21875,
"learning_rate": 5.4739676840215445e-05,
"loss": 0.0227,
"mean_token_accuracy": 0.9957817852497101,
"num_tokens": 928312.0,
"step": 880
},
{
"entropy": 2.284876358509064,
"epoch": 0.5326152004787552,
"grad_norm": 2.71875,
"learning_rate": 5.467983243566727e-05,
"loss": 0.0576,
"mean_token_accuracy": 0.9898476004600525,
"num_tokens": 939023.0,
"step": 890
},
{
"entropy": 2.2926900386810303,
"epoch": 0.5385996409335727,
"grad_norm": 1.109375,
"learning_rate": 5.461998803111909e-05,
"loss": 0.0218,
"mean_token_accuracy": 0.9931512713432312,
"num_tokens": 949275.0,
"step": 900
},
{
"entropy": 1.954952347278595,
"epoch": 0.5445840813883902,
"grad_norm": 1.3984375,
"learning_rate": 5.4560143626570916e-05,
"loss": 0.0236,
"mean_token_accuracy": 0.993185955286026,
"num_tokens": 959821.0,
"step": 910
},
{
"entropy": 1.9035270810127258,
"epoch": 0.5505685218432077,
"grad_norm": 2.140625,
"learning_rate": 5.450029922202274e-05,
"loss": 0.0413,
"mean_token_accuracy": 0.9900493204593659,
"num_tokens": 970302.0,
"step": 920
},
{
"entropy": 2.1859104156494142,
"epoch": 0.5565529622980251,
"grad_norm": 1.78125,
"learning_rate": 5.444045481747457e-05,
"loss": 0.0351,
"mean_token_accuracy": 0.9932608604431152,
"num_tokens": 980883.0,
"step": 930
},
{
"entropy": 2.0663935422897337,
"epoch": 0.5625374027528426,
"grad_norm": 2.34375,
"learning_rate": 5.438061041292639e-05,
"loss": 0.023,
"mean_token_accuracy": 0.9912481963634491,
"num_tokens": 991526.0,
"step": 940
},
{
"entropy": 1.8086812853813172,
"epoch": 0.5685218432076601,
"grad_norm": 0.4453125,
"learning_rate": 5.432076600837822e-05,
"loss": 0.0156,
"mean_token_accuracy": 0.9955720365047455,
"num_tokens": 1002052.0,
"step": 950
},
{
"entropy": 1.7378576636314391,
"epoch": 0.5745062836624776,
"grad_norm": 18.125,
"learning_rate": 5.426092160383005e-05,
"loss": 0.0207,
"mean_token_accuracy": 0.9957412719726563,
"num_tokens": 1012679.0,
"step": 960
},
{
"entropy": 1.6939527988433838,
"epoch": 0.5804907241172951,
"grad_norm": 1.546875,
"learning_rate": 5.420107719928187e-05,
"loss": 0.0183,
"mean_token_accuracy": 0.9954462170600891,
"num_tokens": 1023349.0,
"step": 970
},
{
"entropy": 1.985690712928772,
"epoch": 0.5864751645721125,
"grad_norm": 0.357421875,
"learning_rate": 5.414123279473369e-05,
"loss": 0.0176,
"mean_token_accuracy": 0.995449674129486,
"num_tokens": 1034039.0,
"step": 980
},
{
"entropy": 2.013271224498749,
"epoch": 0.59245960502693,
"grad_norm": 2.03125,
"learning_rate": 5.408138839018552e-05,
"loss": 0.0299,
"mean_token_accuracy": 0.992346465587616,
"num_tokens": 1044713.0,
"step": 990
},
{
"entropy": 2.0139170050621034,
"epoch": 0.5984440454817475,
"grad_norm": 2.203125,
"learning_rate": 5.402154398563734e-05,
"loss": 0.0423,
"mean_token_accuracy": 0.9917679131031036,
"num_tokens": 1055311.0,
"step": 1000
},
{
"entropy": 2.1181130170822144,
"epoch": 0.604428485936565,
"grad_norm": 0.42578125,
"learning_rate": 5.396169958108917e-05,
"loss": 0.0331,
"mean_token_accuracy": 0.9931470155715942,
"num_tokens": 1065957.0,
"step": 1010
},
{
"entropy": 1.8641826033592224,
"epoch": 0.6104129263913824,
"grad_norm": 2.421875,
"learning_rate": 5.3901855176540995e-05,
"loss": 0.0322,
"mean_token_accuracy": 0.9937169075012207,
"num_tokens": 1076428.0,
"step": 1020
},
{
"entropy": 1.8986489415168761,
"epoch": 0.6163973668461998,
"grad_norm": 1.640625,
"learning_rate": 5.384201077199282e-05,
"loss": 0.023,
"mean_token_accuracy": 0.9938018679618835,
"num_tokens": 1086893.0,
"step": 1030
},
{
"entropy": 2.18128616809845,
"epoch": 0.6223818073010173,
"grad_norm": 1.4921875,
"learning_rate": 5.378216636744465e-05,
"loss": 0.0384,
"mean_token_accuracy": 0.9915229856967926,
"num_tokens": 1097584.0,
"step": 1040
},
{
"entropy": 2.236606168746948,
"epoch": 0.6283662477558348,
"grad_norm": 0.474609375,
"learning_rate": 5.372232196289647e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.994802838563919,
"num_tokens": 1108360.0,
"step": 1050
},
{
"entropy": 2.085828936100006,
"epoch": 0.6343506882106523,
"grad_norm": 1.640625,
"learning_rate": 5.366247755834829e-05,
"loss": 0.0192,
"mean_token_accuracy": 0.9961528956890107,
"num_tokens": 1118906.0,
"step": 1060
},
{
"entropy": 1.8950690627098083,
"epoch": 0.6403351286654698,
"grad_norm": 0.546875,
"learning_rate": 5.360263315380012e-05,
"loss": 0.0111,
"mean_token_accuracy": 0.9958807468414307,
"num_tokens": 1129629.0,
"step": 1070
},
{
"entropy": 1.7149085760116578,
"epoch": 0.6463195691202872,
"grad_norm": 0.76171875,
"learning_rate": 5.3542788749251944e-05,
"loss": 0.0275,
"mean_token_accuracy": 0.9933666825294495,
"num_tokens": 1140311.0,
"step": 1080
},
{
"entropy": 1.746055507659912,
"epoch": 0.6523040095751047,
"grad_norm": 2.03125,
"learning_rate": 5.3482944344703774e-05,
"loss": 0.0142,
"mean_token_accuracy": 0.9978117167949676,
"num_tokens": 1150847.0,
"step": 1090
},
{
"entropy": 1.9340453743934631,
"epoch": 0.6582884500299222,
"grad_norm": 3.8125,
"learning_rate": 5.34230999401556e-05,
"loss": 0.0457,
"mean_token_accuracy": 0.9922799825668335,
"num_tokens": 1161560.0,
"step": 1100
},
{
"entropy": 2.1036330103874206,
"epoch": 0.6642728904847397,
"grad_norm": 3.375,
"learning_rate": 5.336325553560742e-05,
"loss": 0.0294,
"mean_token_accuracy": 0.9932465612888336,
"num_tokens": 1172079.0,
"step": 1110
},
{
"entropy": 2.233074736595154,
"epoch": 0.6702573309395572,
"grad_norm": 0.6875,
"learning_rate": 5.330341113105925e-05,
"loss": 0.0183,
"mean_token_accuracy": 0.9958370923995972,
"num_tokens": 1182693.0,
"step": 1120
},
{
"entropy": 2.2601813077926636,
"epoch": 0.6762417713943746,
"grad_norm": 0.0712890625,
"learning_rate": 5.3243566726511075e-05,
"loss": 0.024,
"mean_token_accuracy": 0.9958759665489196,
"num_tokens": 1193488.0,
"step": 1130
},
{
"entropy": 2.336117482185364,
"epoch": 0.6822262118491921,
"grad_norm": 2.140625,
"learning_rate": 5.318372232196289e-05,
"loss": 0.025,
"mean_token_accuracy": 0.9926075279712677,
"num_tokens": 1204193.0,
"step": 1140
},
{
"entropy": 2.229608154296875,
"epoch": 0.6882106523040096,
"grad_norm": 1.046875,
"learning_rate": 5.312387791741472e-05,
"loss": 0.026,
"mean_token_accuracy": 0.9936797678470611,
"num_tokens": 1214759.0,
"step": 1150
},
{
"entropy": 2.1003764390945436,
"epoch": 0.6941950927588271,
"grad_norm": 0.61328125,
"learning_rate": 5.3064033512866546e-05,
"loss": 0.0224,
"mean_token_accuracy": 0.9928372919559478,
"num_tokens": 1225411.0,
"step": 1160
},
{
"entropy": 1.9683186292648316,
"epoch": 0.7001795332136446,
"grad_norm": 1.0546875,
"learning_rate": 5.300418910831837e-05,
"loss": 0.0234,
"mean_token_accuracy": 0.9944151937961578,
"num_tokens": 1236020.0,
"step": 1170
},
{
"entropy": 1.9860737323760986,
"epoch": 0.706163973668462,
"grad_norm": 2.125,
"learning_rate": 5.29443447037702e-05,
"loss": 0.0201,
"mean_token_accuracy": 0.9947565972805024,
"num_tokens": 1246629.0,
"step": 1180
},
{
"entropy": 1.9602022528648377,
"epoch": 0.7121484141232794,
"grad_norm": 1.0234375,
"learning_rate": 5.2884500299222024e-05,
"loss": 0.016,
"mean_token_accuracy": 0.9972622811794281,
"num_tokens": 1257242.0,
"step": 1190
},
{
"entropy": 1.962342917919159,
"epoch": 0.718132854578097,
"grad_norm": 3.296875,
"learning_rate": 5.2824655894673854e-05,
"loss": 0.0278,
"mean_token_accuracy": 0.9961660385131836,
"num_tokens": 1267654.0,
"step": 1200
},
{
"entropy": 2.012917125225067,
"epoch": 0.7241172950329144,
"grad_norm": 0.54296875,
"learning_rate": 5.276481149012568e-05,
"loss": 0.0294,
"mean_token_accuracy": 0.9938123047351837,
"num_tokens": 1278416.0,
"step": 1210
},
{
"entropy": 2.1606369018554688,
"epoch": 0.7301017354877319,
"grad_norm": 0.98046875,
"learning_rate": 5.2704967085577494e-05,
"loss": 0.0096,
"mean_token_accuracy": 0.9982011258602143,
"num_tokens": 1288836.0,
"step": 1220
},
{
"entropy": 2.1367745637893676,
"epoch": 0.7360861759425493,
"grad_norm": 0.58984375,
"learning_rate": 5.2645122681029325e-05,
"loss": 0.016,
"mean_token_accuracy": 0.9962705135345459,
"num_tokens": 1299376.0,
"step": 1230
},
{
"entropy": 2.040121853351593,
"epoch": 0.7420706163973668,
"grad_norm": 0.5546875,
"learning_rate": 5.258527827648115e-05,
"loss": 0.0071,
"mean_token_accuracy": 0.9982105016708374,
"num_tokens": 1310023.0,
"step": 1240
},
{
"entropy": 1.9288089156150818,
"epoch": 0.7480550568521843,
"grad_norm": 1.4609375,
"learning_rate": 5.252543387193297e-05,
"loss": 0.0246,
"mean_token_accuracy": 0.9950582385063171,
"num_tokens": 1320431.0,
"step": 1250
},
{
"entropy": 1.9065364241600036,
"epoch": 0.7540394973070018,
"grad_norm": 1.546875,
"learning_rate": 5.24655894673848e-05,
"loss": 0.0145,
"mean_token_accuracy": 0.9972732722759247,
"num_tokens": 1331070.0,
"step": 1260
},
{
"entropy": 1.9331876635551453,
"epoch": 0.7600239377618193,
"grad_norm": 1.921875,
"learning_rate": 5.2405745062836626e-05,
"loss": 0.0137,
"mean_token_accuracy": 0.9962199032306671,
"num_tokens": 1341780.0,
"step": 1270
},
{
"entropy": 1.9782670497894288,
"epoch": 0.7660083782166367,
"grad_norm": 1.046875,
"learning_rate": 5.234590065828845e-05,
"loss": 0.015,
"mean_token_accuracy": 0.9963721334934235,
"num_tokens": 1352078.0,
"step": 1280
},
{
"entropy": 1.9941368341445922,
"epoch": 0.7719928186714542,
"grad_norm": 1.4140625,
"learning_rate": 5.228605625374028e-05,
"loss": 0.0171,
"mean_token_accuracy": 0.995795214176178,
"num_tokens": 1362574.0,
"step": 1290
},
{
"entropy": 2.0281198143959047,
"epoch": 0.7779772591262717,
"grad_norm": 1.984375,
"learning_rate": 5.2226211849192104e-05,
"loss": 0.0248,
"mean_token_accuracy": 0.9931159555912018,
"num_tokens": 1373157.0,
"step": 1300
},
{
"entropy": 2.243196725845337,
"epoch": 0.7839616995810892,
"grad_norm": 2.078125,
"learning_rate": 5.216636744464393e-05,
"loss": 0.0182,
"mean_token_accuracy": 0.9942145645618439,
"num_tokens": 1383719.0,
"step": 1310
},
{
"entropy": 2.1937716722488405,
"epoch": 0.7899461400359067,
"grad_norm": 1.8203125,
"learning_rate": 5.210652304009575e-05,
"loss": 0.0237,
"mean_token_accuracy": 0.9958289206027985,
"num_tokens": 1394282.0,
"step": 1320
},
{
"entropy": 2.4126478910446165,
"epoch": 0.7959305804907241,
"grad_norm": 0.2578125,
"learning_rate": 5.2046678635547574e-05,
"loss": 0.0176,
"mean_token_accuracy": 0.9958206951618195,
"num_tokens": 1404993.0,
"step": 1330
},
{
"entropy": 2.3782467603683473,
"epoch": 0.8019150209455416,
"grad_norm": 1.34375,
"learning_rate": 5.1986834230999405e-05,
"loss": 0.0205,
"mean_token_accuracy": 0.995442909002304,
"num_tokens": 1415486.0,
"step": 1340
},
{
"entropy": 2.243998336791992,
"epoch": 0.8078994614003591,
"grad_norm": 5.125,
"learning_rate": 5.192698982645123e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9941241443157196,
"num_tokens": 1426258.0,
"step": 1350
},
{
"entropy": 2.1216912388801576,
"epoch": 0.8138839018551766,
"grad_norm": 0.796875,
"learning_rate": 5.186714542190305e-05,
"loss": 0.0227,
"mean_token_accuracy": 0.9941264271736145,
"num_tokens": 1436794.0,
"step": 1360
},
{
"entropy": 2.2492347717285157,
"epoch": 0.8198683423099941,
"grad_norm": 0.26953125,
"learning_rate": 5.180730101735488e-05,
"loss": 0.0225,
"mean_token_accuracy": 0.995734578371048,
"num_tokens": 1447419.0,
"step": 1370
},
{
"entropy": 2.2173564434051514,
"epoch": 0.8258527827648114,
"grad_norm": 2.96875,
"learning_rate": 5.1747456612806706e-05,
"loss": 0.021,
"mean_token_accuracy": 0.9952861666679382,
"num_tokens": 1457898.0,
"step": 1380
},
{
"entropy": 2.240031886100769,
"epoch": 0.8318372232196289,
"grad_norm": 0.52734375,
"learning_rate": 5.168761220825853e-05,
"loss": 0.0167,
"mean_token_accuracy": 0.9952712118625641,
"num_tokens": 1468317.0,
"step": 1390
},
{
"entropy": 2.1114137172698975,
"epoch": 0.8378216636744464,
"grad_norm": 2.296875,
"learning_rate": 5.162776780371035e-05,
"loss": 0.0147,
"mean_token_accuracy": 0.9961936414241791,
"num_tokens": 1479024.0,
"step": 1400
},
{
"entropy": 2.1817475318908692,
"epoch": 0.8438061041292639,
"grad_norm": 0.38671875,
"learning_rate": 5.156792339916218e-05,
"loss": 0.0256,
"mean_token_accuracy": 0.9960037291049957,
"num_tokens": 1489537.0,
"step": 1410
},
{
"entropy": 2.067359519004822,
"epoch": 0.8497905445840814,
"grad_norm": 0.53515625,
"learning_rate": 5.150807899461401e-05,
"loss": 0.0092,
"mean_token_accuracy": 0.9971863865852356,
"num_tokens": 1500071.0,
"step": 1420
},
{
"entropy": 1.7851839542388916,
"epoch": 0.8557749850388988,
"grad_norm": 0.1533203125,
"learning_rate": 5.144823459006583e-05,
"loss": 0.0106,
"mean_token_accuracy": 0.9977741360664367,
"num_tokens": 1510569.0,
"step": 1430
},
{
"entropy": 1.6537530064582824,
"epoch": 0.8617594254937163,
"grad_norm": 0.7109375,
"learning_rate": 5.1388390185517654e-05,
"loss": 0.0216,
"mean_token_accuracy": 0.9965560495853424,
"num_tokens": 1521246.0,
"step": 1440
},
{
"entropy": 1.557056224346161,
"epoch": 0.8677438659485338,
"grad_norm": 1.1484375,
"learning_rate": 5.1328545780969485e-05,
"loss": 0.0119,
"mean_token_accuracy": 0.9958473801612854,
"num_tokens": 1531684.0,
"step": 1450
},
{
"entropy": 1.686435067653656,
"epoch": 0.8737283064033513,
"grad_norm": 1.3984375,
"learning_rate": 5.126870137642131e-05,
"loss": 0.0323,
"mean_token_accuracy": 0.9941241443157196,
"num_tokens": 1542355.0,
"step": 1460
},
{
"entropy": 1.9768336772918702,
"epoch": 0.8797127468581688,
"grad_norm": 2.328125,
"learning_rate": 5.1208856971873125e-05,
"loss": 0.0321,
"mean_token_accuracy": 0.992108279466629,
"num_tokens": 1552858.0,
"step": 1470
},
{
"entropy": 2.234424018859863,
"epoch": 0.8856971873129862,
"grad_norm": 0.87890625,
"learning_rate": 5.1149012567324955e-05,
"loss": 0.0137,
"mean_token_accuracy": 0.9972717106342316,
"num_tokens": 1563443.0,
"step": 1480
},
{
"entropy": 2.180929946899414,
"epoch": 0.8916816277678037,
"grad_norm": 1.5,
"learning_rate": 5.108916816277678e-05,
"loss": 0.0233,
"mean_token_accuracy": 0.9935569524765014,
"num_tokens": 1574132.0,
"step": 1490
},
{
"entropy": 2.016967070102692,
"epoch": 0.8976660682226212,
"grad_norm": 1.8515625,
"learning_rate": 5.102932375822861e-05,
"loss": 0.0151,
"mean_token_accuracy": 0.9970189332962036,
"num_tokens": 1584513.0,
"step": 1500
},
{
"entropy": 2.0316667675971987,
"epoch": 0.9036505086774387,
"grad_norm": 0.435546875,
"learning_rate": 5.096947935368043e-05,
"loss": 0.0195,
"mean_token_accuracy": 0.9942385494709015,
"num_tokens": 1594911.0,
"step": 1510
},
{
"entropy": 2.189347743988037,
"epoch": 0.9096349491322562,
"grad_norm": 1.15625,
"learning_rate": 5.090963494913226e-05,
"loss": 0.0159,
"mean_token_accuracy": 0.995745187997818,
"num_tokens": 1605415.0,
"step": 1520
},
{
"entropy": 2.2969648122787474,
"epoch": 0.9156193895870736,
"grad_norm": 0.32421875,
"learning_rate": 5.084979054458409e-05,
"loss": 0.0177,
"mean_token_accuracy": 0.9965317666530609,
"num_tokens": 1615920.0,
"step": 1530
},
{
"entropy": 2.4979332208633425,
"epoch": 0.9216038300418911,
"grad_norm": 0.2734375,
"learning_rate": 5.078994614003591e-05,
"loss": 0.0261,
"mean_token_accuracy": 0.9930246412754059,
"num_tokens": 1626519.0,
"step": 1540
},
{
"entropy": 2.381369423866272,
"epoch": 0.9275882704967086,
"grad_norm": 0.16796875,
"learning_rate": 5.073010173548773e-05,
"loss": 0.0168,
"mean_token_accuracy": 0.9979732036590576,
"num_tokens": 1637154.0,
"step": 1550
},
{
"entropy": 2.2379947185516356,
"epoch": 0.933572710951526,
"grad_norm": 0.248046875,
"learning_rate": 5.067025733093956e-05,
"loss": 0.0093,
"mean_token_accuracy": 0.9971390187740325,
"num_tokens": 1647633.0,
"step": 1560
},
{
"entropy": 2.245289707183838,
"epoch": 0.9395571514063435,
"grad_norm": 4.21875,
"learning_rate": 5.061041292639138e-05,
"loss": 0.0249,
"mean_token_accuracy": 0.9938650369644165,
"num_tokens": 1658248.0,
"step": 1570
},
{
"entropy": 2.252811074256897,
"epoch": 0.9455415918611609,
"grad_norm": 0.3203125,
"learning_rate": 5.0550568521843205e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9979120135307312,
"num_tokens": 1668889.0,
"step": 1580
},
{
"entropy": 2.2055715322494507,
"epoch": 0.9515260323159784,
"grad_norm": 0.029052734375,
"learning_rate": 5.0490724117295035e-05,
"loss": 0.0159,
"mean_token_accuracy": 0.995630270242691,
"num_tokens": 1679264.0,
"step": 1590
},
{
"entropy": 2.1671520948410032,
"epoch": 0.9575104727707959,
"grad_norm": 0.08935546875,
"learning_rate": 5.043087971274686e-05,
"loss": 0.0246,
"mean_token_accuracy": 0.9956823647022247,
"num_tokens": 1689812.0,
"step": 1600
},
{
"entropy": 2.264160418510437,
"epoch": 0.9634949132256134,
"grad_norm": 1.390625,
"learning_rate": 5.037103530819869e-05,
"loss": 0.021,
"mean_token_accuracy": 0.9950319647789001,
"num_tokens": 1700414.0,
"step": 1610
},
{
"entropy": 2.3583990335464478,
"epoch": 0.9694793536804309,
"grad_norm": 0.322265625,
"learning_rate": 5.031119090365051e-05,
"loss": 0.0075,
"mean_token_accuracy": 0.9978622436523438,
"num_tokens": 1711140.0,
"step": 1620
},
{
"entropy": 2.2860071897506713,
"epoch": 0.9754637941352483,
"grad_norm": 0.04736328125,
"learning_rate": 5.025134649910233e-05,
"loss": 0.0071,
"mean_token_accuracy": 0.9978203475475311,
"num_tokens": 1721755.0,
"step": 1630
},
{
"entropy": 2.346220374107361,
"epoch": 0.9814482345900658,
"grad_norm": 0.69921875,
"learning_rate": 5.019150209455416e-05,
"loss": 0.0115,
"mean_token_accuracy": 0.9975032925605773,
"num_tokens": 1732426.0,
"step": 1640
},
{
"entropy": 2.3739873647689818,
"epoch": 0.9874326750448833,
"grad_norm": 1.4375,
"learning_rate": 5.0131657690005984e-05,
"loss": 0.0186,
"mean_token_accuracy": 0.9963673233985901,
"num_tokens": 1742943.0,
"step": 1650
},
{
"entropy": 2.418501615524292,
"epoch": 0.9934171154997008,
"grad_norm": 2.625,
"learning_rate": 5.007181328545781e-05,
"loss": 0.0316,
"mean_token_accuracy": 0.9935234546661377,
"num_tokens": 1753459.0,
"step": 1660
},
{
"entropy": 2.5557403326034547,
"epoch": 0.9994015559545183,
"grad_norm": 4.90625,
"learning_rate": 5.001196888090964e-05,
"loss": 0.0184,
"mean_token_accuracy": 0.9944635927677155,
"num_tokens": 1764253.0,
"step": 1670
},
{
"entropy": 2.444777250289917,
"epoch": 1.0053859964093357,
"grad_norm": 0.78125,
"learning_rate": 4.995212447636146e-05,
"loss": 0.0058,
"mean_token_accuracy": 0.9985901474952698,
"num_tokens": 1774801.0,
"step": 1680
},
{
"entropy": 2.319710397720337,
"epoch": 1.0113704368641532,
"grad_norm": 0.94921875,
"learning_rate": 4.989228007181329e-05,
"loss": 0.0099,
"mean_token_accuracy": 0.9978196382522583,
"num_tokens": 1785099.0,
"step": 1690
},
{
"entropy": 2.311413550376892,
"epoch": 1.0173548773189707,
"grad_norm": 0.0703125,
"learning_rate": 4.9832435667265115e-05,
"loss": 0.0057,
"mean_token_accuracy": 0.9981013059616088,
"num_tokens": 1795558.0,
"step": 1700
},
{
"entropy": 2.040494406223297,
"epoch": 1.0233393177737882,
"grad_norm": 0.07958984375,
"learning_rate": 4.977259126271694e-05,
"loss": 0.0059,
"mean_token_accuracy": 0.9982348620891571,
"num_tokens": 1806044.0,
"step": 1710
},
{
"entropy": 1.9538646340370178,
"epoch": 1.0293237582286057,
"grad_norm": 0.0255126953125,
"learning_rate": 4.971274685816876e-05,
"loss": 0.0091,
"mean_token_accuracy": 0.9989152610301971,
"num_tokens": 1816591.0,
"step": 1720
},
{
"entropy": 1.9389848232269287,
"epoch": 1.0353081986834232,
"grad_norm": 0.10888671875,
"learning_rate": 4.9652902453620586e-05,
"loss": 0.0064,
"mean_token_accuracy": 0.9982307553291321,
"num_tokens": 1827193.0,
"step": 1730
},
{
"entropy": 2.034816288948059,
"epoch": 1.0412926391382407,
"grad_norm": 0.87109375,
"learning_rate": 4.959305804907241e-05,
"loss": 0.0068,
"mean_token_accuracy": 0.9974384307861328,
"num_tokens": 1837562.0,
"step": 1740
},
{
"entropy": 1.9168062806129456,
"epoch": 1.0472770795930582,
"grad_norm": 1.1015625,
"learning_rate": 4.953321364452424e-05,
"loss": 0.0126,
"mean_token_accuracy": 0.9967582404613495,
"num_tokens": 1848040.0,
"step": 1750
},
{
"entropy": 1.914523994922638,
"epoch": 1.0532615200478754,
"grad_norm": 0.2041015625,
"learning_rate": 4.9473369239976064e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9989884674549103,
"num_tokens": 1858732.0,
"step": 1760
},
{
"entropy": 1.8829883575439452,
"epoch": 1.059245960502693,
"grad_norm": 1.1875,
"learning_rate": 4.941352483542789e-05,
"loss": 0.0104,
"mean_token_accuracy": 0.9978753447532653,
"num_tokens": 1869534.0,
"step": 1770
},
{
"entropy": 1.9079457640647888,
"epoch": 1.0652304009575104,
"grad_norm": 0.5234375,
"learning_rate": 4.935368043087972e-05,
"loss": 0.0158,
"mean_token_accuracy": 0.9964175879955292,
"num_tokens": 1880120.0,
"step": 1780
},
{
"entropy": 2.182413923740387,
"epoch": 1.071214841412328,
"grad_norm": 0.220703125,
"learning_rate": 4.929383602633154e-05,
"loss": 0.0081,
"mean_token_accuracy": 0.9970734059810639,
"num_tokens": 1890589.0,
"step": 1790
},
{
"entropy": 2.1202453851699827,
"epoch": 1.0771992818671454,
"grad_norm": 0.328125,
"learning_rate": 4.9233991621783365e-05,
"loss": 0.0095,
"mean_token_accuracy": 0.9975960195064545,
"num_tokens": 1901169.0,
"step": 1800
},
{
"entropy": 2.040514326095581,
"epoch": 1.083183722321963,
"grad_norm": 0.294921875,
"learning_rate": 4.917414721723519e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9978755116462708,
"num_tokens": 1911675.0,
"step": 1810
},
{
"entropy": 1.9972706317901612,
"epoch": 1.0891681627767804,
"grad_norm": 0.07275390625,
"learning_rate": 4.911430281268701e-05,
"loss": 0.0086,
"mean_token_accuracy": 0.9979366779327392,
"num_tokens": 1922328.0,
"step": 1820
},
{
"entropy": 1.960794413089752,
"epoch": 1.095152603231598,
"grad_norm": 0.0263671875,
"learning_rate": 4.905445840813884e-05,
"loss": 0.0069,
"mean_token_accuracy": 0.9982580840587616,
"num_tokens": 1932945.0,
"step": 1830
},
{
"entropy": 1.9782156348228455,
"epoch": 1.1011370436864154,
"grad_norm": 0.08447265625,
"learning_rate": 4.8994614003590666e-05,
"loss": 0.0134,
"mean_token_accuracy": 0.9974651634693146,
"num_tokens": 1943470.0,
"step": 1840
},
{
"entropy": 1.977518343925476,
"epoch": 1.1071214841412327,
"grad_norm": 0.98046875,
"learning_rate": 4.893476959904249e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.9989652216434479,
"num_tokens": 1954059.0,
"step": 1850
},
{
"entropy": 1.8379858493804933,
"epoch": 1.1131059245960502,
"grad_norm": 0.076171875,
"learning_rate": 4.887492519449432e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9982218623161316,
"num_tokens": 1964445.0,
"step": 1860
},
{
"entropy": 1.8461663961410522,
"epoch": 1.1190903650508677,
"grad_norm": 0.01025390625,
"learning_rate": 4.8815080789946143e-05,
"loss": 0.0082,
"mean_token_accuracy": 0.9989234507083893,
"num_tokens": 1975144.0,
"step": 1870
},
{
"entropy": 1.744893491268158,
"epoch": 1.1250748055056852,
"grad_norm": 0.451171875,
"learning_rate": 4.875523638539796e-05,
"loss": 0.008,
"mean_token_accuracy": 0.9982535362243652,
"num_tokens": 1985655.0,
"step": 1880
},
{
"entropy": 1.8510890007019043,
"epoch": 1.1310592459605027,
"grad_norm": 2.171875,
"learning_rate": 4.869539198084979e-05,
"loss": 0.0074,
"mean_token_accuracy": 0.9972193837165833,
"num_tokens": 1996254.0,
"step": 1890
},
{
"entropy": 1.8942458748817443,
"epoch": 1.1370436864153202,
"grad_norm": 0.1484375,
"learning_rate": 4.8635547576301614e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9989045560359955,
"num_tokens": 2006855.0,
"step": 1900
},
{
"entropy": 1.8198019742965699,
"epoch": 1.1430281268701377,
"grad_norm": 1.3828125,
"learning_rate": 4.8575703171753445e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9992660522460938,
"num_tokens": 2017681.0,
"step": 1910
},
{
"entropy": 1.6781391501426697,
"epoch": 1.1490125673249552,
"grad_norm": 0.0291748046875,
"learning_rate": 4.851585876720527e-05,
"loss": 0.01,
"mean_token_accuracy": 0.9977579891681672,
"num_tokens": 2028065.0,
"step": 1920
},
{
"entropy": 1.7453859210014344,
"epoch": 1.1549970077797727,
"grad_norm": 1.2578125,
"learning_rate": 4.845601436265709e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9985866487026215,
"num_tokens": 2038744.0,
"step": 1930
},
{
"entropy": 1.7961567163467407,
"epoch": 1.1609814482345902,
"grad_norm": 0.0810546875,
"learning_rate": 4.839616995810892e-05,
"loss": 0.0103,
"mean_token_accuracy": 0.9979851245880127,
"num_tokens": 2049482.0,
"step": 1940
},
{
"entropy": 1.811640238761902,
"epoch": 1.1669658886894076,
"grad_norm": 3.0,
"learning_rate": 4.8336325553560746e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.99932302236557,
"num_tokens": 2060236.0,
"step": 1950
},
{
"entropy": 1.7889364123344422,
"epoch": 1.172950329144225,
"grad_norm": 0.30078125,
"learning_rate": 4.827648114901256e-05,
"loss": 0.0137,
"mean_token_accuracy": 0.9968235552310943,
"num_tokens": 2070951.0,
"step": 1960
},
{
"entropy": 1.8480128169059753,
"epoch": 1.1789347695990424,
"grad_norm": 0.0771484375,
"learning_rate": 4.821663674446439e-05,
"loss": 0.0103,
"mean_token_accuracy": 0.9978732526302337,
"num_tokens": 2081499.0,
"step": 1970
},
{
"entropy": 1.7933115482330322,
"epoch": 1.18491921005386,
"grad_norm": 1.015625,
"learning_rate": 4.815679233991622e-05,
"loss": 0.0087,
"mean_token_accuracy": 0.9985387563705445,
"num_tokens": 2092063.0,
"step": 1980
},
{
"entropy": 1.7108964920043945,
"epoch": 1.1909036505086774,
"grad_norm": 0.380859375,
"learning_rate": 4.809694793536805e-05,
"loss": 0.0088,
"mean_token_accuracy": 0.9971836686134339,
"num_tokens": 2102495.0,
"step": 1990
},
{
"entropy": 1.7778636336326599,
"epoch": 1.196888090963495,
"grad_norm": 0.0859375,
"learning_rate": 4.803710353081987e-05,
"loss": 0.0068,
"mean_token_accuracy": 0.9988643646240234,
"num_tokens": 2112819.0,
"step": 2000
},
{
"entropy": 1.7624136567115785,
"epoch": 1.2028725314183124,
"grad_norm": 3.34375,
"learning_rate": 4.7977259126271694e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9986830711364746,
"num_tokens": 2123473.0,
"step": 2010
},
{
"entropy": 1.78448588848114,
"epoch": 1.20885697187313,
"grad_norm": 0.037353515625,
"learning_rate": 4.7917414721723525e-05,
"loss": 0.0172,
"mean_token_accuracy": 0.9978123724460601,
"num_tokens": 2134040.0,
"step": 2020
},
{
"entropy": 1.959210455417633,
"epoch": 1.2148414123279474,
"grad_norm": 0.08544921875,
"learning_rate": 4.785757031717535e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996268630027771,
"num_tokens": 2144616.0,
"step": 2030
},
{
"entropy": 1.9250733256340027,
"epoch": 1.220825852782765,
"grad_norm": 0.0225830078125,
"learning_rate": 4.7797725912627165e-05,
"loss": 0.0084,
"mean_token_accuracy": 0.9989728569984436,
"num_tokens": 2155216.0,
"step": 2040
},
{
"entropy": 1.9832014560699462,
"epoch": 1.2268102932375822,
"grad_norm": 0.5234375,
"learning_rate": 4.7737881508078995e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.9985514342784881,
"num_tokens": 2165907.0,
"step": 2050
},
{
"entropy": 2.074608051776886,
"epoch": 1.2327947336923997,
"grad_norm": 0.67578125,
"learning_rate": 4.767803710353082e-05,
"loss": 0.0073,
"mean_token_accuracy": 0.9982097864151,
"num_tokens": 2176500.0,
"step": 2060
},
{
"entropy": 2.164236378669739,
"epoch": 1.2387791741472172,
"grad_norm": 0.39453125,
"learning_rate": 4.761819269898264e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.998193335533142,
"num_tokens": 2186941.0,
"step": 2070
},
{
"entropy": 2.0435985803604124,
"epoch": 1.2447636146020347,
"grad_norm": 0.02197265625,
"learning_rate": 4.755834829443447e-05,
"loss": 0.0084,
"mean_token_accuracy": 0.9974483609199524,
"num_tokens": 2197437.0,
"step": 2080
},
{
"entropy": 1.7942885041236878,
"epoch": 1.2507480550568522,
"grad_norm": 0.51171875,
"learning_rate": 4.7498503889886297e-05,
"loss": 0.012,
"mean_token_accuracy": 0.9985887467861175,
"num_tokens": 2208037.0,
"step": 2090
},
{
"entropy": 1.7626053810119628,
"epoch": 1.2567324955116697,
"grad_norm": 0.78125,
"learning_rate": 4.743865948533813e-05,
"loss": 0.0136,
"mean_token_accuracy": 0.9975202858448029,
"num_tokens": 2218593.0,
"step": 2100
},
{
"entropy": 1.821165430545807,
"epoch": 1.2627169359664872,
"grad_norm": 0.6328125,
"learning_rate": 4.737881508078995e-05,
"loss": 0.0087,
"mean_token_accuracy": 0.9983973145484925,
"num_tokens": 2229361.0,
"step": 2110
},
{
"entropy": 1.8355701923370362,
"epoch": 1.2687013764213046,
"grad_norm": 0.060302734375,
"learning_rate": 4.731897067624177e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9985953152179718,
"num_tokens": 2240050.0,
"step": 2120
},
{
"entropy": 1.7403881430625916,
"epoch": 1.2746858168761221,
"grad_norm": 1.09375,
"learning_rate": 4.72591262716936e-05,
"loss": 0.0059,
"mean_token_accuracy": 0.9986009895801544,
"num_tokens": 2250554.0,
"step": 2130
},
{
"entropy": 1.7938685655593871,
"epoch": 1.2806702573309394,
"grad_norm": 0.024658203125,
"learning_rate": 4.719928186714542e-05,
"loss": 0.0071,
"mean_token_accuracy": 0.9985599517822266,
"num_tokens": 2261237.0,
"step": 2140
},
{
"entropy": 1.8329176306724548,
"epoch": 1.2866546977857571,
"grad_norm": 0.047119140625,
"learning_rate": 4.7139437462597245e-05,
"loss": 0.0072,
"mean_token_accuracy": 0.9985998690128326,
"num_tokens": 2271802.0,
"step": 2150
},
{
"entropy": 1.7967113852500916,
"epoch": 1.2926391382405744,
"grad_norm": 1.375,
"learning_rate": 4.7079593058049075e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9993126451969147,
"num_tokens": 2282449.0,
"step": 2160
},
{
"entropy": 1.8445377826690674,
"epoch": 1.298623578695392,
"grad_norm": 0.018310546875,
"learning_rate": 4.70197486535009e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9992965221405029,
"num_tokens": 2293084.0,
"step": 2170
},
{
"entropy": 1.8734865069389344,
"epoch": 1.3046080191502094,
"grad_norm": 0.0322265625,
"learning_rate": 4.695990424895272e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9989112138748169,
"num_tokens": 2303735.0,
"step": 2180
},
{
"entropy": 1.9646783590316772,
"epoch": 1.310592459605027,
"grad_norm": 0.5078125,
"learning_rate": 4.690005984440455e-05,
"loss": 0.0188,
"mean_token_accuracy": 0.9973492562770844,
"num_tokens": 2314333.0,
"step": 2190
},
{
"entropy": 2.0574665307998656,
"epoch": 1.3165769000598444,
"grad_norm": 0.029296875,
"learning_rate": 4.6840215439856376e-05,
"loss": 0.0058,
"mean_token_accuracy": 0.9989279210567474,
"num_tokens": 2325056.0,
"step": 2200
},
{
"entropy": 1.9682793974876405,
"epoch": 1.322561340514662,
"grad_norm": 0.1142578125,
"learning_rate": 4.67803710353082e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9985645651817322,
"num_tokens": 2335451.0,
"step": 2210
},
{
"entropy": 1.978387975692749,
"epoch": 1.3285457809694794,
"grad_norm": 0.08740234375,
"learning_rate": 4.6720526630760024e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9985677063465118,
"num_tokens": 2345726.0,
"step": 2220
},
{
"entropy": 2.020477998256683,
"epoch": 1.334530221424297,
"grad_norm": 0.1005859375,
"learning_rate": 4.666068222621185e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9985411047935486,
"num_tokens": 2356376.0,
"step": 2230
},
{
"entropy": 1.9890066623687743,
"epoch": 1.3405146618791144,
"grad_norm": 0.036376953125,
"learning_rate": 4.660083782166368e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9989206194877625,
"num_tokens": 2367064.0,
"step": 2240
},
{
"entropy": 1.907881224155426,
"epoch": 1.3464991023339317,
"grad_norm": 1.4609375,
"learning_rate": 4.65409934171155e-05,
"loss": 0.0062,
"mean_token_accuracy": 0.9981145322322845,
"num_tokens": 2377443.0,
"step": 2250
},
{
"entropy": 1.8871219635009766,
"epoch": 1.3524835427887494,
"grad_norm": 0.048095703125,
"learning_rate": 4.6481149012567325e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9985643148422241,
"num_tokens": 2388013.0,
"step": 2260
},
{
"entropy": 1.8452912092208862,
"epoch": 1.3584679832435667,
"grad_norm": 0.439453125,
"learning_rate": 4.6421304608019155e-05,
"loss": 0.0112,
"mean_token_accuracy": 0.9982944905757904,
"num_tokens": 2398654.0,
"step": 2270
},
{
"entropy": 1.8592095136642457,
"epoch": 1.3644524236983842,
"grad_norm": 0.70703125,
"learning_rate": 4.636146020347098e-05,
"loss": 0.0102,
"mean_token_accuracy": 0.9971856594085693,
"num_tokens": 2409209.0,
"step": 2280
},
{
"entropy": 1.9233819246292114,
"epoch": 1.3704368641532017,
"grad_norm": 0.09033203125,
"learning_rate": 4.63016157989228e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9982663273811341,
"num_tokens": 2419620.0,
"step": 2290
},
{
"entropy": 1.888377809524536,
"epoch": 1.3764213046080191,
"grad_norm": 0.16015625,
"learning_rate": 4.6241771394374626e-05,
"loss": 0.0079,
"mean_token_accuracy": 0.9981661736965179,
"num_tokens": 2430017.0,
"step": 2300
},
{
"entropy": 1.8767491102218627,
"epoch": 1.3824057450628366,
"grad_norm": 0.84375,
"learning_rate": 4.618192698982645e-05,
"loss": 0.0069,
"mean_token_accuracy": 0.9989192366600037,
"num_tokens": 2440671.0,
"step": 2310
},
{
"entropy": 1.8048344016075135,
"epoch": 1.3883901855176541,
"grad_norm": 1.1875,
"learning_rate": 4.612208258527828e-05,
"loss": 0.0154,
"mean_token_accuracy": 0.9961064517498016,
"num_tokens": 2451241.0,
"step": 2320
},
{
"entropy": 1.7816383957862854,
"epoch": 1.3943746259724716,
"grad_norm": 1.390625,
"learning_rate": 4.6062238180730103e-05,
"loss": 0.0123,
"mean_token_accuracy": 0.9972090303897858,
"num_tokens": 2461831.0,
"step": 2330
},
{
"entropy": 1.803758704662323,
"epoch": 1.400359066427289,
"grad_norm": 0.12109375,
"learning_rate": 4.600239377618193e-05,
"loss": 0.014,
"mean_token_accuracy": 0.9982601046562195,
"num_tokens": 2472433.0,
"step": 2340
},
{
"entropy": 1.8111782312393188,
"epoch": 1.4063435068821066,
"grad_norm": 1.296875,
"learning_rate": 4.594254937163376e-05,
"loss": 0.0132,
"mean_token_accuracy": 0.997297465801239,
"num_tokens": 2483131.0,
"step": 2350
},
{
"entropy": 1.841699206829071,
"epoch": 1.412327947336924,
"grad_norm": 0.212890625,
"learning_rate": 4.588270496708558e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9989056468009949,
"num_tokens": 2493799.0,
"step": 2360
},
{
"entropy": 1.9263641238212585,
"epoch": 1.4183123877917414,
"grad_norm": 2.890625,
"learning_rate": 4.58228605625374e-05,
"loss": 0.0153,
"mean_token_accuracy": 0.9967702269554138,
"num_tokens": 2504450.0,
"step": 2370
},
{
"entropy": 1.8764405727386475,
"epoch": 1.424296828246559,
"grad_norm": 0.0159912109375,
"learning_rate": 4.576301615798923e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9996539771556854,
"num_tokens": 2515097.0,
"step": 2380
},
{
"entropy": 1.8099690794944763,
"epoch": 1.4302812687013764,
"grad_norm": 0.88671875,
"learning_rate": 4.570317175344105e-05,
"loss": 0.0081,
"mean_token_accuracy": 0.9982343196868897,
"num_tokens": 2525527.0,
"step": 2390
},
{
"entropy": 1.7947665095329284,
"epoch": 1.436265709156194,
"grad_norm": 0.023681640625,
"learning_rate": 4.564332734889288e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.999330735206604,
"num_tokens": 2536145.0,
"step": 2400
},
{
"entropy": 1.8335639595985413,
"epoch": 1.4422501496110114,
"grad_norm": 1.0234375,
"learning_rate": 4.5583482944344706e-05,
"loss": 0.0153,
"mean_token_accuracy": 0.9968725681304932,
"num_tokens": 2546774.0,
"step": 2410
},
{
"entropy": 1.857043170928955,
"epoch": 1.4482345900658289,
"grad_norm": 0.39453125,
"learning_rate": 4.552363853979653e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9982848763465881,
"num_tokens": 2557417.0,
"step": 2420
},
{
"entropy": 1.8625115990638732,
"epoch": 1.4542190305206464,
"grad_norm": 0.1318359375,
"learning_rate": 4.546379413524836e-05,
"loss": 0.0132,
"mean_token_accuracy": 0.9964711248874665,
"num_tokens": 2567952.0,
"step": 2430
},
{
"entropy": 1.8767970681190491,
"epoch": 1.4602034709754639,
"grad_norm": 1.015625,
"learning_rate": 4.5403949730700183e-05,
"loss": 0.0079,
"mean_token_accuracy": 0.9982850551605225,
"num_tokens": 2578461.0,
"step": 2440
},
{
"entropy": 1.8251904726028443,
"epoch": 1.4661879114302812,
"grad_norm": 0.2197265625,
"learning_rate": 4.5344105326152e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9989565730094909,
"num_tokens": 2588841.0,
"step": 2450
},
{
"entropy": 1.8160880088806153,
"epoch": 1.4721723518850989,
"grad_norm": 0.09326171875,
"learning_rate": 4.528426092160383e-05,
"loss": 0.0149,
"mean_token_accuracy": 0.9972167491912842,
"num_tokens": 2599306.0,
"step": 2460
},
{
"entropy": 1.8673020839691161,
"epoch": 1.4781567923399161,
"grad_norm": 1.5546875,
"learning_rate": 4.5224416517055654e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9992740452289581,
"num_tokens": 2609837.0,
"step": 2470
},
{
"entropy": 1.7704484939575196,
"epoch": 1.4841412327947336,
"grad_norm": 0.2734375,
"learning_rate": 4.516457211250748e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9985692203044891,
"num_tokens": 2620420.0,
"step": 2480
},
{
"entropy": 1.6958755135536194,
"epoch": 1.4901256732495511,
"grad_norm": 0.310546875,
"learning_rate": 4.510472770795931e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9993295550346375,
"num_tokens": 2630937.0,
"step": 2490
},
{
"entropy": 1.7750421166419983,
"epoch": 1.4961101137043686,
"grad_norm": 1.5,
"learning_rate": 4.504488330341113e-05,
"loss": 0.0063,
"mean_token_accuracy": 0.9982142150402069,
"num_tokens": 2641439.0,
"step": 2500
},
{
"entropy": 1.7922300696372986,
"epoch": 1.5020945541591861,
"grad_norm": 0.625,
"learning_rate": 4.498503889886296e-05,
"loss": 0.0066,
"mean_token_accuracy": 0.9982458829879761,
"num_tokens": 2652117.0,
"step": 2510
},
{
"entropy": 1.7609119415283203,
"epoch": 1.5080789946140036,
"grad_norm": 1.625,
"learning_rate": 4.4925194494314786e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9981802880764008,
"num_tokens": 2662835.0,
"step": 2520
},
{
"entropy": 1.6831945180892944,
"epoch": 1.5140634350688211,
"grad_norm": 1.671875,
"learning_rate": 4.48653500897666e-05,
"loss": 0.0101,
"mean_token_accuracy": 0.9968818426132202,
"num_tokens": 2673431.0,
"step": 2530
},
{
"entropy": 1.6552810072898865,
"epoch": 1.5200478755236384,
"grad_norm": 0.396484375,
"learning_rate": 4.480550568521843e-05,
"loss": 0.0095,
"mean_token_accuracy": 0.9975987613201142,
"num_tokens": 2684029.0,
"step": 2540
},
{
"entropy": 1.6987668871879578,
"epoch": 1.5260323159784561,
"grad_norm": 0.04931640625,
"learning_rate": 4.4745661280670257e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 2694474.0,
"step": 2550
},
{
"entropy": 1.6680127382278442,
"epoch": 1.5320167564332734,
"grad_norm": 0.041015625,
"learning_rate": 4.468581687612208e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9989441931247711,
"num_tokens": 2705076.0,
"step": 2560
},
{
"entropy": 1.6850673317909242,
"epoch": 1.5380011968880911,
"grad_norm": 0.8359375,
"learning_rate": 4.462597247157391e-05,
"loss": 0.0079,
"mean_token_accuracy": 0.9978463172912597,
"num_tokens": 2715754.0,
"step": 2570
},
{
"entropy": 1.6692217469215394,
"epoch": 1.5439856373429084,
"grad_norm": 0.048828125,
"learning_rate": 4.4566128067025734e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9985604822635651,
"num_tokens": 2726154.0,
"step": 2580
},
{
"entropy": 1.7106538891792298,
"epoch": 1.5499700777977259,
"grad_norm": 0.03662109375,
"learning_rate": 4.4506283662477564e-05,
"loss": 0.0064,
"mean_token_accuracy": 0.9978056967258453,
"num_tokens": 2736682.0,
"step": 2590
},
{
"entropy": 1.724018120765686,
"epoch": 1.5559545182525434,
"grad_norm": 0.0810546875,
"learning_rate": 4.444643925792939e-05,
"loss": 0.0116,
"mean_token_accuracy": 0.9978777050971985,
"num_tokens": 2747261.0,
"step": 2600
},
{
"entropy": 1.8123233675956727,
"epoch": 1.5619389587073609,
"grad_norm": 0.345703125,
"learning_rate": 4.438659485338121e-05,
"loss": 0.0118,
"mean_token_accuracy": 0.9970783770084382,
"num_tokens": 2757969.0,
"step": 2610
},
{
"entropy": 1.7606332778930665,
"epoch": 1.5679233991621784,
"grad_norm": 0.037353515625,
"learning_rate": 4.4326750448833035e-05,
"loss": 0.0099,
"mean_token_accuracy": 0.998226261138916,
"num_tokens": 2768599.0,
"step": 2620
},
{
"entropy": 1.7248129010200501,
"epoch": 1.5739078396169957,
"grad_norm": 0.033203125,
"learning_rate": 4.426690604428486e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9982411623001098,
"num_tokens": 2779152.0,
"step": 2630
},
{
"entropy": 1.6984099984169005,
"epoch": 1.5798922800718134,
"grad_norm": 0.30078125,
"learning_rate": 4.420706163973668e-05,
"loss": 0.0099,
"mean_token_accuracy": 0.9974049687385559,
"num_tokens": 2789672.0,
"step": 2640
},
{
"entropy": 1.7985439896583557,
"epoch": 1.5858767205266306,
"grad_norm": 1.1484375,
"learning_rate": 4.414721723518851e-05,
"loss": 0.0068,
"mean_token_accuracy": 0.9985246062278748,
"num_tokens": 2800058.0,
"step": 2650
},
{
"entropy": 1.848671793937683,
"epoch": 1.5918611609814484,
"grad_norm": 0.0245361328125,
"learning_rate": 4.4087372830640336e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9985762298107147,
"num_tokens": 2810671.0,
"step": 2660
},
{
"entropy": 1.8393208622932433,
"epoch": 1.5978456014362656,
"grad_norm": 0.027587890625,
"learning_rate": 4.402752842609216e-05,
"loss": 0.0068,
"mean_token_accuracy": 0.9992953419685364,
"num_tokens": 2821280.0,
"step": 2670
},
{
"entropy": 1.763173222541809,
"epoch": 1.6038300418910831,
"grad_norm": 0.1279296875,
"learning_rate": 4.396768402154399e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.998900830745697,
"num_tokens": 2831716.0,
"step": 2680
},
{
"entropy": 1.8260416030883788,
"epoch": 1.6098144823459006,
"grad_norm": 0.0233154296875,
"learning_rate": 4.3907839616995814e-05,
"loss": 0.0069,
"mean_token_accuracy": 0.9986373245716095,
"num_tokens": 2842147.0,
"step": 2690
},
{
"entropy": 1.8105480790138244,
"epoch": 1.6157989228007181,
"grad_norm": 0.423828125,
"learning_rate": 4.384799521244764e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9993321299552917,
"num_tokens": 2852899.0,
"step": 2700
},
{
"entropy": 1.7745528101921082,
"epoch": 1.6217833632555356,
"grad_norm": 1.4296875,
"learning_rate": 4.378815080789946e-05,
"loss": 0.0117,
"mean_token_accuracy": 0.9975007236003876,
"num_tokens": 2863429.0,
"step": 2710
},
{
"entropy": 1.7714335441589355,
"epoch": 1.6277678037103531,
"grad_norm": 0.0169677734375,
"learning_rate": 4.3728306403351285e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9993084073066711,
"num_tokens": 2873934.0,
"step": 2720
},
{
"entropy": 1.7836796760559082,
"epoch": 1.6337522441651706,
"grad_norm": 0.00933837890625,
"learning_rate": 4.3668461998803115e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.998683512210846,
"num_tokens": 2884469.0,
"step": 2730
},
{
"entropy": 1.7828012824058532,
"epoch": 1.639736684619988,
"grad_norm": 0.5078125,
"learning_rate": 4.360861759425494e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9992062389850617,
"num_tokens": 2894762.0,
"step": 2740
},
{
"entropy": 1.835528552532196,
"epoch": 1.6457211250748056,
"grad_norm": 0.5234375,
"learning_rate": 4.354877318970676e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9989313840866089,
"num_tokens": 2905249.0,
"step": 2750
},
{
"entropy": 1.9072396397590636,
"epoch": 1.6517055655296229,
"grad_norm": 1.4921875,
"learning_rate": 4.348892878515859e-05,
"loss": 0.0191,
"mean_token_accuracy": 0.9963602304458619,
"num_tokens": 2915557.0,
"step": 2760
},
{
"entropy": 2.0372175931930543,
"epoch": 1.6576900059844406,
"grad_norm": 0.025146484375,
"learning_rate": 4.3429084380610416e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9993102729320527,
"num_tokens": 2926153.0,
"step": 2770
},
{
"entropy": 2.0790203332901003,
"epoch": 1.6636744464392579,
"grad_norm": 1.1796875,
"learning_rate": 4.336923997606223e-05,
"loss": 0.0075,
"mean_token_accuracy": 0.9975761890411377,
"num_tokens": 2936597.0,
"step": 2780
},
{
"entropy": 1.9823689937591553,
"epoch": 1.6696588868940754,
"grad_norm": 0.0308837890625,
"learning_rate": 4.3309395571514063e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.9985705196857453,
"num_tokens": 2947210.0,
"step": 2790
},
{
"entropy": 1.865414524078369,
"epoch": 1.6756433273488929,
"grad_norm": 1.4140625,
"learning_rate": 4.324955116696589e-05,
"loss": 0.0168,
"mean_token_accuracy": 0.9971051633358001,
"num_tokens": 2957718.0,
"step": 2800
},
{
"entropy": 1.8666903614997863,
"epoch": 1.6816277678037104,
"grad_norm": 0.7890625,
"learning_rate": 4.318970676241772e-05,
"loss": 0.0082,
"mean_token_accuracy": 0.998199051618576,
"num_tokens": 2968277.0,
"step": 2810
},
{
"entropy": 1.8577040791511537,
"epoch": 1.6876122082585279,
"grad_norm": 0.046875,
"learning_rate": 4.312986235786954e-05,
"loss": 0.0124,
"mean_token_accuracy": 0.9985236465930939,
"num_tokens": 2978802.0,
"step": 2820
},
{
"entropy": 1.8664780139923096,
"epoch": 1.6935966487133451,
"grad_norm": 0.05810546875,
"learning_rate": 4.3070017953321365e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9992798209190369,
"num_tokens": 2989285.0,
"step": 2830
},
{
"entropy": 1.8037607192993164,
"epoch": 1.6995810891681629,
"grad_norm": 0.376953125,
"learning_rate": 4.3010173548773195e-05,
"loss": 0.0155,
"mean_token_accuracy": 0.9978967070579529,
"num_tokens": 2999866.0,
"step": 2840
},
{
"entropy": 1.7806638836860658,
"epoch": 1.7055655296229801,
"grad_norm": 2.625,
"learning_rate": 4.295032914422502e-05,
"loss": 0.0143,
"mean_token_accuracy": 0.995397436618805,
"num_tokens": 3010270.0,
"step": 2850
},
{
"entropy": 1.6426480889320374,
"epoch": 1.7115499700777979,
"grad_norm": 0.1328125,
"learning_rate": 4.2890484739676835e-05,
"loss": 0.0099,
"mean_token_accuracy": 0.997680002450943,
"num_tokens": 3020978.0,
"step": 2860
},
{
"entropy": 1.570683479309082,
"epoch": 1.7175344105326151,
"grad_norm": 0.61328125,
"learning_rate": 4.2830640335128666e-05,
"loss": 0.0141,
"mean_token_accuracy": 0.997533792257309,
"num_tokens": 3031702.0,
"step": 2870
},
{
"entropy": 1.5966361045837403,
"epoch": 1.7235188509874326,
"grad_norm": 0.048828125,
"learning_rate": 4.277079593058049e-05,
"loss": 0.0052,
"mean_token_accuracy": 0.9989168524742127,
"num_tokens": 3042239.0,
"step": 2880
},
{
"entropy": 1.699927806854248,
"epoch": 1.7295032914422501,
"grad_norm": 0.71484375,
"learning_rate": 4.271095152603232e-05,
"loss": 0.009,
"mean_token_accuracy": 0.9989072799682617,
"num_tokens": 3052721.0,
"step": 2890
},
{
"entropy": 1.7357771277427674,
"epoch": 1.7354877318970676,
"grad_norm": 0.2578125,
"learning_rate": 4.2651107121484143e-05,
"loss": 0.0066,
"mean_token_accuracy": 0.9988929867744446,
"num_tokens": 3063139.0,
"step": 2900
},
{
"entropy": 1.7626645684242248,
"epoch": 1.7414721723518851,
"grad_norm": 0.025146484375,
"learning_rate": 4.259126271693597e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.999007374048233,
"num_tokens": 3073708.0,
"step": 2910
},
{
"entropy": 1.7304651737213135,
"epoch": 1.7474566128067026,
"grad_norm": 1.7421875,
"learning_rate": 4.25314183123878e-05,
"loss": 0.0052,
"mean_token_accuracy": 0.9979542791843414,
"num_tokens": 3084324.0,
"step": 2920
},
{
"entropy": 1.6680734276771545,
"epoch": 1.75344105326152,
"grad_norm": 2.96875,
"learning_rate": 4.247157390783962e-05,
"loss": 0.0073,
"mean_token_accuracy": 0.9978868365287781,
"num_tokens": 3094786.0,
"step": 2930
},
{
"entropy": 1.5931068778038024,
"epoch": 1.7594254937163374,
"grad_norm": 0.63671875,
"learning_rate": 4.241172950329144e-05,
"loss": 0.0067,
"mean_token_accuracy": 0.9985966563224793,
"num_tokens": 3105311.0,
"step": 2940
},
{
"entropy": 1.7409747838974,
"epoch": 1.765409934171155,
"grad_norm": 0.4921875,
"learning_rate": 4.235188509874327e-05,
"loss": 0.0078,
"mean_token_accuracy": 0.9985540926456451,
"num_tokens": 3115948.0,
"step": 2950
},
{
"entropy": 1.7411438941955566,
"epoch": 1.7713943746259724,
"grad_norm": 0.07958984375,
"learning_rate": 4.229204069419509e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9996478855609894,
"num_tokens": 3126449.0,
"step": 2960
},
{
"entropy": 1.652970790863037,
"epoch": 1.77737881508079,
"grad_norm": 0.032958984375,
"learning_rate": 4.2232196289646915e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 3137023.0,
"step": 2970
},
{
"entropy": 1.6763949155807496,
"epoch": 1.7833632555356074,
"grad_norm": 0.09619140625,
"learning_rate": 4.2172351885098746e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 3147623.0,
"step": 2980
},
{
"entropy": 1.6129281163215636,
"epoch": 1.7893476959904249,
"grad_norm": 1.1015625,
"learning_rate": 4.211250748055057e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9985423862934113,
"num_tokens": 3158087.0,
"step": 2990
},
{
"entropy": 1.5717525839805604,
"epoch": 1.7953321364452424,
"grad_norm": 0.020263671875,
"learning_rate": 4.20526630760024e-05,
"loss": 0.0072,
"mean_token_accuracy": 0.9992836058139801,
"num_tokens": 3168795.0,
"step": 3000
},
{
"entropy": 1.5834394574165345,
"epoch": 1.8013165769000599,
"grad_norm": 0.0218505859375,
"learning_rate": 4.199281867145422e-05,
"loss": 0.0156,
"mean_token_accuracy": 0.9971341669559479,
"num_tokens": 3179498.0,
"step": 3010
},
{
"entropy": 1.6837530136108398,
"epoch": 1.8073010173548774,
"grad_norm": 1.296875,
"learning_rate": 4.193297426690604e-05,
"loss": 0.0168,
"mean_token_accuracy": 0.9972832024097442,
"num_tokens": 3190100.0,
"step": 3020
},
{
"entropy": 1.8728898286819458,
"epoch": 1.8132854578096946,
"grad_norm": 0.984375,
"learning_rate": 4.187312986235787e-05,
"loss": 0.0061,
"mean_token_accuracy": 0.9981972455978394,
"num_tokens": 3200638.0,
"step": 3030
},
{
"entropy": 1.7914460897445679,
"epoch": 1.8192698982645124,
"grad_norm": 0.55859375,
"learning_rate": 4.1813285457809694e-05,
"loss": 0.007,
"mean_token_accuracy": 0.997859263420105,
"num_tokens": 3211123.0,
"step": 3040
},
{
"entropy": 1.7274253368377686,
"epoch": 1.8252543387193296,
"grad_norm": 0.671875,
"learning_rate": 4.175344105326152e-05,
"loss": 0.0055,
"mean_token_accuracy": 0.9982551515102387,
"num_tokens": 3221763.0,
"step": 3050
},
{
"entropy": 1.6481835126876831,
"epoch": 1.8312387791741473,
"grad_norm": 0.0179443359375,
"learning_rate": 4.169359664871335e-05,
"loss": 0.0059,
"mean_token_accuracy": 0.9985887408256531,
"num_tokens": 3232290.0,
"step": 3060
},
{
"entropy": 1.6477917194366456,
"epoch": 1.8372232196289646,
"grad_norm": 0.032470703125,
"learning_rate": 4.163375224416517e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9989434242248535,
"num_tokens": 3242758.0,
"step": 3070
},
{
"entropy": 1.7404741525650025,
"epoch": 1.8432076600837821,
"grad_norm": 0.044677734375,
"learning_rate": 4.1573907839616995e-05,
"loss": 0.0102,
"mean_token_accuracy": 0.9978300333023071,
"num_tokens": 3253221.0,
"step": 3080
},
{
"entropy": 1.7434081315994263,
"epoch": 1.8491921005385996,
"grad_norm": 0.1015625,
"learning_rate": 4.1514063435068826e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9996563553810119,
"num_tokens": 3263868.0,
"step": 3090
},
{
"entropy": 1.7428674221038818,
"epoch": 1.8551765409934171,
"grad_norm": 0.1640625,
"learning_rate": 4.145421903052065e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9989525496959686,
"num_tokens": 3274441.0,
"step": 3100
},
{
"entropy": 1.7549742221832276,
"epoch": 1.8611609814482346,
"grad_norm": 0.027099609375,
"learning_rate": 4.139437462597247e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996551752090455,
"num_tokens": 3285089.0,
"step": 3110
},
{
"entropy": 1.7254215598106384,
"epoch": 1.867145421903052,
"grad_norm": 0.2255859375,
"learning_rate": 4.1334530221424296e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.999311363697052,
"num_tokens": 3295802.0,
"step": 3120
},
{
"entropy": 1.6737028002738952,
"epoch": 1.8731298623578696,
"grad_norm": 0.5234375,
"learning_rate": 4.127468581687612e-05,
"loss": 0.0086,
"mean_token_accuracy": 0.9982663750648498,
"num_tokens": 3306563.0,
"step": 3130
},
{
"entropy": 1.725145435333252,
"epoch": 1.8791143028126869,
"grad_norm": 0.037841796875,
"learning_rate": 4.121484141232795e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9992632687091827,
"num_tokens": 3317087.0,
"step": 3140
},
{
"entropy": 1.7711864471435548,
"epoch": 1.8850987432675046,
"grad_norm": 0.224609375,
"learning_rate": 4.1154997007779774e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9992907822132111,
"num_tokens": 3327581.0,
"step": 3150
},
{
"entropy": 1.7485621452331543,
"epoch": 1.8910831837223219,
"grad_norm": 0.609375,
"learning_rate": 4.10951526032316e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9996168553829193,
"num_tokens": 3338131.0,
"step": 3160
},
{
"entropy": 1.6710041880607605,
"epoch": 1.8970676241771396,
"grad_norm": 0.059326171875,
"learning_rate": 4.103530819868343e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9993249118328095,
"num_tokens": 3348685.0,
"step": 3170
},
{
"entropy": 1.696055793762207,
"epoch": 1.9030520646319569,
"grad_norm": 0.01708984375,
"learning_rate": 4.097546379413525e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9989370882511139,
"num_tokens": 3359299.0,
"step": 3180
},
{
"entropy": 1.6661385297775269,
"epoch": 1.9090365050867744,
"grad_norm": 1.4140625,
"learning_rate": 4.0915619389587075e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.998573511838913,
"num_tokens": 3369753.0,
"step": 3190
},
{
"entropy": 1.6959844470024108,
"epoch": 1.9150209455415919,
"grad_norm": 0.369140625,
"learning_rate": 4.08557749850389e-05,
"loss": 0.0072,
"mean_token_accuracy": 0.9981562256813049,
"num_tokens": 3380273.0,
"step": 3200
},
{
"entropy": 1.6790688037872314,
"epoch": 1.9210053859964094,
"grad_norm": 0.0693359375,
"learning_rate": 4.079593058049072e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.999646645784378,
"num_tokens": 3390726.0,
"step": 3210
},
{
"entropy": 1.6851406931877135,
"epoch": 1.9269898264512269,
"grad_norm": 0.01177978515625,
"learning_rate": 4.073608617594255e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9992761254310608,
"num_tokens": 3401436.0,
"step": 3220
},
{
"entropy": 1.6872121214866638,
"epoch": 1.9329742669060441,
"grad_norm": 1.0546875,
"learning_rate": 4.0676241771394376e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9989988803863525,
"num_tokens": 3412090.0,
"step": 3230
},
{
"entropy": 1.716448712348938,
"epoch": 1.9389587073608618,
"grad_norm": 1.0390625,
"learning_rate": 4.06163973668462e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9992851316928864,
"num_tokens": 3422680.0,
"step": 3240
},
{
"entropy": 1.7055083990097046,
"epoch": 1.9449431478156791,
"grad_norm": 0.2109375,
"learning_rate": 4.055655296229803e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9986045360565186,
"num_tokens": 3433240.0,
"step": 3250
},
{
"entropy": 1.6845415115356446,
"epoch": 1.9509275882704968,
"grad_norm": 0.53515625,
"learning_rate": 4.0496708557749854e-05,
"loss": 0.006,
"mean_token_accuracy": 0.9979104697704315,
"num_tokens": 3443914.0,
"step": 3260
},
{
"entropy": 1.6189923405647277,
"epoch": 1.9569120287253141,
"grad_norm": 0.703125,
"learning_rate": 4.043686415320167e-05,
"loss": 0.0082,
"mean_token_accuracy": 0.9978470265865326,
"num_tokens": 3454396.0,
"step": 3270
},
{
"entropy": 1.6343070983886718,
"epoch": 1.9628964691801316,
"grad_norm": 0.029052734375,
"learning_rate": 4.03770197486535e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9992868721485137,
"num_tokens": 3465124.0,
"step": 3280
},
{
"entropy": 1.6529561638832093,
"epoch": 1.968880909634949,
"grad_norm": 0.054931640625,
"learning_rate": 4.0317175344105325e-05,
"loss": 0.0081,
"mean_token_accuracy": 0.998256516456604,
"num_tokens": 3475684.0,
"step": 3290
},
{
"entropy": 1.6316777467727661,
"epoch": 1.9748653500897666,
"grad_norm": 1.8125,
"learning_rate": 4.0257330939557155e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9990007400512695,
"num_tokens": 3486230.0,
"step": 3300
},
{
"entropy": 1.6272296071052552,
"epoch": 1.980849790544584,
"grad_norm": 0.130859375,
"learning_rate": 4.019748653500898e-05,
"loss": 0.0099,
"mean_token_accuracy": 0.9979246437549592,
"num_tokens": 3496872.0,
"step": 3310
},
{
"entropy": 1.5232258319854737,
"epoch": 1.9868342309994016,
"grad_norm": 0.0281982421875,
"learning_rate": 4.01376421304608e-05,
"loss": 0.0056,
"mean_token_accuracy": 0.9988779187202453,
"num_tokens": 3507359.0,
"step": 3320
},
{
"entropy": 1.4930276036262513,
"epoch": 1.992818671454219,
"grad_norm": 0.0225830078125,
"learning_rate": 4.007779772591263e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9993079602718353,
"num_tokens": 3517875.0,
"step": 3330
},
{
"entropy": 1.473480725288391,
"epoch": 1.9988031119090364,
"grad_norm": 0.171875,
"learning_rate": 4.0017953321364456e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.999308729171753,
"num_tokens": 3528455.0,
"step": 3340
},
{
"entropy": 1.4143877744674682,
"epoch": 2.004787552363854,
"grad_norm": 0.35546875,
"learning_rate": 3.995810891681627e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9985805928707123,
"num_tokens": 3538886.0,
"step": 3350
},
{
"entropy": 1.4655809164047242,
"epoch": 2.0107719928186714,
"grad_norm": 0.10400390625,
"learning_rate": 3.98982645122681e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 3549329.0,
"step": 3360
},
{
"entropy": 1.5789464473724366,
"epoch": 2.016756433273489,
"grad_norm": 0.02783203125,
"learning_rate": 3.983842010771993e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9988843858242035,
"num_tokens": 3559966.0,
"step": 3370
},
{
"entropy": 1.6663800716400146,
"epoch": 2.0227408737283064,
"grad_norm": 0.404296875,
"learning_rate": 3.977857570317175e-05,
"loss": 0.0022,
"mean_token_accuracy": 0.999292403459549,
"num_tokens": 3570588.0,
"step": 3380
},
{
"entropy": 1.5711905360221863,
"epoch": 2.028725314183124,
"grad_norm": 0.1337890625,
"learning_rate": 3.971873129862358e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9993067562580109,
"num_tokens": 3581142.0,
"step": 3390
},
{
"entropy": 1.5718475818634032,
"epoch": 2.0347097546379413,
"grad_norm": 0.054931640625,
"learning_rate": 3.9658886894075405e-05,
"loss": 0.0058,
"mean_token_accuracy": 0.9988965094089508,
"num_tokens": 3591649.0,
"step": 3400
},
{
"entropy": 1.667174506187439,
"epoch": 2.0406941950927586,
"grad_norm": 0.095703125,
"learning_rate": 3.9599042489527235e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9992761373519897,
"num_tokens": 3602268.0,
"step": 3410
},
{
"entropy": 1.6831801772117614,
"epoch": 2.0466786355475763,
"grad_norm": 0.072265625,
"learning_rate": 3.953919808497906e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.999646645784378,
"num_tokens": 3613010.0,
"step": 3420
},
{
"entropy": 1.6643374800682067,
"epoch": 2.0526630760023936,
"grad_norm": 0.0859375,
"learning_rate": 3.9479353680430875e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9996296286582946,
"num_tokens": 3623611.0,
"step": 3430
},
{
"entropy": 1.5809536933898927,
"epoch": 2.0586475164572113,
"grad_norm": 0.0198974609375,
"learning_rate": 3.9419509275882706e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.999646645784378,
"num_tokens": 3634063.0,
"step": 3440
},
{
"entropy": 1.6075554132461547,
"epoch": 2.0646319569120286,
"grad_norm": 0.0712890625,
"learning_rate": 3.935966487133453e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.998986691236496,
"num_tokens": 3644779.0,
"step": 3450
},
{
"entropy": 1.5977044224739074,
"epoch": 2.0706163973668463,
"grad_norm": 0.02734375,
"learning_rate": 3.929982046678635e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 3655353.0,
"step": 3460
},
{
"entropy": 1.5767899036407471,
"epoch": 2.0766008378216636,
"grad_norm": 0.515625,
"learning_rate": 3.923997606223818e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 3665953.0,
"step": 3470
},
{
"entropy": 1.57765634059906,
"epoch": 2.0825852782764813,
"grad_norm": 0.55078125,
"learning_rate": 3.918013165769001e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9988885760307312,
"num_tokens": 3676317.0,
"step": 3480
},
{
"entropy": 1.5800118088722228,
"epoch": 2.0885697187312986,
"grad_norm": 0.4453125,
"learning_rate": 3.912028725314184e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9989434778690338,
"num_tokens": 3686778.0,
"step": 3490
},
{
"entropy": 1.5637386918067933,
"epoch": 2.0945541591861163,
"grad_norm": 0.046875,
"learning_rate": 3.906044284859366e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.998903077840805,
"num_tokens": 3697341.0,
"step": 3500
},
{
"entropy": 1.5040214300155639,
"epoch": 2.1005385996409336,
"grad_norm": 0.0157470703125,
"learning_rate": 3.900059844404548e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9992480218410492,
"num_tokens": 3707750.0,
"step": 3510
},
{
"entropy": 1.53771892786026,
"epoch": 2.106523040095751,
"grad_norm": 0.02587890625,
"learning_rate": 3.894075403949731e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9992437899112702,
"num_tokens": 3718304.0,
"step": 3520
},
{
"entropy": 1.5398314595222473,
"epoch": 2.1125074805505686,
"grad_norm": 0.06884765625,
"learning_rate": 3.888090963494913e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9985848188400268,
"num_tokens": 3728881.0,
"step": 3530
},
{
"entropy": 1.5595148921012878,
"epoch": 2.118491921005386,
"grad_norm": 0.01031494140625,
"learning_rate": 3.8821065230400955e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996491253376008,
"num_tokens": 3739469.0,
"step": 3540
},
{
"entropy": 1.6275775909423829,
"epoch": 2.1244763614602036,
"grad_norm": 0.08642578125,
"learning_rate": 3.8761220825852786e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 3750007.0,
"step": 3550
},
{
"entropy": 1.5797725796699524,
"epoch": 2.130460801915021,
"grad_norm": 0.0390625,
"learning_rate": 3.870137642130461e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.998933631181717,
"num_tokens": 3760603.0,
"step": 3560
},
{
"entropy": 1.518853211402893,
"epoch": 2.1364452423698386,
"grad_norm": 0.0299072265625,
"learning_rate": 3.864153201675643e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 3771047.0,
"step": 3570
},
{
"entropy": 1.4537081837654113,
"epoch": 2.142429682824656,
"grad_norm": 0.0869140625,
"learning_rate": 3.858168761220826e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 3781678.0,
"step": 3580
},
{
"entropy": 1.4374495148658752,
"epoch": 2.1484141232794736,
"grad_norm": 0.059326171875,
"learning_rate": 3.852184320766009e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9989359259605408,
"num_tokens": 3792139.0,
"step": 3590
},
{
"entropy": 1.4887493014335633,
"epoch": 2.154398563734291,
"grad_norm": 0.061767578125,
"learning_rate": 3.846199880311191e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9985256731510163,
"num_tokens": 3802586.0,
"step": 3600
},
{
"entropy": 1.5625263094902038,
"epoch": 2.160383004189108,
"grad_norm": 0.412109375,
"learning_rate": 3.8402154398563734e-05,
"loss": 0.003,
"mean_token_accuracy": 0.998971951007843,
"num_tokens": 3813115.0,
"step": 3610
},
{
"entropy": 1.6105829358100892,
"epoch": 2.166367444643926,
"grad_norm": 0.12890625,
"learning_rate": 3.834230999401556e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9989726841449738,
"num_tokens": 3823587.0,
"step": 3620
},
{
"entropy": 1.543606126308441,
"epoch": 2.172351885098743,
"grad_norm": 0.01458740234375,
"learning_rate": 3.828246558946739e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9992787480354309,
"num_tokens": 3834170.0,
"step": 3630
},
{
"entropy": 1.5743085980415343,
"epoch": 2.178336325553561,
"grad_norm": 0.56640625,
"learning_rate": 3.822262118491921e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9985100030899048,
"num_tokens": 3844656.0,
"step": 3640
},
{
"entropy": 1.655414378643036,
"epoch": 2.184320766008378,
"grad_norm": 0.5625,
"learning_rate": 3.8162776780371035e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9978154480457306,
"num_tokens": 3855329.0,
"step": 3650
},
{
"entropy": 1.5091199278831482,
"epoch": 2.190305206463196,
"grad_norm": 0.00592041015625,
"learning_rate": 3.8102932375822866e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 3865991.0,
"step": 3660
},
{
"entropy": 1.4430976867675782,
"epoch": 2.196289646918013,
"grad_norm": 0.5390625,
"learning_rate": 3.804308797127469e-05,
"loss": 0.0061,
"mean_token_accuracy": 0.9989435613155365,
"num_tokens": 3876625.0,
"step": 3670
},
{
"entropy": 1.4487504601478576,
"epoch": 2.202274087372831,
"grad_norm": 0.0213623046875,
"learning_rate": 3.7983243566726506e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9992740869522094,
"num_tokens": 3887170.0,
"step": 3680
},
{
"entropy": 1.4686145186424255,
"epoch": 2.208258527827648,
"grad_norm": 0.0230712890625,
"learning_rate": 3.7923399162178336e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 3897911.0,
"step": 3690
},
{
"entropy": 1.4431620836257935,
"epoch": 2.2142429682824654,
"grad_norm": 0.77734375,
"learning_rate": 3.786355475763016e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9993333339691162,
"num_tokens": 3908336.0,
"step": 3700
},
{
"entropy": 1.4750329613685609,
"epoch": 2.220227408737283,
"grad_norm": 0.00927734375,
"learning_rate": 3.780371035308199e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 3918781.0,
"step": 3710
},
{
"entropy": 1.5059397101402283,
"epoch": 2.2262118491921004,
"grad_norm": 0.00958251953125,
"learning_rate": 3.7743865948533814e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 3929424.0,
"step": 3720
},
{
"entropy": 1.464370334148407,
"epoch": 2.232196289646918,
"grad_norm": 0.01165771484375,
"learning_rate": 3.768402154398564e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9993080615997314,
"num_tokens": 3939959.0,
"step": 3730
},
{
"entropy": 1.461155390739441,
"epoch": 2.2381807301017353,
"grad_norm": 0.06201171875,
"learning_rate": 3.762417713943747e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 3950544.0,
"step": 3740
},
{
"entropy": 1.4727044582366944,
"epoch": 2.244165170556553,
"grad_norm": 0.09033203125,
"learning_rate": 3.756433273488929e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9992726743221283,
"num_tokens": 3960981.0,
"step": 3750
},
{
"entropy": 1.5011940956115724,
"epoch": 2.2501496110113703,
"grad_norm": 0.25390625,
"learning_rate": 3.750448833034111e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9992779731750489,
"num_tokens": 3971564.0,
"step": 3760
},
{
"entropy": 1.4078798294067383,
"epoch": 2.256134051466188,
"grad_norm": 0.103515625,
"learning_rate": 3.744464392579294e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9996212124824524,
"num_tokens": 3982005.0,
"step": 3770
},
{
"entropy": 1.4928112626075745,
"epoch": 2.2621184919210053,
"grad_norm": 0.021728515625,
"learning_rate": 3.738479952124476e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9996710538864135,
"num_tokens": 3992692.0,
"step": 3780
},
{
"entropy": 1.5147884964942933,
"epoch": 2.2681029323758226,
"grad_norm": 1.1015625,
"learning_rate": 3.732495511669659e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9985506594181061,
"num_tokens": 4003180.0,
"step": 3790
},
{
"entropy": 1.541051721572876,
"epoch": 2.2740873728306403,
"grad_norm": 0.021728515625,
"learning_rate": 3.7265110712148416e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 4013778.0,
"step": 3800
},
{
"entropy": 1.5526644825935363,
"epoch": 2.280071813285458,
"grad_norm": 0.58203125,
"learning_rate": 3.720526630760024e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.999664431810379,
"num_tokens": 4024306.0,
"step": 3810
},
{
"entropy": 1.516521191596985,
"epoch": 2.2860562537402753,
"grad_norm": 1.2578125,
"learning_rate": 3.714542190305207e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9985815584659576,
"num_tokens": 4034819.0,
"step": 3820
},
{
"entropy": 1.4400922060012817,
"epoch": 2.2920406941950926,
"grad_norm": 0.033203125,
"learning_rate": 3.7085577498503894e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9993127107620239,
"num_tokens": 4045226.0,
"step": 3830
},
{
"entropy": 1.454219126701355,
"epoch": 2.2980251346499103,
"grad_norm": 0.58203125,
"learning_rate": 3.702573309395571e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9996282517910003,
"num_tokens": 4055858.0,
"step": 3840
},
{
"entropy": 1.423510491847992,
"epoch": 2.3040095751047276,
"grad_norm": 0.458984375,
"learning_rate": 3.696588868940754e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9992446184158326,
"num_tokens": 4066452.0,
"step": 3850
},
{
"entropy": 1.453119683265686,
"epoch": 2.3099940155595453,
"grad_norm": 0.037109375,
"learning_rate": 3.6906044284859365e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9992790877819061,
"num_tokens": 4077100.0,
"step": 3860
},
{
"entropy": 1.3985321760177611,
"epoch": 2.3159784560143626,
"grad_norm": 0.09033203125,
"learning_rate": 3.684619988031119e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 4087538.0,
"step": 3870
},
{
"entropy": 1.4676225781440735,
"epoch": 2.3219628964691803,
"grad_norm": 0.010498046875,
"learning_rate": 3.678635547576302e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9989520311355591,
"num_tokens": 4098159.0,
"step": 3880
},
{
"entropy": 1.4613074779510498,
"epoch": 2.3279473369239976,
"grad_norm": 0.00579833984375,
"learning_rate": 3.672651107121484e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9989755749702454,
"num_tokens": 4108693.0,
"step": 3890
},
{
"entropy": 1.4477983593940735,
"epoch": 2.3339317773788153,
"grad_norm": 0.06787109375,
"learning_rate": 3.666666666666667e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9993051826953888,
"num_tokens": 4119341.0,
"step": 3900
},
{
"entropy": 1.442084550857544,
"epoch": 2.3399162178336326,
"grad_norm": 0.0546875,
"learning_rate": 3.6606822262118496e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996254682540894,
"num_tokens": 4129710.0,
"step": 3910
},
{
"entropy": 1.4917728424072265,
"epoch": 2.34590065828845,
"grad_norm": 0.064453125,
"learning_rate": 3.654697785757031e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.999326479434967,
"num_tokens": 4140404.0,
"step": 3920
},
{
"entropy": 1.4556031465530395,
"epoch": 2.3518850987432676,
"grad_norm": 0.138671875,
"learning_rate": 3.648713345302214e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 4150763.0,
"step": 3930
},
{
"entropy": 1.594400453567505,
"epoch": 2.357869539198085,
"grad_norm": 0.01397705078125,
"learning_rate": 3.642728904847397e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9992455244064331,
"num_tokens": 4161320.0,
"step": 3940
},
{
"entropy": 1.4968407869338989,
"epoch": 2.3638539796529026,
"grad_norm": 0.0205078125,
"learning_rate": 3.636744464392579e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9992551147937775,
"num_tokens": 4171550.0,
"step": 3950
},
{
"entropy": 1.4865975022315978,
"epoch": 2.36983842010772,
"grad_norm": 0.0205078125,
"learning_rate": 3.630760023937762e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9993265926837921,
"num_tokens": 4182415.0,
"step": 3960
},
{
"entropy": 1.42852680683136,
"epoch": 2.3758228605625376,
"grad_norm": 0.0272216796875,
"learning_rate": 3.6247755834829444e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9996515691280365,
"num_tokens": 4193040.0,
"step": 3970
},
{
"entropy": 1.4439242601394653,
"epoch": 2.381807301017355,
"grad_norm": 0.0966796875,
"learning_rate": 3.618791143028127e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996632993221283,
"num_tokens": 4203681.0,
"step": 3980
},
{
"entropy": 1.4410004258155822,
"epoch": 2.3877917414721725,
"grad_norm": 0.0084228515625,
"learning_rate": 3.61280670257331e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9992619931697846,
"num_tokens": 4214173.0,
"step": 3990
},
{
"entropy": 1.3759098768234252,
"epoch": 2.39377618192699,
"grad_norm": 5.96875,
"learning_rate": 3.606822262118492e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9993297576904296,
"num_tokens": 4224654.0,
"step": 4000
},
{
"entropy": 1.417907428741455,
"epoch": 2.399760622381807,
"grad_norm": 0.02734375,
"learning_rate": 3.6008378216636746e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 4235203.0,
"step": 4010
},
{
"entropy": 1.4660828113555908,
"epoch": 2.405745062836625,
"grad_norm": 0.00799560546875,
"learning_rate": 3.594853381208857e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9993115305900574,
"num_tokens": 4245897.0,
"step": 4020
},
{
"entropy": 1.5142138838768004,
"epoch": 2.411729503291442,
"grad_norm": 0.197265625,
"learning_rate": 3.588868940754039e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 4256656.0,
"step": 4030
},
{
"entropy": 1.4816847085952758,
"epoch": 2.41771394374626,
"grad_norm": 0.11474609375,
"learning_rate": 3.582884500299222e-05,
"loss": 0.0069,
"mean_token_accuracy": 0.9981687843799592,
"num_tokens": 4267267.0,
"step": 4040
},
{
"entropy": 1.4937491059303283,
"epoch": 2.423698384201077,
"grad_norm": 0.2451171875,
"learning_rate": 3.576900059844405e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9981954574584961,
"num_tokens": 4277646.0,
"step": 4050
},
{
"entropy": 1.5693211913108827,
"epoch": 2.429682824655895,
"grad_norm": 0.126953125,
"learning_rate": 3.570915619389587e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996551752090455,
"num_tokens": 4288121.0,
"step": 4060
},
{
"entropy": 1.4717801094055176,
"epoch": 2.435667265110712,
"grad_norm": 0.0164794921875,
"learning_rate": 3.56493117893477e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9989271759986877,
"num_tokens": 4298628.0,
"step": 4070
},
{
"entropy": 1.4409169912338258,
"epoch": 2.44165170556553,
"grad_norm": 0.1083984375,
"learning_rate": 3.5589467384799524e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996268630027771,
"num_tokens": 4309009.0,
"step": 4080
},
{
"entropy": 1.4519354104995728,
"epoch": 2.447636146020347,
"grad_norm": 0.353515625,
"learning_rate": 3.552962298025135e-05,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 4319676.0,
"step": 4090
},
{
"entropy": 1.46173015832901,
"epoch": 2.4536205864751643,
"grad_norm": 0.07470703125,
"learning_rate": 3.546977857570317e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989534914493561,
"num_tokens": 4330203.0,
"step": 4100
},
{
"entropy": 1.4660939931869508,
"epoch": 2.459605026929982,
"grad_norm": 0.1513671875,
"learning_rate": 3.5409934171154995e-05,
"loss": 0.0059,
"mean_token_accuracy": 0.9986146986484528,
"num_tokens": 4340739.0,
"step": 4110
},
{
"entropy": 1.5220659852027894,
"epoch": 2.4655894673847993,
"grad_norm": 0.671875,
"learning_rate": 3.5350089766606826e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9989289164543151,
"num_tokens": 4351318.0,
"step": 4120
},
{
"entropy": 1.4552037119865417,
"epoch": 2.471573907839617,
"grad_norm": 0.234375,
"learning_rate": 3.529024536205865e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9989991009235382,
"num_tokens": 4361930.0,
"step": 4130
},
{
"entropy": 1.4578218817710877,
"epoch": 2.4775583482944343,
"grad_norm": 0.024169921875,
"learning_rate": 3.523040095751047e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996855318546295,
"num_tokens": 4372650.0,
"step": 4140
},
{
"entropy": 1.4377676606178285,
"epoch": 2.483542788749252,
"grad_norm": 0.05615234375,
"learning_rate": 3.51705565529623e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996784567832947,
"num_tokens": 4383173.0,
"step": 4150
},
{
"entropy": 1.453041100502014,
"epoch": 2.4895272292040693,
"grad_norm": 0.005828857421875,
"learning_rate": 3.511071214841413e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 4393773.0,
"step": 4160
},
{
"entropy": 1.4778467059135436,
"epoch": 2.495511669658887,
"grad_norm": 0.039794921875,
"learning_rate": 3.5050867743865943e-05,
"loss": 0.0079,
"mean_token_accuracy": 0.9985835254192352,
"num_tokens": 4404451.0,
"step": 4170
},
{
"entropy": 1.5435897827148437,
"epoch": 2.5014961101137043,
"grad_norm": 0.359375,
"learning_rate": 3.4991023339317774e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9992982268333435,
"num_tokens": 4414958.0,
"step": 4180
},
{
"entropy": 1.4607853293418884,
"epoch": 2.5074805505685216,
"grad_norm": 0.048828125,
"learning_rate": 3.49311789347696e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996515691280365,
"num_tokens": 4425511.0,
"step": 4190
},
{
"entropy": 1.5232563734054565,
"epoch": 2.5134649910233393,
"grad_norm": 0.014892578125,
"learning_rate": 3.487133453022143e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996666669845581,
"num_tokens": 4436154.0,
"step": 4200
},
{
"entropy": 1.5244243621826172,
"epoch": 2.519449431478157,
"grad_norm": 0.2158203125,
"learning_rate": 3.481149012567325e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 4446848.0,
"step": 4210
},
{
"entropy": 1.456788754463196,
"epoch": 2.5254338719329743,
"grad_norm": 1.2734375,
"learning_rate": 3.4751645721125075e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.9989286601543427,
"num_tokens": 4457385.0,
"step": 4220
},
{
"entropy": 1.5002227187156678,
"epoch": 2.5314183123877916,
"grad_norm": 1.0703125,
"learning_rate": 3.4691801316576905e-05,
"loss": 0.0072,
"mean_token_accuracy": 0.9986241698265076,
"num_tokens": 4468134.0,
"step": 4230
},
{
"entropy": 1.5510728359222412,
"epoch": 2.5374027528426093,
"grad_norm": 0.01348876953125,
"learning_rate": 3.463195691202873e-05,
"loss": 0.0022,
"mean_token_accuracy": 0.9992902100086212,
"num_tokens": 4478691.0,
"step": 4240
},
{
"entropy": 1.5891472101211548,
"epoch": 2.5433871932974266,
"grad_norm": 0.07568359375,
"learning_rate": 3.4572112507480546e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9989309251308441,
"num_tokens": 4489250.0,
"step": 4250
},
{
"entropy": 1.6060064077377318,
"epoch": 2.5493716337522443,
"grad_norm": 0.026611328125,
"learning_rate": 3.4512268102932376e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 4499839.0,
"step": 4260
},
{
"entropy": 1.5444846272468566,
"epoch": 2.5553560742070616,
"grad_norm": 0.0189208984375,
"learning_rate": 3.44524236983842e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996632993221283,
"num_tokens": 4510430.0,
"step": 4270
},
{
"entropy": 1.5540019631385804,
"epoch": 2.561340514661879,
"grad_norm": 0.0130615234375,
"learning_rate": 3.439257929383602e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9989399433135986,
"num_tokens": 4521077.0,
"step": 4280
},
{
"entropy": 1.5348394870758058,
"epoch": 2.5673249551166966,
"grad_norm": 0.03369140625,
"learning_rate": 3.4332734889287854e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996376812458039,
"num_tokens": 4531630.0,
"step": 4290
},
{
"entropy": 1.5469215869903565,
"epoch": 2.5733093955715143,
"grad_norm": 0.043701171875,
"learning_rate": 3.427289048473968e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9985177874565124,
"num_tokens": 4542041.0,
"step": 4300
},
{
"entropy": 1.5128920078277588,
"epoch": 2.5792938360263316,
"grad_norm": 0.1015625,
"learning_rate": 3.421304608019151e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 4552585.0,
"step": 4310
},
{
"entropy": 1.52936589717865,
"epoch": 2.585278276481149,
"grad_norm": 0.55859375,
"learning_rate": 3.415320167564333e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9992699146270752,
"num_tokens": 4563067.0,
"step": 4320
},
{
"entropy": 1.5417215347290039,
"epoch": 2.5912627169359665,
"grad_norm": 0.06005859375,
"learning_rate": 3.409335727109515e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9993090510368348,
"num_tokens": 4573914.0,
"step": 4330
},
{
"entropy": 1.5155917763710023,
"epoch": 2.597247157390784,
"grad_norm": 0.578125,
"learning_rate": 3.403351286654698e-05,
"loss": 0.0022,
"mean_token_accuracy": 0.9992700695991517,
"num_tokens": 4584524.0,
"step": 4340
},
{
"entropy": 1.5353880643844604,
"epoch": 2.6032315978456015,
"grad_norm": 0.1982421875,
"learning_rate": 3.39736684619988e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 4595094.0,
"step": 4350
},
{
"entropy": 1.5418899774551391,
"epoch": 2.609216038300419,
"grad_norm": 0.1318359375,
"learning_rate": 3.3913824057450626e-05,
"loss": 0.0061,
"mean_token_accuracy": 0.9981540739536285,
"num_tokens": 4605752.0,
"step": 4360
},
{
"entropy": 1.5859925389289855,
"epoch": 2.6152004787552365,
"grad_norm": 0.0595703125,
"learning_rate": 3.3853979652902456e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9996491253376008,
"num_tokens": 4616330.0,
"step": 4370
},
{
"entropy": 1.5407701492309571,
"epoch": 2.621184919210054,
"grad_norm": 0.2578125,
"learning_rate": 3.379413524835428e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.999646645784378,
"num_tokens": 4626881.0,
"step": 4380
},
{
"entropy": 1.5098812222480773,
"epoch": 2.6271693596648715,
"grad_norm": 0.04931640625,
"learning_rate": 3.373429084380611e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 4637420.0,
"step": 4390
},
{
"entropy": 1.5174281120300293,
"epoch": 2.633153800119689,
"grad_norm": 0.0654296875,
"learning_rate": 3.3674446439257934e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 4648257.0,
"step": 4400
},
{
"entropy": 1.5267613768577575,
"epoch": 2.639138240574506,
"grad_norm": 0.05859375,
"learning_rate": 3.361460203470975e-05,
"loss": 0.0065,
"mean_token_accuracy": 0.9981722176074982,
"num_tokens": 4658785.0,
"step": 4410
},
{
"entropy": 1.6269310355186462,
"epoch": 2.645122681029324,
"grad_norm": 0.007781982421875,
"learning_rate": 3.355475763016158e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9992606043815613,
"num_tokens": 4669411.0,
"step": 4420
},
{
"entropy": 1.5434075832366942,
"epoch": 2.651107121484141,
"grad_norm": 0.275390625,
"learning_rate": 3.3494913225613404e-05,
"loss": 0.0011,
"mean_token_accuracy": 1.0,
"num_tokens": 4680058.0,
"step": 4430
},
{
"entropy": 1.5232465744018555,
"epoch": 2.657091561938959,
"grad_norm": 0.0196533203125,
"learning_rate": 3.343506882106523e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9996710538864135,
"num_tokens": 4690722.0,
"step": 4440
},
{
"entropy": 1.5052083373069762,
"epoch": 2.663076002393776,
"grad_norm": 0.06689453125,
"learning_rate": 3.337522441651706e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9985235571861267,
"num_tokens": 4701334.0,
"step": 4450
},
{
"entropy": 1.4890037059783936,
"epoch": 2.669060442848594,
"grad_norm": 1.1796875,
"learning_rate": 3.331538001196888e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.999231505393982,
"num_tokens": 4711696.0,
"step": 4460
},
{
"entropy": 1.5000533580780029,
"epoch": 2.675044883303411,
"grad_norm": 0.0546875,
"learning_rate": 3.3255535607420706e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9996138989925385,
"num_tokens": 4722098.0,
"step": 4470
},
{
"entropy": 1.51080322265625,
"epoch": 2.6810293237582288,
"grad_norm": 0.044189453125,
"learning_rate": 3.3195691202872536e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 4732902.0,
"step": 4480
},
{
"entropy": 1.4408185839653016,
"epoch": 2.687013764213046,
"grad_norm": 0.63671875,
"learning_rate": 3.313584679832436e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989372074604035,
"num_tokens": 4743519.0,
"step": 4490
},
{
"entropy": 1.432204270362854,
"epoch": 2.6929982046678633,
"grad_norm": 0.54296875,
"learning_rate": 3.307600239377618e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 4754124.0,
"step": 4500
},
{
"entropy": 1.4771082758903504,
"epoch": 2.698982645122681,
"grad_norm": 0.205078125,
"learning_rate": 3.301615798922801e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9985949337482453,
"num_tokens": 4764763.0,
"step": 4510
},
{
"entropy": 1.4384092807769775,
"epoch": 2.7049670855774988,
"grad_norm": 0.00811767578125,
"learning_rate": 3.295631358467983e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9989839315414428,
"num_tokens": 4775386.0,
"step": 4520
},
{
"entropy": 1.4536986112594605,
"epoch": 2.710951526032316,
"grad_norm": 0.07373046875,
"learning_rate": 3.289646918013166e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9989380478858948,
"num_tokens": 4785874.0,
"step": 4530
},
{
"entropy": 1.5010742783546447,
"epoch": 2.7169359664871333,
"grad_norm": 0.033935546875,
"learning_rate": 3.2836624775583484e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 4796434.0,
"step": 4540
},
{
"entropy": 1.477715837955475,
"epoch": 2.722920406941951,
"grad_norm": 0.06298828125,
"learning_rate": 3.277678037103531e-05,
"loss": 0.0056,
"mean_token_accuracy": 0.9985794126987457,
"num_tokens": 4806938.0,
"step": 4550
},
{
"entropy": 1.5574210286140442,
"epoch": 2.7289048473967683,
"grad_norm": 0.0216064453125,
"learning_rate": 3.271693596648714e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 4817756.0,
"step": 4560
},
{
"entropy": 1.5755536079406738,
"epoch": 2.734889287851586,
"grad_norm": 0.8046875,
"learning_rate": 3.265709156193896e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9992808401584625,
"num_tokens": 4828491.0,
"step": 4570
},
{
"entropy": 1.4965651631355286,
"epoch": 2.7408737283064033,
"grad_norm": 0.0986328125,
"learning_rate": 3.259724715739078e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9992878794670105,
"num_tokens": 4839082.0,
"step": 4580
},
{
"entropy": 1.4643235802650452,
"epoch": 2.7468581687612206,
"grad_norm": 0.1640625,
"learning_rate": 3.253740275284261e-05,
"loss": 0.0085,
"mean_token_accuracy": 0.998158860206604,
"num_tokens": 4849584.0,
"step": 4590
},
{
"entropy": 1.4505114316940309,
"epoch": 2.7528426092160383,
"grad_norm": 0.40625,
"learning_rate": 3.247755834829443e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9993296384811401,
"num_tokens": 4860264.0,
"step": 4600
},
{
"entropy": 1.3683255314826965,
"epoch": 2.758827049670856,
"grad_norm": 0.01373291015625,
"learning_rate": 3.241771394374626e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9996655523777008,
"num_tokens": 4870822.0,
"step": 4610
},
{
"entropy": 1.3456801891326904,
"epoch": 2.7648114901256733,
"grad_norm": 0.01031494140625,
"learning_rate": 3.235786953919809e-05,
"loss": 0.0052,
"mean_token_accuracy": 0.9979090332984925,
"num_tokens": 4881289.0,
"step": 4620
},
{
"entropy": 1.3935713291168212,
"epoch": 2.7707959305804906,
"grad_norm": 0.16796875,
"learning_rate": 3.229802513464991e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988677442073822,
"num_tokens": 4891678.0,
"step": 4630
},
{
"entropy": 1.4410547733306884,
"epoch": 2.7767803710353083,
"grad_norm": 0.232421875,
"learning_rate": 3.223818073010174e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9989205062389374,
"num_tokens": 4902128.0,
"step": 4640
},
{
"entropy": 1.4120708227157592,
"epoch": 2.7827648114901256,
"grad_norm": 0.150390625,
"learning_rate": 3.2178336325553564e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.999294513463974,
"num_tokens": 4912785.0,
"step": 4650
},
{
"entropy": 1.4368455290794373,
"epoch": 2.7887492519449433,
"grad_norm": 0.09375,
"learning_rate": 3.211849192100538e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9992712080478668,
"num_tokens": 4923429.0,
"step": 4660
},
{
"entropy": 1.436909818649292,
"epoch": 2.7947336923997605,
"grad_norm": 0.53125,
"learning_rate": 3.205864751645721e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9989143192768097,
"num_tokens": 4934074.0,
"step": 4670
},
{
"entropy": 1.5463759303092957,
"epoch": 2.800718132854578,
"grad_norm": 0.016357421875,
"learning_rate": 3.1998803111909035e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.999250841140747,
"num_tokens": 4944751.0,
"step": 4680
},
{
"entropy": 1.4755305051803589,
"epoch": 2.8067025733093955,
"grad_norm": 2.015625,
"learning_rate": 3.1938958707360865e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9992857158184052,
"num_tokens": 4955387.0,
"step": 4690
},
{
"entropy": 1.5100136518478393,
"epoch": 2.8126870137642133,
"grad_norm": 0.11279296875,
"learning_rate": 3.187911430281269e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 4966201.0,
"step": 4700
},
{
"entropy": 1.4382469534873963,
"epoch": 2.8186714542190305,
"grad_norm": 0.018798828125,
"learning_rate": 3.181926989826451e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 4976551.0,
"step": 4710
},
{
"entropy": 1.470636510848999,
"epoch": 2.824655894673848,
"grad_norm": 0.69140625,
"learning_rate": 3.175942549371634e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996108949184418,
"num_tokens": 4987218.0,
"step": 4720
},
{
"entropy": 1.53121098279953,
"epoch": 2.8306403351286655,
"grad_norm": 0.0234375,
"learning_rate": 3.1699581089168167e-05,
"loss": 0.0023,
"mean_token_accuracy": 0.9993226230144501,
"num_tokens": 4997799.0,
"step": 4730
},
{
"entropy": 1.4530461430549622,
"epoch": 2.836624775583483,
"grad_norm": 0.0172119140625,
"learning_rate": 3.163973668461998e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9989462852478027,
"num_tokens": 5008234.0,
"step": 4740
},
{
"entropy": 1.4743767023086547,
"epoch": 2.8426092160383005,
"grad_norm": 0.41015625,
"learning_rate": 3.1579892280071814e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9992892682552338,
"num_tokens": 5018644.0,
"step": 4750
},
{
"entropy": 1.4847406983375548,
"epoch": 2.848593656493118,
"grad_norm": 0.1103515625,
"learning_rate": 3.152004787552364e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9986034750938415,
"num_tokens": 5029305.0,
"step": 4760
},
{
"entropy": 1.4679431319236755,
"epoch": 2.8545780969479355,
"grad_norm": 0.052001953125,
"learning_rate": 3.146020347097546e-05,
"loss": 0.002,
"mean_token_accuracy": 0.999646645784378,
"num_tokens": 5039922.0,
"step": 4770
},
{
"entropy": 1.5108673334121705,
"epoch": 2.860562537402753,
"grad_norm": 0.04248046875,
"learning_rate": 3.140035906642729e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 5050363.0,
"step": 4780
},
{
"entropy": 1.4920130252838135,
"epoch": 2.8665469778575705,
"grad_norm": 0.0194091796875,
"learning_rate": 3.1340514661879115e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 5061011.0,
"step": 4790
},
{
"entropy": 1.4699716448783875,
"epoch": 2.872531418312388,
"grad_norm": 0.051513671875,
"learning_rate": 3.1280670257330945e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 5071583.0,
"step": 4800
},
{
"entropy": 1.5170957326889039,
"epoch": 2.878515858767205,
"grad_norm": 0.028564453125,
"learning_rate": 3.122082585278277e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5082159.0,
"step": 4810
},
{
"entropy": 1.502884578704834,
"epoch": 2.884500299222023,
"grad_norm": 0.0235595703125,
"learning_rate": 3.1160981448234586e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 5092755.0,
"step": 4820
},
{
"entropy": 1.4837321162223815,
"epoch": 2.89048473967684,
"grad_norm": 0.3984375,
"learning_rate": 3.1101137043686416e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996296286582946,
"num_tokens": 5103320.0,
"step": 4830
},
{
"entropy": 1.4991668343544007,
"epoch": 2.8964691801316578,
"grad_norm": 0.337890625,
"learning_rate": 3.104129263913824e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9984982252120972,
"num_tokens": 5113891.0,
"step": 4840
},
{
"entropy": 1.528916025161743,
"epoch": 2.902453620586475,
"grad_norm": 0.1748046875,
"learning_rate": 3.098144823459006e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9992779791355133,
"num_tokens": 5124448.0,
"step": 4850
},
{
"entropy": 1.5347569346427918,
"epoch": 2.9084380610412928,
"grad_norm": 0.03515625,
"learning_rate": 3.0921603830041894e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 5134840.0,
"step": 4860
},
{
"entropy": 1.5149821162223815,
"epoch": 2.91442250149611,
"grad_norm": 0.10205078125,
"learning_rate": 3.086175942549372e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996666669845581,
"num_tokens": 5145434.0,
"step": 4870
},
{
"entropy": 1.4977423548698425,
"epoch": 2.9204069419509278,
"grad_norm": 0.07373046875,
"learning_rate": 3.080191502094554e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9996376812458039,
"num_tokens": 5155919.0,
"step": 4880
},
{
"entropy": 1.4655091524124146,
"epoch": 2.926391382405745,
"grad_norm": 0.016357421875,
"learning_rate": 3.074207061639737e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5166450.0,
"step": 4890
},
{
"entropy": 1.4794137835502625,
"epoch": 2.9323758228605623,
"grad_norm": 0.0299072265625,
"learning_rate": 3.068222621184919e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 5176892.0,
"step": 4900
},
{
"entropy": 1.497855818271637,
"epoch": 2.93836026331538,
"grad_norm": 0.017333984375,
"learning_rate": 3.062238180730102e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989420473575592,
"num_tokens": 5187383.0,
"step": 4910
},
{
"entropy": 1.5195743203163148,
"epoch": 2.9443447037701977,
"grad_norm": 0.0238037109375,
"learning_rate": 3.056253740275284e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9985025763511658,
"num_tokens": 5197908.0,
"step": 4920
},
{
"entropy": 1.4769899368286132,
"epoch": 2.950329144225015,
"grad_norm": 0.2197265625,
"learning_rate": 3.0502692998204666e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 5208366.0,
"step": 4930
},
{
"entropy": 1.4623586893081666,
"epoch": 2.9563135846798323,
"grad_norm": 0.08642578125,
"learning_rate": 3.0442848593656496e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 5219004.0,
"step": 4940
},
{
"entropy": 1.4855722069740296,
"epoch": 2.96229802513465,
"grad_norm": 0.265625,
"learning_rate": 3.038300418910832e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.9985096752643585,
"num_tokens": 5229507.0,
"step": 4950
},
{
"entropy": 1.4608722567558288,
"epoch": 2.9682824655894673,
"grad_norm": 0.072265625,
"learning_rate": 3.0323159784560143e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9985809206962586,
"num_tokens": 5240053.0,
"step": 4960
},
{
"entropy": 1.4974953174591064,
"epoch": 2.974266906044285,
"grad_norm": 0.169921875,
"learning_rate": 3.026331538001197e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9989298462867737,
"num_tokens": 5250618.0,
"step": 4970
},
{
"entropy": 1.5181520104408264,
"epoch": 2.9802513464991023,
"grad_norm": 0.06689453125,
"learning_rate": 3.0203470975463794e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 5261074.0,
"step": 4980
},
{
"entropy": 1.4660282492637635,
"epoch": 2.9862357869539196,
"grad_norm": 0.076171875,
"learning_rate": 3.0143626570915624e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 5271530.0,
"step": 4990
},
{
"entropy": 1.506136167049408,
"epoch": 2.9922202274087373,
"grad_norm": 0.578125,
"learning_rate": 3.0083782166367448e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9985557436943054,
"num_tokens": 5282008.0,
"step": 5000
},
{
"entropy": 1.4818431258201599,
"epoch": 2.998204667863555,
"grad_norm": 0.77734375,
"learning_rate": 3.0023937761819268e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9993121862411499,
"num_tokens": 5292635.0,
"step": 5010
},
{
"entropy": 1.4319651007652283,
"epoch": 3.0041891083183723,
"grad_norm": 0.134765625,
"learning_rate": 2.9964093357271095e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 5303190.0,
"step": 5020
},
{
"entropy": 1.4162240147590637,
"epoch": 3.0101735487731895,
"grad_norm": 0.01031494140625,
"learning_rate": 2.9904248952722922e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 5313752.0,
"step": 5030
},
{
"entropy": 1.4751434803009034,
"epoch": 3.0161579892280073,
"grad_norm": 0.028564453125,
"learning_rate": 2.984440454817475e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9996268630027771,
"num_tokens": 5324371.0,
"step": 5040
},
{
"entropy": 1.4873990535736084,
"epoch": 3.0221424296828245,
"grad_norm": 0.6328125,
"learning_rate": 2.978456014362657e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996563553810119,
"num_tokens": 5334881.0,
"step": 5050
},
{
"entropy": 1.4645564079284668,
"epoch": 3.0281268701376423,
"grad_norm": 0.00909423828125,
"learning_rate": 2.9724715739078396e-05,
"loss": 0.002,
"mean_token_accuracy": 0.9992962300777435,
"num_tokens": 5345310.0,
"step": 5060
},
{
"entropy": 1.5135003924369812,
"epoch": 3.0341113105924595,
"grad_norm": 0.0390625,
"learning_rate": 2.9664871334530223e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.999672132730484,
"num_tokens": 5355813.0,
"step": 5070
},
{
"entropy": 1.5595512986183167,
"epoch": 3.0400957510472773,
"grad_norm": 0.03076171875,
"learning_rate": 2.960502692998205e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 5366358.0,
"step": 5080
},
{
"entropy": 1.5525766372680665,
"epoch": 3.0460801915020945,
"grad_norm": 0.0264892578125,
"learning_rate": 2.954518252543387e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996138989925385,
"num_tokens": 5376822.0,
"step": 5090
},
{
"entropy": 1.518768072128296,
"epoch": 3.0520646319569122,
"grad_norm": 0.07080078125,
"learning_rate": 2.9485338120885697e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9992601990699768,
"num_tokens": 5387230.0,
"step": 5100
},
{
"entropy": 1.5416632294654846,
"epoch": 3.0580490724117295,
"grad_norm": 0.5859375,
"learning_rate": 2.9425493716337524e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9988771200180053,
"num_tokens": 5397736.0,
"step": 5110
},
{
"entropy": 1.587303638458252,
"epoch": 3.064033512866547,
"grad_norm": 0.03857421875,
"learning_rate": 2.936564931178935e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996731996536254,
"num_tokens": 5408278.0,
"step": 5120
},
{
"entropy": 1.5591753721237183,
"epoch": 3.0700179533213645,
"grad_norm": 0.058837890625,
"learning_rate": 2.930580490724117e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 5418970.0,
"step": 5130
},
{
"entropy": 1.6015469908714295,
"epoch": 3.076002393776182,
"grad_norm": 0.32421875,
"learning_rate": 2.9245960502693e-05,
"loss": 0.0009,
"mean_token_accuracy": 1.0,
"num_tokens": 5429604.0,
"step": 5140
},
{
"entropy": 1.5480207562446595,
"epoch": 3.0819868342309995,
"grad_norm": 0.62890625,
"learning_rate": 2.9186116098144825e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9992478489875793,
"num_tokens": 5440142.0,
"step": 5150
},
{
"entropy": 1.5280380129814148,
"epoch": 3.087971274685817,
"grad_norm": 0.314453125,
"learning_rate": 2.9126271693596652e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996551752090455,
"num_tokens": 5450721.0,
"step": 5160
},
{
"entropy": 1.5630383491516113,
"epoch": 3.0939557151406345,
"grad_norm": 0.0517578125,
"learning_rate": 2.9066427289048473e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 5461260.0,
"step": 5170
},
{
"entropy": 1.5174272894859313,
"epoch": 3.0999401555954518,
"grad_norm": 0.443359375,
"learning_rate": 2.90065828845003e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996632993221283,
"num_tokens": 5471701.0,
"step": 5180
},
{
"entropy": 1.5312930703163148,
"epoch": 3.1059245960502695,
"grad_norm": 0.54296875,
"learning_rate": 2.8946738479952127e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996655523777008,
"num_tokens": 5482325.0,
"step": 5190
},
{
"entropy": 1.5398496747016908,
"epoch": 3.1119090365050868,
"grad_norm": 0.007232666015625,
"learning_rate": 2.888689407540395e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 5492966.0,
"step": 5200
},
{
"entropy": 1.550033414363861,
"epoch": 3.117893476959904,
"grad_norm": 0.06103515625,
"learning_rate": 2.8827049670855774e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9989342570304871,
"num_tokens": 5503575.0,
"step": 5210
},
{
"entropy": 1.430702805519104,
"epoch": 3.1238779174147218,
"grad_norm": 0.328125,
"learning_rate": 2.87672052663076e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 5514038.0,
"step": 5220
},
{
"entropy": 1.5385449290275575,
"epoch": 3.129862357869539,
"grad_norm": 0.0390625,
"learning_rate": 2.8707360861759428e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996376812458039,
"num_tokens": 5524786.0,
"step": 5230
},
{
"entropy": 1.4950608015060425,
"epoch": 3.1358467983243568,
"grad_norm": 0.0341796875,
"learning_rate": 2.864751645721125e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 5535199.0,
"step": 5240
},
{
"entropy": 1.506748330593109,
"epoch": 3.141831238779174,
"grad_norm": 0.015625,
"learning_rate": 2.8587672052663075e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 5545783.0,
"step": 5250
},
{
"entropy": 1.5544589042663575,
"epoch": 3.1478156792339917,
"grad_norm": 0.0125732421875,
"learning_rate": 2.8527827648114902e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9992754220962524,
"num_tokens": 5556377.0,
"step": 5260
},
{
"entropy": 1.5457674264907837,
"epoch": 3.153800119688809,
"grad_norm": 0.076171875,
"learning_rate": 2.846798324356673e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 5566903.0,
"step": 5270
},
{
"entropy": 1.5506306290626526,
"epoch": 3.1597845601436267,
"grad_norm": 0.10205078125,
"learning_rate": 2.8408138839018552e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 5577356.0,
"step": 5280
},
{
"entropy": 1.5602880001068116,
"epoch": 3.165769000598444,
"grad_norm": 0.388671875,
"learning_rate": 2.8348294434470376e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9989531815052033,
"num_tokens": 5587965.0,
"step": 5290
},
{
"entropy": 1.563979482650757,
"epoch": 3.1717534410532613,
"grad_norm": 0.0147705078125,
"learning_rate": 2.8288450029922203e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 5598333.0,
"step": 5300
},
{
"entropy": 1.5785459518432616,
"epoch": 3.177737881508079,
"grad_norm": 0.056396484375,
"learning_rate": 2.822860562537403e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 5608941.0,
"step": 5310
},
{
"entropy": 1.589229130744934,
"epoch": 3.1837223219628963,
"grad_norm": 0.0186767578125,
"learning_rate": 2.8168761220825854e-05,
"loss": 0.001,
"mean_token_accuracy": 0.999609375,
"num_tokens": 5619596.0,
"step": 5320
},
{
"entropy": 1.5809313297271728,
"epoch": 3.189706762417714,
"grad_norm": 0.0224609375,
"learning_rate": 2.8108916816277677e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 5630232.0,
"step": 5330
},
{
"entropy": 1.5661462068557739,
"epoch": 3.1956912028725313,
"grad_norm": 0.024169921875,
"learning_rate": 2.8049072411729504e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 5640742.0,
"step": 5340
},
{
"entropy": 1.5573038578033447,
"epoch": 3.201675643327349,
"grad_norm": 0.0302734375,
"learning_rate": 2.7989228007181328e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.99963099360466,
"num_tokens": 5651153.0,
"step": 5350
},
{
"entropy": 1.5646984577178955,
"epoch": 3.2076600837821663,
"grad_norm": 0.035400390625,
"learning_rate": 2.7929383602633155e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.999304324388504,
"num_tokens": 5661922.0,
"step": 5360
},
{
"entropy": 1.5028715133666992,
"epoch": 3.213644524236984,
"grad_norm": 0.349609375,
"learning_rate": 2.786953919808498e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9992653369903565,
"num_tokens": 5672420.0,
"step": 5370
},
{
"entropy": 1.5278252124786378,
"epoch": 3.2196289646918013,
"grad_norm": 0.028076171875,
"learning_rate": 2.7809694793536805e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5682901.0,
"step": 5380
},
{
"entropy": 1.564482367038727,
"epoch": 3.225613405146619,
"grad_norm": 0.408203125,
"learning_rate": 2.774985038898863e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 5693412.0,
"step": 5390
},
{
"entropy": 1.507829475402832,
"epoch": 3.2315978456014363,
"grad_norm": 0.035888671875,
"learning_rate": 2.7690005984440456e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 5703833.0,
"step": 5400
},
{
"entropy": 1.5307364106178283,
"epoch": 3.2375822860562535,
"grad_norm": 0.057373046875,
"learning_rate": 2.7630161579892283e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9992765545845032,
"num_tokens": 5714464.0,
"step": 5410
},
{
"entropy": 1.5907382130622865,
"epoch": 3.2435667265110713,
"grad_norm": 0.08056640625,
"learning_rate": 2.7570317175344107e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 5725108.0,
"step": 5420
},
{
"entropy": 1.5480499148368836,
"epoch": 3.2495511669658885,
"grad_norm": 0.02978515625,
"learning_rate": 2.751047277079593e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996632993221283,
"num_tokens": 5735618.0,
"step": 5430
},
{
"entropy": 1.4962282299995422,
"epoch": 3.2555356074207062,
"grad_norm": 0.404296875,
"learning_rate": 2.7450628366247757e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996350347995758,
"num_tokens": 5746133.0,
"step": 5440
},
{
"entropy": 1.5920798301696777,
"epoch": 3.2615200478755235,
"grad_norm": 0.10546875,
"learning_rate": 2.7390783961699584e-05,
"loss": 0.0022,
"mean_token_accuracy": 0.9996240615844727,
"num_tokens": 5756785.0,
"step": 5450
},
{
"entropy": 1.597427499294281,
"epoch": 3.2675044883303412,
"grad_norm": 0.01495361328125,
"learning_rate": 2.7330939557151408e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996062994003296,
"num_tokens": 5767408.0,
"step": 5460
},
{
"entropy": 1.5337183713912963,
"epoch": 3.2734889287851585,
"grad_norm": 0.63671875,
"learning_rate": 2.727109515260323e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9989434540271759,
"num_tokens": 5777792.0,
"step": 5470
},
{
"entropy": 1.564418363571167,
"epoch": 3.2794733692399762,
"grad_norm": 0.017333984375,
"learning_rate": 2.7211250748055058e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5788108.0,
"step": 5480
},
{
"entropy": 1.6177725791931152,
"epoch": 3.2854578096947935,
"grad_norm": 0.099609375,
"learning_rate": 2.7151406343506885e-05,
"loss": 0.0009,
"mean_token_accuracy": 1.0,
"num_tokens": 5798863.0,
"step": 5490
},
{
"entropy": 1.671896493434906,
"epoch": 3.2914422501496112,
"grad_norm": 0.0078125,
"learning_rate": 2.7091561938958705e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 5809517.0,
"step": 5500
},
{
"entropy": 1.5772828936576844,
"epoch": 3.2974266906044285,
"grad_norm": 0.01068115234375,
"learning_rate": 2.7031717534410532e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9992832183837891,
"num_tokens": 5819979.0,
"step": 5510
},
{
"entropy": 1.6201642036437989,
"epoch": 3.3034111310592458,
"grad_norm": 0.74609375,
"learning_rate": 2.697187312986236e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996296286582946,
"num_tokens": 5830537.0,
"step": 5520
},
{
"entropy": 1.534337544441223,
"epoch": 3.3093955715140635,
"grad_norm": 0.03173828125,
"learning_rate": 2.6912028725314186e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9989047467708587,
"num_tokens": 5840879.0,
"step": 5530
},
{
"entropy": 1.4968370914459228,
"epoch": 3.3153800119688808,
"grad_norm": 0.62890625,
"learning_rate": 2.6852184320766007e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9993019163608551,
"num_tokens": 5851453.0,
"step": 5540
},
{
"entropy": 1.5559372425079345,
"epoch": 3.3213644524236985,
"grad_norm": 0.1640625,
"learning_rate": 2.6792339916217834e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5862229.0,
"step": 5550
},
{
"entropy": 1.4817229628562927,
"epoch": 3.3273488928785158,
"grad_norm": 0.11328125,
"learning_rate": 2.673249551166966e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996835470199585,
"num_tokens": 5872759.0,
"step": 5560
},
{
"entropy": 1.4754049897193908,
"epoch": 3.3333333333333335,
"grad_norm": 0.004852294921875,
"learning_rate": 2.6672651107121488e-05,
"loss": 0.0021,
"mean_token_accuracy": 0.9992820382118225,
"num_tokens": 5883416.0,
"step": 5570
},
{
"entropy": 1.499694275856018,
"epoch": 3.3393177737881508,
"grad_norm": 0.01019287109375,
"learning_rate": 2.6612806702573308e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 5893944.0,
"step": 5580
},
{
"entropy": 1.5177325487136841,
"epoch": 3.3453022142429685,
"grad_norm": 0.006927490234375,
"learning_rate": 2.6552962298025135e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9993056535720826,
"num_tokens": 5904513.0,
"step": 5590
},
{
"entropy": 1.5602369070053101,
"epoch": 3.3512866546977857,
"grad_norm": 0.64453125,
"learning_rate": 2.6493117893476962e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.999676376581192,
"num_tokens": 5915107.0,
"step": 5600
},
{
"entropy": 1.4615213394165039,
"epoch": 3.357271095152603,
"grad_norm": 0.006103515625,
"learning_rate": 2.643327348892879e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996539771556854,
"num_tokens": 5925481.0,
"step": 5610
},
{
"entropy": 1.5230321168899537,
"epoch": 3.3632555356074207,
"grad_norm": 0.04736328125,
"learning_rate": 2.637342908438061e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 5935980.0,
"step": 5620
},
{
"entropy": 1.4605862855911256,
"epoch": 3.369239976062238,
"grad_norm": 0.01336669921875,
"learning_rate": 2.6313584679832436e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9993006646633148,
"num_tokens": 5946578.0,
"step": 5630
},
{
"entropy": 1.5047585487365722,
"epoch": 3.3752244165170557,
"grad_norm": 0.046875,
"learning_rate": 2.6253740275284263e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 5957159.0,
"step": 5640
},
{
"entropy": 1.4994407773017884,
"epoch": 3.381208856971873,
"grad_norm": 0.01513671875,
"learning_rate": 2.6193895870736087e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9992932856082917,
"num_tokens": 5967691.0,
"step": 5650
},
{
"entropy": 1.5766767621040345,
"epoch": 3.3871932974266907,
"grad_norm": 0.228515625,
"learning_rate": 2.613405146618791e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996254682540894,
"num_tokens": 5978513.0,
"step": 5660
},
{
"entropy": 1.5303641319274903,
"epoch": 3.393177737881508,
"grad_norm": 0.359375,
"learning_rate": 2.6074207061639737e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.999622642993927,
"num_tokens": 5989129.0,
"step": 5670
},
{
"entropy": 1.518871784210205,
"epoch": 3.3991621783363257,
"grad_norm": 0.3125,
"learning_rate": 2.6014362657091564e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.999292927980423,
"num_tokens": 5999622.0,
"step": 5680
},
{
"entropy": 1.5809160351753235,
"epoch": 3.405146618791143,
"grad_norm": 0.220703125,
"learning_rate": 2.5954518252543388e-05,
"loss": 0.0009,
"mean_token_accuracy": 1.0,
"num_tokens": 6010229.0,
"step": 5690
},
{
"entropy": 1.514602744579315,
"epoch": 3.4111310592459603,
"grad_norm": 0.59375,
"learning_rate": 2.589467384799521e-05,
"loss": 0.0022,
"mean_token_accuracy": 0.9993149936199188,
"num_tokens": 6020944.0,
"step": 5700
},
{
"entropy": 1.5365632772445679,
"epoch": 3.417115499700778,
"grad_norm": 0.09716796875,
"learning_rate": 2.5834829443447038e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6031625.0,
"step": 5710
},
{
"entropy": 1.4886379003524781,
"epoch": 3.4230999401555953,
"grad_norm": 0.09326171875,
"learning_rate": 2.5774985038898865e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9985065758228302,
"num_tokens": 6042088.0,
"step": 5720
},
{
"entropy": 1.5176212072372437,
"epoch": 3.429084380610413,
"grad_norm": 0.0712890625,
"learning_rate": 2.571514063435069e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9996731996536254,
"num_tokens": 6052684.0,
"step": 5730
},
{
"entropy": 1.445726454257965,
"epoch": 3.4350688210652303,
"grad_norm": 0.047607421875,
"learning_rate": 2.5655296229802512e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6063071.0,
"step": 5740
},
{
"entropy": 1.516676378250122,
"epoch": 3.441053261520048,
"grad_norm": 0.0030059814453125,
"learning_rate": 2.559545182525434e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6073640.0,
"step": 5750
},
{
"entropy": 1.500998282432556,
"epoch": 3.4470377019748653,
"grad_norm": 0.053466796875,
"learning_rate": 2.5535607420706166e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 6084262.0,
"step": 5760
},
{
"entropy": 1.481552255153656,
"epoch": 3.453022142429683,
"grad_norm": 0.05224609375,
"learning_rate": 2.547576301615799e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996621608734131,
"num_tokens": 6094845.0,
"step": 5770
},
{
"entropy": 1.5202495098114013,
"epoch": 3.4590065828845002,
"grad_norm": 0.027587890625,
"learning_rate": 2.5415918611609814e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6105409.0,
"step": 5780
},
{
"entropy": 1.506263256072998,
"epoch": 3.464991023339318,
"grad_norm": 0.031005859375,
"learning_rate": 2.535607420706164e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9992824077606202,
"num_tokens": 6116062.0,
"step": 5790
},
{
"entropy": 1.5448615550994873,
"epoch": 3.4709754637941352,
"grad_norm": 0.17578125,
"learning_rate": 2.5296229802513464e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 6126660.0,
"step": 5800
},
{
"entropy": 1.4823444962501526,
"epoch": 3.476959904248953,
"grad_norm": 0.030029296875,
"learning_rate": 2.523638539796529e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.9990337431430817,
"num_tokens": 6137252.0,
"step": 5810
},
{
"entropy": 1.461562943458557,
"epoch": 3.4829443447037702,
"grad_norm": 0.06982421875,
"learning_rate": 2.5176540993417115e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9993062913417816,
"num_tokens": 6147825.0,
"step": 5820
},
{
"entropy": 1.4455949544906617,
"epoch": 3.4889287851585875,
"grad_norm": 0.0250244140625,
"learning_rate": 2.5116696588868942e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6158342.0,
"step": 5830
},
{
"entropy": 1.458959984779358,
"epoch": 3.4949132256134052,
"grad_norm": 0.04248046875,
"learning_rate": 2.5056852184320765e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9992868721485137,
"num_tokens": 6168949.0,
"step": 5840
},
{
"entropy": 1.5214141845703124,
"epoch": 3.5008976660682225,
"grad_norm": 0.78125,
"learning_rate": 2.4997007779772592e-05,
"loss": 0.002,
"mean_token_accuracy": 0.9989701211452484,
"num_tokens": 6179583.0,
"step": 5850
},
{
"entropy": 1.492224383354187,
"epoch": 3.50688210652304,
"grad_norm": 0.126953125,
"learning_rate": 2.493716337522442e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6190163.0,
"step": 5860
},
{
"entropy": 1.4495007276535035,
"epoch": 3.5128665469778575,
"grad_norm": 0.021728515625,
"learning_rate": 2.4877318970676243e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 6200698.0,
"step": 5870
},
{
"entropy": 1.4565007090568542,
"epoch": 3.5188509874326748,
"grad_norm": 0.1533203125,
"learning_rate": 2.4817474566128067e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 6211284.0,
"step": 5880
},
{
"entropy": 1.499528968334198,
"epoch": 3.5248354278874925,
"grad_norm": 0.028076171875,
"learning_rate": 2.4757630161579894e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996655523777008,
"num_tokens": 6221944.0,
"step": 5890
},
{
"entropy": 1.4583883881568909,
"epoch": 3.53081986834231,
"grad_norm": 0.59765625,
"learning_rate": 2.469778575703172e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9993514120578766,
"num_tokens": 6232700.0,
"step": 5900
},
{
"entropy": 1.5127885818481446,
"epoch": 3.5368043087971275,
"grad_norm": 0.033935546875,
"learning_rate": 2.4637941352483544e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6243487.0,
"step": 5910
},
{
"entropy": 1.5247217893600464,
"epoch": 3.5427887492519448,
"grad_norm": 0.07763671875,
"learning_rate": 2.4578096947935368e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996884763240814,
"num_tokens": 6254214.0,
"step": 5920
},
{
"entropy": 1.465972125530243,
"epoch": 3.5487731897067625,
"grad_norm": 0.06787109375,
"learning_rate": 2.4518252543387195e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996563553810119,
"num_tokens": 6264680.0,
"step": 5930
},
{
"entropy": 1.4788655638694763,
"epoch": 3.5547576301615798,
"grad_norm": 0.0203857421875,
"learning_rate": 2.445840813883902e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6275332.0,
"step": 5940
},
{
"entropy": 1.4835572361946106,
"epoch": 3.5607420706163975,
"grad_norm": 0.7734375,
"learning_rate": 2.4398563734290842e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.9992424249649048,
"num_tokens": 6285916.0,
"step": 5950
},
{
"entropy": 1.4669729709625243,
"epoch": 3.5667265110712147,
"grad_norm": 0.70703125,
"learning_rate": 2.433871932974267e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9985170543193818,
"num_tokens": 6296444.0,
"step": 5960
},
{
"entropy": 1.4348384976387023,
"epoch": 3.5727109515260325,
"grad_norm": 0.060302734375,
"learning_rate": 2.4278874925194496e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9993228852748871,
"num_tokens": 6307078.0,
"step": 5970
},
{
"entropy": 1.5015219926834107,
"epoch": 3.5786953919808497,
"grad_norm": 0.0184326171875,
"learning_rate": 2.4219030520646323e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 6317497.0,
"step": 5980
},
{
"entropy": 1.56508469581604,
"epoch": 3.5846798324356675,
"grad_norm": 0.2001953125,
"learning_rate": 2.4159186116098143e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6328052.0,
"step": 5990
},
{
"entropy": 1.4645266771316527,
"epoch": 3.5906642728904847,
"grad_norm": 0.11572265625,
"learning_rate": 2.409934171154997e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 6338487.0,
"step": 6000
},
{
"entropy": 1.4335883975028991,
"epoch": 3.596648713345302,
"grad_norm": 0.083984375,
"learning_rate": 2.4039497307001797e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6349011.0,
"step": 6010
},
{
"entropy": 1.4607133984565734,
"epoch": 3.6026331538001197,
"grad_norm": 0.0142822265625,
"learning_rate": 2.3979652902453624e-05,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 6359518.0,
"step": 6020
},
{
"entropy": 1.505584192276001,
"epoch": 3.608617594254937,
"grad_norm": 0.006195068359375,
"learning_rate": 2.3919808497905444e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996491253376008,
"num_tokens": 6370041.0,
"step": 6030
},
{
"entropy": 1.4937905073165894,
"epoch": 3.6146020347097547,
"grad_norm": 0.09326171875,
"learning_rate": 2.385996409335727e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6380815.0,
"step": 6040
},
{
"entropy": 1.4493468403816223,
"epoch": 3.620586475164572,
"grad_norm": 0.06005859375,
"learning_rate": 2.3800119688809098e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 6391275.0,
"step": 6050
},
{
"entropy": 1.5009562849998475,
"epoch": 3.6265709156193897,
"grad_norm": 0.0150146484375,
"learning_rate": 2.3740275284260925e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6401857.0,
"step": 6060
},
{
"entropy": 1.4806856632232666,
"epoch": 3.632555356074207,
"grad_norm": 0.018798828125,
"learning_rate": 2.3680430879712745e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9989627659320831,
"num_tokens": 6412386.0,
"step": 6070
},
{
"entropy": 1.4495960474014282,
"epoch": 3.6385397965290247,
"grad_norm": 0.0235595703125,
"learning_rate": 2.3620586475164572e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6422788.0,
"step": 6080
},
{
"entropy": 1.4762946724891663,
"epoch": 3.644524236983842,
"grad_norm": 0.02783203125,
"learning_rate": 2.35607420706164e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9992830693721771,
"num_tokens": 6433332.0,
"step": 6090
},
{
"entropy": 1.4833276271820068,
"epoch": 3.6505086774386593,
"grad_norm": 0.5703125,
"learning_rate": 2.3500897666068223e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 6444112.0,
"step": 6100
},
{
"entropy": 1.4752602219581603,
"epoch": 3.656493117893477,
"grad_norm": 0.00897216796875,
"learning_rate": 2.3441053261520047e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9992787480354309,
"num_tokens": 6454710.0,
"step": 6110
},
{
"entropy": 1.5009476304054261,
"epoch": 3.6624775583482947,
"grad_norm": 0.1181640625,
"learning_rate": 2.3381208856971874e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6465333.0,
"step": 6120
},
{
"entropy": 1.49513281583786,
"epoch": 3.668461998803112,
"grad_norm": 0.0673828125,
"learning_rate": 2.33213644524237e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6476047.0,
"step": 6130
},
{
"entropy": 1.4687823891639709,
"epoch": 3.6744464392579292,
"grad_norm": 0.033935546875,
"learning_rate": 2.3261520047875524e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6486587.0,
"step": 6140
},
{
"entropy": 1.5005357146263123,
"epoch": 3.680430879712747,
"grad_norm": 0.138671875,
"learning_rate": 2.3201675643327348e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996212124824524,
"num_tokens": 6497319.0,
"step": 6150
},
{
"entropy": 1.5394233107566833,
"epoch": 3.6864153201675642,
"grad_norm": 0.0703125,
"learning_rate": 2.3141831238779175e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 6508086.0,
"step": 6160
},
{
"entropy": 1.4763020753860474,
"epoch": 3.692399760622382,
"grad_norm": 0.02880859375,
"learning_rate": 2.3081986834231e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9992766141891479,
"num_tokens": 6518792.0,
"step": 6170
},
{
"entropy": 1.4043538212776183,
"epoch": 3.6983842010771992,
"grad_norm": 0.83203125,
"learning_rate": 2.3022142429682825e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.999609375,
"num_tokens": 6529300.0,
"step": 6180
},
{
"entropy": 1.4720135927200317,
"epoch": 3.7043686415320165,
"grad_norm": 0.1279296875,
"learning_rate": 2.296229802513465e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6539909.0,
"step": 6190
},
{
"entropy": 1.4578587770462037,
"epoch": 3.7103530819868342,
"grad_norm": 0.0240478515625,
"learning_rate": 2.2902453620586476e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9992994546890259,
"num_tokens": 6550391.0,
"step": 6200
},
{
"entropy": 1.4950265049934388,
"epoch": 3.716337522441652,
"grad_norm": 0.09228515625,
"learning_rate": 2.2842609216038303e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6560907.0,
"step": 6210
},
{
"entropy": 1.487864351272583,
"epoch": 3.722321962896469,
"grad_norm": 0.6328125,
"learning_rate": 2.2782764811490126e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996282517910003,
"num_tokens": 6571382.0,
"step": 6220
},
{
"entropy": 1.4353316664695739,
"epoch": 3.7283064033512865,
"grad_norm": 0.07177734375,
"learning_rate": 2.272292040694195e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996268630027771,
"num_tokens": 6582007.0,
"step": 6230
},
{
"entropy": 1.5229133486747741,
"epoch": 3.734290843806104,
"grad_norm": 0.004180908203125,
"learning_rate": 2.2663076002393777e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6592605.0,
"step": 6240
},
{
"entropy": 1.498645055294037,
"epoch": 3.7402752842609215,
"grad_norm": 0.017578125,
"learning_rate": 2.26032315978456e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9989560008049011,
"num_tokens": 6603206.0,
"step": 6250
},
{
"entropy": 1.4413471937179565,
"epoch": 3.746259724715739,
"grad_norm": 0.1337890625,
"learning_rate": 2.2543387193297428e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 6613700.0,
"step": 6260
},
{
"entropy": 1.4817732572555542,
"epoch": 3.7522441651705565,
"grad_norm": 0.0101318359375,
"learning_rate": 2.248354278874925e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6624266.0,
"step": 6270
},
{
"entropy": 1.485778033733368,
"epoch": 3.7582286056253738,
"grad_norm": 0.0089111328125,
"learning_rate": 2.2423698384201078e-05,
"loss": 0.0004,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 6634667.0,
"step": 6280
},
{
"entropy": 1.5231386780738831,
"epoch": 3.7642130460801915,
"grad_norm": 0.08740234375,
"learning_rate": 2.2363853979652902e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9996153831481933,
"num_tokens": 6645075.0,
"step": 6290
},
{
"entropy": 1.46702800989151,
"epoch": 3.770197486535009,
"grad_norm": 0.126953125,
"learning_rate": 2.230400957510473e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9992537319660186,
"num_tokens": 6655535.0,
"step": 6300
},
{
"entropy": 1.473765778541565,
"epoch": 3.7761819269898265,
"grad_norm": 0.0458984375,
"learning_rate": 2.2244165170556556e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6665998.0,
"step": 6310
},
{
"entropy": 1.5671903252601624,
"epoch": 3.7821663674446437,
"grad_norm": 0.045654296875,
"learning_rate": 2.218432076600838e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 6676480.0,
"step": 6320
},
{
"entropy": 1.5145403623580933,
"epoch": 3.7881508078994615,
"grad_norm": 0.0294189453125,
"learning_rate": 2.2124476361460203e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 6686964.0,
"step": 6330
},
{
"entropy": 1.5250129222869873,
"epoch": 3.7941352483542787,
"grad_norm": 0.035888671875,
"learning_rate": 2.206463195691203e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9992864787578583,
"num_tokens": 6697518.0,
"step": 6340
},
{
"entropy": 1.533209502696991,
"epoch": 3.8001196888090965,
"grad_norm": 0.03125,
"learning_rate": 2.2004787552363857e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 6708198.0,
"step": 6350
},
{
"entropy": 1.5412673473358154,
"epoch": 3.8061041292639137,
"grad_norm": 0.05029296875,
"learning_rate": 2.194494314781568e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9992578804492951,
"num_tokens": 6718602.0,
"step": 6360
},
{
"entropy": 1.5716055989265443,
"epoch": 3.8120885697187314,
"grad_norm": 0.609375,
"learning_rate": 2.1885098743267504e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9986324310302734,
"num_tokens": 6729226.0,
"step": 6370
},
{
"entropy": 1.5873057842254639,
"epoch": 3.8180730101735487,
"grad_norm": 0.0242919921875,
"learning_rate": 2.182525433871933e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 6739892.0,
"step": 6380
},
{
"entropy": 1.5300064086914062,
"epoch": 3.8240574506283664,
"grad_norm": 0.03955078125,
"learning_rate": 2.1765409934171158e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996197700500489,
"num_tokens": 6750433.0,
"step": 6390
},
{
"entropy": 1.5160815238952636,
"epoch": 3.8300418910831837,
"grad_norm": 0.016357421875,
"learning_rate": 2.1705565529622978e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996183216571808,
"num_tokens": 6760886.0,
"step": 6400
},
{
"entropy": 1.5610576629638673,
"epoch": 3.836026331538001,
"grad_norm": 0.00872802734375,
"learning_rate": 2.1645721125074805e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6771573.0,
"step": 6410
},
{
"entropy": 1.554826021194458,
"epoch": 3.8420107719928187,
"grad_norm": 0.0289306640625,
"learning_rate": 2.1585876720526632e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 6782078.0,
"step": 6420
},
{
"entropy": 1.546287226676941,
"epoch": 3.847995212447636,
"grad_norm": 0.020751953125,
"learning_rate": 2.152603231597846e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 6792786.0,
"step": 6430
},
{
"entropy": 1.5601895451545715,
"epoch": 3.8539796529024537,
"grad_norm": 0.1337890625,
"learning_rate": 2.146618791143028e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9992664158344269,
"num_tokens": 6803447.0,
"step": 6440
},
{
"entropy": 1.5343787312507629,
"epoch": 3.859964093357271,
"grad_norm": 0.03759765625,
"learning_rate": 2.1406343506882106e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9992684304714203,
"num_tokens": 6813995.0,
"step": 6450
},
{
"entropy": 1.546648907661438,
"epoch": 3.8659485338120887,
"grad_norm": 0.00958251953125,
"learning_rate": 2.1346499102333933e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 6824662.0,
"step": 6460
},
{
"entropy": 1.5174113750457763,
"epoch": 3.871932974266906,
"grad_norm": 0.0196533203125,
"learning_rate": 2.128665469778576e-05,
"loss": 0.0024,
"mean_token_accuracy": 0.9995884776115418,
"num_tokens": 6834963.0,
"step": 6470
},
{
"entropy": 1.5120728850364684,
"epoch": 3.8779174147217237,
"grad_norm": 0.1064453125,
"learning_rate": 2.122681029323758e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 6845537.0,
"step": 6480
},
{
"entropy": 1.5367425560951233,
"epoch": 3.883901855176541,
"grad_norm": 0.0279541015625,
"learning_rate": 2.1166965888689408e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 6856030.0,
"step": 6490
},
{
"entropy": 1.5492040872573853,
"epoch": 3.8898862956313582,
"grad_norm": 0.0654296875,
"learning_rate": 2.1107121484141235e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6866678.0,
"step": 6500
},
{
"entropy": 1.5331620454788208,
"epoch": 3.895870736086176,
"grad_norm": 0.039794921875,
"learning_rate": 2.104727707959306e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6877203.0,
"step": 6510
},
{
"entropy": 1.5546536684036254,
"epoch": 3.9018551765409937,
"grad_norm": 0.0201416015625,
"learning_rate": 2.0987432675044882e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9985446095466614,
"num_tokens": 6887841.0,
"step": 6520
},
{
"entropy": 1.5250784516334535,
"epoch": 3.907839616995811,
"grad_norm": 0.01287841796875,
"learning_rate": 2.092758827049671e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 6898328.0,
"step": 6530
},
{
"entropy": 1.5164906859397889,
"epoch": 3.9138240574506282,
"grad_norm": 0.8125,
"learning_rate": 2.0867743865948536e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996015965938568,
"num_tokens": 6908680.0,
"step": 6540
},
{
"entropy": 1.5334503293037414,
"epoch": 3.919808497905446,
"grad_norm": 0.185546875,
"learning_rate": 2.080789946140036e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 6919228.0,
"step": 6550
},
{
"entropy": 1.5119147062301637,
"epoch": 3.925792938360263,
"grad_norm": 0.024658203125,
"learning_rate": 2.0748055056852183e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9992709934711457,
"num_tokens": 6929730.0,
"step": 6560
},
{
"entropy": 1.622498083114624,
"epoch": 3.931777378815081,
"grad_norm": 0.1044921875,
"learning_rate": 2.068821065230401e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 6940536.0,
"step": 6570
},
{
"entropy": 1.6207013845443725,
"epoch": 3.937761819269898,
"grad_norm": 0.046630859375,
"learning_rate": 2.0628366247755837e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9993064045906067,
"num_tokens": 6951312.0,
"step": 6580
},
{
"entropy": 1.5574225544929505,
"epoch": 3.9437462597247155,
"grad_norm": 0.1220703125,
"learning_rate": 2.056852184320766e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996183216571808,
"num_tokens": 6961725.0,
"step": 6590
},
{
"entropy": 1.5951979756355286,
"epoch": 3.949730700179533,
"grad_norm": 0.0223388671875,
"learning_rate": 2.0508677438659484e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 6972283.0,
"step": 6600
},
{
"entropy": 1.6055678009986878,
"epoch": 3.955715140634351,
"grad_norm": 0.033447265625,
"learning_rate": 2.044883303411131e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 6982888.0,
"step": 6610
},
{
"entropy": 1.5691597938537598,
"epoch": 3.961699581089168,
"grad_norm": 0.039306640625,
"learning_rate": 2.0388988629563138e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 6993524.0,
"step": 6620
},
{
"entropy": 1.6033958792686462,
"epoch": 3.9676840215439855,
"grad_norm": 0.023193359375,
"learning_rate": 2.032914422501496e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7004124.0,
"step": 6630
},
{
"entropy": 1.56329288482666,
"epoch": 3.973668461998803,
"grad_norm": 0.026611328125,
"learning_rate": 2.0269299820466785e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7014539.0,
"step": 6640
},
{
"entropy": 1.5793173909187317,
"epoch": 3.9796529024536205,
"grad_norm": 0.06298828125,
"learning_rate": 2.0209455415918612e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7025021.0,
"step": 6650
},
{
"entropy": 1.515379798412323,
"epoch": 3.985637342908438,
"grad_norm": 0.005523681640625,
"learning_rate": 2.014961101137044e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7035600.0,
"step": 6660
},
{
"entropy": 1.6099679350852967,
"epoch": 3.9916217833632555,
"grad_norm": 0.0081787109375,
"learning_rate": 2.0089766606822263e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996563553810119,
"num_tokens": 7046232.0,
"step": 6670
},
{
"entropy": 1.5835115551948546,
"epoch": 3.9976062238180727,
"grad_norm": 0.01312255859375,
"learning_rate": 2.0029922202274086e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9993050158023834,
"num_tokens": 7056783.0,
"step": 6680
},
{
"entropy": 1.5779105305671692,
"epoch": 4.003590664272891,
"grad_norm": 0.058349609375,
"learning_rate": 1.9970077797725913e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9996598660945892,
"num_tokens": 7067409.0,
"step": 6690
},
{
"entropy": 1.5681239366531372,
"epoch": 4.009575104727708,
"grad_norm": 0.029541015625,
"learning_rate": 1.9910233393177737e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7078160.0,
"step": 6700
},
{
"entropy": 1.4813060760498047,
"epoch": 4.0155595451825254,
"grad_norm": 0.0186767578125,
"learning_rate": 1.9850388988629564e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7088509.0,
"step": 6710
},
{
"entropy": 1.594699537754059,
"epoch": 4.021543985637343,
"grad_norm": 0.11328125,
"learning_rate": 1.9790544584081388e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 7099175.0,
"step": 6720
},
{
"entropy": 1.6191529512405396,
"epoch": 4.02752842609216,
"grad_norm": 0.0228271484375,
"learning_rate": 1.9730700179533215e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7109906.0,
"step": 6730
},
{
"entropy": 1.5749483346939086,
"epoch": 4.033512866546978,
"grad_norm": 0.0086669921875,
"learning_rate": 1.9670855774985038e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7120497.0,
"step": 6740
},
{
"entropy": 1.5680004119873048,
"epoch": 4.039497307001795,
"grad_norm": 0.01104736328125,
"learning_rate": 1.9611011370436865e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7131051.0,
"step": 6750
},
{
"entropy": 1.6218904495239257,
"epoch": 4.045481747456613,
"grad_norm": 0.0242919921875,
"learning_rate": 1.955116696588869e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 7141748.0,
"step": 6760
},
{
"entropy": 1.5259785056114197,
"epoch": 4.05146618791143,
"grad_norm": 0.359375,
"learning_rate": 1.9491322561340516e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7152133.0,
"step": 6770
},
{
"entropy": 1.5659233808517456,
"epoch": 4.057450628366248,
"grad_norm": 0.031982421875,
"learning_rate": 1.943147815679234e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9995967745780945,
"num_tokens": 7162629.0,
"step": 6780
},
{
"entropy": 1.5872820377349854,
"epoch": 4.063435068821065,
"grad_norm": 0.486328125,
"learning_rate": 1.9371633752244166e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9992965221405029,
"num_tokens": 7173417.0,
"step": 6790
},
{
"entropy": 1.6818758726119996,
"epoch": 4.069419509275883,
"grad_norm": 0.037109375,
"learning_rate": 1.9311789347695993e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9993055462837219,
"num_tokens": 7184217.0,
"step": 6800
},
{
"entropy": 1.5825034737586976,
"epoch": 4.0754039497307,
"grad_norm": 0.12255859375,
"learning_rate": 1.9251944943147814e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7194696.0,
"step": 6810
},
{
"entropy": 1.590399932861328,
"epoch": 4.081388390185517,
"grad_norm": 0.0128173828125,
"learning_rate": 1.919210053859964e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7205359.0,
"step": 6820
},
{
"entropy": 1.5604485630989076,
"epoch": 4.087372830640335,
"grad_norm": 0.1328125,
"learning_rate": 1.9132256134051468e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7215909.0,
"step": 6830
},
{
"entropy": 1.5563522219657897,
"epoch": 4.093357271095153,
"grad_norm": 0.021484375,
"learning_rate": 1.9072411729503294e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7226295.0,
"step": 6840
},
{
"entropy": 1.549744188785553,
"epoch": 4.09934171154997,
"grad_norm": 0.08447265625,
"learning_rate": 1.9012567324955115e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7236827.0,
"step": 6850
},
{
"entropy": 1.5607978582382203,
"epoch": 4.105326152004787,
"grad_norm": 0.16796875,
"learning_rate": 1.895272292040694e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996491253376008,
"num_tokens": 7247388.0,
"step": 6860
},
{
"entropy": 1.5611071705818176,
"epoch": 4.111310592459605,
"grad_norm": 0.0284423828125,
"learning_rate": 1.889287851585877e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9995850622653961,
"num_tokens": 7257914.0,
"step": 6870
},
{
"entropy": 1.5587551593780518,
"epoch": 4.117295032914423,
"grad_norm": 0.10009765625,
"learning_rate": 1.8833034111310596e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996699690818787,
"num_tokens": 7268506.0,
"step": 6880
},
{
"entropy": 1.5508070588111877,
"epoch": 4.12327947336924,
"grad_norm": 0.0576171875,
"learning_rate": 1.8773189706762416e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7278955.0,
"step": 6890
},
{
"entropy": 1.5777355074882506,
"epoch": 4.129263913824057,
"grad_norm": 0.04296875,
"learning_rate": 1.8713345302214243e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7289604.0,
"step": 6900
},
{
"entropy": 1.5634498238563537,
"epoch": 4.135248354278875,
"grad_norm": 0.0732421875,
"learning_rate": 1.865350089766607e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7300222.0,
"step": 6910
},
{
"entropy": 1.5446319460868836,
"epoch": 4.141232794733693,
"grad_norm": 0.1396484375,
"learning_rate": 1.8593656493117897e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7310848.0,
"step": 6920
},
{
"entropy": 1.514329993724823,
"epoch": 4.14721723518851,
"grad_norm": 0.007354736328125,
"learning_rate": 1.8533812088569717e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996197700500489,
"num_tokens": 7321225.0,
"step": 6930
},
{
"entropy": 1.5420185565948485,
"epoch": 4.153201675643327,
"grad_norm": 0.443359375,
"learning_rate": 1.8473967684021544e-05,
"loss": 0.0005,
"mean_token_accuracy": 0.9996666669845581,
"num_tokens": 7331897.0,
"step": 6940
},
{
"entropy": 1.5471250891685486,
"epoch": 4.1591861160981445,
"grad_norm": 0.0162353515625,
"learning_rate": 1.841412327947337e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7342505.0,
"step": 6950
},
{
"entropy": 1.5706151604652405,
"epoch": 4.165170556552963,
"grad_norm": 0.1123046875,
"learning_rate": 1.8354278874925195e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7353115.0,
"step": 6960
},
{
"entropy": 1.5618083357810975,
"epoch": 4.17115499700778,
"grad_norm": 0.12060546875,
"learning_rate": 1.8294434470377018e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7363684.0,
"step": 6970
},
{
"entropy": 1.5535432338714599,
"epoch": 4.177139437462597,
"grad_norm": 0.037353515625,
"learning_rate": 1.8234590065828845e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7374096.0,
"step": 6980
},
{
"entropy": 1.5730727791786194,
"epoch": 4.1831238779174145,
"grad_norm": 0.361328125,
"learning_rate": 1.8174745661280672e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 7384486.0,
"step": 6990
},
{
"entropy": 1.5279602527618408,
"epoch": 4.189108318372233,
"grad_norm": 0.0274658203125,
"learning_rate": 1.8114901256732496e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 7394884.0,
"step": 7000
},
{
"entropy": 1.5611447095870972,
"epoch": 4.19509275882705,
"grad_norm": 0.091796875,
"learning_rate": 1.805505685218432e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996794879436492,
"num_tokens": 7405510.0,
"step": 7010
},
{
"entropy": 1.5359971404075623,
"epoch": 4.201077199281867,
"grad_norm": 0.00958251953125,
"learning_rate": 1.7995212447636146e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 7416021.0,
"step": 7020
},
{
"entropy": 1.5243404746055602,
"epoch": 4.2070616397366845,
"grad_norm": 0.0140380859375,
"learning_rate": 1.7935368043087973e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996655523777008,
"num_tokens": 7426626.0,
"step": 7030
},
{
"entropy": 1.6068053007125855,
"epoch": 4.213046080191502,
"grad_norm": 0.0595703125,
"learning_rate": 1.7875523638539797e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7437210.0,
"step": 7040
},
{
"entropy": 1.572656273841858,
"epoch": 4.21903052064632,
"grad_norm": 0.57421875,
"learning_rate": 1.781567923399162e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996621608734131,
"num_tokens": 7447772.0,
"step": 7050
},
{
"entropy": 1.5639835238456725,
"epoch": 4.225014961101137,
"grad_norm": 0.0390625,
"learning_rate": 1.7755834829443448e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9995951414108276,
"num_tokens": 7458249.0,
"step": 7060
},
{
"entropy": 1.5189979791641235,
"epoch": 4.230999401555954,
"grad_norm": 0.09814453125,
"learning_rate": 1.7695990424895274e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996108949184418,
"num_tokens": 7468537.0,
"step": 7070
},
{
"entropy": 1.575409507751465,
"epoch": 4.236983842010772,
"grad_norm": 0.01171875,
"learning_rate": 1.7636146020347098e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7479461.0,
"step": 7080
},
{
"entropy": 1.5510912537574768,
"epoch": 4.24296828246559,
"grad_norm": 0.03076171875,
"learning_rate": 1.757630161579892e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7489959.0,
"step": 7090
},
{
"entropy": 1.547593891620636,
"epoch": 4.248952722920407,
"grad_norm": 0.2333984375,
"learning_rate": 1.751645721125075e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 7500396.0,
"step": 7100
},
{
"entropy": 1.5397059679031373,
"epoch": 4.254937163375224,
"grad_norm": 0.023193359375,
"learning_rate": 1.7456612806702572e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7511001.0,
"step": 7110
},
{
"entropy": 1.566526746749878,
"epoch": 4.260921603830042,
"grad_norm": 0.07470703125,
"learning_rate": 1.73967684021544e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7521548.0,
"step": 7120
},
{
"entropy": 1.59982568025589,
"epoch": 4.266906044284859,
"grad_norm": 0.0223388671875,
"learning_rate": 1.7336923997606223e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996296286582946,
"num_tokens": 7532357.0,
"step": 7130
},
{
"entropy": 1.5388713359832764,
"epoch": 4.272890484739677,
"grad_norm": 0.0159912109375,
"learning_rate": 1.727707959305805e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996551752090455,
"num_tokens": 7543029.0,
"step": 7140
},
{
"entropy": 1.613177978992462,
"epoch": 4.278874925194494,
"grad_norm": 0.024169921875,
"learning_rate": 1.7217235188509873e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7553583.0,
"step": 7150
},
{
"entropy": 1.5655102014541626,
"epoch": 4.284859365649312,
"grad_norm": 0.0181884765625,
"learning_rate": 1.71573907839617e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7564054.0,
"step": 7160
},
{
"entropy": 1.5656741619110108,
"epoch": 4.290843806104129,
"grad_norm": 0.00689697265625,
"learning_rate": 1.7097546379413524e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7574548.0,
"step": 7170
},
{
"entropy": 1.5077351450920105,
"epoch": 4.296828246558947,
"grad_norm": 0.027099609375,
"learning_rate": 1.703770197486535e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 7585027.0,
"step": 7180
},
{
"entropy": 1.5920394659042358,
"epoch": 4.302812687013764,
"grad_norm": 0.00830078125,
"learning_rate": 1.6977857570317175e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7595631.0,
"step": 7190
},
{
"entropy": 1.5724570870399475,
"epoch": 4.308797127468582,
"grad_norm": 0.041259765625,
"learning_rate": 1.6918013165769e-05,
"loss": 0.0017,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 7606196.0,
"step": 7200
},
{
"entropy": 1.5241859555244446,
"epoch": 4.314781567923399,
"grad_norm": 0.03125,
"learning_rate": 1.6858168761220825e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996539771556854,
"num_tokens": 7616506.0,
"step": 7210
},
{
"entropy": 1.5611043930053712,
"epoch": 4.320766008378216,
"grad_norm": 0.0286865234375,
"learning_rate": 1.6798324356672652e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 7627126.0,
"step": 7220
},
{
"entropy": 1.5357125282287598,
"epoch": 4.326750448833034,
"grad_norm": 0.03955078125,
"learning_rate": 1.6738479952124476e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7637728.0,
"step": 7230
},
{
"entropy": 1.507653498649597,
"epoch": 4.332734889287852,
"grad_norm": 0.060791015625,
"learning_rate": 1.6678635547576303e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7648244.0,
"step": 7240
},
{
"entropy": 1.539263367652893,
"epoch": 4.338719329742669,
"grad_norm": 0.33984375,
"learning_rate": 1.661879114302813e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9986226677894592,
"num_tokens": 7658828.0,
"step": 7250
},
{
"entropy": 1.558404839038849,
"epoch": 4.344703770197486,
"grad_norm": 0.01239013671875,
"learning_rate": 1.655894673847995e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996478855609894,
"num_tokens": 7669461.0,
"step": 7260
},
{
"entropy": 1.5491536259651184,
"epoch": 4.350688210652304,
"grad_norm": 0.0289306640625,
"learning_rate": 1.6499102333931777e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7680062.0,
"step": 7270
},
{
"entropy": 1.5289602637290955,
"epoch": 4.356672651107122,
"grad_norm": 0.0250244140625,
"learning_rate": 1.6439257929383604e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 7690585.0,
"step": 7280
},
{
"entropy": 1.5975135564804077,
"epoch": 4.362657091561939,
"grad_norm": 0.064453125,
"learning_rate": 1.637941352483543e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7701131.0,
"step": 7290
},
{
"entropy": 1.6185077905654908,
"epoch": 4.368641532016756,
"grad_norm": 0.01171875,
"learning_rate": 1.631956912028725e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7711714.0,
"step": 7300
},
{
"entropy": 1.585240626335144,
"epoch": 4.3746259724715735,
"grad_norm": 0.107421875,
"learning_rate": 1.6259724715739078e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996539771556854,
"num_tokens": 7722427.0,
"step": 7310
},
{
"entropy": 1.621202325820923,
"epoch": 4.380610412926392,
"grad_norm": 0.0223388671875,
"learning_rate": 1.6199880311190905e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7733131.0,
"step": 7320
},
{
"entropy": 1.5082001328468322,
"epoch": 4.386594853381209,
"grad_norm": 0.0228271484375,
"learning_rate": 1.6140035906642732e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 7743525.0,
"step": 7330
},
{
"entropy": 1.5455591917037963,
"epoch": 4.392579293836026,
"grad_norm": 0.0106201171875,
"learning_rate": 1.6080191502094552e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7754121.0,
"step": 7340
},
{
"entropy": 1.5201449394226074,
"epoch": 4.3985637342908435,
"grad_norm": 0.142578125,
"learning_rate": 1.602034709754638e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996078431606292,
"num_tokens": 7764636.0,
"step": 7350
},
{
"entropy": 1.5823255062103272,
"epoch": 4.404548174745662,
"grad_norm": 0.0634765625,
"learning_rate": 1.5960502692998206e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.999664431810379,
"num_tokens": 7775359.0,
"step": 7360
},
{
"entropy": 1.5282755970954895,
"epoch": 4.410532615200479,
"grad_norm": 0.055908203125,
"learning_rate": 1.5900658288450033e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 7785851.0,
"step": 7370
},
{
"entropy": 1.5407449364662171,
"epoch": 4.416517055655296,
"grad_norm": 0.01806640625,
"learning_rate": 1.5840813883901853e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996350347995758,
"num_tokens": 7796400.0,
"step": 7380
},
{
"entropy": 1.5680436968803406,
"epoch": 4.4225014961101135,
"grad_norm": 0.0283203125,
"learning_rate": 1.578096947935368e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7806854.0,
"step": 7390
},
{
"entropy": 1.5389506220817566,
"epoch": 4.428485936564931,
"grad_norm": 0.05908203125,
"learning_rate": 1.5721125074805507e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9993071675300598,
"num_tokens": 7817406.0,
"step": 7400
},
{
"entropy": 1.5411053538322448,
"epoch": 4.434470377019749,
"grad_norm": 0.005645751953125,
"learning_rate": 1.566128067025733e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7827914.0,
"step": 7410
},
{
"entropy": 1.5534313559532165,
"epoch": 4.440454817474566,
"grad_norm": 0.6640625,
"learning_rate": 1.5601436265709155e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996677756309509,
"num_tokens": 7838628.0,
"step": 7420
},
{
"entropy": 1.5759409070014954,
"epoch": 4.446439257929383,
"grad_norm": 0.50390625,
"learning_rate": 1.554159186116098e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9993143141269684,
"num_tokens": 7849032.0,
"step": 7430
},
{
"entropy": 1.5122934222221374,
"epoch": 4.452423698384201,
"grad_norm": 0.07275390625,
"learning_rate": 1.548174745661281e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9993093550205231,
"num_tokens": 7859524.0,
"step": 7440
},
{
"entropy": 1.5497113466262817,
"epoch": 4.458408138839019,
"grad_norm": 0.166015625,
"learning_rate": 1.5421903052064632e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.999622642993927,
"num_tokens": 7870109.0,
"step": 7450
},
{
"entropy": 1.5085930705070496,
"epoch": 4.464392579293836,
"grad_norm": 0.1376953125,
"learning_rate": 1.5362058647516456e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 7880529.0,
"step": 7460
},
{
"entropy": 1.5613359570503236,
"epoch": 4.470377019748653,
"grad_norm": 0.0047607421875,
"learning_rate": 1.5302214242968283e-05,
"loss": 0.0005,
"mean_token_accuracy": 0.9996350347995758,
"num_tokens": 7891117.0,
"step": 7470
},
{
"entropy": 1.5748088836669922,
"epoch": 4.476361460203471,
"grad_norm": 0.01080322265625,
"learning_rate": 1.524236983842011e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7901694.0,
"step": 7480
},
{
"entropy": 1.543941354751587,
"epoch": 4.482345900658289,
"grad_norm": 0.0306396484375,
"learning_rate": 1.5182525433871932e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 7912307.0,
"step": 7490
},
{
"entropy": 1.5929329633712768,
"epoch": 4.488330341113106,
"grad_norm": 0.0185546875,
"learning_rate": 1.5122681029323759e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7923076.0,
"step": 7500
},
{
"entropy": 1.5893060326576234,
"epoch": 4.494314781567923,
"grad_norm": 0.046630859375,
"learning_rate": 1.5062836624775584e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 7933915.0,
"step": 7510
},
{
"entropy": 1.5541257858276367,
"epoch": 4.500299222022741,
"grad_norm": 0.0147705078125,
"learning_rate": 1.5002992220227411e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 7944357.0,
"step": 7520
},
{
"entropy": 1.500285267829895,
"epoch": 4.506283662477558,
"grad_norm": 0.0264892578125,
"learning_rate": 1.4943147815679234e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9992882609367371,
"num_tokens": 7954810.0,
"step": 7530
},
{
"entropy": 1.5186502814292908,
"epoch": 4.512268102932376,
"grad_norm": 0.031005859375,
"learning_rate": 1.488330341113106e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996598660945892,
"num_tokens": 7965436.0,
"step": 7540
},
{
"entropy": 1.6063961029052733,
"epoch": 4.518252543387193,
"grad_norm": 0.734375,
"learning_rate": 1.4823459006582885e-05,
"loss": 0.002,
"mean_token_accuracy": 0.9993270456790924,
"num_tokens": 7976072.0,
"step": 7550
},
{
"entropy": 1.5510151267051697,
"epoch": 4.524236983842011,
"grad_norm": 0.0439453125,
"learning_rate": 1.476361460203471e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 7986738.0,
"step": 7560
},
{
"entropy": 1.55672527551651,
"epoch": 4.530221424296828,
"grad_norm": 0.0478515625,
"learning_rate": 1.4703770197486534e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996282517910003,
"num_tokens": 7997279.0,
"step": 7570
},
{
"entropy": 1.5957432746887208,
"epoch": 4.536205864751645,
"grad_norm": 0.0120849609375,
"learning_rate": 1.4643925792938361e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 8007886.0,
"step": 7580
},
{
"entropy": 1.5532257080078125,
"epoch": 4.542190305206463,
"grad_norm": 0.00994873046875,
"learning_rate": 1.4584081388390185e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996478855609894,
"num_tokens": 8018282.0,
"step": 7590
},
{
"entropy": 1.5621094346046447,
"epoch": 4.548174745661281,
"grad_norm": 0.1787109375,
"learning_rate": 1.4524236983842012e-05,
"loss": 0.0018,
"mean_token_accuracy": 0.999331396818161,
"num_tokens": 8028884.0,
"step": 7600
},
{
"entropy": 1.5396092414855957,
"epoch": 4.554159186116098,
"grad_norm": 0.0078125,
"learning_rate": 1.4464392579293835e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 8039399.0,
"step": 7610
},
{
"entropy": 1.5150254487991333,
"epoch": 4.560143626570916,
"grad_norm": 0.055908203125,
"learning_rate": 1.4404548174745662e-05,
"loss": 0.0014,
"mean_token_accuracy": 0.9996621608734131,
"num_tokens": 8049911.0,
"step": 7620
},
{
"entropy": 1.5425100445747375,
"epoch": 4.566128067025733,
"grad_norm": 0.068359375,
"learning_rate": 1.4344703770197486e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8060483.0,
"step": 7630
},
{
"entropy": 1.6269634127616883,
"epoch": 4.572112507480551,
"grad_norm": 0.0233154296875,
"learning_rate": 1.4284859365649313e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9988581538200378,
"num_tokens": 8071125.0,
"step": 7640
},
{
"entropy": 1.5476625084877014,
"epoch": 4.578096947935368,
"grad_norm": 0.009765625,
"learning_rate": 1.4225014961101136e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8081725.0,
"step": 7650
},
{
"entropy": 1.5690398454666137,
"epoch": 4.584081388390185,
"grad_norm": 0.4140625,
"learning_rate": 1.4165170556552963e-05,
"loss": 0.0019,
"mean_token_accuracy": 0.9993531167507171,
"num_tokens": 8092450.0,
"step": 7660
},
{
"entropy": 1.5727166771888732,
"epoch": 4.590065828845003,
"grad_norm": 0.1240234375,
"learning_rate": 1.4105326152004787e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.99963099360466,
"num_tokens": 8102872.0,
"step": 7670
},
{
"entropy": 1.6031971335411073,
"epoch": 4.596050269299821,
"grad_norm": 0.00897216796875,
"learning_rate": 1.4045481747456614e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 8113419.0,
"step": 7680
},
{
"entropy": 1.609036648273468,
"epoch": 4.602034709754638,
"grad_norm": 0.00921630859375,
"learning_rate": 1.3985637342908439e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8124188.0,
"step": 7690
},
{
"entropy": 1.557384467124939,
"epoch": 4.608019150209455,
"grad_norm": 0.0086669921875,
"learning_rate": 1.3925792938360264e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8134745.0,
"step": 7700
},
{
"entropy": 1.6228225708007813,
"epoch": 4.614003590664273,
"grad_norm": 0.150390625,
"learning_rate": 1.386594853381209e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8145468.0,
"step": 7710
},
{
"entropy": 1.6001514673233033,
"epoch": 4.619988031119091,
"grad_norm": 0.408203125,
"learning_rate": 1.3806104129263913e-05,
"loss": 0.0006,
"mean_token_accuracy": 0.9996153831481933,
"num_tokens": 8156009.0,
"step": 7720
},
{
"entropy": 1.5820931553840638,
"epoch": 4.625972471573908,
"grad_norm": 0.162109375,
"learning_rate": 1.374625972471574e-05,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 8166610.0,
"step": 7730
},
{
"entropy": 1.5469802737236023,
"epoch": 4.631956912028725,
"grad_norm": 0.150390625,
"learning_rate": 1.3686415320167564e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8177136.0,
"step": 7740
},
{
"entropy": 1.6631050109863281,
"epoch": 4.6379413524835424,
"grad_norm": 0.005340576171875,
"learning_rate": 1.3626570915619391e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8187813.0,
"step": 7750
},
{
"entropy": 1.6081872582435608,
"epoch": 4.643925792938361,
"grad_norm": 0.01251220703125,
"learning_rate": 1.3566726511071214e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8198417.0,
"step": 7760
},
{
"entropy": 1.6233905792236327,
"epoch": 4.649910233393178,
"grad_norm": 0.0213623046875,
"learning_rate": 1.3506882106523041e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 8209083.0,
"step": 7770
},
{
"entropy": 1.5728107690811157,
"epoch": 4.655894673847995,
"grad_norm": 0.002838134765625,
"learning_rate": 1.3447037701974865e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 8219442.0,
"step": 7780
},
{
"entropy": 1.5606521964073181,
"epoch": 4.661879114302812,
"grad_norm": 0.0908203125,
"learning_rate": 1.3387193297426692e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8230155.0,
"step": 7790
},
{
"entropy": 1.572113835811615,
"epoch": 4.667863554757631,
"grad_norm": 0.033203125,
"learning_rate": 1.3327348892878516e-05,
"loss": 0.001,
"mean_token_accuracy": 0.9996677756309509,
"num_tokens": 8240827.0,
"step": 7800
},
{
"entropy": 1.5568471670150756,
"epoch": 4.673847995212448,
"grad_norm": 0.095703125,
"learning_rate": 1.3267504488330343e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8251414.0,
"step": 7810
},
{
"entropy": 1.5330272674560548,
"epoch": 4.679832435667265,
"grad_norm": 0.376953125,
"learning_rate": 1.3207660083782166e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8261779.0,
"step": 7820
},
{
"entropy": 1.5626088619232177,
"epoch": 4.685816876122082,
"grad_norm": 0.004425048828125,
"learning_rate": 1.3147815679233993e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 8272280.0,
"step": 7830
},
{
"entropy": 1.547228252887726,
"epoch": 4.6918013165769,
"grad_norm": 0.0576171875,
"learning_rate": 1.3087971274685817e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996078431606292,
"num_tokens": 8282690.0,
"step": 7840
},
{
"entropy": 1.5491391181945802,
"epoch": 4.697785757031718,
"grad_norm": 0.04248046875,
"learning_rate": 1.3028126870137644e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8293364.0,
"step": 7850
},
{
"entropy": 1.5352839589118958,
"epoch": 4.703770197486535,
"grad_norm": 0.055419921875,
"learning_rate": 1.2968282465589467e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8303770.0,
"step": 7860
},
{
"entropy": 1.5438146233558654,
"epoch": 4.709754637941352,
"grad_norm": 0.0174560546875,
"learning_rate": 1.2908438061041293e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 8314295.0,
"step": 7870
},
{
"entropy": 1.5843072772026061,
"epoch": 4.71573907839617,
"grad_norm": 0.09033203125,
"learning_rate": 1.2848593656493118e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8325010.0,
"step": 7880
},
{
"entropy": 1.6056226730346679,
"epoch": 4.721723518850988,
"grad_norm": 0.00604248046875,
"learning_rate": 1.2788749251944943e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8335587.0,
"step": 7890
},
{
"entropy": 1.5527130126953126,
"epoch": 4.727707959305805,
"grad_norm": 0.05322265625,
"learning_rate": 1.2728904847396769e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 8346105.0,
"step": 7900
},
{
"entropy": 1.577527070045471,
"epoch": 4.733692399760622,
"grad_norm": 0.003692626953125,
"learning_rate": 1.2669060442848594e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8356730.0,
"step": 7910
},
{
"entropy": 1.5795585989952088,
"epoch": 4.73967684021544,
"grad_norm": 0.00188446044921875,
"learning_rate": 1.2609216038300419e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8367484.0,
"step": 7920
},
{
"entropy": 1.5367600798606873,
"epoch": 4.745661280670257,
"grad_norm": 0.0303955078125,
"learning_rate": 1.2549371633752244e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8377998.0,
"step": 7930
},
{
"entropy": 1.5698032140731812,
"epoch": 4.751645721125075,
"grad_norm": 0.013916015625,
"learning_rate": 1.248952722920407e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996503472328186,
"num_tokens": 8388556.0,
"step": 7940
},
{
"entropy": 1.5445321798324585,
"epoch": 4.757630161579892,
"grad_norm": 0.003143310546875,
"learning_rate": 1.2429682824655895e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8399096.0,
"step": 7950
},
{
"entropy": 1.5702390313148498,
"epoch": 4.76361460203471,
"grad_norm": 0.31640625,
"learning_rate": 1.236983842010772e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8409625.0,
"step": 7960
},
{
"entropy": 1.579737651348114,
"epoch": 4.769599042489527,
"grad_norm": 0.265625,
"learning_rate": 1.2309994015559546e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996478855609894,
"num_tokens": 8420123.0,
"step": 7970
},
{
"entropy": 1.475909948348999,
"epoch": 4.775583482944345,
"grad_norm": 0.373046875,
"learning_rate": 1.2250149611011371e-05,
"loss": 0.0009,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 8430614.0,
"step": 7980
},
{
"entropy": 1.5520552158355714,
"epoch": 4.781567923399162,
"grad_norm": 0.00653076171875,
"learning_rate": 1.2190305206463196e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8441217.0,
"step": 7990
},
{
"entropy": 1.6095691800117493,
"epoch": 4.78755236385398,
"grad_norm": 0.28515625,
"learning_rate": 1.2130460801915021e-05,
"loss": 0.0013,
"mean_token_accuracy": 0.9992459297180176,
"num_tokens": 8451802.0,
"step": 8000
},
{
"entropy": 1.5282339096069335,
"epoch": 4.793536804308797,
"grad_norm": 0.047607421875,
"learning_rate": 1.2070616397366847e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8462413.0,
"step": 8010
},
{
"entropy": 1.6001452922821044,
"epoch": 4.799521244763614,
"grad_norm": 0.06591796875,
"learning_rate": 1.201077199281867e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8473104.0,
"step": 8020
},
{
"entropy": 1.5145896315574645,
"epoch": 4.805505685218432,
"grad_norm": 0.016845703125,
"learning_rate": 1.1950927588270497e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8483737.0,
"step": 8030
},
{
"entropy": 1.5419116020202637,
"epoch": 4.81149012567325,
"grad_norm": 0.05029296875,
"learning_rate": 1.1891083183722321e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8494357.0,
"step": 8040
},
{
"entropy": 1.549897277355194,
"epoch": 4.817474566128067,
"grad_norm": 0.08203125,
"learning_rate": 1.1831238779174148e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8504875.0,
"step": 8050
},
{
"entropy": 1.5398707032203673,
"epoch": 4.823459006582884,
"grad_norm": 0.038818359375,
"learning_rate": 1.1771394374625972e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8515439.0,
"step": 8060
},
{
"entropy": 1.5555207967758178,
"epoch": 4.829443447037702,
"grad_norm": 0.03271484375,
"learning_rate": 1.1711549970077799e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996376812458039,
"num_tokens": 8525836.0,
"step": 8070
},
{
"entropy": 1.5203309297561645,
"epoch": 4.83542788749252,
"grad_norm": 0.018310546875,
"learning_rate": 1.1651705565529622e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8536193.0,
"step": 8080
},
{
"entropy": 1.5281639218330383,
"epoch": 4.841412327947337,
"grad_norm": 0.0033111572265625,
"learning_rate": 1.1591861160981449e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8546804.0,
"step": 8090
},
{
"entropy": 1.5610571384429932,
"epoch": 4.847396768402154,
"grad_norm": 0.0130615234375,
"learning_rate": 1.1532016756433273e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8557454.0,
"step": 8100
},
{
"entropy": 1.5801493525505066,
"epoch": 4.853381208856971,
"grad_norm": 0.0035400390625,
"learning_rate": 1.14721723518851e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996699690818787,
"num_tokens": 8568170.0,
"step": 8110
},
{
"entropy": 1.559760570526123,
"epoch": 4.85936564931179,
"grad_norm": 0.07568359375,
"learning_rate": 1.1412327947336923e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8578714.0,
"step": 8120
},
{
"entropy": 1.5482122898101807,
"epoch": 4.865350089766607,
"grad_norm": 0.0179443359375,
"learning_rate": 1.135248354278875e-05,
"loss": 0.0012,
"mean_token_accuracy": 0.9996453881263733,
"num_tokens": 8589194.0,
"step": 8130
},
{
"entropy": 1.5715525984764098,
"epoch": 4.871334530221424,
"grad_norm": 0.099609375,
"learning_rate": 1.1292639138240574e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8599725.0,
"step": 8140
},
{
"entropy": 1.4884485244750976,
"epoch": 4.877318970676241,
"grad_norm": 0.126953125,
"learning_rate": 1.12327947336924e-05,
"loss": 0.0016,
"mean_token_accuracy": 0.9992669522762299,
"num_tokens": 8610166.0,
"step": 8150
},
{
"entropy": 1.5385358095169068,
"epoch": 4.88330341113106,
"grad_norm": 0.07568359375,
"learning_rate": 1.1172950329144226e-05,
"loss": 0.0008,
"mean_token_accuracy": 0.9996153831481933,
"num_tokens": 8620615.0,
"step": 8160
},
{
"entropy": 1.5727779269218445,
"epoch": 4.889287851585877,
"grad_norm": 0.00144195556640625,
"learning_rate": 1.111310592459605e-05,
"loss": 0.0011,
"mean_token_accuracy": 0.9996666669845581,
"num_tokens": 8631292.0,
"step": 8170
},
{
"entropy": 1.4883596062660218,
"epoch": 4.895272292040694,
"grad_norm": 0.00677490234375,
"learning_rate": 1.1053261520047877e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8641757.0,
"step": 8180
},
{
"entropy": 1.6173644065856934,
"epoch": 4.901256732495511,
"grad_norm": 0.08984375,
"learning_rate": 1.09934171154997e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8652330.0,
"step": 8190
},
{
"entropy": 1.56162348985672,
"epoch": 4.907241172950329,
"grad_norm": 0.025634765625,
"learning_rate": 1.0933572710951527e-05,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8662973.0,
"step": 8200
},
{
"entropy": 1.5082152366638184,
"epoch": 4.913225613405147,
"grad_norm": 0.011474609375,
"learning_rate": 1.0873728306403351e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8673530.0,
"step": 8210
},
{
"entropy": 1.553872811794281,
"epoch": 4.919210053859964,
"grad_norm": 0.0213623046875,
"learning_rate": 1.0813883901855178e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8684030.0,
"step": 8220
},
{
"entropy": 1.5669398188591004,
"epoch": 4.925194494314781,
"grad_norm": 0.09423828125,
"learning_rate": 1.0754039497307001e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 8694514.0,
"step": 8230
},
{
"entropy": 1.540639054775238,
"epoch": 4.931178934769599,
"grad_norm": 0.12451171875,
"learning_rate": 1.0694195092758828e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8704968.0,
"step": 8240
},
{
"entropy": 1.6199857950210572,
"epoch": 4.937163375224417,
"grad_norm": 0.0419921875,
"learning_rate": 1.0634350688210652e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8715556.0,
"step": 8250
},
{
"entropy": 1.6003392696380616,
"epoch": 4.943147815679234,
"grad_norm": 0.009765625,
"learning_rate": 1.0574506283662479e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8726172.0,
"step": 8260
},
{
"entropy": 1.5800185084342957,
"epoch": 4.949132256134051,
"grad_norm": 0.06591796875,
"learning_rate": 1.0514661879114303e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996621608734131,
"num_tokens": 8736769.0,
"step": 8270
},
{
"entropy": 1.5508555054664612,
"epoch": 4.955116696588869,
"grad_norm": 0.01373291015625,
"learning_rate": 1.045481747456613e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9996031761169434,
"num_tokens": 8747098.0,
"step": 8280
},
{
"entropy": 1.5614091634750367,
"epoch": 4.961101137043686,
"grad_norm": 0.0020599365234375,
"learning_rate": 1.0394973070017953e-05,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8757653.0,
"step": 8290
},
{
"entropy": 1.6161825299263,
"epoch": 4.967085577498504,
"grad_norm": 0.4453125,
"learning_rate": 1.033512866546978e-05,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 8768239.0,
"step": 8300
},
{
"entropy": 1.5423355102539062,
"epoch": 4.973070017953321,
"grad_norm": 0.023193359375,
"learning_rate": 1.0275284260921604e-05,
"loss": 0.0007,
"mean_token_accuracy": 0.9996376812458039,
"num_tokens": 8778716.0,
"step": 8310
},
{
"entropy": 1.5606900930404664,
"epoch": 4.979054458408139,
"grad_norm": 0.0830078125,
"learning_rate": 1.0215439856373429e-05,
"loss": 0.0015,
"mean_token_accuracy": 0.9992770373821258,
"num_tokens": 8789340.0,
"step": 8320
},
{
"entropy": 1.5358200788497924,
"epoch": 4.985038898862956,
"grad_norm": 0.048583984375,
"learning_rate": 1.0155595451825254e-05,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8799853.0,
"step": 8330
},
{
"entropy": 1.5231838941574096,
"epoch": 4.991023339317774,
"grad_norm": 0.016357421875,
"learning_rate": 1.009575104727708e-05,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8810489.0,
"step": 8340
},
{
"entropy": 1.5757528066635131,
"epoch": 4.997007779772591,
"grad_norm": 0.008056640625,
"learning_rate": 1.0035906642728905e-05,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8820931.0,
"step": 8350
},
{
"entropy": 1.5420665383338927,
"epoch": 5.002992220227409,
"grad_norm": 0.018798828125,
"learning_rate": 9.97606223818073e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8831504.0,
"step": 8360
},
{
"entropy": 1.5548468589782716,
"epoch": 5.008976660682226,
"grad_norm": 0.05908203125,
"learning_rate": 9.916217833632556e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8842213.0,
"step": 8370
},
{
"entropy": 1.5424367904663085,
"epoch": 5.014961101137044,
"grad_norm": 0.022705078125,
"learning_rate": 9.85637342908438e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 8852738.0,
"step": 8380
},
{
"entropy": 1.5555650353431703,
"epoch": 5.020945541591861,
"grad_norm": 0.042236328125,
"learning_rate": 9.796529024536206e-06,
"loss": 0.0011,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 8863141.0,
"step": 8390
},
{
"entropy": 1.5822317361831666,
"epoch": 5.026929982046679,
"grad_norm": 0.138671875,
"learning_rate": 9.736684619988031e-06,
"loss": 0.0014,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 8873691.0,
"step": 8400
},
{
"entropy": 1.5569300770759582,
"epoch": 5.032914422501496,
"grad_norm": 0.03955078125,
"learning_rate": 9.676840215439857e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 8884255.0,
"step": 8410
},
{
"entropy": 1.6120752453804017,
"epoch": 5.038898862956313,
"grad_norm": 0.02880859375,
"learning_rate": 9.616995810891682e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8894845.0,
"step": 8420
},
{
"entropy": 1.5797799468040465,
"epoch": 5.044883303411131,
"grad_norm": 0.08447265625,
"learning_rate": 9.557151406343507e-06,
"loss": 0.0006,
"mean_token_accuracy": 0.9996254682540894,
"num_tokens": 8905268.0,
"step": 8430
},
{
"entropy": 1.5771249175071715,
"epoch": 5.050867743865949,
"grad_norm": 0.0830078125,
"learning_rate": 9.497307001795333e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8915783.0,
"step": 8440
},
{
"entropy": 1.5601511240005492,
"epoch": 5.056852184320766,
"grad_norm": 0.0201416015625,
"learning_rate": 9.437462597247158e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8926367.0,
"step": 8450
},
{
"entropy": 1.5705560445785522,
"epoch": 5.062836624775583,
"grad_norm": 0.1025390625,
"learning_rate": 9.377618192698983e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996441304683685,
"num_tokens": 8936810.0,
"step": 8460
},
{
"entropy": 1.5224881649017334,
"epoch": 5.068821065230401,
"grad_norm": 0.01129150390625,
"learning_rate": 9.317773788150807e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8947316.0,
"step": 8470
},
{
"entropy": 1.5385950803756714,
"epoch": 5.074805505685219,
"grad_norm": 0.1376953125,
"learning_rate": 9.257929383602634e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 8957870.0,
"step": 8480
},
{
"entropy": 1.583323359489441,
"epoch": 5.080789946140036,
"grad_norm": 0.38671875,
"learning_rate": 9.198084979054457e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 8968652.0,
"step": 8490
},
{
"entropy": 1.524876642227173,
"epoch": 5.086774386594853,
"grad_norm": 0.056884765625,
"learning_rate": 9.138240574506284e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 8979194.0,
"step": 8500
},
{
"entropy": 1.565677809715271,
"epoch": 5.09275882704967,
"grad_norm": 0.04345703125,
"learning_rate": 9.078396169958108e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 8989644.0,
"step": 8510
},
{
"entropy": 1.5748510122299195,
"epoch": 5.098743267504489,
"grad_norm": 0.0286865234375,
"learning_rate": 9.018551765409935e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9000200.0,
"step": 8520
},
{
"entropy": 1.556013298034668,
"epoch": 5.104727707959306,
"grad_norm": 0.0235595703125,
"learning_rate": 8.958707360861759e-06,
"loss": 0.0006,
"mean_token_accuracy": 0.999622642993927,
"num_tokens": 9010599.0,
"step": 8530
},
{
"entropy": 1.5656542539596559,
"epoch": 5.110712148414123,
"grad_norm": 0.2197265625,
"learning_rate": 8.898862956313585e-06,
"loss": 0.0011,
"mean_token_accuracy": 0.9993050158023834,
"num_tokens": 9021333.0,
"step": 8540
},
{
"entropy": 1.597491943836212,
"epoch": 5.11669658886894,
"grad_norm": 0.038818359375,
"learning_rate": 8.839018551765409e-06,
"loss": 0.0012,
"mean_token_accuracy": 0.9996254682540894,
"num_tokens": 9031830.0,
"step": 8550
},
{
"entropy": 1.5429089546203614,
"epoch": 5.122681029323759,
"grad_norm": 0.035888671875,
"learning_rate": 8.779174147217236e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9042318.0,
"step": 8560
},
{
"entropy": 1.6095686435699463,
"epoch": 5.128665469778576,
"grad_norm": 0.044677734375,
"learning_rate": 8.71932974266906e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996539771556854,
"num_tokens": 9052817.0,
"step": 8570
},
{
"entropy": 1.5880122184753418,
"epoch": 5.134649910233393,
"grad_norm": 0.0098876953125,
"learning_rate": 8.659485338120887e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9063460.0,
"step": 8580
},
{
"entropy": 1.5641863226890564,
"epoch": 5.14063435068821,
"grad_norm": 0.00933837890625,
"learning_rate": 8.59964093357271e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9074005.0,
"step": 8590
},
{
"entropy": 1.5898365259170533,
"epoch": 5.146618791143029,
"grad_norm": 0.0078125,
"learning_rate": 8.539796529024537e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9084684.0,
"step": 8600
},
{
"entropy": 1.550035560131073,
"epoch": 5.152603231597846,
"grad_norm": 0.0703125,
"learning_rate": 8.479952124476363e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9095288.0,
"step": 8610
},
{
"entropy": 1.5923567295074463,
"epoch": 5.158587672052663,
"grad_norm": 0.02392578125,
"learning_rate": 8.420107719928186e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996478855609894,
"num_tokens": 9105815.0,
"step": 8620
},
{
"entropy": 1.5446609854698181,
"epoch": 5.16457211250748,
"grad_norm": 0.0108642578125,
"learning_rate": 8.360263315380013e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9116319.0,
"step": 8630
},
{
"entropy": 1.5514729619026184,
"epoch": 5.170556552962298,
"grad_norm": 0.0205078125,
"learning_rate": 8.300418910831837e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9126714.0,
"step": 8640
},
{
"entropy": 1.5688727736473083,
"epoch": 5.176540993417116,
"grad_norm": 0.0169677734375,
"learning_rate": 8.240574506283664e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9137258.0,
"step": 8650
},
{
"entropy": 1.5155110716819764,
"epoch": 5.182525433871933,
"grad_norm": 0.019287109375,
"learning_rate": 8.180730101735487e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9147983.0,
"step": 8660
},
{
"entropy": 1.576178824901581,
"epoch": 5.18850987432675,
"grad_norm": 0.1416015625,
"learning_rate": 8.120885697187314e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9158424.0,
"step": 8670
},
{
"entropy": 1.6217318296432495,
"epoch": 5.194494314781568,
"grad_norm": 0.012939453125,
"learning_rate": 8.061041292639138e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996515691280365,
"num_tokens": 9169127.0,
"step": 8680
},
{
"entropy": 1.5923057436943053,
"epoch": 5.200478755236386,
"grad_norm": 0.138671875,
"learning_rate": 8.001196888090965e-06,
"loss": 0.0009,
"mean_token_accuracy": 0.9996282517910003,
"num_tokens": 9179719.0,
"step": 8690
},
{
"entropy": 1.5494077682495118,
"epoch": 5.206463195691203,
"grad_norm": 0.4921875,
"learning_rate": 7.941352483542788e-06,
"loss": 0.0009,
"mean_token_accuracy": 0.9996153831481933,
"num_tokens": 9190368.0,
"step": 8700
},
{
"entropy": 1.6158392786979676,
"epoch": 5.21244763614602,
"grad_norm": 0.00604248046875,
"learning_rate": 7.881508078994615e-06,
"loss": 0.0005,
"mean_token_accuracy": 0.9996515691280365,
"num_tokens": 9201014.0,
"step": 8710
},
{
"entropy": 1.5365092873573303,
"epoch": 5.218432076600838,
"grad_norm": 0.010986328125,
"learning_rate": 7.821663674446439e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9211547.0,
"step": 8720
},
{
"entropy": 1.6001612663269043,
"epoch": 5.224416517055655,
"grad_norm": 0.00537109375,
"learning_rate": 7.761819269898266e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9222245.0,
"step": 8730
},
{
"entropy": 1.5610676050186156,
"epoch": 5.230400957510473,
"grad_norm": 0.021728515625,
"learning_rate": 7.70197486535009e-06,
"loss": 0.0021,
"mean_token_accuracy": 0.9992975473403931,
"num_tokens": 9232882.0,
"step": 8740
},
{
"entropy": 1.5196431040763856,
"epoch": 5.23638539796529,
"grad_norm": 0.1875,
"learning_rate": 7.642130460801917e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9243354.0,
"step": 8750
},
{
"entropy": 1.5230332970619203,
"epoch": 5.242369838420108,
"grad_norm": 0.00927734375,
"learning_rate": 7.58228605625374e-06,
"loss": 0.001,
"mean_token_accuracy": 1.0,
"num_tokens": 9253906.0,
"step": 8760
},
{
"entropy": 1.56902277469635,
"epoch": 5.248354278874925,
"grad_norm": 0.1689453125,
"learning_rate": 7.522441651705565e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9264531.0,
"step": 8770
},
{
"entropy": 1.5179225206375122,
"epoch": 5.254338719329743,
"grad_norm": 0.00555419921875,
"learning_rate": 7.462597247157391e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9275165.0,
"step": 8780
},
{
"entropy": 1.6126029729843139,
"epoch": 5.26032315978456,
"grad_norm": 0.138671875,
"learning_rate": 7.402752842609216e-06,
"loss": 0.0005,
"mean_token_accuracy": 0.9996389865875244,
"num_tokens": 9285899.0,
"step": 8790
},
{
"entropy": 1.5695497632026671,
"epoch": 5.266307600239378,
"grad_norm": 0.0118408203125,
"learning_rate": 7.342908438061041e-06,
"loss": 0.0006,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 9296579.0,
"step": 8800
},
{
"entropy": 1.5539406895637513,
"epoch": 5.272292040694195,
"grad_norm": 0.0146484375,
"learning_rate": 7.283064033512867e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9307184.0,
"step": 8810
},
{
"entropy": 1.6012183547019958,
"epoch": 5.278276481149012,
"grad_norm": 0.00762939453125,
"learning_rate": 7.223219628964692e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9317825.0,
"step": 8820
},
{
"entropy": 1.521978497505188,
"epoch": 5.28426092160383,
"grad_norm": 0.046875,
"learning_rate": 7.163375224416517e-06,
"loss": 0.0018,
"mean_token_accuracy": 0.9992892503738403,
"num_tokens": 9328583.0,
"step": 8830
},
{
"entropy": 1.5513456106185912,
"epoch": 5.290245362058648,
"grad_norm": 0.11083984375,
"learning_rate": 7.1035308198683425e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9339201.0,
"step": 8840
},
{
"entropy": 1.5179700970649719,
"epoch": 5.296229802513465,
"grad_norm": 0.13671875,
"learning_rate": 7.043686415320168e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9349689.0,
"step": 8850
},
{
"entropy": 1.5577180981636047,
"epoch": 5.302214242968282,
"grad_norm": 0.00531005859375,
"learning_rate": 6.983842010771993e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9360183.0,
"step": 8860
},
{
"entropy": 1.584286642074585,
"epoch": 5.3081986834231,
"grad_norm": 0.0478515625,
"learning_rate": 6.923997606223818e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9370657.0,
"step": 8870
},
{
"entropy": 1.5500143885612487,
"epoch": 5.314183123877918,
"grad_norm": 0.0228271484375,
"learning_rate": 6.864153201675644e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9381207.0,
"step": 8880
},
{
"entropy": 1.563618266582489,
"epoch": 5.320167564332735,
"grad_norm": 0.10595703125,
"learning_rate": 6.804308797127469e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996350347995758,
"num_tokens": 9391788.0,
"step": 8890
},
{
"entropy": 1.5550428748130798,
"epoch": 5.326152004787552,
"grad_norm": 0.0260009765625,
"learning_rate": 6.744464392579294e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9402365.0,
"step": 8900
},
{
"entropy": 1.5502114057540894,
"epoch": 5.332136445242369,
"grad_norm": 0.039794921875,
"learning_rate": 6.6846199880311196e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9412918.0,
"step": 8910
},
{
"entropy": 1.4951329708099366,
"epoch": 5.338120885697188,
"grad_norm": 0.296875,
"learning_rate": 6.624775583482945e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 9423435.0,
"step": 8920
},
{
"entropy": 1.556550133228302,
"epoch": 5.344105326152005,
"grad_norm": 0.0252685546875,
"learning_rate": 6.56493117893477e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9433957.0,
"step": 8930
},
{
"entropy": 1.532106137275696,
"epoch": 5.350089766606822,
"grad_norm": 0.06298828125,
"learning_rate": 6.5050867743865954e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9444403.0,
"step": 8940
},
{
"entropy": 1.5472781896591186,
"epoch": 5.356074207061639,
"grad_norm": 0.07568359375,
"learning_rate": 6.445242369838421e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 9454926.0,
"step": 8950
},
{
"entropy": 1.6014922738075257,
"epoch": 5.3620586475164576,
"grad_norm": 0.01348876953125,
"learning_rate": 6.385397965290246e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9465381.0,
"step": 8960
},
{
"entropy": 1.5169659018516541,
"epoch": 5.368043087971275,
"grad_norm": 0.62109375,
"learning_rate": 6.325553560742071e-06,
"loss": 0.0013,
"mean_token_accuracy": 0.9996610164642334,
"num_tokens": 9475976.0,
"step": 8970
},
{
"entropy": 1.547071349620819,
"epoch": 5.374027528426092,
"grad_norm": 0.08447265625,
"learning_rate": 6.265709156193896e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9486574.0,
"step": 8980
},
{
"entropy": 1.5052280187606812,
"epoch": 5.380011968880909,
"grad_norm": 0.0137939453125,
"learning_rate": 6.205864751645721e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9496974.0,
"step": 8990
},
{
"entropy": 1.5267866492271422,
"epoch": 5.385996409335727,
"grad_norm": 0.0142822265625,
"learning_rate": 6.146020347097546e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9507510.0,
"step": 9000
},
{
"entropy": 1.5410394072532654,
"epoch": 5.391980849790545,
"grad_norm": 0.330078125,
"learning_rate": 6.086175942549372e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 9518127.0,
"step": 9010
},
{
"entropy": 1.565923023223877,
"epoch": 5.397965290245362,
"grad_norm": 0.020751953125,
"learning_rate": 6.026331538001197e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9528583.0,
"step": 9020
},
{
"entropy": 1.5626705288887024,
"epoch": 5.403949730700179,
"grad_norm": 0.11669921875,
"learning_rate": 5.966487133453022e-06,
"loss": 0.0006,
"mean_token_accuracy": 0.9996240615844727,
"num_tokens": 9539067.0,
"step": 9030
},
{
"entropy": 1.5308679223060608,
"epoch": 5.409934171154997,
"grad_norm": 0.0498046875,
"learning_rate": 5.9066427289048475e-06,
"loss": 0.0005,
"mean_token_accuracy": 0.9996428549289703,
"num_tokens": 9549380.0,
"step": 9040
},
{
"entropy": 1.5556403517723083,
"epoch": 5.415918611609815,
"grad_norm": 0.171875,
"learning_rate": 5.846798324356673e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9560020.0,
"step": 9050
},
{
"entropy": 1.497513198852539,
"epoch": 5.421903052064632,
"grad_norm": 0.0247802734375,
"learning_rate": 5.786953919808498e-06,
"loss": 0.0014,
"mean_token_accuracy": 0.9996441304683685,
"num_tokens": 9570282.0,
"step": 9060
},
{
"entropy": 1.557295060157776,
"epoch": 5.427887492519449,
"grad_norm": 0.05810546875,
"learning_rate": 5.727109515260323e-06,
"loss": 0.0006,
"mean_token_accuracy": 0.9996282517910003,
"num_tokens": 9580781.0,
"step": 9070
},
{
"entropy": 1.5526091694831847,
"epoch": 5.433871932974267,
"grad_norm": 0.08203125,
"learning_rate": 5.667265110712149e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9591317.0,
"step": 9080
},
{
"entropy": 1.5796686291694642,
"epoch": 5.439856373429085,
"grad_norm": 0.0162353515625,
"learning_rate": 5.607420706163974e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 9601918.0,
"step": 9090
},
{
"entropy": 1.5661948442459106,
"epoch": 5.445840813883902,
"grad_norm": 0.0118408203125,
"learning_rate": 5.547576301615799e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 9612550.0,
"step": 9100
},
{
"entropy": 1.5428655982017516,
"epoch": 5.451825254338719,
"grad_norm": 0.083984375,
"learning_rate": 5.4877318970676245e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9623018.0,
"step": 9110
},
{
"entropy": 1.5452999949455262,
"epoch": 5.457809694793537,
"grad_norm": 0.0240478515625,
"learning_rate": 5.42788749251945e-06,
"loss": 0.0011,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 9633720.0,
"step": 9120
},
{
"entropy": 1.6031384706497191,
"epoch": 5.463794135248354,
"grad_norm": 0.2314453125,
"learning_rate": 5.368043087971274e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9644450.0,
"step": 9130
},
{
"entropy": 1.6057791471481324,
"epoch": 5.469778575703172,
"grad_norm": 0.020263671875,
"learning_rate": 5.3081986834230996e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 9655007.0,
"step": 9140
},
{
"entropy": 1.5202216625213623,
"epoch": 5.475763016157989,
"grad_norm": 0.0181884765625,
"learning_rate": 5.248354278874925e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9665606.0,
"step": 9150
},
{
"entropy": 1.5834308981895446,
"epoch": 5.481747456612807,
"grad_norm": 0.07080078125,
"learning_rate": 5.18850987432675e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 9676299.0,
"step": 9160
},
{
"entropy": 1.5219863414764405,
"epoch": 5.487731897067624,
"grad_norm": 0.007080078125,
"learning_rate": 5.1286654697785754e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9686557.0,
"step": 9170
},
{
"entropy": 1.5743539810180665,
"epoch": 5.493716337522442,
"grad_norm": 0.042236328125,
"learning_rate": 5.068821065230401e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9697132.0,
"step": 9180
},
{
"entropy": 1.5550188302993775,
"epoch": 5.499700777977259,
"grad_norm": 0.00360107421875,
"learning_rate": 5.008976660682226e-06,
"loss": 0.0013,
"mean_token_accuracy": 0.9996491253376008,
"num_tokens": 9707746.0,
"step": 9190
},
{
"entropy": 1.5604373812675476,
"epoch": 5.505685218432077,
"grad_norm": 0.01019287109375,
"learning_rate": 4.949132256134051e-06,
"loss": 0.0008,
"mean_token_accuracy": 1.0,
"num_tokens": 9718292.0,
"step": 9200
},
{
"entropy": 1.540794813632965,
"epoch": 5.511669658886894,
"grad_norm": 0.052001953125,
"learning_rate": 4.889287851585877e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9728807.0,
"step": 9210
},
{
"entropy": 1.5922885179519652,
"epoch": 5.517654099341712,
"grad_norm": 0.0189208984375,
"learning_rate": 4.829443447037702e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9739308.0,
"step": 9220
},
{
"entropy": 1.5752901792526246,
"epoch": 5.523638539796529,
"grad_norm": 0.0625,
"learning_rate": 4.769599042489527e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 9749922.0,
"step": 9230
},
{
"entropy": 1.5295986771583556,
"epoch": 5.529622980251347,
"grad_norm": 0.01324462890625,
"learning_rate": 4.7097546379413525e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9760377.0,
"step": 9240
},
{
"entropy": 1.5306970953941346,
"epoch": 5.535607420706164,
"grad_norm": 0.005706787109375,
"learning_rate": 4.649910233393178e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9770647.0,
"step": 9250
},
{
"entropy": 1.5370991468429565,
"epoch": 5.541591861160981,
"grad_norm": 0.453125,
"learning_rate": 4.590065828845003e-06,
"loss": 0.0011,
"mean_token_accuracy": 0.9992578208446503,
"num_tokens": 9781114.0,
"step": 9260
},
{
"entropy": 1.5558555841445922,
"epoch": 5.547576301615799,
"grad_norm": 0.072265625,
"learning_rate": 4.530221424296828e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9791718.0,
"step": 9270
},
{
"entropy": 1.5621288776397706,
"epoch": 5.553560742070617,
"grad_norm": 0.027099609375,
"learning_rate": 4.470377019748653e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9802302.0,
"step": 9280
},
{
"entropy": 1.5378905177116393,
"epoch": 5.559545182525434,
"grad_norm": 0.021240234375,
"learning_rate": 4.410532615200479e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996527791023254,
"num_tokens": 9812976.0,
"step": 9290
},
{
"entropy": 1.540464496612549,
"epoch": 5.565529622980251,
"grad_norm": 0.0177001953125,
"learning_rate": 4.350688210652304e-06,
"loss": 0.0005,
"mean_token_accuracy": 0.9996363639831543,
"num_tokens": 9823503.0,
"step": 9300
},
{
"entropy": 1.5626393675804138,
"epoch": 5.571514063435069,
"grad_norm": 0.125,
"learning_rate": 4.2908438061041295e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9833941.0,
"step": 9310
},
{
"entropy": 1.5677518129348755,
"epoch": 5.5774985038898865,
"grad_norm": 0.0262451171875,
"learning_rate": 4.230999401555955e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9844468.0,
"step": 9320
},
{
"entropy": 1.5441372871398926,
"epoch": 5.583482944344704,
"grad_norm": 0.04638671875,
"learning_rate": 4.17115499700778e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9855138.0,
"step": 9330
},
{
"entropy": 1.5927303791046143,
"epoch": 5.589467384799521,
"grad_norm": 0.00994873046875,
"learning_rate": 4.111310592459605e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9865684.0,
"step": 9340
},
{
"entropy": 1.525864827632904,
"epoch": 5.595451825254338,
"grad_norm": 0.0301513671875,
"learning_rate": 4.051466187911431e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 9875999.0,
"step": 9350
},
{
"entropy": 1.6044833660125732,
"epoch": 5.6014362657091565,
"grad_norm": 0.05419921875,
"learning_rate": 3.991621783363256e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9886567.0,
"step": 9360
},
{
"entropy": 1.5554206848144532,
"epoch": 5.607420706163974,
"grad_norm": 0.0322265625,
"learning_rate": 3.931777378815081e-06,
"loss": 0.0004,
"mean_token_accuracy": 0.9996336996555328,
"num_tokens": 9897281.0,
"step": 9370
},
{
"entropy": 1.533079206943512,
"epoch": 5.613405146618791,
"grad_norm": 0.042236328125,
"learning_rate": 3.8719329742669066e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9907747.0,
"step": 9380
},
{
"entropy": 1.588419759273529,
"epoch": 5.619389587073608,
"grad_norm": 0.037109375,
"learning_rate": 3.8120885697187314e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996710538864135,
"num_tokens": 9918369.0,
"step": 9390
},
{
"entropy": 1.5578839302062988,
"epoch": 5.6253740275284265,
"grad_norm": 0.03369140625,
"learning_rate": 3.7522441651705567e-06,
"loss": 0.0013,
"mean_token_accuracy": 0.9996587038040161,
"num_tokens": 9929021.0,
"step": 9400
},
{
"entropy": 1.5441142320632935,
"epoch": 5.631358467983244,
"grad_norm": 0.047119140625,
"learning_rate": 3.692399760622382e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 9939623.0,
"step": 9410
},
{
"entropy": 1.5249134063720704,
"epoch": 5.637342908438061,
"grad_norm": 0.0037689208984375,
"learning_rate": 3.6325553560742073e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9950139.0,
"step": 9420
},
{
"entropy": 1.5780072450637816,
"epoch": 5.643327348892878,
"grad_norm": 0.036376953125,
"learning_rate": 3.5727109515260326e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 9960664.0,
"step": 9430
},
{
"entropy": 1.5765936493873596,
"epoch": 5.649311789347696,
"grad_norm": 0.035888671875,
"learning_rate": 3.512866546977858e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 9971332.0,
"step": 9440
},
{
"entropy": 1.6016755819320678,
"epoch": 5.655296229802514,
"grad_norm": 0.447265625,
"learning_rate": 3.453022142429683e-06,
"loss": 0.0005,
"mean_token_accuracy": 0.9996254682540894,
"num_tokens": 9981877.0,
"step": 9450
},
{
"entropy": 1.5063655138015748,
"epoch": 5.661280670257331,
"grad_norm": 0.103515625,
"learning_rate": 3.3931777378815085e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 9992375.0,
"step": 9460
},
{
"entropy": 1.58121200799942,
"epoch": 5.667265110712148,
"grad_norm": 0.01513671875,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 10003041.0,
"step": 9470
},
{
"entropy": 1.6163565397262574,
"epoch": 5.673249551166966,
"grad_norm": 0.283203125,
"learning_rate": 3.2734889287851586e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10013816.0,
"step": 9480
},
{
"entropy": 1.5436020016670227,
"epoch": 5.679233991621784,
"grad_norm": 0.0167236328125,
"learning_rate": 3.213644524236984e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10024267.0,
"step": 9490
},
{
"entropy": 1.528675389289856,
"epoch": 5.685218432076601,
"grad_norm": 0.1953125,
"learning_rate": 3.153800119688809e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996453881263733,
"num_tokens": 10034645.0,
"step": 9500
},
{
"entropy": 1.5461339116096497,
"epoch": 5.691202872531418,
"grad_norm": 0.022216796875,
"learning_rate": 3.0939557151406345e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10045144.0,
"step": 9510
},
{
"entropy": 1.5538923740386963,
"epoch": 5.697187312986236,
"grad_norm": 0.04296875,
"learning_rate": 3.0341113105924598e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 10055781.0,
"step": 9520
},
{
"entropy": 1.56598619222641,
"epoch": 5.703171753441053,
"grad_norm": 0.0174560546875,
"learning_rate": 2.974266906044285e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 10066213.0,
"step": 9530
},
{
"entropy": 1.5303544521331787,
"epoch": 5.709156193895871,
"grad_norm": 0.10986328125,
"learning_rate": 2.9144225014961104e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10076706.0,
"step": 9540
},
{
"entropy": 1.5925257802009583,
"epoch": 5.715140634350688,
"grad_norm": 0.037109375,
"learning_rate": 2.8545780969479352e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10087369.0,
"step": 9550
},
{
"entropy": 1.607127583026886,
"epoch": 5.721125074805506,
"grad_norm": 0.0830078125,
"learning_rate": 2.7947336923997605e-06,
"loss": 0.0007,
"mean_token_accuracy": 0.9996183216571808,
"num_tokens": 10097967.0,
"step": 9560
},
{
"entropy": 1.5356394171714782,
"epoch": 5.727109515260323,
"grad_norm": 0.01226806640625,
"learning_rate": 2.734889287851586e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10108599.0,
"step": 9570
},
{
"entropy": 1.6435596108436585,
"epoch": 5.733093955715141,
"grad_norm": 0.00494384765625,
"learning_rate": 2.675044883303411e-06,
"loss": 0.0008,
"mean_token_accuracy": 0.999622642993927,
"num_tokens": 10119292.0,
"step": 9580
},
{
"entropy": 1.634545588493347,
"epoch": 5.739078396169958,
"grad_norm": 0.0517578125,
"learning_rate": 2.6152004787552364e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10129944.0,
"step": 9590
},
{
"entropy": 1.569711184501648,
"epoch": 5.745062836624776,
"grad_norm": 0.01123046875,
"learning_rate": 2.5553560742070617e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10140569.0,
"step": 9600
},
{
"entropy": 1.5555992722511292,
"epoch": 5.751047277079593,
"grad_norm": 0.0023956298828125,
"learning_rate": 2.495511669658887e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10151072.0,
"step": 9610
},
{
"entropy": 1.5273286819458007,
"epoch": 5.75703171753441,
"grad_norm": 0.029541015625,
"learning_rate": 2.435667265110712e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10161447.0,
"step": 9620
},
{
"entropy": 1.5150355458259583,
"epoch": 5.763016157989228,
"grad_norm": 0.53515625,
"learning_rate": 2.375822860562537e-06,
"loss": 0.0013,
"mean_token_accuracy": 0.9992917656898499,
"num_tokens": 10171906.0,
"step": 9630
},
{
"entropy": 1.6199348092079162,
"epoch": 5.769000598444046,
"grad_norm": 0.0810546875,
"learning_rate": 2.315978456014363e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10182715.0,
"step": 9640
},
{
"entropy": 1.541762149333954,
"epoch": 5.774985038898863,
"grad_norm": 0.26171875,
"learning_rate": 2.256134051466188e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 10193368.0,
"step": 9650
},
{
"entropy": 1.5536392092704774,
"epoch": 5.78096947935368,
"grad_norm": 0.051513671875,
"learning_rate": 2.1962896469180134e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10203739.0,
"step": 9660
},
{
"entropy": 1.5519014716148376,
"epoch": 5.786953919808498,
"grad_norm": 0.0152587890625,
"learning_rate": 2.1364452423698387e-06,
"loss": 0.0013,
"mean_token_accuracy": 0.9996515691280365,
"num_tokens": 10214252.0,
"step": 9670
},
{
"entropy": 1.553311824798584,
"epoch": 5.7929383602633155,
"grad_norm": 0.0247802734375,
"learning_rate": 2.076600837821664e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10224950.0,
"step": 9680
},
{
"entropy": 1.5511303305625916,
"epoch": 5.798922800718133,
"grad_norm": 0.0166015625,
"learning_rate": 2.0167564332734893e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 10235495.0,
"step": 9690
},
{
"entropy": 1.5440836668014526,
"epoch": 5.80490724117295,
"grad_norm": 0.005218505859375,
"learning_rate": 1.956912028725314e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10246127.0,
"step": 9700
},
{
"entropy": 1.5713715076446533,
"epoch": 5.810891681627767,
"grad_norm": 0.025390625,
"learning_rate": 1.8970676241771395e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10256733.0,
"step": 9710
},
{
"entropy": 1.5970672011375426,
"epoch": 5.8168761220825855,
"grad_norm": 0.271484375,
"learning_rate": 1.8372232196289648e-06,
"loss": 0.0008,
"mean_token_accuracy": 0.9996240615844727,
"num_tokens": 10267236.0,
"step": 9720
},
{
"entropy": 1.556793713569641,
"epoch": 5.822860562537403,
"grad_norm": 0.029296875,
"learning_rate": 1.77737881508079e-06,
"loss": 0.0005,
"mean_token_accuracy": 1.0,
"num_tokens": 10277749.0,
"step": 9730
},
{
"entropy": 1.6262453079223633,
"epoch": 5.82884500299222,
"grad_norm": 0.010986328125,
"learning_rate": 1.7175344105326153e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10288379.0,
"step": 9740
},
{
"entropy": 1.495825183391571,
"epoch": 5.834829443447037,
"grad_norm": 0.02978515625,
"learning_rate": 1.6576900059844404e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10298897.0,
"step": 9750
},
{
"entropy": 1.6074238777160645,
"epoch": 5.8408138839018555,
"grad_norm": 0.05615234375,
"learning_rate": 1.5978456014362657e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 10309496.0,
"step": 9760
},
{
"entropy": 1.5544676423072814,
"epoch": 5.846798324356673,
"grad_norm": 0.0030975341796875,
"learning_rate": 1.538001196888091e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10320081.0,
"step": 9770
},
{
"entropy": 1.580102515220642,
"epoch": 5.85278276481149,
"grad_norm": 0.2236328125,
"learning_rate": 1.4781567923399163e-06,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 10330768.0,
"step": 9780
},
{
"entropy": 1.5838265538215637,
"epoch": 5.858767205266307,
"grad_norm": 0.09423828125,
"learning_rate": 1.4183123877917414e-06,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10341607.0,
"step": 9790
},
{
"entropy": 1.5759095907211305,
"epoch": 5.864751645721125,
"grad_norm": 0.037841796875,
"learning_rate": 1.3584679832435667e-06,
"loss": 0.001,
"mean_token_accuracy": 0.9996323525905609,
"num_tokens": 10352215.0,
"step": 9800
},
{
"entropy": 1.5455079555511475,
"epoch": 5.870736086175943,
"grad_norm": 0.053466796875,
"learning_rate": 1.2986235786953922e-06,
"loss": 0.0015,
"mean_token_accuracy": 0.9996453881263733,
"num_tokens": 10362808.0,
"step": 9810
},
{
"entropy": 1.5143304467201233,
"epoch": 5.87672052663076,
"grad_norm": 0.09228515625,
"learning_rate": 1.2387791741472175e-06,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10373485.0,
"step": 9820
},
{
"entropy": 1.559544062614441,
"epoch": 5.882704967085577,
"grad_norm": 0.016845703125,
"learning_rate": 1.1789347695990425e-06,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 10384069.0,
"step": 9830
},
{
"entropy": 1.5978240847587586,
"epoch": 5.888689407540395,
"grad_norm": 0.007781982421875,
"learning_rate": 1.1190903650508678e-06,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10394680.0,
"step": 9840
},
{
"entropy": 1.5734472513198852,
"epoch": 5.894673847995213,
"grad_norm": 0.00860595703125,
"learning_rate": 1.0592459605026931e-06,
"loss": 0.0008,
"mean_token_accuracy": 0.9996575355529785,
"num_tokens": 10405414.0,
"step": 9850
},
{
"entropy": 1.5193457126617431,
"epoch": 5.90065828845003,
"grad_norm": 0.1552734375,
"learning_rate": 9.994015559545182e-07,
"loss": 0.0002,
"mean_token_accuracy": 1.0,
"num_tokens": 10415914.0,
"step": 9860
},
{
"entropy": 1.6170666337013244,
"epoch": 5.906642728904847,
"grad_norm": 0.08740234375,
"learning_rate": 9.395571514063435e-07,
"loss": 0.0012,
"mean_token_accuracy": 0.9993006944656372,
"num_tokens": 10426593.0,
"step": 9870
},
{
"entropy": 1.579539179801941,
"epoch": 5.912627169359665,
"grad_norm": 0.5703125,
"learning_rate": 8.797127468581688e-07,
"loss": 0.0014,
"mean_token_accuracy": 0.9996621608734131,
"num_tokens": 10437212.0,
"step": 9880
},
{
"entropy": 1.6408529162406922,
"epoch": 5.918611609814482,
"grad_norm": 0.021728515625,
"learning_rate": 8.19868342309994e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 10447988.0,
"step": 9890
},
{
"entropy": 1.5982860207557679,
"epoch": 5.9245960502693,
"grad_norm": 0.00531005859375,
"learning_rate": 7.600239377618194e-07,
"loss": 0.0006,
"mean_token_accuracy": 0.9996666669845581,
"num_tokens": 10458522.0,
"step": 9900
},
{
"entropy": 1.5749751448631286,
"epoch": 5.930580490724117,
"grad_norm": 0.00927734375,
"learning_rate": 7.001795332136445e-07,
"loss": 0.0007,
"mean_token_accuracy": 1.0,
"num_tokens": 10468943.0,
"step": 9910
},
{
"entropy": 1.5508565783500672,
"epoch": 5.936564931178935,
"grad_norm": 0.010986328125,
"learning_rate": 6.403351286654698e-07,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10479652.0,
"step": 9920
},
{
"entropy": 1.546996283531189,
"epoch": 5.942549371633753,
"grad_norm": 0.1669921875,
"learning_rate": 5.80490724117295e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10490439.0,
"step": 9930
},
{
"entropy": 1.5694025874137878,
"epoch": 5.94853381208857,
"grad_norm": 0.0123291015625,
"learning_rate": 5.206463195691203e-07,
"loss": 0.0008,
"mean_token_accuracy": 0.9996415793895721,
"num_tokens": 10501036.0,
"step": 9940
},
{
"entropy": 1.52097749710083,
"epoch": 5.954518252543387,
"grad_norm": 0.009521484375,
"learning_rate": 4.6080191502094555e-07,
"loss": 0.0003,
"mean_token_accuracy": 1.0,
"num_tokens": 10511443.0,
"step": 9950
},
{
"entropy": 1.5758405923843384,
"epoch": 5.960502692998205,
"grad_norm": 0.00811767578125,
"learning_rate": 4.009575104727708e-07,
"loss": 0.0007,
"mean_token_accuracy": 0.9996402859687805,
"num_tokens": 10521977.0,
"step": 9960
},
{
"entropy": 1.5352561831474305,
"epoch": 5.966487133453022,
"grad_norm": 0.0225830078125,
"learning_rate": 3.411131059245961e-07,
"loss": 0.0006,
"mean_token_accuracy": 0.9996710538864135,
"num_tokens": 10532647.0,
"step": 9970
},
{
"entropy": 1.564480447769165,
"epoch": 5.97247157390784,
"grad_norm": 0.0140380859375,
"learning_rate": 2.812687013764213e-07,
"loss": 0.0012,
"mean_token_accuracy": 0.9996138989925385,
"num_tokens": 10543219.0,
"step": 9980
},
{
"entropy": 1.5791572093963624,
"epoch": 5.978456014362657,
"grad_norm": 0.0087890625,
"learning_rate": 2.2142429682824658e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10553793.0,
"step": 9990
},
{
"entropy": 1.5388386368751525,
"epoch": 5.9844404548174746,
"grad_norm": 0.04833984375,
"learning_rate": 1.6157989228007181e-07,
"loss": 0.0004,
"mean_token_accuracy": 1.0,
"num_tokens": 10564325.0,
"step": 10000
},
{
"entropy": 1.4956823945045472,
"epoch": 5.990424895272292,
"grad_norm": 0.154296875,
"learning_rate": 1.0173548773189707e-07,
"loss": 0.0006,
"mean_token_accuracy": 1.0,
"num_tokens": 10574730.0,
"step": 10010
},
{
"entropy": 1.491278338432312,
"epoch": 5.99640933572711,
"grad_norm": 0.0146484375,
"learning_rate": 4.1891083183722324e-08,
"loss": 0.0001,
"mean_token_accuracy": 1.0,
"num_tokens": 10585227.0,
"step": 10020
}
],
"logging_steps": 10,
"max_steps": 10026,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.72830388146217e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}