Test-Repo / trainer_state.json
MRBSTUDIO's picture
sft-6900-step
ab2fbfe verified
{
"best_global_step": 6800,
"best_metric": 1.1395292282104492,
"best_model_checkpoint": "/workspace/project_2026_1/checkpoints/sft/checkpoint-6800",
"epoch": 2.030008826125331,
"eval_steps": 100,
"global_step": 6900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1349310517311095,
"epoch": 0.0029420417769932335,
"grad_norm": 5.712224006652832,
"learning_rate": 3.5294117647058825e-06,
"loss": 3.9359054565429688,
"mean_token_accuracy": 0.45907036662101747,
"num_tokens": 40265.0,
"step": 10
},
{
"entropy": 1.1904624342918395,
"epoch": 0.005884083553986467,
"grad_norm": 3.6260242462158203,
"learning_rate": 7.450980392156863e-06,
"loss": 3.447366714477539,
"mean_token_accuracy": 0.4740333199501038,
"num_tokens": 80768.0,
"step": 20
},
{
"entropy": 1.641614854335785,
"epoch": 0.0088261253309797,
"grad_norm": 1.758035659790039,
"learning_rate": 1.1372549019607843e-05,
"loss": 2.9342424392700197,
"mean_token_accuracy": 0.4756320804357529,
"num_tokens": 121287.0,
"step": 30
},
{
"entropy": 2.0289795875549315,
"epoch": 0.011768167107972934,
"grad_norm": 0.8422009348869324,
"learning_rate": 1.5294117647058826e-05,
"loss": 2.4612199783325197,
"mean_token_accuracy": 0.5210394144058228,
"num_tokens": 161772.0,
"step": 40
},
{
"entropy": 2.1589465618133543,
"epoch": 0.014710208884966167,
"grad_norm": 0.6909259557723999,
"learning_rate": 1.9215686274509807e-05,
"loss": 2.1433944702148438,
"mean_token_accuracy": 0.5696752369403839,
"num_tokens": 202309.0,
"step": 50
},
{
"entropy": 1.932080829143524,
"epoch": 0.0176522506619594,
"grad_norm": 0.7911986112594604,
"learning_rate": 2.3137254901960788e-05,
"loss": 1.8811990737915039,
"mean_token_accuracy": 0.6150015771389008,
"num_tokens": 242490.0,
"step": 60
},
{
"entropy": 1.6605080842971802,
"epoch": 0.020594292438952635,
"grad_norm": 0.790881872177124,
"learning_rate": 2.7058823529411766e-05,
"loss": 1.6363672256469726,
"mean_token_accuracy": 0.6599553287029266,
"num_tokens": 282845.0,
"step": 70
},
{
"entropy": 1.5992008209228517,
"epoch": 0.023536334215945868,
"grad_norm": 0.4468841850757599,
"learning_rate": 3.098039215686275e-05,
"loss": 1.6219806671142578,
"mean_token_accuracy": 0.6577221751213074,
"num_tokens": 323245.0,
"step": 80
},
{
"entropy": 1.5317162871360779,
"epoch": 0.0264783759929391,
"grad_norm": 0.49570420384407043,
"learning_rate": 3.4901960784313725e-05,
"loss": 1.5623438835144043,
"mean_token_accuracy": 0.6620652556419373,
"num_tokens": 363709.0,
"step": 90
},
{
"entropy": 1.5458295822143555,
"epoch": 0.029420417769932334,
"grad_norm": 0.4536747634410858,
"learning_rate": 3.882352941176471e-05,
"loss": 1.577082633972168,
"mean_token_accuracy": 0.6642140686511994,
"num_tokens": 404093.0,
"step": 100
},
{
"epoch": 0.029420417769932334,
"eval_entropy": 1.519468894467892,
"eval_loss": 1.5212204456329346,
"eval_mean_token_accuracy": 0.6755449191478919,
"eval_num_tokens": 404093.0,
"eval_runtime": 116.8522,
"eval_samples_per_second": 26.059,
"eval_steps_per_second": 3.261,
"step": 100
},
{
"entropy": 1.5117838263511658,
"epoch": 0.032362459546925564,
"grad_norm": 0.4160189628601074,
"learning_rate": 4.274509803921569e-05,
"loss": 1.5348424911499023,
"mean_token_accuracy": 0.6751707136631012,
"num_tokens": 444665.0,
"step": 110
},
{
"entropy": 1.4816083550453185,
"epoch": 0.0353045013239188,
"grad_norm": 0.4050695300102234,
"learning_rate": 4.666666666666667e-05,
"loss": 1.5027993202209473,
"mean_token_accuracy": 0.6793931305408478,
"num_tokens": 485247.0,
"step": 120
},
{
"entropy": 1.5376260161399842,
"epoch": 0.03824654310091203,
"grad_norm": 0.4373331367969513,
"learning_rate": 5.058823529411765e-05,
"loss": 1.5507566452026367,
"mean_token_accuracy": 0.6680718421936035,
"num_tokens": 525640.0,
"step": 130
},
{
"entropy": 1.568453598022461,
"epoch": 0.04118858487790527,
"grad_norm": 0.41970527172088623,
"learning_rate": 5.450980392156863e-05,
"loss": 1.5769514083862304,
"mean_token_accuracy": 0.6546712338924408,
"num_tokens": 565941.0,
"step": 140
},
{
"entropy": 1.4061901330947877,
"epoch": 0.0441306266548985,
"grad_norm": 0.43462952971458435,
"learning_rate": 5.843137254901961e-05,
"loss": 1.4184735298156739,
"mean_token_accuracy": 0.6973597705364227,
"num_tokens": 606491.0,
"step": 150
},
{
"entropy": 1.4754432439804077,
"epoch": 0.047072668431891736,
"grad_norm": 0.42188599705696106,
"learning_rate": 6.23529411764706e-05,
"loss": 1.5123021125793457,
"mean_token_accuracy": 0.6780355155467988,
"num_tokens": 646949.0,
"step": 160
},
{
"entropy": 1.4351889610290527,
"epoch": 0.05001471020888497,
"grad_norm": 0.4059581458568573,
"learning_rate": 6.627450980392157e-05,
"loss": 1.4382680892944335,
"mean_token_accuracy": 0.6908938407897949,
"num_tokens": 687299.0,
"step": 170
},
{
"entropy": 1.4386033058166503,
"epoch": 0.0529567519858782,
"grad_norm": 0.4081089198589325,
"learning_rate": 7.019607843137255e-05,
"loss": 1.4630658149719238,
"mean_token_accuracy": 0.6837093889713287,
"num_tokens": 727817.0,
"step": 180
},
{
"entropy": 1.4710845828056336,
"epoch": 0.055898793762871435,
"grad_norm": 0.4348011612892151,
"learning_rate": 7.411764705882354e-05,
"loss": 1.483832550048828,
"mean_token_accuracy": 0.6794860780239105,
"num_tokens": 767982.0,
"step": 190
},
{
"entropy": 1.4718186855316162,
"epoch": 0.05884083553986467,
"grad_norm": 0.3977382779121399,
"learning_rate": 7.803921568627451e-05,
"loss": 1.4843897819519043,
"mean_token_accuracy": 0.6810027420520782,
"num_tokens": 808206.0,
"step": 200
},
{
"epoch": 0.05884083553986467,
"eval_entropy": 1.4348028574402876,
"eval_loss": 1.4701354503631592,
"eval_mean_token_accuracy": 0.6813244935408664,
"eval_num_tokens": 808206.0,
"eval_runtime": 116.9082,
"eval_samples_per_second": 26.046,
"eval_steps_per_second": 3.259,
"step": 200
},
{
"entropy": 1.4585287928581239,
"epoch": 0.0617828773168579,
"grad_norm": 0.41666489839553833,
"learning_rate": 8.196078431372549e-05,
"loss": 1.4701197624206543,
"mean_token_accuracy": 0.6891506373882293,
"num_tokens": 848584.0,
"step": 210
},
{
"entropy": 1.404802179336548,
"epoch": 0.06472491909385113,
"grad_norm": 0.4331190288066864,
"learning_rate": 8.588235294117646e-05,
"loss": 1.4301957130432128,
"mean_token_accuracy": 0.6959148466587066,
"num_tokens": 889067.0,
"step": 220
},
{
"entropy": 1.419868004322052,
"epoch": 0.06766696087084437,
"grad_norm": 0.4185863435268402,
"learning_rate": 8.980392156862746e-05,
"loss": 1.4252424240112305,
"mean_token_accuracy": 0.6949863255023956,
"num_tokens": 929411.0,
"step": 230
},
{
"entropy": 1.4240824103355407,
"epoch": 0.0706090026478376,
"grad_norm": 0.43419042229652405,
"learning_rate": 9.372549019607843e-05,
"loss": 1.4376652717590332,
"mean_token_accuracy": 0.6870559632778168,
"num_tokens": 970055.0,
"step": 240
},
{
"entropy": 1.4307694911956788,
"epoch": 0.07355104442483083,
"grad_norm": 0.3987244963645935,
"learning_rate": 9.764705882352942e-05,
"loss": 1.4383943557739258,
"mean_token_accuracy": 0.685227632522583,
"num_tokens": 1010437.0,
"step": 250
},
{
"entropy": 1.50078626871109,
"epoch": 0.07649308620182406,
"grad_norm": 0.3530856668949127,
"learning_rate": 0.00010156862745098039,
"loss": 1.5160396575927735,
"mean_token_accuracy": 0.6708036184310913,
"num_tokens": 1051087.0,
"step": 260
},
{
"entropy": 1.4214369535446167,
"epoch": 0.0794351279788173,
"grad_norm": 0.35215267539024353,
"learning_rate": 0.00010549019607843139,
"loss": 1.4516315460205078,
"mean_token_accuracy": 0.6868359208106994,
"num_tokens": 1091565.0,
"step": 270
},
{
"entropy": 1.4430809259414672,
"epoch": 0.08237716975581054,
"grad_norm": 0.3903290629386902,
"learning_rate": 0.00010941176470588237,
"loss": 1.4475218772888183,
"mean_token_accuracy": 0.686697655916214,
"num_tokens": 1131001.0,
"step": 280
},
{
"entropy": 1.4334508419036864,
"epoch": 0.08531921153280377,
"grad_norm": 0.3959214985370636,
"learning_rate": 0.00011333333333333334,
"loss": 1.4455389976501465,
"mean_token_accuracy": 0.6854739308357238,
"num_tokens": 1171541.0,
"step": 290
},
{
"entropy": 1.5002384662628174,
"epoch": 0.088261253309797,
"grad_norm": 0.3723829984664917,
"learning_rate": 0.00011725490196078433,
"loss": 1.5168241500854491,
"mean_token_accuracy": 0.66768679022789,
"num_tokens": 1211498.0,
"step": 300
},
{
"epoch": 0.088261253309797,
"eval_entropy": 1.4693221822185467,
"eval_loss": 1.448572039604187,
"eval_mean_token_accuracy": 0.6847088332251301,
"eval_num_tokens": 1211498.0,
"eval_runtime": 116.8305,
"eval_samples_per_second": 26.063,
"eval_steps_per_second": 3.261,
"step": 300
},
{
"entropy": 1.3928457140922545,
"epoch": 0.09120329508679023,
"grad_norm": 0.4283956289291382,
"learning_rate": 0.0001211764705882353,
"loss": 1.4159896850585938,
"mean_token_accuracy": 0.6940667152404785,
"num_tokens": 1251562.0,
"step": 310
},
{
"entropy": 1.3655309796333313,
"epoch": 0.09414533686378347,
"grad_norm": 0.34452083706855774,
"learning_rate": 0.00012509803921568628,
"loss": 1.3777886390686036,
"mean_token_accuracy": 0.7042169332504272,
"num_tokens": 1292029.0,
"step": 320
},
{
"entropy": 1.4647573351860046,
"epoch": 0.0970873786407767,
"grad_norm": 0.3209940493106842,
"learning_rate": 0.00012901960784313728,
"loss": 1.4774354934692382,
"mean_token_accuracy": 0.6791389882564545,
"num_tokens": 1332530.0,
"step": 330
},
{
"entropy": 1.4148489713668824,
"epoch": 0.10002942041776994,
"grad_norm": 0.4031716585159302,
"learning_rate": 0.00013294117647058822,
"loss": 1.4249235153198243,
"mean_token_accuracy": 0.692065304517746,
"num_tokens": 1372940.0,
"step": 340
},
{
"entropy": 1.3905927181243896,
"epoch": 0.10297146219476316,
"grad_norm": 0.40045231580734253,
"learning_rate": 0.00013686274509803922,
"loss": 1.4047340393066405,
"mean_token_accuracy": 0.6952064216136933,
"num_tokens": 1413313.0,
"step": 350
},
{
"entropy": 1.393438732624054,
"epoch": 0.1059135039717564,
"grad_norm": 0.3352883458137512,
"learning_rate": 0.00014078431372549022,
"loss": 1.4050509452819824,
"mean_token_accuracy": 0.6900130629539489,
"num_tokens": 1453217.0,
"step": 360
},
{
"entropy": 1.3930254817008971,
"epoch": 0.10885554574874963,
"grad_norm": 0.31767141819000244,
"learning_rate": 0.0001447058823529412,
"loss": 1.3919607162475587,
"mean_token_accuracy": 0.6927550494670868,
"num_tokens": 1493584.0,
"step": 370
},
{
"entropy": 1.3573615312576295,
"epoch": 0.11179758752574287,
"grad_norm": 0.32946810126304626,
"learning_rate": 0.00014862745098039216,
"loss": 1.3821157455444335,
"mean_token_accuracy": 0.6986469984054565,
"num_tokens": 1534100.0,
"step": 380
},
{
"entropy": 1.4580389261245728,
"epoch": 0.1147396293027361,
"grad_norm": 0.3781375288963318,
"learning_rate": 0.00015254901960784313,
"loss": 1.4741416931152345,
"mean_token_accuracy": 0.6809489071369171,
"num_tokens": 1574623.0,
"step": 390
},
{
"entropy": 1.4030352354049682,
"epoch": 0.11768167107972934,
"grad_norm": 0.3613905906677246,
"learning_rate": 0.00015647058823529413,
"loss": 1.4134941101074219,
"mean_token_accuracy": 0.6923583388328552,
"num_tokens": 1615045.0,
"step": 400
},
{
"epoch": 0.11768167107972934,
"eval_entropy": 1.3998811366989856,
"eval_loss": 1.432859182357788,
"eval_mean_token_accuracy": 0.6876164205430999,
"eval_num_tokens": 1615045.0,
"eval_runtime": 116.8804,
"eval_samples_per_second": 26.052,
"eval_steps_per_second": 3.26,
"step": 400
},
{
"entropy": 1.3861081957817079,
"epoch": 0.12062371285672256,
"grad_norm": 0.32785341143608093,
"learning_rate": 0.0001603921568627451,
"loss": 1.4069743156433105,
"mean_token_accuracy": 0.6965118229389191,
"num_tokens": 1655438.0,
"step": 410
},
{
"entropy": 1.3972581624984741,
"epoch": 0.1235657546337158,
"grad_norm": 0.3461436331272125,
"learning_rate": 0.00016431372549019607,
"loss": 1.4036929130554199,
"mean_token_accuracy": 0.6954509198665619,
"num_tokens": 1696005.0,
"step": 420
},
{
"entropy": 1.3920337677001953,
"epoch": 0.12650779641070903,
"grad_norm": 0.330691933631897,
"learning_rate": 0.00016823529411764707,
"loss": 1.4209896087646485,
"mean_token_accuracy": 0.6902368903160095,
"num_tokens": 1736450.0,
"step": 430
},
{
"entropy": 1.447186005115509,
"epoch": 0.12944983818770225,
"grad_norm": 0.3359420895576477,
"learning_rate": 0.00017215686274509807,
"loss": 1.4491106986999511,
"mean_token_accuracy": 0.6825862407684327,
"num_tokens": 1776731.0,
"step": 440
},
{
"entropy": 1.3688698172569276,
"epoch": 0.1323918799646955,
"grad_norm": 0.31416866183280945,
"learning_rate": 0.000176078431372549,
"loss": 1.3883934020996094,
"mean_token_accuracy": 0.6974851131439209,
"num_tokens": 1816994.0,
"step": 450
},
{
"entropy": 1.4649083375930787,
"epoch": 0.13533392174168873,
"grad_norm": 0.3449016511440277,
"learning_rate": 0.00018,
"loss": 1.4666101455688476,
"mean_token_accuracy": 0.6721475541591644,
"num_tokens": 1857622.0,
"step": 460
},
{
"entropy": 1.394101870059967,
"epoch": 0.13827596351868196,
"grad_norm": 0.32241928577423096,
"learning_rate": 0.00018392156862745098,
"loss": 1.4229127883911132,
"mean_token_accuracy": 0.6893933176994324,
"num_tokens": 1898085.0,
"step": 470
},
{
"entropy": 1.4726597905158996,
"epoch": 0.1412180052956752,
"grad_norm": 0.3263033926486969,
"learning_rate": 0.00018784313725490198,
"loss": 1.495950412750244,
"mean_token_accuracy": 0.6736144661903382,
"num_tokens": 1938687.0,
"step": 480
},
{
"entropy": 1.4047853350639343,
"epoch": 0.14416004707266844,
"grad_norm": 0.32949718832969666,
"learning_rate": 0.00019176470588235295,
"loss": 1.4045245170593261,
"mean_token_accuracy": 0.6931259393692016,
"num_tokens": 1978716.0,
"step": 490
},
{
"entropy": 1.4191704154014588,
"epoch": 0.14710208884966167,
"grad_norm": 0.31867218017578125,
"learning_rate": 0.00019568627450980392,
"loss": 1.4353232383728027,
"mean_token_accuracy": 0.68796147108078,
"num_tokens": 2019187.0,
"step": 500
},
{
"epoch": 0.14710208884966167,
"eval_entropy": 1.3842839742582926,
"eval_loss": 1.4221439361572266,
"eval_mean_token_accuracy": 0.6893131504534423,
"eval_num_tokens": 2019187.0,
"eval_runtime": 116.869,
"eval_samples_per_second": 26.055,
"eval_steps_per_second": 3.26,
"step": 500
},
{
"entropy": 1.4995567321777343,
"epoch": 0.1500441306266549,
"grad_norm": 0.32378971576690674,
"learning_rate": 0.00019960784313725492,
"loss": 1.5118988037109375,
"mean_token_accuracy": 0.6660271644592285,
"num_tokens": 2059838.0,
"step": 510
},
{
"entropy": 1.3857444763183593,
"epoch": 0.15298617240364812,
"grad_norm": 0.32856202125549316,
"learning_rate": 0.00019999957403309267,
"loss": 1.3925944328308106,
"mean_token_accuracy": 0.6954998314380646,
"num_tokens": 2099887.0,
"step": 520
},
{
"entropy": 1.444947075843811,
"epoch": 0.15592821418064137,
"grad_norm": 0.34244897961616516,
"learning_rate": 0.00019999810155955347,
"loss": 1.4644223213195802,
"mean_token_accuracy": 0.6775131762027741,
"num_tokens": 2139863.0,
"step": 530
},
{
"entropy": 1.4525998115539551,
"epoch": 0.1588702559576346,
"grad_norm": 0.3342621624469757,
"learning_rate": 0.00019999557733601503,
"loss": 1.4777566909790039,
"mean_token_accuracy": 0.6743626773357392,
"num_tokens": 2180117.0,
"step": 540
},
{
"entropy": 1.3970038414001464,
"epoch": 0.16181229773462782,
"grad_norm": 0.33053645491600037,
"learning_rate": 0.00019999200138902642,
"loss": 1.4096016883850098,
"mean_token_accuracy": 0.6931852400302887,
"num_tokens": 2219829.0,
"step": 550
},
{
"entropy": 1.4066221356391906,
"epoch": 0.16475433951162108,
"grad_norm": 0.33736753463745117,
"learning_rate": 0.00019998737375619835,
"loss": 1.4260540008544922,
"mean_token_accuracy": 0.6908943474292755,
"num_tokens": 2260238.0,
"step": 560
},
{
"entropy": 1.3926284670829774,
"epoch": 0.1676963812886143,
"grad_norm": 0.3630967140197754,
"learning_rate": 0.0001999816944862029,
"loss": 1.408204936981201,
"mean_token_accuracy": 0.6962983906269073,
"num_tokens": 2300497.0,
"step": 570
},
{
"entropy": 1.3583880305290221,
"epoch": 0.17063842306560753,
"grad_norm": 0.3344590365886688,
"learning_rate": 0.0001999749636387729,
"loss": 1.3612471580505372,
"mean_token_accuracy": 0.7032208800315857,
"num_tokens": 2340896.0,
"step": 580
},
{
"entropy": 1.3552134156227111,
"epoch": 0.17358046484260076,
"grad_norm": 0.3601242005825043,
"learning_rate": 0.00019996718128470137,
"loss": 1.378493595123291,
"mean_token_accuracy": 0.6997579276561737,
"num_tokens": 2381277.0,
"step": 590
},
{
"entropy": 1.4107160449028016,
"epoch": 0.176522506619594,
"grad_norm": 0.38008254766464233,
"learning_rate": 0.00019995834750584078,
"loss": 1.4177864074707032,
"mean_token_accuracy": 0.6893383502960205,
"num_tokens": 2421617.0,
"step": 600
},
{
"epoch": 0.176522506619594,
"eval_entropy": 1.3129558118935332,
"eval_loss": 1.4119162559509277,
"eval_mean_token_accuracy": 0.6910557604524408,
"eval_num_tokens": 2421617.0,
"eval_runtime": 117.0182,
"eval_samples_per_second": 26.022,
"eval_steps_per_second": 3.256,
"step": 600
},
{
"entropy": 1.3507792115211488,
"epoch": 0.17946454839658724,
"grad_norm": 0.3318082392215729,
"learning_rate": 0.00019994846239510216,
"loss": 1.3647551536560059,
"mean_token_accuracy": 0.7037387728691101,
"num_tokens": 2462059.0,
"step": 610
},
{
"entropy": 1.395737397670746,
"epoch": 0.18240659017358046,
"grad_norm": 0.34794482588768005,
"learning_rate": 0.00019993752605645417,
"loss": 1.4275907516479491,
"mean_token_accuracy": 0.6850803017616272,
"num_tokens": 2502114.0,
"step": 620
},
{
"entropy": 1.4139577507972718,
"epoch": 0.1853486319505737,
"grad_norm": 0.3424963653087616,
"learning_rate": 0.00019992553860492191,
"loss": 1.4030399322509766,
"mean_token_accuracy": 0.6936035215854645,
"num_tokens": 2542391.0,
"step": 630
},
{
"entropy": 1.3600899696350097,
"epoch": 0.18829067372756694,
"grad_norm": 0.4584237039089203,
"learning_rate": 0.00019991250016658578,
"loss": 1.3970141410827637,
"mean_token_accuracy": 0.6953531980514527,
"num_tokens": 2582892.0,
"step": 640
},
{
"entropy": 1.4006186485290528,
"epoch": 0.19123271550456017,
"grad_norm": 0.3958089053630829,
"learning_rate": 0.00019989841087858019,
"loss": 1.4342127799987794,
"mean_token_accuracy": 0.6836777746677398,
"num_tokens": 2623322.0,
"step": 650
},
{
"entropy": 1.4262026309967042,
"epoch": 0.1941747572815534,
"grad_norm": 0.38685840368270874,
"learning_rate": 0.000199883270889092,
"loss": 1.4162178993225099,
"mean_token_accuracy": 0.6810121357440948,
"num_tokens": 2664008.0,
"step": 660
},
{
"entropy": 1.3658849358558656,
"epoch": 0.19711679905854662,
"grad_norm": 0.37710094451904297,
"learning_rate": 0.00019986708035735914,
"loss": 1.390056037902832,
"mean_token_accuracy": 0.6923020124435425,
"num_tokens": 2704594.0,
"step": 670
},
{
"entropy": 1.363602066040039,
"epoch": 0.20005884083553988,
"grad_norm": 0.32848283648490906,
"learning_rate": 0.0001998498394536687,
"loss": 1.4025785446166992,
"mean_token_accuracy": 0.6898211300373077,
"num_tokens": 2744928.0,
"step": 680
},
{
"entropy": 1.2932772994041444,
"epoch": 0.2030008826125331,
"grad_norm": 0.375255286693573,
"learning_rate": 0.00019983154835935535,
"loss": 1.271113681793213,
"mean_token_accuracy": 0.72461918592453,
"num_tokens": 2785405.0,
"step": 690
},
{
"entropy": 1.31307555437088,
"epoch": 0.20594292438952633,
"grad_norm": 0.3716314733028412,
"learning_rate": 0.0001998122072667993,
"loss": 1.3366676330566407,
"mean_token_accuracy": 0.7082294344902038,
"num_tokens": 2825853.0,
"step": 700
},
{
"epoch": 0.20594292438952633,
"eval_entropy": 1.43291619208854,
"eval_loss": 1.3930107355117798,
"eval_mean_token_accuracy": 0.6941823344531022,
"eval_num_tokens": 2825853.0,
"eval_runtime": 117.0332,
"eval_samples_per_second": 26.018,
"eval_steps_per_second": 3.255,
"step": 700
},
{
"entropy": 1.3790567636489868,
"epoch": 0.20888496616651955,
"grad_norm": 0.3361744284629822,
"learning_rate": 0.0001997918163794244,
"loss": 1.4195181846618652,
"mean_token_accuracy": 0.6918542444705963,
"num_tokens": 2866199.0,
"step": 710
},
{
"entropy": 1.4847285509109498,
"epoch": 0.2118270079435128,
"grad_norm": 0.5025491118431091,
"learning_rate": 0.00019977037591169583,
"loss": 1.479151153564453,
"mean_token_accuracy": 0.6787452459335327,
"num_tokens": 2906632.0,
"step": 720
},
{
"entropy": 1.3509209036827088,
"epoch": 0.21476904972050603,
"grad_norm": 0.3572346866130829,
"learning_rate": 0.00019974788608911802,
"loss": 1.3688506126403808,
"mean_token_accuracy": 0.6984890639781952,
"num_tokens": 2947203.0,
"step": 730
},
{
"entropy": 1.3631734371185302,
"epoch": 0.21771109149749926,
"grad_norm": 0.33028289675712585,
"learning_rate": 0.00019972434714823217,
"loss": 1.389684009552002,
"mean_token_accuracy": 0.7006505787372589,
"num_tokens": 2987399.0,
"step": 740
},
{
"entropy": 1.3604696154594422,
"epoch": 0.22065313327449249,
"grad_norm": 0.368522584438324,
"learning_rate": 0.00019969975933661378,
"loss": 1.376063919067383,
"mean_token_accuracy": 0.6953622698783875,
"num_tokens": 3027379.0,
"step": 750
},
{
"entropy": 1.4122770190238954,
"epoch": 0.22359517505148574,
"grad_norm": 0.3348753750324249,
"learning_rate": 0.00019967412291287007,
"loss": 1.406267261505127,
"mean_token_accuracy": 0.6962361812591553,
"num_tokens": 3067608.0,
"step": 760
},
{
"entropy": 1.357818615436554,
"epoch": 0.22653721682847897,
"grad_norm": 0.34611567854881287,
"learning_rate": 0.00019964743814663725,
"loss": 1.3942573547363282,
"mean_token_accuracy": 0.6927765250205994,
"num_tokens": 3108050.0,
"step": 770
},
{
"entropy": 1.3579653978347779,
"epoch": 0.2294792586054722,
"grad_norm": 0.42795485258102417,
"learning_rate": 0.0001996197053185777,
"loss": 1.369248104095459,
"mean_token_accuracy": 0.70080486536026,
"num_tokens": 3148448.0,
"step": 780
},
{
"entropy": 1.3725898623466493,
"epoch": 0.23242130038246542,
"grad_norm": 0.37439125776290894,
"learning_rate": 0.00019959092472037688,
"loss": 1.3844552040100098,
"mean_token_accuracy": 0.6901408195495605,
"num_tokens": 3189104.0,
"step": 790
},
{
"entropy": 1.3633452892303466,
"epoch": 0.23536334215945867,
"grad_norm": 0.3426375091075897,
"learning_rate": 0.0001995610966547406,
"loss": 1.3828603744506835,
"mean_token_accuracy": 0.699894517660141,
"num_tokens": 3229594.0,
"step": 800
},
{
"epoch": 0.23536334215945867,
"eval_entropy": 1.3796262137220288,
"eval_loss": 1.3804020881652832,
"eval_mean_token_accuracy": 0.6970919832157025,
"eval_num_tokens": 3229594.0,
"eval_runtime": 117.0459,
"eval_samples_per_second": 26.015,
"eval_steps_per_second": 3.255,
"step": 800
},
{
"entropy": 1.3366303324699402,
"epoch": 0.2383053839364519,
"grad_norm": 0.31953537464141846,
"learning_rate": 0.00019953022143539144,
"loss": 1.3653806686401366,
"mean_token_accuracy": 0.7049518287181854,
"num_tokens": 3269982.0,
"step": 810
},
{
"entropy": 1.3933040380477906,
"epoch": 0.24124742571344512,
"grad_norm": 0.39059045910835266,
"learning_rate": 0.00019949829938706567,
"loss": 1.4077239990234376,
"mean_token_accuracy": 0.6878176212310791,
"num_tokens": 3310501.0,
"step": 820
},
{
"entropy": 1.4102402210235596,
"epoch": 0.24418946749043838,
"grad_norm": 0.40408599376678467,
"learning_rate": 0.00019946533084550983,
"loss": 1.42384614944458,
"mean_token_accuracy": 0.6878543615341186,
"num_tokens": 3350905.0,
"step": 830
},
{
"entropy": 1.3664647340774536,
"epoch": 0.2471315092674316,
"grad_norm": 0.33828359842300415,
"learning_rate": 0.00019943131615747715,
"loss": 1.3716326713562013,
"mean_token_accuracy": 0.7006340861320496,
"num_tokens": 3391405.0,
"step": 840
},
{
"entropy": 1.427152693271637,
"epoch": 0.25007355104442486,
"grad_norm": 0.48738211393356323,
"learning_rate": 0.00019939625568072387,
"loss": 1.4487317085266114,
"mean_token_accuracy": 0.6779101848602295,
"num_tokens": 3431764.0,
"step": 850
},
{
"entropy": 1.3256952285766601,
"epoch": 0.25301559282141806,
"grad_norm": 0.4037957191467285,
"learning_rate": 0.00019936014978400558,
"loss": 1.341677474975586,
"mean_token_accuracy": 0.7102087676525116,
"num_tokens": 3472134.0,
"step": 860
},
{
"entropy": 1.3219331979751587,
"epoch": 0.2559576345984113,
"grad_norm": 0.3991295099258423,
"learning_rate": 0.00019932299884707324,
"loss": 1.3323281288146973,
"mean_token_accuracy": 0.7040995419025421,
"num_tokens": 3512618.0,
"step": 870
},
{
"entropy": 1.327842366695404,
"epoch": 0.2588996763754045,
"grad_norm": 0.37586092948913574,
"learning_rate": 0.00019928480326066925,
"loss": 1.3401626586914062,
"mean_token_accuracy": 0.7077195107936859,
"num_tokens": 3553039.0,
"step": 880
},
{
"entropy": 1.4176750898361206,
"epoch": 0.26184171815239776,
"grad_norm": 0.3479557931423187,
"learning_rate": 0.00019924556342652334,
"loss": 1.435785961151123,
"mean_token_accuracy": 0.6839641869068146,
"num_tokens": 3593551.0,
"step": 890
},
{
"entropy": 1.3467723608016968,
"epoch": 0.264783759929391,
"grad_norm": 0.3519476056098938,
"learning_rate": 0.00019920527975734827,
"loss": 1.3625640869140625,
"mean_token_accuracy": 0.6986214816570282,
"num_tokens": 3633984.0,
"step": 900
},
{
"epoch": 0.264783759929391,
"eval_entropy": 1.327493928861743,
"eval_loss": 1.3682215213775635,
"eval_mean_token_accuracy": 0.6995677048452883,
"eval_num_tokens": 3633984.0,
"eval_runtime": 117.0553,
"eval_samples_per_second": 26.013,
"eval_steps_per_second": 3.255,
"step": 900
},
{
"entropy": 1.377519929409027,
"epoch": 0.2677258017063842,
"grad_norm": 0.3842776417732239,
"learning_rate": 0.0001991639526768356,
"loss": 1.4039417266845704,
"mean_token_accuracy": 0.6875424087047577,
"num_tokens": 3674545.0,
"step": 910
},
{
"entropy": 1.314482867717743,
"epoch": 0.27066784348337747,
"grad_norm": 0.4236571192741394,
"learning_rate": 0.00019912158261965114,
"loss": 1.3185301780700684,
"mean_token_accuracy": 0.7092587351799011,
"num_tokens": 3714993.0,
"step": 920
},
{
"entropy": 1.3224342346191407,
"epoch": 0.2736098852603707,
"grad_norm": 0.3825512230396271,
"learning_rate": 0.0001990781700314304,
"loss": 1.3342713356018066,
"mean_token_accuracy": 0.7061914503574371,
"num_tokens": 3755443.0,
"step": 930
},
{
"entropy": 1.3411121726036073,
"epoch": 0.2765519270373639,
"grad_norm": 0.3913728594779968,
"learning_rate": 0.00019903371536877397,
"loss": 1.372488784790039,
"mean_token_accuracy": 0.6965407311916352,
"num_tokens": 3795944.0,
"step": 940
},
{
"entropy": 1.3303414344787599,
"epoch": 0.2794939688143572,
"grad_norm": 0.34784528613090515,
"learning_rate": 0.00019898821909924267,
"loss": 1.329643440246582,
"mean_token_accuracy": 0.7043894708156586,
"num_tokens": 3836572.0,
"step": 950
},
{
"entropy": 1.3415798902511598,
"epoch": 0.2824360105913504,
"grad_norm": 0.3249607980251312,
"learning_rate": 0.00019894168170135252,
"loss": 1.3672316551208497,
"mean_token_accuracy": 0.7032464861869812,
"num_tokens": 3876943.0,
"step": 960
},
{
"entropy": 1.3220824122428894,
"epoch": 0.2853780523683436,
"grad_norm": 0.3845159411430359,
"learning_rate": 0.00019889410366456995,
"loss": 1.326685905456543,
"mean_token_accuracy": 0.7063803553581238,
"num_tokens": 3917514.0,
"step": 970
},
{
"entropy": 1.3524356603622436,
"epoch": 0.2883200941453369,
"grad_norm": 0.49597394466400146,
"learning_rate": 0.00019884548548930648,
"loss": 1.3823152542114259,
"mean_token_accuracy": 0.688988733291626,
"num_tokens": 3958016.0,
"step": 980
},
{
"entropy": 1.3366973161697389,
"epoch": 0.2912621359223301,
"grad_norm": 0.3381548523902893,
"learning_rate": 0.00019879582768691343,
"loss": 1.3448416709899902,
"mean_token_accuracy": 0.7067325949668884,
"num_tokens": 3998636.0,
"step": 990
},
{
"entropy": 1.342777681350708,
"epoch": 0.29420417769932333,
"grad_norm": 0.3816724717617035,
"learning_rate": 0.0001987451307796767,
"loss": 1.3505562782287597,
"mean_token_accuracy": 0.7011382758617402,
"num_tokens": 4039116.0,
"step": 1000
},
{
"epoch": 0.29420417769932333,
"eval_entropy": 1.3210531396502898,
"eval_loss": 1.3546180725097656,
"eval_mean_token_accuracy": 0.7016779092040275,
"eval_num_tokens": 4039116.0,
"eval_runtime": 116.9387,
"eval_samples_per_second": 26.039,
"eval_steps_per_second": 3.258,
"step": 1000
},
{
"entropy": 1.366915476322174,
"epoch": 0.2971462194763166,
"grad_norm": 0.35182058811187744,
"learning_rate": 0.00019869339530081105,
"loss": 1.3859369277954101,
"mean_token_accuracy": 0.6906876146793366,
"num_tokens": 4079633.0,
"step": 1010
},
{
"entropy": 1.3304717183113097,
"epoch": 0.3000882612533098,
"grad_norm": 0.3552508056163788,
"learning_rate": 0.0001986406217944548,
"loss": 1.3530879974365235,
"mean_token_accuracy": 0.703288596868515,
"num_tokens": 4120199.0,
"step": 1020
},
{
"entropy": 1.3517386436462402,
"epoch": 0.30303030303030304,
"grad_norm": 0.3985072076320648,
"learning_rate": 0.0001985868108156638,
"loss": 1.3472537040710448,
"mean_token_accuracy": 0.7062719881534576,
"num_tokens": 4160213.0,
"step": 1030
},
{
"entropy": 1.2475824117660523,
"epoch": 0.30597234480729624,
"grad_norm": 0.35150817036628723,
"learning_rate": 0.00019853196293040577,
"loss": 1.2721343040466309,
"mean_token_accuracy": 0.7261709928512573,
"num_tokens": 4200515.0,
"step": 1040
},
{
"entropy": 1.3748682379722594,
"epoch": 0.3089143865842895,
"grad_norm": 0.3976305425167084,
"learning_rate": 0.00019847607871555426,
"loss": 1.3997004508972168,
"mean_token_accuracy": 0.6935756504535675,
"num_tokens": 4240452.0,
"step": 1050
},
{
"entropy": 1.250860321521759,
"epoch": 0.31185642836128274,
"grad_norm": 0.4553461968898773,
"learning_rate": 0.00019841915875888272,
"loss": 1.2498172760009765,
"mean_token_accuracy": 0.7244091928005219,
"num_tokens": 4280962.0,
"step": 1060
},
{
"entropy": 1.3760404348373414,
"epoch": 0.31479847013827594,
"grad_norm": 0.489533394575119,
"learning_rate": 0.00019836120365905813,
"loss": 1.3888651847839355,
"mean_token_accuracy": 0.693582957983017,
"num_tokens": 4321434.0,
"step": 1070
},
{
"entropy": 1.3051153898239136,
"epoch": 0.3177405119152692,
"grad_norm": 0.3459921181201935,
"learning_rate": 0.0001983022140256348,
"loss": 1.3199047088623046,
"mean_token_accuracy": 0.7050609290599823,
"num_tokens": 4361714.0,
"step": 1080
},
{
"entropy": 1.2985354661941528,
"epoch": 0.32068255369226245,
"grad_norm": 0.33598437905311584,
"learning_rate": 0.00019824219047904804,
"loss": 1.3020724296569823,
"mean_token_accuracy": 0.713257223367691,
"num_tokens": 4401949.0,
"step": 1090
},
{
"entropy": 1.276042139530182,
"epoch": 0.32362459546925565,
"grad_norm": 0.43967100977897644,
"learning_rate": 0.00019818113365060742,
"loss": 1.3166107177734374,
"mean_token_accuracy": 0.7159667015075684,
"num_tokens": 4442174.0,
"step": 1100
},
{
"epoch": 0.32362459546925565,
"eval_entropy": 1.3324664305514238,
"eval_loss": 1.3422644138336182,
"eval_mean_token_accuracy": 0.7045243512301307,
"eval_num_tokens": 4442174.0,
"eval_runtime": 116.854,
"eval_samples_per_second": 26.058,
"eval_steps_per_second": 3.26,
"step": 1100
},
{
"entropy": 1.3644051671028137,
"epoch": 0.3265666372462489,
"grad_norm": 0.3831937313079834,
"learning_rate": 0.0001981190441824903,
"loss": 1.3389662742614745,
"mean_token_accuracy": 0.7077198505401612,
"num_tokens": 4482638.0,
"step": 1110
},
{
"entropy": 1.3949143767356873,
"epoch": 0.32950867902324216,
"grad_norm": 0.4152087867259979,
"learning_rate": 0.0001980559227277352,
"loss": 1.4393955230712892,
"mean_token_accuracy": 0.6806177437305451,
"num_tokens": 4523231.0,
"step": 1120
},
{
"entropy": 1.5005442142486571,
"epoch": 0.33245072080023536,
"grad_norm": 0.3600977957248688,
"learning_rate": 0.00019799176995023446,
"loss": 1.5087374687194823,
"mean_token_accuracy": 0.6630822360515595,
"num_tokens": 4563875.0,
"step": 1130
},
{
"entropy": 1.3872592091560363,
"epoch": 0.3353927625772286,
"grad_norm": 0.41946983337402344,
"learning_rate": 0.00019792658652472784,
"loss": 1.3950308799743651,
"mean_token_accuracy": 0.6971809685230255,
"num_tokens": 4604493.0,
"step": 1140
},
{
"entropy": 1.2849906086921692,
"epoch": 0.3383348043542218,
"grad_norm": 0.44172239303588867,
"learning_rate": 0.00019786037313679496,
"loss": 1.3040260314941405,
"mean_token_accuracy": 0.7124627947807312,
"num_tokens": 4644971.0,
"step": 1150
},
{
"entropy": 1.3872546672821044,
"epoch": 0.34127684613121506,
"grad_norm": 0.42291128635406494,
"learning_rate": 0.0001977931304828484,
"loss": 1.3932037353515625,
"mean_token_accuracy": 0.6873931646347046,
"num_tokens": 4685402.0,
"step": 1160
},
{
"entropy": 1.3018505930900575,
"epoch": 0.3442188879082083,
"grad_norm": 0.40709924697875977,
"learning_rate": 0.00019772485927012617,
"loss": 1.3295734405517579,
"mean_token_accuracy": 0.708010071516037,
"num_tokens": 4725827.0,
"step": 1170
},
{
"entropy": 1.306389570236206,
"epoch": 0.3471609296852015,
"grad_norm": 0.3591320514678955,
"learning_rate": 0.00019765556021668438,
"loss": 1.2882349014282226,
"mean_token_accuracy": 0.7196535110473633,
"num_tokens": 4766299.0,
"step": 1180
},
{
"entropy": 1.274339497089386,
"epoch": 0.35010297146219477,
"grad_norm": 0.49734798073768616,
"learning_rate": 0.0001975852340513897,
"loss": 1.322108268737793,
"mean_token_accuracy": 0.7085251092910767,
"num_tokens": 4806716.0,
"step": 1190
},
{
"entropy": 1.3069317936897278,
"epoch": 0.353045013239188,
"grad_norm": 0.38912537693977356,
"learning_rate": 0.00019751388151391153,
"loss": 1.2966851234436034,
"mean_token_accuracy": 0.7151984930038452,
"num_tokens": 4846722.0,
"step": 1200
},
{
"epoch": 0.353045013239188,
"eval_entropy": 1.2881334237852122,
"eval_loss": 1.3329390287399292,
"eval_mean_token_accuracy": 0.7064482895095204,
"eval_num_tokens": 4846722.0,
"eval_runtime": 116.8679,
"eval_samples_per_second": 26.055,
"eval_steps_per_second": 3.26,
"step": 1200
},
{
"entropy": 1.323479461669922,
"epoch": 0.3559870550161812,
"grad_norm": 0.3935072720050812,
"learning_rate": 0.0001974415033547145,
"loss": 1.3559471130371095,
"mean_token_accuracy": 0.6980177283287048,
"num_tokens": 4887382.0,
"step": 1210
},
{
"entropy": 1.3160091280937194,
"epoch": 0.3589290967931745,
"grad_norm": 0.402215838432312,
"learning_rate": 0.00019736810033505037,
"loss": 1.3169782638549805,
"mean_token_accuracy": 0.7115081787109375,
"num_tokens": 4927725.0,
"step": 1220
},
{
"entropy": 1.3298014521598815,
"epoch": 0.36187113857016767,
"grad_norm": 0.4016437530517578,
"learning_rate": 0.00019729367322695,
"loss": 1.340937900543213,
"mean_token_accuracy": 0.7020221769809722,
"num_tokens": 4968297.0,
"step": 1230
},
{
"entropy": 1.3313292741775513,
"epoch": 0.3648131803471609,
"grad_norm": 0.4023797810077667,
"learning_rate": 0.00019721822281321537,
"loss": 1.3570178031921387,
"mean_token_accuracy": 0.695677000284195,
"num_tokens": 5008063.0,
"step": 1240
},
{
"entropy": 1.3236193537712098,
"epoch": 0.3677552221241542,
"grad_norm": 0.39722758531570435,
"learning_rate": 0.00019714174988741127,
"loss": 1.3275541305541991,
"mean_token_accuracy": 0.7125855982303619,
"num_tokens": 5048336.0,
"step": 1250
},
{
"entropy": 1.3190148949623108,
"epoch": 0.3706972639011474,
"grad_norm": 0.35919976234436035,
"learning_rate": 0.000197064255253857,
"loss": 1.3260068893432617,
"mean_token_accuracy": 0.7106278300285339,
"num_tokens": 5088942.0,
"step": 1260
},
{
"entropy": 1.3633437037467957,
"epoch": 0.37363930567814063,
"grad_norm": 0.41243675351142883,
"learning_rate": 0.0001969857397276178,
"loss": 1.3687942504882813,
"mean_token_accuracy": 0.6920778334140778,
"num_tokens": 5129372.0,
"step": 1270
},
{
"entropy": 1.2383417725563048,
"epoch": 0.3765813474551339,
"grad_norm": 0.40572160482406616,
"learning_rate": 0.00019690620413449642,
"loss": 1.2481071472167968,
"mean_token_accuracy": 0.7250577747821808,
"num_tokens": 5169845.0,
"step": 1280
},
{
"entropy": 1.2865507364273072,
"epoch": 0.3795233892321271,
"grad_norm": 0.42980971932411194,
"learning_rate": 0.00019682564931102435,
"loss": 1.3222503662109375,
"mean_token_accuracy": 0.7078245043754577,
"num_tokens": 5210494.0,
"step": 1290
},
{
"entropy": 1.3427109718322754,
"epoch": 0.38246543100912034,
"grad_norm": 0.39987629652023315,
"learning_rate": 0.000196744076104453,
"loss": 1.3458648681640626,
"mean_token_accuracy": 0.7039949476718903,
"num_tokens": 5250876.0,
"step": 1300
},
{
"epoch": 0.38246543100912034,
"eval_entropy": 1.3054086448639397,
"eval_loss": 1.328322172164917,
"eval_mean_token_accuracy": 0.7073736231470984,
"eval_num_tokens": 5250876.0,
"eval_runtime": 116.8194,
"eval_samples_per_second": 26.066,
"eval_steps_per_second": 3.261,
"step": 1300
},
{
"entropy": 1.3619997262954713,
"epoch": 0.38540747278611354,
"grad_norm": 0.39274510741233826,
"learning_rate": 0.00019666148537274486,
"loss": 1.393809986114502,
"mean_token_accuracy": 0.6958752512931824,
"num_tokens": 5291408.0,
"step": 1310
},
{
"entropy": 1.313580584526062,
"epoch": 0.3883495145631068,
"grad_norm": 0.4472017288208008,
"learning_rate": 0.00019657787798456447,
"loss": 1.3207698822021485,
"mean_token_accuracy": 0.7056093811988831,
"num_tokens": 5331902.0,
"step": 1320
},
{
"entropy": 1.3660961747169496,
"epoch": 0.39129155634010004,
"grad_norm": 0.3704398572444916,
"learning_rate": 0.00019649325481926918,
"loss": 1.3579423904418946,
"mean_token_accuracy": 0.6998582005500793,
"num_tokens": 5372344.0,
"step": 1330
},
{
"entropy": 1.2838171362876891,
"epoch": 0.39423359811709324,
"grad_norm": 0.42561763525009155,
"learning_rate": 0.0001964076167669001,
"loss": 1.3317262649536132,
"mean_token_accuracy": 0.707372397184372,
"num_tokens": 5412588.0,
"step": 1340
},
{
"entropy": 1.3983584880828857,
"epoch": 0.3971756398940865,
"grad_norm": 0.39428314566612244,
"learning_rate": 0.00019632096472817247,
"loss": 1.3956350326538085,
"mean_token_accuracy": 0.6877002000808716,
"num_tokens": 5452540.0,
"step": 1350
},
{
"entropy": 1.2444414377212525,
"epoch": 0.40011768167107975,
"grad_norm": 0.46192431449890137,
"learning_rate": 0.00019623329961446646,
"loss": 1.2581539154052734,
"mean_token_accuracy": 0.7236546576023102,
"num_tokens": 5492713.0,
"step": 1360
},
{
"entropy": 1.3872360110282898,
"epoch": 0.40305972344807295,
"grad_norm": 0.4279949367046356,
"learning_rate": 0.0001961446223478174,
"loss": 1.3998414993286132,
"mean_token_accuracy": 0.6933107733726501,
"num_tokens": 5532960.0,
"step": 1370
},
{
"entropy": 1.3212559223175049,
"epoch": 0.4060017652250662,
"grad_norm": 0.5924211740493774,
"learning_rate": 0.0001960549338609061,
"loss": 1.3529240608215332,
"mean_token_accuracy": 0.7018321037292481,
"num_tokens": 5573284.0,
"step": 1380
},
{
"entropy": 1.2787784337997437,
"epoch": 0.40894380700205946,
"grad_norm": 0.4120098948478699,
"learning_rate": 0.00019596423509704916,
"loss": 1.2666643142700196,
"mean_token_accuracy": 0.7207745730876922,
"num_tokens": 5613401.0,
"step": 1390
},
{
"entropy": 1.2354493618011475,
"epoch": 0.41188584877905265,
"grad_norm": 0.3948381245136261,
"learning_rate": 0.00019587252701018897,
"loss": 1.258436393737793,
"mean_token_accuracy": 0.7286782383918762,
"num_tokens": 5653527.0,
"step": 1400
},
{
"epoch": 0.41188584877905265,
"eval_entropy": 1.3250519281297217,
"eval_loss": 1.317603588104248,
"eval_mean_token_accuracy": 0.709270513902499,
"eval_num_tokens": 5653527.0,
"eval_runtime": 117.0125,
"eval_samples_per_second": 26.023,
"eval_steps_per_second": 3.256,
"step": 1400
},
{
"entropy": 1.2924874544143676,
"epoch": 0.4148278905560459,
"grad_norm": 0.3901238739490509,
"learning_rate": 0.0001957798105648836,
"loss": 1.3112905502319336,
"mean_token_accuracy": 0.7130284488201142,
"num_tokens": 5693830.0,
"step": 1410
},
{
"entropy": 1.3326545000076293,
"epoch": 0.4177699323330391,
"grad_norm": 0.4452548623085022,
"learning_rate": 0.0001956860867362968,
"loss": 1.3477538108825684,
"mean_token_accuracy": 0.6966541707515717,
"num_tokens": 5734423.0,
"step": 1420
},
{
"entropy": 1.3726879000663756,
"epoch": 0.42071197411003236,
"grad_norm": 0.41140350699424744,
"learning_rate": 0.00019559135651018764,
"loss": 1.364595603942871,
"mean_token_accuracy": 0.6968912184238434,
"num_tokens": 5774991.0,
"step": 1430
},
{
"entropy": 1.2218275964260101,
"epoch": 0.4236540158870256,
"grad_norm": 0.34981605410575867,
"learning_rate": 0.0001954956208829002,
"loss": 1.255105686187744,
"mean_token_accuracy": 0.7247668862342834,
"num_tokens": 5815447.0,
"step": 1440
},
{
"entropy": 1.277131152153015,
"epoch": 0.4265960576640188,
"grad_norm": 0.4556473195552826,
"learning_rate": 0.00019539888086135302,
"loss": 1.2920147895812988,
"mean_token_accuracy": 0.7172133564949036,
"num_tokens": 5855958.0,
"step": 1450
},
{
"entropy": 1.2765482187271118,
"epoch": 0.42953809944101207,
"grad_norm": 0.3995361924171448,
"learning_rate": 0.00019530113746302864,
"loss": 1.2754140853881837,
"mean_token_accuracy": 0.7180281519889832,
"num_tokens": 5896270.0,
"step": 1460
},
{
"entropy": 1.3740562319755554,
"epoch": 0.4324801412180053,
"grad_norm": 0.42559972405433655,
"learning_rate": 0.00019520239171596276,
"loss": 1.4036828994750976,
"mean_token_accuracy": 0.6909762322902679,
"num_tokens": 5936780.0,
"step": 1470
},
{
"entropy": 1.3417500495910644,
"epoch": 0.4354221829949985,
"grad_norm": 0.36076462268829346,
"learning_rate": 0.00019510264465873344,
"loss": 1.3366243362426757,
"mean_token_accuracy": 0.7046165943145752,
"num_tokens": 5977191.0,
"step": 1480
},
{
"entropy": 1.2735178232192994,
"epoch": 0.4383642247719918,
"grad_norm": 0.4245460033416748,
"learning_rate": 0.0001950018973404503,
"loss": 1.296212387084961,
"mean_token_accuracy": 0.7112137854099274,
"num_tokens": 6017786.0,
"step": 1490
},
{
"entropy": 1.2875243425369263,
"epoch": 0.44130626654898497,
"grad_norm": 0.39781293272972107,
"learning_rate": 0.00019490015082074342,
"loss": 1.2946128845214844,
"mean_token_accuracy": 0.7150110900402069,
"num_tokens": 6058171.0,
"step": 1500
},
{
"epoch": 0.44130626654898497,
"eval_entropy": 1.2906748921539526,
"eval_loss": 1.3084396123886108,
"eval_mean_token_accuracy": 0.711328562789076,
"eval_num_tokens": 6058171.0,
"eval_runtime": 116.9727,
"eval_samples_per_second": 26.032,
"eval_steps_per_second": 3.257,
"step": 1500
},
{
"entropy": 1.335238778591156,
"epoch": 0.4442483083259782,
"grad_norm": 0.46100911498069763,
"learning_rate": 0.00019479740616975207,
"loss": 1.3694096565246583,
"mean_token_accuracy": 0.6983109295368195,
"num_tokens": 6098236.0,
"step": 1510
},
{
"entropy": 1.3258445739746094,
"epoch": 0.4471903501029715,
"grad_norm": 0.4548512399196625,
"learning_rate": 0.00019469366446811368,
"loss": 1.3223464012145996,
"mean_token_accuracy": 0.7081819117069245,
"num_tokens": 6138323.0,
"step": 1520
},
{
"entropy": 1.4055490851402284,
"epoch": 0.4501323918799647,
"grad_norm": 0.5857915282249451,
"learning_rate": 0.0001945889268069523,
"loss": 1.4265625,
"mean_token_accuracy": 0.6870070040225983,
"num_tokens": 6178759.0,
"step": 1530
},
{
"entropy": 1.3535587549209596,
"epoch": 0.45307443365695793,
"grad_norm": 0.4886157810688019,
"learning_rate": 0.00019448319428786714,
"loss": 1.3616491317749024,
"mean_token_accuracy": 0.6958298921585083,
"num_tokens": 6219216.0,
"step": 1540
},
{
"entropy": 1.2599135279655456,
"epoch": 0.4560164754339512,
"grad_norm": 0.4452349543571472,
"learning_rate": 0.00019437646802292116,
"loss": 1.2533982276916504,
"mean_token_accuracy": 0.7242700159549713,
"num_tokens": 6259618.0,
"step": 1550
},
{
"entropy": 1.247275459766388,
"epoch": 0.4589585172109444,
"grad_norm": 0.43969377875328064,
"learning_rate": 0.0001942687491346291,
"loss": 1.3022977828979492,
"mean_token_accuracy": 0.7166135847568512,
"num_tokens": 6300125.0,
"step": 1560
},
{
"entropy": 1.2933772325515747,
"epoch": 0.46190055898793764,
"grad_norm": 0.4098397195339203,
"learning_rate": 0.0001941600387559459,
"loss": 1.2829959869384766,
"mean_token_accuracy": 0.7151767909526825,
"num_tokens": 6340600.0,
"step": 1570
},
{
"entropy": 1.3236228585243226,
"epoch": 0.46484260076493084,
"grad_norm": 0.9286667108535767,
"learning_rate": 0.0001940503380302547,
"loss": 1.3246389389038087,
"mean_token_accuracy": 0.7080819308757782,
"num_tokens": 6381016.0,
"step": 1580
},
{
"entropy": 1.3100301146507263,
"epoch": 0.4677846425419241,
"grad_norm": 0.44687584042549133,
"learning_rate": 0.00019393964811135475,
"loss": 1.3230223655700684,
"mean_token_accuracy": 0.7061593413352967,
"num_tokens": 6421104.0,
"step": 1590
},
{
"entropy": 1.3405604600906371,
"epoch": 0.47072668431891734,
"grad_norm": 0.4198276400566101,
"learning_rate": 0.00019382797016344937,
"loss": 1.372106170654297,
"mean_token_accuracy": 0.6942126870155334,
"num_tokens": 6461464.0,
"step": 1600
},
{
"epoch": 0.47072668431891734,
"eval_entropy": 1.3327079477898405,
"eval_loss": 1.3015934228897095,
"eval_mean_token_accuracy": 0.7128373644170486,
"eval_num_tokens": 6461464.0,
"eval_runtime": 116.8886,
"eval_samples_per_second": 26.05,
"eval_steps_per_second": 3.26,
"step": 1600
},
{
"entropy": 1.2775195240974426,
"epoch": 0.47366872609591054,
"grad_norm": 0.36669495701789856,
"learning_rate": 0.00019371530536113372,
"loss": 1.2723214149475097,
"mean_token_accuracy": 0.7203551054000854,
"num_tokens": 6501717.0,
"step": 1610
},
{
"entropy": 1.263567042350769,
"epoch": 0.4766107678729038,
"grad_norm": 0.3709637522697449,
"learning_rate": 0.00019360165488938228,
"loss": 1.2917292594909668,
"mean_token_accuracy": 0.7129571974277497,
"num_tokens": 6542157.0,
"step": 1620
},
{
"entropy": 1.2828770697116851,
"epoch": 0.47955280964989705,
"grad_norm": 0.40589454770088196,
"learning_rate": 0.00019348701994353662,
"loss": 1.2924720764160156,
"mean_token_accuracy": 0.7142295658588409,
"num_tokens": 6582169.0,
"step": 1630
},
{
"entropy": 1.304066789150238,
"epoch": 0.48249485142689025,
"grad_norm": 0.41759273409843445,
"learning_rate": 0.0001933714017292927,
"loss": 1.3207477569580077,
"mean_token_accuracy": 0.7090053021907806,
"num_tokens": 6622526.0,
"step": 1640
},
{
"entropy": 1.2956494092941284,
"epoch": 0.4854368932038835,
"grad_norm": 0.49899545311927795,
"learning_rate": 0.00019325480146268812,
"loss": 1.3091160774230957,
"mean_token_accuracy": 0.7111666679382325,
"num_tokens": 6663165.0,
"step": 1650
},
{
"entropy": 1.2508437275886535,
"epoch": 0.48837893498087676,
"grad_norm": 0.4445253610610962,
"learning_rate": 0.0001931372203700895,
"loss": 1.2658663749694825,
"mean_token_accuracy": 0.7174579679965973,
"num_tokens": 6703494.0,
"step": 1660
},
{
"entropy": 1.287984347343445,
"epoch": 0.49132097675786995,
"grad_norm": 0.4067074656486511,
"learning_rate": 0.00019301865968817948,
"loss": 1.317389678955078,
"mean_token_accuracy": 0.7164031744003296,
"num_tokens": 6743946.0,
"step": 1670
},
{
"entropy": 1.272541528940201,
"epoch": 0.4942630185348632,
"grad_norm": 0.36161571741104126,
"learning_rate": 0.0001928991206639436,
"loss": 1.2725687980651856,
"mean_token_accuracy": 0.7200010478496551,
"num_tokens": 6784135.0,
"step": 1680
},
{
"entropy": 1.2077086687088012,
"epoch": 0.4972050603118564,
"grad_norm": 0.45158880949020386,
"learning_rate": 0.00019277860455465753,
"loss": 1.2226747512817382,
"mean_token_accuracy": 0.7305562674999238,
"num_tokens": 6824602.0,
"step": 1690
},
{
"entropy": 1.3152117371559142,
"epoch": 0.5001471020888497,
"grad_norm": 0.4278320074081421,
"learning_rate": 0.00019265711262787347,
"loss": 1.3219596862792968,
"mean_token_accuracy": 0.7065966606140137,
"num_tokens": 6864339.0,
"step": 1700
},
{
"epoch": 0.5001471020888497,
"eval_entropy": 1.2670136558727956,
"eval_loss": 1.2904326915740967,
"eval_mean_token_accuracy": 0.7152716207692004,
"eval_num_tokens": 6864339.0,
"eval_runtime": 116.9086,
"eval_samples_per_second": 26.046,
"eval_steps_per_second": 3.259,
"step": 1700
},
{
"entropy": 1.3221548914909362,
"epoch": 0.5030891438658429,
"grad_norm": 0.3975633382797241,
"learning_rate": 0.00019253464616140702,
"loss": 1.349279022216797,
"mean_token_accuracy": 0.7058489739894866,
"num_tokens": 6904961.0,
"step": 1710
},
{
"entropy": 1.3088136434555053,
"epoch": 0.5060311856428361,
"grad_norm": 0.40347951650619507,
"learning_rate": 0.00019241120644332367,
"loss": 1.3145333290100099,
"mean_token_accuracy": 0.7059141278266907,
"num_tokens": 6945507.0,
"step": 1720
},
{
"entropy": 1.1808736383914948,
"epoch": 0.5089732274198293,
"grad_norm": 0.36107468605041504,
"learning_rate": 0.00019228679477192534,
"loss": 1.1965153694152832,
"mean_token_accuracy": 0.7326464653015137,
"num_tokens": 6985888.0,
"step": 1730
},
{
"entropy": 1.360763430595398,
"epoch": 0.5119152691968226,
"grad_norm": 0.4324724078178406,
"learning_rate": 0.0001921614124557366,
"loss": 1.3564807891845703,
"mean_token_accuracy": 0.7025493502616882,
"num_tokens": 7026252.0,
"step": 1740
},
{
"entropy": 1.2095451593399047,
"epoch": 0.5148573109738158,
"grad_norm": 0.39905378222465515,
"learning_rate": 0.000192035060813491,
"loss": 1.267704391479492,
"mean_token_accuracy": 0.7238605260848999,
"num_tokens": 7066705.0,
"step": 1750
},
{
"entropy": 1.379474115371704,
"epoch": 0.517799352750809,
"grad_norm": 0.44654110074043274,
"learning_rate": 0.00019190774117411717,
"loss": 1.352774715423584,
"mean_token_accuracy": 0.6996113300323487,
"num_tokens": 7106781.0,
"step": 1760
},
{
"entropy": 1.2537951588630676,
"epoch": 0.5207413945278023,
"grad_norm": 0.4148572087287903,
"learning_rate": 0.00019177945487672482,
"loss": 1.3051136970520019,
"mean_token_accuracy": 0.708976149559021,
"num_tokens": 7147220.0,
"step": 1770
},
{
"entropy": 1.2442652940750123,
"epoch": 0.5236834363047955,
"grad_norm": 0.45965835452079773,
"learning_rate": 0.00019165020327059073,
"loss": 1.2358969688415526,
"mean_token_accuracy": 0.7264375448226928,
"num_tokens": 7187657.0,
"step": 1780
},
{
"entropy": 1.2980039596557618,
"epoch": 0.5266254780817887,
"grad_norm": 0.4910111725330353,
"learning_rate": 0.00019151998771514442,
"loss": 1.3084582328796386,
"mean_token_accuracy": 0.7066324114799499,
"num_tokens": 7228181.0,
"step": 1790
},
{
"entropy": 1.2434914827346801,
"epoch": 0.529567519858782,
"grad_norm": 0.3658753037452698,
"learning_rate": 0.0001913888095799541,
"loss": 1.2698859214782714,
"mean_token_accuracy": 0.7207267701625824,
"num_tokens": 7268538.0,
"step": 1800
},
{
"epoch": 0.529567519858782,
"eval_entropy": 1.2639452193978578,
"eval_loss": 1.281688928604126,
"eval_mean_token_accuracy": 0.7168223152323345,
"eval_num_tokens": 7268538.0,
"eval_runtime": 116.9626,
"eval_samples_per_second": 26.034,
"eval_steps_per_second": 3.257,
"step": 1800
},
{
"entropy": 1.3006227135658264,
"epoch": 0.5325095616357752,
"grad_norm": 0.4497034549713135,
"learning_rate": 0.00019125667024471186,
"loss": 1.3108736991882324,
"mean_token_accuracy": 0.7042722702026367,
"num_tokens": 7308724.0,
"step": 1810
},
{
"entropy": 1.275818121433258,
"epoch": 0.5354516034127684,
"grad_norm": 0.4820014536380768,
"learning_rate": 0.00019112357109921964,
"loss": 1.2958525657653808,
"mean_token_accuracy": 0.7129449069499969,
"num_tokens": 7349172.0,
"step": 1820
},
{
"entropy": 1.245762574672699,
"epoch": 0.5383936451897617,
"grad_norm": 0.45138782262802124,
"learning_rate": 0.00019098951354337422,
"loss": 1.2449359893798828,
"mean_token_accuracy": 0.7270717859268189,
"num_tokens": 7389600.0,
"step": 1830
},
{
"entropy": 1.1886737942695618,
"epoch": 0.5413356869667549,
"grad_norm": 0.42880910634994507,
"learning_rate": 0.00019085449898715274,
"loss": 1.2165825843811036,
"mean_token_accuracy": 0.7328237056732178,
"num_tokens": 7429786.0,
"step": 1840
},
{
"entropy": 1.261668312549591,
"epoch": 0.5442777287437481,
"grad_norm": 0.41189083456993103,
"learning_rate": 0.0001907185288505978,
"loss": 1.2767746925354004,
"mean_token_accuracy": 0.718351137638092,
"num_tokens": 7470252.0,
"step": 1850
},
{
"entropy": 1.2039696455001831,
"epoch": 0.5472197705207414,
"grad_norm": 0.384387344121933,
"learning_rate": 0.0001905816045638024,
"loss": 1.2115536689758302,
"mean_token_accuracy": 0.7361586034297943,
"num_tokens": 7510473.0,
"step": 1860
},
{
"entropy": 1.2600649833679198,
"epoch": 0.5501618122977346,
"grad_norm": 0.4238559901714325,
"learning_rate": 0.00019044372756689504,
"loss": 1.2758872985839844,
"mean_token_accuracy": 0.7137023746967316,
"num_tokens": 7550932.0,
"step": 1870
},
{
"entropy": 1.3069803357124328,
"epoch": 0.5531038540747278,
"grad_norm": 0.44645845890045166,
"learning_rate": 0.00019030489931002461,
"loss": 1.3207664489746094,
"mean_token_accuracy": 0.7104817628860474,
"num_tokens": 7591311.0,
"step": 1880
},
{
"entropy": 1.3120111107826233,
"epoch": 0.556045895851721,
"grad_norm": 0.46638140082359314,
"learning_rate": 0.00019016512125334502,
"loss": 1.3248316764831543,
"mean_token_accuracy": 0.705955958366394,
"num_tokens": 7631397.0,
"step": 1890
},
{
"entropy": 1.279408586025238,
"epoch": 0.5589879376287143,
"grad_norm": 0.4149499535560608,
"learning_rate": 0.00019002439486699987,
"loss": 1.2773524284362794,
"mean_token_accuracy": 0.7147800862789154,
"num_tokens": 7671968.0,
"step": 1900
},
{
"epoch": 0.5589879376287143,
"eval_entropy": 1.2283503402249394,
"eval_loss": 1.2789214849472046,
"eval_mean_token_accuracy": 0.7170038914743058,
"eval_num_tokens": 7671968.0,
"eval_runtime": 117.0781,
"eval_samples_per_second": 26.008,
"eval_steps_per_second": 3.254,
"step": 1900
},
{
"entropy": 1.230725634098053,
"epoch": 0.5619299794057075,
"grad_norm": 0.4699569046497345,
"learning_rate": 0.00018988272163110703,
"loss": 1.2412220001220704,
"mean_token_accuracy": 0.7230030238628388,
"num_tokens": 7712399.0,
"step": 1910
},
{
"entropy": 1.3014813184738159,
"epoch": 0.5648720211827007,
"grad_norm": 0.4014744162559509,
"learning_rate": 0.0001897401030357431,
"loss": 1.3222554206848145,
"mean_token_accuracy": 0.7079461216926575,
"num_tokens": 7752805.0,
"step": 1920
},
{
"entropy": 1.2195597231388091,
"epoch": 0.567814062959694,
"grad_norm": 0.4482584297657013,
"learning_rate": 0.00018959654058092753,
"loss": 1.2318772315979003,
"mean_token_accuracy": 0.7294041752815247,
"num_tokens": 7793144.0,
"step": 1930
},
{
"entropy": 1.1877473592758179,
"epoch": 0.5707561047366873,
"grad_norm": 0.44251689314842224,
"learning_rate": 0.00018945203577660718,
"loss": 1.2138689041137696,
"mean_token_accuracy": 0.7385743677616119,
"num_tokens": 7833447.0,
"step": 1940
},
{
"entropy": 1.289076042175293,
"epoch": 0.5736981465136805,
"grad_norm": 0.39147278666496277,
"learning_rate": 0.00018930659014264017,
"loss": 1.2974119186401367,
"mean_token_accuracy": 0.712678074836731,
"num_tokens": 7873974.0,
"step": 1950
},
{
"entropy": 1.208195787668228,
"epoch": 0.5766401882906738,
"grad_norm": 0.5337726473808289,
"learning_rate": 0.00018916020520877994,
"loss": 1.2156153678894044,
"mean_token_accuracy": 0.726024466753006,
"num_tokens": 7914528.0,
"step": 1960
},
{
"entropy": 1.2428096413612366,
"epoch": 0.579582230067667,
"grad_norm": 0.41402509808540344,
"learning_rate": 0.00018901288251465937,
"loss": 1.2540960311889648,
"mean_token_accuracy": 0.7271076798439026,
"num_tokens": 7954995.0,
"step": 1970
},
{
"entropy": 1.3336944222450255,
"epoch": 0.5825242718446602,
"grad_norm": 0.47286257147789,
"learning_rate": 0.00018886462360977418,
"loss": 1.3511391639709474,
"mean_token_accuracy": 0.7000328660011291,
"num_tokens": 7995173.0,
"step": 1980
},
{
"entropy": 1.210561752319336,
"epoch": 0.5854663136216535,
"grad_norm": 0.4485618472099304,
"learning_rate": 0.00018871543005346712,
"loss": 1.222616958618164,
"mean_token_accuracy": 0.7286027550697327,
"num_tokens": 8035513.0,
"step": 1990
},
{
"entropy": 1.2922529458999634,
"epoch": 0.5884083553986467,
"grad_norm": 0.4895835220813751,
"learning_rate": 0.0001885653034149111,
"loss": 1.3031671524047852,
"mean_token_accuracy": 0.7130830705165863,
"num_tokens": 8075879.0,
"step": 2000
},
{
"epoch": 0.5884083553986467,
"eval_entropy": 1.2732777583004609,
"eval_loss": 1.2699540853500366,
"eval_mean_token_accuracy": 0.7192083870957843,
"eval_num_tokens": 8075879.0,
"eval_runtime": 116.9367,
"eval_samples_per_second": 26.04,
"eval_steps_per_second": 3.258,
"step": 2000
},
{
"entropy": 1.2049331665039062,
"epoch": 0.5913503971756399,
"grad_norm": 0.42137858271598816,
"learning_rate": 0.00018841424527309312,
"loss": 1.220776081085205,
"mean_token_accuracy": 0.7255268514156341,
"num_tokens": 8116494.0,
"step": 2010
},
{
"entropy": 1.2747172474861146,
"epoch": 0.5942924389526332,
"grad_norm": 0.47128528356552124,
"learning_rate": 0.0001882622572167973,
"loss": 1.2953474044799804,
"mean_token_accuracy": 0.7114850461483002,
"num_tokens": 8156917.0,
"step": 2020
},
{
"entropy": 1.2040826320648192,
"epoch": 0.5972344807296264,
"grad_norm": 0.477896124124527,
"learning_rate": 0.0001881093408445884,
"loss": 1.2177928924560546,
"mean_token_accuracy": 0.7306720972061157,
"num_tokens": 8197335.0,
"step": 2030
},
{
"entropy": 1.2654430389404296,
"epoch": 0.6001765225066196,
"grad_norm": 0.5165997743606567,
"learning_rate": 0.00018795549776479478,
"loss": 1.275616455078125,
"mean_token_accuracy": 0.7176172435283661,
"num_tokens": 8237564.0,
"step": 2040
},
{
"entropy": 1.2124343276023866,
"epoch": 0.6031185642836129,
"grad_norm": 0.47672176361083984,
"learning_rate": 0.0001878007295954919,
"loss": 1.2235189437866212,
"mean_token_accuracy": 0.7277295291423798,
"num_tokens": 8277586.0,
"step": 2050
},
{
"entropy": 1.2670048475265503,
"epoch": 0.6060606060606061,
"grad_norm": 0.42278149724006653,
"learning_rate": 0.00018764503796448478,
"loss": 1.284127426147461,
"mean_token_accuracy": 0.7204706132411957,
"num_tokens": 8318120.0,
"step": 2060
},
{
"entropy": 1.2286874294281005,
"epoch": 0.6090026478375993,
"grad_norm": 0.4719674587249756,
"learning_rate": 0.0001874884245092913,
"loss": 1.2510007858276366,
"mean_token_accuracy": 0.7220764279365539,
"num_tokens": 8358689.0,
"step": 2070
},
{
"entropy": 1.298528516292572,
"epoch": 0.6119446896145925,
"grad_norm": 0.46403807401657104,
"learning_rate": 0.00018733089087712469,
"loss": 1.2856470108032227,
"mean_token_accuracy": 0.7156751930713654,
"num_tokens": 8399274.0,
"step": 2080
},
{
"entropy": 1.268327033519745,
"epoch": 0.6148867313915858,
"grad_norm": 0.4383772611618042,
"learning_rate": 0.00018717243872487643,
"loss": 1.3167724609375,
"mean_token_accuracy": 0.7053386807441712,
"num_tokens": 8439732.0,
"step": 2090
},
{
"entropy": 1.3164120435714721,
"epoch": 0.617828773168579,
"grad_norm": 0.5242652893066406,
"learning_rate": 0.00018701306971909864,
"loss": 1.3042461395263671,
"mean_token_accuracy": 0.7102492094039917,
"num_tokens": 8479716.0,
"step": 2100
},
{
"epoch": 0.617828773168579,
"eval_entropy": 1.2351457424989836,
"eval_loss": 1.2649626731872559,
"eval_mean_token_accuracy": 0.7205018502833649,
"eval_num_tokens": 8479716.0,
"eval_runtime": 117.0816,
"eval_samples_per_second": 26.008,
"eval_steps_per_second": 3.254,
"step": 2100
},
{
"entropy": 1.2392801344394684,
"epoch": 0.6207708149455722,
"grad_norm": 0.47463512420654297,
"learning_rate": 0.00018685278553598665,
"loss": 1.2612558364868165,
"mean_token_accuracy": 0.7227189362049102,
"num_tokens": 8520008.0,
"step": 2110
},
{
"entropy": 1.2834307312965394,
"epoch": 0.6237128567225655,
"grad_norm": 0.4995603859424591,
"learning_rate": 0.0001866915878613614,
"loss": 1.2821990966796875,
"mean_token_accuracy": 0.7140654444694519,
"num_tokens": 8560586.0,
"step": 2120
},
{
"entropy": 1.2452851295471192,
"epoch": 0.6266548984995587,
"grad_norm": 0.40243977308273315,
"learning_rate": 0.00018652947839065159,
"loss": 1.2768383026123047,
"mean_token_accuracy": 0.7194826602935791,
"num_tokens": 8601116.0,
"step": 2130
},
{
"entropy": 1.2355848252773285,
"epoch": 0.6295969402765519,
"grad_norm": 0.3945181369781494,
"learning_rate": 0.00018636645882887592,
"loss": 1.2494622230529786,
"mean_token_accuracy": 0.7192557215690613,
"num_tokens": 8641094.0,
"step": 2140
},
{
"entropy": 1.243088138103485,
"epoch": 0.6325389820535452,
"grad_norm": 0.43109068274497986,
"learning_rate": 0.0001862025308906252,
"loss": 1.2423103332519532,
"mean_token_accuracy": 0.7271890878677368,
"num_tokens": 8681458.0,
"step": 2150
},
{
"entropy": 1.2035149574279784,
"epoch": 0.6354810238305384,
"grad_norm": 0.4727862477302551,
"learning_rate": 0.0001860376963000443,
"loss": 1.2361689567565919,
"mean_token_accuracy": 0.7284120082855224,
"num_tokens": 8722066.0,
"step": 2160
},
{
"entropy": 1.2721379399299622,
"epoch": 0.6384230656075316,
"grad_norm": 0.5057058930397034,
"learning_rate": 0.00018587195679081386,
"loss": 1.2708622932434082,
"mean_token_accuracy": 0.7155144691467286,
"num_tokens": 8762672.0,
"step": 2170
},
{
"entropy": 1.2714126229286193,
"epoch": 0.6413651073845249,
"grad_norm": 0.4464847147464752,
"learning_rate": 0.0001857053141061323,
"loss": 1.2917202949523925,
"mean_token_accuracy": 0.7170847177505493,
"num_tokens": 8802819.0,
"step": 2180
},
{
"entropy": 1.2385571956634522,
"epoch": 0.6443071491615181,
"grad_norm": 0.4425857365131378,
"learning_rate": 0.00018553776999869737,
"loss": 1.252675437927246,
"mean_token_accuracy": 0.7212274014949799,
"num_tokens": 8842977.0,
"step": 2190
},
{
"entropy": 1.2919608235359192,
"epoch": 0.6472491909385113,
"grad_norm": 0.43156683444976807,
"learning_rate": 0.00018536932623068757,
"loss": 1.2964573860168458,
"mean_token_accuracy": 0.7152363657951355,
"num_tokens": 8883579.0,
"step": 2200
},
{
"epoch": 0.6472491909385113,
"eval_entropy": 1.2316402986919472,
"eval_loss": 1.2606068849563599,
"eval_mean_token_accuracy": 0.7208151789162103,
"eval_num_tokens": 8883579.0,
"eval_runtime": 117.0496,
"eval_samples_per_second": 26.015,
"eval_steps_per_second": 3.255,
"step": 2200
},
{
"entropy": 1.3093620181083678,
"epoch": 0.6501912327155046,
"grad_norm": 0.4637244641780853,
"learning_rate": 0.00018519998457374395,
"loss": 1.34036865234375,
"mean_token_accuracy": 0.7034331321716308,
"num_tokens": 8923903.0,
"step": 2210
},
{
"entropy": 1.214417290687561,
"epoch": 0.6531332744924978,
"grad_norm": 0.5183310508728027,
"learning_rate": 0.00018502974680895115,
"loss": 1.230905532836914,
"mean_token_accuracy": 0.7264467597007751,
"num_tokens": 8963848.0,
"step": 2220
},
{
"entropy": 1.2735732913017273,
"epoch": 0.656075316269491,
"grad_norm": 0.6038491725921631,
"learning_rate": 0.00018485861472681888,
"loss": 1.2594423294067383,
"mean_token_accuracy": 0.7218607306480408,
"num_tokens": 9003224.0,
"step": 2230
},
{
"entropy": 1.278822934627533,
"epoch": 0.6590173580464843,
"grad_norm": 0.47614696621894836,
"learning_rate": 0.00018468659012726301,
"loss": 1.3205299377441406,
"mean_token_accuracy": 0.7092170417308807,
"num_tokens": 9043662.0,
"step": 2240
},
{
"entropy": 1.2612668752670289,
"epoch": 0.6619593998234775,
"grad_norm": 0.4333754777908325,
"learning_rate": 0.00018451367481958655,
"loss": 1.269089412689209,
"mean_token_accuracy": 0.7188269674777985,
"num_tokens": 9084132.0,
"step": 2250
},
{
"entropy": 1.2136012673377992,
"epoch": 0.6649014416004707,
"grad_norm": 0.44086429476737976,
"learning_rate": 0.0001843398706224608,
"loss": 1.222397518157959,
"mean_token_accuracy": 0.7309954643249512,
"num_tokens": 9124313.0,
"step": 2260
},
{
"entropy": 1.2528056859970094,
"epoch": 0.6678434833774639,
"grad_norm": 0.45429208874702454,
"learning_rate": 0.0001841651793639061,
"loss": 1.2543872833251952,
"mean_token_accuracy": 0.7223168253898621,
"num_tokens": 9164680.0,
"step": 2270
},
{
"entropy": 1.2683658719062805,
"epoch": 0.6707855251544572,
"grad_norm": 0.4545646011829376,
"learning_rate": 0.00018398960288127264,
"loss": 1.3083954811096192,
"mean_token_accuracy": 0.7077820837497711,
"num_tokens": 9205179.0,
"step": 2280
},
{
"entropy": 1.267857301235199,
"epoch": 0.6737275669314504,
"grad_norm": 0.43586161732673645,
"learning_rate": 0.00018381314302122115,
"loss": 1.2671592712402344,
"mean_token_accuracy": 0.7227232694625855,
"num_tokens": 9245707.0,
"step": 2290
},
{
"entropy": 1.191249167919159,
"epoch": 0.6766696087084436,
"grad_norm": 0.5259418487548828,
"learning_rate": 0.00018363580163970343,
"loss": 1.1978882789611816,
"mean_token_accuracy": 0.7335732400417327,
"num_tokens": 9286200.0,
"step": 2300
},
{
"epoch": 0.6766696087084436,
"eval_entropy": 1.1682455519365826,
"eval_loss": 1.257144808769226,
"eval_mean_token_accuracy": 0.7223104308909318,
"eval_num_tokens": 9286200.0,
"eval_runtime": 116.9696,
"eval_samples_per_second": 26.032,
"eval_steps_per_second": 3.257,
"step": 2300
},
{
"entropy": 1.2716636419296266,
"epoch": 0.6796116504854369,
"grad_norm": 0.415056049823761,
"learning_rate": 0.00018345758060194287,
"loss": 1.2905259132385254,
"mean_token_accuracy": 0.710951566696167,
"num_tokens": 9326847.0,
"step": 2310
},
{
"entropy": 1.1874773681163788,
"epoch": 0.6825536922624301,
"grad_norm": 0.49722516536712646,
"learning_rate": 0.00018327848178241481,
"loss": 1.217663288116455,
"mean_token_accuracy": 0.7317953050136566,
"num_tokens": 9367084.0,
"step": 2320
},
{
"entropy": 1.2677314758300782,
"epoch": 0.6854957340394233,
"grad_norm": 0.4097212255001068,
"learning_rate": 0.00018309850706482687,
"loss": 1.2633783340454101,
"mean_token_accuracy": 0.7190278351306916,
"num_tokens": 9407189.0,
"step": 2330
},
{
"entropy": 1.2582902312278748,
"epoch": 0.6884377758164166,
"grad_norm": 0.40571051836013794,
"learning_rate": 0.00018291765834209907,
"loss": 1.2858672142028809,
"mean_token_accuracy": 0.7170897841453552,
"num_tokens": 9447880.0,
"step": 2340
},
{
"entropy": 1.327420747280121,
"epoch": 0.6913798175934098,
"grad_norm": 0.48363062739372253,
"learning_rate": 0.0001827359375163439,
"loss": 1.327678108215332,
"mean_token_accuracy": 0.7030242502689361,
"num_tokens": 9488502.0,
"step": 2350
},
{
"entropy": 1.1727963089942932,
"epoch": 0.694321859370403,
"grad_norm": 0.6001095175743103,
"learning_rate": 0.00018255334649884653,
"loss": 1.2080462455749512,
"mean_token_accuracy": 0.7356064558029175,
"num_tokens": 9529082.0,
"step": 2360
},
{
"entropy": 1.2679656863212585,
"epoch": 0.6972639011473963,
"grad_norm": 0.4376157522201538,
"learning_rate": 0.00018236988721004435,
"loss": 1.2510211944580079,
"mean_token_accuracy": 0.7209162175655365,
"num_tokens": 9569520.0,
"step": 2370
},
{
"entropy": 1.192594301700592,
"epoch": 0.7002059429243895,
"grad_norm": 0.4717561900615692,
"learning_rate": 0.00018218556157950712,
"loss": 1.2164905548095704,
"mean_token_accuracy": 0.7315115988254547,
"num_tokens": 9609903.0,
"step": 2380
},
{
"entropy": 1.3375505089759827,
"epoch": 0.7031479847013827,
"grad_norm": 0.5176673531532288,
"learning_rate": 0.00018200037154591643,
"loss": 1.3507174491882323,
"mean_token_accuracy": 0.69825981259346,
"num_tokens": 9650434.0,
"step": 2390
},
{
"entropy": 1.2428280234336853,
"epoch": 0.706090026478376,
"grad_norm": 0.4390230178833008,
"learning_rate": 0.00018181431905704546,
"loss": 1.254446029663086,
"mean_token_accuracy": 0.7244620621204376,
"num_tokens": 9690991.0,
"step": 2400
},
{
"epoch": 0.706090026478376,
"eval_entropy": 1.2351404309585652,
"eval_loss": 1.2472655773162842,
"eval_mean_token_accuracy": 0.7242527881006556,
"eval_num_tokens": 9690991.0,
"eval_runtime": 117.0968,
"eval_samples_per_second": 26.004,
"eval_steps_per_second": 3.254,
"step": 2400
},
{
"entropy": 1.1983890414237977,
"epoch": 0.7090320682553692,
"grad_norm": 0.44956910610198975,
"learning_rate": 0.0001816274060697384,
"loss": 1.2177659034729005,
"mean_token_accuracy": 0.731769061088562,
"num_tokens": 9731440.0,
"step": 2410
},
{
"entropy": 1.2803593873977661,
"epoch": 0.7119741100323624,
"grad_norm": 0.4463217854499817,
"learning_rate": 0.00018143963454988994,
"loss": 1.2735008239746093,
"mean_token_accuracy": 0.7187759101390838,
"num_tokens": 9771061.0,
"step": 2420
},
{
"entropy": 1.22296462059021,
"epoch": 0.7149161518093556,
"grad_norm": 0.4359455406665802,
"learning_rate": 0.0001812510064724245,
"loss": 1.260395622253418,
"mean_token_accuracy": 0.7190466344356536,
"num_tokens": 9811550.0,
"step": 2430
},
{
"entropy": 1.2306616604328156,
"epoch": 0.717858193586349,
"grad_norm": 0.4980376660823822,
"learning_rate": 0.0001810615238212755,
"loss": 1.227048110961914,
"mean_token_accuracy": 0.7294842720031738,
"num_tokens": 9852102.0,
"step": 2440
},
{
"entropy": 1.2669499397277832,
"epoch": 0.7208002353633421,
"grad_norm": 0.4520755708217621,
"learning_rate": 0.00018087118858936462,
"loss": 1.2932658195495605,
"mean_token_accuracy": 0.7133265674114228,
"num_tokens": 9892746.0,
"step": 2450
},
{
"entropy": 1.2530406713485718,
"epoch": 0.7237422771403353,
"grad_norm": 0.6179884076118469,
"learning_rate": 0.00018068000277858065,
"loss": 1.2789620399475097,
"mean_token_accuracy": 0.7135308802127838,
"num_tokens": 9933185.0,
"step": 2460
},
{
"entropy": 1.2204147577285767,
"epoch": 0.7266843189173287,
"grad_norm": 0.4976007044315338,
"learning_rate": 0.00018048796839975856,
"loss": 1.2141535758972168,
"mean_token_accuracy": 0.7293932437896729,
"num_tokens": 9973384.0,
"step": 2470
},
{
"entropy": 1.1704169631004333,
"epoch": 0.7296263606943219,
"grad_norm": 0.44038140773773193,
"learning_rate": 0.0001802950874726582,
"loss": 1.1908206939697266,
"mean_token_accuracy": 0.7376551747322082,
"num_tokens": 10013764.0,
"step": 2480
},
{
"entropy": 1.2523088693618774,
"epoch": 0.732568402471315,
"grad_norm": 0.46554332971572876,
"learning_rate": 0.00018010136202594332,
"loss": 1.2656194686889648,
"mean_token_accuracy": 0.7242594540119172,
"num_tokens": 10054063.0,
"step": 2490
},
{
"entropy": 1.3448559761047363,
"epoch": 0.7355104442483084,
"grad_norm": 0.4372340142726898,
"learning_rate": 0.00017990679409715993,
"loss": 1.3519328117370606,
"mean_token_accuracy": 0.698470801115036,
"num_tokens": 10094687.0,
"step": 2500
},
{
"epoch": 0.7355104442483084,
"eval_entropy": 1.2676614907782848,
"eval_loss": 1.2452576160430908,
"eval_mean_token_accuracy": 0.7236289200507436,
"eval_num_tokens": 10094687.0,
"eval_runtime": 116.8804,
"eval_samples_per_second": 26.052,
"eval_steps_per_second": 3.26,
"step": 2500
},
{
"entropy": 1.2084831714630127,
"epoch": 0.7384524860253016,
"grad_norm": 0.4193192422389984,
"learning_rate": 0.00017971138573271507,
"loss": 1.218832778930664,
"mean_token_accuracy": 0.7289343297481536,
"num_tokens": 10135026.0,
"step": 2510
},
{
"entropy": 1.1639327347278594,
"epoch": 0.7413945278022948,
"grad_norm": 0.48731788992881775,
"learning_rate": 0.0001795151389878552,
"loss": 1.1885252952575684,
"mean_token_accuracy": 0.7402738213539124,
"num_tokens": 10175310.0,
"step": 2520
},
{
"entropy": 1.2578992068767547,
"epoch": 0.7443365695792881,
"grad_norm": 0.434038370847702,
"learning_rate": 0.00017931805592664472,
"loss": 1.26021728515625,
"mean_token_accuracy": 0.7184948623180389,
"num_tokens": 10215765.0,
"step": 2530
},
{
"entropy": 1.2038448989391326,
"epoch": 0.7472786113562813,
"grad_norm": 0.48660168051719666,
"learning_rate": 0.00017912013862194404,
"loss": 1.211390781402588,
"mean_token_accuracy": 0.7284208476543427,
"num_tokens": 10256371.0,
"step": 2540
},
{
"entropy": 1.1407162606716157,
"epoch": 0.7502206531332745,
"grad_norm": 0.5016793608665466,
"learning_rate": 0.0001789213891553879,
"loss": 1.1628236770629883,
"mean_token_accuracy": 0.7418089389801026,
"num_tokens": 10296590.0,
"step": 2550
},
{
"entropy": 1.2328106760978699,
"epoch": 0.7531626949102678,
"grad_norm": 0.42070743441581726,
"learning_rate": 0.00017872180961736356,
"loss": 1.245603656768799,
"mean_token_accuracy": 0.7262676537036896,
"num_tokens": 10337069.0,
"step": 2560
},
{
"entropy": 1.2932706713676452,
"epoch": 0.756104736687261,
"grad_norm": 0.46600809693336487,
"learning_rate": 0.00017852140210698858,
"loss": 1.2860488891601562,
"mean_token_accuracy": 0.7088023841381073,
"num_tokens": 10377428.0,
"step": 2570
},
{
"entropy": 1.147745430469513,
"epoch": 0.7590467784642542,
"grad_norm": 0.4609155058860779,
"learning_rate": 0.00017832016873208905,
"loss": 1.1787425994873046,
"mean_token_accuracy": 0.7361489832401276,
"num_tokens": 10417739.0,
"step": 2580
},
{
"entropy": 1.2378079295158386,
"epoch": 0.7619888202412475,
"grad_norm": 0.4880058467388153,
"learning_rate": 0.00017811811160917712,
"loss": 1.2569812774658202,
"mean_token_accuracy": 0.7204902648925782,
"num_tokens": 10458215.0,
"step": 2590
},
{
"entropy": 1.2285701274871825,
"epoch": 0.7649308620182407,
"grad_norm": 0.5326588153839111,
"learning_rate": 0.0001779152328634289,
"loss": 1.221341609954834,
"mean_token_accuracy": 0.7287818729877472,
"num_tokens": 10498647.0,
"step": 2600
},
{
"epoch": 0.7649308620182407,
"eval_entropy": 1.2003295410649357,
"eval_loss": 1.236678957939148,
"eval_mean_token_accuracy": 0.7259835525760501,
"eval_num_tokens": 10498647.0,
"eval_runtime": 116.9331,
"eval_samples_per_second": 26.041,
"eval_steps_per_second": 3.258,
"step": 2600
},
{
"entropy": 1.2428824484348298,
"epoch": 0.7678729037952339,
"grad_norm": 0.45255497097969055,
"learning_rate": 0.00017771153462866216,
"loss": 1.2709949493408204,
"mean_token_accuracy": 0.7177605211734772,
"num_tokens": 10539009.0,
"step": 2610
},
{
"entropy": 1.277386212348938,
"epoch": 0.7708149455722271,
"grad_norm": 0.48587530851364136,
"learning_rate": 0.00017750701904731373,
"loss": 1.2778440475463868,
"mean_token_accuracy": 0.7140256285667419,
"num_tokens": 10579502.0,
"step": 2620
},
{
"entropy": 1.2311038613319396,
"epoch": 0.7737569873492204,
"grad_norm": 0.6130372285842896,
"learning_rate": 0.00017730168827041708,
"loss": 1.2585201263427734,
"mean_token_accuracy": 0.7184097468852997,
"num_tokens": 10619385.0,
"step": 2630
},
{
"entropy": 1.2518787860870362,
"epoch": 0.7766990291262136,
"grad_norm": 0.43607237935066223,
"learning_rate": 0.00017709554445757966,
"loss": 1.2394478797912598,
"mean_token_accuracy": 0.726053637266159,
"num_tokens": 10659852.0,
"step": 2640
},
{
"entropy": 1.2289941668510438,
"epoch": 0.7796410709032068,
"grad_norm": 0.4687124788761139,
"learning_rate": 0.00017688858977696014,
"loss": 1.2466455459594727,
"mean_token_accuracy": 0.7200910389423371,
"num_tokens": 10700123.0,
"step": 2650
},
{
"entropy": 1.232028889656067,
"epoch": 0.7825831126802001,
"grad_norm": 0.45022356510162354,
"learning_rate": 0.00017668082640524574,
"loss": 1.2503207206726075,
"mean_token_accuracy": 0.7186446607112884,
"num_tokens": 10740503.0,
"step": 2660
},
{
"entropy": 1.204378592967987,
"epoch": 0.7855251544571933,
"grad_norm": 0.41389408707618713,
"learning_rate": 0.0001764722565276292,
"loss": 1.211115264892578,
"mean_token_accuracy": 0.7328132688999176,
"num_tokens": 10780655.0,
"step": 2670
},
{
"entropy": 1.2730337023735045,
"epoch": 0.7884671962341865,
"grad_norm": 0.4794485569000244,
"learning_rate": 0.00017626288233778582,
"loss": 1.2866595268249512,
"mean_token_accuracy": 0.7171245098114014,
"num_tokens": 10821171.0,
"step": 2680
},
{
"entropy": 1.2131190776824952,
"epoch": 0.7914092380111798,
"grad_norm": 0.4519226551055908,
"learning_rate": 0.00017605270603785047,
"loss": 1.228813934326172,
"mean_token_accuracy": 0.7272532522678375,
"num_tokens": 10861594.0,
"step": 2690
},
{
"entropy": 1.2511601805686952,
"epoch": 0.794351279788173,
"grad_norm": 0.4799201488494873,
"learning_rate": 0.00017584172983839435,
"loss": 1.2575819969177247,
"mean_token_accuracy": 0.7226161539554596,
"num_tokens": 10902043.0,
"step": 2700
},
{
"epoch": 0.794351279788173,
"eval_entropy": 1.2509275682642078,
"eval_loss": 1.2339112758636475,
"eval_mean_token_accuracy": 0.7265911623248904,
"eval_num_tokens": 10902043.0,
"eval_runtime": 116.9442,
"eval_samples_per_second": 26.038,
"eval_steps_per_second": 3.258,
"step": 2700
},
{
"entropy": 1.2342944502830506,
"epoch": 0.7972933215651662,
"grad_norm": 0.49187448620796204,
"learning_rate": 0.00017562995595840178,
"loss": 1.2416543006896972,
"mean_token_accuracy": 0.7259420096874237,
"num_tokens": 10942392.0,
"step": 2710
},
{
"entropy": 1.1988507807254791,
"epoch": 0.8002353633421595,
"grad_norm": 0.4817524254322052,
"learning_rate": 0.00017541738662524677,
"loss": 1.2237167358398438,
"mean_token_accuracy": 0.7307404637336731,
"num_tokens": 10982924.0,
"step": 2720
},
{
"entropy": 1.236850619316101,
"epoch": 0.8031774051191527,
"grad_norm": 0.4729112684726715,
"learning_rate": 0.0001752040240746698,
"loss": 1.2266542434692382,
"mean_token_accuracy": 0.7280332922935486,
"num_tokens": 11023305.0,
"step": 2730
},
{
"entropy": 1.1383702993392943,
"epoch": 0.8061194468961459,
"grad_norm": 0.42324507236480713,
"learning_rate": 0.00017498987055075403,
"loss": 1.1699549674987793,
"mean_token_accuracy": 0.7414192616939544,
"num_tokens": 11063084.0,
"step": 2740
},
{
"entropy": 1.1376792788505554,
"epoch": 0.8090614886731392,
"grad_norm": 0.5435130000114441,
"learning_rate": 0.00017477492830590192,
"loss": 1.1505720138549804,
"mean_token_accuracy": 0.7437731683254242,
"num_tokens": 11103518.0,
"step": 2750
},
{
"entropy": 1.2045920014381408,
"epoch": 0.8120035304501324,
"grad_norm": 0.4564155340194702,
"learning_rate": 0.00017455919960081149,
"loss": 1.1978718757629394,
"mean_token_accuracy": 0.7353939712047577,
"num_tokens": 11143802.0,
"step": 2760
},
{
"entropy": 1.197198224067688,
"epoch": 0.8149455722271256,
"grad_norm": 0.457720011472702,
"learning_rate": 0.0001743426867044524,
"loss": 1.2358501434326172,
"mean_token_accuracy": 0.7267086863517761,
"num_tokens": 11184158.0,
"step": 2770
},
{
"entropy": 1.2718565106391906,
"epoch": 0.8178876140041189,
"grad_norm": 0.4250863194465637,
"learning_rate": 0.00017412539189404233,
"loss": 1.2535717010498046,
"mean_token_accuracy": 0.7206644594669342,
"num_tokens": 11224574.0,
"step": 2780
},
{
"entropy": 1.172040694952011,
"epoch": 0.8208296557811121,
"grad_norm": 0.5394912958145142,
"learning_rate": 0.00017390731745502283,
"loss": 1.2080710411071778,
"mean_token_accuracy": 0.7338366210460663,
"num_tokens": 11265027.0,
"step": 2790
},
{
"entropy": 1.187865948677063,
"epoch": 0.8237716975581053,
"grad_norm": 0.49136948585510254,
"learning_rate": 0.00017368846568103529,
"loss": 1.1811614990234376,
"mean_token_accuracy": 0.7365565001964569,
"num_tokens": 11305432.0,
"step": 2800
},
{
"epoch": 0.8237716975581053,
"eval_entropy": 1.2183332227346466,
"eval_loss": 1.2257474660873413,
"eval_mean_token_accuracy": 0.7285947153574526,
"eval_num_tokens": 11305432.0,
"eval_runtime": 116.8432,
"eval_samples_per_second": 26.061,
"eval_steps_per_second": 3.261,
"step": 2800
},
{
"entropy": 1.1511970162391663,
"epoch": 0.8267137393350985,
"grad_norm": 0.487531840801239,
"learning_rate": 0.00017346883887389702,
"loss": 1.1708711624145507,
"mean_token_accuracy": 0.7420612633228302,
"num_tokens": 11345702.0,
"step": 2810
},
{
"entropy": 1.2634961485862732,
"epoch": 0.8296557811120918,
"grad_norm": 0.4640207886695862,
"learning_rate": 0.00017324843934357674,
"loss": 1.277150821685791,
"mean_token_accuracy": 0.7138958215713501,
"num_tokens": 11386240.0,
"step": 2820
},
{
"entropy": 1.1891654789447785,
"epoch": 0.832597822889085,
"grad_norm": 0.47752827405929565,
"learning_rate": 0.0001730272694081706,
"loss": 1.193849754333496,
"mean_token_accuracy": 0.7351171731948852,
"num_tokens": 11426663.0,
"step": 2830
},
{
"entropy": 1.2108049154281617,
"epoch": 0.8355398646660782,
"grad_norm": 0.514695942401886,
"learning_rate": 0.0001728053313938775,
"loss": 1.2478459358215332,
"mean_token_accuracy": 0.721454119682312,
"num_tokens": 11467195.0,
"step": 2840
},
{
"entropy": 1.2818018913269043,
"epoch": 0.8384819064430715,
"grad_norm": 0.517238438129425,
"learning_rate": 0.00017258262763497482,
"loss": 1.2610112190246583,
"mean_token_accuracy": 0.7156127452850342,
"num_tokens": 11507742.0,
"step": 2850
},
{
"entropy": 1.1540677964687347,
"epoch": 0.8414239482200647,
"grad_norm": 0.4621984362602234,
"learning_rate": 0.00017235916047379383,
"loss": 1.187222385406494,
"mean_token_accuracy": 0.7345345914363861,
"num_tokens": 11548215.0,
"step": 2860
},
{
"entropy": 1.2128564953804015,
"epoch": 0.8443659899970579,
"grad_norm": 0.4317253530025482,
"learning_rate": 0.000172134932260695,
"loss": 1.2142438888549805,
"mean_token_accuracy": 0.7298630118370056,
"num_tokens": 11588711.0,
"step": 2870
},
{
"entropy": 1.1962445259094239,
"epoch": 0.8473080317740512,
"grad_norm": 0.6231359839439392,
"learning_rate": 0.00017190994535404332,
"loss": 1.221367359161377,
"mean_token_accuracy": 0.7313773334026337,
"num_tokens": 11629162.0,
"step": 2880
},
{
"entropy": 1.2900652885437012,
"epoch": 0.8502500735510444,
"grad_norm": 0.4637579321861267,
"learning_rate": 0.00017168420212018354,
"loss": 1.2854097366333008,
"mean_token_accuracy": 0.7109606087207794,
"num_tokens": 11669650.0,
"step": 2890
},
{
"entropy": 1.2192873358726501,
"epoch": 0.8531921153280376,
"grad_norm": 0.4562380909919739,
"learning_rate": 0.00017145770493341518,
"loss": 1.2436570167541503,
"mean_token_accuracy": 0.7263434827327728,
"num_tokens": 11710292.0,
"step": 2900
},
{
"epoch": 0.8531921153280376,
"eval_entropy": 1.2169157832939168,
"eval_loss": 1.219910979270935,
"eval_mean_token_accuracy": 0.7299123456471861,
"eval_num_tokens": 11710292.0,
"eval_runtime": 116.9129,
"eval_samples_per_second": 26.045,
"eval_steps_per_second": 3.259,
"step": 2900
},
{
"entropy": 1.2365688323974608,
"epoch": 0.8561341571050309,
"grad_norm": 0.46011191606521606,
"learning_rate": 0.00017123045617596763,
"loss": 1.2509427070617676,
"mean_token_accuracy": 0.7250486254692078,
"num_tokens": 11750649.0,
"step": 2910
},
{
"entropy": 1.2337135434150697,
"epoch": 0.8590761988820241,
"grad_norm": 0.5224452018737793,
"learning_rate": 0.00017100245823797503,
"loss": 1.2394111633300782,
"mean_token_accuracy": 0.729231595993042,
"num_tokens": 11791097.0,
"step": 2920
},
{
"entropy": 1.2292932152748108,
"epoch": 0.8620182406590173,
"grad_norm": 0.48644399642944336,
"learning_rate": 0.00017077371351745124,
"loss": 1.2508816719055176,
"mean_token_accuracy": 0.718151307106018,
"num_tokens": 11831392.0,
"step": 2930
},
{
"entropy": 1.1478191256523131,
"epoch": 0.8649602824360106,
"grad_norm": 0.4293639063835144,
"learning_rate": 0.00017054422442026456,
"loss": 1.1457470893859862,
"mean_token_accuracy": 0.7468528985977173,
"num_tokens": 11871925.0,
"step": 2940
},
{
"entropy": 1.2405227303504944,
"epoch": 0.8679023242130038,
"grad_norm": 0.6136884689331055,
"learning_rate": 0.00017031399336011238,
"loss": 1.2617270469665527,
"mean_token_accuracy": 0.716576772928238,
"num_tokens": 11912605.0,
"step": 2950
},
{
"entropy": 1.2667588710784912,
"epoch": 0.870844365989997,
"grad_norm": 0.4269845485687256,
"learning_rate": 0.00017008302275849582,
"loss": 1.283921241760254,
"mean_token_accuracy": 0.7173857808113098,
"num_tokens": 11953010.0,
"step": 2960
},
{
"entropy": 1.3015403628349305,
"epoch": 0.8737864077669902,
"grad_norm": 0.46401771903038025,
"learning_rate": 0.0001698513150446943,
"loss": 1.3019817352294922,
"mean_token_accuracy": 0.7130701661109924,
"num_tokens": 11993545.0,
"step": 2970
},
{
"entropy": 1.2569140315055847,
"epoch": 0.8767284495439835,
"grad_norm": 0.47312793135643005,
"learning_rate": 0.00016961887265574,
"loss": 1.279769515991211,
"mean_token_accuracy": 0.7160746216773987,
"num_tokens": 12033961.0,
"step": 2980
},
{
"entropy": 1.253390657901764,
"epoch": 0.8796704913209767,
"grad_norm": 0.5449343323707581,
"learning_rate": 0.0001693856980363921,
"loss": 1.259514045715332,
"mean_token_accuracy": 0.7175013661384583,
"num_tokens": 12074553.0,
"step": 2990
},
{
"entropy": 1.3062225699424743,
"epoch": 0.8826125330979699,
"grad_norm": 0.4499056041240692,
"learning_rate": 0.00016915179363911125,
"loss": 1.3181246757507323,
"mean_token_accuracy": 0.7042843997478485,
"num_tokens": 12115075.0,
"step": 3000
},
{
"epoch": 0.8826125330979699,
"eval_entropy": 1.2066716964789264,
"eval_loss": 1.2160207033157349,
"eval_mean_token_accuracy": 0.7302276686107706,
"eval_num_tokens": 12115075.0,
"eval_runtime": 117.081,
"eval_samples_per_second": 26.008,
"eval_steps_per_second": 3.254,
"step": 3000
},
{
"entropy": 1.1849471926689148,
"epoch": 0.8855545748749633,
"grad_norm": 0.45680174231529236,
"learning_rate": 0.00016891716192403365,
"loss": 1.18974027633667,
"mean_token_accuracy": 0.735182011127472,
"num_tokens": 12155441.0,
"step": 3010
},
{
"entropy": 1.218670165538788,
"epoch": 0.8884966166519565,
"grad_norm": 0.4255404770374298,
"learning_rate": 0.0001686818053589452,
"loss": 1.2337156295776368,
"mean_token_accuracy": 0.7265771627426147,
"num_tokens": 12196036.0,
"step": 3020
},
{
"entropy": 1.2351657152175903,
"epoch": 0.8914386584289496,
"grad_norm": 0.49140819907188416,
"learning_rate": 0.0001684457264192556,
"loss": 1.2371573448181152,
"mean_token_accuracy": 0.7276750206947327,
"num_tokens": 12236509.0,
"step": 3030
},
{
"entropy": 1.1386435210704804,
"epoch": 0.894380700205943,
"grad_norm": 0.4721025824546814,
"learning_rate": 0.00016820892758797218,
"loss": 1.1611692428588867,
"mean_token_accuracy": 0.7427328288555145,
"num_tokens": 12276991.0,
"step": 3040
},
{
"entropy": 1.2254295825958252,
"epoch": 0.8973227419829362,
"grad_norm": 0.5237665772438049,
"learning_rate": 0.000167971411355674,
"loss": 1.2355279922485352,
"mean_token_accuracy": 0.7242857456207276,
"num_tokens": 12317371.0,
"step": 3050
},
{
"entropy": 1.2108076691627503,
"epoch": 0.9002647837599294,
"grad_norm": 0.46400943398475647,
"learning_rate": 0.00016773318022048536,
"loss": 1.210099983215332,
"mean_token_accuracy": 0.7319884955883026,
"num_tokens": 12357779.0,
"step": 3060
},
{
"entropy": 1.1315430402755737,
"epoch": 0.9032068255369227,
"grad_norm": 0.46516865491867065,
"learning_rate": 0.00016749423668804988,
"loss": 1.160158634185791,
"mean_token_accuracy": 0.7439006865024567,
"num_tokens": 12398230.0,
"step": 3070
},
{
"entropy": 1.2317523241043091,
"epoch": 0.9061488673139159,
"grad_norm": 0.45237967371940613,
"learning_rate": 0.00016725458327150383,
"loss": 1.228554630279541,
"mean_token_accuracy": 0.728976035118103,
"num_tokens": 12438762.0,
"step": 3080
},
{
"entropy": 1.1924173712730408,
"epoch": 0.9090909090909091,
"grad_norm": 0.4868202805519104,
"learning_rate": 0.00016701422249144985,
"loss": 1.2131217956542968,
"mean_token_accuracy": 0.7263190448284149,
"num_tokens": 12479195.0,
"step": 3090
},
{
"entropy": 1.159307700395584,
"epoch": 0.9120329508679024,
"grad_norm": 0.44851839542388916,
"learning_rate": 0.00016677315687593048,
"loss": 1.1793177604675293,
"mean_token_accuracy": 0.7422453939914704,
"num_tokens": 12518938.0,
"step": 3100
},
{
"epoch": 0.9120329508679024,
"eval_entropy": 1.193232403965447,
"eval_loss": 1.2098932266235352,
"eval_mean_token_accuracy": 0.7319249039872737,
"eval_num_tokens": 12518938.0,
"eval_runtime": 117.0019,
"eval_samples_per_second": 26.025,
"eval_steps_per_second": 3.256,
"step": 3100
},
{
"entropy": 1.2515848875045776,
"epoch": 0.9149749926448956,
"grad_norm": 0.46064046025276184,
"learning_rate": 0.00016653138896040144,
"loss": 1.24728364944458,
"mean_token_accuracy": 0.7212517559528351,
"num_tokens": 12559307.0,
"step": 3110
},
{
"entropy": 1.2065839052200318,
"epoch": 0.9179170344218888,
"grad_norm": 0.4273196756839752,
"learning_rate": 0.00016628892128770506,
"loss": 1.2376407623291015,
"mean_token_accuracy": 0.7283597230911255,
"num_tokens": 12599752.0,
"step": 3120
},
{
"entropy": 1.245788073539734,
"epoch": 0.9208590761988821,
"grad_norm": 0.4527672529220581,
"learning_rate": 0.0001660457564080435,
"loss": 1.2551823616027833,
"mean_token_accuracy": 0.7254024922847748,
"num_tokens": 12640368.0,
"step": 3130
},
{
"entropy": 1.2258686184883119,
"epoch": 0.9238011179758753,
"grad_norm": 0.4471310079097748,
"learning_rate": 0.00016580189687895192,
"loss": 1.2315049171447754,
"mean_token_accuracy": 0.7266620457172394,
"num_tokens": 12681011.0,
"step": 3140
},
{
"entropy": 1.2006858110427856,
"epoch": 0.9267431597528685,
"grad_norm": 0.5084729790687561,
"learning_rate": 0.00016555734526527163,
"loss": 1.20996150970459,
"mean_token_accuracy": 0.733843994140625,
"num_tokens": 12721291.0,
"step": 3150
},
{
"entropy": 1.240491509437561,
"epoch": 0.9296852015298617,
"grad_norm": 0.5322201251983643,
"learning_rate": 0.000165312104139123,
"loss": 1.2419602394104003,
"mean_token_accuracy": 0.7259994149208069,
"num_tokens": 12761832.0,
"step": 3160
},
{
"entropy": 1.2303740501403808,
"epoch": 0.932627243306855,
"grad_norm": 0.48525163531303406,
"learning_rate": 0.00016506617607987863,
"loss": 1.266739845275879,
"mean_token_accuracy": 0.7198700964450836,
"num_tokens": 12802050.0,
"step": 3170
},
{
"entropy": 1.2406162559986114,
"epoch": 0.9355692850838482,
"grad_norm": 0.5300395488739014,
"learning_rate": 0.0001648195636741359,
"loss": 1.236351776123047,
"mean_token_accuracy": 0.7296032607555389,
"num_tokens": 12842323.0,
"step": 3180
},
{
"entropy": 1.1737658739089967,
"epoch": 0.9385113268608414,
"grad_norm": 0.580947756767273,
"learning_rate": 0.0001645722695156901,
"loss": 1.1983850479125977,
"mean_token_accuracy": 0.7319401502609253,
"num_tokens": 12882948.0,
"step": 3190
},
{
"entropy": 1.2107102632522584,
"epoch": 0.9414533686378347,
"grad_norm": 0.4780050814151764,
"learning_rate": 0.00016432429620550688,
"loss": 1.1875343322753906,
"mean_token_accuracy": 0.7371236264705658,
"num_tokens": 12923316.0,
"step": 3200
},
{
"epoch": 0.9414533686378347,
"eval_entropy": 1.192403563051399,
"eval_loss": 1.204405426979065,
"eval_mean_token_accuracy": 0.7331006780384094,
"eval_num_tokens": 12923316.0,
"eval_runtime": 117.0112,
"eval_samples_per_second": 26.023,
"eval_steps_per_second": 3.256,
"step": 3200
},
{
"entropy": 1.1371994316577911,
"epoch": 0.9443954104148279,
"grad_norm": 0.5067015290260315,
"learning_rate": 0.00016407564635169503,
"loss": 1.186887264251709,
"mean_token_accuracy": 0.7382100522518158,
"num_tokens": 12963817.0,
"step": 3210
},
{
"entropy": 1.217560636997223,
"epoch": 0.9473374521918211,
"grad_norm": 0.4357713460922241,
"learning_rate": 0.00016382632256947908,
"loss": 1.2167092323303224,
"mean_token_accuracy": 0.7259755432605743,
"num_tokens": 13004277.0,
"step": 3220
},
{
"entropy": 1.2078778982162475,
"epoch": 0.9502794939688144,
"grad_norm": 0.45559161901474,
"learning_rate": 0.0001635763274811716,
"loss": 1.2228084564208985,
"mean_token_accuracy": 0.7331404030323029,
"num_tokens": 13044683.0,
"step": 3230
},
{
"entropy": 1.210153341293335,
"epoch": 0.9532215357458076,
"grad_norm": 0.48570674657821655,
"learning_rate": 0.00016332566371614595,
"loss": 1.2366246223449706,
"mean_token_accuracy": 0.7288424909114838,
"num_tokens": 13085217.0,
"step": 3240
},
{
"entropy": 1.200468325614929,
"epoch": 0.9561635775228008,
"grad_norm": 0.43572553992271423,
"learning_rate": 0.0001630743339108083,
"loss": 1.1855230331420898,
"mean_token_accuracy": 0.7426514804363251,
"num_tokens": 13125762.0,
"step": 3250
},
{
"entropy": 1.192023515701294,
"epoch": 0.9591056192997941,
"grad_norm": 0.6283465623855591,
"learning_rate": 0.00016282234070857,
"loss": 1.2304601669311523,
"mean_token_accuracy": 0.7263703107833862,
"num_tokens": 13166141.0,
"step": 3260
},
{
"entropy": 1.3121253371238708,
"epoch": 0.9620476610767873,
"grad_norm": 0.4841027855873108,
"learning_rate": 0.0001625696867598199,
"loss": 1.3120348930358887,
"mean_token_accuracy": 0.7042850077152252,
"num_tokens": 13206360.0,
"step": 3270
},
{
"entropy": 1.2325164914131164,
"epoch": 0.9649897028537805,
"grad_norm": 0.5523087978363037,
"learning_rate": 0.0001623163747218964,
"loss": 1.2477660179138184,
"mean_token_accuracy": 0.7236345887184144,
"num_tokens": 13246935.0,
"step": 3280
},
{
"entropy": 1.2571437239646912,
"epoch": 0.9679317446307738,
"grad_norm": 0.4331943690776825,
"learning_rate": 0.00016206240725905938,
"loss": 1.263328742980957,
"mean_token_accuracy": 0.7202905654907227,
"num_tokens": 13287422.0,
"step": 3290
},
{
"entropy": 1.132285052537918,
"epoch": 0.970873786407767,
"grad_norm": 0.4219360053539276,
"learning_rate": 0.00016180778704246238,
"loss": 1.1526874542236327,
"mean_token_accuracy": 0.745033609867096,
"num_tokens": 13327893.0,
"step": 3300
},
{
"epoch": 0.970873786407767,
"eval_entropy": 1.2065895927233958,
"eval_loss": 1.2001736164093018,
"eval_mean_token_accuracy": 0.7332249573209467,
"eval_num_tokens": 13327893.0,
"eval_runtime": 117.023,
"eval_samples_per_second": 26.021,
"eval_steps_per_second": 3.256,
"step": 3300
},
{
"entropy": 1.2010907173156737,
"epoch": 0.9738158281847602,
"grad_norm": 0.45471927523612976,
"learning_rate": 0.00016155251675012433,
"loss": 1.2022515296936036,
"mean_token_accuracy": 0.7346913456916809,
"num_tokens": 13368297.0,
"step": 3310
},
{
"entropy": 1.235162889957428,
"epoch": 0.9767578699617535,
"grad_norm": 0.4965246319770813,
"learning_rate": 0.0001612965990669015,
"loss": 1.2688727378845215,
"mean_token_accuracy": 0.7194641828536987,
"num_tokens": 13408602.0,
"step": 3320
},
{
"entropy": 1.3277331352233888,
"epoch": 0.9796999117387467,
"grad_norm": 0.5565961599349976,
"learning_rate": 0.00016104003668445925,
"loss": 1.3193525314331054,
"mean_token_accuracy": 0.7008399486541748,
"num_tokens": 13448838.0,
"step": 3330
},
{
"entropy": 1.1913153290748597,
"epoch": 0.9826419535157399,
"grad_norm": 0.49679502844810486,
"learning_rate": 0.00016078283230124365,
"loss": 1.2235237121582032,
"mean_token_accuracy": 0.7243378221988678,
"num_tokens": 13489366.0,
"step": 3340
},
{
"entropy": 1.2956088483333588,
"epoch": 0.9855839952927331,
"grad_norm": 0.49954238533973694,
"learning_rate": 0.00016052498862245313,
"loss": 1.2841950416564942,
"mean_token_accuracy": 0.7132591784000397,
"num_tokens": 13529414.0,
"step": 3350
},
{
"entropy": 1.182485854625702,
"epoch": 0.9885260370697264,
"grad_norm": 0.4281366169452667,
"learning_rate": 0.00016026650836001012,
"loss": 1.2035736083984374,
"mean_token_accuracy": 0.7331153571605682,
"num_tokens": 13569904.0,
"step": 3360
},
{
"entropy": 1.2611793637275697,
"epoch": 0.9914680788467196,
"grad_norm": 0.4736562669277191,
"learning_rate": 0.0001600073942325323,
"loss": 1.2855722427368164,
"mean_token_accuracy": 0.7184321641921997,
"num_tokens": 13610365.0,
"step": 3370
},
{
"entropy": 1.2419211864471436,
"epoch": 0.9944101206237128,
"grad_norm": 0.45400166511535645,
"learning_rate": 0.00015974764896530433,
"loss": 1.2359369277954102,
"mean_token_accuracy": 0.7274314403533936,
"num_tokens": 13650738.0,
"step": 3380
},
{
"entropy": 1.16963592171669,
"epoch": 0.9973521624007061,
"grad_norm": 0.5148010849952698,
"learning_rate": 0.0001594872752902489,
"loss": 1.1966312408447266,
"mean_token_accuracy": 0.7361967086791992,
"num_tokens": 13691179.0,
"step": 3390
},
{
"entropy": 1.263899064064026,
"epoch": 1.0002942041776994,
"grad_norm": 0.47132888436317444,
"learning_rate": 0.0001592262759458981,
"loss": 1.2513002395629882,
"mean_token_accuracy": 0.7206339836120605,
"num_tokens": 13729142.0,
"step": 3400
},
{
"epoch": 1.0002942041776994,
"eval_entropy": 1.1728849974204236,
"eval_loss": 1.196743369102478,
"eval_mean_token_accuracy": 0.7349204848131795,
"eval_num_tokens": 13729142.0,
"eval_runtime": 116.9564,
"eval_samples_per_second": 26.035,
"eval_steps_per_second": 3.258,
"step": 3400
},
{
"entropy": 1.0056753396987914,
"epoch": 1.0032362459546926,
"grad_norm": 0.48898911476135254,
"learning_rate": 0.00015896465367736467,
"loss": 0.9896906852722168,
"mean_token_accuracy": 0.7708241283893585,
"num_tokens": 13769748.0,
"step": 3410
},
{
"entropy": 0.9479109048843384,
"epoch": 1.0061782877316858,
"grad_norm": 0.5644809603691101,
"learning_rate": 0.00015870241123631303,
"loss": 0.969275951385498,
"mean_token_accuracy": 0.7748298406600952,
"num_tokens": 13810433.0,
"step": 3420
},
{
"entropy": 1.0169321179389954,
"epoch": 1.009120329508679,
"grad_norm": 0.5554332733154297,
"learning_rate": 0.00015843955138093043,
"loss": 1.0067197799682617,
"mean_token_accuracy": 0.7651751101016998,
"num_tokens": 13850895.0,
"step": 3430
},
{
"entropy": 0.9419119358062744,
"epoch": 1.0120623712856722,
"grad_norm": 0.5447728037834167,
"learning_rate": 0.00015817607687589787,
"loss": 0.9617524147033691,
"mean_token_accuracy": 0.771773511171341,
"num_tokens": 13890924.0,
"step": 3440
},
{
"entropy": 0.9768797099590302,
"epoch": 1.0150044130626654,
"grad_norm": 0.5598154664039612,
"learning_rate": 0.00015791199049236106,
"loss": 0.984192180633545,
"mean_token_accuracy": 0.7716309785842895,
"num_tokens": 13931279.0,
"step": 3450
},
{
"entropy": 0.9213016629219055,
"epoch": 1.0179464548396586,
"grad_norm": 0.5594082474708557,
"learning_rate": 0.00015764729500790132,
"loss": 0.925694465637207,
"mean_token_accuracy": 0.7817917823791504,
"num_tokens": 13971714.0,
"step": 3460
},
{
"entropy": 0.9738560080528259,
"epoch": 1.020888496616652,
"grad_norm": 0.576998233795166,
"learning_rate": 0.00015738199320650622,
"loss": 0.9819319725036622,
"mean_token_accuracy": 0.7748230636119843,
"num_tokens": 14012038.0,
"step": 3470
},
{
"entropy": 0.9529994606971741,
"epoch": 1.0238305383936452,
"grad_norm": 0.507428765296936,
"learning_rate": 0.00015711608787854041,
"loss": 0.968116569519043,
"mean_token_accuracy": 0.7762803137302399,
"num_tokens": 14052608.0,
"step": 3480
},
{
"entropy": 0.9343097984790802,
"epoch": 1.0267725801706384,
"grad_norm": 0.48517364263534546,
"learning_rate": 0.0001568495818207163,
"loss": 0.9297596931457519,
"mean_token_accuracy": 0.783079195022583,
"num_tokens": 14093119.0,
"step": 3490
},
{
"entropy": 0.9860086500644684,
"epoch": 1.0297146219476316,
"grad_norm": 0.4717691242694855,
"learning_rate": 0.00015658247783606455,
"loss": 1.004935073852539,
"mean_token_accuracy": 0.765727037191391,
"num_tokens": 14133569.0,
"step": 3500
},
{
"epoch": 1.0297146219476316,
"eval_entropy": 1.0715967291609196,
"eval_loss": 1.2108769416809082,
"eval_mean_token_accuracy": 0.7342690184047529,
"eval_num_tokens": 14133569.0,
"eval_runtime": 116.9449,
"eval_samples_per_second": 26.038,
"eval_steps_per_second": 3.258,
"step": 3500
},
{
"entropy": 0.9845475435256958,
"epoch": 1.0326566637246248,
"grad_norm": 0.6196256875991821,
"learning_rate": 0.00015631477873390463,
"loss": 0.9737442970275879,
"mean_token_accuracy": 0.7720641791820526,
"num_tokens": 14174112.0,
"step": 3510
},
{
"entropy": 0.9606890559196473,
"epoch": 1.035598705501618,
"grad_norm": 0.5352163910865784,
"learning_rate": 0.00015604648732981535,
"loss": 0.9936755180358887,
"mean_token_accuracy": 0.767872554063797,
"num_tokens": 14214732.0,
"step": 3520
},
{
"entropy": 0.9683861494064331,
"epoch": 1.0385407472786115,
"grad_norm": 0.5286763906478882,
"learning_rate": 0.00015577760644560506,
"loss": 0.9670245170593261,
"mean_token_accuracy": 0.7790086328983307,
"num_tokens": 14255029.0,
"step": 3530
},
{
"entropy": 0.9281998634338379,
"epoch": 1.0414827890556047,
"grad_norm": 0.5544895529747009,
"learning_rate": 0.0001555081389092822,
"loss": 0.9275782585144043,
"mean_token_accuracy": 0.7800623893737793,
"num_tokens": 14294859.0,
"step": 3540
},
{
"entropy": 0.9516816318035126,
"epoch": 1.0444248308325979,
"grad_norm": 0.5528038144111633,
"learning_rate": 0.0001552380875550253,
"loss": 0.971556282043457,
"mean_token_accuracy": 0.7744876265525817,
"num_tokens": 14335364.0,
"step": 3550
},
{
"entropy": 1.0375867664813996,
"epoch": 1.047366872609591,
"grad_norm": 0.5946918725967407,
"learning_rate": 0.00015496745522315352,
"loss": 1.036135482788086,
"mean_token_accuracy": 0.7563816487789154,
"num_tokens": 14375785.0,
"step": 3560
},
{
"entropy": 0.9602185845375061,
"epoch": 1.0503089143865842,
"grad_norm": 0.55278080701828,
"learning_rate": 0.00015469624476009637,
"loss": 0.9790426254272461,
"mean_token_accuracy": 0.7711789608001709,
"num_tokens": 14416298.0,
"step": 3570
},
{
"entropy": 0.9802630722522736,
"epoch": 1.0532509561635774,
"grad_norm": 0.5366263389587402,
"learning_rate": 0.00015442445901836407,
"loss": 0.9828317642211915,
"mean_token_accuracy": 0.7732825756072998,
"num_tokens": 14456758.0,
"step": 3580
},
{
"entropy": 0.9827463984489441,
"epoch": 1.0561929979405709,
"grad_norm": 0.5232464075088501,
"learning_rate": 0.0001541521008565174,
"loss": 0.9965373039245605,
"mean_token_accuracy": 0.7664778053760528,
"num_tokens": 14497079.0,
"step": 3590
},
{
"entropy": 0.9974855959415436,
"epoch": 1.059135039717564,
"grad_norm": 0.5640583634376526,
"learning_rate": 0.0001538791731391377,
"loss": 1.001423168182373,
"mean_token_accuracy": 0.7696728587150574,
"num_tokens": 14537442.0,
"step": 3600
},
{
"epoch": 1.059135039717564,
"eval_entropy": 1.059611619926813,
"eval_loss": 1.2121630907058716,
"eval_mean_token_accuracy": 0.7346383677379978,
"eval_num_tokens": 14537442.0,
"eval_runtime": 116.9964,
"eval_samples_per_second": 26.026,
"eval_steps_per_second": 3.257,
"step": 3600
},
{
"entropy": 1.019939649105072,
"epoch": 1.0620770814945573,
"grad_norm": 0.6510360836982727,
"learning_rate": 0.00015360567873679682,
"loss": 1.0335427284240724,
"mean_token_accuracy": 0.7557238221168519,
"num_tokens": 14577903.0,
"step": 3610
},
{
"entropy": 0.9929381251335144,
"epoch": 1.0650191232715505,
"grad_norm": 0.4985535740852356,
"learning_rate": 0.00015333162052602663,
"loss": 0.9860858917236328,
"mean_token_accuracy": 0.7693518400192261,
"num_tokens": 14618362.0,
"step": 3620
},
{
"entropy": 1.0021346986293793,
"epoch": 1.0679611650485437,
"grad_norm": 0.5036218762397766,
"learning_rate": 0.00015305700138928914,
"loss": 1.0195841789245605,
"mean_token_accuracy": 0.7599446833133697,
"num_tokens": 14658524.0,
"step": 3630
},
{
"entropy": 0.9681568443775177,
"epoch": 1.0709032068255369,
"grad_norm": 0.685461699962616,
"learning_rate": 0.00015278182421494597,
"loss": 0.9650713920593261,
"mean_token_accuracy": 0.7742046892642975,
"num_tokens": 14699023.0,
"step": 3640
},
{
"entropy": 0.9974366188049316,
"epoch": 1.07384524860253,
"grad_norm": 0.5122579336166382,
"learning_rate": 0.0001525060918972279,
"loss": 1.010261631011963,
"mean_token_accuracy": 0.7668596982955933,
"num_tokens": 14739433.0,
"step": 3650
},
{
"entropy": 0.9449751615524292,
"epoch": 1.0767872903795235,
"grad_norm": 0.5020188689231873,
"learning_rate": 0.00015222980733620473,
"loss": 0.9527727127075195,
"mean_token_accuracy": 0.7792690694332123,
"num_tokens": 14779729.0,
"step": 3660
},
{
"entropy": 0.9212399244308471,
"epoch": 1.0797293321565167,
"grad_norm": 0.4754965901374817,
"learning_rate": 0.0001519529734377545,
"loss": 0.931304931640625,
"mean_token_accuracy": 0.780491977930069,
"num_tokens": 14819988.0,
"step": 3670
},
{
"entropy": 0.9701619386672974,
"epoch": 1.0826713739335099,
"grad_norm": 0.51582270860672,
"learning_rate": 0.0001516755931135329,
"loss": 0.9845802307128906,
"mean_token_accuracy": 0.7685158431529999,
"num_tokens": 14860489.0,
"step": 3680
},
{
"entropy": 0.9930408954620361,
"epoch": 1.085613415710503,
"grad_norm": 0.5440912246704102,
"learning_rate": 0.00015139766928094303,
"loss": 0.9996217727661133,
"mean_token_accuracy": 0.7675297498703003,
"num_tokens": 14901041.0,
"step": 3690
},
{
"entropy": 0.9450788199901581,
"epoch": 1.0885554574874963,
"grad_norm": 0.5000481605529785,
"learning_rate": 0.00015111920486310417,
"loss": 0.9332949638366699,
"mean_token_accuracy": 0.7813515663146973,
"num_tokens": 14941532.0,
"step": 3700
},
{
"epoch": 1.0885554574874963,
"eval_entropy": 1.0368518566522074,
"eval_loss": 1.209765076637268,
"eval_mean_token_accuracy": 0.7356539055744181,
"eval_num_tokens": 14941532.0,
"eval_runtime": 117.1062,
"eval_samples_per_second": 26.002,
"eval_steps_per_second": 3.253,
"step": 3700
},
{
"entropy": 0.9497840762138366,
"epoch": 1.0914974992644895,
"grad_norm": 0.5802638530731201,
"learning_rate": 0.00015084020278882153,
"loss": 0.9746996879577636,
"mean_token_accuracy": 0.7703352689743042,
"num_tokens": 14981818.0,
"step": 3710
},
{
"entropy": 0.9469231545925141,
"epoch": 1.0944395410414829,
"grad_norm": 0.5750518441200256,
"learning_rate": 0.00015056066599255502,
"loss": 0.9662343025207519,
"mean_token_accuracy": 0.7726805984973908,
"num_tokens": 15022307.0,
"step": 3720
},
{
"entropy": 1.0127877593040466,
"epoch": 1.097381582818476,
"grad_norm": 0.5340924859046936,
"learning_rate": 0.0001502805974143888,
"loss": 0.9845216751098633,
"mean_token_accuracy": 0.7723982453346252,
"num_tokens": 15062810.0,
"step": 3730
},
{
"entropy": 0.974898761510849,
"epoch": 1.1003236245954693,
"grad_norm": 0.5145857334136963,
"learning_rate": 0.00015000000000000001,
"loss": 1.0081477165222168,
"mean_token_accuracy": 0.7643806636333466,
"num_tokens": 15103409.0,
"step": 3740
},
{
"entropy": 0.9510980069637298,
"epoch": 1.1032656663724625,
"grad_norm": 0.5267722606658936,
"learning_rate": 0.00014971887670062802,
"loss": 0.9365800857543946,
"mean_token_accuracy": 0.780946570634842,
"num_tokens": 15143947.0,
"step": 3750
},
{
"entropy": 1.0326339960098267,
"epoch": 1.1062077081494557,
"grad_norm": 0.517667293548584,
"learning_rate": 0.0001494372304730432,
"loss": 1.0606879234313964,
"mean_token_accuracy": 0.7520448863506317,
"num_tokens": 15184466.0,
"step": 3760
},
{
"entropy": 0.9095244884490967,
"epoch": 1.1091497499264489,
"grad_norm": 0.5059592127799988,
"learning_rate": 0.00014915506427951605,
"loss": 0.9141671180725097,
"mean_token_accuracy": 0.7883690059185028,
"num_tokens": 15224876.0,
"step": 3770
},
{
"entropy": 0.922091954946518,
"epoch": 1.1120917917034423,
"grad_norm": 0.5040479898452759,
"learning_rate": 0.0001488723810877858,
"loss": 0.9369124412536621,
"mean_token_accuracy": 0.7827723801136017,
"num_tokens": 15265334.0,
"step": 3780
},
{
"entropy": 0.9985373139381408,
"epoch": 1.1150338334804355,
"grad_norm": 0.593623697757721,
"learning_rate": 0.00014858918387102943,
"loss": 1.003388023376465,
"mean_token_accuracy": 0.7662831544876099,
"num_tokens": 15305821.0,
"step": 3790
},
{
"entropy": 1.0036361336708068,
"epoch": 1.1179758752574287,
"grad_norm": 0.46527841687202454,
"learning_rate": 0.00014830547560783013,
"loss": 1.0003900527954102,
"mean_token_accuracy": 0.7689646422863007,
"num_tokens": 15346212.0,
"step": 3800
},
{
"epoch": 1.1179758752574287,
"eval_entropy": 1.058281412587704,
"eval_loss": 1.207934856414795,
"eval_mean_token_accuracy": 0.7360553802467706,
"eval_num_tokens": 15346212.0,
"eval_runtime": 117.1795,
"eval_samples_per_second": 25.986,
"eval_steps_per_second": 3.251,
"step": 3800
},
{
"entropy": 1.0539171755313874,
"epoch": 1.120917917034422,
"grad_norm": 0.5680721402168274,
"learning_rate": 0.00014802125928214626,
"loss": 1.0743833541870118,
"mean_token_accuracy": 0.7493933796882629,
"num_tokens": 15386724.0,
"step": 3810
},
{
"entropy": 0.916973739862442,
"epoch": 1.123859958811415,
"grad_norm": 0.5414533019065857,
"learning_rate": 0.0001477365378832797,
"loss": 0.9400577545166016,
"mean_token_accuracy": 0.7826773941516876,
"num_tokens": 15427047.0,
"step": 3820
},
{
"entropy": 1.044243198633194,
"epoch": 1.1268020005884083,
"grad_norm": 0.5607292056083679,
"learning_rate": 0.0001474513144058447,
"loss": 1.0428730964660644,
"mean_token_accuracy": 0.7559836447238922,
"num_tokens": 15467253.0,
"step": 3830
},
{
"entropy": 0.9271435976028443,
"epoch": 1.1297440423654015,
"grad_norm": 0.5749255418777466,
"learning_rate": 0.0001471655918497361,
"loss": 0.9598716735839844,
"mean_token_accuracy": 0.7791375935077667,
"num_tokens": 15507806.0,
"step": 3840
},
{
"entropy": 0.9984059453010559,
"epoch": 1.132686084142395,
"grad_norm": 0.671238362789154,
"learning_rate": 0.00014687937322009793,
"loss": 0.9737402915954589,
"mean_token_accuracy": 0.7700112521648407,
"num_tokens": 15548383.0,
"step": 3850
},
{
"entropy": 0.9276021301746369,
"epoch": 1.135628125919388,
"grad_norm": 0.5724707841873169,
"learning_rate": 0.00014659266152729176,
"loss": 0.9540791511535645,
"mean_token_accuracy": 0.7781682848930359,
"num_tokens": 15588893.0,
"step": 3860
},
{
"entropy": 0.9864838421344757,
"epoch": 1.1385701676963813,
"grad_norm": 0.5926504731178284,
"learning_rate": 0.0001463054597868651,
"loss": 0.9837197303771973,
"mean_token_accuracy": 0.7700894057750702,
"num_tokens": 15629429.0,
"step": 3870
},
{
"entropy": 1.044099175930023,
"epoch": 1.1415122094733745,
"grad_norm": 0.5998210906982422,
"learning_rate": 0.00014601777101951957,
"loss": 1.064276123046875,
"mean_token_accuracy": 0.7496702373027802,
"num_tokens": 15669112.0,
"step": 3880
},
{
"entropy": 0.9448180139064789,
"epoch": 1.1444542512503677,
"grad_norm": 0.5697433948516846,
"learning_rate": 0.00014572959825107922,
"loss": 0.939006233215332,
"mean_token_accuracy": 0.7818219542503357,
"num_tokens": 15709418.0,
"step": 3890
},
{
"entropy": 0.9624788880348205,
"epoch": 1.147396293027361,
"grad_norm": 0.5536630749702454,
"learning_rate": 0.0001454409445124587,
"loss": 0.9612356185913086,
"mean_token_accuracy": 0.7735930144786834,
"num_tokens": 15749926.0,
"step": 3900
},
{
"epoch": 1.147396293027361,
"eval_entropy": 1.002416886056815,
"eval_loss": 1.2163069248199463,
"eval_mean_token_accuracy": 0.736245130147208,
"eval_num_tokens": 15749926.0,
"eval_runtime": 117.0848,
"eval_samples_per_second": 26.007,
"eval_steps_per_second": 3.254,
"step": 3900
},
{
"entropy": 0.9970716178417206,
"epoch": 1.150338334804354,
"grad_norm": 0.551414966583252,
"learning_rate": 0.00014515181283963132,
"loss": 1.024658489227295,
"mean_token_accuracy": 0.7571725130081177,
"num_tokens": 15790231.0,
"step": 3910
},
{
"entropy": 1.0260444402694702,
"epoch": 1.1532803765813475,
"grad_norm": 0.565596878528595,
"learning_rate": 0.0001448622062735972,
"loss": 1.0220839500427246,
"mean_token_accuracy": 0.7611811697483063,
"num_tokens": 15830791.0,
"step": 3920
},
{
"entropy": 0.9745386719703675,
"epoch": 1.1562224183583407,
"grad_norm": 0.522496223449707,
"learning_rate": 0.00014457212786035122,
"loss": 0.9898022651672364,
"mean_token_accuracy": 0.7702635705471039,
"num_tokens": 15871330.0,
"step": 3930
},
{
"entropy": 1.00519802570343,
"epoch": 1.159164460135334,
"grad_norm": 0.579716145992279,
"learning_rate": 0.00014428158065085098,
"loss": 1.0124700546264649,
"mean_token_accuracy": 0.7646716058254241,
"num_tokens": 15911828.0,
"step": 3940
},
{
"entropy": 0.9504435777664184,
"epoch": 1.1621065019123271,
"grad_norm": 0.5688005089759827,
"learning_rate": 0.00014399056770098478,
"loss": 0.9538630485534668,
"mean_token_accuracy": 0.776702755689621,
"num_tokens": 15952424.0,
"step": 3950
},
{
"entropy": 0.972985816001892,
"epoch": 1.1650485436893203,
"grad_norm": 0.5217563509941101,
"learning_rate": 0.00014369909207153947,
"loss": 0.9882010459899903,
"mean_token_accuracy": 0.7707372605800629,
"num_tokens": 15992995.0,
"step": 3960
},
{
"entropy": 1.0266568660736084,
"epoch": 1.1679905854663137,
"grad_norm": 0.607991099357605,
"learning_rate": 0.00014340715682816806,
"loss": 1.0269791603088378,
"mean_token_accuracy": 0.7627450406551362,
"num_tokens": 16033357.0,
"step": 3970
},
{
"entropy": 1.023914647102356,
"epoch": 1.170932627243307,
"grad_norm": 0.5779170393943787,
"learning_rate": 0.00014311476504135794,
"loss": 1.0473231315612792,
"mean_token_accuracy": 0.7546798884868622,
"num_tokens": 16073799.0,
"step": 3980
},
{
"entropy": 0.9414635598659515,
"epoch": 1.1738746690203001,
"grad_norm": 0.5319372415542603,
"learning_rate": 0.00014282191978639799,
"loss": 0.9280409812927246,
"mean_token_accuracy": 0.7800300478935241,
"num_tokens": 16114313.0,
"step": 3990
},
{
"entropy": 0.9413889229297638,
"epoch": 1.1768167107972933,
"grad_norm": 0.6013742089271545,
"learning_rate": 0.00014252862414334665,
"loss": 0.9760286331176757,
"mean_token_accuracy": 0.7707565903663636,
"num_tokens": 16154854.0,
"step": 4000
},
{
"epoch": 1.1768167107972933,
"eval_entropy": 1.0498265117954395,
"eval_loss": 1.2053319215774536,
"eval_mean_token_accuracy": 0.7371757869019596,
"eval_num_tokens": 16154854.0,
"eval_runtime": 116.9953,
"eval_samples_per_second": 26.027,
"eval_steps_per_second": 3.257,
"step": 4000
},
{
"entropy": 1.01264528632164,
"epoch": 1.1797587525742865,
"grad_norm": 0.5052332282066345,
"learning_rate": 0.00014223488119699944,
"loss": 0.9941823959350586,
"mean_token_accuracy": 0.7712079107761383,
"num_tokens": 16195404.0,
"step": 4010
},
{
"entropy": 0.9542388617992401,
"epoch": 1.1827007943512797,
"grad_norm": 0.6624171733856201,
"learning_rate": 0.00014194069403685643,
"loss": 0.9699134826660156,
"mean_token_accuracy": 0.7701153516769409,
"num_tokens": 16235933.0,
"step": 4020
},
{
"entropy": 0.9835664987564087,
"epoch": 1.185642836128273,
"grad_norm": 0.6638393998146057,
"learning_rate": 0.00014164606575708984,
"loss": 1.004053497314453,
"mean_token_accuracy": 0.7658522069454193,
"num_tokens": 16276402.0,
"step": 4030
},
{
"entropy": 0.9395282685756683,
"epoch": 1.1885848779052663,
"grad_norm": 0.5497994422912598,
"learning_rate": 0.0001413509994565114,
"loss": 0.9349452972412109,
"mean_token_accuracy": 0.7839109897613525,
"num_tokens": 16316751.0,
"step": 4040
},
{
"entropy": 0.9585356175899505,
"epoch": 1.1915269196822595,
"grad_norm": 0.5739843845367432,
"learning_rate": 0.00014105549823853987,
"loss": 0.9759317398071289,
"mean_token_accuracy": 0.7685169577598572,
"num_tokens": 16357253.0,
"step": 4050
},
{
"entropy": 0.9427942156791687,
"epoch": 1.1944689614592527,
"grad_norm": 0.5962517261505127,
"learning_rate": 0.00014075956521116827,
"loss": 0.9541123390197754,
"mean_token_accuracy": 0.7789060473442078,
"num_tokens": 16397470.0,
"step": 4060
},
{
"entropy": 0.9317695260047912,
"epoch": 1.197411003236246,
"grad_norm": 0.5742291212081909,
"learning_rate": 0.00014046320348693134,
"loss": 0.9425789833068847,
"mean_token_accuracy": 0.7804486751556396,
"num_tokens": 16438035.0,
"step": 4070
},
{
"entropy": 0.9994288563728333,
"epoch": 1.2003530450132391,
"grad_norm": 0.628220796585083,
"learning_rate": 0.00014016641618287264,
"loss": 1.0083752632141114,
"mean_token_accuracy": 0.7678903639316559,
"num_tokens": 16478563.0,
"step": 4080
},
{
"entropy": 0.9661390006542205,
"epoch": 1.2032950867902323,
"grad_norm": 0.5717418789863586,
"learning_rate": 0.00013986920642051196,
"loss": 0.956269645690918,
"mean_token_accuracy": 0.7756177723407746,
"num_tokens": 16519059.0,
"step": 4090
},
{
"entropy": 0.9637384474277496,
"epoch": 1.2062371285672255,
"grad_norm": 0.5957377552986145,
"learning_rate": 0.00013957157732581227,
"loss": 0.9740910530090332,
"mean_token_accuracy": 0.7705820441246033,
"num_tokens": 16559585.0,
"step": 4100
},
{
"epoch": 1.2062371285672255,
"eval_entropy": 1.0279845802646297,
"eval_loss": 1.204230546951294,
"eval_mean_token_accuracy": 0.7378412169108554,
"eval_num_tokens": 16559585.0,
"eval_runtime": 116.9158,
"eval_samples_per_second": 26.044,
"eval_steps_per_second": 3.259,
"step": 4100
},
{
"entropy": 0.9299012839794158,
"epoch": 1.209179170344219,
"grad_norm": 0.5047076344490051,
"learning_rate": 0.00013927353202914704,
"loss": 0.9325620651245117,
"mean_token_accuracy": 0.7856215178966522,
"num_tokens": 16599897.0,
"step": 4110
},
{
"entropy": 0.9579800009727478,
"epoch": 1.2121212121212122,
"grad_norm": 0.6236160397529602,
"learning_rate": 0.00013897507366526717,
"loss": 0.9825181007385254,
"mean_token_accuracy": 0.771970808506012,
"num_tokens": 16640445.0,
"step": 4120
},
{
"entropy": 0.9490958392620087,
"epoch": 1.2150632538982054,
"grad_norm": 0.5913335084915161,
"learning_rate": 0.00013867620537326807,
"loss": 0.9593384742736817,
"mean_token_accuracy": 0.7774133384227753,
"num_tokens": 16681074.0,
"step": 4130
},
{
"entropy": 0.9436961114406586,
"epoch": 1.2180052956751986,
"grad_norm": 0.5566667318344116,
"learning_rate": 0.00013837693029655673,
"loss": 0.9432112693786621,
"mean_token_accuracy": 0.7793719172477722,
"num_tokens": 16721515.0,
"step": 4140
},
{
"entropy": 0.9555536270141601,
"epoch": 1.2209473374521917,
"grad_norm": 0.5549019575119019,
"learning_rate": 0.00013807725158281845,
"loss": 0.964748764038086,
"mean_token_accuracy": 0.7745252251625061,
"num_tokens": 16761971.0,
"step": 4150
},
{
"entropy": 1.0964530289173127,
"epoch": 1.2238893792291852,
"grad_norm": 0.5607911944389343,
"learning_rate": 0.000137777172383984,
"loss": 1.1134785652160644,
"mean_token_accuracy": 0.7422545254230499,
"num_tokens": 16801992.0,
"step": 4160
},
{
"entropy": 0.991803640127182,
"epoch": 1.2268314210061784,
"grad_norm": 0.5912790894508362,
"learning_rate": 0.00013747669585619621,
"loss": 0.9909832954406739,
"mean_token_accuracy": 0.7725964307785034,
"num_tokens": 16842361.0,
"step": 4170
},
{
"entropy": 0.9360088646411896,
"epoch": 1.2297734627831716,
"grad_norm": 0.5747997760772705,
"learning_rate": 0.00013717582515977703,
"loss": 0.9449604988098145,
"mean_token_accuracy": 0.7815338909626007,
"num_tokens": 16882838.0,
"step": 4180
},
{
"entropy": 0.9765557944774628,
"epoch": 1.2327155045601648,
"grad_norm": 0.6022299528121948,
"learning_rate": 0.000136874563459194,
"loss": 0.9870802879333496,
"mean_token_accuracy": 0.7712435305118561,
"num_tokens": 16923293.0,
"step": 4190
},
{
"entropy": 0.9462984323501586,
"epoch": 1.235657546337158,
"grad_norm": 0.6731317639350891,
"learning_rate": 0.0001365729139230273,
"loss": 0.9600887298583984,
"mean_token_accuracy": 0.776117742061615,
"num_tokens": 16963785.0,
"step": 4200
},
{
"epoch": 1.235657546337158,
"eval_entropy": 1.0305638493984703,
"eval_loss": 1.2021441459655762,
"eval_mean_token_accuracy": 0.7379450076834111,
"eval_num_tokens": 16963785.0,
"eval_runtime": 117.0903,
"eval_samples_per_second": 26.006,
"eval_steps_per_second": 3.254,
"step": 4200
},
{
"entropy": 0.8939768195152282,
"epoch": 1.2385995881141512,
"grad_norm": 0.6204389333724976,
"learning_rate": 0.00013627087972393605,
"loss": 0.8930576324462891,
"mean_token_accuracy": 0.7922165811061859,
"num_tokens": 17004178.0,
"step": 4210
},
{
"entropy": 0.9507308840751648,
"epoch": 1.2415416298911444,
"grad_norm": 0.6163705587387085,
"learning_rate": 0.00013596846403862535,
"loss": 0.9666755676269532,
"mean_token_accuracy": 0.7786275684833527,
"num_tokens": 17044627.0,
"step": 4220
},
{
"entropy": 0.975046980381012,
"epoch": 1.2444836716681378,
"grad_norm": 0.5746111273765564,
"learning_rate": 0.00013566567004781246,
"loss": 0.9781759262084961,
"mean_token_accuracy": 0.7753113329410553,
"num_tokens": 17084732.0,
"step": 4230
},
{
"entropy": 1.0526322185993195,
"epoch": 1.247425713445131,
"grad_norm": 0.5716729164123535,
"learning_rate": 0.00013536250093619369,
"loss": 1.072258472442627,
"mean_token_accuracy": 0.7464600443840027,
"num_tokens": 17125248.0,
"step": 4240
},
{
"entropy": 1.0479264855384827,
"epoch": 1.2503677552221242,
"grad_norm": 0.493936151266098,
"learning_rate": 0.0001350589598924107,
"loss": 1.0373758316040038,
"mean_token_accuracy": 0.755853122472763,
"num_tokens": 17165782.0,
"step": 4250
},
{
"entropy": 1.0287334680557252,
"epoch": 1.2533097969991174,
"grad_norm": 0.5333849191665649,
"learning_rate": 0.000134755050109017,
"loss": 1.04658203125,
"mean_token_accuracy": 0.7597375035285949,
"num_tokens": 17206071.0,
"step": 4260
},
{
"entropy": 1.031979387998581,
"epoch": 1.2562518387761106,
"grad_norm": 0.5658190250396729,
"learning_rate": 0.00013445077478244443,
"loss": 1.043376350402832,
"mean_token_accuracy": 0.7583476483821869,
"num_tokens": 17246566.0,
"step": 4270
},
{
"entropy": 0.9648872256278992,
"epoch": 1.2591938805531038,
"grad_norm": 0.5289658308029175,
"learning_rate": 0.00013414613711296952,
"loss": 0.9618735313415527,
"mean_token_accuracy": 0.7800238966941834,
"num_tokens": 17286973.0,
"step": 4280
},
{
"entropy": 1.0292047560214996,
"epoch": 1.262135922330097,
"grad_norm": 0.5604351162910461,
"learning_rate": 0.0001338411403046797,
"loss": 1.0540773391723632,
"mean_token_accuracy": 0.7540550827980042,
"num_tokens": 17327524.0,
"step": 4290
},
{
"entropy": 0.9784096240997314,
"epoch": 1.2650779641070904,
"grad_norm": 0.4879390597343445,
"learning_rate": 0.0001335357875654399,
"loss": 0.9683723449707031,
"mean_token_accuracy": 0.7743236780166626,
"num_tokens": 17367944.0,
"step": 4300
},
{
"epoch": 1.2650779641070904,
"eval_entropy": 1.024107585976443,
"eval_loss": 1.2039283514022827,
"eval_mean_token_accuracy": 0.7379744303195183,
"eval_num_tokens": 17367944.0,
"eval_runtime": 117.345,
"eval_samples_per_second": 25.949,
"eval_steps_per_second": 3.247,
"step": 4300
},
{
"entropy": 0.9147277295589447,
"epoch": 1.2680200058840836,
"grad_norm": 0.6553487181663513,
"learning_rate": 0.00013323008210685847,
"loss": 0.9428836822509765,
"mean_token_accuracy": 0.7800225138664245,
"num_tokens": 17408180.0,
"step": 4310
},
{
"entropy": 0.9862501919269562,
"epoch": 1.2709620476610768,
"grad_norm": 0.5519722700119019,
"learning_rate": 0.00013292402714425362,
"loss": 0.986777400970459,
"mean_token_accuracy": 0.7709893763065339,
"num_tokens": 17448767.0,
"step": 4320
},
{
"entropy": 0.984834861755371,
"epoch": 1.27390408943807,
"grad_norm": 0.6772357821464539,
"learning_rate": 0.0001326176258966195,
"loss": 1.0062361717224122,
"mean_token_accuracy": 0.7673668503761292,
"num_tokens": 17488778.0,
"step": 4330
},
{
"entropy": 1.0079510390758515,
"epoch": 1.2768461312150632,
"grad_norm": 0.6486600637435913,
"learning_rate": 0.00013231088158659245,
"loss": 1.0013746261596679,
"mean_token_accuracy": 0.7689732432365417,
"num_tokens": 17529357.0,
"step": 4340
},
{
"entropy": 0.930513882637024,
"epoch": 1.2797881729920566,
"grad_norm": 0.5784080028533936,
"learning_rate": 0.000132003797440417,
"loss": 0.941129207611084,
"mean_token_accuracy": 0.7789569735527039,
"num_tokens": 17569912.0,
"step": 4350
},
{
"entropy": 0.9446238994598388,
"epoch": 1.2827302147690498,
"grad_norm": 0.5628758072853088,
"learning_rate": 0.00013169637668791192,
"loss": 0.9538597106933594,
"mean_token_accuracy": 0.77459996342659,
"num_tokens": 17609948.0,
"step": 4360
},
{
"entropy": 0.9073866546154022,
"epoch": 1.285672256546043,
"grad_norm": 0.530838131904602,
"learning_rate": 0.0001313886225624364,
"loss": 0.9122503280639649,
"mean_token_accuracy": 0.7910377562046051,
"num_tokens": 17650396.0,
"step": 4370
},
{
"entropy": 0.9176636934280396,
"epoch": 1.2886142983230362,
"grad_norm": 0.5510920286178589,
"learning_rate": 0.00013108053830085585,
"loss": 0.9334745407104492,
"mean_token_accuracy": 0.7811925649642945,
"num_tokens": 17690414.0,
"step": 4380
},
{
"entropy": 0.9487575829029083,
"epoch": 1.2915563401000294,
"grad_norm": 0.5764872431755066,
"learning_rate": 0.00013077212714350807,
"loss": 0.9546429634094238,
"mean_token_accuracy": 0.7754026472568512,
"num_tokens": 17730995.0,
"step": 4390
},
{
"entropy": 0.9580233573913575,
"epoch": 1.2944983818770226,
"grad_norm": 0.510247528553009,
"learning_rate": 0.00013046339233416896,
"loss": 0.9728780746459961,
"mean_token_accuracy": 0.7761204540729523,
"num_tokens": 17771461.0,
"step": 4400
},
{
"epoch": 1.2944983818770226,
"eval_entropy": 1.0582224565541025,
"eval_loss": 1.1942919492721558,
"eval_mean_token_accuracy": 0.7387891846691842,
"eval_num_tokens": 17771461.0,
"eval_runtime": 116.9989,
"eval_samples_per_second": 26.026,
"eval_steps_per_second": 3.256,
"step": 4400
},
{
"entropy": 1.009384435415268,
"epoch": 1.2974404236540158,
"grad_norm": 0.6525319218635559,
"learning_rate": 0.00013015433712001853,
"loss": 1.0117576599121094,
"mean_token_accuracy": 0.7641228914260865,
"num_tokens": 17811743.0,
"step": 4410
},
{
"entropy": 1.0025621831417084,
"epoch": 1.3003824654310092,
"grad_norm": 0.573704719543457,
"learning_rate": 0.00012984496475160667,
"loss": 1.0045761108398437,
"mean_token_accuracy": 0.7632306814193726,
"num_tokens": 17852150.0,
"step": 4420
},
{
"entropy": 0.889021772146225,
"epoch": 1.3033245072080024,
"grad_norm": 0.5899218916893005,
"learning_rate": 0.00012953527848281907,
"loss": 0.9004623413085937,
"mean_token_accuracy": 0.7905885875225067,
"num_tokens": 17892636.0,
"step": 4430
},
{
"entropy": 0.9145092189311981,
"epoch": 1.3062665489849956,
"grad_norm": 0.5533415675163269,
"learning_rate": 0.00012922528157084288,
"loss": 0.9265445709228516,
"mean_token_accuracy": 0.7852813005447388,
"num_tokens": 17933064.0,
"step": 4440
},
{
"entropy": 0.9506058990955353,
"epoch": 1.3092085907619888,
"grad_norm": 0.6637270450592041,
"learning_rate": 0.00012891497727613254,
"loss": 0.9665675163269043,
"mean_token_accuracy": 0.7746671974658966,
"num_tokens": 17973649.0,
"step": 4450
},
{
"entropy": 0.9762279152870178,
"epoch": 1.312150632538982,
"grad_norm": 0.5837135314941406,
"learning_rate": 0.0001286043688623754,
"loss": 0.9735669136047364,
"mean_token_accuracy": 0.7707210063934327,
"num_tokens": 18013745.0,
"step": 4460
},
{
"entropy": 0.9729628086090087,
"epoch": 1.3150926743159752,
"grad_norm": 0.5597095489501953,
"learning_rate": 0.00012829345959645744,
"loss": 0.983339500427246,
"mean_token_accuracy": 0.7697983980178833,
"num_tokens": 18054069.0,
"step": 4470
},
{
"entropy": 0.9756720840930939,
"epoch": 1.3180347160929684,
"grad_norm": 0.5869277119636536,
"learning_rate": 0.00012798225274842902,
"loss": 0.9763286590576172,
"mean_token_accuracy": 0.772778332233429,
"num_tokens": 18094600.0,
"step": 4480
},
{
"entropy": 0.9129612624645234,
"epoch": 1.3209767578699618,
"grad_norm": 0.5384027361869812,
"learning_rate": 0.00012767075159147022,
"loss": 0.9275237083435058,
"mean_token_accuracy": 0.7822975754737854,
"num_tokens": 18135046.0,
"step": 4490
},
{
"entropy": 1.018626469373703,
"epoch": 1.323918799646955,
"grad_norm": 0.6402458548545837,
"learning_rate": 0.0001273589594018567,
"loss": 1.0211774826049804,
"mean_token_accuracy": 0.7581977605819702,
"num_tokens": 18175538.0,
"step": 4500
},
{
"epoch": 1.323918799646955,
"eval_entropy": 1.0286849455570612,
"eval_loss": 1.1943385601043701,
"eval_mean_token_accuracy": 0.7395750786375812,
"eval_num_tokens": 18175538.0,
"eval_runtime": 117.1237,
"eval_samples_per_second": 25.998,
"eval_steps_per_second": 3.253,
"step": 4500
},
{
"entropy": 0.9917169988155365,
"epoch": 1.3268608414239482,
"grad_norm": 0.5651717185974121,
"learning_rate": 0.00012704687945892505,
"loss": 0.9929667472839355,
"mean_token_accuracy": 0.7704780220985412,
"num_tokens": 18215849.0,
"step": 4510
},
{
"entropy": 1.033211100101471,
"epoch": 1.3298028832009414,
"grad_norm": 0.6076104640960693,
"learning_rate": 0.00012673451504503842,
"loss": 1.053134059906006,
"mean_token_accuracy": 0.7530263483524322,
"num_tokens": 18256385.0,
"step": 4520
},
{
"entropy": 0.9446504592895508,
"epoch": 1.3327449249779346,
"grad_norm": 0.6524848341941833,
"learning_rate": 0.00012642186944555186,
"loss": 0.9612871170043945,
"mean_token_accuracy": 0.7765836179256439,
"num_tokens": 18296852.0,
"step": 4530
},
{
"entropy": 0.95798819065094,
"epoch": 1.335686966754928,
"grad_norm": 0.5617368817329407,
"learning_rate": 0.00012610894594877788,
"loss": 0.945002555847168,
"mean_token_accuracy": 0.7801140964031219,
"num_tokens": 18337142.0,
"step": 4540
},
{
"entropy": 0.9328159093856812,
"epoch": 1.338629008531921,
"grad_norm": 0.5558400750160217,
"learning_rate": 0.00012579574784595188,
"loss": 0.9508322715759278,
"mean_token_accuracy": 0.7804535567760468,
"num_tokens": 18377435.0,
"step": 4550
},
{
"entropy": 0.9236328303813934,
"epoch": 1.3415710503089144,
"grad_norm": 0.711991548538208,
"learning_rate": 0.00012548227843119743,
"loss": 0.9464892387390137,
"mean_token_accuracy": 0.781309175491333,
"num_tokens": 18418008.0,
"step": 4560
},
{
"entropy": 0.9790382087230682,
"epoch": 1.3445130920859076,
"grad_norm": 0.6019552946090698,
"learning_rate": 0.00012516854100149164,
"loss": 0.9772819519042969,
"mean_token_accuracy": 0.7732869625091553,
"num_tokens": 18458454.0,
"step": 4570
},
{
"entropy": 0.9923386096954345,
"epoch": 1.3474551338629008,
"grad_norm": 0.5897583365440369,
"learning_rate": 0.00012485453885663063,
"loss": 0.9893753051757812,
"mean_token_accuracy": 0.7679760038852692,
"num_tokens": 18498893.0,
"step": 4580
},
{
"entropy": 0.975125765800476,
"epoch": 1.350397175639894,
"grad_norm": 0.5963767766952515,
"learning_rate": 0.00012454027529919462,
"loss": 0.9868227958679199,
"mean_token_accuracy": 0.7681001186370849,
"num_tokens": 18539290.0,
"step": 4590
},
{
"entropy": 0.9119367241859436,
"epoch": 1.3533392174168872,
"grad_norm": 0.6379720568656921,
"learning_rate": 0.00012422575363451335,
"loss": 0.9250626564025879,
"mean_token_accuracy": 0.7873954355716706,
"num_tokens": 18579755.0,
"step": 4600
},
{
"epoch": 1.3533392174168872,
"eval_entropy": 1.045042129717474,
"eval_loss": 1.1902120113372803,
"eval_mean_token_accuracy": 0.739525359915936,
"eval_num_tokens": 18579755.0,
"eval_runtime": 117.0159,
"eval_samples_per_second": 26.022,
"eval_steps_per_second": 3.256,
"step": 4600
},
{
"entropy": 0.9590096414089203,
"epoch": 1.3562812591938807,
"grad_norm": 0.6441698670387268,
"learning_rate": 0.00012391097717063117,
"loss": 0.9705069541931153,
"mean_token_accuracy": 0.772186666727066,
"num_tokens": 18620196.0,
"step": 4610
},
{
"entropy": 0.9852827608585357,
"epoch": 1.3592233009708738,
"grad_norm": 0.6008490920066833,
"learning_rate": 0.00012359594921827245,
"loss": 0.9953920364379882,
"mean_token_accuracy": 0.7687745451927185,
"num_tokens": 18660133.0,
"step": 4620
},
{
"entropy": 1.0110926747322082,
"epoch": 1.362165342747867,
"grad_norm": 0.5604993104934692,
"learning_rate": 0.00012328067309080653,
"loss": 1.0147642135620116,
"mean_token_accuracy": 0.764192932844162,
"num_tokens": 18700534.0,
"step": 4630
},
{
"entropy": 0.9773748695850373,
"epoch": 1.3651073845248602,
"grad_norm": 0.5655143857002258,
"learning_rate": 0.0001229651521042131,
"loss": 1.0055460929870605,
"mean_token_accuracy": 0.7645916283130646,
"num_tokens": 18741011.0,
"step": 4640
},
{
"entropy": 0.912693589925766,
"epoch": 1.3680494263018534,
"grad_norm": 0.5726847648620605,
"learning_rate": 0.00012264938957704707,
"loss": 0.9029450416564941,
"mean_token_accuracy": 0.7917129874229432,
"num_tokens": 18781532.0,
"step": 4650
},
{
"entropy": 0.9796160280704498,
"epoch": 1.3709914680788466,
"grad_norm": 0.5168410539627075,
"learning_rate": 0.00012233338883040385,
"loss": 0.994998550415039,
"mean_token_accuracy": 0.7707984328269959,
"num_tokens": 18821962.0,
"step": 4660
},
{
"entropy": 0.9919624567031861,
"epoch": 1.3739335098558398,
"grad_norm": 0.5887457728385925,
"learning_rate": 0.00012201715318788445,
"loss": 0.9917054176330566,
"mean_token_accuracy": 0.7673246085643768,
"num_tokens": 18862330.0,
"step": 4670
},
{
"entropy": 1.0523385763168336,
"epoch": 1.3768755516328333,
"grad_norm": 0.5428098440170288,
"learning_rate": 0.00012170068597556035,
"loss": 1.077983283996582,
"mean_token_accuracy": 0.753084135055542,
"num_tokens": 18902801.0,
"step": 4680
},
{
"entropy": 0.9840884447097779,
"epoch": 1.3798175934098265,
"grad_norm": 0.6384422183036804,
"learning_rate": 0.00012138399052193867,
"loss": 0.9722138404846191,
"mean_token_accuracy": 0.7726257860660553,
"num_tokens": 18943381.0,
"step": 4690
},
{
"entropy": 0.9885792315006257,
"epoch": 1.3827596351868197,
"grad_norm": 0.5873745679855347,
"learning_rate": 0.00012106707015792702,
"loss": 1.0171488761901855,
"mean_token_accuracy": 0.7622905492782592,
"num_tokens": 18983723.0,
"step": 4700
},
{
"epoch": 1.3827596351868197,
"eval_entropy": 1.0410449004235856,
"eval_loss": 1.1852467060089111,
"eval_mean_token_accuracy": 0.7411133794021105,
"eval_num_tokens": 18983723.0,
"eval_runtime": 117.0373,
"eval_samples_per_second": 26.017,
"eval_steps_per_second": 3.255,
"step": 4700
},
{
"entropy": 0.9628095984458923,
"epoch": 1.3857016769638129,
"grad_norm": 0.6936764121055603,
"learning_rate": 0.00012074992821679866,
"loss": 0.9454580307006836,
"mean_token_accuracy": 0.7766146242618561,
"num_tokens": 19024209.0,
"step": 4710
},
{
"entropy": 0.9361280083656311,
"epoch": 1.388643718740806,
"grad_norm": 0.6228808164596558,
"learning_rate": 0.00012043256803415723,
"loss": 0.9670154571533203,
"mean_token_accuracy": 0.7755923092365264,
"num_tokens": 19064478.0,
"step": 4720
},
{
"entropy": 0.9650130271911621,
"epoch": 1.3915857605177995,
"grad_norm": 0.6201843619346619,
"learning_rate": 0.00012011499294790188,
"loss": 0.9677058219909668,
"mean_token_accuracy": 0.7731058478355408,
"num_tokens": 19104877.0,
"step": 4730
},
{
"entropy": 0.9925507187843323,
"epoch": 1.3945278022947925,
"grad_norm": 0.6081472635269165,
"learning_rate": 0.00011979720629819195,
"loss": 0.9994287490844727,
"mean_token_accuracy": 0.7684627115726471,
"num_tokens": 19145110.0,
"step": 4740
},
{
"entropy": 0.9522078454494476,
"epoch": 1.3974698440717859,
"grad_norm": 0.5446240305900574,
"learning_rate": 0.00011947921142741197,
"loss": 0.9563077926635742,
"mean_token_accuracy": 0.7776003420352936,
"num_tokens": 19185735.0,
"step": 4750
},
{
"entropy": 0.9502395629882813,
"epoch": 1.400411885848779,
"grad_norm": 0.6443742513656616,
"learning_rate": 0.00011916101168013649,
"loss": 0.9714900016784668,
"mean_token_accuracy": 0.7751095175743103,
"num_tokens": 19226033.0,
"step": 4760
},
{
"entropy": 0.9543413400650025,
"epoch": 1.4033539276257723,
"grad_norm": 0.60639488697052,
"learning_rate": 0.0001188426104030949,
"loss": 0.9422737121582031,
"mean_token_accuracy": 0.77926025390625,
"num_tokens": 19266072.0,
"step": 4770
},
{
"entropy": 0.8988826811313629,
"epoch": 1.4062959694027655,
"grad_norm": 0.6495632529258728,
"learning_rate": 0.00011852401094513621,
"loss": 0.9340031623840332,
"mean_token_accuracy": 0.7852738976478577,
"num_tokens": 19306236.0,
"step": 4780
},
{
"entropy": 1.0078811585903167,
"epoch": 1.4092380111797587,
"grad_norm": 0.6039602160453796,
"learning_rate": 0.00011820521665719377,
"loss": 1.0128409385681152,
"mean_token_accuracy": 0.7650022029876709,
"num_tokens": 19346514.0,
"step": 4790
},
{
"entropy": 1.002649539709091,
"epoch": 1.412180052956752,
"grad_norm": 0.5959292054176331,
"learning_rate": 0.00011788623089225024,
"loss": 0.9958960533142089,
"mean_token_accuracy": 0.7670526385307312,
"num_tokens": 19387036.0,
"step": 4800
},
{
"epoch": 1.412180052956752,
"eval_entropy": 1.0362290414612436,
"eval_loss": 1.1831785440444946,
"eval_mean_token_accuracy": 0.741490173058247,
"eval_num_tokens": 19387036.0,
"eval_runtime": 116.8979,
"eval_samples_per_second": 26.048,
"eval_steps_per_second": 3.259,
"step": 4800
},
{
"entropy": 0.9328351199626923,
"epoch": 1.4151220947337453,
"grad_norm": 0.5852963924407959,
"learning_rate": 0.00011756705700530206,
"loss": 0.9353754043579101,
"mean_token_accuracy": 0.7827379524707794,
"num_tokens": 19427397.0,
"step": 4810
},
{
"entropy": 0.9303468823432922,
"epoch": 1.4180641365107385,
"grad_norm": 0.5715077519416809,
"learning_rate": 0.0001172476983533243,
"loss": 0.9525899887084961,
"mean_token_accuracy": 0.7789464175701142,
"num_tokens": 19467357.0,
"step": 4820
},
{
"entropy": 0.9718841493129731,
"epoch": 1.4210061782877317,
"grad_norm": 0.5573017597198486,
"learning_rate": 0.00011692815829523536,
"loss": 0.974174690246582,
"mean_token_accuracy": 0.7711581230163574,
"num_tokens": 19507831.0,
"step": 4830
},
{
"entropy": 1.0064192593097687,
"epoch": 1.4239482200647249,
"grad_norm": 0.5526002645492554,
"learning_rate": 0.00011660844019186159,
"loss": 1.024794101715088,
"mean_token_accuracy": 0.7663941740989685,
"num_tokens": 19548408.0,
"step": 4840
},
{
"entropy": 1.0319006383419036,
"epoch": 1.426890261841718,
"grad_norm": 0.5624271035194397,
"learning_rate": 0.000116288547405902,
"loss": 1.0340099334716797,
"mean_token_accuracy": 0.760799127817154,
"num_tokens": 19588730.0,
"step": 4850
},
{
"entropy": 0.9651375532150268,
"epoch": 1.4298323036187113,
"grad_norm": 0.5835928320884705,
"learning_rate": 0.00011596848330189282,
"loss": 0.9745287895202637,
"mean_token_accuracy": 0.770064502954483,
"num_tokens": 19629242.0,
"step": 4860
},
{
"entropy": 0.9604414582252503,
"epoch": 1.4327743453957047,
"grad_norm": 0.6139530539512634,
"learning_rate": 0.00011564825124617218,
"loss": 0.967037582397461,
"mean_token_accuracy": 0.7793013870716095,
"num_tokens": 19669879.0,
"step": 4870
},
{
"entropy": 0.9346582233905792,
"epoch": 1.435716387172698,
"grad_norm": 0.6224908232688904,
"learning_rate": 0.00011532785460684466,
"loss": 0.9508832931518555,
"mean_token_accuracy": 0.7788917005062104,
"num_tokens": 19710258.0,
"step": 4880
},
{
"entropy": 0.9697970807552337,
"epoch": 1.438658428949691,
"grad_norm": 0.6372175216674805,
"learning_rate": 0.00011500729675374589,
"loss": 0.9690608978271484,
"mean_token_accuracy": 0.773440134525299,
"num_tokens": 19750412.0,
"step": 4890
},
{
"entropy": 0.9443894863128662,
"epoch": 1.4416004707266843,
"grad_norm": 0.6615795493125916,
"learning_rate": 0.00011468658105840706,
"loss": 0.9526325225830078,
"mean_token_accuracy": 0.7742028653621673,
"num_tokens": 19790931.0,
"step": 4900
},
{
"epoch": 1.4416004707266843,
"eval_entropy": 1.0257837637828717,
"eval_loss": 1.1847437620162964,
"eval_mean_token_accuracy": 0.7422285342779685,
"eval_num_tokens": 19790931.0,
"eval_runtime": 117.0067,
"eval_samples_per_second": 26.024,
"eval_steps_per_second": 3.256,
"step": 4900
},
{
"entropy": 0.9378713190555572,
"epoch": 1.4445425125036775,
"grad_norm": 0.5865938663482666,
"learning_rate": 0.0001143657108940196,
"loss": 0.9491632461547852,
"mean_token_accuracy": 0.7826192319393158,
"num_tokens": 19830998.0,
"step": 4910
},
{
"entropy": 0.8981156468391418,
"epoch": 1.447484554280671,
"grad_norm": 0.5529859066009521,
"learning_rate": 0.00011404468963539945,
"loss": 0.9059307098388671,
"mean_token_accuracy": 0.7881741523742676,
"num_tokens": 19871496.0,
"step": 4920
},
{
"entropy": 0.9764446496963501,
"epoch": 1.4504265960576639,
"grad_norm": 0.5437819361686707,
"learning_rate": 0.00011372352065895185,
"loss": 0.9850223541259766,
"mean_token_accuracy": 0.7667160212993622,
"num_tokens": 19912139.0,
"step": 4930
},
{
"entropy": 1.009915566444397,
"epoch": 1.4533686378346573,
"grad_norm": 0.48672956228256226,
"learning_rate": 0.00011340220734263562,
"loss": 1.009783935546875,
"mean_token_accuracy": 0.765262508392334,
"num_tokens": 19952597.0,
"step": 4940
},
{
"entropy": 0.9249416530132294,
"epoch": 1.4563106796116505,
"grad_norm": 0.5625399351119995,
"learning_rate": 0.00011308075306592771,
"loss": 0.9417881011962891,
"mean_token_accuracy": 0.7811116933822632,
"num_tokens": 19992805.0,
"step": 4950
},
{
"entropy": 0.9421619713306427,
"epoch": 1.4592527213886437,
"grad_norm": 0.5547005534172058,
"learning_rate": 0.00011275916120978769,
"loss": 0.9452463150024414,
"mean_token_accuracy": 0.7800073266029358,
"num_tokens": 20033035.0,
"step": 4960
},
{
"entropy": 0.9370833516120911,
"epoch": 1.462194763165637,
"grad_norm": 0.6140998601913452,
"learning_rate": 0.00011243743515662209,
"loss": 0.9688581466674805,
"mean_token_accuracy": 0.7777487993240356,
"num_tokens": 20073484.0,
"step": 4970
},
{
"entropy": 1.0171671450138091,
"epoch": 1.46513680494263,
"grad_norm": 0.6281487345695496,
"learning_rate": 0.00011211557829024892,
"loss": 0.9987648963928223,
"mean_token_accuracy": 0.7690559566020966,
"num_tokens": 20113884.0,
"step": 4980
},
{
"entropy": 0.9113238871097564,
"epoch": 1.4680788467196235,
"grad_norm": 0.5871062278747559,
"learning_rate": 0.00011179359399586202,
"loss": 0.9162681579589844,
"mean_token_accuracy": 0.7859906852245331,
"num_tokens": 20154035.0,
"step": 4990
},
{
"entropy": 0.9293498694896698,
"epoch": 1.4710208884966167,
"grad_norm": 0.5837708115577698,
"learning_rate": 0.00011147148565999553,
"loss": 0.9455188751220703,
"mean_token_accuracy": 0.7774161994457245,
"num_tokens": 20194398.0,
"step": 5000
},
{
"epoch": 1.4710208884966167,
"eval_entropy": 1.0215178381583196,
"eval_loss": 1.182082176208496,
"eval_mean_token_accuracy": 0.7424087036313034,
"eval_num_tokens": 20194398.0,
"eval_runtime": 116.9583,
"eval_samples_per_second": 26.035,
"eval_steps_per_second": 3.258,
"step": 5000
},
{
"entropy": 0.8972196578979492,
"epoch": 1.47396293027361,
"grad_norm": 0.6664556264877319,
"learning_rate": 0.00011114925667048814,
"loss": 0.897000789642334,
"mean_token_accuracy": 0.7910800576210022,
"num_tokens": 20234842.0,
"step": 5010
},
{
"entropy": 0.9366280138492584,
"epoch": 1.4769049720506031,
"grad_norm": 0.5105628967285156,
"learning_rate": 0.00011082691041644762,
"loss": 0.9634222984313965,
"mean_token_accuracy": 0.7767743766307831,
"num_tokens": 20275050.0,
"step": 5020
},
{
"entropy": 1.0160198926925659,
"epoch": 1.4798470138275963,
"grad_norm": 0.6369785666465759,
"learning_rate": 0.00011050445028821504,
"loss": 1.0192986488342286,
"mean_token_accuracy": 0.763582181930542,
"num_tokens": 20315449.0,
"step": 5030
},
{
"entropy": 1.007281619310379,
"epoch": 1.4827890556045895,
"grad_norm": 0.6209942698478699,
"learning_rate": 0.00011018187967732918,
"loss": 0.9973898887634277,
"mean_token_accuracy": 0.7695520281791687,
"num_tokens": 20355896.0,
"step": 5040
},
{
"entropy": 0.9635446310043335,
"epoch": 1.4857310973815827,
"grad_norm": 0.6402096748352051,
"learning_rate": 0.00010985920197649086,
"loss": 1.0030365943908692,
"mean_token_accuracy": 0.7655979931354523,
"num_tokens": 20396451.0,
"step": 5050
},
{
"entropy": 0.9760343492031097,
"epoch": 1.4886731391585761,
"grad_norm": 0.658566415309906,
"learning_rate": 0.00010953642057952722,
"loss": 0.9697424888610839,
"mean_token_accuracy": 0.7760293245315552,
"num_tokens": 20436615.0,
"step": 5060
},
{
"entropy": 0.9795664429664612,
"epoch": 1.4916151809355693,
"grad_norm": 0.5999899506568909,
"learning_rate": 0.00010921353888135605,
"loss": 0.9814806938171386,
"mean_token_accuracy": 0.7703676760196686,
"num_tokens": 20477070.0,
"step": 5070
},
{
"entropy": 0.9684806585311889,
"epoch": 1.4945572227125625,
"grad_norm": 0.6002531051635742,
"learning_rate": 0.00010889056027795009,
"loss": 0.9861810684204102,
"mean_token_accuracy": 0.7723352909088135,
"num_tokens": 20517284.0,
"step": 5080
},
{
"entropy": 0.9738320171833038,
"epoch": 1.4974992644895557,
"grad_norm": 0.6089113354682922,
"learning_rate": 0.00010856748816630127,
"loss": 0.984062385559082,
"mean_token_accuracy": 0.772194218635559,
"num_tokens": 20557266.0,
"step": 5090
},
{
"entropy": 0.9736992299556733,
"epoch": 1.500441306266549,
"grad_norm": 0.5777798295021057,
"learning_rate": 0.00010824432594438505,
"loss": 0.9862478256225586,
"mean_token_accuracy": 0.7714533090591431,
"num_tokens": 20597675.0,
"step": 5100
},
{
"epoch": 1.500441306266549,
"eval_entropy": 1.0292494011519775,
"eval_loss": 1.1772292852401733,
"eval_mean_token_accuracy": 0.7431638131304363,
"eval_num_tokens": 20597675.0,
"eval_runtime": 116.9139,
"eval_samples_per_second": 26.045,
"eval_steps_per_second": 3.259,
"step": 5100
},
{
"entropy": 0.902402263879776,
"epoch": 1.5033833480435423,
"grad_norm": 0.6047548055648804,
"learning_rate": 0.0001079210770111246,
"loss": 0.9013402938842774,
"mean_token_accuracy": 0.7890763878822327,
"num_tokens": 20638028.0,
"step": 5110
},
{
"entropy": 0.9433676958084106,
"epoch": 1.5063253898205353,
"grad_norm": 0.7120152711868286,
"learning_rate": 0.00010759774476635513,
"loss": 0.9629843711853028,
"mean_token_accuracy": 0.7767750382423401,
"num_tokens": 20677777.0,
"step": 5120
},
{
"entropy": 1.028594321012497,
"epoch": 1.5092674315975287,
"grad_norm": 0.5008658766746521,
"learning_rate": 0.00010727433261078808,
"loss": 1.0339035987854004,
"mean_token_accuracy": 0.757422798871994,
"num_tokens": 20718022.0,
"step": 5130
},
{
"entropy": 1.0165768265724182,
"epoch": 1.512209473374522,
"grad_norm": 0.5533917546272278,
"learning_rate": 0.00010695084394597537,
"loss": 1.0265631675720215,
"mean_token_accuracy": 0.7609834551811219,
"num_tokens": 20758730.0,
"step": 5140
},
{
"entropy": 1.011590701341629,
"epoch": 1.5151515151515151,
"grad_norm": 0.6278111934661865,
"learning_rate": 0.00010662728217427362,
"loss": 1.0116978645324708,
"mean_token_accuracy": 0.7609262108802796,
"num_tokens": 20799202.0,
"step": 5150
},
{
"entropy": 0.9122114419937134,
"epoch": 1.5180935569285083,
"grad_norm": 0.5556227564811707,
"learning_rate": 0.00010630365069880837,
"loss": 0.9219463348388672,
"mean_token_accuracy": 0.7841397285461426,
"num_tokens": 20839777.0,
"step": 5160
},
{
"entropy": 0.9289700865745545,
"epoch": 1.5210355987055015,
"grad_norm": 0.656017005443573,
"learning_rate": 0.00010597995292343827,
"loss": 0.9393370628356934,
"mean_token_accuracy": 0.7785055756568908,
"num_tokens": 20880324.0,
"step": 5170
},
{
"entropy": 0.9671324849128723,
"epoch": 1.523977640482495,
"grad_norm": 0.5946719646453857,
"learning_rate": 0.00010565619225271934,
"loss": 0.9726341247558594,
"mean_token_accuracy": 0.772115957736969,
"num_tokens": 20920903.0,
"step": 5180
},
{
"entropy": 0.9615132927894592,
"epoch": 1.526919682259488,
"grad_norm": 0.5949153304100037,
"learning_rate": 0.00010533237209186904,
"loss": 0.9622700691223145,
"mean_token_accuracy": 0.7711909949779511,
"num_tokens": 20960934.0,
"step": 5190
},
{
"entropy": 0.9015084564685821,
"epoch": 1.5298617240364814,
"grad_norm": 0.594940721988678,
"learning_rate": 0.00010500849584673059,
"loss": 0.9115975379943848,
"mean_token_accuracy": 0.7880061626434326,
"num_tokens": 21001387.0,
"step": 5200
},
{
"epoch": 1.5298617240364814,
"eval_entropy": 1.0002246155669996,
"eval_loss": 1.1774756908416748,
"eval_mean_token_accuracy": 0.7437332308511408,
"eval_num_tokens": 21001387.0,
"eval_runtime": 116.917,
"eval_samples_per_second": 26.044,
"eval_steps_per_second": 3.259,
"step": 5200
},
{
"entropy": 0.9508689403533935,
"epoch": 1.5328037658134746,
"grad_norm": 0.5927796959877014,
"learning_rate": 0.00010468456692373703,
"loss": 0.9860681533813477,
"mean_token_accuracy": 0.7731941938400269,
"num_tokens": 21041244.0,
"step": 5210
},
{
"entropy": 0.9849535644054412,
"epoch": 1.5357458075904677,
"grad_norm": 0.6247168183326721,
"learning_rate": 0.0001043605887298755,
"loss": 0.9714067459106446,
"mean_token_accuracy": 0.7714132785797119,
"num_tokens": 21081657.0,
"step": 5220
},
{
"entropy": 0.9713167667388916,
"epoch": 1.5386878493674612,
"grad_norm": 0.677769124507904,
"learning_rate": 0.00010403656467265138,
"loss": 0.9748648643493653,
"mean_token_accuracy": 0.7746425211429596,
"num_tokens": 21122109.0,
"step": 5230
},
{
"entropy": 0.9380859136581421,
"epoch": 1.5416298911444541,
"grad_norm": 0.6017981767654419,
"learning_rate": 0.00010371249816005235,
"loss": 0.950676441192627,
"mean_token_accuracy": 0.7762335240840912,
"num_tokens": 21162652.0,
"step": 5240
},
{
"entropy": 0.9800868451595306,
"epoch": 1.5445719329214476,
"grad_norm": 0.6067262291908264,
"learning_rate": 0.00010338839260051265,
"loss": 0.9775652885437012,
"mean_token_accuracy": 0.7736145675182342,
"num_tokens": 21203018.0,
"step": 5250
},
{
"entropy": 0.9683707654476166,
"epoch": 1.5475139746984408,
"grad_norm": 0.5600547790527344,
"learning_rate": 0.00010306425140287724,
"loss": 0.9947422027587891,
"mean_token_accuracy": 0.768592232465744,
"num_tokens": 21243193.0,
"step": 5260
},
{
"entropy": 0.9973979473114014,
"epoch": 1.550456016475434,
"grad_norm": 0.6263974905014038,
"learning_rate": 0.00010274007797636589,
"loss": 1.0072562217712402,
"mean_token_accuracy": 0.7686869263648987,
"num_tokens": 21283229.0,
"step": 5270
},
{
"entropy": 0.9464682042598724,
"epoch": 1.5533980582524272,
"grad_norm": 0.6058053970336914,
"learning_rate": 0.00010241587573053732,
"loss": 0.9497817993164063,
"mean_token_accuracy": 0.7785175144672394,
"num_tokens": 21323783.0,
"step": 5280
},
{
"entropy": 1.0010394990444182,
"epoch": 1.5563401000294204,
"grad_norm": 0.6609871983528137,
"learning_rate": 0.0001020916480752534,
"loss": 1.0204454421997071,
"mean_token_accuracy": 0.761556738615036,
"num_tokens": 21364201.0,
"step": 5290
},
{
"entropy": 0.9770765423774719,
"epoch": 1.5592821418064138,
"grad_norm": 0.5533031225204468,
"learning_rate": 0.00010176739842064323,
"loss": 0.9723864555358886,
"mean_token_accuracy": 0.7763941168785096,
"num_tokens": 21404698.0,
"step": 5300
},
{
"epoch": 1.5592821418064138,
"eval_entropy": 1.0454554280896826,
"eval_loss": 1.168828010559082,
"eval_mean_token_accuracy": 0.744419585844976,
"eval_num_tokens": 21404698.0,
"eval_runtime": 116.9992,
"eval_samples_per_second": 26.026,
"eval_steps_per_second": 3.256,
"step": 5300
},
{
"entropy": 0.9656676173210144,
"epoch": 1.5622241835834068,
"grad_norm": 0.6528343558311462,
"learning_rate": 0.00010144313017706726,
"loss": 0.9678720474243164,
"mean_token_accuracy": 0.779698771238327,
"num_tokens": 21445187.0,
"step": 5310
},
{
"entropy": 0.9781722486019134,
"epoch": 1.5651662253604002,
"grad_norm": 0.5709927678108215,
"learning_rate": 0.00010111884675508151,
"loss": 1.010976505279541,
"mean_token_accuracy": 0.7641195952892303,
"num_tokens": 21485680.0,
"step": 5320
},
{
"entropy": 0.9678059935569763,
"epoch": 1.5681082671373934,
"grad_norm": 0.5990722179412842,
"learning_rate": 0.00010079455156540163,
"loss": 0.9739880561828613,
"mean_token_accuracy": 0.777262145280838,
"num_tokens": 21526255.0,
"step": 5330
},
{
"entropy": 0.934472793340683,
"epoch": 1.5710503089143866,
"grad_norm": 0.5266041159629822,
"learning_rate": 0.00010047024801886702,
"loss": 0.9320767402648926,
"mean_token_accuracy": 0.7839205145835877,
"num_tokens": 21566600.0,
"step": 5340
},
{
"entropy": 0.9387928783893585,
"epoch": 1.5739923506913798,
"grad_norm": 0.5499687790870667,
"learning_rate": 0.00010014593952640494,
"loss": 0.9497169494628906,
"mean_token_accuracy": 0.7800655484199523,
"num_tokens": 21607156.0,
"step": 5350
},
{
"entropy": 1.0074778258800507,
"epoch": 1.576934392468373,
"grad_norm": 0.6578675508499146,
"learning_rate": 9.982162949899479e-05,
"loss": 1.020614242553711,
"mean_token_accuracy": 0.765311861038208,
"num_tokens": 21647444.0,
"step": 5360
},
{
"entropy": 0.9886070728302002,
"epoch": 1.5798764342453664,
"grad_norm": 0.6587820053100586,
"learning_rate": 9.949732134763199e-05,
"loss": 0.993044662475586,
"mean_token_accuracy": 0.7681563913822174,
"num_tokens": 21688001.0,
"step": 5370
},
{
"entropy": 0.899987381696701,
"epoch": 1.5828184760223594,
"grad_norm": 0.5484562516212463,
"learning_rate": 9.917301848329231e-05,
"loss": 0.9030593872070313,
"mean_token_accuracy": 0.7870559990406036,
"num_tokens": 21727882.0,
"step": 5380
},
{
"entropy": 0.9620799243450164,
"epoch": 1.5857605177993528,
"grad_norm": 0.619149386882782,
"learning_rate": 9.884872431689581e-05,
"loss": 0.9912420272827148,
"mean_token_accuracy": 0.7723720014095307,
"num_tokens": 21767753.0,
"step": 5390
},
{
"entropy": 1.000234466791153,
"epoch": 1.588702559576346,
"grad_norm": 0.5575194954872131,
"learning_rate": 9.852444225927122e-05,
"loss": 0.9978320121765136,
"mean_token_accuracy": 0.7687974095344543,
"num_tokens": 21808266.0,
"step": 5400
},
{
"epoch": 1.588702559576346,
"eval_entropy": 1.0398932406752128,
"eval_loss": 1.1691675186157227,
"eval_mean_token_accuracy": 0.7446287646694133,
"eval_num_tokens": 21808266.0,
"eval_runtime": 116.9348,
"eval_samples_per_second": 26.04,
"eval_steps_per_second": 3.258,
"step": 5400
},
{
"entropy": 0.9786124050617218,
"epoch": 1.5916446013533392,
"grad_norm": 0.5668993592262268,
"learning_rate": 9.820017572111973e-05,
"loss": 0.9736597061157226,
"mean_token_accuracy": 0.768933230638504,
"num_tokens": 21848699.0,
"step": 5410
},
{
"entropy": 0.9540457367897034,
"epoch": 1.5945866431303326,
"grad_norm": 0.5640490651130676,
"learning_rate": 9.787592811297946e-05,
"loss": 0.9902207374572753,
"mean_token_accuracy": 0.7725074052810669,
"num_tokens": 21888992.0,
"step": 5420
},
{
"entropy": 1.0212572634220123,
"epoch": 1.5975286849073256,
"grad_norm": 0.5303104519844055,
"learning_rate": 9.755170284518941e-05,
"loss": 1.0194875717163085,
"mean_token_accuracy": 0.7608138382434845,
"num_tokens": 21929626.0,
"step": 5430
},
{
"entropy": 0.9701859831809998,
"epoch": 1.600470726684319,
"grad_norm": 0.5962478518486023,
"learning_rate": 9.722750332785349e-05,
"loss": 0.9606605529785156,
"mean_token_accuracy": 0.7774016797542572,
"num_tokens": 21970234.0,
"step": 5440
},
{
"entropy": 0.9712904334068299,
"epoch": 1.6034127684613122,
"grad_norm": 0.6927991509437561,
"learning_rate": 9.690333297080493e-05,
"loss": 0.9966094970703125,
"mean_token_accuracy": 0.7730933606624604,
"num_tokens": 22010748.0,
"step": 5450
},
{
"entropy": 0.9969470083713532,
"epoch": 1.6063548102383054,
"grad_norm": 0.48069873452186584,
"learning_rate": 9.657919518357008e-05,
"loss": 1.0035072326660157,
"mean_token_accuracy": 0.7701967000961304,
"num_tokens": 22051287.0,
"step": 5460
},
{
"entropy": 0.9618437588214874,
"epoch": 1.6092968520152986,
"grad_norm": 0.5836319923400879,
"learning_rate": 9.625509337533296e-05,
"loss": 0.9715272903442382,
"mean_token_accuracy": 0.7741812229156494,
"num_tokens": 22091789.0,
"step": 5470
},
{
"entropy": 0.92412930727005,
"epoch": 1.6122388937922918,
"grad_norm": 0.5578837990760803,
"learning_rate": 9.593103095489895e-05,
"loss": 0.9371700286865234,
"mean_token_accuracy": 0.780053973197937,
"num_tokens": 22132202.0,
"step": 5480
},
{
"entropy": 0.9944902658462524,
"epoch": 1.6151809355692852,
"grad_norm": 0.6106790900230408,
"learning_rate": 9.560701133065932e-05,
"loss": 0.9925059318542481,
"mean_token_accuracy": 0.771945059299469,
"num_tokens": 22172268.0,
"step": 5490
},
{
"entropy": 0.9864640951156616,
"epoch": 1.6181229773462782,
"grad_norm": 0.6734248399734497,
"learning_rate": 9.528303791055511e-05,
"loss": 1.001873779296875,
"mean_token_accuracy": 0.7679209113121033,
"num_tokens": 22212768.0,
"step": 5500
},
{
"epoch": 1.6181229773462782,
"eval_entropy": 1.017620304985622,
"eval_loss": 1.1692047119140625,
"eval_mean_token_accuracy": 0.7452891367314056,
"eval_num_tokens": 22212768.0,
"eval_runtime": 116.9536,
"eval_samples_per_second": 26.036,
"eval_steps_per_second": 3.258,
"step": 5500
},
{
"entropy": 1.0032637119293213,
"epoch": 1.6210650191232716,
"grad_norm": 0.5667787194252014,
"learning_rate": 9.49591141020415e-05,
"loss": 1.0222906112670898,
"mean_token_accuracy": 0.7597410917282105,
"num_tokens": 22252809.0,
"step": 5510
},
{
"entropy": 0.9571436047554016,
"epoch": 1.6240070609002648,
"grad_norm": 0.5619379281997681,
"learning_rate": 9.463524331205183e-05,
"loss": 0.9508034706115722,
"mean_token_accuracy": 0.7767503380775451,
"num_tokens": 22293323.0,
"step": 5520
},
{
"entropy": 0.9551607072353363,
"epoch": 1.626949102677258,
"grad_norm": 0.5588138699531555,
"learning_rate": 9.431142894696174e-05,
"loss": 0.968760871887207,
"mean_token_accuracy": 0.7735034108161927,
"num_tokens": 22333734.0,
"step": 5530
},
{
"entropy": 0.9952653288841248,
"epoch": 1.6298911444542512,
"grad_norm": 0.6699135899543762,
"learning_rate": 9.398767441255356e-05,
"loss": 1.0053104400634765,
"mean_token_accuracy": 0.7669021725654602,
"num_tokens": 22373932.0,
"step": 5540
},
{
"entropy": 0.9081286668777466,
"epoch": 1.6328331862312444,
"grad_norm": 0.5072507858276367,
"learning_rate": 9.366398311398013e-05,
"loss": 0.9016472816467285,
"mean_token_accuracy": 0.7893808305263519,
"num_tokens": 22413965.0,
"step": 5550
},
{
"entropy": 0.8828806400299072,
"epoch": 1.6357752280082378,
"grad_norm": 0.5978274941444397,
"learning_rate": 9.334035845572945e-05,
"loss": 0.8982778549194336,
"mean_token_accuracy": 0.7933094263076782,
"num_tokens": 22454470.0,
"step": 5560
},
{
"entropy": 0.9773084461688996,
"epoch": 1.6387172697852308,
"grad_norm": 0.6699267029762268,
"learning_rate": 9.301680384158834e-05,
"loss": 0.9834060668945312,
"mean_token_accuracy": 0.7695248782634735,
"num_tokens": 22494739.0,
"step": 5570
},
{
"entropy": 0.9948910295963287,
"epoch": 1.6416593115622242,
"grad_norm": 0.6252007484436035,
"learning_rate": 9.269332267460717e-05,
"loss": 1.0045485496520996,
"mean_token_accuracy": 0.7613544285297393,
"num_tokens": 22535199.0,
"step": 5580
},
{
"entropy": 0.9052659869194031,
"epoch": 1.6446013533392174,
"grad_norm": 0.6294770240783691,
"learning_rate": 9.236991835706361e-05,
"loss": 0.9096663475036622,
"mean_token_accuracy": 0.7889019668102264,
"num_tokens": 22575473.0,
"step": 5590
},
{
"entropy": 0.9944833815097809,
"epoch": 1.6475433951162106,
"grad_norm": 0.6281090378761292,
"learning_rate": 9.204659429042723e-05,
"loss": 1.001138401031494,
"mean_token_accuracy": 0.766866946220398,
"num_tokens": 22615909.0,
"step": 5600
},
{
"epoch": 1.6475433951162106,
"eval_entropy": 1.0232390931897901,
"eval_loss": 1.1652106046676636,
"eval_mean_token_accuracy": 0.7458901661900398,
"eval_num_tokens": 22615909.0,
"eval_runtime": 116.9138,
"eval_samples_per_second": 26.045,
"eval_steps_per_second": 3.259,
"step": 5600
},
{
"entropy": 0.9154108762741089,
"epoch": 1.650485436893204,
"grad_norm": 0.6548126935958862,
"learning_rate": 9.172335387532337e-05,
"loss": 0.9116446495056152,
"mean_token_accuracy": 0.7843240320682525,
"num_tokens": 22656406.0,
"step": 5610
},
{
"entropy": 0.9418833196163178,
"epoch": 1.653427478670197,
"grad_norm": 0.5818053483963013,
"learning_rate": 9.140020051149768e-05,
"loss": 0.9640005111694336,
"mean_token_accuracy": 0.7761074662208557,
"num_tokens": 22697053.0,
"step": 5620
},
{
"entropy": 1.0457957625389098,
"epoch": 1.6563695204471904,
"grad_norm": 0.563709020614624,
"learning_rate": 9.107713759778022e-05,
"loss": 1.0556187629699707,
"mean_token_accuracy": 0.7564215302467346,
"num_tokens": 22737587.0,
"step": 5630
},
{
"entropy": 0.9574743151664734,
"epoch": 1.6593115622241836,
"grad_norm": 0.5461353659629822,
"learning_rate": 9.075416853204962e-05,
"loss": 0.9538365364074707,
"mean_token_accuracy": 0.7788529932498932,
"num_tokens": 22778087.0,
"step": 5640
},
{
"entropy": 0.9407840669155121,
"epoch": 1.6622536040011768,
"grad_norm": 0.6229255199432373,
"learning_rate": 9.043129671119757e-05,
"loss": 0.9592649459838867,
"mean_token_accuracy": 0.7759726524353028,
"num_tokens": 22818636.0,
"step": 5650
},
{
"entropy": 0.9525330364704132,
"epoch": 1.66519564577817,
"grad_norm": 0.517737627029419,
"learning_rate": 9.010852553109286e-05,
"loss": 0.9527083396911621,
"mean_token_accuracy": 0.7764707028865814,
"num_tokens": 22858762.0,
"step": 5660
},
{
"entropy": 0.9349599361419678,
"epoch": 1.6681376875551632,
"grad_norm": 0.5596241354942322,
"learning_rate": 8.97858583865459e-05,
"loss": 0.958860206604004,
"mean_token_accuracy": 0.7790948987007141,
"num_tokens": 22898952.0,
"step": 5670
},
{
"entropy": 1.007050359249115,
"epoch": 1.6710797293321566,
"grad_norm": 0.5459507703781128,
"learning_rate": 8.946329867127271e-05,
"loss": 1.009850311279297,
"mean_token_accuracy": 0.7651616752147674,
"num_tokens": 22939561.0,
"step": 5680
},
{
"entropy": 1.0099753439426422,
"epoch": 1.6740217711091496,
"grad_norm": 0.5861520171165466,
"learning_rate": 8.91408497778596e-05,
"loss": 1.0120088577270507,
"mean_token_accuracy": 0.7637844383716583,
"num_tokens": 22979861.0,
"step": 5690
},
{
"entropy": 0.9966277837753296,
"epoch": 1.676963812886143,
"grad_norm": 0.5946571826934814,
"learning_rate": 8.88185150977271e-05,
"loss": 1.007601833343506,
"mean_token_accuracy": 0.7666476130485534,
"num_tokens": 23020082.0,
"step": 5700
},
{
"epoch": 1.676963812886143,
"eval_entropy": 1.0183745707113911,
"eval_loss": 1.1620737314224243,
"eval_mean_token_accuracy": 0.7465485564367039,
"eval_num_tokens": 23020082.0,
"eval_runtime": 116.931,
"eval_samples_per_second": 26.041,
"eval_steps_per_second": 3.258,
"step": 5700
},
{
"entropy": 0.9435779750347137,
"epoch": 1.6799058546631362,
"grad_norm": 0.6357449293136597,
"learning_rate": 8.849629802109463e-05,
"loss": 0.9518097877502442,
"mean_token_accuracy": 0.7776012420654297,
"num_tokens": 23060478.0,
"step": 5710
},
{
"entropy": 0.9298249185085297,
"epoch": 1.6828478964401294,
"grad_norm": 0.5645864009857178,
"learning_rate": 8.817420193694458e-05,
"loss": 0.9362122535705566,
"mean_token_accuracy": 0.7816990375518799,
"num_tokens": 23100989.0,
"step": 5720
},
{
"entropy": 0.9536093771457672,
"epoch": 1.6857899382171226,
"grad_norm": 0.6313226819038391,
"learning_rate": 8.785223023298684e-05,
"loss": 0.9790238380432129,
"mean_token_accuracy": 0.7684462070465088,
"num_tokens": 23141372.0,
"step": 5730
},
{
"entropy": 0.980869197845459,
"epoch": 1.6887319799941158,
"grad_norm": 0.5673984885215759,
"learning_rate": 8.753038629562313e-05,
"loss": 0.9640357971191407,
"mean_token_accuracy": 0.7741734623908997,
"num_tokens": 23181824.0,
"step": 5740
},
{
"entropy": 0.9453027725219727,
"epoch": 1.6916740217711093,
"grad_norm": 0.6107286214828491,
"learning_rate": 8.720867350991128e-05,
"loss": 0.973298454284668,
"mean_token_accuracy": 0.7727717578411102,
"num_tokens": 23222185.0,
"step": 5750
},
{
"entropy": 0.9626853942871094,
"epoch": 1.6946160635481022,
"grad_norm": 0.5757405757904053,
"learning_rate": 8.688709525952978e-05,
"loss": 0.9758338928222656,
"mean_token_accuracy": 0.772016829252243,
"num_tokens": 23262748.0,
"step": 5760
},
{
"entropy": 1.010034316778183,
"epoch": 1.6975581053250957,
"grad_norm": 0.5179459452629089,
"learning_rate": 8.656565492674205e-05,
"loss": 1.0052608489990233,
"mean_token_accuracy": 0.7618889510631561,
"num_tokens": 23303156.0,
"step": 5770
},
{
"entropy": 0.8848201990127563,
"epoch": 1.7005001471020889,
"grad_norm": 0.5171420574188232,
"learning_rate": 8.6244355892361e-05,
"loss": 0.8950881958007812,
"mean_token_accuracy": 0.7918769180774688,
"num_tokens": 23343542.0,
"step": 5780
},
{
"entropy": 0.97591233253479,
"epoch": 1.703442188879082,
"grad_norm": 0.5939028859138489,
"learning_rate": 8.592320153571334e-05,
"loss": 1.006781005859375,
"mean_token_accuracy": 0.7655499637126922,
"num_tokens": 23384111.0,
"step": 5790
},
{
"entropy": 1.0557245433330535,
"epoch": 1.7063842306560755,
"grad_norm": 0.5849112868309021,
"learning_rate": 8.56021952346042e-05,
"loss": 1.0547979354858399,
"mean_token_accuracy": 0.7541541934013367,
"num_tokens": 23424325.0,
"step": 5800
},
{
"epoch": 1.7063842306560755,
"eval_entropy": 1.0359160511356014,
"eval_loss": 1.1574015617370605,
"eval_mean_token_accuracy": 0.7470338016044437,
"eval_num_tokens": 23424325.0,
"eval_runtime": 117.0424,
"eval_samples_per_second": 26.016,
"eval_steps_per_second": 3.255,
"step": 5800
},
{
"entropy": 0.9237102508544922,
"epoch": 1.7093262724330684,
"grad_norm": 0.6681418418884277,
"learning_rate": 8.528134036528137e-05,
"loss": 0.9113170623779296,
"mean_token_accuracy": 0.7863486051559448,
"num_tokens": 23464980.0,
"step": 5810
},
{
"entropy": 0.9206311643123627,
"epoch": 1.7122683142100619,
"grad_norm": 0.6511064767837524,
"learning_rate": 8.496064030240009e-05,
"loss": 0.9329397201538085,
"mean_token_accuracy": 0.7813911080360413,
"num_tokens": 23505413.0,
"step": 5820
},
{
"entropy": 0.9037490963935852,
"epoch": 1.715210355987055,
"grad_norm": 0.5818105936050415,
"learning_rate": 8.464009841898721e-05,
"loss": 0.9231014251708984,
"mean_token_accuracy": 0.7818919003009797,
"num_tokens": 23545900.0,
"step": 5830
},
{
"entropy": 0.9442632138729096,
"epoch": 1.7181523977640483,
"grad_norm": 0.5522459745407104,
"learning_rate": 8.431971808640604e-05,
"loss": 0.9421136856079102,
"mean_token_accuracy": 0.7791707038879394,
"num_tokens": 23586448.0,
"step": 5840
},
{
"entropy": 0.9893321812152862,
"epoch": 1.7210944395410415,
"grad_norm": 0.5959654450416565,
"learning_rate": 8.399950267432073e-05,
"loss": 0.9956209182739257,
"mean_token_accuracy": 0.7700819849967957,
"num_tokens": 23626702.0,
"step": 5850
},
{
"entropy": 0.9568989872932434,
"epoch": 1.7240364813180347,
"grad_norm": 0.573197066783905,
"learning_rate": 8.367945555066072e-05,
"loss": 0.9633212089538574,
"mean_token_accuracy": 0.7744130253791809,
"num_tokens": 23666990.0,
"step": 5860
},
{
"entropy": 0.9612068772315979,
"epoch": 1.726978523095028,
"grad_norm": 0.5556803941726685,
"learning_rate": 8.335958008158564e-05,
"loss": 0.9782578468322753,
"mean_token_accuracy": 0.7722434878349305,
"num_tokens": 23707419.0,
"step": 5870
},
{
"entropy": 0.9662620186805725,
"epoch": 1.729920564872021,
"grad_norm": 0.5682553648948669,
"learning_rate": 8.303987963144951e-05,
"loss": 0.980229663848877,
"mean_token_accuracy": 0.7722213745117188,
"num_tokens": 23747899.0,
"step": 5880
},
{
"entropy": 0.9563367903232575,
"epoch": 1.7328626066490145,
"grad_norm": 0.644250214099884,
"learning_rate": 8.272035756276574e-05,
"loss": 0.9506181716918946,
"mean_token_accuracy": 0.7787576794624329,
"num_tokens": 23788539.0,
"step": 5890
},
{
"entropy": 0.9529013156890869,
"epoch": 1.7358046484260077,
"grad_norm": 0.6191350817680359,
"learning_rate": 8.240101723617138e-05,
"loss": 0.9677048683166504,
"mean_token_accuracy": 0.7735692262649536,
"num_tokens": 23829048.0,
"step": 5900
},
{
"epoch": 1.7358046484260077,
"eval_entropy": 1.0162569234377443,
"eval_loss": 1.158705234527588,
"eval_mean_token_accuracy": 0.7472113214452748,
"eval_num_tokens": 23829048.0,
"eval_runtime": 117.138,
"eval_samples_per_second": 25.995,
"eval_steps_per_second": 3.253,
"step": 5900
},
{
"entropy": 0.9016938626766204,
"epoch": 1.7387466902030009,
"grad_norm": 0.5551770329475403,
"learning_rate": 8.208186201039215e-05,
"loss": 0.9020340919494629,
"mean_token_accuracy": 0.7864646017551422,
"num_tokens": 23869463.0,
"step": 5910
},
{
"entropy": 0.9148910164833068,
"epoch": 1.741688731979994,
"grad_norm": 0.6301241517066956,
"learning_rate": 8.176289524220682e-05,
"loss": 0.9415826797485352,
"mean_token_accuracy": 0.7839051008224487,
"num_tokens": 23909844.0,
"step": 5920
},
{
"entropy": 0.9921420514583588,
"epoch": 1.7446307737569873,
"grad_norm": 0.5697426199913025,
"learning_rate": 8.144412028641213e-05,
"loss": 0.9879927635192871,
"mean_token_accuracy": 0.7715160071849823,
"num_tokens": 23949594.0,
"step": 5930
},
{
"entropy": 0.9673746526241302,
"epoch": 1.7475728155339807,
"grad_norm": 0.7032338976860046,
"learning_rate": 8.112554049578723e-05,
"loss": 0.9581163406372071,
"mean_token_accuracy": 0.7709846436977387,
"num_tokens": 23989917.0,
"step": 5940
},
{
"entropy": 0.9318383634090424,
"epoch": 1.7505148573109737,
"grad_norm": 0.5314655900001526,
"learning_rate": 8.080715922105873e-05,
"loss": 0.948590087890625,
"mean_token_accuracy": 0.7803104817867279,
"num_tokens": 24029981.0,
"step": 5950
},
{
"entropy": 0.9275396764278412,
"epoch": 1.753456899087967,
"grad_norm": 0.6153765320777893,
"learning_rate": 8.048897981086527e-05,
"loss": 0.9455156326293945,
"mean_token_accuracy": 0.7832433462142945,
"num_tokens": 24070430.0,
"step": 5960
},
{
"entropy": 0.9975035011768341,
"epoch": 1.7563989408649603,
"grad_norm": 0.5570023059844971,
"learning_rate": 8.017100561172228e-05,
"loss": 0.9986433982849121,
"mean_token_accuracy": 0.7639374434947968,
"num_tokens": 24110808.0,
"step": 5970
},
{
"entropy": 0.985156637430191,
"epoch": 1.7593409826419535,
"grad_norm": 0.5818830132484436,
"learning_rate": 7.985323996798696e-05,
"loss": 1.0052236557006835,
"mean_token_accuracy": 0.7657383203506469,
"num_tokens": 24151389.0,
"step": 5980
},
{
"entropy": 0.8605277717113495,
"epoch": 1.762283024418947,
"grad_norm": 0.6189702153205872,
"learning_rate": 7.953568622182283e-05,
"loss": 0.8629916191101075,
"mean_token_accuracy": 0.8023348867893219,
"num_tokens": 24191459.0,
"step": 5990
},
{
"entropy": 0.9501297771930695,
"epoch": 1.7652250661959399,
"grad_norm": 0.6018420457839966,
"learning_rate": 7.921834771316489e-05,
"loss": 0.9521394729614258,
"mean_token_accuracy": 0.7792097210884095,
"num_tokens": 24231918.0,
"step": 6000
},
{
"epoch": 1.7652250661959399,
"eval_entropy": 1.0099175068650033,
"eval_loss": 1.1563514471054077,
"eval_mean_token_accuracy": 0.7476834809686256,
"eval_num_tokens": 24231918.0,
"eval_runtime": 117.1325,
"eval_samples_per_second": 25.996,
"eval_steps_per_second": 3.253,
"step": 6000
},
{
"entropy": 0.9893746197223663,
"epoch": 1.7681671079729333,
"grad_norm": 0.6696788668632507,
"learning_rate": 7.890122777968419e-05,
"loss": 1.0002012252807617,
"mean_token_accuracy": 0.7695098519325256,
"num_tokens": 24271949.0,
"step": 6010
},
{
"entropy": 0.9130432605743408,
"epoch": 1.7711091497499265,
"grad_norm": 0.558464527130127,
"learning_rate": 7.858432975675304e-05,
"loss": 0.9236593246459961,
"mean_token_accuracy": 0.7876148581504822,
"num_tokens": 24312102.0,
"step": 6020
},
{
"entropy": 0.9415574371814728,
"epoch": 1.7740511915269197,
"grad_norm": 0.5865418314933777,
"learning_rate": 7.826765697740957e-05,
"loss": 0.9479406356811524,
"mean_token_accuracy": 0.7780888140201568,
"num_tokens": 24352693.0,
"step": 6030
},
{
"entropy": 0.9924417555332183,
"epoch": 1.776993233303913,
"grad_norm": 0.6885764598846436,
"learning_rate": 7.795121277232302e-05,
"loss": 0.9945806503295899,
"mean_token_accuracy": 0.7648349285125733,
"num_tokens": 24393351.0,
"step": 6040
},
{
"entropy": 0.984746390581131,
"epoch": 1.779935275080906,
"grad_norm": 0.5965508222579956,
"learning_rate": 7.763500046975853e-05,
"loss": 0.9933752059936524,
"mean_token_accuracy": 0.7661060273647309,
"num_tokens": 24433759.0,
"step": 6050
},
{
"entropy": 0.9540082156658173,
"epoch": 1.7828773168578995,
"grad_norm": 0.5714673399925232,
"learning_rate": 7.731902339554206e-05,
"loss": 0.9627056121826172,
"mean_token_accuracy": 0.7745068728923797,
"num_tokens": 24474130.0,
"step": 6060
},
{
"entropy": 0.9777018666267395,
"epoch": 1.7858193586348925,
"grad_norm": 0.6372447609901428,
"learning_rate": 7.700328487302565e-05,
"loss": 1.0012830734252929,
"mean_token_accuracy": 0.7722647070884705,
"num_tokens": 24514543.0,
"step": 6070
},
{
"entropy": 0.9325755536556244,
"epoch": 1.788761400411886,
"grad_norm": 0.6036056876182556,
"learning_rate": 7.668778822305217e-05,
"loss": 0.9412235260009766,
"mean_token_accuracy": 0.7827513933181762,
"num_tokens": 24554910.0,
"step": 6080
},
{
"entropy": 0.9325757265090943,
"epoch": 1.7917034421888791,
"grad_norm": 0.5404115319252014,
"learning_rate": 7.637253676392074e-05,
"loss": 0.9362624168395997,
"mean_token_accuracy": 0.7818540692329407,
"num_tokens": 24595471.0,
"step": 6090
},
{
"entropy": 0.9863007128238678,
"epoch": 1.7946454839658723,
"grad_norm": 0.5063208341598511,
"learning_rate": 7.605753381135138e-05,
"loss": 0.9955570220947265,
"mean_token_accuracy": 0.7685826182365417,
"num_tokens": 24635954.0,
"step": 6100
},
{
"epoch": 1.7946454839658723,
"eval_entropy": 1.018375001085086,
"eval_loss": 1.1510425806045532,
"eval_mean_token_accuracy": 0.7484834164459249,
"eval_num_tokens": 24635954.0,
"eval_runtime": 117.0613,
"eval_samples_per_second": 26.012,
"eval_steps_per_second": 3.255,
"step": 6100
},
{
"entropy": 0.9858646035194397,
"epoch": 1.7975875257428655,
"grad_norm": 0.5223714113235474,
"learning_rate": 7.574278267845069e-05,
"loss": 0.9794650077819824,
"mean_token_accuracy": 0.7720021724700927,
"num_tokens": 24676082.0,
"step": 6110
},
{
"entropy": 0.9158162832260132,
"epoch": 1.8005295675198587,
"grad_norm": 0.5589065551757812,
"learning_rate": 7.542828667567643e-05,
"loss": 0.9300893783569336,
"mean_token_accuracy": 0.7815610766410828,
"num_tokens": 24716092.0,
"step": 6120
},
{
"entropy": 0.9701406300067902,
"epoch": 1.8034716092968521,
"grad_norm": 0.5639879107475281,
"learning_rate": 7.511404911080319e-05,
"loss": 0.9756647109985351,
"mean_token_accuracy": 0.7717652976512909,
"num_tokens": 24756492.0,
"step": 6130
},
{
"entropy": 0.9247495532035828,
"epoch": 1.806413651073845,
"grad_norm": 0.5797274708747864,
"learning_rate": 7.480007328888724e-05,
"loss": 0.936155128479004,
"mean_token_accuracy": 0.7829030692577362,
"num_tokens": 24796749.0,
"step": 6140
},
{
"entropy": 0.938025814294815,
"epoch": 1.8093556928508385,
"grad_norm": 0.5666895508766174,
"learning_rate": 7.4486362512232e-05,
"loss": 0.9645057678222656,
"mean_token_accuracy": 0.7746810853481293,
"num_tokens": 24837331.0,
"step": 6150
},
{
"entropy": 0.9641852080821991,
"epoch": 1.8122977346278317,
"grad_norm": 0.599879801273346,
"learning_rate": 7.417292008035324e-05,
"loss": 0.9777825355529786,
"mean_token_accuracy": 0.7724472403526306,
"num_tokens": 24877763.0,
"step": 6160
},
{
"entropy": 0.9553499221801758,
"epoch": 1.815239776404825,
"grad_norm": 0.6117326617240906,
"learning_rate": 7.385974928994424e-05,
"loss": 0.9453524589538574,
"mean_token_accuracy": 0.7802985668182373,
"num_tokens": 24918137.0,
"step": 6170
},
{
"entropy": 0.9422242820262909,
"epoch": 1.8181818181818183,
"grad_norm": 0.5855090618133545,
"learning_rate": 7.354685343484141e-05,
"loss": 0.9547875404357911,
"mean_token_accuracy": 0.7761439323425293,
"num_tokens": 24958540.0,
"step": 6180
},
{
"entropy": 0.9967733740806579,
"epoch": 1.8211238599588113,
"grad_norm": 0.6166797876358032,
"learning_rate": 7.323423580598929e-05,
"loss": 1.0093032836914062,
"mean_token_accuracy": 0.7644744515419006,
"num_tokens": 24999010.0,
"step": 6190
},
{
"entropy": 0.941829913854599,
"epoch": 1.8240659017358047,
"grad_norm": 0.5976388454437256,
"learning_rate": 7.292189969140627e-05,
"loss": 0.947477912902832,
"mean_token_accuracy": 0.7809899151325226,
"num_tokens": 25039340.0,
"step": 6200
},
{
"epoch": 1.8240659017358047,
"eval_entropy": 1.0210820323056748,
"eval_loss": 1.149355173110962,
"eval_mean_token_accuracy": 0.7488088197908377,
"eval_num_tokens": 25039340.0,
"eval_runtime": 117.0315,
"eval_samples_per_second": 26.019,
"eval_steps_per_second": 3.256,
"step": 6200
},
{
"entropy": 0.968848192691803,
"epoch": 1.8270079435127977,
"grad_norm": 0.5740299224853516,
"learning_rate": 7.260984837614976e-05,
"loss": 0.97001953125,
"mean_token_accuracy": 0.7758668541908265,
"num_tokens": 25079782.0,
"step": 6210
},
{
"entropy": 0.9311902105808259,
"epoch": 1.8299499852897911,
"grad_norm": 0.595485508441925,
"learning_rate": 7.229808514228182e-05,
"loss": 0.9422932624816894,
"mean_token_accuracy": 0.779161137342453,
"num_tokens": 25120387.0,
"step": 6220
},
{
"entropy": 0.904720538854599,
"epoch": 1.8328920270667843,
"grad_norm": 0.6421226263046265,
"learning_rate": 7.198661326883446e-05,
"loss": 0.8994368553161621,
"mean_token_accuracy": 0.7901805222034455,
"num_tokens": 25160952.0,
"step": 6230
},
{
"entropy": 0.9412755012512207,
"epoch": 1.8358340688437775,
"grad_norm": 0.5241137742996216,
"learning_rate": 7.167543603177535e-05,
"loss": 0.9635884284973144,
"mean_token_accuracy": 0.7794827520847321,
"num_tokens": 25201298.0,
"step": 6240
},
{
"entropy": 0.9426091015338898,
"epoch": 1.838776110620771,
"grad_norm": 0.599162220954895,
"learning_rate": 7.136455670397317e-05,
"loss": 0.9373735427856446,
"mean_token_accuracy": 0.7805384159088135,
"num_tokens": 25241730.0,
"step": 6250
},
{
"entropy": 0.8965267837047577,
"epoch": 1.841718152397764,
"grad_norm": 0.5050822496414185,
"learning_rate": 7.105397855516332e-05,
"loss": 0.8982341766357422,
"mean_token_accuracy": 0.7888839960098266,
"num_tokens": 25282116.0,
"step": 6260
},
{
"entropy": 0.9214460253715515,
"epoch": 1.8446601941747574,
"grad_norm": 0.6280463933944702,
"learning_rate": 7.074370485191353e-05,
"loss": 0.9190691947937012,
"mean_token_accuracy": 0.7820965766906738,
"num_tokens": 25322627.0,
"step": 6270
},
{
"entropy": 0.892383873462677,
"epoch": 1.8476022359517505,
"grad_norm": 0.582876980304718,
"learning_rate": 7.043373885758938e-05,
"loss": 0.9149553298950195,
"mean_token_accuracy": 0.786136794090271,
"num_tokens": 25362914.0,
"step": 6280
},
{
"entropy": 0.9371729671955109,
"epoch": 1.8505442777287437,
"grad_norm": 0.5959920287132263,
"learning_rate": 7.012408383232016e-05,
"loss": 0.9573171615600586,
"mean_token_accuracy": 0.7788842916488647,
"num_tokens": 25403259.0,
"step": 6290
},
{
"entropy": 0.9701938092708587,
"epoch": 1.853486319505737,
"grad_norm": 0.5748452544212341,
"learning_rate": 6.981474303296436e-05,
"loss": 0.9730587005615234,
"mean_token_accuracy": 0.7726157248020172,
"num_tokens": 25443719.0,
"step": 6300
},
{
"epoch": 1.853486319505737,
"eval_entropy": 1.0170868068855266,
"eval_loss": 1.148207426071167,
"eval_mean_token_accuracy": 0.7491307286765632,
"eval_num_tokens": 25443719.0,
"eval_runtime": 116.8997,
"eval_samples_per_second": 26.048,
"eval_steps_per_second": 3.259,
"step": 6300
},
{
"entropy": 0.9635331034660339,
"epoch": 1.8564283612827301,
"grad_norm": 0.6185274720191956,
"learning_rate": 6.950571971307566e-05,
"loss": 0.9592901229858398,
"mean_token_accuracy": 0.7760994493961334,
"num_tokens": 25484099.0,
"step": 6310
},
{
"entropy": 0.9286525547504425,
"epoch": 1.8593704030597236,
"grad_norm": 0.6655700206756592,
"learning_rate": 6.919701712286845e-05,
"loss": 0.949979591369629,
"mean_token_accuracy": 0.7771933794021606,
"num_tokens": 25524367.0,
"step": 6320
},
{
"entropy": 0.9677168548107147,
"epoch": 1.8623124448367165,
"grad_norm": 0.6009240746498108,
"learning_rate": 6.888863850918397e-05,
"loss": 0.9582759857177734,
"mean_token_accuracy": 0.7760764300823212,
"num_tokens": 25564686.0,
"step": 6330
},
{
"entropy": 0.8910984754562378,
"epoch": 1.86525448661371,
"grad_norm": 0.5529386401176453,
"learning_rate": 6.858058711545576e-05,
"loss": 0.8990293502807617,
"mean_token_accuracy": 0.7910106480121613,
"num_tokens": 25604757.0,
"step": 6340
},
{
"entropy": 0.9290744543075562,
"epoch": 1.8681965283907032,
"grad_norm": 0.5405545234680176,
"learning_rate": 6.827286618167593e-05,
"loss": 0.9487281799316406,
"mean_token_accuracy": 0.7793277740478516,
"num_tokens": 25645415.0,
"step": 6350
},
{
"entropy": 0.9454367399215698,
"epoch": 1.8711385701676964,
"grad_norm": 0.5304380059242249,
"learning_rate": 6.796547894436078e-05,
"loss": 0.9380789756774902,
"mean_token_accuracy": 0.7802990317344666,
"num_tokens": 25685916.0,
"step": 6360
},
{
"entropy": 0.9380621254444123,
"epoch": 1.8740806119446898,
"grad_norm": 0.5882839560508728,
"learning_rate": 6.7658428636517e-05,
"loss": 0.9496076583862305,
"mean_token_accuracy": 0.7798330843448639,
"num_tokens": 25726346.0,
"step": 6370
},
{
"entropy": 0.9862012684345245,
"epoch": 1.8770226537216828,
"grad_norm": 0.614160418510437,
"learning_rate": 6.735171848760753e-05,
"loss": 1.0059632301330566,
"mean_token_accuracy": 0.7649614214897156,
"num_tokens": 25766824.0,
"step": 6380
},
{
"entropy": 0.9311308979988098,
"epoch": 1.8799646954986762,
"grad_norm": 0.6380969882011414,
"learning_rate": 6.704535172351752e-05,
"loss": 0.9215089797973632,
"mean_token_accuracy": 0.7852351903915405,
"num_tokens": 25807230.0,
"step": 6390
},
{
"entropy": 0.9507179379463195,
"epoch": 1.8829067372756692,
"grad_norm": 0.5307073593139648,
"learning_rate": 6.673933156652068e-05,
"loss": 0.9578986167907715,
"mean_token_accuracy": 0.7749875366687775,
"num_tokens": 25847836.0,
"step": 6400
},
{
"epoch": 1.8829067372756692,
"eval_entropy": 0.996097157007753,
"eval_loss": 1.1488333940505981,
"eval_mean_token_accuracy": 0.7494633900211864,
"eval_num_tokens": 25847836.0,
"eval_runtime": 116.9155,
"eval_samples_per_second": 26.044,
"eval_steps_per_second": 3.259,
"step": 6400
},
{
"entropy": 0.9198467969894409,
"epoch": 1.8858487790526626,
"grad_norm": 0.4932635426521301,
"learning_rate": 6.643366123524502e-05,
"loss": 0.9418664932250976,
"mean_token_accuracy": 0.7808252394199371,
"num_tokens": 25887974.0,
"step": 6410
},
{
"entropy": 0.9422998249530792,
"epoch": 1.8887908208296558,
"grad_norm": 0.5207474231719971,
"learning_rate": 6.612834394463936e-05,
"loss": 0.9341909408569335,
"mean_token_accuracy": 0.7823355376720429,
"num_tokens": 25927559.0,
"step": 6420
},
{
"entropy": 1.0029336094856263,
"epoch": 1.891732862606649,
"grad_norm": 0.6492148041725159,
"learning_rate": 6.582338290593918e-05,
"loss": 1.0240444183349608,
"mean_token_accuracy": 0.7613063991069794,
"num_tokens": 25968038.0,
"step": 6430
},
{
"entropy": 0.9833552062511444,
"epoch": 1.8946749043836424,
"grad_norm": 0.590535044670105,
"learning_rate": 6.55187813266332e-05,
"loss": 1.003129005432129,
"mean_token_accuracy": 0.7678163826465607,
"num_tokens": 26008348.0,
"step": 6440
},
{
"entropy": 0.8829509735107421,
"epoch": 1.8976169461606354,
"grad_norm": 0.574521005153656,
"learning_rate": 6.521454241042924e-05,
"loss": 0.8665548324584961,
"mean_token_accuracy": 0.7962438046932221,
"num_tokens": 26048371.0,
"step": 6450
},
{
"entropy": 0.9392775595188141,
"epoch": 1.9005589879376288,
"grad_norm": 0.5355552434921265,
"learning_rate": 6.491066935722091e-05,
"loss": 0.9620846748352051,
"mean_token_accuracy": 0.7788772523403168,
"num_tokens": 26088711.0,
"step": 6460
},
{
"entropy": 1.0055284202098846,
"epoch": 1.903501029714622,
"grad_norm": 0.6064245104789734,
"learning_rate": 6.460716536305371e-05,
"loss": 1.0159719467163086,
"mean_token_accuracy": 0.7620534121990203,
"num_tokens": 26128624.0,
"step": 6470
},
{
"entropy": 0.9730878472328186,
"epoch": 1.9064430714916152,
"grad_norm": 0.5835123062133789,
"learning_rate": 6.430403362009148e-05,
"loss": 0.9895232200622559,
"mean_token_accuracy": 0.7685076415538787,
"num_tokens": 26168887.0,
"step": 6480
},
{
"entropy": 1.0025964260101319,
"epoch": 1.9093851132686084,
"grad_norm": 0.5540186762809753,
"learning_rate": 6.400127731658288e-05,
"loss": 1.0111456871032716,
"mean_token_accuracy": 0.7678469300270081,
"num_tokens": 26209115.0,
"step": 6490
},
{
"entropy": 0.9000465154647828,
"epoch": 1.9123271550456016,
"grad_norm": 0.6538901329040527,
"learning_rate": 6.369889963682775e-05,
"loss": 0.8854584693908691,
"mean_token_accuracy": 0.7922401070594788,
"num_tokens": 26249571.0,
"step": 6500
},
{
"epoch": 1.9123271550456016,
"eval_entropy": 1.0023321302856987,
"eval_loss": 1.144269347190857,
"eval_mean_token_accuracy": 0.750116533494684,
"eval_num_tokens": 26249571.0,
"eval_runtime": 116.9879,
"eval_samples_per_second": 26.028,
"eval_steps_per_second": 3.257,
"step": 6500
},
{
"entropy": 0.8953414797782898,
"epoch": 1.915269196822595,
"grad_norm": 0.653469979763031,
"learning_rate": 6.339690376114376e-05,
"loss": 0.9109439849853516,
"mean_token_accuracy": 0.7873705565929413,
"num_tokens": 26289741.0,
"step": 6510
},
{
"entropy": 0.9372224569320678,
"epoch": 1.918211238599588,
"grad_norm": 0.5968233942985535,
"learning_rate": 6.309529286583277e-05,
"loss": 0.9563371658325195,
"mean_token_accuracy": 0.7744209468364716,
"num_tokens": 26330177.0,
"step": 6520
},
{
"entropy": 0.9607797980308532,
"epoch": 1.9211532803765814,
"grad_norm": 0.5552541613578796,
"learning_rate": 6.279407012314767e-05,
"loss": 0.9554115295410156,
"mean_token_accuracy": 0.7792925059795379,
"num_tokens": 26370862.0,
"step": 6530
},
{
"entropy": 0.907786774635315,
"epoch": 1.9240953221535746,
"grad_norm": 0.5680334568023682,
"learning_rate": 6.24932387012587e-05,
"loss": 0.9195439338684082,
"mean_token_accuracy": 0.7899148881435394,
"num_tokens": 26411179.0,
"step": 6540
},
{
"entropy": 0.9673022449016571,
"epoch": 1.9270373639305678,
"grad_norm": 0.6412580013275146,
"learning_rate": 6.219280176422049e-05,
"loss": 0.954334831237793,
"mean_token_accuracy": 0.7745688319206238,
"num_tokens": 26451456.0,
"step": 6550
},
{
"entropy": 0.87952721118927,
"epoch": 1.9299794057075612,
"grad_norm": 0.5724232792854309,
"learning_rate": 6.189276247193843e-05,
"loss": 0.8992857933044434,
"mean_token_accuracy": 0.7907371282577514,
"num_tokens": 26491838.0,
"step": 6560
},
{
"entropy": 0.9637729346752166,
"epoch": 1.9329214474845542,
"grad_norm": 0.5687930583953857,
"learning_rate": 6.159312398013575e-05,
"loss": 0.9747313499450684,
"mean_token_accuracy": 0.7751651465892792,
"num_tokens": 26532331.0,
"step": 6570
},
{
"entropy": 0.8893602311611175,
"epoch": 1.9358634892615476,
"grad_norm": 0.6103338003158569,
"learning_rate": 6.129388944032013e-05,
"loss": 0.8748321533203125,
"mean_token_accuracy": 0.7961347103118896,
"num_tokens": 26572762.0,
"step": 6580
},
{
"entropy": 0.9709321618080139,
"epoch": 1.9388055310385406,
"grad_norm": 0.657652735710144,
"learning_rate": 6.0995061999750516e-05,
"loss": 0.9933636665344239,
"mean_token_accuracy": 0.7684541165828704,
"num_tokens": 26613213.0,
"step": 6590
},
{
"entropy": 0.8971224725246429,
"epoch": 1.941747572815534,
"grad_norm": 0.5717299580574036,
"learning_rate": 6.069664480140424e-05,
"loss": 0.9132408142089844,
"mean_token_accuracy": 0.7878471255302429,
"num_tokens": 26653598.0,
"step": 6600
},
{
"epoch": 1.941747572815534,
"eval_entropy": 0.9943520688009387,
"eval_loss": 1.1435333490371704,
"eval_mean_token_accuracy": 0.7505803089442216,
"eval_num_tokens": 26653598.0,
"eval_runtime": 117.1237,
"eval_samples_per_second": 25.998,
"eval_steps_per_second": 3.253,
"step": 6600
},
{
"entropy": 0.9399532020092011,
"epoch": 1.9446896145925272,
"grad_norm": 0.6174861788749695,
"learning_rate": 6.0398640983943745e-05,
"loss": 0.9668970108032227,
"mean_token_accuracy": 0.7775733530521393,
"num_tokens": 26693838.0,
"step": 6610
},
{
"entropy": 1.0020717442035676,
"epoch": 1.9476316563695204,
"grad_norm": 0.5993858575820923,
"learning_rate": 6.0101053681683684e-05,
"loss": 0.9971044540405274,
"mean_token_accuracy": 0.7669116199016571,
"num_tokens": 26734003.0,
"step": 6620
},
{
"entropy": 1.0606256008148194,
"epoch": 1.9505736981465138,
"grad_norm": 0.6137609481811523,
"learning_rate": 5.980388602455791e-05,
"loss": 1.0581299781799316,
"mean_token_accuracy": 0.752843850851059,
"num_tokens": 26774676.0,
"step": 6630
},
{
"entropy": 0.9770977616310119,
"epoch": 1.9535157399235068,
"grad_norm": 0.5965011715888977,
"learning_rate": 5.950714113808663e-05,
"loss": 0.9810546875,
"mean_token_accuracy": 0.7688040852546691,
"num_tokens": 26815034.0,
"step": 6640
},
{
"entropy": 0.9412834465503692,
"epoch": 1.9564577817005002,
"grad_norm": 0.5246943235397339,
"learning_rate": 5.921082214334339e-05,
"loss": 0.9711008071899414,
"mean_token_accuracy": 0.7793717920780182,
"num_tokens": 26855573.0,
"step": 6650
},
{
"entropy": 0.9855255424976349,
"epoch": 1.9593998234774934,
"grad_norm": 0.550930917263031,
"learning_rate": 5.891493215692243e-05,
"loss": 0.9931858062744141,
"mean_token_accuracy": 0.7706417441368103,
"num_tokens": 26895609.0,
"step": 6660
},
{
"entropy": 0.9438577890396118,
"epoch": 1.9623418652544866,
"grad_norm": 0.5098885297775269,
"learning_rate": 5.861947429090572e-05,
"loss": 0.9524516105651856,
"mean_token_accuracy": 0.7804840981960297,
"num_tokens": 26936122.0,
"step": 6670
},
{
"entropy": 0.9131796836853028,
"epoch": 1.9652839070314798,
"grad_norm": 0.5577981472015381,
"learning_rate": 5.832445165283038e-05,
"loss": 0.9228861808776856,
"mean_token_accuracy": 0.78643758893013,
"num_tokens": 26976422.0,
"step": 6680
},
{
"entropy": 0.9258975267410279,
"epoch": 1.968225948808473,
"grad_norm": 0.44099509716033936,
"learning_rate": 5.8029867345655885e-05,
"loss": 0.9167799949645996,
"mean_token_accuracy": 0.785828173160553,
"num_tokens": 27016819.0,
"step": 6690
},
{
"entropy": 0.9321108460426331,
"epoch": 1.9711679905854664,
"grad_norm": 0.6357897520065308,
"learning_rate": 5.773572446773157e-05,
"loss": 0.9454483032226563,
"mean_token_accuracy": 0.7806357145309448,
"num_tokens": 27057488.0,
"step": 6700
},
{
"epoch": 1.9711679905854664,
"eval_entropy": 0.9999957291942256,
"eval_loss": 1.140816330909729,
"eval_mean_token_accuracy": 0.7509619468466191,
"eval_num_tokens": 27057488.0,
"eval_runtime": 116.9853,
"eval_samples_per_second": 26.029,
"eval_steps_per_second": 3.257,
"step": 6700
},
{
"entropy": 0.9509491920471191,
"epoch": 1.9741100323624594,
"grad_norm": 0.6006605625152588,
"learning_rate": 5.744202611276379e-05,
"loss": 0.9503057479858399,
"mean_token_accuracy": 0.7786516189575196,
"num_tokens": 27097949.0,
"step": 6710
},
{
"entropy": 1.0138035595417023,
"epoch": 1.9770520741394528,
"grad_norm": 0.5902991890907288,
"learning_rate": 5.7148775369783694e-05,
"loss": 1.0296749114990233,
"mean_token_accuracy": 0.7590757310390472,
"num_tokens": 27138453.0,
"step": 6720
},
{
"entropy": 0.933520519733429,
"epoch": 1.979994115916446,
"grad_norm": 0.5484936833381653,
"learning_rate": 5.685597532311455e-05,
"loss": 0.957374095916748,
"mean_token_accuracy": 0.7793904483318329,
"num_tokens": 27178805.0,
"step": 6730
},
{
"entropy": 0.9440324783325196,
"epoch": 1.9829361576934392,
"grad_norm": 0.5826029777526855,
"learning_rate": 5.656362905233923e-05,
"loss": 0.9262220382690429,
"mean_token_accuracy": 0.7845340669155121,
"num_tokens": 27219347.0,
"step": 6740
},
{
"entropy": 0.9071877419948577,
"epoch": 1.9858781994704324,
"grad_norm": 0.5721964836120605,
"learning_rate": 5.6271739632268094e-05,
"loss": 0.9060114860534668,
"mean_token_accuracy": 0.7890908360481262,
"num_tokens": 27258890.0,
"step": 6750
},
{
"entropy": 0.9562793612480164,
"epoch": 1.9888202412474256,
"grad_norm": 0.614380955696106,
"learning_rate": 5.598031013290631e-05,
"loss": 0.9876157760620117,
"mean_token_accuracy": 0.768429833650589,
"num_tokens": 27299053.0,
"step": 6760
},
{
"entropy": 0.9924969553947449,
"epoch": 1.991762283024419,
"grad_norm": 0.6030513644218445,
"learning_rate": 5.5689343619421906e-05,
"loss": 0.9977625846862793,
"mean_token_accuracy": 0.7658666670322418,
"num_tokens": 27339515.0,
"step": 6770
},
{
"entropy": 0.9534170269966126,
"epoch": 1.994704324801412,
"grad_norm": 0.5039950609207153,
"learning_rate": 5.539884315211321e-05,
"loss": 0.9545814514160156,
"mean_token_accuracy": 0.7779964745044708,
"num_tokens": 27379693.0,
"step": 6780
},
{
"entropy": 0.9789716601371765,
"epoch": 1.9976463665784054,
"grad_norm": 0.5822030305862427,
"learning_rate": 5.5108811786376925e-05,
"loss": 0.9928366661071777,
"mean_token_accuracy": 0.7682704031467438,
"num_tokens": 27419734.0,
"step": 6790
},
{
"entropy": 0.915216040611267,
"epoch": 2.000588408355399,
"grad_norm": 0.4654218554496765,
"learning_rate": 5.481925257267589e-05,
"loss": 0.8871613502502441,
"mean_token_accuracy": 0.7920856356620789,
"num_tokens": 27458303.0,
"step": 6800
},
{
"epoch": 2.000588408355399,
"eval_entropy": 0.9942471109663095,
"eval_loss": 1.1395292282104492,
"eval_mean_token_accuracy": 0.7511763375575148,
"eval_num_tokens": 27458303.0,
"eval_runtime": 116.8845,
"eval_samples_per_second": 26.051,
"eval_steps_per_second": 3.26,
"step": 6800
},
{
"entropy": 0.7543269693851471,
"epoch": 2.003530450132392,
"grad_norm": 0.6209985613822937,
"learning_rate": 5.4530168556506875e-05,
"loss": 0.6749869823455811,
"mean_token_accuracy": 0.8347735464572906,
"num_tokens": 27498607.0,
"step": 6810
},
{
"entropy": 0.6835850536823272,
"epoch": 2.0064724919093853,
"grad_norm": 0.781541109085083,
"learning_rate": 5.424156277836881e-05,
"loss": 0.6951170921325683,
"mean_token_accuracy": 0.8288436651229858,
"num_tokens": 27538904.0,
"step": 6820
},
{
"entropy": 0.6437631964683532,
"epoch": 2.0094145336863782,
"grad_norm": 0.8998324871063232,
"learning_rate": 5.395343827373053e-05,
"loss": 0.6296420574188233,
"mean_token_accuracy": 0.8461188077926636,
"num_tokens": 27579223.0,
"step": 6830
},
{
"entropy": 0.6127074956893921,
"epoch": 2.0123565754633717,
"grad_norm": 0.6167740225791931,
"learning_rate": 5.366579807299909e-05,
"loss": 0.5965664386749268,
"mean_token_accuracy": 0.850104957818985,
"num_tokens": 27619638.0,
"step": 6840
},
{
"entropy": 0.6964607417583466,
"epoch": 2.0152986172403646,
"grad_norm": 0.637476921081543,
"learning_rate": 5.337864520148768e-05,
"loss": 0.6968545913696289,
"mean_token_accuracy": 0.8300110459327698,
"num_tokens": 27660158.0,
"step": 6850
},
{
"entropy": 0.6738093435764313,
"epoch": 2.018240659017358,
"grad_norm": 0.7894798517227173,
"learning_rate": 5.309198267938402e-05,
"loss": 0.6670093059539794,
"mean_token_accuracy": 0.8377935826778412,
"num_tokens": 27700212.0,
"step": 6860
},
{
"entropy": 0.6280623555183411,
"epoch": 2.0211827007943515,
"grad_norm": 0.80244380235672,
"learning_rate": 5.280581352171836e-05,
"loss": 0.6267249107360839,
"mean_token_accuracy": 0.8437743067741394,
"num_tokens": 27740554.0,
"step": 6870
},
{
"entropy": 0.6882079899311065,
"epoch": 2.0241247425713444,
"grad_norm": 0.7488958835601807,
"learning_rate": 5.2520140738332025e-05,
"loss": 0.6897297382354737,
"mean_token_accuracy": 0.8309988558292389,
"num_tokens": 27781034.0,
"step": 6880
},
{
"entropy": 0.676528149843216,
"epoch": 2.027066784348338,
"grad_norm": 0.8301676511764526,
"learning_rate": 5.2234967333845466e-05,
"loss": 0.6622447490692138,
"mean_token_accuracy": 0.8345989942550659,
"num_tokens": 27821579.0,
"step": 6890
},
{
"entropy": 0.6388787865638733,
"epoch": 2.030008826125331,
"grad_norm": 0.7029614448547363,
"learning_rate": 5.1950296307626956e-05,
"loss": 0.6487605571746826,
"mean_token_accuracy": 0.8400563955307007,
"num_tokens": 27861899.0,
"step": 6900
},
{
"epoch": 2.030008826125331,
"eval_entropy": 0.8397969613707285,
"eval_loss": 1.2236672639846802,
"eval_mean_token_accuracy": 0.7469185830101254,
"eval_num_tokens": 27861899.0,
"eval_runtime": 116.8259,
"eval_samples_per_second": 26.064,
"eval_steps_per_second": 3.261,
"step": 6900
}
],
"logging_steps": 10,
"max_steps": 10197,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2944070017481708e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}