code-1b-instruct / last-checkpoint /trainer_state.json
rovdetection's picture
Training in progress, step 5000, checkpoint
a83e93d verified
Raw
History Blame Contribute Delete
148 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.591446378680422,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.4190640166401862,
"epoch": 0.017193208682570384,
"grad_norm": 0.44122377038002014,
"learning_rate": 0.00019964,
"loss": 4.624512481689453,
"mean_token_accuracy": 0.48470579609274866,
"num_tokens": 59125.0,
"step": 10
},
{
"entropy": 1.3190652154386044,
"epoch": 0.03438641736514077,
"grad_norm": 0.5728761553764343,
"learning_rate": 0.00019924,
"loss": 4.138050842285156,
"mean_token_accuracy": 0.5170843195170164,
"num_tokens": 121732.0,
"step": 20
},
{
"entropy": 1.899017508327961,
"epoch": 0.05157962604771115,
"grad_norm": 0.5385442972183228,
"learning_rate": 0.00019884000000000001,
"loss": 3.410753631591797,
"mean_token_accuracy": 0.5273028708994388,
"num_tokens": 180496.0,
"step": 30
},
{
"entropy": 2.605507677793503,
"epoch": 0.06877283473028153,
"grad_norm": 0.39409318566322327,
"learning_rate": 0.00019844,
"loss": 3.210155487060547,
"mean_token_accuracy": 0.5257176972925663,
"num_tokens": 239996.0,
"step": 40
},
{
"entropy": 2.37664245814085,
"epoch": 0.08596604341285193,
"grad_norm": 0.32786861062049866,
"learning_rate": 0.00019804,
"loss": 2.821852111816406,
"mean_token_accuracy": 0.5590792961418629,
"num_tokens": 298043.0,
"step": 50
},
{
"entropy": 2.395954003930092,
"epoch": 0.1031592520954223,
"grad_norm": 0.3047294616699219,
"learning_rate": 0.00019764,
"loss": 2.7597124099731447,
"mean_token_accuracy": 0.5563199404627085,
"num_tokens": 355876.0,
"step": 60
},
{
"entropy": 2.411456014215946,
"epoch": 0.12035246077799269,
"grad_norm": 0.3513432741165161,
"learning_rate": 0.00019724,
"loss": 2.7809066772460938,
"mean_token_accuracy": 0.5538371551781893,
"num_tokens": 415356.0,
"step": 70
},
{
"entropy": 2.396346817910671,
"epoch": 0.13754566946056307,
"grad_norm": 0.32153311371803284,
"learning_rate": 0.00019684,
"loss": 2.673196029663086,
"mean_token_accuracy": 0.5653545051813126,
"num_tokens": 472895.0,
"step": 80
},
{
"entropy": 2.4005219876766204,
"epoch": 0.15473887814313347,
"grad_norm": 0.37381669878959656,
"learning_rate": 0.00019644,
"loss": 2.6783329010009767,
"mean_token_accuracy": 0.565613779053092,
"num_tokens": 531787.0,
"step": 90
},
{
"entropy": 2.291259491443634,
"epoch": 0.17193208682570385,
"grad_norm": 0.3424990475177765,
"learning_rate": 0.00019604,
"loss": 2.518219757080078,
"mean_token_accuracy": 0.5815062165260315,
"num_tokens": 591947.0,
"step": 100
},
{
"entropy": 2.3215643003582955,
"epoch": 0.18912529550827423,
"grad_norm": 0.37565672397613525,
"learning_rate": 0.00019564,
"loss": 2.5377925872802733,
"mean_token_accuracy": 0.5835155539214611,
"num_tokens": 649619.0,
"step": 110
},
{
"entropy": 2.4258039399981497,
"epoch": 0.2063185041908446,
"grad_norm": 0.4689179062843323,
"learning_rate": 0.00019524,
"loss": 2.6080394744873048,
"mean_token_accuracy": 0.5646311499178409,
"num_tokens": 707340.0,
"step": 120
},
{
"entropy": 2.317111870646477,
"epoch": 0.223511712873415,
"grad_norm": 0.3885783851146698,
"learning_rate": 0.00019484,
"loss": 2.477190399169922,
"mean_token_accuracy": 0.5836748830974102,
"num_tokens": 769505.0,
"step": 130
},
{
"entropy": 2.3055995970964434,
"epoch": 0.24070492155598538,
"grad_norm": 0.43443477153778076,
"learning_rate": 0.00019444,
"loss": 2.4609752655029298,
"mean_token_accuracy": 0.5811029966920614,
"num_tokens": 829367.0,
"step": 140
},
{
"entropy": 2.2776345878839495,
"epoch": 0.2578981302385558,
"grad_norm": 0.47762158513069153,
"learning_rate": 0.00019404,
"loss": 2.4081003189086916,
"mean_token_accuracy": 0.5853929311037064,
"num_tokens": 886215.0,
"step": 150
},
{
"entropy": 2.2963428094983103,
"epoch": 0.27509133892112614,
"grad_norm": 0.46731501817703247,
"learning_rate": 0.00019364,
"loss": 2.4675086975097655,
"mean_token_accuracy": 0.5888176921755075,
"num_tokens": 945101.0,
"step": 160
},
{
"entropy": 2.2366374909877775,
"epoch": 0.29228454760369654,
"grad_norm": 0.4367876350879669,
"learning_rate": 0.00019323999999999999,
"loss": 2.3875659942626952,
"mean_token_accuracy": 0.5896096613258124,
"num_tokens": 1004603.0,
"step": 170
},
{
"entropy": 2.254362703859806,
"epoch": 0.30947775628626695,
"grad_norm": 0.4333887994289398,
"learning_rate": 0.00019284,
"loss": 2.4054040908813477,
"mean_token_accuracy": 0.5891438674181699,
"num_tokens": 1067233.0,
"step": 180
},
{
"entropy": 2.2392648085951805,
"epoch": 0.3266709649688373,
"grad_norm": 0.45199301838874817,
"learning_rate": 0.00019244000000000002,
"loss": 2.3508541107177736,
"mean_token_accuracy": 0.591214832291007,
"num_tokens": 1124472.0,
"step": 190
},
{
"entropy": 2.149501931667328,
"epoch": 0.3438641736514077,
"grad_norm": 0.44520440697669983,
"learning_rate": 0.00019204,
"loss": 2.281546783447266,
"mean_token_accuracy": 0.5986809592694045,
"num_tokens": 1180382.0,
"step": 200
},
{
"entropy": 2.2454783216118814,
"epoch": 0.3610573823339781,
"grad_norm": 0.5927078723907471,
"learning_rate": 0.00019164000000000003,
"loss": 2.4131927490234375,
"mean_token_accuracy": 0.5888259880244732,
"num_tokens": 1240228.0,
"step": 210
},
{
"entropy": 2.2189586043357847,
"epoch": 0.37825059101654845,
"grad_norm": 0.4634048044681549,
"learning_rate": 0.00019124000000000002,
"loss": 2.336981201171875,
"mean_token_accuracy": 0.5954784743487835,
"num_tokens": 1300889.0,
"step": 220
},
{
"entropy": 2.317907977104187,
"epoch": 0.39544379969911886,
"grad_norm": 0.492512971162796,
"learning_rate": 0.00019084,
"loss": 2.4568838119506835,
"mean_token_accuracy": 0.5841229122132063,
"num_tokens": 1357396.0,
"step": 230
},
{
"entropy": 2.2429257184267044,
"epoch": 0.4126370083816892,
"grad_norm": 0.515352725982666,
"learning_rate": 0.00019044000000000003,
"loss": 2.3383663177490233,
"mean_token_accuracy": 0.5922708168625832,
"num_tokens": 1417505.0,
"step": 240
},
{
"entropy": 2.2210252806544304,
"epoch": 0.4298302170642596,
"grad_norm": 0.4831322133541107,
"learning_rate": 0.00019004000000000002,
"loss": 2.4192705154418945,
"mean_token_accuracy": 0.5948628049343825,
"num_tokens": 1478753.0,
"step": 250
},
{
"entropy": 2.274476508796215,
"epoch": 0.44702342574683,
"grad_norm": 0.49397456645965576,
"learning_rate": 0.00018964,
"loss": 2.3850372314453123,
"mean_token_accuracy": 0.5882141895592212,
"num_tokens": 1540514.0,
"step": 260
},
{
"entropy": 2.2252144277095796,
"epoch": 0.46421663442940037,
"grad_norm": 0.48239603638648987,
"learning_rate": 0.00018924000000000002,
"loss": 2.3765859603881836,
"mean_token_accuracy": 0.597905408218503,
"num_tokens": 1602363.0,
"step": 270
},
{
"entropy": 2.2030586138367654,
"epoch": 0.48140984311197077,
"grad_norm": 0.47931915521621704,
"learning_rate": 0.00018884000000000001,
"loss": 2.3378921508789063,
"mean_token_accuracy": 0.5946248725056649,
"num_tokens": 1663013.0,
"step": 280
},
{
"entropy": 2.193090632557869,
"epoch": 0.4986030517945412,
"grad_norm": 0.4267734885215759,
"learning_rate": 0.00018844,
"loss": 2.301167106628418,
"mean_token_accuracy": 0.600122318789363,
"num_tokens": 1724937.0,
"step": 290
},
{
"entropy": 2.2401370778679848,
"epoch": 0.5157962604771116,
"grad_norm": 0.49861446022987366,
"learning_rate": 0.00018804000000000002,
"loss": 2.3409378051757814,
"mean_token_accuracy": 0.5942602109163999,
"num_tokens": 1782528.0,
"step": 300
},
{
"entropy": 2.2543238058686255,
"epoch": 0.5329894691596819,
"grad_norm": 0.5549066066741943,
"learning_rate": 0.00018764,
"loss": 2.397447967529297,
"mean_token_accuracy": 0.5927667014300824,
"num_tokens": 1841865.0,
"step": 310
},
{
"entropy": 2.1446528255939485,
"epoch": 0.5501826778422523,
"grad_norm": 0.6057388782501221,
"learning_rate": 0.00018724,
"loss": 2.2854946136474608,
"mean_token_accuracy": 0.5992342013865709,
"num_tokens": 1901375.0,
"step": 320
},
{
"entropy": 2.2384325101971627,
"epoch": 0.5673758865248227,
"grad_norm": 0.6120573282241821,
"learning_rate": 0.00018684000000000002,
"loss": 2.3560409545898438,
"mean_token_accuracy": 0.5913927119225264,
"num_tokens": 1959650.0,
"step": 330
},
{
"entropy": 2.103813698887825,
"epoch": 0.5845690952073931,
"grad_norm": 0.5133985280990601,
"learning_rate": 0.00018644,
"loss": 2.2106700897216798,
"mean_token_accuracy": 0.6131400100886821,
"num_tokens": 2020031.0,
"step": 340
},
{
"entropy": 2.11156629472971,
"epoch": 0.6017623038899634,
"grad_norm": 0.4958188235759735,
"learning_rate": 0.00018604,
"loss": 2.2597396850585936,
"mean_token_accuracy": 0.6059262081980705,
"num_tokens": 2080034.0,
"step": 350
},
{
"entropy": 2.2104950502514837,
"epoch": 0.6189555125725339,
"grad_norm": 0.5096895098686218,
"learning_rate": 0.00018564000000000002,
"loss": 2.271474075317383,
"mean_token_accuracy": 0.597117318212986,
"num_tokens": 2137629.0,
"step": 360
},
{
"entropy": 2.134429484605789,
"epoch": 0.6361487212551042,
"grad_norm": 0.5682357549667358,
"learning_rate": 0.00018524,
"loss": 2.237981605529785,
"mean_token_accuracy": 0.6059913612902165,
"num_tokens": 2196125.0,
"step": 370
},
{
"entropy": 2.151672175526619,
"epoch": 0.6533419299376746,
"grad_norm": 0.5488378405570984,
"learning_rate": 0.00018484000000000003,
"loss": 2.264353942871094,
"mean_token_accuracy": 0.5990234814584255,
"num_tokens": 2256258.0,
"step": 380
},
{
"entropy": 2.1240685641765595,
"epoch": 0.670535138620245,
"grad_norm": 0.5736668705940247,
"learning_rate": 0.00018444000000000002,
"loss": 2.2498214721679686,
"mean_token_accuracy": 0.6068044692277909,
"num_tokens": 2317743.0,
"step": 390
},
{
"entropy": 2.110594576597214,
"epoch": 0.6877283473028154,
"grad_norm": 0.5833790302276611,
"learning_rate": 0.00018404,
"loss": 2.222176361083984,
"mean_token_accuracy": 0.6128960818052291,
"num_tokens": 2381669.0,
"step": 400
},
{
"entropy": 2.1233398094773293,
"epoch": 0.7049215559853858,
"grad_norm": 0.5612857937812805,
"learning_rate": 0.00018364000000000002,
"loss": 2.2054920196533203,
"mean_token_accuracy": 0.6041553311049939,
"num_tokens": 2439094.0,
"step": 410
},
{
"entropy": 2.1500213012099265,
"epoch": 0.7221147646679562,
"grad_norm": 0.5670902132987976,
"learning_rate": 0.00018324000000000001,
"loss": 2.2393463134765623,
"mean_token_accuracy": 0.6055185578763485,
"num_tokens": 2498246.0,
"step": 420
},
{
"entropy": 2.03703038841486,
"epoch": 0.7393079733505266,
"grad_norm": 0.6171953082084656,
"learning_rate": 0.00018284,
"loss": 2.129292678833008,
"mean_token_accuracy": 0.6159544993191958,
"num_tokens": 2557403.0,
"step": 430
},
{
"entropy": 2.0861524820327757,
"epoch": 0.7565011820330969,
"grad_norm": 0.5753834247589111,
"learning_rate": 0.00018244000000000002,
"loss": 2.189851760864258,
"mean_token_accuracy": 0.6077463660389185,
"num_tokens": 2614912.0,
"step": 440
},
{
"entropy": 2.127491444349289,
"epoch": 0.7736943907156673,
"grad_norm": 0.5808554291725159,
"learning_rate": 0.00018204,
"loss": 2.2198902130126954,
"mean_token_accuracy": 0.6067756544798613,
"num_tokens": 2673278.0,
"step": 450
},
{
"entropy": 2.10256717056036,
"epoch": 0.7908875993982377,
"grad_norm": 0.6682887077331543,
"learning_rate": 0.00018164,
"loss": 2.1562450408935545,
"mean_token_accuracy": 0.6123722370713949,
"num_tokens": 2733477.0,
"step": 460
},
{
"entropy": 2.0725875943899155,
"epoch": 0.8080808080808081,
"grad_norm": 0.5611984729766846,
"learning_rate": 0.00018124000000000002,
"loss": 2.1555084228515624,
"mean_token_accuracy": 0.6104017984122038,
"num_tokens": 2793130.0,
"step": 470
},
{
"entropy": 2.0911933913826943,
"epoch": 0.8252740167633784,
"grad_norm": 0.4832659661769867,
"learning_rate": 0.00018084,
"loss": 2.2131584167480467,
"mean_token_accuracy": 0.6108913067728281,
"num_tokens": 2856643.0,
"step": 480
},
{
"entropy": 2.089042477309704,
"epoch": 0.8424672254459489,
"grad_norm": 0.6528189182281494,
"learning_rate": 0.00018044,
"loss": 2.222637939453125,
"mean_token_accuracy": 0.6140136975795031,
"num_tokens": 2918669.0,
"step": 490
},
{
"entropy": 2.177516208589077,
"epoch": 0.8596604341285192,
"grad_norm": 0.511565089225769,
"learning_rate": 0.00018004000000000002,
"loss": 2.2838220596313477,
"mean_token_accuracy": 0.6013776436448097,
"num_tokens": 2977952.0,
"step": 500
},
{
"entropy": 2.0694020837545395,
"epoch": 0.8768536428110896,
"grad_norm": 0.6112110614776611,
"learning_rate": 0.00017964,
"loss": 2.1932716369628906,
"mean_token_accuracy": 0.6094281867146492,
"num_tokens": 3039858.0,
"step": 510
},
{
"entropy": 2.074494983255863,
"epoch": 0.89404685149366,
"grad_norm": 0.6264903545379639,
"learning_rate": 0.00017924,
"loss": 2.1746356964111326,
"mean_token_accuracy": 0.6137691352516412,
"num_tokens": 3095095.0,
"step": 520
},
{
"entropy": 2.1704643085598945,
"epoch": 0.9112400601762304,
"grad_norm": 0.5588786602020264,
"learning_rate": 0.00017884000000000002,
"loss": 2.256443977355957,
"mean_token_accuracy": 0.6048448126763105,
"num_tokens": 3154541.0,
"step": 530
},
{
"entropy": 2.0287919655442237,
"epoch": 0.9284332688588007,
"grad_norm": 0.5964768528938293,
"learning_rate": 0.00017844,
"loss": 2.1472768783569336,
"mean_token_accuracy": 0.6183896280825139,
"num_tokens": 3213587.0,
"step": 540
},
{
"entropy": 2.0924746826291085,
"epoch": 0.9456264775413712,
"grad_norm": 0.6021146774291992,
"learning_rate": 0.00017804,
"loss": 2.194413757324219,
"mean_token_accuracy": 0.6094567842781544,
"num_tokens": 3270420.0,
"step": 550
},
{
"entropy": 2.2008628591895105,
"epoch": 0.9628196862239415,
"grad_norm": 0.5850458741188049,
"learning_rate": 0.00017764000000000001,
"loss": 2.325449752807617,
"mean_token_accuracy": 0.5982100035995245,
"num_tokens": 3329358.0,
"step": 560
},
{
"entropy": 2.065661500394344,
"epoch": 0.9800128949065119,
"grad_norm": 0.5808996558189392,
"learning_rate": 0.00017724,
"loss": 2.197132873535156,
"mean_token_accuracy": 0.614894449710846,
"num_tokens": 3389408.0,
"step": 570
},
{
"entropy": 2.0561472952365873,
"epoch": 0.9972061035890823,
"grad_norm": 0.5550947189331055,
"learning_rate": 0.00017684,
"loss": 2.1736166000366213,
"mean_token_accuracy": 0.6167749039828777,
"num_tokens": 3448108.0,
"step": 580
},
{
"entropy": 2.017703249857023,
"epoch": 1.0137545669460564,
"grad_norm": 0.5885869860649109,
"learning_rate": 0.00017644,
"loss": 2.093535614013672,
"mean_token_accuracy": 0.623307110427262,
"num_tokens": 3510556.0,
"step": 590
},
{
"entropy": 2.0724975898861886,
"epoch": 1.0309477756286267,
"grad_norm": 0.6038488745689392,
"learning_rate": 0.00017604,
"loss": 2.1512643814086916,
"mean_token_accuracy": 0.6218964882194996,
"num_tokens": 3572796.0,
"step": 600
},
{
"entropy": 2.0171437337994576,
"epoch": 1.048140984311197,
"grad_norm": 0.5356580018997192,
"learning_rate": 0.00017564,
"loss": 2.1061470031738283,
"mean_token_accuracy": 0.6229204799979925,
"num_tokens": 3636040.0,
"step": 610
},
{
"entropy": 2.069349633157253,
"epoch": 1.0653341929937674,
"grad_norm": 0.6440969109535217,
"learning_rate": 0.00017524,
"loss": 2.1538244247436524,
"mean_token_accuracy": 0.6151916943490505,
"num_tokens": 3692494.0,
"step": 620
},
{
"entropy": 1.9605562821030618,
"epoch": 1.0825274016763378,
"grad_norm": 0.6160545349121094,
"learning_rate": 0.00017484,
"loss": 2.036081314086914,
"mean_token_accuracy": 0.6326734255999327,
"num_tokens": 3750269.0,
"step": 630
},
{
"entropy": 1.9889371052384377,
"epoch": 1.0997206103589083,
"grad_norm": 0.6164005398750305,
"learning_rate": 0.00017444,
"loss": 2.051021194458008,
"mean_token_accuracy": 0.6301001563668251,
"num_tokens": 3808993.0,
"step": 640
},
{
"entropy": 2.1208505019545556,
"epoch": 1.1169138190414787,
"grad_norm": 0.5549023151397705,
"learning_rate": 0.00017404,
"loss": 2.2218536376953124,
"mean_token_accuracy": 0.6127480801194907,
"num_tokens": 3875366.0,
"step": 650
},
{
"entropy": 2.0680324912071226,
"epoch": 1.134107027724049,
"grad_norm": 0.6039656400680542,
"learning_rate": 0.00017364,
"loss": 2.1767093658447267,
"mean_token_accuracy": 0.61400815397501,
"num_tokens": 3936258.0,
"step": 660
},
{
"entropy": 2.072723534703255,
"epoch": 1.1513002364066194,
"grad_norm": 0.5685736536979675,
"learning_rate": 0.00017324000000000002,
"loss": 2.16034049987793,
"mean_token_accuracy": 0.6158209484070539,
"num_tokens": 3996786.0,
"step": 670
},
{
"entropy": 2.0064551383256912,
"epoch": 1.1684934450891897,
"grad_norm": 0.6133168935775757,
"learning_rate": 0.00017284,
"loss": 2.0968536376953124,
"mean_token_accuracy": 0.6204963065683842,
"num_tokens": 4057237.0,
"step": 680
},
{
"entropy": 1.999112318456173,
"epoch": 1.18568665377176,
"grad_norm": 0.6392707228660583,
"learning_rate": 0.00017244,
"loss": 2.1093589782714846,
"mean_token_accuracy": 0.6260342009365558,
"num_tokens": 4115641.0,
"step": 690
},
{
"entropy": 1.9792790532112121,
"epoch": 1.2028798624543304,
"grad_norm": 0.5575782656669617,
"learning_rate": 0.00017204,
"loss": 2.097017288208008,
"mean_token_accuracy": 0.6311597619205713,
"num_tokens": 4179186.0,
"step": 700
},
{
"entropy": 2.0830755099654197,
"epoch": 1.220073071136901,
"grad_norm": 0.5660914182662964,
"learning_rate": 0.00017164,
"loss": 2.1811811447143556,
"mean_token_accuracy": 0.6163320489227772,
"num_tokens": 4239862.0,
"step": 710
},
{
"entropy": 2.0135093346238135,
"epoch": 1.2372662798194713,
"grad_norm": 0.535127580165863,
"learning_rate": 0.00017124,
"loss": 2.1185089111328126,
"mean_token_accuracy": 0.6227246847003698,
"num_tokens": 4299020.0,
"step": 720
},
{
"entropy": 2.0245131298899652,
"epoch": 1.2544594885020417,
"grad_norm": 0.5870150327682495,
"learning_rate": 0.00017084,
"loss": 2.1228567123413087,
"mean_token_accuracy": 0.620493221282959,
"num_tokens": 4357565.0,
"step": 730
},
{
"entropy": 2.0833021104335785,
"epoch": 1.271652697184612,
"grad_norm": 0.6691998243331909,
"learning_rate": 0.00017044,
"loss": 2.1688915252685548,
"mean_token_accuracy": 0.6160433337092399,
"num_tokens": 4416113.0,
"step": 740
},
{
"entropy": 1.9562881767749787,
"epoch": 1.2888459058671824,
"grad_norm": 0.5513840317726135,
"learning_rate": 0.00017004,
"loss": 2.038709831237793,
"mean_token_accuracy": 0.6274638958275318,
"num_tokens": 4479913.0,
"step": 750
},
{
"entropy": 2.0899336978793146,
"epoch": 1.306039114549753,
"grad_norm": 0.6334884166717529,
"learning_rate": 0.00016964,
"loss": 2.1464771270751952,
"mean_token_accuracy": 0.6156820185482502,
"num_tokens": 4538630.0,
"step": 760
},
{
"entropy": 2.006402041018009,
"epoch": 1.3232323232323233,
"grad_norm": 0.6608359813690186,
"learning_rate": 0.00016924,
"loss": 2.0910036087036135,
"mean_token_accuracy": 0.6266825262457132,
"num_tokens": 4596432.0,
"step": 770
},
{
"entropy": 1.981143780052662,
"epoch": 1.3404255319148937,
"grad_norm": 0.6512318849563599,
"learning_rate": 0.00016884,
"loss": 2.0733022689819336,
"mean_token_accuracy": 0.6271612212061882,
"num_tokens": 4653388.0,
"step": 780
},
{
"entropy": 2.046033799648285,
"epoch": 1.357618740597464,
"grad_norm": 0.657543957233429,
"learning_rate": 0.00016844,
"loss": 2.122422790527344,
"mean_token_accuracy": 0.6139081876724959,
"num_tokens": 4707749.0,
"step": 790
},
{
"entropy": 2.0472232535481454,
"epoch": 1.3748119492800344,
"grad_norm": 0.5705983638763428,
"learning_rate": 0.00016804,
"loss": 2.1095462799072267,
"mean_token_accuracy": 0.6147159416228533,
"num_tokens": 4768198.0,
"step": 800
},
{
"entropy": 2.0032111018896104,
"epoch": 1.3920051579626047,
"grad_norm": 0.6248787045478821,
"learning_rate": 0.00016764,
"loss": 2.06416015625,
"mean_token_accuracy": 0.626778207719326,
"num_tokens": 4830049.0,
"step": 810
},
{
"entropy": 2.008875849843025,
"epoch": 1.409198366645175,
"grad_norm": 0.6245584487915039,
"learning_rate": 0.00016724000000000003,
"loss": 2.096910285949707,
"mean_token_accuracy": 0.626385198161006,
"num_tokens": 4887278.0,
"step": 820
},
{
"entropy": 2.080148513615131,
"epoch": 1.4263915753277456,
"grad_norm": 0.6255568861961365,
"learning_rate": 0.00016684000000000002,
"loss": 2.1830982208251952,
"mean_token_accuracy": 0.6152387134730816,
"num_tokens": 4946401.0,
"step": 830
},
{
"entropy": 1.999508222937584,
"epoch": 1.443584784010316,
"grad_norm": 0.6275898218154907,
"learning_rate": 0.00016644,
"loss": 2.1005062103271483,
"mean_token_accuracy": 0.6225087266415358,
"num_tokens": 5007769.0,
"step": 840
},
{
"entropy": 1.937928880751133,
"epoch": 1.4607779926928863,
"grad_norm": 0.5719529986381531,
"learning_rate": 0.00016604000000000003,
"loss": 2.026857376098633,
"mean_token_accuracy": 0.6374159537255764,
"num_tokens": 5065155.0,
"step": 850
},
{
"entropy": 2.0230891808867453,
"epoch": 1.4779712013754567,
"grad_norm": 0.6000656485557556,
"learning_rate": 0.00016564000000000002,
"loss": 2.1284557342529298,
"mean_token_accuracy": 0.6221452355384827,
"num_tokens": 5127964.0,
"step": 860
},
{
"entropy": 2.0356423661112784,
"epoch": 1.495164410058027,
"grad_norm": 0.6452302932739258,
"learning_rate": 0.00016524,
"loss": 2.136619758605957,
"mean_token_accuracy": 0.6180427376180887,
"num_tokens": 5186867.0,
"step": 870
},
{
"entropy": 1.980474129319191,
"epoch": 1.5123576187405976,
"grad_norm": 0.6840422749519348,
"learning_rate": 0.00016484000000000003,
"loss": 2.1068920135498046,
"mean_token_accuracy": 0.6276284489780665,
"num_tokens": 5243496.0,
"step": 880
},
{
"entropy": 1.9933902084827424,
"epoch": 1.5295508274231677,
"grad_norm": 0.6497262716293335,
"learning_rate": 0.00016444000000000002,
"loss": 2.0951356887817383,
"mean_token_accuracy": 0.625996507704258,
"num_tokens": 5305224.0,
"step": 890
},
{
"entropy": 1.9787120044231414,
"epoch": 1.5467440361057383,
"grad_norm": 0.5943708419799805,
"learning_rate": 0.00016404,
"loss": 2.050846481323242,
"mean_token_accuracy": 0.6277968011796474,
"num_tokens": 5362047.0,
"step": 900
},
{
"entropy": 1.9207568421959877,
"epoch": 1.5639372447883086,
"grad_norm": 0.7813357710838318,
"learning_rate": 0.00016364000000000003,
"loss": 2.01483097076416,
"mean_token_accuracy": 0.6335461936891079,
"num_tokens": 5418646.0,
"step": 910
},
{
"entropy": 1.991297036409378,
"epoch": 1.581130453470879,
"grad_norm": 0.6333187818527222,
"learning_rate": 0.00016324000000000002,
"loss": 2.042116165161133,
"mean_token_accuracy": 0.6277558326721191,
"num_tokens": 5474990.0,
"step": 920
},
{
"entropy": 2.049289306998253,
"epoch": 1.5983236621534493,
"grad_norm": 0.6549156308174133,
"learning_rate": 0.00016284,
"loss": 2.160791778564453,
"mean_token_accuracy": 0.6178662430495023,
"num_tokens": 5536798.0,
"step": 930
},
{
"entropy": 2.0044725999236106,
"epoch": 1.6155168708360197,
"grad_norm": 0.7054678201675415,
"learning_rate": 0.00016244000000000002,
"loss": 2.1012857437133787,
"mean_token_accuracy": 0.6249351866543293,
"num_tokens": 5592825.0,
"step": 940
},
{
"entropy": 1.8883072763681412,
"epoch": 1.6327100795185903,
"grad_norm": 0.6150692701339722,
"learning_rate": 0.00016204000000000001,
"loss": 1.933417510986328,
"mean_token_accuracy": 0.6382322389632463,
"num_tokens": 5647293.0,
"step": 950
},
{
"entropy": 1.9773345232009887,
"epoch": 1.6499032882011604,
"grad_norm": 0.6604284048080444,
"learning_rate": 0.00016164,
"loss": 2.0537574768066404,
"mean_token_accuracy": 0.6284147780388594,
"num_tokens": 5707851.0,
"step": 960
},
{
"entropy": 1.9602369159460067,
"epoch": 1.667096496883731,
"grad_norm": 0.5877639055252075,
"learning_rate": 0.00016124000000000002,
"loss": 2.0338172912597656,
"mean_token_accuracy": 0.6378967847675086,
"num_tokens": 5766339.0,
"step": 970
},
{
"entropy": 2.0475835338234902,
"epoch": 1.6842897055663013,
"grad_norm": 0.6029936075210571,
"learning_rate": 0.00016084,
"loss": 2.161564254760742,
"mean_token_accuracy": 0.6204672615975142,
"num_tokens": 5827901.0,
"step": 980
},
{
"entropy": 1.9963667973876,
"epoch": 1.7014829142488717,
"grad_norm": 0.6379806399345398,
"learning_rate": 0.00016044,
"loss": 2.087441825866699,
"mean_token_accuracy": 0.6302045777440071,
"num_tokens": 5888724.0,
"step": 990
},
{
"entropy": 2.003598253428936,
"epoch": 1.7186761229314422,
"grad_norm": 0.6113580465316772,
"learning_rate": 0.00016004000000000002,
"loss": 2.136977195739746,
"mean_token_accuracy": 0.6279609728604555,
"num_tokens": 5948999.0,
"step": 1000
},
{
"entropy": 2.009714852273464,
"epoch": 1.7358693316140124,
"grad_norm": 0.6397438645362854,
"learning_rate": 0.00015964,
"loss": 2.1206714630126955,
"mean_token_accuracy": 0.6264089956879616,
"num_tokens": 6010131.0,
"step": 1010
},
{
"entropy": 1.9690447196364402,
"epoch": 1.753062540296583,
"grad_norm": 0.6629226803779602,
"learning_rate": 0.00015924,
"loss": 2.0379661560058593,
"mean_token_accuracy": 0.6306859996169806,
"num_tokens": 6070105.0,
"step": 1020
},
{
"entropy": 1.9761606559157372,
"epoch": 1.7702557489791533,
"grad_norm": 0.6340969800949097,
"learning_rate": 0.00015884000000000002,
"loss": 2.0979576110839844,
"mean_token_accuracy": 0.6255223523825407,
"num_tokens": 6129369.0,
"step": 1030
},
{
"entropy": 2.0855698764324186,
"epoch": 1.7874489576617236,
"grad_norm": 0.6090606451034546,
"learning_rate": 0.00015844,
"loss": 2.1925222396850588,
"mean_token_accuracy": 0.6140031859278678,
"num_tokens": 6191194.0,
"step": 1040
},
{
"entropy": 2.0149157389998438,
"epoch": 1.804642166344294,
"grad_norm": 0.6012734174728394,
"learning_rate": 0.00015804,
"loss": 2.104481506347656,
"mean_token_accuracy": 0.6318388734012842,
"num_tokens": 6254136.0,
"step": 1050
},
{
"entropy": 1.9774292945861816,
"epoch": 1.8218353750268643,
"grad_norm": 0.5775039792060852,
"learning_rate": 0.00015764000000000002,
"loss": 2.0788572311401365,
"mean_token_accuracy": 0.6315916679799557,
"num_tokens": 6311831.0,
"step": 1060
},
{
"entropy": 2.0096932500600815,
"epoch": 1.839028583709435,
"grad_norm": 0.6441799998283386,
"learning_rate": 0.00015724,
"loss": 2.067369079589844,
"mean_token_accuracy": 0.6272157531231641,
"num_tokens": 6368999.0,
"step": 1070
},
{
"entropy": 1.9509294107556343,
"epoch": 1.856221792392005,
"grad_norm": 0.7183738946914673,
"learning_rate": 0.00015684,
"loss": 2.046398162841797,
"mean_token_accuracy": 0.6311775099486112,
"num_tokens": 6426522.0,
"step": 1080
},
{
"entropy": 2.0291523337364197,
"epoch": 1.8734150010745756,
"grad_norm": 0.6105868816375732,
"learning_rate": 0.00015644,
"loss": 2.103832244873047,
"mean_token_accuracy": 0.6217470221221447,
"num_tokens": 6488781.0,
"step": 1090
},
{
"entropy": 1.9441853806376457,
"epoch": 1.890608209757146,
"grad_norm": 0.6209670901298523,
"learning_rate": 0.00015604,
"loss": 1.9830631256103515,
"mean_token_accuracy": 0.6288409855216741,
"num_tokens": 6545378.0,
"step": 1100
},
{
"entropy": 1.9924452617764472,
"epoch": 1.9078014184397163,
"grad_norm": 0.6691610813140869,
"learning_rate": 0.00015564000000000002,
"loss": 2.0985618591308595,
"mean_token_accuracy": 0.6272476647049189,
"num_tokens": 6605917.0,
"step": 1110
},
{
"entropy": 1.9443897798657417,
"epoch": 1.9249946271222869,
"grad_norm": 0.5956576466560364,
"learning_rate": 0.00015524,
"loss": 2.0018499374389647,
"mean_token_accuracy": 0.6381504714488984,
"num_tokens": 6661811.0,
"step": 1120
},
{
"entropy": 1.9805707216262818,
"epoch": 1.942187835804857,
"grad_norm": 0.5946056842803955,
"learning_rate": 0.00015484,
"loss": 2.0427513122558594,
"mean_token_accuracy": 0.6258678704500198,
"num_tokens": 6723336.0,
"step": 1130
},
{
"entropy": 1.9137805163860322,
"epoch": 1.9593810444874276,
"grad_norm": 0.6030678749084473,
"learning_rate": 0.00015444000000000002,
"loss": 1.9697463989257813,
"mean_token_accuracy": 0.641855177283287,
"num_tokens": 6781069.0,
"step": 1140
},
{
"entropy": 1.9368563026189805,
"epoch": 1.9765742531699977,
"grad_norm": 0.6237244009971619,
"learning_rate": 0.00015404,
"loss": 2.034241485595703,
"mean_token_accuracy": 0.6384489141404629,
"num_tokens": 6840129.0,
"step": 1150
},
{
"entropy": 1.8723769560456276,
"epoch": 1.9937674618525683,
"grad_norm": 0.6409602761268616,
"learning_rate": 0.00015364,
"loss": 1.9205358505249024,
"mean_token_accuracy": 0.6437219835817813,
"num_tokens": 6894472.0,
"step": 1160
},
{
"entropy": 1.8628178556244095,
"epoch": 2.010315925209542,
"grad_norm": 0.6614267230033875,
"learning_rate": 0.00015324000000000002,
"loss": 1.8996776580810546,
"mean_token_accuracy": 0.6492502008165631,
"num_tokens": 6952821.0,
"step": 1170
},
{
"entropy": 1.8918349608778953,
"epoch": 2.0275091338921127,
"grad_norm": 0.6476360559463501,
"learning_rate": 0.00015284,
"loss": 1.9498180389404296,
"mean_token_accuracy": 0.6428254719823598,
"num_tokens": 7014989.0,
"step": 1180
},
{
"entropy": 1.945443508028984,
"epoch": 2.044702342574683,
"grad_norm": 0.7725419402122498,
"learning_rate": 0.00015244,
"loss": 2.020453453063965,
"mean_token_accuracy": 0.6328805617988109,
"num_tokens": 7073087.0,
"step": 1190
},
{
"entropy": 2.039360311627388,
"epoch": 2.0618955512572534,
"grad_norm": 0.6981257200241089,
"learning_rate": 0.00015204000000000001,
"loss": 2.137688636779785,
"mean_token_accuracy": 0.6245126206427812,
"num_tokens": 7129761.0,
"step": 1200
},
{
"entropy": 1.9039745211601258,
"epoch": 2.0790887599398236,
"grad_norm": 0.6344786882400513,
"learning_rate": 0.00015164,
"loss": 1.945602798461914,
"mean_token_accuracy": 0.6396142981946469,
"num_tokens": 7187187.0,
"step": 1210
},
{
"entropy": 1.9067329421639443,
"epoch": 2.096281968622394,
"grad_norm": 0.6346563696861267,
"learning_rate": 0.00015124,
"loss": 1.990530014038086,
"mean_token_accuracy": 0.6383092008531094,
"num_tokens": 7247237.0,
"step": 1220
},
{
"entropy": 1.8649237960577012,
"epoch": 2.1134751773049647,
"grad_norm": 0.6211341023445129,
"learning_rate": 0.00015084,
"loss": 1.9000562667846679,
"mean_token_accuracy": 0.6510101232677699,
"num_tokens": 7301760.0,
"step": 1230
},
{
"entropy": 1.9364619553089142,
"epoch": 2.130668385987535,
"grad_norm": 0.6074926257133484,
"learning_rate": 0.00015044,
"loss": 2.0338695526123045,
"mean_token_accuracy": 0.6359073251485825,
"num_tokens": 7361681.0,
"step": 1240
},
{
"entropy": 1.86348085552454,
"epoch": 2.1478615946701054,
"grad_norm": 0.7472113370895386,
"learning_rate": 0.00015004,
"loss": 1.9359153747558593,
"mean_token_accuracy": 0.6460991870611906,
"num_tokens": 7424162.0,
"step": 1250
},
{
"entropy": 1.9580871596932412,
"epoch": 2.1650548033526755,
"grad_norm": 0.6993629336357117,
"learning_rate": 0.00014964,
"loss": 2.0203479766845702,
"mean_token_accuracy": 0.630286343768239,
"num_tokens": 7486191.0,
"step": 1260
},
{
"entropy": 1.9111438870429993,
"epoch": 2.182248012035246,
"grad_norm": 0.6560285687446594,
"learning_rate": 0.00014924,
"loss": 1.9584983825683593,
"mean_token_accuracy": 0.6434897668659687,
"num_tokens": 7544447.0,
"step": 1270
},
{
"entropy": 1.931545352935791,
"epoch": 2.1994412207178167,
"grad_norm": 0.6713767647743225,
"learning_rate": 0.00014884,
"loss": 2.106512451171875,
"mean_token_accuracy": 0.6318223185837268,
"num_tokens": 7604237.0,
"step": 1280
},
{
"entropy": 1.9441128447651863,
"epoch": 2.216634429400387,
"grad_norm": 0.6440369486808777,
"learning_rate": 0.00014844,
"loss": 2.0373985290527346,
"mean_token_accuracy": 0.6320536743849516,
"num_tokens": 7661475.0,
"step": 1290
},
{
"entropy": 1.949166515469551,
"epoch": 2.2338276380829574,
"grad_norm": 0.6829583644866943,
"learning_rate": 0.00014804,
"loss": 2.005051040649414,
"mean_token_accuracy": 0.6337476786226034,
"num_tokens": 7720352.0,
"step": 1300
},
{
"entropy": 1.8687394335865974,
"epoch": 2.2510208467655275,
"grad_norm": 0.6745384335517883,
"learning_rate": 0.00014764,
"loss": 1.9336997985839843,
"mean_token_accuracy": 0.6435953237116336,
"num_tokens": 7780071.0,
"step": 1310
},
{
"entropy": 1.8999060586094856,
"epoch": 2.268214055448098,
"grad_norm": 0.5983019471168518,
"learning_rate": 0.00014724,
"loss": 1.9348258972167969,
"mean_token_accuracy": 0.6431682731956243,
"num_tokens": 7839864.0,
"step": 1320
},
{
"entropy": 1.868900626897812,
"epoch": 2.285407264130668,
"grad_norm": 0.6673943400382996,
"learning_rate": 0.00014684,
"loss": 1.9038555145263671,
"mean_token_accuracy": 0.6503972858190536,
"num_tokens": 7901781.0,
"step": 1330
},
{
"entropy": 2.0273515924811365,
"epoch": 2.3026004728132388,
"grad_norm": 0.7098233103752136,
"learning_rate": 0.00014644,
"loss": 2.1501066207885744,
"mean_token_accuracy": 0.6240016058087349,
"num_tokens": 7962103.0,
"step": 1340
},
{
"entropy": 1.9079027369618415,
"epoch": 2.3197936814958093,
"grad_norm": 0.6738902926445007,
"learning_rate": 0.00014604,
"loss": 1.9681257247924804,
"mean_token_accuracy": 0.6416124865412712,
"num_tokens": 8021532.0,
"step": 1350
},
{
"entropy": 1.9230551555752755,
"epoch": 2.3369868901783795,
"grad_norm": 0.7348875999450684,
"learning_rate": 0.00014564,
"loss": 1.995201301574707,
"mean_token_accuracy": 0.6393462974578142,
"num_tokens": 8083084.0,
"step": 1360
},
{
"entropy": 1.9632413163781166,
"epoch": 2.35418009886095,
"grad_norm": 0.7093244194984436,
"learning_rate": 0.00014523999999999998,
"loss": 2.0105892181396485,
"mean_token_accuracy": 0.6326792053878307,
"num_tokens": 8144387.0,
"step": 1370
},
{
"entropy": 1.9028392255306243,
"epoch": 2.37137330754352,
"grad_norm": 0.7751646041870117,
"learning_rate": 0.00014484,
"loss": 1.9892047882080077,
"mean_token_accuracy": 0.6404657427221536,
"num_tokens": 8203720.0,
"step": 1380
},
{
"entropy": 1.860563676059246,
"epoch": 2.3885665162260907,
"grad_norm": 0.7243943214416504,
"learning_rate": 0.00014444,
"loss": 1.9201997756958007,
"mean_token_accuracy": 0.6510312400758267,
"num_tokens": 8263059.0,
"step": 1390
},
{
"entropy": 1.8883199632167815,
"epoch": 2.405759724908661,
"grad_norm": 0.6106081604957581,
"learning_rate": 0.00014404,
"loss": 1.9687911987304687,
"mean_token_accuracy": 0.6493277192115784,
"num_tokens": 8323826.0,
"step": 1400
},
{
"entropy": 1.9286921486258506,
"epoch": 2.4229529335912314,
"grad_norm": 0.697307825088501,
"learning_rate": 0.00014364,
"loss": 2.030810546875,
"mean_token_accuracy": 0.6362422123551369,
"num_tokens": 8383325.0,
"step": 1410
},
{
"entropy": 1.9122009217739104,
"epoch": 2.440146142273802,
"grad_norm": 0.7119978070259094,
"learning_rate": 0.00014324,
"loss": 1.9788457870483398,
"mean_token_accuracy": 0.6400811962783337,
"num_tokens": 8442393.0,
"step": 1420
},
{
"entropy": 1.9088031873106956,
"epoch": 2.457339350956372,
"grad_norm": 0.6792617440223694,
"learning_rate": 0.00014284,
"loss": 1.995138931274414,
"mean_token_accuracy": 0.63965779915452,
"num_tokens": 8501631.0,
"step": 1430
},
{
"entropy": 1.8871790513396263,
"epoch": 2.4745325596389427,
"grad_norm": 0.7191487550735474,
"learning_rate": 0.00014244000000000003,
"loss": 1.9728309631347656,
"mean_token_accuracy": 0.642948642373085,
"num_tokens": 8561457.0,
"step": 1440
},
{
"entropy": 1.8643269062042236,
"epoch": 2.491725768321513,
"grad_norm": 0.636345386505127,
"learning_rate": 0.00014204000000000002,
"loss": 1.9090641021728516,
"mean_token_accuracy": 0.6537593178451061,
"num_tokens": 8618281.0,
"step": 1450
},
{
"entropy": 1.90698651522398,
"epoch": 2.5089189770040834,
"grad_norm": 0.7444325685501099,
"learning_rate": 0.00014164,
"loss": 1.9631458282470704,
"mean_token_accuracy": 0.6428062118589878,
"num_tokens": 8675547.0,
"step": 1460
},
{
"entropy": 1.942822016775608,
"epoch": 2.526112185686654,
"grad_norm": 0.6707946062088013,
"learning_rate": 0.00014124000000000002,
"loss": 2.0348419189453124,
"mean_token_accuracy": 0.6381070952862501,
"num_tokens": 8735546.0,
"step": 1470
},
{
"entropy": 1.979990416765213,
"epoch": 2.543305394369224,
"grad_norm": 0.6958721280097961,
"learning_rate": 0.00014084000000000001,
"loss": 2.072053146362305,
"mean_token_accuracy": 0.6283687971532345,
"num_tokens": 8796404.0,
"step": 1480
},
{
"entropy": 1.9207274168729782,
"epoch": 2.5604986030517947,
"grad_norm": 0.6575210690498352,
"learning_rate": 0.00014044,
"loss": 1.9427066802978517,
"mean_token_accuracy": 0.642588010430336,
"num_tokens": 8853405.0,
"step": 1490
},
{
"entropy": 1.9980120360851288,
"epoch": 2.577691811734365,
"grad_norm": 0.7412211298942566,
"learning_rate": 0.00014004000000000002,
"loss": 2.1047718048095705,
"mean_token_accuracy": 0.6264939974993468,
"num_tokens": 8909416.0,
"step": 1500
},
{
"entropy": 1.8110749498009682,
"epoch": 2.5948850204169354,
"grad_norm": 0.7108538746833801,
"learning_rate": 0.00013964,
"loss": 1.8952640533447265,
"mean_token_accuracy": 0.6537120632827282,
"num_tokens": 8968510.0,
"step": 1510
},
{
"entropy": 1.977073846757412,
"epoch": 2.612078229099506,
"grad_norm": 0.7554802298545837,
"learning_rate": 0.00013924,
"loss": 2.0621898651123045,
"mean_token_accuracy": 0.6327366977930069,
"num_tokens": 9026884.0,
"step": 1520
},
{
"entropy": 1.8783492282032968,
"epoch": 2.629271437782076,
"grad_norm": 0.6592015027999878,
"learning_rate": 0.00013884000000000002,
"loss": 1.9230785369873047,
"mean_token_accuracy": 0.6494536675512791,
"num_tokens": 9085571.0,
"step": 1530
},
{
"entropy": 1.9282778173685073,
"epoch": 2.6464646464646466,
"grad_norm": 0.7717080116271973,
"learning_rate": 0.00013844,
"loss": 2.0319377899169924,
"mean_token_accuracy": 0.6344667036086321,
"num_tokens": 9147549.0,
"step": 1540
},
{
"entropy": 1.903467869758606,
"epoch": 2.6636578551472168,
"grad_norm": 0.6227516531944275,
"learning_rate": 0.00013804000000000003,
"loss": 1.9306724548339844,
"mean_token_accuracy": 0.644033481925726,
"num_tokens": 9204942.0,
"step": 1550
},
{
"entropy": 1.8967040538787843,
"epoch": 2.6808510638297873,
"grad_norm": 0.6684938073158264,
"learning_rate": 0.00013764000000000002,
"loss": 2.001560592651367,
"mean_token_accuracy": 0.6470274899154902,
"num_tokens": 9266446.0,
"step": 1560
},
{
"entropy": 1.8590586185455322,
"epoch": 2.6980442725123575,
"grad_norm": 0.6150694489479065,
"learning_rate": 0.00013724,
"loss": 1.9280338287353516,
"mean_token_accuracy": 0.6484670951962471,
"num_tokens": 9326109.0,
"step": 1570
},
{
"entropy": 1.9293041676282883,
"epoch": 2.715237481194928,
"grad_norm": 0.6057704091072083,
"learning_rate": 0.00013684000000000002,
"loss": 1.9943519592285157,
"mean_token_accuracy": 0.6371258046478033,
"num_tokens": 9385073.0,
"step": 1580
},
{
"entropy": 1.8843669161200522,
"epoch": 2.732430689877498,
"grad_norm": 0.6834639310836792,
"learning_rate": 0.00013644000000000002,
"loss": 1.9569879531860352,
"mean_token_accuracy": 0.6437417894601822,
"num_tokens": 9445137.0,
"step": 1590
},
{
"entropy": 1.8529930964112282,
"epoch": 2.7496238985600687,
"grad_norm": 0.6442180871963501,
"learning_rate": 0.00013604,
"loss": 1.8902450561523438,
"mean_token_accuracy": 0.6518216013908387,
"num_tokens": 9504160.0,
"step": 1600
},
{
"entropy": 1.939158782362938,
"epoch": 2.7668171072426393,
"grad_norm": 0.6240729689598083,
"learning_rate": 0.00013564000000000002,
"loss": 2.0188575744628907,
"mean_token_accuracy": 0.63564417026937,
"num_tokens": 9564675.0,
"step": 1610
},
{
"entropy": 1.9281259045004844,
"epoch": 2.7840103159252094,
"grad_norm": 0.750890851020813,
"learning_rate": 0.00013524,
"loss": 2.017038345336914,
"mean_token_accuracy": 0.6387452960014344,
"num_tokens": 9625026.0,
"step": 1620
},
{
"entropy": 1.873080413043499,
"epoch": 2.80120352460778,
"grad_norm": 0.776397168636322,
"learning_rate": 0.00013484,
"loss": 1.9759422302246095,
"mean_token_accuracy": 0.6433901283890009,
"num_tokens": 9685967.0,
"step": 1630
},
{
"entropy": 1.9089648619294166,
"epoch": 2.81839673329035,
"grad_norm": 0.6481618881225586,
"learning_rate": 0.00013444000000000002,
"loss": 1.956050491333008,
"mean_token_accuracy": 0.6402542922645807,
"num_tokens": 9745233.0,
"step": 1640
},
{
"entropy": 1.975960558652878,
"epoch": 2.8355899419729207,
"grad_norm": 0.6896694302558899,
"learning_rate": 0.00013404,
"loss": 2.0583721160888673,
"mean_token_accuracy": 0.6340504981577396,
"num_tokens": 9805150.0,
"step": 1650
},
{
"entropy": 1.945571132004261,
"epoch": 2.8527831506554913,
"grad_norm": 0.6386220455169678,
"learning_rate": 0.00013364,
"loss": 2.03116512298584,
"mean_token_accuracy": 0.6365220382809639,
"num_tokens": 9861196.0,
"step": 1660
},
{
"entropy": 1.9110410138964653,
"epoch": 2.8699763593380614,
"grad_norm": 0.7503199577331543,
"learning_rate": 0.00013324000000000002,
"loss": 1.9521196365356446,
"mean_token_accuracy": 0.6381696432828903,
"num_tokens": 9921155.0,
"step": 1670
},
{
"entropy": 1.849820225685835,
"epoch": 2.887169568020632,
"grad_norm": 0.6197855472564697,
"learning_rate": 0.00013284,
"loss": 1.8909440994262696,
"mean_token_accuracy": 0.6426266122609376,
"num_tokens": 9979351.0,
"step": 1680
},
{
"entropy": 1.8932805389165879,
"epoch": 2.904362776703202,
"grad_norm": 0.6703120470046997,
"learning_rate": 0.00013244,
"loss": 2.0233718872070314,
"mean_token_accuracy": 0.646468547359109,
"num_tokens": 10041238.0,
"step": 1690
},
{
"entropy": 1.8625088930130005,
"epoch": 2.9215559853857727,
"grad_norm": 0.73073410987854,
"learning_rate": 0.00013204000000000002,
"loss": 1.9317462921142579,
"mean_token_accuracy": 0.6454428397119045,
"num_tokens": 10099496.0,
"step": 1700
},
{
"entropy": 1.9354272544384004,
"epoch": 2.9387491940683432,
"grad_norm": 0.6566579937934875,
"learning_rate": 0.00013164,
"loss": 2.0027164459228515,
"mean_token_accuracy": 0.6403926335275173,
"num_tokens": 10161720.0,
"step": 1710
},
{
"entropy": 1.88578300178051,
"epoch": 2.9559424027509134,
"grad_norm": 0.7905747890472412,
"learning_rate": 0.00013124,
"loss": 1.9767372131347656,
"mean_token_accuracy": 0.6421503167599439,
"num_tokens": 10221734.0,
"step": 1720
},
{
"entropy": 1.870301403105259,
"epoch": 2.973135611433484,
"grad_norm": 0.7210419774055481,
"learning_rate": 0.00013084000000000001,
"loss": 1.9475433349609375,
"mean_token_accuracy": 0.6528905872255564,
"num_tokens": 10280223.0,
"step": 1730
},
{
"entropy": 1.8696911588311196,
"epoch": 2.990328820116054,
"grad_norm": 0.626354992389679,
"learning_rate": 0.00013044,
"loss": 1.926706314086914,
"mean_token_accuracy": 0.6482070714235306,
"num_tokens": 10339813.0,
"step": 1740
},
{
"entropy": 1.821205088844547,
"epoch": 3.006877283473028,
"grad_norm": 0.6353569030761719,
"learning_rate": 0.00013004,
"loss": 1.8657075881958007,
"mean_token_accuracy": 0.6556356762136731,
"num_tokens": 10398519.0,
"step": 1750
},
{
"entropy": 1.8890676617622375,
"epoch": 3.0240704921555985,
"grad_norm": 0.783729076385498,
"learning_rate": 0.00012964,
"loss": 1.9794137954711915,
"mean_token_accuracy": 0.643126554042101,
"num_tokens": 10456386.0,
"step": 1760
},
{
"entropy": 1.8766882956027984,
"epoch": 3.041263700838169,
"grad_norm": 0.7075045108795166,
"learning_rate": 0.00012924,
"loss": 1.9388771057128906,
"mean_token_accuracy": 0.6462941512465477,
"num_tokens": 10516721.0,
"step": 1770
},
{
"entropy": 1.7985384911298752,
"epoch": 3.0584569095207392,
"grad_norm": 0.7116262912750244,
"learning_rate": 0.00012884,
"loss": 1.8379974365234375,
"mean_token_accuracy": 0.6582404263317585,
"num_tokens": 10575553.0,
"step": 1780
},
{
"entropy": 1.8475583091378212,
"epoch": 3.07565011820331,
"grad_norm": 0.69736248254776,
"learning_rate": 0.00012844,
"loss": 1.9197765350341798,
"mean_token_accuracy": 0.6509403776377439,
"num_tokens": 10632501.0,
"step": 1790
},
{
"entropy": 1.8264927819371224,
"epoch": 3.09284332688588,
"grad_norm": 0.6354222297668457,
"learning_rate": 0.00012804,
"loss": 1.8965986251831055,
"mean_token_accuracy": 0.6518782209604979,
"num_tokens": 10693167.0,
"step": 1800
},
{
"entropy": 1.8696907818317414,
"epoch": 3.1100365355684505,
"grad_norm": 0.7568804621696472,
"learning_rate": 0.00012764,
"loss": 1.9332853317260743,
"mean_token_accuracy": 0.6471077598631382,
"num_tokens": 10753837.0,
"step": 1810
},
{
"entropy": 1.886954003572464,
"epoch": 3.1272297442510206,
"grad_norm": 0.7069846391677856,
"learning_rate": 0.00012724,
"loss": 1.9263908386230468,
"mean_token_accuracy": 0.6466126769781113,
"num_tokens": 10815256.0,
"step": 1820
},
{
"entropy": 1.8424360305070877,
"epoch": 3.144422952933591,
"grad_norm": 0.6524083614349365,
"learning_rate": 0.00012684,
"loss": 1.9088315963745117,
"mean_token_accuracy": 0.6496367674320936,
"num_tokens": 10877848.0,
"step": 1830
},
{
"entropy": 1.8966794192790986,
"epoch": 3.1616161616161618,
"grad_norm": 0.687421977519989,
"learning_rate": 0.00012644000000000002,
"loss": 1.9748069763183593,
"mean_token_accuracy": 0.6424707356840372,
"num_tokens": 10938042.0,
"step": 1840
},
{
"entropy": 1.81406429708004,
"epoch": 3.178809370298732,
"grad_norm": 0.7668496370315552,
"learning_rate": 0.00012604,
"loss": 1.8712465286254882,
"mean_token_accuracy": 0.6571074567735196,
"num_tokens": 10996204.0,
"step": 1850
},
{
"entropy": 1.8159340515732765,
"epoch": 3.1960025789813025,
"grad_norm": 0.7182545065879822,
"learning_rate": 0.00012564,
"loss": 1.830276107788086,
"mean_token_accuracy": 0.6546356856822968,
"num_tokens": 11056605.0,
"step": 1860
},
{
"entropy": 1.9095668271183968,
"epoch": 3.2131957876638726,
"grad_norm": 0.7548812031745911,
"learning_rate": 0.00012524000000000001,
"loss": 1.998922348022461,
"mean_token_accuracy": 0.6411306612193585,
"num_tokens": 11116614.0,
"step": 1870
},
{
"entropy": 1.8717206478118897,
"epoch": 3.230388996346443,
"grad_norm": 0.7692223191261292,
"learning_rate": 0.00012484,
"loss": 1.914438247680664,
"mean_token_accuracy": 0.6441164951771498,
"num_tokens": 11175802.0,
"step": 1880
},
{
"entropy": 1.8943733513355254,
"epoch": 3.2475822050290137,
"grad_norm": 0.6439138650894165,
"learning_rate": 0.00012444,
"loss": 1.9280553817749024,
"mean_token_accuracy": 0.6476396139711141,
"num_tokens": 11236477.0,
"step": 1890
},
{
"entropy": 1.8841392308473588,
"epoch": 3.264775413711584,
"grad_norm": 0.6971343159675598,
"learning_rate": 0.00012404,
"loss": 1.942568588256836,
"mean_token_accuracy": 0.6398356795310974,
"num_tokens": 11295146.0,
"step": 1900
},
{
"entropy": 1.8830088019371032,
"epoch": 3.2819686223941544,
"grad_norm": 0.7196023464202881,
"learning_rate": 0.00012364,
"loss": 1.963007354736328,
"mean_token_accuracy": 0.6452915534377098,
"num_tokens": 11355726.0,
"step": 1910
},
{
"entropy": 1.927216087281704,
"epoch": 3.2991618310767246,
"grad_norm": 0.790634274482727,
"learning_rate": 0.00012324,
"loss": 2.0809165954589846,
"mean_token_accuracy": 0.6384686015546321,
"num_tokens": 11415237.0,
"step": 1920
},
{
"entropy": 1.849087017774582,
"epoch": 3.316355039759295,
"grad_norm": 0.6752087473869324,
"learning_rate": 0.00012284,
"loss": 1.9017595291137694,
"mean_token_accuracy": 0.6522149413824081,
"num_tokens": 11476337.0,
"step": 1930
},
{
"entropy": 1.8517325416207313,
"epoch": 3.3335482484418657,
"grad_norm": 0.8036973476409912,
"learning_rate": 0.00012244,
"loss": 1.9011222839355468,
"mean_token_accuracy": 0.6499856971204281,
"num_tokens": 11537529.0,
"step": 1940
},
{
"entropy": 1.7622334837913514,
"epoch": 3.350741457124436,
"grad_norm": 0.7138587832450867,
"learning_rate": 0.00012204,
"loss": 1.7955827713012695,
"mean_token_accuracy": 0.6596556272357702,
"num_tokens": 11595421.0,
"step": 1950
},
{
"entropy": 1.8950866341590882,
"epoch": 3.3679346658070064,
"grad_norm": 0.6869714260101318,
"learning_rate": 0.00012164,
"loss": 1.948552131652832,
"mean_token_accuracy": 0.6493024453520775,
"num_tokens": 11655749.0,
"step": 1960
},
{
"entropy": 1.9235218942165375,
"epoch": 3.3851278744895765,
"grad_norm": 0.656403124332428,
"learning_rate": 0.00012124,
"loss": 2.04327449798584,
"mean_token_accuracy": 0.6389912366867065,
"num_tokens": 11717271.0,
"step": 1970
},
{
"entropy": 1.834906594455242,
"epoch": 3.402321083172147,
"grad_norm": 0.7343699932098389,
"learning_rate": 0.00012084,
"loss": 1.9038848876953125,
"mean_token_accuracy": 0.6569048661738635,
"num_tokens": 11778095.0,
"step": 1980
},
{
"entropy": 1.8515655741095542,
"epoch": 3.4195142918547172,
"grad_norm": 0.7009745240211487,
"learning_rate": 0.00012043999999999999,
"loss": 1.9157728195190429,
"mean_token_accuracy": 0.6512683361768723,
"num_tokens": 11835954.0,
"step": 1990
},
{
"entropy": 1.8634012743830681,
"epoch": 3.436707500537288,
"grad_norm": 0.6880552172660828,
"learning_rate": 0.00012004,
"loss": 1.9772762298583983,
"mean_token_accuracy": 0.6531724959611893,
"num_tokens": 11896615.0,
"step": 2000
},
{
"entropy": 1.8952298507094383,
"epoch": 3.453900709219858,
"grad_norm": 0.7292787432670593,
"learning_rate": 0.00011964,
"loss": 1.9302806854248047,
"mean_token_accuracy": 0.6462091594934464,
"num_tokens": 11954949.0,
"step": 2010
},
{
"entropy": 1.8723753660917282,
"epoch": 3.4710939179024285,
"grad_norm": 0.730530858039856,
"learning_rate": 0.00011923999999999999,
"loss": 1.9216194152832031,
"mean_token_accuracy": 0.6504904717206955,
"num_tokens": 12013803.0,
"step": 2020
},
{
"entropy": 1.8673277243971824,
"epoch": 3.488287126584999,
"grad_norm": 0.7530126571655273,
"learning_rate": 0.00011884,
"loss": 1.968985366821289,
"mean_token_accuracy": 0.646847129613161,
"num_tokens": 12073284.0,
"step": 2030
},
{
"entropy": 1.8757897645235062,
"epoch": 3.505480335267569,
"grad_norm": 0.7031217813491821,
"learning_rate": 0.00011844,
"loss": 1.9071741104125977,
"mean_token_accuracy": 0.6450003884732723,
"num_tokens": 12126451.0,
"step": 2040
},
{
"entropy": 1.7986262783408165,
"epoch": 3.5226735439501398,
"grad_norm": 0.7223983407020569,
"learning_rate": 0.00011804,
"loss": 1.8450950622558593,
"mean_token_accuracy": 0.6576410517096519,
"num_tokens": 12183343.0,
"step": 2050
},
{
"entropy": 1.8884935915470122,
"epoch": 3.53986675263271,
"grad_norm": 0.7206518650054932,
"learning_rate": 0.00011763999999999999,
"loss": 1.9660964965820313,
"mean_token_accuracy": 0.6422303304076195,
"num_tokens": 12243607.0,
"step": 2060
},
{
"entropy": 1.8009026944637299,
"epoch": 3.5570599613152805,
"grad_norm": 0.7229637503623962,
"learning_rate": 0.00011724000000000002,
"loss": 1.851433563232422,
"mean_token_accuracy": 0.6556052915751934,
"num_tokens": 12304867.0,
"step": 2070
},
{
"entropy": 1.7949693977832795,
"epoch": 3.574253169997851,
"grad_norm": 0.6935518383979797,
"learning_rate": 0.00011684000000000001,
"loss": 1.8848058700561523,
"mean_token_accuracy": 0.6580755174160003,
"num_tokens": 12367633.0,
"step": 2080
},
{
"entropy": 1.8038981169462205,
"epoch": 3.591446378680421,
"grad_norm": 0.7003904581069946,
"learning_rate": 0.00011644000000000002,
"loss": 1.8867233276367188,
"mean_token_accuracy": 0.655081395432353,
"num_tokens": 12423928.0,
"step": 2090
},
{
"entropy": 1.850062020123005,
"epoch": 3.6086395873629917,
"grad_norm": 0.6852926015853882,
"learning_rate": 0.00011604000000000002,
"loss": 1.9325201034545898,
"mean_token_accuracy": 0.6472255479544401,
"num_tokens": 12479411.0,
"step": 2100
},
{
"entropy": 1.8294448778033257,
"epoch": 3.625832796045562,
"grad_norm": 0.7044693827629089,
"learning_rate": 0.00011564000000000001,
"loss": 1.8989273071289063,
"mean_token_accuracy": 0.6499249216169118,
"num_tokens": 12539175.0,
"step": 2110
},
{
"entropy": 1.8719267755746842,
"epoch": 3.6430260047281324,
"grad_norm": 0.7180586457252502,
"learning_rate": 0.00011524000000000001,
"loss": 1.925216293334961,
"mean_token_accuracy": 0.648950444161892,
"num_tokens": 12598337.0,
"step": 2120
},
{
"entropy": 1.88923449665308,
"epoch": 3.660219213410703,
"grad_norm": 0.7464597821235657,
"learning_rate": 0.00011484000000000002,
"loss": 1.990826416015625,
"mean_token_accuracy": 0.6456409864127636,
"num_tokens": 12656592.0,
"step": 2130
},
{
"entropy": 1.8126205861568452,
"epoch": 3.677412422093273,
"grad_norm": 0.7253774404525757,
"learning_rate": 0.00011444000000000001,
"loss": 1.9414216995239257,
"mean_token_accuracy": 0.6552157323807478,
"num_tokens": 12717791.0,
"step": 2140
},
{
"entropy": 1.8930377542972565,
"epoch": 3.6946056307758437,
"grad_norm": 0.7404170036315918,
"learning_rate": 0.00011404000000000001,
"loss": 1.9364784240722657,
"mean_token_accuracy": 0.6434980578720569,
"num_tokens": 12775445.0,
"step": 2150
},
{
"entropy": 1.7652419656515121,
"epoch": 3.711798839458414,
"grad_norm": 0.688732385635376,
"learning_rate": 0.00011364000000000002,
"loss": 1.7636165618896484,
"mean_token_accuracy": 0.6639453627169132,
"num_tokens": 12834599.0,
"step": 2160
},
{
"entropy": 1.7745767116546631,
"epoch": 3.7289920481409844,
"grad_norm": 0.7011992335319519,
"learning_rate": 0.00011324000000000001,
"loss": 1.8347841262817384,
"mean_token_accuracy": 0.6586773280054331,
"num_tokens": 12889887.0,
"step": 2170
},
{
"entropy": 1.7952505484223367,
"epoch": 3.746185256823555,
"grad_norm": 0.7646785378456116,
"learning_rate": 0.00011284000000000001,
"loss": 1.883163070678711,
"mean_token_accuracy": 0.6589437790215016,
"num_tokens": 12950286.0,
"step": 2180
},
{
"entropy": 1.8878965258598328,
"epoch": 3.763378465506125,
"grad_norm": 0.7722623944282532,
"learning_rate": 0.00011244000000000001,
"loss": 1.9674694061279296,
"mean_token_accuracy": 0.6422343414276839,
"num_tokens": 13011083.0,
"step": 2190
},
{
"entropy": 1.919720321893692,
"epoch": 3.780571674188695,
"grad_norm": 0.7656893134117126,
"learning_rate": 0.00011204000000000002,
"loss": 1.9919773101806642,
"mean_token_accuracy": 0.6393908958882093,
"num_tokens": 13069376.0,
"step": 2200
},
{
"entropy": 1.77825688123703,
"epoch": 3.797764882871266,
"grad_norm": 0.8324808478355408,
"learning_rate": 0.00011164000000000001,
"loss": 1.8173160552978516,
"mean_token_accuracy": 0.659475727379322,
"num_tokens": 13124851.0,
"step": 2210
},
{
"entropy": 1.8232837438583374,
"epoch": 3.8149580915538364,
"grad_norm": 0.741481363773346,
"learning_rate": 0.00011124000000000001,
"loss": 1.860748291015625,
"mean_token_accuracy": 0.6524971850216389,
"num_tokens": 13182576.0,
"step": 2220
},
{
"entropy": 1.8588940657675266,
"epoch": 3.8321513002364065,
"grad_norm": 0.7748705148696899,
"learning_rate": 0.00011084000000000002,
"loss": 1.9206954956054687,
"mean_token_accuracy": 0.6516353718936443,
"num_tokens": 13242703.0,
"step": 2230
},
{
"entropy": 1.823398308455944,
"epoch": 3.849344508918977,
"grad_norm": 0.6341049671173096,
"learning_rate": 0.00011044,
"loss": 1.8718917846679688,
"mean_token_accuracy": 0.6596139155328273,
"num_tokens": 13303181.0,
"step": 2240
},
{
"entropy": 1.8098929420113563,
"epoch": 3.866537717601547,
"grad_norm": 0.6672969460487366,
"learning_rate": 0.00011004000000000001,
"loss": 1.8999752044677733,
"mean_token_accuracy": 0.6594760783016682,
"num_tokens": 13364371.0,
"step": 2250
},
{
"entropy": 1.7795367375016213,
"epoch": 3.8837309262841178,
"grad_norm": 0.6343891024589539,
"learning_rate": 0.00010964000000000001,
"loss": 1.827276611328125,
"mean_token_accuracy": 0.6668465688824654,
"num_tokens": 13425450.0,
"step": 2260
},
{
"entropy": 1.8673226684331894,
"epoch": 3.9009241349666883,
"grad_norm": 0.7357877492904663,
"learning_rate": 0.00010924,
"loss": 1.9206443786621095,
"mean_token_accuracy": 0.647479448094964,
"num_tokens": 13485806.0,
"step": 2270
},
{
"entropy": 1.806484942883253,
"epoch": 3.9181173436492585,
"grad_norm": 0.7172144055366516,
"learning_rate": 0.00010884000000000001,
"loss": 1.8789045333862304,
"mean_token_accuracy": 0.6594084780663252,
"num_tokens": 13544934.0,
"step": 2280
},
{
"entropy": 1.7970930591225625,
"epoch": 3.935310552331829,
"grad_norm": 0.7578801512718201,
"learning_rate": 0.00010844000000000001,
"loss": 1.8405040740966796,
"mean_token_accuracy": 0.6608923889696598,
"num_tokens": 13606653.0,
"step": 2290
},
{
"entropy": 1.8469372361898422,
"epoch": 3.952503761014399,
"grad_norm": 0.7626324892044067,
"learning_rate": 0.00010804,
"loss": 1.8629837036132812,
"mean_token_accuracy": 0.6560039456933737,
"num_tokens": 13663938.0,
"step": 2300
},
{
"entropy": 1.836122378706932,
"epoch": 3.9696969696969697,
"grad_norm": 0.7074365615844727,
"learning_rate": 0.00010764,
"loss": 1.8942070007324219,
"mean_token_accuracy": 0.647238065674901,
"num_tokens": 13722549.0,
"step": 2310
},
{
"entropy": 1.821449062973261,
"epoch": 3.9868901783795403,
"grad_norm": 0.6956577301025391,
"learning_rate": 0.00010724000000000001,
"loss": 1.8947336196899414,
"mean_token_accuracy": 0.6528103355318308,
"num_tokens": 13785922.0,
"step": 2320
},
{
"entropy": 1.839719023023333,
"epoch": 4.003438641736514,
"grad_norm": 0.6865222454071045,
"learning_rate": 0.00010684,
"loss": 1.8803377151489258,
"mean_token_accuracy": 0.6526942384707464,
"num_tokens": 13844647.0,
"step": 2330
},
{
"entropy": 1.855065654218197,
"epoch": 4.020631850419084,
"grad_norm": 0.7424384355545044,
"learning_rate": 0.00010644,
"loss": 1.9461166381835937,
"mean_token_accuracy": 0.6463506512343884,
"num_tokens": 13904724.0,
"step": 2340
},
{
"entropy": 1.7508789122104644,
"epoch": 4.037825059101655,
"grad_norm": 0.6670609712600708,
"learning_rate": 0.00010604000000000001,
"loss": 1.781893539428711,
"mean_token_accuracy": 0.6653038747608662,
"num_tokens": 13963472.0,
"step": 2350
},
{
"entropy": 1.8165026590228082,
"epoch": 4.0550182677842255,
"grad_norm": 0.7823750376701355,
"learning_rate": 0.00010564000000000001,
"loss": 1.8847312927246094,
"mean_token_accuracy": 0.6607359856367111,
"num_tokens": 14019708.0,
"step": 2360
},
{
"entropy": 1.794335062801838,
"epoch": 4.072211476466796,
"grad_norm": 0.8262340426445007,
"learning_rate": 0.00010524,
"loss": 1.8576740264892577,
"mean_token_accuracy": 0.6582343481481076,
"num_tokens": 14076178.0,
"step": 2370
},
{
"entropy": 1.8828865155577659,
"epoch": 4.089404685149366,
"grad_norm": 0.784656822681427,
"learning_rate": 0.00010484,
"loss": 1.9146394729614258,
"mean_token_accuracy": 0.6491621173918247,
"num_tokens": 14133662.0,
"step": 2380
},
{
"entropy": 1.918326808512211,
"epoch": 4.106597893831936,
"grad_norm": 0.7571077346801758,
"learning_rate": 0.00010444000000000001,
"loss": 2.024713897705078,
"mean_token_accuracy": 0.643079025298357,
"num_tokens": 14196967.0,
"step": 2390
},
{
"entropy": 1.7909317679703236,
"epoch": 4.123791102514507,
"grad_norm": 0.7276471257209778,
"learning_rate": 0.00010404,
"loss": 1.845133399963379,
"mean_token_accuracy": 0.6548417568206787,
"num_tokens": 14256866.0,
"step": 2400
},
{
"entropy": 1.7750686906278133,
"epoch": 4.140984311197077,
"grad_norm": 0.668246328830719,
"learning_rate": 0.00010364,
"loss": 1.7945009231567384,
"mean_token_accuracy": 0.6641525950282812,
"num_tokens": 14318324.0,
"step": 2410
},
{
"entropy": 1.823828212916851,
"epoch": 4.158177519879647,
"grad_norm": 0.7596518993377686,
"learning_rate": 0.00010324000000000001,
"loss": 1.898871612548828,
"mean_token_accuracy": 0.6519910141825676,
"num_tokens": 14380775.0,
"step": 2420
},
{
"entropy": 1.7938876405358315,
"epoch": 4.175370728562218,
"grad_norm": 0.6834619641304016,
"learning_rate": 0.00010284,
"loss": 1.8518138885498048,
"mean_token_accuracy": 0.6622516691684723,
"num_tokens": 14440862.0,
"step": 2430
},
{
"entropy": 1.8744625180959702,
"epoch": 4.192563937244788,
"grad_norm": 0.8088146448135376,
"learning_rate": 0.00010244,
"loss": 1.9542848587036132,
"mean_token_accuracy": 0.6499028638005256,
"num_tokens": 14500841.0,
"step": 2440
},
{
"entropy": 1.8284114554524422,
"epoch": 4.209757145927359,
"grad_norm": 0.82193523645401,
"learning_rate": 0.00010204,
"loss": 1.9107404708862306,
"mean_token_accuracy": 0.6551219135522842,
"num_tokens": 14564257.0,
"step": 2450
},
{
"entropy": 1.8538024842739105,
"epoch": 4.226950354609929,
"grad_norm": 0.7263757586479187,
"learning_rate": 0.00010164,
"loss": 1.8713863372802735,
"mean_token_accuracy": 0.6510257624089718,
"num_tokens": 14623019.0,
"step": 2460
},
{
"entropy": 1.756752038002014,
"epoch": 4.244143563292499,
"grad_norm": 0.7334346175193787,
"learning_rate": 0.00010124,
"loss": 1.7855047225952148,
"mean_token_accuracy": 0.6687729060649872,
"num_tokens": 14682191.0,
"step": 2470
},
{
"entropy": 1.7032470375299453,
"epoch": 4.26133677197507,
"grad_norm": 0.7168938517570496,
"learning_rate": 0.00010084,
"loss": 1.7648530960083009,
"mean_token_accuracy": 0.6696467150002718,
"num_tokens": 14739840.0,
"step": 2480
},
{
"entropy": 1.7426577515900135,
"epoch": 4.27852998065764,
"grad_norm": 0.7091065645217896,
"learning_rate": 0.00010044000000000001,
"loss": 1.8180946350097655,
"mean_token_accuracy": 0.6640235505998134,
"num_tokens": 14798444.0,
"step": 2490
},
{
"entropy": 1.8743537411093711,
"epoch": 4.295723189340211,
"grad_norm": 0.6376718878746033,
"learning_rate": 0.00010004,
"loss": 1.9534942626953125,
"mean_token_accuracy": 0.6467559643089771,
"num_tokens": 14861262.0,
"step": 2500
},
{
"entropy": 1.8234948687255383,
"epoch": 4.312916398022781,
"grad_norm": 0.778538167476654,
"learning_rate": 9.964e-05,
"loss": 1.8733020782470704,
"mean_token_accuracy": 0.6553889319300652,
"num_tokens": 14920923.0,
"step": 2510
},
{
"entropy": 1.812998068332672,
"epoch": 4.330109606705351,
"grad_norm": 0.7861834764480591,
"learning_rate": 9.924e-05,
"loss": 1.8699317932128907,
"mean_token_accuracy": 0.6555795632302761,
"num_tokens": 14978173.0,
"step": 2520
},
{
"entropy": 1.8013822883367538,
"epoch": 4.347302815387922,
"grad_norm": 0.751916229724884,
"learning_rate": 9.884e-05,
"loss": 1.8372121810913087,
"mean_token_accuracy": 0.664341426640749,
"num_tokens": 15034480.0,
"step": 2530
},
{
"entropy": 1.7700918450951577,
"epoch": 4.364496024070492,
"grad_norm": 0.7365695834159851,
"learning_rate": 9.844000000000001e-05,
"loss": 1.8166645050048829,
"mean_token_accuracy": 0.6654425717890262,
"num_tokens": 15093226.0,
"step": 2540
},
{
"entropy": 1.7808674454689026,
"epoch": 4.381689232753063,
"grad_norm": 0.7306393980979919,
"learning_rate": 9.804e-05,
"loss": 1.8363780975341797,
"mean_token_accuracy": 0.6601886965334416,
"num_tokens": 15149937.0,
"step": 2550
},
{
"entropy": 1.7890540674328803,
"epoch": 4.398882441435633,
"grad_norm": 0.7466715574264526,
"learning_rate": 9.764000000000001e-05,
"loss": 1.847653579711914,
"mean_token_accuracy": 0.6586611110717058,
"num_tokens": 15210500.0,
"step": 2560
},
{
"entropy": 1.7866264268755914,
"epoch": 4.416075650118203,
"grad_norm": 0.7825273871421814,
"learning_rate": 9.724000000000001e-05,
"loss": 1.82576904296875,
"mean_token_accuracy": 0.6592508733272553,
"num_tokens": 15268262.0,
"step": 2570
},
{
"entropy": 1.8321722269058227,
"epoch": 4.433268858800774,
"grad_norm": 0.7158058285713196,
"learning_rate": 9.684000000000001e-05,
"loss": 1.8807327270507812,
"mean_token_accuracy": 0.6545467376708984,
"num_tokens": 15330745.0,
"step": 2580
},
{
"entropy": 1.739266212284565,
"epoch": 4.450462067483344,
"grad_norm": 0.7281847596168518,
"learning_rate": 9.644e-05,
"loss": 1.7686588287353515,
"mean_token_accuracy": 0.6666045777499676,
"num_tokens": 15391266.0,
"step": 2590
},
{
"entropy": 1.8295569285750388,
"epoch": 4.467655276165915,
"grad_norm": 0.7166727781295776,
"learning_rate": 9.604000000000001e-05,
"loss": 1.9156217575073242,
"mean_token_accuracy": 0.655017600953579,
"num_tokens": 15449819.0,
"step": 2600
},
{
"entropy": 1.8236071288585662,
"epoch": 4.484848484848484,
"grad_norm": 0.6946532726287842,
"learning_rate": 9.564000000000001e-05,
"loss": 1.9035514831542968,
"mean_token_accuracy": 0.649907086789608,
"num_tokens": 15513231.0,
"step": 2610
},
{
"entropy": 1.7869442969560623,
"epoch": 4.502041693531055,
"grad_norm": 0.7257023453712463,
"learning_rate": 9.524e-05,
"loss": 1.841336441040039,
"mean_token_accuracy": 0.6655759517103433,
"num_tokens": 15568973.0,
"step": 2620
},
{
"entropy": 1.7462848544120788,
"epoch": 4.519234902213626,
"grad_norm": 0.7239391803741455,
"learning_rate": 9.484e-05,
"loss": 1.7989360809326171,
"mean_token_accuracy": 0.6646886244416237,
"num_tokens": 15627655.0,
"step": 2630
},
{
"entropy": 1.7926493644714356,
"epoch": 4.536428110896196,
"grad_norm": 0.7628325819969177,
"learning_rate": 9.444000000000001e-05,
"loss": 1.8627632141113282,
"mean_token_accuracy": 0.654141866415739,
"num_tokens": 15687626.0,
"step": 2640
},
{
"entropy": 1.7928333327174186,
"epoch": 4.553621319578767,
"grad_norm": 0.629107654094696,
"learning_rate": 9.404e-05,
"loss": 1.8784042358398438,
"mean_token_accuracy": 0.6618591919541359,
"num_tokens": 15750035.0,
"step": 2650
},
{
"entropy": 1.7438783437013625,
"epoch": 4.570814528261336,
"grad_norm": 0.6948845982551575,
"learning_rate": 9.364e-05,
"loss": 1.7456579208374023,
"mean_token_accuracy": 0.6722261719405651,
"num_tokens": 15809533.0,
"step": 2660
},
{
"entropy": 1.7451874181628226,
"epoch": 4.588007736943907,
"grad_norm": 0.7213107943534851,
"learning_rate": 9.324000000000001e-05,
"loss": 1.8111917495727539,
"mean_token_accuracy": 0.6621977139264346,
"num_tokens": 15866570.0,
"step": 2670
},
{
"entropy": 1.806991095095873,
"epoch": 4.6052009456264775,
"grad_norm": 0.9146936535835266,
"learning_rate": 9.284e-05,
"loss": 1.8761199951171874,
"mean_token_accuracy": 0.6552402298897505,
"num_tokens": 15923681.0,
"step": 2680
},
{
"entropy": 1.854476225376129,
"epoch": 4.622394154309048,
"grad_norm": 0.675061047077179,
"learning_rate": 9.244e-05,
"loss": 1.8601364135742187,
"mean_token_accuracy": 0.656403211131692,
"num_tokens": 15979879.0,
"step": 2690
},
{
"entropy": 1.8345128282904626,
"epoch": 4.639587362991619,
"grad_norm": 0.7702699303627014,
"learning_rate": 9.204e-05,
"loss": 1.9170707702636718,
"mean_token_accuracy": 0.6507652081549168,
"num_tokens": 16040136.0,
"step": 2700
},
{
"entropy": 1.8444690719246863,
"epoch": 4.656780571674188,
"grad_norm": 0.7249677181243896,
"learning_rate": 9.164000000000001e-05,
"loss": 1.9021928787231446,
"mean_token_accuracy": 0.6553504541516304,
"num_tokens": 16097652.0,
"step": 2710
},
{
"entropy": 1.8083212688565253,
"epoch": 4.673973780356759,
"grad_norm": 0.7018275260925293,
"learning_rate": 9.124e-05,
"loss": 1.87921199798584,
"mean_token_accuracy": 0.6609590038657188,
"num_tokens": 16159014.0,
"step": 2720
},
{
"entropy": 1.793540646135807,
"epoch": 4.6911669890393295,
"grad_norm": 0.731863796710968,
"learning_rate": 9.084e-05,
"loss": 1.847224807739258,
"mean_token_accuracy": 0.6638176888227463,
"num_tokens": 16223636.0,
"step": 2730
},
{
"entropy": 1.7947301134467124,
"epoch": 4.7083601977219,
"grad_norm": 0.7208489775657654,
"learning_rate": 9.044000000000001e-05,
"loss": 1.8400375366210937,
"mean_token_accuracy": 0.6600434482097626,
"num_tokens": 16281647.0,
"step": 2740
},
{
"entropy": 1.8043948471546174,
"epoch": 4.725553406404471,
"grad_norm": 0.7633848190307617,
"learning_rate": 9.004e-05,
"loss": 1.8509382247924804,
"mean_token_accuracy": 0.6632162068039179,
"num_tokens": 16340706.0,
"step": 2750
},
{
"entropy": 1.8240734949707984,
"epoch": 4.74274661508704,
"grad_norm": 0.7516812086105347,
"learning_rate": 8.964e-05,
"loss": 1.9139686584472657,
"mean_token_accuracy": 0.6504824224859476,
"num_tokens": 16398077.0,
"step": 2760
},
{
"entropy": 1.7775158017873764,
"epoch": 4.759939823769611,
"grad_norm": 0.7677133679389954,
"learning_rate": 8.924e-05,
"loss": 1.8351661682128906,
"mean_token_accuracy": 0.6568478621542454,
"num_tokens": 16458898.0,
"step": 2770
},
{
"entropy": 1.8671277523040772,
"epoch": 4.7771330324521815,
"grad_norm": 0.750451385974884,
"learning_rate": 8.884e-05,
"loss": 1.9589305877685548,
"mean_token_accuracy": 0.6506143860518933,
"num_tokens": 16519496.0,
"step": 2780
},
{
"entropy": 1.7745324671268463,
"epoch": 4.794326241134752,
"grad_norm": 0.8302338719367981,
"learning_rate": 8.844e-05,
"loss": 1.8637496948242187,
"mean_token_accuracy": 0.6621543657034635,
"num_tokens": 16579080.0,
"step": 2790
},
{
"entropy": 1.73246541172266,
"epoch": 4.811519449817322,
"grad_norm": 0.778176486492157,
"learning_rate": 8.804e-05,
"loss": 1.752696418762207,
"mean_token_accuracy": 0.6727286443114281,
"num_tokens": 16640932.0,
"step": 2800
},
{
"entropy": 1.8060437709093093,
"epoch": 4.828712658499892,
"grad_norm": 0.9019444584846497,
"learning_rate": 8.764e-05,
"loss": 1.9031681060791015,
"mean_token_accuracy": 0.6563040159642697,
"num_tokens": 16702244.0,
"step": 2810
},
{
"entropy": 1.8732322439551354,
"epoch": 4.845905867182463,
"grad_norm": 0.7397829294204712,
"learning_rate": 8.724e-05,
"loss": 1.9326038360595703,
"mean_token_accuracy": 0.6478111572563648,
"num_tokens": 16764555.0,
"step": 2820
},
{
"entropy": 1.842681024968624,
"epoch": 4.863099075865033,
"grad_norm": 0.8511717915534973,
"learning_rate": 8.684e-05,
"loss": 1.9107376098632813,
"mean_token_accuracy": 0.6531910292804242,
"num_tokens": 16821936.0,
"step": 2830
},
{
"entropy": 1.7571960732340812,
"epoch": 4.880292284547604,
"grad_norm": 0.7064304947853088,
"learning_rate": 8.643999999999999e-05,
"loss": 1.7985404968261718,
"mean_token_accuracy": 0.6667480751872062,
"num_tokens": 16882205.0,
"step": 2840
},
{
"entropy": 1.8695308573544025,
"epoch": 4.897485493230175,
"grad_norm": 0.7386742234230042,
"learning_rate": 8.604000000000001e-05,
"loss": 1.9543342590332031,
"mean_token_accuracy": 0.6496741093695164,
"num_tokens": 16939799.0,
"step": 2850
},
{
"entropy": 1.7877972453832627,
"epoch": 4.914678701912744,
"grad_norm": 0.7687976956367493,
"learning_rate": 8.564000000000001e-05,
"loss": 1.7994373321533204,
"mean_token_accuracy": 0.6637697361409665,
"num_tokens": 16997716.0,
"step": 2860
},
{
"entropy": 1.761916320025921,
"epoch": 4.931871910595315,
"grad_norm": 0.7507193088531494,
"learning_rate": 8.524e-05,
"loss": 1.788670539855957,
"mean_token_accuracy": 0.6648910716176033,
"num_tokens": 17057260.0,
"step": 2870
},
{
"entropy": 1.804823537170887,
"epoch": 4.949065119277885,
"grad_norm": 0.727188229560852,
"learning_rate": 8.484000000000001e-05,
"loss": 1.855522346496582,
"mean_token_accuracy": 0.657912939786911,
"num_tokens": 17116073.0,
"step": 2880
},
{
"entropy": 1.8259041801095008,
"epoch": 4.966258327960456,
"grad_norm": 0.7195336818695068,
"learning_rate": 8.444000000000001e-05,
"loss": 1.8942272186279296,
"mean_token_accuracy": 0.6546841934323311,
"num_tokens": 17174141.0,
"step": 2890
},
{
"entropy": 1.7153871595859527,
"epoch": 4.983451536643026,
"grad_norm": 0.7093940377235413,
"learning_rate": 8.404e-05,
"loss": 1.7350996017456055,
"mean_token_accuracy": 0.6728265054523945,
"num_tokens": 17233307.0,
"step": 2900
},
{
"entropy": 1.7630670566063422,
"epoch": 5.0,
"grad_norm": 0.979345440864563,
"learning_rate": 8.364e-05,
"loss": 1.8098876953125,
"mean_token_accuracy": 0.6604567510741097,
"num_tokens": 17289810.0,
"step": 2910
},
{
"entropy": 1.8877688512206077,
"epoch": 5.017193208682571,
"grad_norm": 0.8140257596969604,
"learning_rate": 8.324000000000001e-05,
"loss": 1.9562681198120118,
"mean_token_accuracy": 0.6476880256086588,
"num_tokens": 17349922.0,
"step": 2920
},
{
"entropy": 1.6694072388112544,
"epoch": 5.034386417365141,
"grad_norm": 0.7486578226089478,
"learning_rate": 8.284000000000001e-05,
"loss": 1.71788330078125,
"mean_token_accuracy": 0.6781885512173176,
"num_tokens": 17409363.0,
"step": 2930
},
{
"entropy": 1.8061093628406524,
"epoch": 5.051579626047711,
"grad_norm": 0.8148984313011169,
"learning_rate": 8.244e-05,
"loss": 1.8484228134155274,
"mean_token_accuracy": 0.6591597832739353,
"num_tokens": 17468218.0,
"step": 2940
},
{
"entropy": 1.7561381176114081,
"epoch": 5.068772834730281,
"grad_norm": 0.7412339448928833,
"learning_rate": 8.204000000000001e-05,
"loss": 1.8109855651855469,
"mean_token_accuracy": 0.6648329850286245,
"num_tokens": 17529603.0,
"step": 2950
},
{
"entropy": 1.7058369636535644,
"epoch": 5.085966043412852,
"grad_norm": 0.7845883369445801,
"learning_rate": 8.164000000000001e-05,
"loss": 1.7577402114868164,
"mean_token_accuracy": 0.675883399322629,
"num_tokens": 17587275.0,
"step": 2960
},
{
"entropy": 1.7319279327988624,
"epoch": 5.1031592520954225,
"grad_norm": 0.7546029090881348,
"learning_rate": 8.124e-05,
"loss": 1.8096488952636718,
"mean_token_accuracy": 0.668717809766531,
"num_tokens": 17647368.0,
"step": 2970
},
{
"entropy": 1.7872621923685075,
"epoch": 5.120352460777993,
"grad_norm": 0.7214957475662231,
"learning_rate": 8.084e-05,
"loss": 1.7827239990234376,
"mean_token_accuracy": 0.663322826102376,
"num_tokens": 17708210.0,
"step": 2980
},
{
"entropy": 1.7479579642415046,
"epoch": 5.137545669460563,
"grad_norm": 0.6938044428825378,
"learning_rate": 8.044000000000001e-05,
"loss": 1.837489700317383,
"mean_token_accuracy": 0.666904554143548,
"num_tokens": 17770498.0,
"step": 2990
},
{
"entropy": 1.760008592903614,
"epoch": 5.154738878143133,
"grad_norm": 0.7440096139907837,
"learning_rate": 8.004e-05,
"loss": 1.7957250595092773,
"mean_token_accuracy": 0.6704145818948746,
"num_tokens": 17831493.0,
"step": 3000
},
{
"entropy": 1.7866925299167633,
"epoch": 5.171932086825704,
"grad_norm": 0.775793731212616,
"learning_rate": 7.964e-05,
"loss": 1.8513370513916017,
"mean_token_accuracy": 0.6593568369746208,
"num_tokens": 17893338.0,
"step": 3010
},
{
"entropy": 1.720614206790924,
"epoch": 5.1891252955082745,
"grad_norm": 0.7855071425437927,
"learning_rate": 7.924000000000001e-05,
"loss": 1.7529998779296876,
"mean_token_accuracy": 0.6738685265183448,
"num_tokens": 17949102.0,
"step": 3020
},
{
"entropy": 1.8130397230386734,
"epoch": 5.206318504190844,
"grad_norm": 0.7261347770690918,
"learning_rate": 7.884e-05,
"loss": 1.8530288696289063,
"mean_token_accuracy": 0.6672368694096804,
"num_tokens": 18011291.0,
"step": 3030
},
{
"entropy": 1.7442916065454483,
"epoch": 5.223511712873415,
"grad_norm": 0.7350125908851624,
"learning_rate": 7.844e-05,
"loss": 1.8185455322265625,
"mean_token_accuracy": 0.6648106183856726,
"num_tokens": 18069969.0,
"step": 3040
},
{
"entropy": 1.8336029559373856,
"epoch": 5.240704921555985,
"grad_norm": 0.9380921125411987,
"learning_rate": 7.804e-05,
"loss": 1.8585586547851562,
"mean_token_accuracy": 0.6559876747429371,
"num_tokens": 18126839.0,
"step": 3050
},
{
"entropy": 1.831410789489746,
"epoch": 5.257898130238556,
"grad_norm": 0.7422699928283691,
"learning_rate": 7.764e-05,
"loss": 1.8669567108154297,
"mean_token_accuracy": 0.660079612582922,
"num_tokens": 18189169.0,
"step": 3060
},
{
"entropy": 1.7772829428315162,
"epoch": 5.2750913389211265,
"grad_norm": 0.80905681848526,
"learning_rate": 7.724e-05,
"loss": 1.8516859054565429,
"mean_token_accuracy": 0.6614492174237967,
"num_tokens": 18247092.0,
"step": 3070
},
{
"entropy": 1.8644750490784645,
"epoch": 5.292284547603696,
"grad_norm": 0.7613252997398376,
"learning_rate": 7.684e-05,
"loss": 1.8771135330200195,
"mean_token_accuracy": 0.6572393793612719,
"num_tokens": 18305337.0,
"step": 3080
},
{
"entropy": 1.7888765200972556,
"epoch": 5.309477756286267,
"grad_norm": 0.8216497302055359,
"learning_rate": 7.644e-05,
"loss": 1.877157211303711,
"mean_token_accuracy": 0.6567147132009268,
"num_tokens": 18366693.0,
"step": 3090
},
{
"entropy": 1.8046741798520087,
"epoch": 5.326670964968837,
"grad_norm": 0.7475964426994324,
"learning_rate": 7.604e-05,
"loss": 1.8193256378173828,
"mean_token_accuracy": 0.6620738692581654,
"num_tokens": 18424592.0,
"step": 3100
},
{
"entropy": 1.7556863978505135,
"epoch": 5.343864173651408,
"grad_norm": 0.7376730442047119,
"learning_rate": 7.564e-05,
"loss": 1.8117481231689454,
"mean_token_accuracy": 0.6669185206294059,
"num_tokens": 18480554.0,
"step": 3110
},
{
"entropy": 1.680773164331913,
"epoch": 5.361057382333978,
"grad_norm": 0.8276366591453552,
"learning_rate": 7.524e-05,
"loss": 1.7203754425048827,
"mean_token_accuracy": 0.6801572386175394,
"num_tokens": 18533859.0,
"step": 3120
},
{
"entropy": 1.7824992030858993,
"epoch": 5.378250591016548,
"grad_norm": 0.7689419984817505,
"learning_rate": 7.484e-05,
"loss": 1.7929088592529296,
"mean_token_accuracy": 0.6625824831426144,
"num_tokens": 18591432.0,
"step": 3130
},
{
"entropy": 1.7943954214453697,
"epoch": 5.395443799699119,
"grad_norm": 0.7818305492401123,
"learning_rate": 7.444e-05,
"loss": 1.9027202606201172,
"mean_token_accuracy": 0.6574487689882517,
"num_tokens": 18653780.0,
"step": 3140
},
{
"entropy": 1.6858137652277947,
"epoch": 5.412637008381689,
"grad_norm": 0.7783890962600708,
"learning_rate": 7.404e-05,
"loss": 1.6952400207519531,
"mean_token_accuracy": 0.681446236371994,
"num_tokens": 18712089.0,
"step": 3150
},
{
"entropy": 1.781475655734539,
"epoch": 5.42983021706426,
"grad_norm": 0.8033313751220703,
"learning_rate": 7.364e-05,
"loss": 1.8575824737548827,
"mean_token_accuracy": 0.6616954285651445,
"num_tokens": 18770760.0,
"step": 3160
},
{
"entropy": 1.8044284671545028,
"epoch": 5.44702342574683,
"grad_norm": 0.8778691291809082,
"learning_rate": 7.324000000000001e-05,
"loss": 1.868129348754883,
"mean_token_accuracy": 0.6609551507979632,
"num_tokens": 18830053.0,
"step": 3170
},
{
"entropy": 1.7783053085207938,
"epoch": 5.4642166344294,
"grad_norm": 0.794116735458374,
"learning_rate": 7.284000000000001e-05,
"loss": 1.8013723373413086,
"mean_token_accuracy": 0.6687252540141344,
"num_tokens": 18889827.0,
"step": 3180
},
{
"entropy": 1.7062184020876885,
"epoch": 5.481409843111971,
"grad_norm": 0.7348354458808899,
"learning_rate": 7.244e-05,
"loss": 1.7668045043945313,
"mean_token_accuracy": 0.6689080417156219,
"num_tokens": 18948434.0,
"step": 3190
},
{
"entropy": 1.7791135892271996,
"epoch": 5.498603051794541,
"grad_norm": 0.7285153865814209,
"learning_rate": 7.204000000000001e-05,
"loss": 1.8126005172729491,
"mean_token_accuracy": 0.6612196549773216,
"num_tokens": 19009735.0,
"step": 3200
},
{
"entropy": 1.7333651915192605,
"epoch": 5.515796260477112,
"grad_norm": 0.768817126750946,
"learning_rate": 7.164000000000001e-05,
"loss": 1.7593820571899415,
"mean_token_accuracy": 0.6708316601812839,
"num_tokens": 19070839.0,
"step": 3210
},
{
"entropy": 1.7646871596574782,
"epoch": 5.532989469159682,
"grad_norm": 0.7897234559059143,
"learning_rate": 7.124e-05,
"loss": 1.8217975616455078,
"mean_token_accuracy": 0.6624716755002737,
"num_tokens": 19131701.0,
"step": 3220
},
{
"entropy": 1.79796422123909,
"epoch": 5.550182677842252,
"grad_norm": 0.7753779292106628,
"learning_rate": 7.084e-05,
"loss": 1.8724674224853515,
"mean_token_accuracy": 0.6603185098618269,
"num_tokens": 19192206.0,
"step": 3230
},
{
"entropy": 1.6741029411554336,
"epoch": 5.567375886524823,
"grad_norm": 0.7559053301811218,
"learning_rate": 7.044000000000001e-05,
"loss": 1.7434120178222656,
"mean_token_accuracy": 0.6718135714530945,
"num_tokens": 19250887.0,
"step": 3240
},
{
"entropy": 1.8032452374696732,
"epoch": 5.584569095207393,
"grad_norm": 0.8887183666229248,
"learning_rate": 7.004e-05,
"loss": 1.8439495086669921,
"mean_token_accuracy": 0.6600725017488003,
"num_tokens": 19310066.0,
"step": 3250
},
{
"entropy": 1.7758998274803162,
"epoch": 5.601762303889964,
"grad_norm": 0.7295767068862915,
"learning_rate": 6.964e-05,
"loss": 1.8148815155029296,
"mean_token_accuracy": 0.6652825616300106,
"num_tokens": 19369366.0,
"step": 3260
},
{
"entropy": 1.662617878615856,
"epoch": 5.6189555125725335,
"grad_norm": 0.7842978835105896,
"learning_rate": 6.924000000000001e-05,
"loss": 1.7120464324951172,
"mean_token_accuracy": 0.6784614086151123,
"num_tokens": 19427377.0,
"step": 3270
},
{
"entropy": 1.8431208834052086,
"epoch": 5.636148721255104,
"grad_norm": 0.8514787554740906,
"learning_rate": 6.884e-05,
"loss": 1.9169921875,
"mean_token_accuracy": 0.6530084304511548,
"num_tokens": 19488230.0,
"step": 3280
},
{
"entropy": 1.6891573801636697,
"epoch": 5.653341929937675,
"grad_norm": 0.7638376951217651,
"learning_rate": 6.844e-05,
"loss": 1.7445995330810546,
"mean_token_accuracy": 0.6780110366642476,
"num_tokens": 19546500.0,
"step": 3290
},
{
"entropy": 1.7722659215331078,
"epoch": 5.670535138620245,
"grad_norm": 0.8072571754455566,
"learning_rate": 6.804e-05,
"loss": 1.8334453582763672,
"mean_token_accuracy": 0.6662346951663494,
"num_tokens": 19604586.0,
"step": 3300
},
{
"entropy": 1.824295823276043,
"epoch": 5.687728347302816,
"grad_norm": 0.8235921263694763,
"learning_rate": 6.764000000000001e-05,
"loss": 1.936505126953125,
"mean_token_accuracy": 0.6559072963893413,
"num_tokens": 19664352.0,
"step": 3310
},
{
"entropy": 1.776401199400425,
"epoch": 5.704921555985385,
"grad_norm": 0.7260850071907043,
"learning_rate": 6.724e-05,
"loss": 1.8124887466430664,
"mean_token_accuracy": 0.6629696622490883,
"num_tokens": 19725395.0,
"step": 3320
},
{
"entropy": 1.8356325037777423,
"epoch": 5.722114764667956,
"grad_norm": 0.7962324619293213,
"learning_rate": 6.684e-05,
"loss": 1.865267562866211,
"mean_token_accuracy": 0.6575648851692677,
"num_tokens": 19786886.0,
"step": 3330
},
{
"entropy": 1.7975003249943255,
"epoch": 5.739307973350527,
"grad_norm": 0.8319332599639893,
"learning_rate": 6.644000000000001e-05,
"loss": 1.8043830871582032,
"mean_token_accuracy": 0.6615023009479046,
"num_tokens": 19846593.0,
"step": 3340
},
{
"entropy": 1.7411245226860046,
"epoch": 5.756501182033097,
"grad_norm": 0.8770884871482849,
"learning_rate": 6.604e-05,
"loss": 1.8150835037231445,
"mean_token_accuracy": 0.6629907101392746,
"num_tokens": 19901728.0,
"step": 3350
},
{
"entropy": 1.7868980005383492,
"epoch": 5.773694390715667,
"grad_norm": 0.8012292385101318,
"learning_rate": 6.564e-05,
"loss": 1.8411848068237304,
"mean_token_accuracy": 0.6610983822494745,
"num_tokens": 19962294.0,
"step": 3360
},
{
"entropy": 1.8151665195822715,
"epoch": 5.790887599398237,
"grad_norm": 0.6628616452217102,
"learning_rate": 6.524e-05,
"loss": 1.873934555053711,
"mean_token_accuracy": 0.6563344091176987,
"num_tokens": 20023732.0,
"step": 3370
},
{
"entropy": 1.7544716522097588,
"epoch": 5.808080808080808,
"grad_norm": 0.765192985534668,
"learning_rate": 6.484e-05,
"loss": 1.7676244735717774,
"mean_token_accuracy": 0.6681830242276192,
"num_tokens": 20084955.0,
"step": 3380
},
{
"entropy": 1.7742430947721004,
"epoch": 5.8252740167633785,
"grad_norm": 0.9363911747932434,
"learning_rate": 6.444e-05,
"loss": 1.7936756134033203,
"mean_token_accuracy": 0.6640898622572422,
"num_tokens": 20145523.0,
"step": 3390
},
{
"entropy": 1.7672609627246856,
"epoch": 5.842467225445949,
"grad_norm": 0.8024185299873352,
"learning_rate": 6.404e-05,
"loss": 1.8390132904052734,
"mean_token_accuracy": 0.66049126945436,
"num_tokens": 20207613.0,
"step": 3400
},
{
"entropy": 1.7419263988733291,
"epoch": 5.85966043412852,
"grad_norm": 0.7452662587165833,
"learning_rate": 6.364e-05,
"loss": 1.7592693328857423,
"mean_token_accuracy": 0.6729626737534999,
"num_tokens": 20264080.0,
"step": 3410
},
{
"entropy": 1.794348457455635,
"epoch": 5.876853642811089,
"grad_norm": 0.7698886394500732,
"learning_rate": 6.324e-05,
"loss": 1.8215929031372071,
"mean_token_accuracy": 0.6594825953245163,
"num_tokens": 20325935.0,
"step": 3420
},
{
"entropy": 1.8058848246932029,
"epoch": 5.89404685149366,
"grad_norm": 0.7813654541969299,
"learning_rate": 6.284e-05,
"loss": 1.8705635070800781,
"mean_token_accuracy": 0.6575064200907945,
"num_tokens": 20384219.0,
"step": 3430
},
{
"entropy": 1.8296722590923309,
"epoch": 5.9112400601762305,
"grad_norm": 0.7985308766365051,
"learning_rate": 6.244e-05,
"loss": 1.8622390747070312,
"mean_token_accuracy": 0.6598837457597255,
"num_tokens": 20442055.0,
"step": 3440
},
{
"entropy": 1.745755286514759,
"epoch": 5.928433268858801,
"grad_norm": 0.7957124710083008,
"learning_rate": 6.204e-05,
"loss": 1.778817367553711,
"mean_token_accuracy": 0.6711658544838428,
"num_tokens": 20500787.0,
"step": 3450
},
{
"entropy": 1.8037077650427817,
"epoch": 5.945626477541371,
"grad_norm": 0.8097943067550659,
"learning_rate": 6.164e-05,
"loss": 1.8763154983520507,
"mean_token_accuracy": 0.6581781908869744,
"num_tokens": 20560544.0,
"step": 3460
},
{
"entropy": 1.6478220209479333,
"epoch": 5.962819686223941,
"grad_norm": 0.7882372736930847,
"learning_rate": 6.124e-05,
"loss": 1.7081596374511718,
"mean_token_accuracy": 0.6812212504446507,
"num_tokens": 20621866.0,
"step": 3470
},
{
"entropy": 1.7625097312033176,
"epoch": 5.980012894906512,
"grad_norm": 0.780114471912384,
"learning_rate": 6.084000000000001e-05,
"loss": 1.7864303588867188,
"mean_token_accuracy": 0.6720023825764656,
"num_tokens": 20677413.0,
"step": 3480
},
{
"entropy": 1.7418652877211571,
"epoch": 5.9972061035890825,
"grad_norm": 0.8374961614608765,
"learning_rate": 6.044000000000001e-05,
"loss": 1.8426128387451173,
"mean_token_accuracy": 0.6612365163862706,
"num_tokens": 20737424.0,
"step": 3490
},
{
"entropy": 1.8112690711950328,
"epoch": 6.013754566946056,
"grad_norm": 0.7742412686347961,
"learning_rate": 6.004000000000001e-05,
"loss": 1.8320732116699219,
"mean_token_accuracy": 0.6643109286760355,
"num_tokens": 20795175.0,
"step": 3500
},
{
"entropy": 1.687648557126522,
"epoch": 6.0309477756286265,
"grad_norm": 0.8348304629325867,
"learning_rate": 5.9640000000000005e-05,
"loss": 1.7558349609375,
"mean_token_accuracy": 0.6784385897219181,
"num_tokens": 20852486.0,
"step": 3510
},
{
"entropy": 1.6863658234477044,
"epoch": 6.048140984311197,
"grad_norm": 0.7642632126808167,
"learning_rate": 5.924000000000001e-05,
"loss": 1.6536775588989259,
"mean_token_accuracy": 0.680523382127285,
"num_tokens": 20908597.0,
"step": 3520
},
{
"entropy": 1.6652932062745094,
"epoch": 6.065334192993768,
"grad_norm": 0.8676924109458923,
"learning_rate": 5.8840000000000006e-05,
"loss": 1.7443069458007812,
"mean_token_accuracy": 0.6719188451766968,
"num_tokens": 20966567.0,
"step": 3530
},
{
"entropy": 1.7391631960868836,
"epoch": 6.082527401676338,
"grad_norm": 0.8444374799728394,
"learning_rate": 5.844e-05,
"loss": 1.7849775314331056,
"mean_token_accuracy": 0.672398941218853,
"num_tokens": 21023832.0,
"step": 3540
},
{
"entropy": 1.7432220742106437,
"epoch": 6.099720610358908,
"grad_norm": 0.7972187995910645,
"learning_rate": 5.804000000000001e-05,
"loss": 1.8264921188354493,
"mean_token_accuracy": 0.6713483344763518,
"num_tokens": 21080325.0,
"step": 3550
},
{
"entropy": 1.7394985787570476,
"epoch": 6.1169138190414785,
"grad_norm": 0.8266369700431824,
"learning_rate": 5.7640000000000004e-05,
"loss": 1.819821548461914,
"mean_token_accuracy": 0.6708907049149275,
"num_tokens": 21143316.0,
"step": 3560
},
{
"entropy": 1.7923602670431138,
"epoch": 6.134107027724049,
"grad_norm": 0.8315872550010681,
"learning_rate": 5.724000000000001e-05,
"loss": 1.8086809158325194,
"mean_token_accuracy": 0.665992408245802,
"num_tokens": 21203848.0,
"step": 3570
},
{
"entropy": 1.711188006401062,
"epoch": 6.15130023640662,
"grad_norm": 0.8174048066139221,
"learning_rate": 5.6840000000000005e-05,
"loss": 1.7656991958618165,
"mean_token_accuracy": 0.6711975857615471,
"num_tokens": 21266260.0,
"step": 3580
},
{
"entropy": 1.8437035098671912,
"epoch": 6.16849344508919,
"grad_norm": 0.8155949711799622,
"learning_rate": 5.644e-05,
"loss": 1.877999496459961,
"mean_token_accuracy": 0.6532085236161947,
"num_tokens": 21326008.0,
"step": 3590
},
{
"entropy": 1.7264528393745422,
"epoch": 6.18568665377176,
"grad_norm": 0.7951272130012512,
"learning_rate": 5.6040000000000006e-05,
"loss": 1.747119140625,
"mean_token_accuracy": 0.6696909107267857,
"num_tokens": 21385356.0,
"step": 3600
},
{
"entropy": 1.68227918446064,
"epoch": 6.20287986245433,
"grad_norm": 0.779587984085083,
"learning_rate": 5.564e-05,
"loss": 1.7062965393066407,
"mean_token_accuracy": 0.6786911800503731,
"num_tokens": 21443231.0,
"step": 3610
},
{
"entropy": 1.7644565671682357,
"epoch": 6.220073071136901,
"grad_norm": 0.9153981804847717,
"learning_rate": 5.524e-05,
"loss": 1.8082721710205079,
"mean_token_accuracy": 0.6671201888471842,
"num_tokens": 21499309.0,
"step": 3620
},
{
"entropy": 1.7211210913956165,
"epoch": 6.237266279819472,
"grad_norm": 0.8166586756706238,
"learning_rate": 5.4840000000000003e-05,
"loss": 1.769371795654297,
"mean_token_accuracy": 0.6694241009652615,
"num_tokens": 21558565.0,
"step": 3630
},
{
"entropy": 1.7693689942359925,
"epoch": 6.254459488502041,
"grad_norm": 0.7773623466491699,
"learning_rate": 5.444e-05,
"loss": 1.848412322998047,
"mean_token_accuracy": 0.66685731112957,
"num_tokens": 21618504.0,
"step": 3640
},
{
"entropy": 1.8090675905346871,
"epoch": 6.271652697184612,
"grad_norm": 0.9420453310012817,
"learning_rate": 5.4040000000000004e-05,
"loss": 1.8266836166381837,
"mean_token_accuracy": 0.6643423162400722,
"num_tokens": 21676861.0,
"step": 3650
},
{
"entropy": 1.7340097561478616,
"epoch": 6.288845905867182,
"grad_norm": 0.805880069732666,
"learning_rate": 5.364e-05,
"loss": 1.7760274887084961,
"mean_token_accuracy": 0.6729184173047542,
"num_tokens": 21734874.0,
"step": 3660
},
{
"entropy": 1.733542764186859,
"epoch": 6.306039114549753,
"grad_norm": 0.7459798455238342,
"learning_rate": 5.324e-05,
"loss": 1.7874065399169923,
"mean_token_accuracy": 0.6733234331011773,
"num_tokens": 21797467.0,
"step": 3670
},
{
"entropy": 1.6855479300022125,
"epoch": 6.3232323232323235,
"grad_norm": 0.7362611889839172,
"learning_rate": 5.284e-05,
"loss": 1.7557338714599608,
"mean_token_accuracy": 0.6742986045777798,
"num_tokens": 21856704.0,
"step": 3680
},
{
"entropy": 1.762756396830082,
"epoch": 6.340425531914893,
"grad_norm": 0.8349901437759399,
"learning_rate": 5.244e-05,
"loss": 1.784174346923828,
"mean_token_accuracy": 0.6732991166412831,
"num_tokens": 21915781.0,
"step": 3690
},
{
"entropy": 1.7664957396686076,
"epoch": 6.357618740597464,
"grad_norm": 0.8295337557792664,
"learning_rate": 5.204e-05,
"loss": 1.8338695526123048,
"mean_token_accuracy": 0.6659718155860901,
"num_tokens": 21973568.0,
"step": 3700
},
{
"entropy": 1.7744196206331253,
"epoch": 6.374811949280034,
"grad_norm": 0.739115297794342,
"learning_rate": 5.164e-05,
"loss": 1.8148929595947265,
"mean_token_accuracy": 0.6660460762679576,
"num_tokens": 22032979.0,
"step": 3710
},
{
"entropy": 1.7459667712450027,
"epoch": 6.392005157962605,
"grad_norm": 0.7716593146324158,
"learning_rate": 5.124e-05,
"loss": 1.8079204559326172,
"mean_token_accuracy": 0.66551748290658,
"num_tokens": 22092283.0,
"step": 3720
},
{
"entropy": 1.7491293936967849,
"epoch": 6.4091983666451755,
"grad_norm": 0.8270374536514282,
"learning_rate": 5.084e-05,
"loss": 1.8020380020141602,
"mean_token_accuracy": 0.6673273537307978,
"num_tokens": 22150667.0,
"step": 3730
},
{
"entropy": 1.6887403331696986,
"epoch": 6.426391575327745,
"grad_norm": 0.8306758403778076,
"learning_rate": 5.044e-05,
"loss": 1.7328964233398438,
"mean_token_accuracy": 0.676455694437027,
"num_tokens": 22211170.0,
"step": 3740
},
{
"entropy": 1.8332835257053375,
"epoch": 6.443584784010316,
"grad_norm": 0.8369497656822205,
"learning_rate": 5.0039999999999995e-05,
"loss": 1.913273239135742,
"mean_token_accuracy": 0.656198850646615,
"num_tokens": 22269928.0,
"step": 3750
},
{
"entropy": 1.6914366707205772,
"epoch": 6.460777992692886,
"grad_norm": 0.7562059164047241,
"learning_rate": 4.9640000000000006e-05,
"loss": 1.7506240844726562,
"mean_token_accuracy": 0.67936124317348,
"num_tokens": 22328611.0,
"step": 3760
},
{
"entropy": 1.7604179099202155,
"epoch": 6.477971201375457,
"grad_norm": 0.7541300058364868,
"learning_rate": 4.924e-05,
"loss": 1.8065948486328125,
"mean_token_accuracy": 0.6697364591062069,
"num_tokens": 22389219.0,
"step": 3770
},
{
"entropy": 1.731757602095604,
"epoch": 6.4951644100580275,
"grad_norm": 0.8319364190101624,
"learning_rate": 4.884e-05,
"loss": 1.7902181625366211,
"mean_token_accuracy": 0.6673447206616402,
"num_tokens": 22449858.0,
"step": 3780
},
{
"entropy": 1.7152166068553925,
"epoch": 6.512357618740597,
"grad_norm": 0.8575091361999512,
"learning_rate": 4.8440000000000004e-05,
"loss": 1.7424659729003906,
"mean_token_accuracy": 0.6707747709006071,
"num_tokens": 22509375.0,
"step": 3790
},
{
"entropy": 1.6641680032014847,
"epoch": 6.529550827423168,
"grad_norm": 0.7516652345657349,
"learning_rate": 4.804e-05,
"loss": 1.6937873840332032,
"mean_token_accuracy": 0.6811798132956028,
"num_tokens": 22566440.0,
"step": 3800
},
{
"entropy": 1.7551555022597314,
"epoch": 6.546744036105738,
"grad_norm": 0.817863941192627,
"learning_rate": 4.7640000000000005e-05,
"loss": 1.8282489776611328,
"mean_token_accuracy": 0.6655839093029499,
"num_tokens": 22627900.0,
"step": 3810
},
{
"entropy": 1.7025569766759872,
"epoch": 6.563937244788309,
"grad_norm": 0.757764458656311,
"learning_rate": 4.724e-05,
"loss": 1.7325496673583984,
"mean_token_accuracy": 0.6785391330718994,
"num_tokens": 22685738.0,
"step": 3820
},
{
"entropy": 1.699775031208992,
"epoch": 6.5811304534708785,
"grad_norm": 0.7960421442985535,
"learning_rate": 4.684e-05,
"loss": 1.7602745056152345,
"mean_token_accuracy": 0.6698532458394766,
"num_tokens": 22745696.0,
"step": 3830
},
{
"entropy": 1.8100605458021164,
"epoch": 6.598323662153449,
"grad_norm": 0.8477244973182678,
"learning_rate": 4.644e-05,
"loss": 1.8226333618164063,
"mean_token_accuracy": 0.6646727129817009,
"num_tokens": 22805783.0,
"step": 3840
},
{
"entropy": 1.7685839846730231,
"epoch": 6.61551687083602,
"grad_norm": 0.7853493690490723,
"learning_rate": 4.604e-05,
"loss": 1.8230281829833985,
"mean_token_accuracy": 0.664577030390501,
"num_tokens": 22866822.0,
"step": 3850
},
{
"entropy": 1.7810854628682136,
"epoch": 6.63271007951859,
"grad_norm": 0.7139444351196289,
"learning_rate": 4.564e-05,
"loss": 1.855198287963867,
"mean_token_accuracy": 0.6652711797505617,
"num_tokens": 22928790.0,
"step": 3860
},
{
"entropy": 1.7815292954444886,
"epoch": 6.649903288201161,
"grad_norm": 0.7039018869400024,
"learning_rate": 4.524000000000001e-05,
"loss": 1.845859909057617,
"mean_token_accuracy": 0.6595252249389887,
"num_tokens": 22990170.0,
"step": 3870
},
{
"entropy": 1.7107908308506012,
"epoch": 6.667096496883731,
"grad_norm": 0.7651708126068115,
"learning_rate": 4.4840000000000004e-05,
"loss": 1.7340824127197265,
"mean_token_accuracy": 0.6750431463122368,
"num_tokens": 23047902.0,
"step": 3880
},
{
"entropy": 1.7069460928440094,
"epoch": 6.684289705566301,
"grad_norm": 0.7385950088500977,
"learning_rate": 4.444e-05,
"loss": 1.758881187438965,
"mean_token_accuracy": 0.6745327576994896,
"num_tokens": 23112106.0,
"step": 3890
},
{
"entropy": 1.821124967932701,
"epoch": 6.701482914248872,
"grad_norm": 0.7827627658843994,
"learning_rate": 4.4040000000000005e-05,
"loss": 1.913480567932129,
"mean_token_accuracy": 0.6593531377613544,
"num_tokens": 23170056.0,
"step": 3900
},
{
"entropy": 1.7924881175160408,
"epoch": 6.718676122931442,
"grad_norm": 0.8166612386703491,
"learning_rate": 4.364e-05,
"loss": 1.855017852783203,
"mean_token_accuracy": 0.6593458168208599,
"num_tokens": 23228582.0,
"step": 3910
},
{
"entropy": 1.736910080909729,
"epoch": 6.735869331614013,
"grad_norm": 0.779629647731781,
"learning_rate": 4.324e-05,
"loss": 1.7581821441650392,
"mean_token_accuracy": 0.6779871381819248,
"num_tokens": 23288702.0,
"step": 3920
},
{
"entropy": 1.6776573412120341,
"epoch": 6.7530625402965825,
"grad_norm": 0.7625913619995117,
"learning_rate": 4.284e-05,
"loss": 1.7102031707763672,
"mean_token_accuracy": 0.6794889360666275,
"num_tokens": 23349004.0,
"step": 3930
},
{
"entropy": 1.8100020587444305,
"epoch": 6.770255748979153,
"grad_norm": 0.7499405145645142,
"learning_rate": 4.244e-05,
"loss": 1.8514158248901367,
"mean_token_accuracy": 0.6620845705270767,
"num_tokens": 23410874.0,
"step": 3940
},
{
"entropy": 1.697011759877205,
"epoch": 6.787448957661724,
"grad_norm": 0.736323893070221,
"learning_rate": 4.2040000000000004e-05,
"loss": 1.7609180450439452,
"mean_token_accuracy": 0.6772994473576546,
"num_tokens": 23472518.0,
"step": 3950
},
{
"entropy": 1.764576494693756,
"epoch": 6.804642166344294,
"grad_norm": 0.8523833751678467,
"learning_rate": 4.164e-05,
"loss": 1.81484375,
"mean_token_accuracy": 0.6644324712455273,
"num_tokens": 23531203.0,
"step": 3960
},
{
"entropy": 1.7241224959492683,
"epoch": 6.821835375026865,
"grad_norm": 0.8820350766181946,
"learning_rate": 4.124e-05,
"loss": 1.739130401611328,
"mean_token_accuracy": 0.6771424360573292,
"num_tokens": 23590289.0,
"step": 3970
},
{
"entropy": 1.6967746496200562,
"epoch": 6.8390285837094345,
"grad_norm": 0.8161067962646484,
"learning_rate": 4.084e-05,
"loss": 1.7659534454345702,
"mean_token_accuracy": 0.6744477659463882,
"num_tokens": 23647985.0,
"step": 3980
},
{
"entropy": 1.8578275874257089,
"epoch": 6.856221792392005,
"grad_norm": 0.778160810470581,
"learning_rate": 4.044e-05,
"loss": 1.9046249389648438,
"mean_token_accuracy": 0.6525318272411823,
"num_tokens": 23707387.0,
"step": 3990
},
{
"entropy": 1.781902502477169,
"epoch": 6.873415001074576,
"grad_norm": 0.9398592710494995,
"learning_rate": 4.004e-05,
"loss": 1.8081722259521484,
"mean_token_accuracy": 0.6625144556164742,
"num_tokens": 23764831.0,
"step": 4000
},
{
"entropy": 1.699565550684929,
"epoch": 6.890608209757146,
"grad_norm": 0.7662839889526367,
"learning_rate": 3.964e-05,
"loss": 1.7373327255249023,
"mean_token_accuracy": 0.6809282444417477,
"num_tokens": 23825367.0,
"step": 4010
},
{
"entropy": 1.6455101184546947,
"epoch": 6.907801418439716,
"grad_norm": 0.7619901299476624,
"learning_rate": 3.9240000000000004e-05,
"loss": 1.709805679321289,
"mean_token_accuracy": 0.6812954246997833,
"num_tokens": 23887369.0,
"step": 4020
},
{
"entropy": 1.7952800825238229,
"epoch": 6.924994627122286,
"grad_norm": 0.7858437299728394,
"learning_rate": 3.884e-05,
"loss": 1.8688398361206056,
"mean_token_accuracy": 0.6621494639664889,
"num_tokens": 23949358.0,
"step": 4030
},
{
"entropy": 1.772008201479912,
"epoch": 6.942187835804857,
"grad_norm": 0.7586779594421387,
"learning_rate": 3.8440000000000005e-05,
"loss": 1.798760986328125,
"mean_token_accuracy": 0.667642817273736,
"num_tokens": 24009691.0,
"step": 4040
},
{
"entropy": 1.7289930269122125,
"epoch": 6.959381044487428,
"grad_norm": 0.854505717754364,
"learning_rate": 3.804e-05,
"loss": 1.771562385559082,
"mean_token_accuracy": 0.6692178774625063,
"num_tokens": 24064506.0,
"step": 4050
},
{
"entropy": 1.715189914405346,
"epoch": 6.976574253169998,
"grad_norm": 0.758488655090332,
"learning_rate": 3.7640000000000006e-05,
"loss": 1.756412887573242,
"mean_token_accuracy": 0.6710222817957401,
"num_tokens": 24126841.0,
"step": 4060
},
{
"entropy": 1.7383173301815986,
"epoch": 6.993767461852569,
"grad_norm": 0.7450618147850037,
"learning_rate": 3.724e-05,
"loss": 1.7997669219970702,
"mean_token_accuracy": 0.6649864386767149,
"num_tokens": 24186159.0,
"step": 4070
},
{
"entropy": 1.7172312767474682,
"epoch": 7.010315925209542,
"grad_norm": 0.8475770950317383,
"learning_rate": 3.684e-05,
"loss": 1.7585922241210938,
"mean_token_accuracy": 0.6746863397684965,
"num_tokens": 24239759.0,
"step": 4080
},
{
"entropy": 1.7192407630383968,
"epoch": 7.027509133892113,
"grad_norm": 0.7818967700004578,
"learning_rate": 3.6440000000000003e-05,
"loss": 1.7634265899658204,
"mean_token_accuracy": 0.6724576361477375,
"num_tokens": 24298775.0,
"step": 4090
},
{
"entropy": 1.7496131911873818,
"epoch": 7.044702342574683,
"grad_norm": 0.8118335008621216,
"learning_rate": 3.604e-05,
"loss": 1.802253532409668,
"mean_token_accuracy": 0.6702191606163979,
"num_tokens": 24361142.0,
"step": 4100
},
{
"entropy": 1.7090509735047816,
"epoch": 7.061895551257253,
"grad_norm": 0.8414726257324219,
"learning_rate": 3.5640000000000004e-05,
"loss": 1.7347373962402344,
"mean_token_accuracy": 0.679864277690649,
"num_tokens": 24419838.0,
"step": 4110
},
{
"entropy": 1.6807728812098504,
"epoch": 7.079088759939824,
"grad_norm": 0.8567139506340027,
"learning_rate": 3.524e-05,
"loss": 1.7365150451660156,
"mean_token_accuracy": 0.6765194039791822,
"num_tokens": 24477518.0,
"step": 4120
},
{
"entropy": 1.709678091108799,
"epoch": 7.096281968622394,
"grad_norm": 0.8345620036125183,
"learning_rate": 3.484e-05,
"loss": 1.730575180053711,
"mean_token_accuracy": 0.6709145799279213,
"num_tokens": 24534560.0,
"step": 4130
},
{
"entropy": 1.6541544690728187,
"epoch": 7.113475177304965,
"grad_norm": 0.8509814143180847,
"learning_rate": 3.444e-05,
"loss": 1.6795757293701172,
"mean_token_accuracy": 0.6856038823723793,
"num_tokens": 24594829.0,
"step": 4140
},
{
"entropy": 1.7498343527317046,
"epoch": 7.130668385987535,
"grad_norm": 0.8674039244651794,
"learning_rate": 3.404e-05,
"loss": 1.8083892822265626,
"mean_token_accuracy": 0.6709578204900026,
"num_tokens": 24656798.0,
"step": 4150
},
{
"entropy": 1.677807478606701,
"epoch": 7.147861594670105,
"grad_norm": 0.8016234040260315,
"learning_rate": 3.3639999999999996e-05,
"loss": 1.7206790924072266,
"mean_token_accuracy": 0.6754934191703796,
"num_tokens": 24714009.0,
"step": 4160
},
{
"entropy": 1.672835360467434,
"epoch": 7.1650548033526755,
"grad_norm": 0.7139334082603455,
"learning_rate": 3.324e-05,
"loss": 1.7049163818359374,
"mean_token_accuracy": 0.6851269513368606,
"num_tokens": 24778022.0,
"step": 4170
},
{
"entropy": 1.6577355667948723,
"epoch": 7.182248012035246,
"grad_norm": 0.9129847288131714,
"learning_rate": 3.2840000000000004e-05,
"loss": 1.7073640823364258,
"mean_token_accuracy": 0.6768647953867912,
"num_tokens": 24837669.0,
"step": 4180
},
{
"entropy": 1.7049853071570396,
"epoch": 7.199441220717817,
"grad_norm": 0.7545643448829651,
"learning_rate": 3.244e-05,
"loss": 1.754374122619629,
"mean_token_accuracy": 0.6808854278177023,
"num_tokens": 24898991.0,
"step": 4190
},
{
"entropy": 1.6785477355122567,
"epoch": 7.216634429400387,
"grad_norm": 0.8802333474159241,
"learning_rate": 3.2040000000000005e-05,
"loss": 1.6974828720092774,
"mean_token_accuracy": 0.6824289247393608,
"num_tokens": 24957348.0,
"step": 4200
},
{
"entropy": 1.7312355414032936,
"epoch": 7.233827638082957,
"grad_norm": 0.8227038383483887,
"learning_rate": 3.164e-05,
"loss": 1.7645183563232423,
"mean_token_accuracy": 0.6661410238593817,
"num_tokens": 25016658.0,
"step": 4210
},
{
"entropy": 1.8124181643128394,
"epoch": 7.2510208467655275,
"grad_norm": 0.8563106060028076,
"learning_rate": 3.1240000000000006e-05,
"loss": 1.8163776397705078,
"mean_token_accuracy": 0.6610642150044441,
"num_tokens": 25074658.0,
"step": 4220
},
{
"entropy": 1.776869924366474,
"epoch": 7.268214055448098,
"grad_norm": 0.8615058064460754,
"learning_rate": 3.084e-05,
"loss": 1.861563491821289,
"mean_token_accuracy": 0.6624562762677669,
"num_tokens": 25132732.0,
"step": 4230
},
{
"entropy": 1.742109003663063,
"epoch": 7.285407264130669,
"grad_norm": 0.7851050496101379,
"learning_rate": 3.0440000000000003e-05,
"loss": 1.7527351379394531,
"mean_token_accuracy": 0.6712357953190804,
"num_tokens": 25194009.0,
"step": 4240
},
{
"entropy": 1.7356494843959809,
"epoch": 7.302600472813239,
"grad_norm": 0.8842288255691528,
"learning_rate": 3.004e-05,
"loss": 1.8091196060180663,
"mean_token_accuracy": 0.6680308949202299,
"num_tokens": 25250681.0,
"step": 4250
},
{
"entropy": 1.714112138748169,
"epoch": 7.319793681495809,
"grad_norm": 0.8050926923751831,
"learning_rate": 2.964e-05,
"loss": 1.741617774963379,
"mean_token_accuracy": 0.6764710985124112,
"num_tokens": 25307119.0,
"step": 4260
},
{
"entropy": 1.7806825146079064,
"epoch": 7.3369868901783795,
"grad_norm": 0.755797803401947,
"learning_rate": 2.924e-05,
"loss": 1.8448747634887694,
"mean_token_accuracy": 0.6646751999855042,
"num_tokens": 25365721.0,
"step": 4270
},
{
"entropy": 1.7478718511760234,
"epoch": 7.35418009886095,
"grad_norm": 0.8148614764213562,
"learning_rate": 2.8840000000000002e-05,
"loss": 1.8303293228149413,
"mean_token_accuracy": 0.6662985436618328,
"num_tokens": 25423309.0,
"step": 4280
},
{
"entropy": 1.6996045634150505,
"epoch": 7.371373307543521,
"grad_norm": 0.7613778114318848,
"learning_rate": 2.844e-05,
"loss": 1.7077817916870117,
"mean_token_accuracy": 0.679437268525362,
"num_tokens": 25480080.0,
"step": 4290
},
{
"entropy": 1.8055237784981728,
"epoch": 7.38856651622609,
"grad_norm": 0.899900496006012,
"learning_rate": 2.804e-05,
"loss": 1.882634735107422,
"mean_token_accuracy": 0.659589122608304,
"num_tokens": 25538885.0,
"step": 4300
},
{
"entropy": 1.6835025876760483,
"epoch": 7.405759724908661,
"grad_norm": 0.7718909382820129,
"learning_rate": 2.764e-05,
"loss": 1.7145641326904297,
"mean_token_accuracy": 0.6805526971817016,
"num_tokens": 25598830.0,
"step": 4310
},
{
"entropy": 1.7392980232834816,
"epoch": 7.422952933591231,
"grad_norm": 0.7144562005996704,
"learning_rate": 2.724e-05,
"loss": 1.7779796600341797,
"mean_token_accuracy": 0.6709600411355495,
"num_tokens": 25660275.0,
"step": 4320
},
{
"entropy": 1.7193088322877883,
"epoch": 7.440146142273802,
"grad_norm": 0.8038010001182556,
"learning_rate": 2.6840000000000004e-05,
"loss": 1.7928234100341798,
"mean_token_accuracy": 0.6767275612801313,
"num_tokens": 25719958.0,
"step": 4330
},
{
"entropy": 1.7314304433763028,
"epoch": 7.457339350956373,
"grad_norm": 0.7783089876174927,
"learning_rate": 2.6440000000000004e-05,
"loss": 1.7952003479003906,
"mean_token_accuracy": 0.6740467935800553,
"num_tokens": 25776689.0,
"step": 4340
},
{
"entropy": 1.74028614833951,
"epoch": 7.474532559638942,
"grad_norm": 0.8052565455436707,
"learning_rate": 2.6040000000000005e-05,
"loss": 1.7803146362304687,
"mean_token_accuracy": 0.6733121275901794,
"num_tokens": 25837916.0,
"step": 4350
},
{
"entropy": 1.6831192195415496,
"epoch": 7.491725768321513,
"grad_norm": 0.8941977024078369,
"learning_rate": 2.5640000000000002e-05,
"loss": 1.7077743530273437,
"mean_token_accuracy": 0.6749852932989597,
"num_tokens": 25896712.0,
"step": 4360
},
{
"entropy": 1.7840609520673751,
"epoch": 7.508918977004083,
"grad_norm": 0.818671703338623,
"learning_rate": 2.5240000000000002e-05,
"loss": 1.8329656600952149,
"mean_token_accuracy": 0.6679215718060731,
"num_tokens": 25958383.0,
"step": 4370
},
{
"entropy": 1.76528559923172,
"epoch": 7.526112185686654,
"grad_norm": 0.7579294443130493,
"learning_rate": 2.4840000000000003e-05,
"loss": 1.7914703369140625,
"mean_token_accuracy": 0.6695499271154404,
"num_tokens": 26017754.0,
"step": 4380
},
{
"entropy": 1.704708030819893,
"epoch": 7.5433053943692245,
"grad_norm": 0.8200159668922424,
"learning_rate": 2.4440000000000003e-05,
"loss": 1.774311637878418,
"mean_token_accuracy": 0.6739427134394645,
"num_tokens": 26075760.0,
"step": 4390
},
{
"entropy": 1.7540104657411575,
"epoch": 7.560498603051794,
"grad_norm": 0.8373399972915649,
"learning_rate": 2.404e-05,
"loss": 1.796240997314453,
"mean_token_accuracy": 0.6640590511262416,
"num_tokens": 26133858.0,
"step": 4400
},
{
"entropy": 1.754172220826149,
"epoch": 7.577691811734365,
"grad_norm": 0.7368677258491516,
"learning_rate": 2.364e-05,
"loss": 1.8175994873046875,
"mean_token_accuracy": 0.6717667855322361,
"num_tokens": 26197518.0,
"step": 4410
},
{
"entropy": 1.6564558774232865,
"epoch": 7.594885020416935,
"grad_norm": 0.8868939280509949,
"learning_rate": 2.324e-05,
"loss": 1.669070053100586,
"mean_token_accuracy": 0.6839951984584332,
"num_tokens": 26250823.0,
"step": 4420
},
{
"entropy": 1.7594470486044884,
"epoch": 7.612078229099506,
"grad_norm": 0.86412513256073,
"learning_rate": 2.284e-05,
"loss": 1.8095222473144532,
"mean_token_accuracy": 0.666244950518012,
"num_tokens": 26312548.0,
"step": 4430
},
{
"entropy": 1.7646627604961396,
"epoch": 7.6292714377820765,
"grad_norm": 0.7128214836120605,
"learning_rate": 2.244e-05,
"loss": 1.832158660888672,
"mean_token_accuracy": 0.6679420609027147,
"num_tokens": 26376747.0,
"step": 4440
},
{
"entropy": 1.7401177063584328,
"epoch": 7.646464646464646,
"grad_norm": 0.7479432225227356,
"learning_rate": 2.2040000000000002e-05,
"loss": 1.7779264450073242,
"mean_token_accuracy": 0.6710429213941097,
"num_tokens": 26438907.0,
"step": 4450
},
{
"entropy": 1.6960709124803544,
"epoch": 7.663657855147217,
"grad_norm": 0.8182732462882996,
"learning_rate": 2.1640000000000003e-05,
"loss": 1.7709745407104491,
"mean_token_accuracy": 0.6782359674572944,
"num_tokens": 26499840.0,
"step": 4460
},
{
"entropy": 1.8024938970804214,
"epoch": 7.680851063829787,
"grad_norm": 0.8208670020103455,
"learning_rate": 2.124e-05,
"loss": 1.8752277374267579,
"mean_token_accuracy": 0.6610838636755944,
"num_tokens": 26561739.0,
"step": 4470
},
{
"entropy": 1.6679524429142476,
"epoch": 7.698044272512358,
"grad_norm": 0.7669119834899902,
"learning_rate": 2.084e-05,
"loss": 1.6840700149536132,
"mean_token_accuracy": 0.6839361816644669,
"num_tokens": 26618997.0,
"step": 4480
},
{
"entropy": 1.669876104593277,
"epoch": 7.715237481194928,
"grad_norm": 0.8296427130699158,
"learning_rate": 2.044e-05,
"loss": 1.6926704406738282,
"mean_token_accuracy": 0.6837400387972593,
"num_tokens": 26677617.0,
"step": 4490
},
{
"entropy": 1.7478768080472946,
"epoch": 7.732430689877498,
"grad_norm": 0.9231081008911133,
"learning_rate": 2.004e-05,
"loss": 1.8043970108032226,
"mean_token_accuracy": 0.6680058591067791,
"num_tokens": 26735542.0,
"step": 4500
},
{
"entropy": 1.7587152615189552,
"epoch": 7.749623898560069,
"grad_norm": 0.8131846189498901,
"learning_rate": 1.9640000000000002e-05,
"loss": 1.798016357421875,
"mean_token_accuracy": 0.6655693002045154,
"num_tokens": 26796245.0,
"step": 4510
},
{
"entropy": 1.7238084524869919,
"epoch": 7.766817107242639,
"grad_norm": 0.8774024248123169,
"learning_rate": 1.924e-05,
"loss": 1.7398443222045898,
"mean_token_accuracy": 0.6723451249301433,
"num_tokens": 26852843.0,
"step": 4520
},
{
"entropy": 1.8012757793068885,
"epoch": 7.78401031592521,
"grad_norm": 0.881601095199585,
"learning_rate": 1.8840000000000003e-05,
"loss": 1.851584243774414,
"mean_token_accuracy": 0.6612551022320986,
"num_tokens": 26912327.0,
"step": 4530
},
{
"entropy": 1.7035338878631592,
"epoch": 7.8012035246077795,
"grad_norm": 0.8460244536399841,
"learning_rate": 1.8440000000000003e-05,
"loss": 1.7524948120117188,
"mean_token_accuracy": 0.6760960537940264,
"num_tokens": 26971076.0,
"step": 4540
},
{
"entropy": 1.6795054778456688,
"epoch": 7.81839673329035,
"grad_norm": 0.7720061540603638,
"learning_rate": 1.804e-05,
"loss": 1.70491943359375,
"mean_token_accuracy": 0.6768644891679287,
"num_tokens": 27031120.0,
"step": 4550
},
{
"entropy": 1.775759120285511,
"epoch": 7.835589941972921,
"grad_norm": 0.8407703638076782,
"learning_rate": 1.764e-05,
"loss": 1.8208852767944337,
"mean_token_accuracy": 0.6638765886425972,
"num_tokens": 27089926.0,
"step": 4560
},
{
"entropy": 1.7749223679304122,
"epoch": 7.852783150655491,
"grad_norm": 0.8033788204193115,
"learning_rate": 1.724e-05,
"loss": 1.8128280639648438,
"mean_token_accuracy": 0.6697524327784776,
"num_tokens": 27155776.0,
"step": 4570
},
{
"entropy": 1.7019891321659089,
"epoch": 7.869976359338062,
"grad_norm": 0.8756063580513,
"learning_rate": 1.684e-05,
"loss": 1.752833366394043,
"mean_token_accuracy": 0.6720911644399166,
"num_tokens": 27213676.0,
"step": 4580
},
{
"entropy": 1.7089907452464104,
"epoch": 7.8871695680206315,
"grad_norm": 0.8547044396400452,
"learning_rate": 1.644e-05,
"loss": 1.7329090118408204,
"mean_token_accuracy": 0.6730512753129005,
"num_tokens": 27273812.0,
"step": 4590
},
{
"entropy": 1.8000069722533225,
"epoch": 7.904362776703202,
"grad_norm": 0.8191949725151062,
"learning_rate": 1.604e-05,
"loss": 1.8508378982543945,
"mean_token_accuracy": 0.6602330446243286,
"num_tokens": 27334482.0,
"step": 4600
},
{
"entropy": 1.6531485810875892,
"epoch": 7.921555985385773,
"grad_norm": 0.7952063679695129,
"learning_rate": 1.5640000000000003e-05,
"loss": 1.6732818603515625,
"mean_token_accuracy": 0.6840143203735352,
"num_tokens": 27390777.0,
"step": 4610
},
{
"entropy": 1.7451679170131684,
"epoch": 7.938749194068343,
"grad_norm": 0.7736355066299438,
"learning_rate": 1.5240000000000001e-05,
"loss": 1.836105728149414,
"mean_token_accuracy": 0.6631482250988483,
"num_tokens": 27452458.0,
"step": 4620
},
{
"entropy": 1.6219932287931442,
"epoch": 7.955942402750914,
"grad_norm": 0.7429597973823547,
"learning_rate": 1.4840000000000002e-05,
"loss": 1.6252763748168946,
"mean_token_accuracy": 0.6922797068953515,
"num_tokens": 27510793.0,
"step": 4630
},
{
"entropy": 1.7097622737288476,
"epoch": 7.9731356114334835,
"grad_norm": 0.7546749114990234,
"learning_rate": 1.444e-05,
"loss": 1.7529830932617188,
"mean_token_accuracy": 0.6756818048655987,
"num_tokens": 27570434.0,
"step": 4640
},
{
"entropy": 1.7681476891040802,
"epoch": 7.990328820116054,
"grad_norm": 0.8919919729232788,
"learning_rate": 1.4040000000000001e-05,
"loss": 1.8469413757324218,
"mean_token_accuracy": 0.6651480123400688,
"num_tokens": 27632017.0,
"step": 4650
},
{
"entropy": 1.7464849283168842,
"epoch": 8.006877283473028,
"grad_norm": 0.8629288077354431,
"learning_rate": 1.364e-05,
"loss": 1.7770162582397462,
"mean_token_accuracy": 0.6717489861048661,
"num_tokens": 27687721.0,
"step": 4660
},
{
"entropy": 1.733792708069086,
"epoch": 8.024070492155598,
"grad_norm": 0.8012450337409973,
"learning_rate": 1.324e-05,
"loss": 1.7535259246826171,
"mean_token_accuracy": 0.6781957261264324,
"num_tokens": 27748609.0,
"step": 4670
},
{
"entropy": 1.673891542851925,
"epoch": 8.041263700838169,
"grad_norm": 0.8763530850410461,
"learning_rate": 1.2839999999999999e-05,
"loss": 1.7353546142578125,
"mean_token_accuracy": 0.6773874297738075,
"num_tokens": 27805200.0,
"step": 4680
},
{
"entropy": 1.6245143353939056,
"epoch": 8.05845690952074,
"grad_norm": 0.7880796194076538,
"learning_rate": 1.244e-05,
"loss": 1.6489152908325195,
"mean_token_accuracy": 0.6891307681798935,
"num_tokens": 27866189.0,
"step": 4690
},
{
"entropy": 1.7772031486034394,
"epoch": 8.07565011820331,
"grad_norm": 0.894481360912323,
"learning_rate": 1.204e-05,
"loss": 1.8237220764160156,
"mean_token_accuracy": 0.6645158022642136,
"num_tokens": 27929040.0,
"step": 4700
},
{
"entropy": 1.6911936491727828,
"epoch": 8.09284332688588,
"grad_norm": 0.8212205171585083,
"learning_rate": 1.164e-05,
"loss": 1.718613624572754,
"mean_token_accuracy": 0.6778515942394734,
"num_tokens": 27989259.0,
"step": 4710
},
{
"entropy": 1.7341958984732628,
"epoch": 8.110036535568451,
"grad_norm": 0.8757619261741638,
"learning_rate": 1.124e-05,
"loss": 1.83496150970459,
"mean_token_accuracy": 0.67105031311512,
"num_tokens": 28051037.0,
"step": 4720
},
{
"entropy": 1.6540620133280755,
"epoch": 8.127229744251022,
"grad_norm": 0.6871177554130554,
"learning_rate": 1.084e-05,
"loss": 1.6868721008300782,
"mean_token_accuracy": 0.6824644193053245,
"num_tokens": 28117218.0,
"step": 4730
},
{
"entropy": 1.7760244339704514,
"epoch": 8.144422952933592,
"grad_norm": 0.8672593832015991,
"learning_rate": 1.0440000000000002e-05,
"loss": 1.8467548370361329,
"mean_token_accuracy": 0.6605620160698891,
"num_tokens": 28176643.0,
"step": 4740
},
{
"entropy": 1.6998422421514987,
"epoch": 8.16161616161616,
"grad_norm": 0.9853087663650513,
"learning_rate": 1.004e-05,
"loss": 1.7283611297607422,
"mean_token_accuracy": 0.6775359824299813,
"num_tokens": 28234550.0,
"step": 4750
},
{
"entropy": 1.7665151111781596,
"epoch": 8.178809370298731,
"grad_norm": 0.8272210955619812,
"learning_rate": 9.640000000000001e-06,
"loss": 1.8442218780517579,
"mean_token_accuracy": 0.6675057601183653,
"num_tokens": 28292004.0,
"step": 4760
},
{
"entropy": 1.7351939789950848,
"epoch": 8.196002578981302,
"grad_norm": 0.8758223652839661,
"learning_rate": 9.24e-06,
"loss": 1.7823253631591798,
"mean_token_accuracy": 0.6717655852437019,
"num_tokens": 28351089.0,
"step": 4770
},
{
"entropy": 1.7320286817848682,
"epoch": 8.213195787663873,
"grad_norm": 0.8538162708282471,
"learning_rate": 8.840000000000002e-06,
"loss": 1.758108139038086,
"mean_token_accuracy": 0.6750058546662331,
"num_tokens": 28411108.0,
"step": 4780
},
{
"entropy": 1.7250167533755303,
"epoch": 8.230388996346443,
"grad_norm": 0.8055081963539124,
"learning_rate": 8.44e-06,
"loss": 1.7342365264892579,
"mean_token_accuracy": 0.6727670766413212,
"num_tokens": 28469910.0,
"step": 4790
},
{
"entropy": 1.6715928614139557,
"epoch": 8.247582205029014,
"grad_norm": 0.8282851576805115,
"learning_rate": 8.040000000000001e-06,
"loss": 1.7284685134887696,
"mean_token_accuracy": 0.6803247310221195,
"num_tokens": 28528732.0,
"step": 4800
},
{
"entropy": 1.7717369854450227,
"epoch": 8.264775413711584,
"grad_norm": 0.7199074029922485,
"learning_rate": 7.64e-06,
"loss": 1.8089387893676758,
"mean_token_accuracy": 0.6684400778263807,
"num_tokens": 28591231.0,
"step": 4810
},
{
"entropy": 1.6829568967223167,
"epoch": 8.281968622394155,
"grad_norm": 0.8212400674819946,
"learning_rate": 7.240000000000001e-06,
"loss": 1.6901424407958985,
"mean_token_accuracy": 0.6812582932412624,
"num_tokens": 28651538.0,
"step": 4820
},
{
"entropy": 1.7792557999491692,
"epoch": 8.299161831076725,
"grad_norm": 0.8251553773880005,
"learning_rate": 6.840000000000001e-06,
"loss": 1.8440101623535157,
"mean_token_accuracy": 0.6635224357247352,
"num_tokens": 28713818.0,
"step": 4830
},
{
"entropy": 1.6888219453394413,
"epoch": 8.316355039759294,
"grad_norm": 0.799067497253418,
"learning_rate": 6.44e-06,
"loss": 1.7452951431274415,
"mean_token_accuracy": 0.6766478583216667,
"num_tokens": 28771713.0,
"step": 4840
},
{
"entropy": 1.6663143932819366,
"epoch": 8.333548248441865,
"grad_norm": 0.7468796968460083,
"learning_rate": 6.040000000000001e-06,
"loss": 1.6975286483764649,
"mean_token_accuracy": 0.6818139903247357,
"num_tokens": 28833584.0,
"step": 4850
},
{
"entropy": 1.736840507388115,
"epoch": 8.350741457124435,
"grad_norm": 0.9168211817741394,
"learning_rate": 5.64e-06,
"loss": 1.8019765853881835,
"mean_token_accuracy": 0.6729365028440952,
"num_tokens": 28891158.0,
"step": 4860
},
{
"entropy": 1.7159839145839215,
"epoch": 8.367934665807006,
"grad_norm": 0.8348814249038696,
"learning_rate": 5.240000000000001e-06,
"loss": 1.7910118103027344,
"mean_token_accuracy": 0.67631860896945,
"num_tokens": 28948026.0,
"step": 4870
},
{
"entropy": 1.7169093780219555,
"epoch": 8.385127874489577,
"grad_norm": 0.8493881821632385,
"learning_rate": 4.84e-06,
"loss": 1.7167430877685548,
"mean_token_accuracy": 0.6753393478691578,
"num_tokens": 29005197.0,
"step": 4880
},
{
"entropy": 1.6801239594817161,
"epoch": 8.402321083172147,
"grad_norm": 0.8069011569023132,
"learning_rate": 4.440000000000001e-06,
"loss": 1.6674100875854492,
"mean_token_accuracy": 0.681441531330347,
"num_tokens": 29062454.0,
"step": 4890
},
{
"entropy": 1.7267012923955918,
"epoch": 8.419514291854718,
"grad_norm": 0.8063756823539734,
"learning_rate": 4.04e-06,
"loss": 1.7544673919677733,
"mean_token_accuracy": 0.6745367147028446,
"num_tokens": 29121055.0,
"step": 4900
},
{
"entropy": 1.6062462359666825,
"epoch": 8.436707500537288,
"grad_norm": 0.8285024762153625,
"learning_rate": 3.6400000000000003e-06,
"loss": 1.6273128509521484,
"mean_token_accuracy": 0.690464211255312,
"num_tokens": 29176963.0,
"step": 4910
},
{
"entropy": 1.7958560451865195,
"epoch": 8.453900709219859,
"grad_norm": 0.8202657103538513,
"learning_rate": 3.24e-06,
"loss": 1.8311897277832032,
"mean_token_accuracy": 0.661663169786334,
"num_tokens": 29235880.0,
"step": 4920
},
{
"entropy": 1.665907260030508,
"epoch": 8.47109391790243,
"grad_norm": 0.8672494292259216,
"learning_rate": 2.8400000000000003e-06,
"loss": 1.6878423690795898,
"mean_token_accuracy": 0.6819184564054013,
"num_tokens": 29295823.0,
"step": 4930
},
{
"entropy": 1.7426866918802262,
"epoch": 8.488287126584998,
"grad_norm": 0.8398126363754272,
"learning_rate": 2.4400000000000004e-06,
"loss": 1.810443115234375,
"mean_token_accuracy": 0.6639036998152733,
"num_tokens": 29355386.0,
"step": 4940
},
{
"entropy": 1.6938614405691623,
"epoch": 8.505480335267569,
"grad_norm": 0.7652584314346313,
"learning_rate": 2.0400000000000004e-06,
"loss": 1.7690727233886718,
"mean_token_accuracy": 0.6737098075449467,
"num_tokens": 29414966.0,
"step": 4950
},
{
"entropy": 1.7538506165146828,
"epoch": 8.52267354395014,
"grad_norm": 0.8389163017272949,
"learning_rate": 1.6400000000000002e-06,
"loss": 1.8067062377929688,
"mean_token_accuracy": 0.6728679880499839,
"num_tokens": 29472960.0,
"step": 4960
},
{
"entropy": 1.7591105610132218,
"epoch": 8.53986675263271,
"grad_norm": 0.8280366063117981,
"learning_rate": 1.24e-06,
"loss": 1.7855098724365235,
"mean_token_accuracy": 0.6670263484120369,
"num_tokens": 29531300.0,
"step": 4970
},
{
"entropy": 1.6825189530849456,
"epoch": 8.55705996131528,
"grad_norm": 0.8177328109741211,
"learning_rate": 8.4e-07,
"loss": 1.731926727294922,
"mean_token_accuracy": 0.6818420931696891,
"num_tokens": 29591290.0,
"step": 4980
},
{
"entropy": 1.7112577512860299,
"epoch": 8.574253169997851,
"grad_norm": 0.8413036465644836,
"learning_rate": 4.4e-07,
"loss": 1.7446353912353516,
"mean_token_accuracy": 0.6750271447002888,
"num_tokens": 29646086.0,
"step": 4990
},
{
"entropy": 1.7419164210557938,
"epoch": 8.591446378680422,
"grad_norm": 0.9462088346481323,
"learning_rate": 4e-08,
"loss": 1.7870445251464844,
"mean_token_accuracy": 0.666933435574174,
"num_tokens": 29704815.0,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.438188209453138e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}