unity-coder-30b / trainer_state.json
vishnuOI's picture
Upload folder using huggingface_hub
4e3ad49 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 7839,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.0787046998739243,
"epoch": 0.003827018752391887,
"grad_norm": 0.37200024724006653,
"learning_rate": 4.591836734693878e-06,
"loss": 2.590205955505371,
"mean_token_accuracy": 0.5478626236319541,
"num_tokens": 43996.0,
"step": 10
},
{
"entropy": 1.1275236845016479,
"epoch": 0.007654037504783774,
"grad_norm": 0.4282406270503998,
"learning_rate": 9.693877551020408e-06,
"loss": 2.732739067077637,
"mean_token_accuracy": 0.532574575394392,
"num_tokens": 84448.0,
"step": 20
},
{
"entropy": 1.1098140180110931,
"epoch": 0.011481056257175661,
"grad_norm": 0.45254817605018616,
"learning_rate": 1.479591836734694e-05,
"loss": 2.595915603637695,
"mean_token_accuracy": 0.5385765254497528,
"num_tokens": 127136.0,
"step": 30
},
{
"entropy": 1.1594089552760125,
"epoch": 0.015308075009567547,
"grad_norm": 0.26816287636756897,
"learning_rate": 1.989795918367347e-05,
"loss": 2.3643749237060545,
"mean_token_accuracy": 0.5597088657319546,
"num_tokens": 172549.0,
"step": 40
},
{
"entropy": 1.2817068248987198,
"epoch": 0.019135093761959432,
"grad_norm": 0.19904343783855438,
"learning_rate": 2.5e-05,
"loss": 2.1694852828979494,
"mean_token_accuracy": 0.5605865910649299,
"num_tokens": 218317.0,
"step": 50
},
{
"entropy": 1.356394973397255,
"epoch": 0.022962112514351322,
"grad_norm": 0.21708081662654877,
"learning_rate": 3.0102040816326533e-05,
"loss": 2.0824514389038087,
"mean_token_accuracy": 0.584179612249136,
"num_tokens": 255107.0,
"step": 60
},
{
"entropy": 1.4309053242206573,
"epoch": 0.026789131266743208,
"grad_norm": 0.09860006719827652,
"learning_rate": 3.520408163265306e-05,
"loss": 1.777943229675293,
"mean_token_accuracy": 0.6243460461497307,
"num_tokens": 298973.0,
"step": 70
},
{
"entropy": 1.4572882741689681,
"epoch": 0.030616150019135095,
"grad_norm": 0.07813975214958191,
"learning_rate": 4.0306122448979596e-05,
"loss": 1.7085393905639648,
"mean_token_accuracy": 0.641383134573698,
"num_tokens": 343287.0,
"step": 80
},
{
"entropy": 1.5205285474658012,
"epoch": 0.03444316877152698,
"grad_norm": 0.08015387505292892,
"learning_rate": 4.5408163265306124e-05,
"loss": 1.680305290222168,
"mean_token_accuracy": 0.6431211873888969,
"num_tokens": 376882.0,
"step": 90
},
{
"entropy": 1.529197846353054,
"epoch": 0.038270187523918864,
"grad_norm": 0.1533895879983902,
"learning_rate": 5.051020408163265e-05,
"loss": 1.538798999786377,
"mean_token_accuracy": 0.6604955434799195,
"num_tokens": 414084.0,
"step": 100
},
{
"entropy": 1.454574093222618,
"epoch": 0.04209720627631076,
"grad_norm": 0.08893708884716034,
"learning_rate": 5.561224489795919e-05,
"loss": 1.482753372192383,
"mean_token_accuracy": 0.6748222857713699,
"num_tokens": 451489.0,
"step": 110
},
{
"entropy": 1.4169663548469544,
"epoch": 0.045924225028702644,
"grad_norm": 0.10994797945022583,
"learning_rate": 6.0714285714285715e-05,
"loss": 1.4192767143249512,
"mean_token_accuracy": 0.6815833821892738,
"num_tokens": 492001.0,
"step": 120
},
{
"entropy": 1.3825553365051746,
"epoch": 0.04975124378109453,
"grad_norm": 0.09565065056085587,
"learning_rate": 6.581632653061225e-05,
"loss": 1.4434242248535156,
"mean_token_accuracy": 0.6830388471484184,
"num_tokens": 534229.0,
"step": 130
},
{
"entropy": 1.3660246580839157,
"epoch": 0.053578262533486416,
"grad_norm": 0.09277962148189545,
"learning_rate": 7.091836734693877e-05,
"loss": 1.3881919860839844,
"mean_token_accuracy": 0.6916770502924919,
"num_tokens": 573333.0,
"step": 140
},
{
"entropy": 1.3041961744427681,
"epoch": 0.0574052812858783,
"grad_norm": 0.14179331064224243,
"learning_rate": 7.60204081632653e-05,
"loss": 1.3051923751831054,
"mean_token_accuracy": 0.7079917460680007,
"num_tokens": 612198.0,
"step": 150
},
{
"entropy": 1.3157715648412704,
"epoch": 0.06123230003827019,
"grad_norm": 0.11061020940542221,
"learning_rate": 8.112244897959184e-05,
"loss": 1.3127019882202149,
"mean_token_accuracy": 0.6986684441566468,
"num_tokens": 654309.0,
"step": 160
},
{
"entropy": 1.2894421368837357,
"epoch": 0.06505931879066207,
"grad_norm": 0.12903185188770294,
"learning_rate": 8.622448979591838e-05,
"loss": 1.3279677391052247,
"mean_token_accuracy": 0.7028318449854851,
"num_tokens": 694076.0,
"step": 170
},
{
"entropy": 1.2676123276352882,
"epoch": 0.06888633754305395,
"grad_norm": 0.10816285014152527,
"learning_rate": 9.13265306122449e-05,
"loss": 1.3254461288452148,
"mean_token_accuracy": 0.7063754379749299,
"num_tokens": 733806.0,
"step": 180
},
{
"entropy": 1.0988808318972587,
"epoch": 0.07271335629544584,
"grad_norm": 0.08472651243209839,
"learning_rate": 9.642857142857143e-05,
"loss": 1.1510659217834474,
"mean_token_accuracy": 0.7411173984408379,
"num_tokens": 771622.0,
"step": 190
},
{
"entropy": 1.2284281507134438,
"epoch": 0.07654037504783773,
"grad_norm": 0.10897475481033325,
"learning_rate": 0.00010153061224489797,
"loss": 1.2724005699157714,
"mean_token_accuracy": 0.7178638219833374,
"num_tokens": 813167.0,
"step": 200
},
{
"entropy": 1.2159623876214027,
"epoch": 0.08036739380022963,
"grad_norm": 0.12170197069644928,
"learning_rate": 0.0001066326530612245,
"loss": 1.26397647857666,
"mean_token_accuracy": 0.7138236090540886,
"num_tokens": 856210.0,
"step": 210
},
{
"entropy": 1.2309471271932124,
"epoch": 0.08419441255262151,
"grad_norm": 0.08406181633472443,
"learning_rate": 0.00011173469387755102,
"loss": 1.3110918998718262,
"mean_token_accuracy": 0.7173333883285522,
"num_tokens": 893432.0,
"step": 220
},
{
"entropy": 1.2228039711713792,
"epoch": 0.0880214313050134,
"grad_norm": 0.10588081181049347,
"learning_rate": 0.00011683673469387754,
"loss": 1.2445635795593262,
"mean_token_accuracy": 0.7160170584917068,
"num_tokens": 931919.0,
"step": 230
},
{
"entropy": 1.138296764343977,
"epoch": 0.09184845005740529,
"grad_norm": 0.110760398209095,
"learning_rate": 0.00012193877551020409,
"loss": 1.2083134651184082,
"mean_token_accuracy": 0.7306654810905456,
"num_tokens": 976039.0,
"step": 240
},
{
"entropy": 1.1915819495916367,
"epoch": 0.09567546880979717,
"grad_norm": 0.15018120408058167,
"learning_rate": 0.00012704081632653063,
"loss": 1.2230369567871093,
"mean_token_accuracy": 0.719833716750145,
"num_tokens": 1019312.0,
"step": 250
},
{
"entropy": 1.2996815636754036,
"epoch": 0.09950248756218906,
"grad_norm": 0.10838313400745392,
"learning_rate": 0.00013214285714285715,
"loss": 1.3142367362976075,
"mean_token_accuracy": 0.7013067752122879,
"num_tokens": 1062901.0,
"step": 260
},
{
"entropy": 1.149668525904417,
"epoch": 0.10332950631458095,
"grad_norm": 0.09911312907934189,
"learning_rate": 0.00013724489795918367,
"loss": 1.1573083877563477,
"mean_token_accuracy": 0.728775355219841,
"num_tokens": 1102630.0,
"step": 270
},
{
"entropy": 1.271340447664261,
"epoch": 0.10715652506697283,
"grad_norm": 0.09665267169475555,
"learning_rate": 0.00014234693877551022,
"loss": 1.341374111175537,
"mean_token_accuracy": 0.7027333110570908,
"num_tokens": 1142262.0,
"step": 280
},
{
"entropy": 1.225122657418251,
"epoch": 0.11098354381936472,
"grad_norm": 0.13240815699100494,
"learning_rate": 0.00014744897959183674,
"loss": 1.2614737510681153,
"mean_token_accuracy": 0.7198452442884445,
"num_tokens": 1182386.0,
"step": 290
},
{
"entropy": 1.2733432039618493,
"epoch": 0.1148105625717566,
"grad_norm": 0.10651895403862,
"learning_rate": 0.00015255102040816326,
"loss": 1.2933347702026368,
"mean_token_accuracy": 0.7025195896625519,
"num_tokens": 1222805.0,
"step": 300
},
{
"entropy": 1.1543171763420106,
"epoch": 0.11863758132414849,
"grad_norm": 0.08577804267406464,
"learning_rate": 0.00015765306122448978,
"loss": 1.197078323364258,
"mean_token_accuracy": 0.7300360783934593,
"num_tokens": 1263121.0,
"step": 310
},
{
"entropy": 1.1908820882439612,
"epoch": 0.12246460007654038,
"grad_norm": 0.11925600469112396,
"learning_rate": 0.00016275510204081633,
"loss": 1.2366827964782714,
"mean_token_accuracy": 0.7277334719896317,
"num_tokens": 1296346.0,
"step": 320
},
{
"entropy": 1.1711702406406403,
"epoch": 0.12629161882893225,
"grad_norm": 0.12476309388875961,
"learning_rate": 0.00016785714285714288,
"loss": 1.2408350944519042,
"mean_token_accuracy": 0.7282937213778495,
"num_tokens": 1335547.0,
"step": 330
},
{
"entropy": 1.1748667433857918,
"epoch": 0.13011863758132414,
"grad_norm": 0.08671289682388306,
"learning_rate": 0.0001729591836734694,
"loss": 1.199030303955078,
"mean_token_accuracy": 0.7312668621540069,
"num_tokens": 1377093.0,
"step": 340
},
{
"entropy": 1.153764547407627,
"epoch": 0.13394565633371602,
"grad_norm": 0.10536976903676987,
"learning_rate": 0.00017806122448979592,
"loss": 1.201906967163086,
"mean_token_accuracy": 0.7266524419188499,
"num_tokens": 1417236.0,
"step": 350
},
{
"entropy": 1.2303058430552483,
"epoch": 0.1377726750861079,
"grad_norm": 0.09069176018238068,
"learning_rate": 0.00018316326530612247,
"loss": 1.2867681503295898,
"mean_token_accuracy": 0.7198954582214355,
"num_tokens": 1460306.0,
"step": 360
},
{
"entropy": 1.1944106668233871,
"epoch": 0.1415996938384998,
"grad_norm": 0.08539925515651703,
"learning_rate": 0.000188265306122449,
"loss": 1.2451179504394532,
"mean_token_accuracy": 0.7186401098966598,
"num_tokens": 1505966.0,
"step": 370
},
{
"entropy": 1.2239032357931137,
"epoch": 0.14542671259089168,
"grad_norm": 0.08434446156024933,
"learning_rate": 0.0001933673469387755,
"loss": 1.2803629875183105,
"mean_token_accuracy": 0.7178368359804154,
"num_tokens": 1544132.0,
"step": 380
},
{
"entropy": 1.1901779979467393,
"epoch": 0.14925373134328357,
"grad_norm": 0.08662886172533035,
"learning_rate": 0.00019846938775510203,
"loss": 1.2282370567321776,
"mean_token_accuracy": 0.7221132159233093,
"num_tokens": 1587355.0,
"step": 390
},
{
"entropy": 1.106569343805313,
"epoch": 0.15308075009567546,
"grad_norm": 0.13149450719356537,
"learning_rate": 0.00019981200483416142,
"loss": 1.136556625366211,
"mean_token_accuracy": 0.7384186327457428,
"num_tokens": 1624638.0,
"step": 400
},
{
"entropy": 1.0393452920019626,
"epoch": 0.15690776884806737,
"grad_norm": 0.13831719756126404,
"learning_rate": 0.00019954344031153484,
"loss": 1.074817180633545,
"mean_token_accuracy": 0.7567356958985328,
"num_tokens": 1665215.0,
"step": 410
},
{
"entropy": 1.1298492863774299,
"epoch": 0.16073478760045926,
"grad_norm": 0.10244159400463104,
"learning_rate": 0.0001992748757889083,
"loss": 1.1741769790649415,
"mean_token_accuracy": 0.7414689466357232,
"num_tokens": 1701543.0,
"step": 420
},
{
"entropy": 1.1646860882639885,
"epoch": 0.16456180635285114,
"grad_norm": 0.09356453269720078,
"learning_rate": 0.00019900631126628174,
"loss": 1.2229989051818848,
"mean_token_accuracy": 0.7278152450919151,
"num_tokens": 1744719.0,
"step": 430
},
{
"entropy": 1.1580850452184677,
"epoch": 0.16838882510524303,
"grad_norm": 0.08699047565460205,
"learning_rate": 0.00019873774674365518,
"loss": 1.1999470710754394,
"mean_token_accuracy": 0.7270642057061195,
"num_tokens": 1787999.0,
"step": 440
},
{
"entropy": 1.105088683962822,
"epoch": 0.17221584385763491,
"grad_norm": 0.10489863902330399,
"learning_rate": 0.0001984691822210286,
"loss": 1.123628807067871,
"mean_token_accuracy": 0.7375599846243859,
"num_tokens": 1825171.0,
"step": 450
},
{
"entropy": 1.0744315460324287,
"epoch": 0.1760428626100268,
"grad_norm": 0.10170256346464157,
"learning_rate": 0.00019820061769840205,
"loss": 1.1449885368347168,
"mean_token_accuracy": 0.7466330513358116,
"num_tokens": 1863245.0,
"step": 460
},
{
"entropy": 1.021899376064539,
"epoch": 0.1798698813624187,
"grad_norm": 0.09046658873558044,
"learning_rate": 0.0001979320531757755,
"loss": 1.0087275505065918,
"mean_token_accuracy": 0.7654190301895142,
"num_tokens": 1902205.0,
"step": 470
},
{
"entropy": 1.1595304682850838,
"epoch": 0.18369690011481057,
"grad_norm": 0.09361740201711655,
"learning_rate": 0.00019766348865314892,
"loss": 1.2238757133483886,
"mean_token_accuracy": 0.7275362908840179,
"num_tokens": 1943827.0,
"step": 480
},
{
"entropy": 1.1023890599608421,
"epoch": 0.18752391886720246,
"grad_norm": 0.08471602201461792,
"learning_rate": 0.00019739492413052236,
"loss": 1.1567111015319824,
"mean_token_accuracy": 0.742167092859745,
"num_tokens": 1982630.0,
"step": 490
},
{
"entropy": 1.1427962884306908,
"epoch": 0.19135093761959435,
"grad_norm": 0.1170063391327858,
"learning_rate": 0.0001971263596078958,
"loss": 1.208080005645752,
"mean_token_accuracy": 0.7308674260973931,
"num_tokens": 2021311.0,
"step": 500
},
{
"entropy": 1.0002792343497275,
"epoch": 0.19517795637198623,
"grad_norm": 0.10567828267812729,
"learning_rate": 0.00019685779508526926,
"loss": 1.026076889038086,
"mean_token_accuracy": 0.7641370877623558,
"num_tokens": 2055396.0,
"step": 510
},
{
"entropy": 1.1096693962812423,
"epoch": 0.19900497512437812,
"grad_norm": 0.08597096055746078,
"learning_rate": 0.00019658923056264268,
"loss": 1.1631651878356934,
"mean_token_accuracy": 0.7389342650771141,
"num_tokens": 2097931.0,
"step": 520
},
{
"entropy": 1.04201333373785,
"epoch": 0.20283199387677,
"grad_norm": 0.1260094940662384,
"learning_rate": 0.00019632066604001613,
"loss": 1.0765873908996582,
"mean_token_accuracy": 0.7539548426866531,
"num_tokens": 2137657.0,
"step": 530
},
{
"entropy": 1.0748623803257942,
"epoch": 0.2066590126291619,
"grad_norm": 0.0845552608370781,
"learning_rate": 0.00019605210151738955,
"loss": 1.1419748306274413,
"mean_token_accuracy": 0.7481048628687859,
"num_tokens": 2177343.0,
"step": 540
},
{
"entropy": 1.0970528617501258,
"epoch": 0.21048603138155378,
"grad_norm": 0.07105763256549835,
"learning_rate": 0.000195783536994763,
"loss": 1.1284755706787108,
"mean_token_accuracy": 0.747196614742279,
"num_tokens": 2211423.0,
"step": 550
},
{
"entropy": 1.0551751986145974,
"epoch": 0.21431305013394567,
"grad_norm": 0.12569685280323029,
"learning_rate": 0.00019551497247213644,
"loss": 1.1170102119445802,
"mean_token_accuracy": 0.7472440049052238,
"num_tokens": 2249350.0,
"step": 560
},
{
"entropy": 1.0562219873070717,
"epoch": 0.21814006888633755,
"grad_norm": 0.08452208340167999,
"learning_rate": 0.0001952464079495099,
"loss": 1.0921730995178223,
"mean_token_accuracy": 0.7492805704474449,
"num_tokens": 2289564.0,
"step": 570
},
{
"entropy": 1.022915106266737,
"epoch": 0.22196708763872944,
"grad_norm": 0.08168510347604752,
"learning_rate": 0.00019497784342688333,
"loss": 1.064980697631836,
"mean_token_accuracy": 0.7562290355563164,
"num_tokens": 2332592.0,
"step": 580
},
{
"entropy": 1.1041950330138206,
"epoch": 0.22579410639112132,
"grad_norm": 0.07596516609191895,
"learning_rate": 0.00019470927890425675,
"loss": 1.1410536766052246,
"mean_token_accuracy": 0.7350378915667534,
"num_tokens": 2380215.0,
"step": 590
},
{
"entropy": 1.1252056039869784,
"epoch": 0.2296211251435132,
"grad_norm": 0.07240597158670425,
"learning_rate": 0.0001944407143816302,
"loss": 1.1348044395446777,
"mean_token_accuracy": 0.7389790266752243,
"num_tokens": 2417819.0,
"step": 600
},
{
"entropy": 1.0117394506931305,
"epoch": 0.2334481438959051,
"grad_norm": 0.08603253215551376,
"learning_rate": 0.00019417214985900362,
"loss": 1.0533303260803222,
"mean_token_accuracy": 0.7578112691640854,
"num_tokens": 2459072.0,
"step": 610
},
{
"entropy": 1.0193642653524875,
"epoch": 0.23727516264829698,
"grad_norm": 0.08400722593069077,
"learning_rate": 0.00019390358533637707,
"loss": 1.0955985069274903,
"mean_token_accuracy": 0.7598358646035195,
"num_tokens": 2497901.0,
"step": 620
},
{
"entropy": 1.0488237984478475,
"epoch": 0.24110218140068887,
"grad_norm": 0.07221511751413345,
"learning_rate": 0.00019363502081375052,
"loss": 1.151495361328125,
"mean_token_accuracy": 0.7541669681668282,
"num_tokens": 2536530.0,
"step": 630
},
{
"entropy": 1.0595403373241425,
"epoch": 0.24492920015308076,
"grad_norm": 0.10258961468935013,
"learning_rate": 0.00019336645629112396,
"loss": 1.0888574600219727,
"mean_token_accuracy": 0.7474928990006446,
"num_tokens": 2571599.0,
"step": 640
},
{
"entropy": 1.0924376487731933,
"epoch": 0.24875621890547264,
"grad_norm": 0.0751282125711441,
"learning_rate": 0.0001930978917684974,
"loss": 1.1468082427978517,
"mean_token_accuracy": 0.7465573191642761,
"num_tokens": 2612401.0,
"step": 650
},
{
"entropy": 0.9765479557216168,
"epoch": 0.2525832376578645,
"grad_norm": 0.09039046615362167,
"learning_rate": 0.00019282932724587083,
"loss": 1.054959201812744,
"mean_token_accuracy": 0.7649626806378365,
"num_tokens": 2652042.0,
"step": 660
},
{
"entropy": 1.0833388939499855,
"epoch": 0.2564102564102564,
"grad_norm": 0.08905521035194397,
"learning_rate": 0.00019256076272324425,
"loss": 1.0937233924865724,
"mean_token_accuracy": 0.7510278865694999,
"num_tokens": 2692211.0,
"step": 670
},
{
"entropy": 1.0478161230683327,
"epoch": 0.2602372751626483,
"grad_norm": 0.09634676575660706,
"learning_rate": 0.0001922921982006177,
"loss": 1.1077991485595704,
"mean_token_accuracy": 0.7490576148033142,
"num_tokens": 2734562.0,
"step": 680
},
{
"entropy": 1.13061283826828,
"epoch": 0.26406429391504016,
"grad_norm": 0.07757367938756943,
"learning_rate": 0.00019202363367799114,
"loss": 1.1330499649047852,
"mean_token_accuracy": 0.7350772902369499,
"num_tokens": 2778990.0,
"step": 690
},
{
"entropy": 1.0450415380299092,
"epoch": 0.26789131266743205,
"grad_norm": 0.06570328027009964,
"learning_rate": 0.0001917550691553646,
"loss": 1.1425201416015625,
"mean_token_accuracy": 0.7551864832639694,
"num_tokens": 2814475.0,
"step": 700
},
{
"entropy": 1.0114438571035862,
"epoch": 0.27171833141982393,
"grad_norm": 0.11020322889089584,
"learning_rate": 0.00019148650463273804,
"loss": 1.0655290603637695,
"mean_token_accuracy": 0.7622251763939858,
"num_tokens": 2847751.0,
"step": 710
},
{
"entropy": 1.1008106037974357,
"epoch": 0.2755453501722158,
"grad_norm": 0.07282241433858871,
"learning_rate": 0.00019121794011011146,
"loss": 1.139061450958252,
"mean_token_accuracy": 0.7368278667330742,
"num_tokens": 2889553.0,
"step": 720
},
{
"entropy": 1.0155851803719997,
"epoch": 0.2793723689246077,
"grad_norm": 0.11799076199531555,
"learning_rate": 0.0001909493755874849,
"loss": 1.0634971618652345,
"mean_token_accuracy": 0.7578275159001351,
"num_tokens": 2926860.0,
"step": 730
},
{
"entropy": 1.0430648550391197,
"epoch": 0.2831993876769996,
"grad_norm": 0.08702066540718079,
"learning_rate": 0.00019068081106485832,
"loss": 1.0851760864257813,
"mean_token_accuracy": 0.7573095709085464,
"num_tokens": 2964029.0,
"step": 740
},
{
"entropy": 1.0908455178141594,
"epoch": 0.2870264064293915,
"grad_norm": 0.06593967229127884,
"learning_rate": 0.00019041224654223177,
"loss": 1.0929256439208985,
"mean_token_accuracy": 0.7440102145075798,
"num_tokens": 3004528.0,
"step": 750
},
{
"entropy": 0.971546346694231,
"epoch": 0.29085342518178336,
"grad_norm": 0.08857332915067673,
"learning_rate": 0.00019014368201960522,
"loss": 1.0575197219848633,
"mean_token_accuracy": 0.7712547823786735,
"num_tokens": 3041203.0,
"step": 760
},
{
"entropy": 1.0496620319783687,
"epoch": 0.29468044393417525,
"grad_norm": 0.07172030210494995,
"learning_rate": 0.00018987511749697867,
"loss": 1.100113582611084,
"mean_token_accuracy": 0.747462597489357,
"num_tokens": 3086263.0,
"step": 770
},
{
"entropy": 1.0853942684829234,
"epoch": 0.29850746268656714,
"grad_norm": 0.0861373096704483,
"learning_rate": 0.0001896065529743521,
"loss": 1.1116449356079101,
"mean_token_accuracy": 0.7485424548387527,
"num_tokens": 3126741.0,
"step": 780
},
{
"entropy": 1.039051755145192,
"epoch": 0.302334481438959,
"grad_norm": 0.07344193756580353,
"learning_rate": 0.00018933798845172553,
"loss": 1.092859935760498,
"mean_token_accuracy": 0.7524166733026505,
"num_tokens": 3164039.0,
"step": 790
},
{
"entropy": 1.021885236352682,
"epoch": 0.3061615001913509,
"grad_norm": 0.09843221306800842,
"learning_rate": 0.00018906942392909895,
"loss": 1.0813950538635253,
"mean_token_accuracy": 0.7612547591328621,
"num_tokens": 3202455.0,
"step": 800
},
{
"entropy": 1.0329275727272034,
"epoch": 0.3099885189437428,
"grad_norm": 0.07059452682733536,
"learning_rate": 0.0001888008594064724,
"loss": 1.0516587257385255,
"mean_token_accuracy": 0.7528441205620766,
"num_tokens": 3239911.0,
"step": 810
},
{
"entropy": 1.0202949695289134,
"epoch": 0.31381553769613474,
"grad_norm": 0.07269048690795898,
"learning_rate": 0.00018853229488384585,
"loss": 1.0879244804382324,
"mean_token_accuracy": 0.7542849883437157,
"num_tokens": 3278682.0,
"step": 820
},
{
"entropy": 1.0690899170935153,
"epoch": 0.3176425564485266,
"grad_norm": 0.14370054006576538,
"learning_rate": 0.0001882637303612193,
"loss": 1.1063778877258301,
"mean_token_accuracy": 0.75286915153265,
"num_tokens": 3325465.0,
"step": 830
},
{
"entropy": 1.0819261983036994,
"epoch": 0.3214695752009185,
"grad_norm": 0.0973975881934166,
"learning_rate": 0.00018799516583859274,
"loss": 1.0978353500366211,
"mean_token_accuracy": 0.749411192536354,
"num_tokens": 3363479.0,
"step": 840
},
{
"entropy": 1.0502549454569816,
"epoch": 0.3252965939533104,
"grad_norm": 0.11021706461906433,
"learning_rate": 0.0001877266013159662,
"loss": 1.1489330291748048,
"mean_token_accuracy": 0.7476440489292144,
"num_tokens": 3405863.0,
"step": 850
},
{
"entropy": 1.1556663788855075,
"epoch": 0.3291236127057023,
"grad_norm": 0.06459799408912659,
"learning_rate": 0.0001874580367933396,
"loss": 1.1840539932250977,
"mean_token_accuracy": 0.7288095027208328,
"num_tokens": 3450829.0,
"step": 860
},
{
"entropy": 1.097336183488369,
"epoch": 0.33295063145809417,
"grad_norm": 0.06765513867139816,
"learning_rate": 0.00018718947227071303,
"loss": 1.1286226272583009,
"mean_token_accuracy": 0.7439226225018502,
"num_tokens": 3490640.0,
"step": 870
},
{
"entropy": 1.0772622771561147,
"epoch": 0.33677765021048606,
"grad_norm": 0.08126482367515564,
"learning_rate": 0.00018692090774808648,
"loss": 1.1434885025024415,
"mean_token_accuracy": 0.7434220835566521,
"num_tokens": 3529438.0,
"step": 880
},
{
"entropy": 1.0091869838535785,
"epoch": 0.34060466896287794,
"grad_norm": 0.0654602199792862,
"learning_rate": 0.00018665234322545992,
"loss": 1.0767542839050293,
"mean_token_accuracy": 0.7644046351313591,
"num_tokens": 3565217.0,
"step": 890
},
{
"entropy": 1.0432863399386405,
"epoch": 0.34443168771526983,
"grad_norm": 0.10025763511657715,
"learning_rate": 0.00018638377870283337,
"loss": 1.0826923370361328,
"mean_token_accuracy": 0.7591656729578972,
"num_tokens": 3603940.0,
"step": 900
},
{
"entropy": 1.007722695171833,
"epoch": 0.3482587064676617,
"grad_norm": 0.06779270619153976,
"learning_rate": 0.00018611521418020682,
"loss": 1.0158637046813965,
"mean_token_accuracy": 0.763145099580288,
"num_tokens": 3644547.0,
"step": 910
},
{
"entropy": 1.0556264080107212,
"epoch": 0.3520857252200536,
"grad_norm": 0.07834554463624954,
"learning_rate": 0.00018584664965758026,
"loss": 1.091851806640625,
"mean_token_accuracy": 0.7483858004212379,
"num_tokens": 3691979.0,
"step": 920
},
{
"entropy": 1.0730156242847442,
"epoch": 0.3559127439724455,
"grad_norm": 0.10772417485713959,
"learning_rate": 0.00018557808513495368,
"loss": 1.1370153427124023,
"mean_token_accuracy": 0.7466916054487228,
"num_tokens": 3728767.0,
"step": 930
},
{
"entropy": 1.081335111707449,
"epoch": 0.3597397627248374,
"grad_norm": 0.07669705897569656,
"learning_rate": 0.0001853095206123271,
"loss": 1.141366958618164,
"mean_token_accuracy": 0.744081811606884,
"num_tokens": 3772234.0,
"step": 940
},
{
"entropy": 0.9984517656266689,
"epoch": 0.36356678147722926,
"grad_norm": 0.0695272758603096,
"learning_rate": 0.00018504095608970055,
"loss": 1.0501303672790527,
"mean_token_accuracy": 0.7590280339121819,
"num_tokens": 3816970.0,
"step": 950
},
{
"entropy": 0.910194194689393,
"epoch": 0.36739380022962115,
"grad_norm": 0.06411932408809662,
"learning_rate": 0.000184772391567074,
"loss": 0.9656248092651367,
"mean_token_accuracy": 0.7823562085628509,
"num_tokens": 3853816.0,
"step": 960
},
{
"entropy": 0.9763211451470852,
"epoch": 0.37122081898201303,
"grad_norm": 0.08389662951231003,
"learning_rate": 0.00018450382704444744,
"loss": 1.0703671455383301,
"mean_token_accuracy": 0.76742093116045,
"num_tokens": 3896404.0,
"step": 970
},
{
"entropy": 1.076941692829132,
"epoch": 0.3750478377344049,
"grad_norm": 0.13239043951034546,
"learning_rate": 0.0001842352625218209,
"loss": 1.1353830337524413,
"mean_token_accuracy": 0.7479456245899201,
"num_tokens": 3934187.0,
"step": 980
},
{
"entropy": 1.077423833310604,
"epoch": 0.3788748564867968,
"grad_norm": 0.06203702092170715,
"learning_rate": 0.00018396669799919434,
"loss": 1.1363765716552734,
"mean_token_accuracy": 0.7437105163931846,
"num_tokens": 3975766.0,
"step": 990
},
{
"entropy": 1.009491826593876,
"epoch": 0.3827018752391887,
"grad_norm": 0.06740409135818481,
"learning_rate": 0.00018369813347656776,
"loss": 1.0752355575561523,
"mean_token_accuracy": 0.7598015293478966,
"num_tokens": 4018368.0,
"step": 1000
},
{
"entropy": 0.9744428530335426,
"epoch": 0.3865288939915806,
"grad_norm": 0.07750537246465683,
"learning_rate": 0.00018342956895394118,
"loss": 1.0554892539978027,
"mean_token_accuracy": 0.7682438552379608,
"num_tokens": 4057647.0,
"step": 1010
},
{
"entropy": 1.0335246473550797,
"epoch": 0.39035591274397247,
"grad_norm": 0.07627248764038086,
"learning_rate": 0.00018316100443131463,
"loss": 1.0600407600402832,
"mean_token_accuracy": 0.7552958622574806,
"num_tokens": 4098377.0,
"step": 1020
},
{
"entropy": 1.0256185740232469,
"epoch": 0.39418293149636435,
"grad_norm": 0.10117889940738678,
"learning_rate": 0.00018289243990868807,
"loss": 1.0706727027893066,
"mean_token_accuracy": 0.75880047082901,
"num_tokens": 4141633.0,
"step": 1030
},
{
"entropy": 0.9883378148078918,
"epoch": 0.39800995024875624,
"grad_norm": 0.064593605697155,
"learning_rate": 0.00018262387538606152,
"loss": 1.007016372680664,
"mean_token_accuracy": 0.764974731206894,
"num_tokens": 4181155.0,
"step": 1040
},
{
"entropy": 1.0692595109343528,
"epoch": 0.4018369690011481,
"grad_norm": 0.07493151724338531,
"learning_rate": 0.00018235531086343497,
"loss": 1.123647975921631,
"mean_token_accuracy": 0.7452841177582741,
"num_tokens": 4218175.0,
"step": 1050
},
{
"entropy": 0.990117172151804,
"epoch": 0.40566398775354,
"grad_norm": 0.06332839280366898,
"learning_rate": 0.0001820867463408084,
"loss": 1.0538661003112793,
"mean_token_accuracy": 0.7637043848633767,
"num_tokens": 4262326.0,
"step": 1060
},
{
"entropy": 1.002436650544405,
"epoch": 0.4094910065059319,
"grad_norm": 0.07898294180631638,
"learning_rate": 0.00018181818181818183,
"loss": 0.9973239898681641,
"mean_token_accuracy": 0.7644759714603424,
"num_tokens": 4300876.0,
"step": 1070
},
{
"entropy": 0.9635432817041873,
"epoch": 0.4133180252583238,
"grad_norm": 0.09760674089193344,
"learning_rate": 0.00018154961729555525,
"loss": 1.0411369323730468,
"mean_token_accuracy": 0.7664693981409073,
"num_tokens": 4338887.0,
"step": 1080
},
{
"entropy": 0.9780610945075751,
"epoch": 0.41714504401071567,
"grad_norm": 0.08076441287994385,
"learning_rate": 0.0001812810527729287,
"loss": 1.0544751167297364,
"mean_token_accuracy": 0.76624975502491,
"num_tokens": 4380678.0,
"step": 1090
},
{
"entropy": 1.0548741944134234,
"epoch": 0.42097206276310756,
"grad_norm": 0.0646439641714096,
"learning_rate": 0.00018101248825030215,
"loss": 1.128230667114258,
"mean_token_accuracy": 0.7504925444722176,
"num_tokens": 4422899.0,
"step": 1100
},
{
"entropy": 1.0767812803387642,
"epoch": 0.42479908151549944,
"grad_norm": 0.06994366645812988,
"learning_rate": 0.0001807439237276756,
"loss": 1.1209583282470703,
"mean_token_accuracy": 0.7477620646357537,
"num_tokens": 4461864.0,
"step": 1110
},
{
"entropy": 1.062944334745407,
"epoch": 0.42862610026789133,
"grad_norm": 0.11016593873500824,
"learning_rate": 0.00018047535920504904,
"loss": 1.0880105018615722,
"mean_token_accuracy": 0.7455767750740051,
"num_tokens": 4501378.0,
"step": 1120
},
{
"entropy": 1.0511136516928672,
"epoch": 0.4324531190202832,
"grad_norm": 0.08707646280527115,
"learning_rate": 0.00018020679468242246,
"loss": 1.0764313697814942,
"mean_token_accuracy": 0.7519838035106658,
"num_tokens": 4541448.0,
"step": 1130
},
{
"entropy": 0.9529998056590557,
"epoch": 0.4362801377726751,
"grad_norm": 0.07353853434324265,
"learning_rate": 0.00017993823015979588,
"loss": 1.0098756790161132,
"mean_token_accuracy": 0.7720077604055404,
"num_tokens": 4586147.0,
"step": 1140
},
{
"entropy": 1.143844011425972,
"epoch": 0.440107156525067,
"grad_norm": 0.06268489360809326,
"learning_rate": 0.00017966966563716933,
"loss": 1.1934361457824707,
"mean_token_accuracy": 0.7281116575002671,
"num_tokens": 4631906.0,
"step": 1150
},
{
"entropy": 1.0761573910713196,
"epoch": 0.4439341752774589,
"grad_norm": 0.07078517228364944,
"learning_rate": 0.00017940110111454278,
"loss": 1.1359615325927734,
"mean_token_accuracy": 0.7409780561923981,
"num_tokens": 4674626.0,
"step": 1160
},
{
"entropy": 0.9940036550164223,
"epoch": 0.44776119402985076,
"grad_norm": 0.08054502308368683,
"learning_rate": 0.00017913253659191622,
"loss": 1.033839225769043,
"mean_token_accuracy": 0.7682256817817688,
"num_tokens": 4715717.0,
"step": 1170
},
{
"entropy": 0.9752239182591438,
"epoch": 0.45158821278224265,
"grad_norm": 0.08600450307130814,
"learning_rate": 0.00017886397206928967,
"loss": 1.0254844665527343,
"mean_token_accuracy": 0.7688430979847908,
"num_tokens": 4747316.0,
"step": 1180
},
{
"entropy": 1.064694558084011,
"epoch": 0.45541523153463453,
"grad_norm": 0.07270248234272003,
"learning_rate": 0.0001785954075466631,
"loss": 1.0806646347045898,
"mean_token_accuracy": 0.7534265503287315,
"num_tokens": 4788606.0,
"step": 1190
},
{
"entropy": 0.958549628406763,
"epoch": 0.4592422502870264,
"grad_norm": 0.0644846111536026,
"learning_rate": 0.00017832684302403654,
"loss": 1.0015847206115722,
"mean_token_accuracy": 0.7644492238759995,
"num_tokens": 4831371.0,
"step": 1200
},
{
"entropy": 1.0885677203536033,
"epoch": 0.4630692690394183,
"grad_norm": 0.13487283885478973,
"learning_rate": 0.00017805827850140996,
"loss": 1.1495524406433106,
"mean_token_accuracy": 0.7414823487401009,
"num_tokens": 4871231.0,
"step": 1210
},
{
"entropy": 1.1117899976670742,
"epoch": 0.4668962877918102,
"grad_norm": 0.08015701174736023,
"learning_rate": 0.0001777897139787834,
"loss": 1.1366958618164062,
"mean_token_accuracy": 0.7351289570331574,
"num_tokens": 4911520.0,
"step": 1220
},
{
"entropy": 0.9722193017601967,
"epoch": 0.4707233065442021,
"grad_norm": 0.06839531660079956,
"learning_rate": 0.00017752114945615685,
"loss": 1.0259140968322753,
"mean_token_accuracy": 0.7658233359456063,
"num_tokens": 4950296.0,
"step": 1230
},
{
"entropy": 1.0021446757018566,
"epoch": 0.47455032529659397,
"grad_norm": 0.08231978863477707,
"learning_rate": 0.0001772525849335303,
"loss": 1.0437036514282227,
"mean_token_accuracy": 0.7644398525357247,
"num_tokens": 4989688.0,
"step": 1240
},
{
"entropy": 0.9640353135764599,
"epoch": 0.47837734404898585,
"grad_norm": 0.11587074398994446,
"learning_rate": 0.00017698402041090375,
"loss": 1.0072126388549805,
"mean_token_accuracy": 0.7740294471383095,
"num_tokens": 5029135.0,
"step": 1250
},
{
"entropy": 1.0122342824935913,
"epoch": 0.48220436280137774,
"grad_norm": 0.07646426558494568,
"learning_rate": 0.00017671545588827717,
"loss": 1.0733034133911132,
"mean_token_accuracy": 0.7619047269225121,
"num_tokens": 5066488.0,
"step": 1260
},
{
"entropy": 1.0465880073606968,
"epoch": 0.4860313815537696,
"grad_norm": 0.07594821602106094,
"learning_rate": 0.0001764468913656506,
"loss": 1.0953254699707031,
"mean_token_accuracy": 0.7511951208114624,
"num_tokens": 5102103.0,
"step": 1270
},
{
"entropy": 1.0104024082422256,
"epoch": 0.4898584003061615,
"grad_norm": 0.07695835083723068,
"learning_rate": 0.00017617832684302403,
"loss": 1.1025714874267578,
"mean_token_accuracy": 0.7583977058529854,
"num_tokens": 5141113.0,
"step": 1280
},
{
"entropy": 1.044089037179947,
"epoch": 0.4936854190585534,
"grad_norm": 0.07186906039714813,
"learning_rate": 0.00017590976232039748,
"loss": 1.0713683128356934,
"mean_token_accuracy": 0.7546971932053566,
"num_tokens": 5181454.0,
"step": 1290
},
{
"entropy": 0.9154780797660351,
"epoch": 0.4975124378109453,
"grad_norm": 0.08934911340475082,
"learning_rate": 0.00017564119779777093,
"loss": 0.9871469497680664,
"mean_token_accuracy": 0.7756836161017417,
"num_tokens": 5214143.0,
"step": 1300
},
{
"entropy": 1.05635926425457,
"epoch": 0.5013394565633371,
"grad_norm": 0.07880513370037079,
"learning_rate": 0.00017537263327514437,
"loss": 1.0985527038574219,
"mean_token_accuracy": 0.7524559125304222,
"num_tokens": 5258310.0,
"step": 1310
},
{
"entropy": 1.0448619149625302,
"epoch": 0.505166475315729,
"grad_norm": 0.10507462918758392,
"learning_rate": 0.0001751040687525178,
"loss": 1.1040778160095215,
"mean_token_accuracy": 0.7477249845862388,
"num_tokens": 5296865.0,
"step": 1320
},
{
"entropy": 1.0706947155296802,
"epoch": 0.5089934940681209,
"grad_norm": 0.09437765926122665,
"learning_rate": 0.00017483550422989124,
"loss": 1.1372867584228517,
"mean_token_accuracy": 0.7502224639058113,
"num_tokens": 5335301.0,
"step": 1330
},
{
"entropy": 0.9736435614526272,
"epoch": 0.5128205128205128,
"grad_norm": 0.07162626087665558,
"learning_rate": 0.0001745669397072647,
"loss": 1.0273897171020507,
"mean_token_accuracy": 0.7711918234825135,
"num_tokens": 5372533.0,
"step": 1340
},
{
"entropy": 0.9989161014556884,
"epoch": 0.5166475315729047,
"grad_norm": 0.08805254101753235,
"learning_rate": 0.0001742983751846381,
"loss": 1.0603778839111329,
"mean_token_accuracy": 0.7600028276443481,
"num_tokens": 5412651.0,
"step": 1350
},
{
"entropy": 1.063752220571041,
"epoch": 0.5204745503252965,
"grad_norm": 0.08056829869747162,
"learning_rate": 0.00017402981066201156,
"loss": 1.0876687049865723,
"mean_token_accuracy": 0.7487231969833374,
"num_tokens": 5454518.0,
"step": 1360
},
{
"entropy": 0.966426993906498,
"epoch": 0.5243015690776884,
"grad_norm": 0.06970727443695068,
"learning_rate": 0.000173761246139385,
"loss": 1.0302441596984864,
"mean_token_accuracy": 0.7640074551105499,
"num_tokens": 5495257.0,
"step": 1370
},
{
"entropy": 0.9727898858487606,
"epoch": 0.5281285878300803,
"grad_norm": 0.09694326668977737,
"learning_rate": 0.00017349268161675842,
"loss": 1.0327792167663574,
"mean_token_accuracy": 0.7687779292464256,
"num_tokens": 5527573.0,
"step": 1380
},
{
"entropy": 1.043993879854679,
"epoch": 0.5319556065824722,
"grad_norm": 0.05676735192537308,
"learning_rate": 0.00017322411709413187,
"loss": 1.1139988899230957,
"mean_token_accuracy": 0.7597961351275444,
"num_tokens": 5566542.0,
"step": 1390
},
{
"entropy": 0.9891408108174801,
"epoch": 0.5357826253348641,
"grad_norm": 0.08670998364686966,
"learning_rate": 0.00017295555257150532,
"loss": 1.0878351211547852,
"mean_token_accuracy": 0.7640718072652817,
"num_tokens": 5604986.0,
"step": 1400
},
{
"entropy": 1.0097288101911546,
"epoch": 0.539609644087256,
"grad_norm": 0.09190856665372849,
"learning_rate": 0.00017268698804887876,
"loss": 1.079444408416748,
"mean_token_accuracy": 0.7590912491083145,
"num_tokens": 5642224.0,
"step": 1410
},
{
"entropy": 0.9927844725549221,
"epoch": 0.5434366628396479,
"grad_norm": 0.08191007375717163,
"learning_rate": 0.00017241842352625218,
"loss": 1.0661033630371093,
"mean_token_accuracy": 0.7664914444088936,
"num_tokens": 5680912.0,
"step": 1420
},
{
"entropy": 0.973179691657424,
"epoch": 0.5472636815920398,
"grad_norm": 0.08161566406488419,
"learning_rate": 0.00017214985900362563,
"loss": 1.078667163848877,
"mean_token_accuracy": 0.7686992704868316,
"num_tokens": 5717171.0,
"step": 1430
},
{
"entropy": 1.0467095457017421,
"epoch": 0.5510907003444316,
"grad_norm": 0.09403429925441742,
"learning_rate": 0.00017188129448099908,
"loss": 1.0912303924560547,
"mean_token_accuracy": 0.7550861686468124,
"num_tokens": 5755956.0,
"step": 1440
},
{
"entropy": 1.0082954704761504,
"epoch": 0.5549177190968235,
"grad_norm": 0.09858231991529465,
"learning_rate": 0.0001716127299583725,
"loss": 1.0449023246765137,
"mean_token_accuracy": 0.7586705282330513,
"num_tokens": 5798765.0,
"step": 1450
},
{
"entropy": 1.0528397418558597,
"epoch": 0.5587447378492154,
"grad_norm": 0.06697855144739151,
"learning_rate": 0.00017134416543574594,
"loss": 1.0901053428649903,
"mean_token_accuracy": 0.7517547190189362,
"num_tokens": 5839833.0,
"step": 1460
},
{
"entropy": 1.009619940817356,
"epoch": 0.5625717566016073,
"grad_norm": 0.07271189987659454,
"learning_rate": 0.0001710756009131194,
"loss": 1.0171070098876953,
"mean_token_accuracy": 0.7594649389386177,
"num_tokens": 5880613.0,
"step": 1470
},
{
"entropy": 0.9699329622089863,
"epoch": 0.5663987753539992,
"grad_norm": 0.07800697535276413,
"learning_rate": 0.0001708070363904928,
"loss": 1.1077412605285644,
"mean_token_accuracy": 0.7667307928204536,
"num_tokens": 5918218.0,
"step": 1480
},
{
"entropy": 0.9389957278966904,
"epoch": 0.5702257941063911,
"grad_norm": 0.08150342851877213,
"learning_rate": 0.00017053847186786626,
"loss": 0.9634763717651367,
"mean_token_accuracy": 0.7806087970733643,
"num_tokens": 5960284.0,
"step": 1490
},
{
"entropy": 1.0140163496136665,
"epoch": 0.574052812858783,
"grad_norm": 0.06430503726005554,
"learning_rate": 0.0001702699073452397,
"loss": 1.0751851081848145,
"mean_token_accuracy": 0.7564210310578346,
"num_tokens": 6000562.0,
"step": 1500
},
{
"entropy": 1.047791599482298,
"epoch": 0.5778798316111748,
"grad_norm": 0.07922326028347015,
"learning_rate": 0.00017000134282261313,
"loss": 1.1397698402404786,
"mean_token_accuracy": 0.7494736298918724,
"num_tokens": 6045192.0,
"step": 1510
},
{
"entropy": 1.1085532158613205,
"epoch": 0.5817068503635667,
"grad_norm": 0.1093953400850296,
"learning_rate": 0.00016973277829998657,
"loss": 1.142368698120117,
"mean_token_accuracy": 0.7414237394928932,
"num_tokens": 6090148.0,
"step": 1520
},
{
"entropy": 0.9427969709038735,
"epoch": 0.5855338691159586,
"grad_norm": 0.09579843282699585,
"learning_rate": 0.00016946421377736002,
"loss": 0.9980224609375,
"mean_token_accuracy": 0.7730853497982025,
"num_tokens": 6129845.0,
"step": 1530
},
{
"entropy": 1.0571323171257974,
"epoch": 0.5893608878683505,
"grad_norm": 0.09482655674219131,
"learning_rate": 0.00016919564925473347,
"loss": 1.0665513038635255,
"mean_token_accuracy": 0.749831511080265,
"num_tokens": 6171961.0,
"step": 1540
},
{
"entropy": 0.978803563863039,
"epoch": 0.5931879066207424,
"grad_norm": 0.08609842509031296,
"learning_rate": 0.0001689270847321069,
"loss": 1.0541341781616211,
"mean_token_accuracy": 0.769312071800232,
"num_tokens": 6210126.0,
"step": 1550
},
{
"entropy": 1.0714900024235248,
"epoch": 0.5970149253731343,
"grad_norm": 0.05390879884362221,
"learning_rate": 0.00016865852020948033,
"loss": 1.1057682037353516,
"mean_token_accuracy": 0.7402923837304115,
"num_tokens": 6262394.0,
"step": 1560
},
{
"entropy": 0.9234071888029576,
"epoch": 0.6008419441255262,
"grad_norm": 0.09692159295082092,
"learning_rate": 0.00016838995568685378,
"loss": 0.9622941017150879,
"mean_token_accuracy": 0.7832354381680489,
"num_tokens": 6292140.0,
"step": 1570
},
{
"entropy": 0.9591108359396457,
"epoch": 0.604668962877918,
"grad_norm": 0.0720645934343338,
"learning_rate": 0.0001681213911642272,
"loss": 1.019169235229492,
"mean_token_accuracy": 0.771478471159935,
"num_tokens": 6332134.0,
"step": 1580
},
{
"entropy": 1.0945825845003128,
"epoch": 0.6084959816303099,
"grad_norm": 0.07380460202693939,
"learning_rate": 0.00016785282664160065,
"loss": 1.1463271141052247,
"mean_token_accuracy": 0.7419712334871292,
"num_tokens": 6373967.0,
"step": 1590
},
{
"entropy": 1.055589073896408,
"epoch": 0.6123230003827018,
"grad_norm": 0.07209772616624832,
"learning_rate": 0.0001675842621189741,
"loss": 1.1144783973693848,
"mean_token_accuracy": 0.7461996227502823,
"num_tokens": 6418236.0,
"step": 1600
},
{
"entropy": 1.1149650782346725,
"epoch": 0.6161500191350937,
"grad_norm": 0.07935164868831635,
"learning_rate": 0.00016731569759634754,
"loss": 1.1727729797363282,
"mean_token_accuracy": 0.7365738078951836,
"num_tokens": 6464589.0,
"step": 1610
},
{
"entropy": 1.0068970195949078,
"epoch": 0.6199770378874856,
"grad_norm": 0.0804886594414711,
"learning_rate": 0.00016704713307372096,
"loss": 1.0483062744140625,
"mean_token_accuracy": 0.7632505163550377,
"num_tokens": 6504385.0,
"step": 1620
},
{
"entropy": 0.9996877416968346,
"epoch": 0.6238040566398775,
"grad_norm": 0.0723455473780632,
"learning_rate": 0.0001667785685510944,
"loss": 1.0592655181884765,
"mean_token_accuracy": 0.7597218692302704,
"num_tokens": 6545928.0,
"step": 1630
},
{
"entropy": 0.9159566629678011,
"epoch": 0.6276310753922695,
"grad_norm": 0.08028513193130493,
"learning_rate": 0.00016651000402846783,
"loss": 0.9434080123901367,
"mean_token_accuracy": 0.7826011970639228,
"num_tokens": 6586166.0,
"step": 1640
},
{
"entropy": 1.0089009895920753,
"epoch": 0.6314580941446614,
"grad_norm": 0.09154181182384491,
"learning_rate": 0.00016624143950584128,
"loss": 1.0339744567871094,
"mean_token_accuracy": 0.7604109585285187,
"num_tokens": 6624706.0,
"step": 1650
},
{
"entropy": 1.0208431974053382,
"epoch": 0.6352851128970533,
"grad_norm": 0.08039630204439163,
"learning_rate": 0.00016597287498321472,
"loss": 1.0823830604553222,
"mean_token_accuracy": 0.7548153042793274,
"num_tokens": 6665626.0,
"step": 1660
},
{
"entropy": 0.9645413011312485,
"epoch": 0.6391121316494451,
"grad_norm": 0.08834270387887955,
"learning_rate": 0.00016570431046058817,
"loss": 1.0401340484619142,
"mean_token_accuracy": 0.7703565835952759,
"num_tokens": 6699532.0,
"step": 1670
},
{
"entropy": 1.0028597339987755,
"epoch": 0.642939150401837,
"grad_norm": 0.08974612504243851,
"learning_rate": 0.00016543574593796162,
"loss": 1.0680928230285645,
"mean_token_accuracy": 0.7617413088679313,
"num_tokens": 6740574.0,
"step": 1680
},
{
"entropy": 0.8952145710587501,
"epoch": 0.6467661691542289,
"grad_norm": 0.09289242327213287,
"learning_rate": 0.00016516718141533504,
"loss": 0.9696210861206055,
"mean_token_accuracy": 0.7848611980676651,
"num_tokens": 6782208.0,
"step": 1690
},
{
"entropy": 0.9520700328052044,
"epoch": 0.6505931879066208,
"grad_norm": 0.07298107445240021,
"learning_rate": 0.00016489861689270848,
"loss": 0.9891908645629883,
"mean_token_accuracy": 0.7725306749343872,
"num_tokens": 6818937.0,
"step": 1700
},
{
"entropy": 0.9734554067254066,
"epoch": 0.6544202066590127,
"grad_norm": 0.08233233541250229,
"learning_rate": 0.0001646300523700819,
"loss": 1.0311893463134765,
"mean_token_accuracy": 0.7683428943157196,
"num_tokens": 6851548.0,
"step": 1710
},
{
"entropy": 0.9947349905967713,
"epoch": 0.6582472254114046,
"grad_norm": 0.08351403474807739,
"learning_rate": 0.00016436148784745535,
"loss": 1.0382192611694336,
"mean_token_accuracy": 0.7587384819984436,
"num_tokens": 6891553.0,
"step": 1720
},
{
"entropy": 1.0383310310542584,
"epoch": 0.6620742441637965,
"grad_norm": 0.07240983843803406,
"learning_rate": 0.0001640929233248288,
"loss": 1.0978598594665527,
"mean_token_accuracy": 0.7515711337327957,
"num_tokens": 6930875.0,
"step": 1730
},
{
"entropy": 1.085533195734024,
"epoch": 0.6659012629161883,
"grad_norm": 0.06999973207712173,
"learning_rate": 0.00016382435880220225,
"loss": 1.1412755966186523,
"mean_token_accuracy": 0.7471029132604599,
"num_tokens": 6971721.0,
"step": 1740
},
{
"entropy": 0.978867219388485,
"epoch": 0.6697282816685802,
"grad_norm": 0.06091843172907829,
"learning_rate": 0.0001635557942795757,
"loss": 1.023170566558838,
"mean_token_accuracy": 0.7686347916722298,
"num_tokens": 7010340.0,
"step": 1750
},
{
"entropy": 1.0823599390685559,
"epoch": 0.6735553004209721,
"grad_norm": 0.07732617110013962,
"learning_rate": 0.0001632872297569491,
"loss": 1.1036816596984864,
"mean_token_accuracy": 0.7404509574174881,
"num_tokens": 7061189.0,
"step": 1760
},
{
"entropy": 0.9984334908425808,
"epoch": 0.677382319173364,
"grad_norm": 0.11516186594963074,
"learning_rate": 0.00016301866523432253,
"loss": 1.1070829391479493,
"mean_token_accuracy": 0.7593477964401245,
"num_tokens": 7098345.0,
"step": 1770
},
{
"entropy": 1.0023914370685816,
"epoch": 0.6812093379257559,
"grad_norm": 0.08624757081270218,
"learning_rate": 0.00016275010071169598,
"loss": 1.043964958190918,
"mean_token_accuracy": 0.7635682225227356,
"num_tokens": 7136873.0,
"step": 1780
},
{
"entropy": 1.0823404759168624,
"epoch": 0.6850363566781478,
"grad_norm": 0.0846925675868988,
"learning_rate": 0.00016248153618906943,
"loss": 1.1333115577697754,
"mean_token_accuracy": 0.7394865393638611,
"num_tokens": 7181553.0,
"step": 1790
},
{
"entropy": 1.0224985226988792,
"epoch": 0.6888633754305397,
"grad_norm": 0.060152288526296616,
"learning_rate": 0.00016221297166644287,
"loss": 1.0782301902770997,
"mean_token_accuracy": 0.759938097000122,
"num_tokens": 7223076.0,
"step": 1800
},
{
"entropy": 1.0880159534513951,
"epoch": 0.6926903941829315,
"grad_norm": 0.06577905267477036,
"learning_rate": 0.00016194440714381632,
"loss": 1.1103734016418456,
"mean_token_accuracy": 0.7457254812121391,
"num_tokens": 7265594.0,
"step": 1810
},
{
"entropy": 1.0144932381808758,
"epoch": 0.6965174129353234,
"grad_norm": 0.07276095449924469,
"learning_rate": 0.00016167584262118974,
"loss": 1.0733102798461913,
"mean_token_accuracy": 0.759276558458805,
"num_tokens": 7305359.0,
"step": 1820
},
{
"entropy": 1.0186967477202415,
"epoch": 0.7003444316877153,
"grad_norm": 0.08775337040424347,
"learning_rate": 0.0001614072780985632,
"loss": 1.0672088623046876,
"mean_token_accuracy": 0.7567476496100426,
"num_tokens": 7348060.0,
"step": 1830
},
{
"entropy": 0.9723582908511161,
"epoch": 0.7041714504401072,
"grad_norm": 0.09250030666589737,
"learning_rate": 0.0001611387135759366,
"loss": 1.0032880783081055,
"mean_token_accuracy": 0.7693375036120415,
"num_tokens": 7387110.0,
"step": 1840
},
{
"entropy": 0.9738463938236237,
"epoch": 0.7079984691924991,
"grad_norm": 0.09884033352136612,
"learning_rate": 0.00016087014905331006,
"loss": 1.0514408111572267,
"mean_token_accuracy": 0.7625621780753136,
"num_tokens": 7428511.0,
"step": 1850
},
{
"entropy": 1.0910252556204796,
"epoch": 0.711825487944891,
"grad_norm": 0.09194686263799667,
"learning_rate": 0.0001606015845306835,
"loss": 1.1226488113403321,
"mean_token_accuracy": 0.7452121302485466,
"num_tokens": 7474736.0,
"step": 1860
},
{
"entropy": 1.0159518368542195,
"epoch": 0.7156525066972829,
"grad_norm": 0.07921712845563889,
"learning_rate": 0.00016033302000805695,
"loss": 1.061795711517334,
"mean_token_accuracy": 0.761272345483303,
"num_tokens": 7516891.0,
"step": 1870
},
{
"entropy": 0.8903906352818012,
"epoch": 0.7194795254496748,
"grad_norm": 0.10288332402706146,
"learning_rate": 0.0001600644554854304,
"loss": 0.9471863746643067,
"mean_token_accuracy": 0.7831206247210503,
"num_tokens": 7551597.0,
"step": 1880
},
{
"entropy": 1.0759326584637166,
"epoch": 0.7233065442020666,
"grad_norm": 0.06488945335149765,
"learning_rate": 0.00015979589096280382,
"loss": 1.136262798309326,
"mean_token_accuracy": 0.742707334458828,
"num_tokens": 7597529.0,
"step": 1890
},
{
"entropy": 0.9951202683150768,
"epoch": 0.7271335629544585,
"grad_norm": 0.06628359109163284,
"learning_rate": 0.00015952732644017724,
"loss": 1.0448930740356446,
"mean_token_accuracy": 0.7615623638033867,
"num_tokens": 7634291.0,
"step": 1900
},
{
"entropy": 1.0593122780323028,
"epoch": 0.7309605817068504,
"grad_norm": 0.08212320506572723,
"learning_rate": 0.00015925876191755068,
"loss": 1.1099414825439453,
"mean_token_accuracy": 0.7455122962594032,
"num_tokens": 7677086.0,
"step": 1910
},
{
"entropy": 1.0122868783771992,
"epoch": 0.7347876004592423,
"grad_norm": 0.06458455324172974,
"learning_rate": 0.00015899019739492413,
"loss": 1.0447346687316894,
"mean_token_accuracy": 0.7542453840374946,
"num_tokens": 7721785.0,
"step": 1920
},
{
"entropy": 1.0801358975470066,
"epoch": 0.7386146192116342,
"grad_norm": 0.06971931457519531,
"learning_rate": 0.00015872163287229758,
"loss": 1.109630012512207,
"mean_token_accuracy": 0.7450100436806679,
"num_tokens": 7761887.0,
"step": 1930
},
{
"entropy": 0.9081138484179974,
"epoch": 0.7424416379640261,
"grad_norm": 0.06223156675696373,
"learning_rate": 0.00015845306834967102,
"loss": 1.0026588439941406,
"mean_token_accuracy": 0.7772255912423134,
"num_tokens": 7807072.0,
"step": 1940
},
{
"entropy": 1.0170219503343105,
"epoch": 0.746268656716418,
"grad_norm": 0.0685853511095047,
"learning_rate": 0.00015818450382704447,
"loss": 1.055277442932129,
"mean_token_accuracy": 0.7580551549792289,
"num_tokens": 7845791.0,
"step": 1950
},
{
"entropy": 1.0753178864717483,
"epoch": 0.7500956754688098,
"grad_norm": 0.08306553959846497,
"learning_rate": 0.0001579159393044179,
"loss": 1.1332826614379883,
"mean_token_accuracy": 0.7421450033783913,
"num_tokens": 7891091.0,
"step": 1960
},
{
"entropy": 0.9297005102038384,
"epoch": 0.7539226942212017,
"grad_norm": 0.08018683642148972,
"learning_rate": 0.0001576473747817913,
"loss": 1.000318431854248,
"mean_token_accuracy": 0.7793557167053222,
"num_tokens": 7928252.0,
"step": 1970
},
{
"entropy": 1.0840253300964833,
"epoch": 0.7577497129735936,
"grad_norm": 0.06487595289945602,
"learning_rate": 0.00015737881025916476,
"loss": 1.1166275024414063,
"mean_token_accuracy": 0.7378593400120735,
"num_tokens": 7972071.0,
"step": 1980
},
{
"entropy": 1.0406386695802212,
"epoch": 0.7615767317259855,
"grad_norm": 0.0615115687251091,
"learning_rate": 0.0001571102457365382,
"loss": 1.0869349479675292,
"mean_token_accuracy": 0.7490768045186996,
"num_tokens": 8016865.0,
"step": 1990
},
{
"entropy": 0.9573215276002884,
"epoch": 0.7654037504783774,
"grad_norm": 0.0715412124991417,
"learning_rate": 0.00015684168121391165,
"loss": 1.0404720306396484,
"mean_token_accuracy": 0.7706617951393128,
"num_tokens": 8055917.0,
"step": 2000
},
{
"entropy": 0.9201878193765879,
"epoch": 0.7692307692307693,
"grad_norm": 0.07988248765468597,
"learning_rate": 0.0001565731166912851,
"loss": 0.9380558967590332,
"mean_token_accuracy": 0.782890722155571,
"num_tokens": 8093252.0,
"step": 2010
},
{
"entropy": 1.0045961767435074,
"epoch": 0.7730577879831612,
"grad_norm": 0.061089444905519485,
"learning_rate": 0.00015630455216865855,
"loss": 1.0528027534484863,
"mean_token_accuracy": 0.7598949059844017,
"num_tokens": 8135244.0,
"step": 2020
},
{
"entropy": 0.9942824639379978,
"epoch": 0.776884806735553,
"grad_norm": 0.06443686783313751,
"learning_rate": 0.00015603598764603197,
"loss": 1.0168493270874024,
"mean_token_accuracy": 0.7590687796473503,
"num_tokens": 8178961.0,
"step": 2030
},
{
"entropy": 0.9773981764912605,
"epoch": 0.7807118254879449,
"grad_norm": 0.0818348303437233,
"learning_rate": 0.0001557674231234054,
"loss": 1.0193141937255858,
"mean_token_accuracy": 0.7708378821611405,
"num_tokens": 8217139.0,
"step": 2040
},
{
"entropy": 0.9836540646851063,
"epoch": 0.7845388442403368,
"grad_norm": 0.06240411475300789,
"learning_rate": 0.00015549885860077883,
"loss": 1.0662775993347169,
"mean_token_accuracy": 0.7658124819397927,
"num_tokens": 8252825.0,
"step": 2050
},
{
"entropy": 1.036501456052065,
"epoch": 0.7883658629927287,
"grad_norm": 0.09231610596179962,
"learning_rate": 0.00015523029407815228,
"loss": 1.112645435333252,
"mean_token_accuracy": 0.7541953936219216,
"num_tokens": 8295113.0,
"step": 2060
},
{
"entropy": 0.9800528183579444,
"epoch": 0.7921928817451206,
"grad_norm": 0.08806589245796204,
"learning_rate": 0.00015496172955552573,
"loss": 1.0401280403137207,
"mean_token_accuracy": 0.7672899037599563,
"num_tokens": 8335977.0,
"step": 2070
},
{
"entropy": 0.9678378522396087,
"epoch": 0.7960199004975125,
"grad_norm": 0.08777868002653122,
"learning_rate": 0.00015469316503289918,
"loss": 1.0509014129638672,
"mean_token_accuracy": 0.7696513712406159,
"num_tokens": 8374917.0,
"step": 2080
},
{
"entropy": 1.042826947569847,
"epoch": 0.7998469192499044,
"grad_norm": 0.09018490463495255,
"learning_rate": 0.00015442460051027262,
"loss": 1.0869378089904784,
"mean_token_accuracy": 0.7507286682724953,
"num_tokens": 8415614.0,
"step": 2090
},
{
"entropy": 1.0548966623842717,
"epoch": 0.8036739380022963,
"grad_norm": 0.07267605513334274,
"learning_rate": 0.00015415603598764604,
"loss": 1.0960289001464845,
"mean_token_accuracy": 0.7545556098222732,
"num_tokens": 8455059.0,
"step": 2100
},
{
"entropy": 1.044345210492611,
"epoch": 0.8075009567546881,
"grad_norm": 0.08414279669523239,
"learning_rate": 0.00015388747146501946,
"loss": 1.1200661659240723,
"mean_token_accuracy": 0.7490431442856789,
"num_tokens": 8493866.0,
"step": 2110
},
{
"entropy": 1.0317029684782029,
"epoch": 0.81132797550708,
"grad_norm": 0.06549747288227081,
"learning_rate": 0.0001536189069423929,
"loss": 1.0583623886108398,
"mean_token_accuracy": 0.7555923700332642,
"num_tokens": 8536147.0,
"step": 2120
},
{
"entropy": 0.9694572634994983,
"epoch": 0.8151549942594719,
"grad_norm": 0.08112777769565582,
"learning_rate": 0.00015335034241976636,
"loss": 1.0503274917602539,
"mean_token_accuracy": 0.7646921187639236,
"num_tokens": 8578007.0,
"step": 2130
},
{
"entropy": 0.9358880028128624,
"epoch": 0.8189820130118638,
"grad_norm": 0.07176466286182404,
"learning_rate": 0.0001530817778971398,
"loss": 1.000410270690918,
"mean_token_accuracy": 0.773740467429161,
"num_tokens": 8620999.0,
"step": 2140
},
{
"entropy": 1.0444137938320637,
"epoch": 0.8228090317642557,
"grad_norm": 0.06355756521224976,
"learning_rate": 0.00015281321337451325,
"loss": 1.0860448837280274,
"mean_token_accuracy": 0.751850588619709,
"num_tokens": 8663354.0,
"step": 2150
},
{
"entropy": 0.9044980220496655,
"epoch": 0.8266360505166476,
"grad_norm": 0.080223448574543,
"learning_rate": 0.00015254464885188667,
"loss": 0.9434403419494629,
"mean_token_accuracy": 0.7828752338886261,
"num_tokens": 8699748.0,
"step": 2160
},
{
"entropy": 1.0172922544181346,
"epoch": 0.8304630692690395,
"grad_norm": 0.06971501559019089,
"learning_rate": 0.00015227608432926012,
"loss": 1.0325962066650392,
"mean_token_accuracy": 0.7651202365756035,
"num_tokens": 8739901.0,
"step": 2170
},
{
"entropy": 0.9639742732048034,
"epoch": 0.8342900880214313,
"grad_norm": 0.06396778672933578,
"learning_rate": 0.00015200751980663354,
"loss": 1.0435317039489747,
"mean_token_accuracy": 0.7667818054556846,
"num_tokens": 8778980.0,
"step": 2180
},
{
"entropy": 0.8876220636069775,
"epoch": 0.8381171067738232,
"grad_norm": 0.09910868853330612,
"learning_rate": 0.00015173895528400698,
"loss": 0.9876300811767578,
"mean_token_accuracy": 0.7865215808153152,
"num_tokens": 8815525.0,
"step": 2190
},
{
"entropy": 1.0369405087083579,
"epoch": 0.8419441255262151,
"grad_norm": 0.08775259554386139,
"learning_rate": 0.00015147039076138043,
"loss": 1.1244413375854492,
"mean_token_accuracy": 0.7550949841737747,
"num_tokens": 8857085.0,
"step": 2200
},
{
"entropy": 0.9762422502040863,
"epoch": 0.845771144278607,
"grad_norm": 0.08659302443265915,
"learning_rate": 0.00015120182623875388,
"loss": 1.0164811134338378,
"mean_token_accuracy": 0.771617329120636,
"num_tokens": 8894271.0,
"step": 2210
},
{
"entropy": 0.9543228000402451,
"epoch": 0.8495981630309989,
"grad_norm": 0.09588434547185898,
"learning_rate": 0.00015093326171612733,
"loss": 1.0303520202636718,
"mean_token_accuracy": 0.768992331624031,
"num_tokens": 8934095.0,
"step": 2220
},
{
"entropy": 1.1307236567139625,
"epoch": 0.8534251817833908,
"grad_norm": 0.07016360014677048,
"learning_rate": 0.00015066469719350075,
"loss": 1.1526556968688966,
"mean_token_accuracy": 0.7296861469745636,
"num_tokens": 8982341.0,
"step": 2230
},
{
"entropy": 1.0867296956479549,
"epoch": 0.8572522005357827,
"grad_norm": 0.07838597148656845,
"learning_rate": 0.00015039613267087417,
"loss": 1.1031158447265625,
"mean_token_accuracy": 0.7445572927594185,
"num_tokens": 9027401.0,
"step": 2240
},
{
"entropy": 0.9492381684482097,
"epoch": 0.8610792192881745,
"grad_norm": 0.08416638523340225,
"learning_rate": 0.0001501275681482476,
"loss": 1.0079804420471192,
"mean_token_accuracy": 0.7709973976016045,
"num_tokens": 9069985.0,
"step": 2250
},
{
"entropy": 0.9767517909407616,
"epoch": 0.8649062380405664,
"grad_norm": 0.09798935055732727,
"learning_rate": 0.00014985900362562106,
"loss": 1.0394697189331055,
"mean_token_accuracy": 0.7647709026932716,
"num_tokens": 9108246.0,
"step": 2260
},
{
"entropy": 0.9779160171747208,
"epoch": 0.8687332567929583,
"grad_norm": 0.08669373393058777,
"learning_rate": 0.0001495904391029945,
"loss": 1.0398100852966308,
"mean_token_accuracy": 0.7669417649507523,
"num_tokens": 9147055.0,
"step": 2270
},
{
"entropy": 1.014696953445673,
"epoch": 0.8725602755453502,
"grad_norm": 0.07674991339445114,
"learning_rate": 0.00014932187458036795,
"loss": 1.0742408752441406,
"mean_token_accuracy": 0.7583330690860748,
"num_tokens": 9187727.0,
"step": 2280
},
{
"entropy": 0.9619584158062935,
"epoch": 0.8763872942977421,
"grad_norm": 0.09512930363416672,
"learning_rate": 0.00014905331005774137,
"loss": 1.01895112991333,
"mean_token_accuracy": 0.7718996241688728,
"num_tokens": 9228518.0,
"step": 2290
},
{
"entropy": 0.8759313493967056,
"epoch": 0.880214313050134,
"grad_norm": 0.06927543133497238,
"learning_rate": 0.00014878474553511482,
"loss": 0.9590776443481446,
"mean_token_accuracy": 0.783099564909935,
"num_tokens": 9269392.0,
"step": 2300
},
{
"entropy": 1.0930156745016575,
"epoch": 0.8840413318025259,
"grad_norm": 0.07149595022201538,
"learning_rate": 0.00014851618101248824,
"loss": 1.132398796081543,
"mean_token_accuracy": 0.7445679202675819,
"num_tokens": 9310993.0,
"step": 2310
},
{
"entropy": 0.9991384916007519,
"epoch": 0.8878683505549178,
"grad_norm": 0.100126251578331,
"learning_rate": 0.0001482476164898617,
"loss": 1.0395862579345703,
"mean_token_accuracy": 0.7618231356143952,
"num_tokens": 9349210.0,
"step": 2320
},
{
"entropy": 0.9891969002783298,
"epoch": 0.8916953693073096,
"grad_norm": 0.07942050695419312,
"learning_rate": 0.00014797905196723514,
"loss": 1.0403067588806152,
"mean_token_accuracy": 0.7636258214712143,
"num_tokens": 9386251.0,
"step": 2330
},
{
"entropy": 1.034816125780344,
"epoch": 0.8955223880597015,
"grad_norm": 0.07803855836391449,
"learning_rate": 0.00014771048744460858,
"loss": 1.088371467590332,
"mean_token_accuracy": 0.7585563778877258,
"num_tokens": 9425492.0,
"step": 2340
},
{
"entropy": 0.998091223090887,
"epoch": 0.8993494068120934,
"grad_norm": 0.06696243584156036,
"learning_rate": 0.00014744192292198203,
"loss": 1.0410521507263184,
"mean_token_accuracy": 0.7595112159848213,
"num_tokens": 9466862.0,
"step": 2350
},
{
"entropy": 0.9615898832678795,
"epoch": 0.9031764255644853,
"grad_norm": 0.07813845574855804,
"learning_rate": 0.00014717335839935545,
"loss": 1.0265610694885254,
"mean_token_accuracy": 0.7707905381917953,
"num_tokens": 9503827.0,
"step": 2360
},
{
"entropy": 0.8776158876717091,
"epoch": 0.9070034443168772,
"grad_norm": 0.10287057608366013,
"learning_rate": 0.0001469047938767289,
"loss": 0.9231206893920898,
"mean_token_accuracy": 0.7909859612584114,
"num_tokens": 9536194.0,
"step": 2370
},
{
"entropy": 0.980732673406601,
"epoch": 0.9108304630692691,
"grad_norm": 0.06174289435148239,
"learning_rate": 0.00014663622935410232,
"loss": 1.0316704750061034,
"mean_token_accuracy": 0.7596900418400765,
"num_tokens": 9577621.0,
"step": 2380
},
{
"entropy": 1.0083129487931728,
"epoch": 0.914657481821661,
"grad_norm": 0.08805451542139053,
"learning_rate": 0.00014636766483147576,
"loss": 1.0296180725097657,
"mean_token_accuracy": 0.7577597886323929,
"num_tokens": 9616522.0,
"step": 2390
},
{
"entropy": 1.0002505116164684,
"epoch": 0.9184845005740528,
"grad_norm": 0.07697928696870804,
"learning_rate": 0.0001460991003088492,
"loss": 1.0411831855773925,
"mean_token_accuracy": 0.7589930936694145,
"num_tokens": 9659217.0,
"step": 2400
},
{
"entropy": 0.971958789229393,
"epoch": 0.9223115193264447,
"grad_norm": 0.08504882454872131,
"learning_rate": 0.00014583053578622266,
"loss": 1.015835952758789,
"mean_token_accuracy": 0.7664303690195083,
"num_tokens": 9694120.0,
"step": 2410
},
{
"entropy": 0.9250703640282154,
"epoch": 0.9261385380788366,
"grad_norm": 0.06279303133487701,
"learning_rate": 0.00014556197126359608,
"loss": 0.9673631668090821,
"mean_token_accuracy": 0.782692727446556,
"num_tokens": 9732460.0,
"step": 2420
},
{
"entropy": 1.0777716524899006,
"epoch": 0.9299655568312285,
"grad_norm": 0.06884833425283432,
"learning_rate": 0.00014529340674096952,
"loss": 1.1415311813354492,
"mean_token_accuracy": 0.7447684407234192,
"num_tokens": 9773760.0,
"step": 2430
},
{
"entropy": 1.0116477236151695,
"epoch": 0.9337925755836204,
"grad_norm": 0.06346814334392548,
"learning_rate": 0.00014502484221834297,
"loss": 1.0904932975769044,
"mean_token_accuracy": 0.7616935014724732,
"num_tokens": 9808910.0,
"step": 2440
},
{
"entropy": 0.9434679664671421,
"epoch": 0.9376195943360123,
"grad_norm": 0.09843038022518158,
"learning_rate": 0.0001447562776957164,
"loss": 1.0111047744750976,
"mean_token_accuracy": 0.774254959821701,
"num_tokens": 9846472.0,
"step": 2450
},
{
"entropy": 1.035598163306713,
"epoch": 0.9414466130884042,
"grad_norm": 0.08025770634412766,
"learning_rate": 0.00014448771317308984,
"loss": 1.1550275802612304,
"mean_token_accuracy": 0.7497850373387337,
"num_tokens": 9885082.0,
"step": 2460
},
{
"entropy": 1.057615876197815,
"epoch": 0.945273631840796,
"grad_norm": 0.07916443794965744,
"learning_rate": 0.00014421914865046329,
"loss": 1.114585781097412,
"mean_token_accuracy": 0.7495191320776939,
"num_tokens": 9924849.0,
"step": 2470
},
{
"entropy": 0.9576205931603908,
"epoch": 0.9491006505931879,
"grad_norm": 0.10745597630739212,
"learning_rate": 0.00014395058412783673,
"loss": 1.0471231460571289,
"mean_token_accuracy": 0.7697127804160118,
"num_tokens": 9969210.0,
"step": 2480
},
{
"entropy": 1.012363300472498,
"epoch": 0.9529276693455798,
"grad_norm": 0.09448845684528351,
"learning_rate": 0.00014368201960521015,
"loss": 1.0322566986083985,
"mean_token_accuracy": 0.7568502962589264,
"num_tokens": 10009532.0,
"step": 2490
},
{
"entropy": 0.9387446999549866,
"epoch": 0.9567546880979717,
"grad_norm": 0.08835543692111969,
"learning_rate": 0.0001434134550825836,
"loss": 0.9836790084838867,
"mean_token_accuracy": 0.7740270137786865,
"num_tokens": 10051767.0,
"step": 2500
},
{
"entropy": 1.043863268941641,
"epoch": 0.9605817068503636,
"grad_norm": 0.0590866394340992,
"learning_rate": 0.00014314489055995705,
"loss": 1.1286373138427734,
"mean_token_accuracy": 0.755294018983841,
"num_tokens": 10093518.0,
"step": 2510
},
{
"entropy": 1.068480123579502,
"epoch": 0.9644087256027555,
"grad_norm": 0.06240773946046829,
"learning_rate": 0.00014287632603733047,
"loss": 1.1243531227111816,
"mean_token_accuracy": 0.7457959160208703,
"num_tokens": 10137842.0,
"step": 2520
},
{
"entropy": 0.9648511357605457,
"epoch": 0.9682357443551474,
"grad_norm": 0.07577214390039444,
"learning_rate": 0.00014260776151470391,
"loss": 1.0646875381469727,
"mean_token_accuracy": 0.7689151406288147,
"num_tokens": 10177541.0,
"step": 2530
},
{
"entropy": 1.0034234993159772,
"epoch": 0.9720627631075393,
"grad_norm": 0.06887607276439667,
"learning_rate": 0.00014233919699207736,
"loss": 1.0736650466918944,
"mean_token_accuracy": 0.7580653995275497,
"num_tokens": 10217056.0,
"step": 2540
},
{
"entropy": 0.9054977536201477,
"epoch": 0.9758897818599311,
"grad_norm": 0.12731540203094482,
"learning_rate": 0.00014207063246945078,
"loss": 0.9581779479980469,
"mean_token_accuracy": 0.7800818130373954,
"num_tokens": 10249622.0,
"step": 2550
},
{
"entropy": 1.0892111197113992,
"epoch": 0.979716800612323,
"grad_norm": 0.08707671612501144,
"learning_rate": 0.00014180206794682423,
"loss": 1.1551457405090333,
"mean_token_accuracy": 0.7434241071343421,
"num_tokens": 10287483.0,
"step": 2560
},
{
"entropy": 0.9462251186370849,
"epoch": 0.9835438193647149,
"grad_norm": 0.10457631945610046,
"learning_rate": 0.00014153350342419768,
"loss": 0.9859563827514648,
"mean_token_accuracy": 0.7729493409395218,
"num_tokens": 10324562.0,
"step": 2570
},
{
"entropy": 0.9609014384448529,
"epoch": 0.9873708381171068,
"grad_norm": 0.1095169261097908,
"learning_rate": 0.0001412649389015711,
"loss": 1.00408992767334,
"mean_token_accuracy": 0.769461353123188,
"num_tokens": 10368482.0,
"step": 2580
},
{
"entropy": 0.9500531531870365,
"epoch": 0.9911978568694987,
"grad_norm": 0.12787973880767822,
"learning_rate": 0.00014099637437894454,
"loss": 1.0082733154296875,
"mean_token_accuracy": 0.7726384818553924,
"num_tokens": 10407666.0,
"step": 2590
},
{
"entropy": 0.9639500208199024,
"epoch": 0.9950248756218906,
"grad_norm": 0.08555731922388077,
"learning_rate": 0.000140727809856318,
"loss": 0.9910324096679688,
"mean_token_accuracy": 0.7700270056724549,
"num_tokens": 10445419.0,
"step": 2600
},
{
"entropy": 0.9984636768698693,
"epoch": 0.9988518943742825,
"grad_norm": 0.10294629633426666,
"learning_rate": 0.00014045924533369144,
"loss": 1.0837631225585938,
"mean_token_accuracy": 0.7655858203768731,
"num_tokens": 10483287.0,
"step": 2610
},
{
"entropy": 0.940229170024395,
"epoch": 1.0026789131266742,
"grad_norm": 0.10580310225486755,
"learning_rate": 0.00014019068081106486,
"loss": 0.9650541305541992,
"mean_token_accuracy": 0.7728109017014504,
"num_tokens": 10523841.0,
"step": 2620
},
{
"entropy": 0.9358184114098549,
"epoch": 1.0065059318790661,
"grad_norm": 0.12460961192846298,
"learning_rate": 0.0001399221162884383,
"loss": 0.9570166587829589,
"mean_token_accuracy": 0.7772100657224655,
"num_tokens": 10561636.0,
"step": 2630
},
{
"entropy": 1.010379894077778,
"epoch": 1.010332950631458,
"grad_norm": 0.0781383365392685,
"learning_rate": 0.00013965355176581175,
"loss": 1.0524909019470214,
"mean_token_accuracy": 0.7589353621006012,
"num_tokens": 10605899.0,
"step": 2640
},
{
"entropy": 0.977487600594759,
"epoch": 1.01415996938385,
"grad_norm": 0.0902724489569664,
"learning_rate": 0.00013938498724318517,
"loss": 1.0475889205932618,
"mean_token_accuracy": 0.7629667386412621,
"num_tokens": 10642372.0,
"step": 2650
},
{
"entropy": 0.9681369736790657,
"epoch": 1.0179869881362418,
"grad_norm": 0.06344746798276901,
"learning_rate": 0.00013911642272055862,
"loss": 1.0268775939941406,
"mean_token_accuracy": 0.7677509978413581,
"num_tokens": 10682308.0,
"step": 2660
},
{
"entropy": 0.9013996437191963,
"epoch": 1.0218140068886337,
"grad_norm": 0.09890369325876236,
"learning_rate": 0.00013884785819793206,
"loss": 0.969085693359375,
"mean_token_accuracy": 0.7815661624073982,
"num_tokens": 10720755.0,
"step": 2670
},
{
"entropy": 0.9415140472352505,
"epoch": 1.0256410256410255,
"grad_norm": 0.08691754937171936,
"learning_rate": 0.00013857929367530548,
"loss": 0.9783688545227051,
"mean_token_accuracy": 0.7722749456763267,
"num_tokens": 10759842.0,
"step": 2680
},
{
"entropy": 0.9437286920845509,
"epoch": 1.0294680443934174,
"grad_norm": 0.06577731668949127,
"learning_rate": 0.00013831072915267893,
"loss": 0.9904938697814941,
"mean_token_accuracy": 0.7716649904847145,
"num_tokens": 10803740.0,
"step": 2690
},
{
"entropy": 0.9657303221523762,
"epoch": 1.0332950631458093,
"grad_norm": 0.07847272604703903,
"learning_rate": 0.00013804216463005238,
"loss": 1.0073646545410155,
"mean_token_accuracy": 0.7678608119487762,
"num_tokens": 10841808.0,
"step": 2700
},
{
"entropy": 0.881027878075838,
"epoch": 1.0371220818982012,
"grad_norm": 0.12755495309829712,
"learning_rate": 0.00013777360010742583,
"loss": 0.955751895904541,
"mean_token_accuracy": 0.7835927039384842,
"num_tokens": 10880108.0,
"step": 2710
},
{
"entropy": 0.8458237417042256,
"epoch": 1.040949100650593,
"grad_norm": 0.07641884684562683,
"learning_rate": 0.00013750503558479925,
"loss": 0.9140083312988281,
"mean_token_accuracy": 0.7939343526959419,
"num_tokens": 10916272.0,
"step": 2720
},
{
"entropy": 0.8845301080495119,
"epoch": 1.044776119402985,
"grad_norm": 0.08896184712648392,
"learning_rate": 0.0001372364710621727,
"loss": 0.9332797050476074,
"mean_token_accuracy": 0.7884662911295891,
"num_tokens": 10951932.0,
"step": 2730
},
{
"entropy": 0.963884600251913,
"epoch": 1.0486031381553769,
"grad_norm": 0.10196536034345627,
"learning_rate": 0.00013696790653954614,
"loss": 1.0123867988586426,
"mean_token_accuracy": 0.7659088596701622,
"num_tokens": 10991548.0,
"step": 2740
},
{
"entropy": 0.9720129862427711,
"epoch": 1.0524301569077688,
"grad_norm": 0.07552212476730347,
"learning_rate": 0.00013669934201691956,
"loss": 1.015409564971924,
"mean_token_accuracy": 0.7689290955662728,
"num_tokens": 11028749.0,
"step": 2750
},
{
"entropy": 0.9871743015944958,
"epoch": 1.0562571756601606,
"grad_norm": 0.09255808591842651,
"learning_rate": 0.000136430777494293,
"loss": 1.0351217269897461,
"mean_token_accuracy": 0.7620491668581962,
"num_tokens": 11071336.0,
"step": 2760
},
{
"entropy": 0.809666246920824,
"epoch": 1.0600841944125525,
"grad_norm": 0.08891233056783676,
"learning_rate": 0.00013616221297166645,
"loss": 0.8595174789428711,
"mean_token_accuracy": 0.8053640425205231,
"num_tokens": 11107708.0,
"step": 2770
},
{
"entropy": 0.9220615286380053,
"epoch": 1.0639112131649444,
"grad_norm": 0.0731620192527771,
"learning_rate": 0.0001358936484490399,
"loss": 0.9694333076477051,
"mean_token_accuracy": 0.7767527863383293,
"num_tokens": 11149005.0,
"step": 2780
},
{
"entropy": 0.8744502332061529,
"epoch": 1.0677382319173363,
"grad_norm": 0.0865791067481041,
"learning_rate": 0.00013562508392641332,
"loss": 0.9401009559631348,
"mean_token_accuracy": 0.7854847684502602,
"num_tokens": 11189214.0,
"step": 2790
},
{
"entropy": 0.989877526462078,
"epoch": 1.0715652506697282,
"grad_norm": 0.09394430369138718,
"learning_rate": 0.00013535651940378677,
"loss": 1.0487696647644043,
"mean_token_accuracy": 0.7607394486665726,
"num_tokens": 11225161.0,
"step": 2800
},
{
"entropy": 0.8656694941222668,
"epoch": 1.07539226942212,
"grad_norm": 0.10940351337194443,
"learning_rate": 0.0001350879548811602,
"loss": 0.9236039161682129,
"mean_token_accuracy": 0.7919901207089424,
"num_tokens": 11261274.0,
"step": 2810
},
{
"entropy": 1.063130483776331,
"epoch": 1.079219288174512,
"grad_norm": 0.06853083521127701,
"learning_rate": 0.00013481939035853364,
"loss": 1.0725152015686035,
"mean_token_accuracy": 0.7454188778996468,
"num_tokens": 11302522.0,
"step": 2820
},
{
"entropy": 0.92764787748456,
"epoch": 1.0830463069269038,
"grad_norm": 0.10344231128692627,
"learning_rate": 0.00013455082583590708,
"loss": 0.9725144386291504,
"mean_token_accuracy": 0.7810687303543091,
"num_tokens": 11339898.0,
"step": 2830
},
{
"entropy": 0.9415482886135578,
"epoch": 1.0868733256792957,
"grad_norm": 0.12117484956979752,
"learning_rate": 0.00013428226131328053,
"loss": 1.0216625213623047,
"mean_token_accuracy": 0.7713929772377014,
"num_tokens": 11380187.0,
"step": 2840
},
{
"entropy": 0.9300718136131764,
"epoch": 1.0907003444316876,
"grad_norm": 0.09950343519449234,
"learning_rate": 0.00013401369679065398,
"loss": 0.9862215042114257,
"mean_token_accuracy": 0.7748491272330285,
"num_tokens": 11417351.0,
"step": 2850
},
{
"entropy": 0.9016943011432886,
"epoch": 1.0945273631840795,
"grad_norm": 0.10104110836982727,
"learning_rate": 0.0001337451322680274,
"loss": 0.9565576553344727,
"mean_token_accuracy": 0.7823473244905472,
"num_tokens": 11455566.0,
"step": 2860
},
{
"entropy": 1.0184541821479798,
"epoch": 1.0983543819364714,
"grad_norm": 0.07055146247148514,
"learning_rate": 0.00013347656774540084,
"loss": 1.0644380569458007,
"mean_token_accuracy": 0.7551941126585007,
"num_tokens": 11499960.0,
"step": 2870
},
{
"entropy": 0.9143499568104744,
"epoch": 1.1021814006888633,
"grad_norm": 0.09798481315374374,
"learning_rate": 0.00013320800322277426,
"loss": 0.9477805137634278,
"mean_token_accuracy": 0.778240317106247,
"num_tokens": 11536434.0,
"step": 2880
},
{
"entropy": 0.8803758375346661,
"epoch": 1.1060084194412552,
"grad_norm": 0.09720771014690399,
"learning_rate": 0.0001329394387001477,
"loss": 0.9369168281555176,
"mean_token_accuracy": 0.786097663640976,
"num_tokens": 11572420.0,
"step": 2890
},
{
"entropy": 0.9127089619636536,
"epoch": 1.109835438193647,
"grad_norm": 0.07493265718221664,
"learning_rate": 0.00013267087417752116,
"loss": 0.9610566139221192,
"mean_token_accuracy": 0.7780416712164879,
"num_tokens": 11607494.0,
"step": 2900
},
{
"entropy": 0.9359945230185985,
"epoch": 1.113662456946039,
"grad_norm": 0.09086300432682037,
"learning_rate": 0.0001324023096548946,
"loss": 0.9519670486450196,
"mean_token_accuracy": 0.7745376393198967,
"num_tokens": 11647057.0,
"step": 2910
},
{
"entropy": 0.9206651791930198,
"epoch": 1.1174894756984308,
"grad_norm": 0.10007902979850769,
"learning_rate": 0.00013213374513226805,
"loss": 0.9783179283142089,
"mean_token_accuracy": 0.778519794344902,
"num_tokens": 11685762.0,
"step": 2920
},
{
"entropy": 0.9937357418239117,
"epoch": 1.1213164944508227,
"grad_norm": 0.0993100181221962,
"learning_rate": 0.00013186518060964147,
"loss": 1.0440019607543944,
"mean_token_accuracy": 0.7590440228581429,
"num_tokens": 11727379.0,
"step": 2930
},
{
"entropy": 1.048055526614189,
"epoch": 1.1251435132032146,
"grad_norm": 0.11140380054712296,
"learning_rate": 0.0001315966160870149,
"loss": 1.1046284675598144,
"mean_token_accuracy": 0.7413847833871842,
"num_tokens": 11770734.0,
"step": 2940
},
{
"entropy": 0.9562077779322863,
"epoch": 1.1289705319556065,
"grad_norm": 0.11506770551204681,
"learning_rate": 0.00013132805156438834,
"loss": 0.9946146011352539,
"mean_token_accuracy": 0.7750585973262787,
"num_tokens": 11806270.0,
"step": 2950
},
{
"entropy": 0.9747304327785968,
"epoch": 1.1327975507079984,
"grad_norm": 0.1126897856593132,
"learning_rate": 0.00013105948704176179,
"loss": 1.061129093170166,
"mean_token_accuracy": 0.7613553464412689,
"num_tokens": 11852779.0,
"step": 2960
},
{
"entropy": 1.0132145062088966,
"epoch": 1.1366245694603903,
"grad_norm": 0.08260762691497803,
"learning_rate": 0.00013079092251913523,
"loss": 1.0199948310852052,
"mean_token_accuracy": 0.7617463275790215,
"num_tokens": 11897084.0,
"step": 2970
},
{
"entropy": 0.9878915682435035,
"epoch": 1.1404515882127821,
"grad_norm": 0.08098926395177841,
"learning_rate": 0.00013052235799650868,
"loss": 1.0480783462524415,
"mean_token_accuracy": 0.763205036520958,
"num_tokens": 11938987.0,
"step": 2980
},
{
"entropy": 1.0176467482000588,
"epoch": 1.144278606965174,
"grad_norm": 0.0966029092669487,
"learning_rate": 0.0001302537934738821,
"loss": 1.093599796295166,
"mean_token_accuracy": 0.7526282608509064,
"num_tokens": 11981156.0,
"step": 2990
},
{
"entropy": 1.0054687768220902,
"epoch": 1.148105625717566,
"grad_norm": 0.09327300637960434,
"learning_rate": 0.00012998522895125555,
"loss": 1.039564609527588,
"mean_token_accuracy": 0.7592228040099144,
"num_tokens": 12025389.0,
"step": 3000
},
{
"entropy": 0.9626951858401298,
"epoch": 1.1519326444699578,
"grad_norm": 0.06154703348875046,
"learning_rate": 0.00012971666442862897,
"loss": 0.9993762016296387,
"mean_token_accuracy": 0.769777101278305,
"num_tokens": 12069545.0,
"step": 3010
},
{
"entropy": 0.9221224367618561,
"epoch": 1.1557596632223497,
"grad_norm": 0.1140643060207367,
"learning_rate": 0.00012944809990600241,
"loss": 0.9887493133544922,
"mean_token_accuracy": 0.7754134178161621,
"num_tokens": 12113892.0,
"step": 3020
},
{
"entropy": 1.011741641908884,
"epoch": 1.1595866819747416,
"grad_norm": 0.08721659332513809,
"learning_rate": 0.00012917953538337586,
"loss": 1.068478488922119,
"mean_token_accuracy": 0.7615607067942619,
"num_tokens": 12153746.0,
"step": 3030
},
{
"entropy": 0.9926261432468891,
"epoch": 1.1634137007271335,
"grad_norm": 0.07577186822891235,
"learning_rate": 0.0001289109708607493,
"loss": 1.047102451324463,
"mean_token_accuracy": 0.7669480383396149,
"num_tokens": 12199067.0,
"step": 3040
},
{
"entropy": 0.945004402846098,
"epoch": 1.1672407194795253,
"grad_norm": 0.08443465083837509,
"learning_rate": 0.00012864240633812276,
"loss": 0.9891506195068359,
"mean_token_accuracy": 0.7756656989455223,
"num_tokens": 12243766.0,
"step": 3050
},
{
"entropy": 0.9602406993508339,
"epoch": 1.1710677382319172,
"grad_norm": 0.07647141069173813,
"learning_rate": 0.00012837384181549618,
"loss": 1.0091946601867676,
"mean_token_accuracy": 0.7702717915177345,
"num_tokens": 12279555.0,
"step": 3060
},
{
"entropy": 0.9430582121014595,
"epoch": 1.1748947569843091,
"grad_norm": 0.10050038248300552,
"learning_rate": 0.0001281052772928696,
"loss": 1.0251899719238282,
"mean_token_accuracy": 0.7759435445070266,
"num_tokens": 12316974.0,
"step": 3070
},
{
"entropy": 1.0339640237390995,
"epoch": 1.178721775736701,
"grad_norm": 0.09026551991701126,
"learning_rate": 0.00012783671277024304,
"loss": 1.0652464866638183,
"mean_token_accuracy": 0.7533303231000901,
"num_tokens": 12358111.0,
"step": 3080
},
{
"entropy": 0.9808862328529357,
"epoch": 1.182548794489093,
"grad_norm": 0.08769362419843674,
"learning_rate": 0.0001275681482476165,
"loss": 1.0068347930908204,
"mean_token_accuracy": 0.7660810023546218,
"num_tokens": 12401669.0,
"step": 3090
},
{
"entropy": 0.9436531282961369,
"epoch": 1.1863758132414848,
"grad_norm": 0.09366963803768158,
"learning_rate": 0.00012729958372498994,
"loss": 1.0298351287841796,
"mean_token_accuracy": 0.7704201564192772,
"num_tokens": 12442005.0,
"step": 3100
},
{
"entropy": 0.8712134130299092,
"epoch": 1.1902028319938767,
"grad_norm": 0.14041900634765625,
"learning_rate": 0.00012703101920236338,
"loss": 0.9094470977783203,
"mean_token_accuracy": 0.7861496224999428,
"num_tokens": 12484476.0,
"step": 3110
},
{
"entropy": 0.9474696554243565,
"epoch": 1.1940298507462686,
"grad_norm": 0.10449594259262085,
"learning_rate": 0.00012676245467973683,
"loss": 0.9729720115661621,
"mean_token_accuracy": 0.7746587276458741,
"num_tokens": 12521351.0,
"step": 3120
},
{
"entropy": 0.9215874671936035,
"epoch": 1.1978568694986604,
"grad_norm": 0.07733117789030075,
"learning_rate": 0.00012649389015711025,
"loss": 0.992548942565918,
"mean_token_accuracy": 0.7789316549897194,
"num_tokens": 12564603.0,
"step": 3130
},
{
"entropy": 0.9349980562925339,
"epoch": 1.2016838882510523,
"grad_norm": 0.06924714148044586,
"learning_rate": 0.00012622532563448367,
"loss": 1.010727596282959,
"mean_token_accuracy": 0.7728876963257789,
"num_tokens": 12606025.0,
"step": 3140
},
{
"entropy": 0.9719727545976639,
"epoch": 1.2055109070034442,
"grad_norm": 0.07646770775318146,
"learning_rate": 0.00012595676111185712,
"loss": 1.0482423782348633,
"mean_token_accuracy": 0.7659243881702423,
"num_tokens": 12647703.0,
"step": 3150
},
{
"entropy": 1.0236301876604557,
"epoch": 1.209337925755836,
"grad_norm": 0.08547945320606232,
"learning_rate": 0.00012568819658923056,
"loss": 1.0771334648132325,
"mean_token_accuracy": 0.7551302567124367,
"num_tokens": 12692347.0,
"step": 3160
},
{
"entropy": 0.9277745552361012,
"epoch": 1.213164944508228,
"grad_norm": 0.10816850513219833,
"learning_rate": 0.000125419632066604,
"loss": 0.9680308341979981,
"mean_token_accuracy": 0.7722468450665474,
"num_tokens": 12729671.0,
"step": 3170
},
{
"entropy": 0.9760092988610267,
"epoch": 1.2169919632606199,
"grad_norm": 0.08950033783912659,
"learning_rate": 0.00012515106754397746,
"loss": 1.000643539428711,
"mean_token_accuracy": 0.7665232941508293,
"num_tokens": 12768100.0,
"step": 3180
},
{
"entropy": 0.9292771026492119,
"epoch": 1.2208189820130118,
"grad_norm": 0.08686704933643341,
"learning_rate": 0.0001248825030213509,
"loss": 1.019674015045166,
"mean_token_accuracy": 0.7758068069815636,
"num_tokens": 12801323.0,
"step": 3190
},
{
"entropy": 0.8500060614198446,
"epoch": 1.2246460007654036,
"grad_norm": 0.07462778687477112,
"learning_rate": 0.00012461393849872433,
"loss": 0.9042973518371582,
"mean_token_accuracy": 0.7897424980998039,
"num_tokens": 12839880.0,
"step": 3200
},
{
"entropy": 0.9205234386026859,
"epoch": 1.2284730195177955,
"grad_norm": 0.07027672231197357,
"learning_rate": 0.00012434537397609775,
"loss": 0.9424190521240234,
"mean_token_accuracy": 0.7767854332923889,
"num_tokens": 12878349.0,
"step": 3210
},
{
"entropy": 0.9074239492416382,
"epoch": 1.2323000382701874,
"grad_norm": 0.09741132706403732,
"learning_rate": 0.0001240768094534712,
"loss": 0.9651589393615723,
"mean_token_accuracy": 0.7790584430098534,
"num_tokens": 12917588.0,
"step": 3220
},
{
"entropy": 0.8874296098947525,
"epoch": 1.2361270570225793,
"grad_norm": 0.08608463406562805,
"learning_rate": 0.00012380824493084464,
"loss": 0.9437139511108399,
"mean_token_accuracy": 0.7854243695735932,
"num_tokens": 12956199.0,
"step": 3230
},
{
"entropy": 0.9470510125160218,
"epoch": 1.2399540757749712,
"grad_norm": 0.09247037768363953,
"learning_rate": 0.0001235396804082181,
"loss": 1.032781982421875,
"mean_token_accuracy": 0.7712572082877159,
"num_tokens": 13000822.0,
"step": 3240
},
{
"entropy": 0.8850176699459553,
"epoch": 1.243781094527363,
"grad_norm": 0.08397585898637772,
"learning_rate": 0.00012327111588559153,
"loss": 0.9292671203613281,
"mean_token_accuracy": 0.787578609585762,
"num_tokens": 13043532.0,
"step": 3250
},
{
"entropy": 0.8605544999241829,
"epoch": 1.247608113279755,
"grad_norm": 0.0952179804444313,
"learning_rate": 0.00012300255136296498,
"loss": 0.8990240097045898,
"mean_token_accuracy": 0.7919793605804444,
"num_tokens": 13081376.0,
"step": 3260
},
{
"entropy": 1.003395075351,
"epoch": 1.2514351320321468,
"grad_norm": 0.08914512395858765,
"learning_rate": 0.0001227339868403384,
"loss": 1.1446642875671387,
"mean_token_accuracy": 0.7565032340586185,
"num_tokens": 13119474.0,
"step": 3270
},
{
"entropy": 0.9566417217254639,
"epoch": 1.2552621507845387,
"grad_norm": 0.13220350444316864,
"learning_rate": 0.00012246542231771182,
"loss": 0.9976698875427246,
"mean_token_accuracy": 0.7722181305289268,
"num_tokens": 13162637.0,
"step": 3280
},
{
"entropy": 0.888442064449191,
"epoch": 1.2590891695369306,
"grad_norm": 0.10493922978639603,
"learning_rate": 0.00012219685779508527,
"loss": 0.916744613647461,
"mean_token_accuracy": 0.7896391779184342,
"num_tokens": 13199412.0,
"step": 3290
},
{
"entropy": 0.9262259535491466,
"epoch": 1.2629161882893225,
"grad_norm": 0.09022962301969528,
"learning_rate": 0.00012192829327245872,
"loss": 0.9885137557983399,
"mean_token_accuracy": 0.778158649802208,
"num_tokens": 13240292.0,
"step": 3300
},
{
"entropy": 0.9356066003441811,
"epoch": 1.2667432070417144,
"grad_norm": 0.09693239629268646,
"learning_rate": 0.00012165972874983216,
"loss": 0.9731400489807129,
"mean_token_accuracy": 0.7748182758688926,
"num_tokens": 13275876.0,
"step": 3310
},
{
"entropy": 0.868951104208827,
"epoch": 1.2705702257941063,
"grad_norm": 0.09237370640039444,
"learning_rate": 0.0001213911642272056,
"loss": 0.9127277374267578,
"mean_token_accuracy": 0.7890144631266593,
"num_tokens": 13314857.0,
"step": 3320
},
{
"entropy": 0.9311054348945618,
"epoch": 1.2743972445464982,
"grad_norm": 0.08701436221599579,
"learning_rate": 0.00012112259970457902,
"loss": 0.9666108131408692,
"mean_token_accuracy": 0.7752738267183303,
"num_tokens": 13357039.0,
"step": 3330
},
{
"entropy": 0.9256260149180889,
"epoch": 1.27822426329889,
"grad_norm": 0.08751461654901505,
"learning_rate": 0.00012085403518195246,
"loss": 0.9926286697387695,
"mean_token_accuracy": 0.7750931903719902,
"num_tokens": 13397058.0,
"step": 3340
},
{
"entropy": 1.0074332721531392,
"epoch": 1.282051282051282,
"grad_norm": 0.07409587502479553,
"learning_rate": 0.00012058547065932591,
"loss": 1.062586498260498,
"mean_token_accuracy": 0.7546869352459907,
"num_tokens": 13441381.0,
"step": 3350
},
{
"entropy": 0.9596263833343983,
"epoch": 1.2858783008036738,
"grad_norm": 0.09343665838241577,
"learning_rate": 0.00012031690613669934,
"loss": 1.0023324012756347,
"mean_token_accuracy": 0.7719831839203835,
"num_tokens": 13481914.0,
"step": 3360
},
{
"entropy": 0.9313522674143314,
"epoch": 1.2897053195560657,
"grad_norm": 0.0879049226641655,
"learning_rate": 0.00012004834161407279,
"loss": 0.9833806991577149,
"mean_token_accuracy": 0.7737741976976394,
"num_tokens": 13519831.0,
"step": 3370
},
{
"entropy": 0.8369917057454586,
"epoch": 1.2935323383084576,
"grad_norm": 0.14339204132556915,
"learning_rate": 0.00011977977709144624,
"loss": 0.9147489547729493,
"mean_token_accuracy": 0.7984762340784073,
"num_tokens": 13559768.0,
"step": 3380
},
{
"entropy": 0.9055653363466263,
"epoch": 1.2973593570608495,
"grad_norm": 0.1441742479801178,
"learning_rate": 0.00011951121256881967,
"loss": 0.9521515846252442,
"mean_token_accuracy": 0.7834478095173836,
"num_tokens": 13595966.0,
"step": 3390
},
{
"entropy": 0.9677796266973019,
"epoch": 1.3011863758132414,
"grad_norm": 0.11233013868331909,
"learning_rate": 0.00011924264804619309,
"loss": 1.0522055625915527,
"mean_token_accuracy": 0.7664702609181404,
"num_tokens": 13638463.0,
"step": 3400
},
{
"entropy": 0.9398517791181803,
"epoch": 1.3050133945656333,
"grad_norm": 0.088468998670578,
"learning_rate": 0.00011897408352356654,
"loss": 0.9618704795837403,
"mean_token_accuracy": 0.7755557060241699,
"num_tokens": 13677769.0,
"step": 3410
},
{
"entropy": 0.8900398269295693,
"epoch": 1.3088404133180251,
"grad_norm": 0.09742283076047897,
"learning_rate": 0.00011870551900093999,
"loss": 0.9422917366027832,
"mean_token_accuracy": 0.7865706130862236,
"num_tokens": 13713374.0,
"step": 3420
},
{
"entropy": 0.9008657015860081,
"epoch": 1.312667432070417,
"grad_norm": 0.09111864864826202,
"learning_rate": 0.00011843695447831342,
"loss": 0.9726786613464355,
"mean_token_accuracy": 0.7835188135504723,
"num_tokens": 13753165.0,
"step": 3430
},
{
"entropy": 0.954158465564251,
"epoch": 1.316494450822809,
"grad_norm": 0.0949985608458519,
"learning_rate": 0.00011816838995568687,
"loss": 1.0072153091430665,
"mean_token_accuracy": 0.7668681025505066,
"num_tokens": 13790265.0,
"step": 3440
},
{
"entropy": 0.9259054005146027,
"epoch": 1.3203214695752008,
"grad_norm": 0.09144506603479385,
"learning_rate": 0.00011789982543306031,
"loss": 1.0319811820983886,
"mean_token_accuracy": 0.77575224339962,
"num_tokens": 13830720.0,
"step": 3450
},
{
"entropy": 0.9554400585591794,
"epoch": 1.3241484883275927,
"grad_norm": 0.05986972153186798,
"learning_rate": 0.00011763126091043373,
"loss": 0.9840157508850098,
"mean_token_accuracy": 0.7714304268360138,
"num_tokens": 13874024.0,
"step": 3460
},
{
"entropy": 0.9618137650191784,
"epoch": 1.3279755070799846,
"grad_norm": 0.08746087551116943,
"learning_rate": 0.00011736269638780717,
"loss": 1.0280908584594726,
"mean_token_accuracy": 0.7679046332836151,
"num_tokens": 13916099.0,
"step": 3470
},
{
"entropy": 1.02601458132267,
"epoch": 1.3318025258323765,
"grad_norm": 0.09883694350719452,
"learning_rate": 0.00011709413186518061,
"loss": 1.0893220901489258,
"mean_token_accuracy": 0.7487106472253799,
"num_tokens": 13955163.0,
"step": 3480
},
{
"entropy": 1.025067638605833,
"epoch": 1.3356295445847683,
"grad_norm": 0.07656730711460114,
"learning_rate": 0.00011682556734255406,
"loss": 1.0527194023132325,
"mean_token_accuracy": 0.7569629296660423,
"num_tokens": 13996990.0,
"step": 3490
},
{
"entropy": 0.8709930831566453,
"epoch": 1.3394565633371602,
"grad_norm": 0.1119026467204094,
"learning_rate": 0.0001165570028199275,
"loss": 0.9183405876159668,
"mean_token_accuracy": 0.784464044868946,
"num_tokens": 14040315.0,
"step": 3500
},
{
"entropy": 0.9783565014600754,
"epoch": 1.3432835820895521,
"grad_norm": 0.09997576475143433,
"learning_rate": 0.00011628843829730094,
"loss": 1.0318940162658692,
"mean_token_accuracy": 0.7614112690091133,
"num_tokens": 14083204.0,
"step": 3510
},
{
"entropy": 0.9975252889096737,
"epoch": 1.347110600841944,
"grad_norm": 0.10046812891960144,
"learning_rate": 0.00011601987377467437,
"loss": 1.0214290618896484,
"mean_token_accuracy": 0.7584437146782875,
"num_tokens": 14127039.0,
"step": 3520
},
{
"entropy": 0.8959422588348389,
"epoch": 1.350937619594336,
"grad_norm": 0.09512703120708466,
"learning_rate": 0.0001157513092520478,
"loss": 0.9528075218200683,
"mean_token_accuracy": 0.7823959946632385,
"num_tokens": 14163989.0,
"step": 3530
},
{
"entropy": 0.8903120748698712,
"epoch": 1.3547646383467278,
"grad_norm": 0.10500185191631317,
"learning_rate": 0.00011548274472942124,
"loss": 0.9784683227539063,
"mean_token_accuracy": 0.7854589730501175,
"num_tokens": 14198562.0,
"step": 3540
},
{
"entropy": 0.8580869071185588,
"epoch": 1.3585916570991197,
"grad_norm": 0.08716659992933273,
"learning_rate": 0.00011521418020679469,
"loss": 0.9078399658203125,
"mean_token_accuracy": 0.7894850671291351,
"num_tokens": 14236952.0,
"step": 3550
},
{
"entropy": 0.9841447554528713,
"epoch": 1.3624186758515116,
"grad_norm": 0.08638570457696915,
"learning_rate": 0.00011494561568416812,
"loss": 1.0438207626342773,
"mean_token_accuracy": 0.7629329964518548,
"num_tokens": 14278208.0,
"step": 3560
},
{
"entropy": 0.9100395441055298,
"epoch": 1.3662456946039034,
"grad_norm": 0.09058145433664322,
"learning_rate": 0.00011467705116154157,
"loss": 0.9560261726379394,
"mean_token_accuracy": 0.7807327762246132,
"num_tokens": 14314076.0,
"step": 3570
},
{
"entropy": 0.8529263667762279,
"epoch": 1.3700727133562953,
"grad_norm": 0.08847236633300781,
"learning_rate": 0.00011440848663891502,
"loss": 0.9192025184631347,
"mean_token_accuracy": 0.7945622354745865,
"num_tokens": 14349740.0,
"step": 3580
},
{
"entropy": 0.8977530397474766,
"epoch": 1.3738997321086872,
"grad_norm": 0.09535886347293854,
"learning_rate": 0.00011413992211628844,
"loss": 0.9331538200378418,
"mean_token_accuracy": 0.7803975984454155,
"num_tokens": 14392492.0,
"step": 3590
},
{
"entropy": 1.0430821359157563,
"epoch": 1.377726750861079,
"grad_norm": 0.08564139902591705,
"learning_rate": 0.00011387135759366187,
"loss": 1.0767670631408692,
"mean_token_accuracy": 0.7479040876030922,
"num_tokens": 14436961.0,
"step": 3600
},
{
"entropy": 0.8358541168272495,
"epoch": 1.381553769613471,
"grad_norm": 0.09847365319728851,
"learning_rate": 0.00011360279307103532,
"loss": 0.8758580207824707,
"mean_token_accuracy": 0.7964837267994881,
"num_tokens": 14472251.0,
"step": 3610
},
{
"entropy": 0.8302674755454064,
"epoch": 1.3853807883658629,
"grad_norm": 0.08570406585931778,
"learning_rate": 0.00011333422854840876,
"loss": 0.9068514823913574,
"mean_token_accuracy": 0.7943103745579719,
"num_tokens": 14509818.0,
"step": 3620
},
{
"entropy": 0.9825982883572578,
"epoch": 1.3892078071182548,
"grad_norm": 0.10844281315803528,
"learning_rate": 0.0001130656640257822,
"loss": 1.0484787940979003,
"mean_token_accuracy": 0.7600376740097999,
"num_tokens": 14553567.0,
"step": 3630
},
{
"entropy": 1.0431513242423534,
"epoch": 1.3930348258706466,
"grad_norm": 0.0750717744231224,
"learning_rate": 0.00011279709950315564,
"loss": 1.0337225914001464,
"mean_token_accuracy": 0.7504511162638664,
"num_tokens": 14598239.0,
"step": 3640
},
{
"entropy": 0.9319969929754734,
"epoch": 1.3968618446230385,
"grad_norm": 0.08307385444641113,
"learning_rate": 0.00011252853498052909,
"loss": 0.9771868705749511,
"mean_token_accuracy": 0.7778135031461716,
"num_tokens": 14638064.0,
"step": 3650
},
{
"entropy": 0.9992426164448261,
"epoch": 1.4006888633754304,
"grad_norm": 0.09222020208835602,
"learning_rate": 0.00011225997045790251,
"loss": 1.0516475677490233,
"mean_token_accuracy": 0.7587143570184708,
"num_tokens": 14682012.0,
"step": 3660
},
{
"entropy": 0.9670721650123596,
"epoch": 1.4045158821278223,
"grad_norm": 0.09432315081357956,
"learning_rate": 0.00011199140593527595,
"loss": 1.0164658546447753,
"mean_token_accuracy": 0.7670722231268883,
"num_tokens": 14722922.0,
"step": 3670
},
{
"entropy": 0.9808389253914356,
"epoch": 1.4083429008802142,
"grad_norm": 0.08502112329006195,
"learning_rate": 0.00011172284141264939,
"loss": 1.0553858757019043,
"mean_token_accuracy": 0.76065753698349,
"num_tokens": 14765083.0,
"step": 3680
},
{
"entropy": 1.011240091174841,
"epoch": 1.412169919632606,
"grad_norm": 0.07948844134807587,
"learning_rate": 0.00011145427689002284,
"loss": 1.0446209907531738,
"mean_token_accuracy": 0.75536377876997,
"num_tokens": 14806465.0,
"step": 3690
},
{
"entropy": 0.911352240294218,
"epoch": 1.415996938384998,
"grad_norm": 0.08382374793291092,
"learning_rate": 0.00011118571236739627,
"loss": 0.9388965606689453,
"mean_token_accuracy": 0.7807439729571343,
"num_tokens": 14850133.0,
"step": 3700
},
{
"entropy": 0.9055514119565486,
"epoch": 1.4198239571373898,
"grad_norm": 0.10713934898376465,
"learning_rate": 0.00011091714784476972,
"loss": 0.9727254867553711,
"mean_token_accuracy": 0.7801795959472656,
"num_tokens": 14887327.0,
"step": 3710
},
{
"entropy": 0.9338000696152449,
"epoch": 1.4236509758897817,
"grad_norm": 0.11418487876653671,
"learning_rate": 0.00011064858332214314,
"loss": 0.9989487648010253,
"mean_token_accuracy": 0.7747065275907516,
"num_tokens": 14927730.0,
"step": 3720
},
{
"entropy": 0.869029226526618,
"epoch": 1.4274779946421736,
"grad_norm": 0.10778038948774338,
"learning_rate": 0.00011038001879951659,
"loss": 0.9393071174621582,
"mean_token_accuracy": 0.7909289851784707,
"num_tokens": 14964847.0,
"step": 3730
},
{
"entropy": 0.8993408516049385,
"epoch": 1.4313050133945655,
"grad_norm": 0.08339972048997879,
"learning_rate": 0.00011011145427689002,
"loss": 0.9511364936828614,
"mean_token_accuracy": 0.7844893127679825,
"num_tokens": 15003449.0,
"step": 3740
},
{
"entropy": 0.9478372372686863,
"epoch": 1.4351320321469574,
"grad_norm": 0.07547847181558609,
"learning_rate": 0.00010984288975426347,
"loss": 0.9942925453186036,
"mean_token_accuracy": 0.772410535812378,
"num_tokens": 15046091.0,
"step": 3750
},
{
"entropy": 0.8367562972009182,
"epoch": 1.4389590508993493,
"grad_norm": 0.06902482360601425,
"learning_rate": 0.00010957432523163691,
"loss": 0.8951096534729004,
"mean_token_accuracy": 0.7985799089074135,
"num_tokens": 15091826.0,
"step": 3760
},
{
"entropy": 0.9437298484146595,
"epoch": 1.4427860696517412,
"grad_norm": 0.10231524705886841,
"learning_rate": 0.00010930576070901035,
"loss": 0.9919009208679199,
"mean_token_accuracy": 0.7663119360804558,
"num_tokens": 15133719.0,
"step": 3770
},
{
"entropy": 1.0057852260768414,
"epoch": 1.446613088404133,
"grad_norm": 0.09349844604730606,
"learning_rate": 0.0001090371961863838,
"loss": 1.0667811393737794,
"mean_token_accuracy": 0.757930365204811,
"num_tokens": 15173670.0,
"step": 3780
},
{
"entropy": 0.9152357578277588,
"epoch": 1.450440107156525,
"grad_norm": 0.09612533450126648,
"learning_rate": 0.00010876863166375722,
"loss": 0.9641363143920898,
"mean_token_accuracy": 0.7791497871279717,
"num_tokens": 15215154.0,
"step": 3790
},
{
"entropy": 0.849637558311224,
"epoch": 1.4542671259089168,
"grad_norm": 0.07079404592514038,
"learning_rate": 0.00010850006714113066,
"loss": 0.8924535751342774,
"mean_token_accuracy": 0.7958060145378113,
"num_tokens": 15261773.0,
"step": 3800
},
{
"entropy": 0.9689324770122767,
"epoch": 1.4580941446613087,
"grad_norm": 0.10107272863388062,
"learning_rate": 0.0001082315026185041,
"loss": 1.000623607635498,
"mean_token_accuracy": 0.7690365821123123,
"num_tokens": 15295693.0,
"step": 3810
},
{
"entropy": 0.8926774315536022,
"epoch": 1.4619211634137006,
"grad_norm": 0.0883372351527214,
"learning_rate": 0.00010796293809587754,
"loss": 0.9312380790710449,
"mean_token_accuracy": 0.7839185446500778,
"num_tokens": 15332324.0,
"step": 3820
},
{
"entropy": 0.9962236389517785,
"epoch": 1.4657481821660925,
"grad_norm": 0.09174945950508118,
"learning_rate": 0.00010769437357325099,
"loss": 1.0419865608215333,
"mean_token_accuracy": 0.7592507138848305,
"num_tokens": 15370812.0,
"step": 3830
},
{
"entropy": 1.0249286435544491,
"epoch": 1.4695752009184844,
"grad_norm": 0.07152284681797028,
"learning_rate": 0.00010742580905062442,
"loss": 1.0437363624572753,
"mean_token_accuracy": 0.7567671984434128,
"num_tokens": 15417719.0,
"step": 3840
},
{
"entropy": 0.903605168312788,
"epoch": 1.4734022196708763,
"grad_norm": 0.09400783479213715,
"learning_rate": 0.00010715724452799784,
"loss": 0.9410040855407715,
"mean_token_accuracy": 0.7839412048459053,
"num_tokens": 15455856.0,
"step": 3850
},
{
"entropy": 1.0259956195950508,
"epoch": 1.4772292384232681,
"grad_norm": 0.08671914041042328,
"learning_rate": 0.00010688868000537129,
"loss": 1.1025453567504884,
"mean_token_accuracy": 0.7507242172956466,
"num_tokens": 15492109.0,
"step": 3860
},
{
"entropy": 0.9178053669631481,
"epoch": 1.48105625717566,
"grad_norm": 0.07717446982860565,
"learning_rate": 0.00010662011548274474,
"loss": 0.96353178024292,
"mean_token_accuracy": 0.7797438561916351,
"num_tokens": 15532130.0,
"step": 3870
},
{
"entropy": 0.9423278756439686,
"epoch": 1.484883275928052,
"grad_norm": 0.11039029061794281,
"learning_rate": 0.00010635155096011817,
"loss": 0.979669189453125,
"mean_token_accuracy": 0.7755513936281204,
"num_tokens": 15575609.0,
"step": 3880
},
{
"entropy": 0.8999218411743641,
"epoch": 1.4887102946804438,
"grad_norm": 0.08974706381559372,
"learning_rate": 0.00010608298643749162,
"loss": 0.9477033615112305,
"mean_token_accuracy": 0.7822227850556374,
"num_tokens": 15621264.0,
"step": 3890
},
{
"entropy": 0.8756623603403568,
"epoch": 1.4925373134328357,
"grad_norm": 0.10864510387182236,
"learning_rate": 0.00010581442191486505,
"loss": 0.9711783409118653,
"mean_token_accuracy": 0.7893951386213303,
"num_tokens": 15656959.0,
"step": 3900
},
{
"entropy": 0.951158057898283,
"epoch": 1.4963643321852276,
"grad_norm": 0.09398993104696274,
"learning_rate": 0.0001055458573922385,
"loss": 1.0387070655822754,
"mean_token_accuracy": 0.7698590591549873,
"num_tokens": 15700293.0,
"step": 3910
},
{
"entropy": 0.9240442231297493,
"epoch": 1.5001913509376195,
"grad_norm": 0.09761729091405869,
"learning_rate": 0.00010527729286961192,
"loss": 0.9758125305175781,
"mean_token_accuracy": 0.7737968236207962,
"num_tokens": 15739304.0,
"step": 3920
},
{
"entropy": 0.9025500647723674,
"epoch": 1.5040183696900113,
"grad_norm": 0.08816131204366684,
"learning_rate": 0.00010500872834698537,
"loss": 0.913144302368164,
"mean_token_accuracy": 0.7775477200746537,
"num_tokens": 15785086.0,
"step": 3930
},
{
"entropy": 0.8958883471786976,
"epoch": 1.5078453884424032,
"grad_norm": 0.09690563380718231,
"learning_rate": 0.0001047401638243588,
"loss": 0.9484706878662109,
"mean_token_accuracy": 0.7867727875709534,
"num_tokens": 15822631.0,
"step": 3940
},
{
"entropy": 0.8738761503249407,
"epoch": 1.5116724071947951,
"grad_norm": 0.08325833082199097,
"learning_rate": 0.00010447159930173225,
"loss": 0.9258977890014648,
"mean_token_accuracy": 0.7862061053514481,
"num_tokens": 15863533.0,
"step": 3950
},
{
"entropy": 0.952784775942564,
"epoch": 1.515499425947187,
"grad_norm": 0.09089304506778717,
"learning_rate": 0.0001042030347791057,
"loss": 0.9893428802490234,
"mean_token_accuracy": 0.769037912786007,
"num_tokens": 15903798.0,
"step": 3960
},
{
"entropy": 0.9974973328411579,
"epoch": 1.519326444699579,
"grad_norm": 0.06594393402338028,
"learning_rate": 0.00010393447025647913,
"loss": 0.9982621192932128,
"mean_token_accuracy": 0.7653156638145446,
"num_tokens": 15947894.0,
"step": 3970
},
{
"entropy": 1.042479208856821,
"epoch": 1.5231534634519708,
"grad_norm": 0.09250905364751816,
"learning_rate": 0.00010366590573385255,
"loss": 1.0862640380859374,
"mean_token_accuracy": 0.7515693128108978,
"num_tokens": 15985609.0,
"step": 3980
},
{
"entropy": 0.869631578028202,
"epoch": 1.5269804822043627,
"grad_norm": 0.10154584795236588,
"learning_rate": 0.000103397341211226,
"loss": 0.9275701522827149,
"mean_token_accuracy": 0.7910413116216659,
"num_tokens": 16022339.0,
"step": 3990
},
{
"entropy": 0.9228729590773582,
"epoch": 1.5308075009567546,
"grad_norm": 0.08860265463590622,
"learning_rate": 0.00010312877668859944,
"loss": 1.0074289321899415,
"mean_token_accuracy": 0.7770419105887413,
"num_tokens": 16063778.0,
"step": 4000
},
{
"entropy": 0.9469372771680356,
"epoch": 1.5346345197091464,
"grad_norm": 0.08613952249288559,
"learning_rate": 0.00010286021216597287,
"loss": 1.0328418731689453,
"mean_token_accuracy": 0.7784339845180511,
"num_tokens": 16103389.0,
"step": 4010
},
{
"entropy": 0.9240258730947971,
"epoch": 1.5384615384615383,
"grad_norm": 0.09255630522966385,
"learning_rate": 0.00010259164764334632,
"loss": 0.9813838958740234,
"mean_token_accuracy": 0.779928731918335,
"num_tokens": 16141739.0,
"step": 4020
},
{
"entropy": 0.8300335463136435,
"epoch": 1.5422885572139302,
"grad_norm": 0.11173315346240997,
"learning_rate": 0.00010232308312071977,
"loss": 0.8650222778320312,
"mean_token_accuracy": 0.8004546627402306,
"num_tokens": 16179442.0,
"step": 4030
},
{
"entropy": 0.970530441403389,
"epoch": 1.546115575966322,
"grad_norm": 0.08758437633514404,
"learning_rate": 0.0001020545185980932,
"loss": 1.029263401031494,
"mean_token_accuracy": 0.7669389978051185,
"num_tokens": 16220502.0,
"step": 4040
},
{
"entropy": 0.8929917253553867,
"epoch": 1.549942594718714,
"grad_norm": 0.0840209424495697,
"learning_rate": 0.00010178595407546662,
"loss": 0.9574555397033692,
"mean_token_accuracy": 0.7882895812392234,
"num_tokens": 16263944.0,
"step": 4050
},
{
"entropy": 0.9571633011102676,
"epoch": 1.5537696134711059,
"grad_norm": 0.07731885462999344,
"learning_rate": 0.00010151738955284007,
"loss": 1.014600658416748,
"mean_token_accuracy": 0.7691228404641152,
"num_tokens": 16307508.0,
"step": 4060
},
{
"entropy": 0.9627384431660175,
"epoch": 1.5575966322234978,
"grad_norm": 0.09968744218349457,
"learning_rate": 0.00010124882503021352,
"loss": 1.0220794677734375,
"mean_token_accuracy": 0.7685489565134048,
"num_tokens": 16349178.0,
"step": 4070
},
{
"entropy": 0.8696753971278668,
"epoch": 1.5614236509758896,
"grad_norm": 0.08411276340484619,
"learning_rate": 0.00010098026050758695,
"loss": 0.9325771331787109,
"mean_token_accuracy": 0.7903442814946174,
"num_tokens": 16390375.0,
"step": 4080
},
{
"entropy": 0.8790203854441643,
"epoch": 1.5652506697282815,
"grad_norm": 0.0969686210155487,
"learning_rate": 0.0001007116959849604,
"loss": 0.9325167655944824,
"mean_token_accuracy": 0.7890133559703827,
"num_tokens": 16429198.0,
"step": 4090
},
{
"entropy": 0.9447548128664494,
"epoch": 1.5690776884806734,
"grad_norm": 0.07992373406887054,
"learning_rate": 0.00010044313146233384,
"loss": 0.9708291053771972,
"mean_token_accuracy": 0.7737105548381805,
"num_tokens": 16472336.0,
"step": 4100
},
{
"entropy": 0.974559249728918,
"epoch": 1.5729047072330653,
"grad_norm": 0.09685226529836655,
"learning_rate": 0.00010017456693970726,
"loss": 1.0289334297180175,
"mean_token_accuracy": 0.7674296617507934,
"num_tokens": 16511109.0,
"step": 4110
},
{
"entropy": 0.8575489681214095,
"epoch": 1.5767317259854572,
"grad_norm": 0.09298260509967804,
"learning_rate": 9.990600241708071e-05,
"loss": 0.8897696495056152,
"mean_token_accuracy": 0.7952411189675331,
"num_tokens": 16552802.0,
"step": 4120
},
{
"entropy": 0.869475956633687,
"epoch": 1.580558744737849,
"grad_norm": 0.129170760512352,
"learning_rate": 9.963743789445414e-05,
"loss": 0.9408356666564941,
"mean_token_accuracy": 0.7868246123194694,
"num_tokens": 16592603.0,
"step": 4130
},
{
"entropy": 0.9167623318731785,
"epoch": 1.584385763490241,
"grad_norm": 0.08131655305624008,
"learning_rate": 9.936887337182759e-05,
"loss": 1.005775260925293,
"mean_token_accuracy": 0.7779423877596855,
"num_tokens": 16633674.0,
"step": 4140
},
{
"entropy": 0.9069061763584614,
"epoch": 1.5882127822426328,
"grad_norm": 0.07485036551952362,
"learning_rate": 9.910030884920103e-05,
"loss": 0.9540878295898437,
"mean_token_accuracy": 0.7809736356139183,
"num_tokens": 16669966.0,
"step": 4150
},
{
"entropy": 1.0095594763755797,
"epoch": 1.5920398009950247,
"grad_norm": 0.11678522825241089,
"learning_rate": 9.883174432657446e-05,
"loss": 1.0742655754089356,
"mean_token_accuracy": 0.7636483564972878,
"num_tokens": 16711538.0,
"step": 4160
},
{
"entropy": 0.8342153321951628,
"epoch": 1.5958668197474166,
"grad_norm": 0.09654127061367035,
"learning_rate": 9.85631798039479e-05,
"loss": 0.8637946128845215,
"mean_token_accuracy": 0.7977021634578705,
"num_tokens": 16746947.0,
"step": 4170
},
{
"entropy": 0.9147222273051738,
"epoch": 1.5996938384998085,
"grad_norm": 0.10032576322555542,
"learning_rate": 9.829461528132134e-05,
"loss": 0.9848580360412598,
"mean_token_accuracy": 0.7794791385531425,
"num_tokens": 16792089.0,
"step": 4180
},
{
"entropy": 0.9350447114557028,
"epoch": 1.6035208572522004,
"grad_norm": 0.11322317272424698,
"learning_rate": 9.802605075869477e-05,
"loss": 0.9632351875305176,
"mean_token_accuracy": 0.7710213780403137,
"num_tokens": 16831782.0,
"step": 4190
},
{
"entropy": 0.8924577154219151,
"epoch": 1.6073478760045923,
"grad_norm": 0.08842343091964722,
"learning_rate": 9.775748623606822e-05,
"loss": 0.9661048889160156,
"mean_token_accuracy": 0.7863042891025543,
"num_tokens": 16867851.0,
"step": 4200
},
{
"entropy": 0.9452814936637879,
"epoch": 1.6111748947569842,
"grad_norm": 0.10469862073659897,
"learning_rate": 9.748892171344167e-05,
"loss": 1.0315632820129395,
"mean_token_accuracy": 0.769272243976593,
"num_tokens": 16909819.0,
"step": 4210
},
{
"entropy": 0.8794655621051788,
"epoch": 1.615001913509376,
"grad_norm": 0.08528223633766174,
"learning_rate": 9.72203571908151e-05,
"loss": 0.9158189773559571,
"mean_token_accuracy": 0.791112196445465,
"num_tokens": 16945241.0,
"step": 4220
},
{
"entropy": 0.9216304633766412,
"epoch": 1.618828932261768,
"grad_norm": 0.07684458047151566,
"learning_rate": 9.695179266818853e-05,
"loss": 1.0047569274902344,
"mean_token_accuracy": 0.7764274105429649,
"num_tokens": 16986516.0,
"step": 4230
},
{
"entropy": 0.8806056842207909,
"epoch": 1.6226559510141598,
"grad_norm": 0.09925177693367004,
"learning_rate": 9.668322814556198e-05,
"loss": 0.9321705818176269,
"mean_token_accuracy": 0.7873435765504837,
"num_tokens": 17026974.0,
"step": 4240
},
{
"entropy": 1.0260133132338525,
"epoch": 1.6264829697665517,
"grad_norm": 0.07781514525413513,
"learning_rate": 9.641466362293541e-05,
"loss": 1.0732348442077637,
"mean_token_accuracy": 0.755302457511425,
"num_tokens": 17063628.0,
"step": 4250
},
{
"entropy": 0.8771878894418478,
"epoch": 1.6303099885189436,
"grad_norm": 0.12377400696277618,
"learning_rate": 9.614609910030885e-05,
"loss": 0.9051324844360351,
"mean_token_accuracy": 0.7877693608403206,
"num_tokens": 17102243.0,
"step": 4260
},
{
"entropy": 0.9575911372900009,
"epoch": 1.6341370072713355,
"grad_norm": 0.07953961193561554,
"learning_rate": 9.58775345776823e-05,
"loss": 1.0206258773803711,
"mean_token_accuracy": 0.770101509988308,
"num_tokens": 17143256.0,
"step": 4270
},
{
"entropy": 0.9909125387668609,
"epoch": 1.6379640260237274,
"grad_norm": 0.09304741024971008,
"learning_rate": 9.560897005505573e-05,
"loss": 1.043109130859375,
"mean_token_accuracy": 0.7598145559430123,
"num_tokens": 17188878.0,
"step": 4280
},
{
"entropy": 0.8626054737716913,
"epoch": 1.6417910447761193,
"grad_norm": 0.08982561528682709,
"learning_rate": 9.534040553242916e-05,
"loss": 0.9062054634094239,
"mean_token_accuracy": 0.790125061571598,
"num_tokens": 17224537.0,
"step": 4290
},
{
"entropy": 0.919727610051632,
"epoch": 1.6456180635285111,
"grad_norm": 0.11226653307676315,
"learning_rate": 9.507184100980261e-05,
"loss": 0.970013427734375,
"mean_token_accuracy": 0.7747739493846894,
"num_tokens": 17262347.0,
"step": 4300
},
{
"entropy": 1.032866196334362,
"epoch": 1.649445082280903,
"grad_norm": 0.09440238773822784,
"learning_rate": 9.480327648717606e-05,
"loss": 1.0287545204162598,
"mean_token_accuracy": 0.7550160124897957,
"num_tokens": 17307578.0,
"step": 4310
},
{
"entropy": 0.907962580025196,
"epoch": 1.653272101033295,
"grad_norm": 0.11395370960235596,
"learning_rate": 9.453471196454948e-05,
"loss": 0.9705679893493653,
"mean_token_accuracy": 0.7807198286056518,
"num_tokens": 17342943.0,
"step": 4320
},
{
"entropy": 0.8495472550392151,
"epoch": 1.6570991197856868,
"grad_norm": 0.07685171812772751,
"learning_rate": 9.426614744192292e-05,
"loss": 0.9079866409301758,
"mean_token_accuracy": 0.7923004642128945,
"num_tokens": 17378158.0,
"step": 4330
},
{
"entropy": 0.8389323726296425,
"epoch": 1.6609261385380787,
"grad_norm": 0.09541229903697968,
"learning_rate": 9.399758291929637e-05,
"loss": 0.9092423439025878,
"mean_token_accuracy": 0.7946408927440644,
"num_tokens": 17412703.0,
"step": 4340
},
{
"entropy": 0.9035130314528942,
"epoch": 1.6647531572904706,
"grad_norm": 0.08291888236999512,
"learning_rate": 9.37290183966698e-05,
"loss": 0.9255120277404785,
"mean_token_accuracy": 0.7840688213706016,
"num_tokens": 17456250.0,
"step": 4350
},
{
"entropy": 0.8917031817138195,
"epoch": 1.6685801760428625,
"grad_norm": 0.08787538856267929,
"learning_rate": 9.346045387404324e-05,
"loss": 0.9318277359008789,
"mean_token_accuracy": 0.7854569494724274,
"num_tokens": 17492566.0,
"step": 4360
},
{
"entropy": 0.8860244527459145,
"epoch": 1.6724071947952543,
"grad_norm": 0.10287550836801529,
"learning_rate": 9.319188935141668e-05,
"loss": 0.9169553756713867,
"mean_token_accuracy": 0.7801365301012992,
"num_tokens": 17530267.0,
"step": 4370
},
{
"entropy": 0.8470614090561867,
"epoch": 1.6762342135476462,
"grad_norm": 0.13052308559417725,
"learning_rate": 9.292332482879013e-05,
"loss": 0.9004100799560547,
"mean_token_accuracy": 0.791596457362175,
"num_tokens": 17566336.0,
"step": 4380
},
{
"entropy": 0.9627884522080421,
"epoch": 1.6800612323000381,
"grad_norm": 0.09305555373430252,
"learning_rate": 9.265476030616355e-05,
"loss": 0.9837147712707519,
"mean_token_accuracy": 0.7687505498528481,
"num_tokens": 17609294.0,
"step": 4390
},
{
"entropy": 0.9614691123366356,
"epoch": 1.68388825105243,
"grad_norm": 0.08118042349815369,
"learning_rate": 9.2386195783537e-05,
"loss": 1.0093948364257812,
"mean_token_accuracy": 0.7686966329813003,
"num_tokens": 17653408.0,
"step": 4400
},
{
"entropy": 0.8255576498806476,
"epoch": 1.687715269804822,
"grad_norm": 0.07197146117687225,
"learning_rate": 9.211763126091045e-05,
"loss": 0.9013225555419921,
"mean_token_accuracy": 0.7994248151779175,
"num_tokens": 17693303.0,
"step": 4410
},
{
"entropy": 0.9197361193597317,
"epoch": 1.6915422885572138,
"grad_norm": 0.10147208720445633,
"learning_rate": 9.184906673828388e-05,
"loss": 0.966912841796875,
"mean_token_accuracy": 0.774210800230503,
"num_tokens": 17734446.0,
"step": 4420
},
{
"entropy": 0.8828513637185097,
"epoch": 1.6953693073096057,
"grad_norm": 0.08126919716596603,
"learning_rate": 9.158050221565731e-05,
"loss": 0.9237348556518554,
"mean_token_accuracy": 0.788974218070507,
"num_tokens": 17776056.0,
"step": 4430
},
{
"entropy": 0.8538446951657533,
"epoch": 1.6991963260619976,
"grad_norm": 0.08602278679609299,
"learning_rate": 9.131193769303076e-05,
"loss": 0.9384878158569336,
"mean_token_accuracy": 0.7924654617905617,
"num_tokens": 17814560.0,
"step": 4440
},
{
"entropy": 0.9160130321979523,
"epoch": 1.7030233448143894,
"grad_norm": 0.10127890110015869,
"learning_rate": 9.10433731704042e-05,
"loss": 0.9924029350280762,
"mean_token_accuracy": 0.7764815479516983,
"num_tokens": 17852872.0,
"step": 4450
},
{
"entropy": 0.8855723738670349,
"epoch": 1.7068503635667813,
"grad_norm": 0.09295201301574707,
"learning_rate": 9.077480864777763e-05,
"loss": 0.9131739616394043,
"mean_token_accuracy": 0.7876585990190506,
"num_tokens": 17893403.0,
"step": 4460
},
{
"entropy": 0.8825645297765732,
"epoch": 1.7106773823191732,
"grad_norm": 0.1038793995976448,
"learning_rate": 9.050624412515107e-05,
"loss": 0.9621119499206543,
"mean_token_accuracy": 0.7840767920017242,
"num_tokens": 17933069.0,
"step": 4470
},
{
"entropy": 0.9438045337796211,
"epoch": 1.714504401071565,
"grad_norm": 0.08998332172632217,
"learning_rate": 9.023767960252452e-05,
"loss": 1.0042546272277832,
"mean_token_accuracy": 0.7710625112056733,
"num_tokens": 17978081.0,
"step": 4480
},
{
"entropy": 0.9605814971029758,
"epoch": 1.718331419823957,
"grad_norm": 0.0936085507273674,
"learning_rate": 8.996911507989794e-05,
"loss": 1.0731863021850585,
"mean_token_accuracy": 0.7675610318779945,
"num_tokens": 18026355.0,
"step": 4490
},
{
"entropy": 0.9412197135388851,
"epoch": 1.7221584385763489,
"grad_norm": 0.11693151295185089,
"learning_rate": 8.970055055727139e-05,
"loss": 1.0271482467651367,
"mean_token_accuracy": 0.7749442532658577,
"num_tokens": 18064648.0,
"step": 4500
},
{
"entropy": 0.9840309470891953,
"epoch": 1.7259854573287408,
"grad_norm": 0.07721691578626633,
"learning_rate": 8.943198603464484e-05,
"loss": 0.9978925704956054,
"mean_token_accuracy": 0.7662706628441811,
"num_tokens": 18104352.0,
"step": 4510
},
{
"entropy": 0.9122109733521938,
"epoch": 1.7298124760811326,
"grad_norm": 0.10790548473596573,
"learning_rate": 8.916342151201827e-05,
"loss": 0.9912397384643554,
"mean_token_accuracy": 0.774566973745823,
"num_tokens": 18145632.0,
"step": 4520
},
{
"entropy": 0.8214024558663369,
"epoch": 1.7336394948335245,
"grad_norm": 0.0873790979385376,
"learning_rate": 8.88948569893917e-05,
"loss": 0.9174188613891602,
"mean_token_accuracy": 0.797317324578762,
"num_tokens": 18182216.0,
"step": 4530
},
{
"entropy": 0.8851194910705089,
"epoch": 1.7374665135859164,
"grad_norm": 0.08441472053527832,
"learning_rate": 8.862629246676515e-05,
"loss": 0.9345614433288574,
"mean_token_accuracy": 0.7909206628799439,
"num_tokens": 18220152.0,
"step": 4540
},
{
"entropy": 0.9045546390116215,
"epoch": 1.7412935323383083,
"grad_norm": 0.09491857141256332,
"learning_rate": 8.835772794413858e-05,
"loss": 1.0261774063110352,
"mean_token_accuracy": 0.7792002618312835,
"num_tokens": 18253615.0,
"step": 4550
},
{
"entropy": 0.8957971200346947,
"epoch": 1.7451205510907002,
"grad_norm": 0.07239943742752075,
"learning_rate": 8.808916342151202e-05,
"loss": 0.9220120429992675,
"mean_token_accuracy": 0.7812404081225395,
"num_tokens": 18292565.0,
"step": 4560
},
{
"entropy": 0.9733762003481388,
"epoch": 1.748947569843092,
"grad_norm": 0.07816951721906662,
"learning_rate": 8.782059889888546e-05,
"loss": 1.0176166534423827,
"mean_token_accuracy": 0.7636090680956841,
"num_tokens": 18337951.0,
"step": 4570
},
{
"entropy": 0.9952755816280842,
"epoch": 1.752774588595484,
"grad_norm": 0.09595679491758347,
"learning_rate": 8.75520343762589e-05,
"loss": 1.0484466552734375,
"mean_token_accuracy": 0.7624279737472535,
"num_tokens": 18378541.0,
"step": 4580
},
{
"entropy": 0.9325974151492119,
"epoch": 1.7566016073478758,
"grad_norm": 0.1425638496875763,
"learning_rate": 8.728346985363234e-05,
"loss": 1.007568645477295,
"mean_token_accuracy": 0.7753438904881478,
"num_tokens": 18416147.0,
"step": 4590
},
{
"entropy": 0.8879670143127442,
"epoch": 1.7604286261002677,
"grad_norm": 0.08936052024364471,
"learning_rate": 8.701490533100578e-05,
"loss": 0.9574133872985839,
"mean_token_accuracy": 0.7878236457705498,
"num_tokens": 18452658.0,
"step": 4600
},
{
"entropy": 0.9596087213605642,
"epoch": 1.7642556448526596,
"grad_norm": 0.08222804218530655,
"learning_rate": 8.674634080837921e-05,
"loss": 1.0134518623352051,
"mean_token_accuracy": 0.7686690568923951,
"num_tokens": 18493806.0,
"step": 4610
},
{
"entropy": 0.9412514306604862,
"epoch": 1.7680826636050515,
"grad_norm": 0.08482176810503006,
"learning_rate": 8.647777628575266e-05,
"loss": 0.9830768585205079,
"mean_token_accuracy": 0.7789766594767571,
"num_tokens": 18538740.0,
"step": 4620
},
{
"entropy": 0.8279416210949421,
"epoch": 1.7719096823574434,
"grad_norm": 0.12101086974143982,
"learning_rate": 8.620921176312609e-05,
"loss": 0.8422709465026855,
"mean_token_accuracy": 0.8009032368659973,
"num_tokens": 18579991.0,
"step": 4630
},
{
"entropy": 0.8889543637633324,
"epoch": 1.7757367011098353,
"grad_norm": 0.09586559236049652,
"learning_rate": 8.594064724049954e-05,
"loss": 0.9579720497131348,
"mean_token_accuracy": 0.7857530102133751,
"num_tokens": 18616195.0,
"step": 4640
},
{
"entropy": 0.9021936893463135,
"epoch": 1.7795637198622272,
"grad_norm": 0.0920713022351265,
"learning_rate": 8.567208271787297e-05,
"loss": 0.9568814277648926,
"mean_token_accuracy": 0.7835437625646591,
"num_tokens": 18650396.0,
"step": 4650
},
{
"entropy": 0.9605553701519967,
"epoch": 1.783390738614619,
"grad_norm": 0.0752284824848175,
"learning_rate": 8.54035181952464e-05,
"loss": 1.0107332229614259,
"mean_token_accuracy": 0.7712536633014679,
"num_tokens": 18692519.0,
"step": 4660
},
{
"entropy": 0.8929145928472281,
"epoch": 1.787217757367011,
"grad_norm": 0.08124406635761261,
"learning_rate": 8.513495367261985e-05,
"loss": 0.9392594337463379,
"mean_token_accuracy": 0.7837390914559365,
"num_tokens": 18730865.0,
"step": 4670
},
{
"entropy": 0.8995866551995277,
"epoch": 1.7910447761194028,
"grad_norm": 0.07306879013776779,
"learning_rate": 8.486638914999329e-05,
"loss": 0.9512563705444336,
"mean_token_accuracy": 0.7803288042545319,
"num_tokens": 18774851.0,
"step": 4680
},
{
"entropy": 0.9283428456634283,
"epoch": 1.7948717948717947,
"grad_norm": 0.06833672523498535,
"learning_rate": 8.459782462736673e-05,
"loss": 0.9614426612854003,
"mean_token_accuracy": 0.7776324123144149,
"num_tokens": 18815273.0,
"step": 4690
},
{
"entropy": 0.8980611331760884,
"epoch": 1.7986988136241866,
"grad_norm": 0.09426148980855942,
"learning_rate": 8.432926010474017e-05,
"loss": 0.9397372245788574,
"mean_token_accuracy": 0.7818324938416481,
"num_tokens": 18854806.0,
"step": 4700
},
{
"entropy": 0.9534067753702402,
"epoch": 1.8025258323765785,
"grad_norm": 0.11984719336032867,
"learning_rate": 8.40606955821136e-05,
"loss": 1.0058012008666992,
"mean_token_accuracy": 0.7710662186145782,
"num_tokens": 18893820.0,
"step": 4710
},
{
"entropy": 0.9396863542497158,
"epoch": 1.8063528511289704,
"grad_norm": 0.1126495823264122,
"learning_rate": 8.379213105948705e-05,
"loss": 0.9968315124511719,
"mean_token_accuracy": 0.7748499393463135,
"num_tokens": 18932078.0,
"step": 4720
},
{
"entropy": 1.0531147465109825,
"epoch": 1.8101798698813623,
"grad_norm": 0.0951380655169487,
"learning_rate": 8.352356653686048e-05,
"loss": 1.1058255195617677,
"mean_token_accuracy": 0.7495945364236831,
"num_tokens": 18974602.0,
"step": 4730
},
{
"entropy": 0.8520326256752014,
"epoch": 1.8140068886337541,
"grad_norm": 0.08623862266540527,
"learning_rate": 8.325500201423391e-05,
"loss": 0.8825064659118652,
"mean_token_accuracy": 0.7940022364258766,
"num_tokens": 19014261.0,
"step": 4740
},
{
"entropy": 0.979587784409523,
"epoch": 1.817833907386146,
"grad_norm": 0.11787699162960052,
"learning_rate": 8.298643749160736e-05,
"loss": 1.070664405822754,
"mean_token_accuracy": 0.7620177045464516,
"num_tokens": 19058223.0,
"step": 4750
},
{
"entropy": 0.8753061652183532,
"epoch": 1.821660926138538,
"grad_norm": 0.130862757563591,
"learning_rate": 8.271787296898081e-05,
"loss": 0.9366108894348144,
"mean_token_accuracy": 0.7874863654375076,
"num_tokens": 19100204.0,
"step": 4760
},
{
"entropy": 0.8777839131653309,
"epoch": 1.8254879448909298,
"grad_norm": 0.09261229634284973,
"learning_rate": 8.244930844635424e-05,
"loss": 0.9104420661926269,
"mean_token_accuracy": 0.7895808070898056,
"num_tokens": 19141588.0,
"step": 4770
},
{
"entropy": 0.9170226149260998,
"epoch": 1.8293149636433217,
"grad_norm": 0.06741383671760559,
"learning_rate": 8.218074392372768e-05,
"loss": 0.9543824195861816,
"mean_token_accuracy": 0.7765088111162186,
"num_tokens": 19182818.0,
"step": 4780
},
{
"entropy": 0.8602717489004135,
"epoch": 1.8331419823957136,
"grad_norm": 0.12861686944961548,
"learning_rate": 8.191217940110112e-05,
"loss": 0.926014518737793,
"mean_token_accuracy": 0.7917162731289864,
"num_tokens": 19217919.0,
"step": 4790
},
{
"entropy": 0.9471398882567883,
"epoch": 1.8369690011481055,
"grad_norm": 0.0744423121213913,
"learning_rate": 8.164361487847456e-05,
"loss": 0.9777775764465332,
"mean_token_accuracy": 0.7716371163725853,
"num_tokens": 19263991.0,
"step": 4800
},
{
"entropy": 0.8363759070634842,
"epoch": 1.8407960199004973,
"grad_norm": 0.08627327531576157,
"learning_rate": 8.137505035584799e-05,
"loss": 0.887846565246582,
"mean_token_accuracy": 0.7938979223370553,
"num_tokens": 19305441.0,
"step": 4810
},
{
"entropy": 0.9200571574270725,
"epoch": 1.8446230386528892,
"grad_norm": 0.08358518034219742,
"learning_rate": 8.110648583322144e-05,
"loss": 0.9703543663024903,
"mean_token_accuracy": 0.777827826142311,
"num_tokens": 19342309.0,
"step": 4820
},
{
"entropy": 0.9295372806489468,
"epoch": 1.8484500574052811,
"grad_norm": 0.0970570370554924,
"learning_rate": 8.083792131059487e-05,
"loss": 0.9934672355651856,
"mean_token_accuracy": 0.7739364430308342,
"num_tokens": 19387707.0,
"step": 4830
},
{
"entropy": 0.9003355488181114,
"epoch": 1.852277076157673,
"grad_norm": 0.09357219189405441,
"learning_rate": 8.05693567879683e-05,
"loss": 0.9544237136840821,
"mean_token_accuracy": 0.7824135825037957,
"num_tokens": 19428449.0,
"step": 4840
},
{
"entropy": 1.0107353992760182,
"epoch": 1.856104094910065,
"grad_norm": 0.08587910234928131,
"learning_rate": 8.030079226534175e-05,
"loss": 1.0694228172302247,
"mean_token_accuracy": 0.7585012704133988,
"num_tokens": 19469625.0,
"step": 4850
},
{
"entropy": 0.9866157718002796,
"epoch": 1.8599311136624568,
"grad_norm": 0.11663772910833359,
"learning_rate": 8.00322277427152e-05,
"loss": 1.0394890785217286,
"mean_token_accuracy": 0.7632519364356994,
"num_tokens": 19510480.0,
"step": 4860
},
{
"entropy": 0.881837759912014,
"epoch": 1.8637581324148487,
"grad_norm": 0.13599033653736115,
"learning_rate": 7.976366322008862e-05,
"loss": 0.9441938400268555,
"mean_token_accuracy": 0.7848336577415467,
"num_tokens": 19550305.0,
"step": 4870
},
{
"entropy": 0.9193019077181817,
"epoch": 1.8675851511672406,
"grad_norm": 0.09272989630699158,
"learning_rate": 7.949509869746207e-05,
"loss": 0.9862917900085449,
"mean_token_accuracy": 0.7736792579293251,
"num_tokens": 19588278.0,
"step": 4880
},
{
"entropy": 0.9219990812242032,
"epoch": 1.8714121699196324,
"grad_norm": 0.10006739944219589,
"learning_rate": 7.922653417483551e-05,
"loss": 0.9700265884399414,
"mean_token_accuracy": 0.780723437666893,
"num_tokens": 19625749.0,
"step": 4890
},
{
"entropy": 0.8564371943473816,
"epoch": 1.8752391886720245,
"grad_norm": 0.08216696232557297,
"learning_rate": 7.895796965220895e-05,
"loss": 0.9064787864685059,
"mean_token_accuracy": 0.7962334454059601,
"num_tokens": 19663935.0,
"step": 4900
},
{
"entropy": 0.9482992745935916,
"epoch": 1.8790662074244164,
"grad_norm": 0.06782303750514984,
"learning_rate": 7.868940512958238e-05,
"loss": 0.9920551300048828,
"mean_token_accuracy": 0.7696940049529075,
"num_tokens": 19704663.0,
"step": 4910
},
{
"entropy": 0.884397204965353,
"epoch": 1.8828932261768083,
"grad_norm": 0.06414399296045303,
"learning_rate": 7.842084060695583e-05,
"loss": 0.9083518981933594,
"mean_token_accuracy": 0.7858098462224007,
"num_tokens": 19753438.0,
"step": 4920
},
{
"entropy": 0.8019696604460478,
"epoch": 1.8867202449292002,
"grad_norm": 0.08456243574619293,
"learning_rate": 7.815227608432927e-05,
"loss": 0.8896969795227051,
"mean_token_accuracy": 0.8053153440356254,
"num_tokens": 19790404.0,
"step": 4930
},
{
"entropy": 0.8008564852178097,
"epoch": 1.890547263681592,
"grad_norm": 0.10543688386678696,
"learning_rate": 7.78837115617027e-05,
"loss": 0.8535223007202148,
"mean_token_accuracy": 0.8059684678912162,
"num_tokens": 19825645.0,
"step": 4940
},
{
"entropy": 0.8714719720184803,
"epoch": 1.894374282433984,
"grad_norm": 0.09498755633831024,
"learning_rate": 7.761514703907614e-05,
"loss": 0.9063860893249511,
"mean_token_accuracy": 0.7914781123399734,
"num_tokens": 19866092.0,
"step": 4950
},
{
"entropy": 0.9202240366488695,
"epoch": 1.8982013011863759,
"grad_norm": 0.07342597842216492,
"learning_rate": 7.734658251644959e-05,
"loss": 0.9767581939697265,
"mean_token_accuracy": 0.7755973920226097,
"num_tokens": 19907356.0,
"step": 4960
},
{
"entropy": 0.9477262906730175,
"epoch": 1.9020283199387678,
"grad_norm": 0.08742880076169968,
"learning_rate": 7.707801799382302e-05,
"loss": 1.0063783645629882,
"mean_token_accuracy": 0.7687567621469498,
"num_tokens": 19952869.0,
"step": 4970
},
{
"entropy": 0.977492806315422,
"epoch": 1.9058553386911596,
"grad_norm": 0.10321515798568726,
"learning_rate": 7.680945347119645e-05,
"loss": 1.0323823928833007,
"mean_token_accuracy": 0.7646988987922668,
"num_tokens": 19991372.0,
"step": 4980
},
{
"entropy": 0.7999268680810928,
"epoch": 1.9096823574435515,
"grad_norm": 0.08925452828407288,
"learning_rate": 7.65408889485699e-05,
"loss": 0.8391226768493653,
"mean_token_accuracy": 0.8017501994967461,
"num_tokens": 20029189.0,
"step": 4990
},
{
"entropy": 0.8757653787732125,
"epoch": 1.9135093761959434,
"grad_norm": 0.1915360540151596,
"learning_rate": 7.627232442594334e-05,
"loss": 0.9243562698364258,
"mean_token_accuracy": 0.7841411307454109,
"num_tokens": 20070611.0,
"step": 5000
},
{
"entropy": 0.9357082359492779,
"epoch": 1.9173363949483353,
"grad_norm": 0.08219558745622635,
"learning_rate": 7.600375990331677e-05,
"loss": 0.9772232055664063,
"mean_token_accuracy": 0.7725088000297546,
"num_tokens": 20110392.0,
"step": 5010
},
{
"entropy": 0.9191611532121897,
"epoch": 1.9211634137007272,
"grad_norm": 0.07629676163196564,
"learning_rate": 7.573519538069022e-05,
"loss": 0.9754646301269532,
"mean_token_accuracy": 0.7830281540751457,
"num_tokens": 20150683.0,
"step": 5020
},
{
"entropy": 0.9279548175632953,
"epoch": 1.924990432453119,
"grad_norm": 0.09845773130655289,
"learning_rate": 7.546663085806366e-05,
"loss": 0.9818471908569336,
"mean_token_accuracy": 0.7738550245761872,
"num_tokens": 20190521.0,
"step": 5030
},
{
"entropy": 0.9281142316758633,
"epoch": 1.928817451205511,
"grad_norm": 0.10571245104074478,
"learning_rate": 7.519806633543708e-05,
"loss": 0.999634075164795,
"mean_token_accuracy": 0.7708285465836525,
"num_tokens": 20230615.0,
"step": 5040
},
{
"entropy": 0.8793018095195293,
"epoch": 1.9326444699579028,
"grad_norm": 0.11255183815956116,
"learning_rate": 7.492950181281053e-05,
"loss": 0.9399495124816895,
"mean_token_accuracy": 0.7893765285611153,
"num_tokens": 20269332.0,
"step": 5050
},
{
"entropy": 0.8188632413744926,
"epoch": 1.9364714887102947,
"grad_norm": 0.08683498203754425,
"learning_rate": 7.466093729018398e-05,
"loss": 0.8760917663574219,
"mean_token_accuracy": 0.800470444560051,
"num_tokens": 20316849.0,
"step": 5060
},
{
"entropy": 0.9165158126503229,
"epoch": 1.9402985074626866,
"grad_norm": 0.12123431265354156,
"learning_rate": 7.439237276755741e-05,
"loss": 0.9515151023864746,
"mean_token_accuracy": 0.7772148326039314,
"num_tokens": 20354641.0,
"step": 5070
},
{
"entropy": 0.8890400048345327,
"epoch": 1.9441255262150785,
"grad_norm": 0.09551843255758286,
"learning_rate": 7.412380824493084e-05,
"loss": 0.9720385551452637,
"mean_token_accuracy": 0.7855533555150032,
"num_tokens": 20400703.0,
"step": 5080
},
{
"entropy": 0.9226945102214813,
"epoch": 1.9479525449674704,
"grad_norm": 0.11462504416704178,
"learning_rate": 7.385524372230429e-05,
"loss": 0.9757321357727051,
"mean_token_accuracy": 0.7739654749631881,
"num_tokens": 20442145.0,
"step": 5090
},
{
"entropy": 0.8108384694904089,
"epoch": 1.9517795637198623,
"grad_norm": 0.13017524778842926,
"learning_rate": 7.358667919967772e-05,
"loss": 0.8620017051696778,
"mean_token_accuracy": 0.8028488114476204,
"num_tokens": 20472714.0,
"step": 5100
},
{
"entropy": 0.9563053950667382,
"epoch": 1.9556065824722542,
"grad_norm": 0.10588496923446655,
"learning_rate": 7.331811467705116e-05,
"loss": 0.9805202484130859,
"mean_token_accuracy": 0.7729632049798966,
"num_tokens": 20518593.0,
"step": 5110
},
{
"entropy": 0.9307407476007938,
"epoch": 1.959433601224646,
"grad_norm": 0.09899015724658966,
"learning_rate": 7.30495501544246e-05,
"loss": 0.998748779296875,
"mean_token_accuracy": 0.7733172833919525,
"num_tokens": 20558008.0,
"step": 5120
},
{
"entropy": 0.9505821786820888,
"epoch": 1.963260619977038,
"grad_norm": 0.0943673700094223,
"learning_rate": 7.278098563179804e-05,
"loss": 1.0047925949096679,
"mean_token_accuracy": 0.7691358909010887,
"num_tokens": 20603741.0,
"step": 5130
},
{
"entropy": 1.04148171544075,
"epoch": 1.9670876387294298,
"grad_norm": 0.08869694918394089,
"learning_rate": 7.251242110917149e-05,
"loss": 1.0801177024841309,
"mean_token_accuracy": 0.7499634683132171,
"num_tokens": 20645827.0,
"step": 5140
},
{
"entropy": 0.7822969853878021,
"epoch": 1.9709146574818217,
"grad_norm": 0.0994991883635521,
"learning_rate": 7.224385658654492e-05,
"loss": 0.8042619705200196,
"mean_token_accuracy": 0.8097834318876267,
"num_tokens": 20684019.0,
"step": 5150
},
{
"entropy": 0.918664800748229,
"epoch": 1.9747416762342136,
"grad_norm": 0.11157739907503128,
"learning_rate": 7.197529206391837e-05,
"loss": 0.983153247833252,
"mean_token_accuracy": 0.7776870116591453,
"num_tokens": 20726278.0,
"step": 5160
},
{
"entropy": 0.911195681989193,
"epoch": 1.9785686949866055,
"grad_norm": 0.13472694158554077,
"learning_rate": 7.17067275412918e-05,
"loss": 0.9662351608276367,
"mean_token_accuracy": 0.7743990138173104,
"num_tokens": 20759927.0,
"step": 5170
},
{
"entropy": 0.8238823972642422,
"epoch": 1.9823957137389974,
"grad_norm": 0.08864834159612656,
"learning_rate": 7.143816301866523e-05,
"loss": 0.8870213508605957,
"mean_token_accuracy": 0.7989589869976044,
"num_tokens": 20798325.0,
"step": 5180
},
{
"entropy": 0.9405660286545754,
"epoch": 1.9862227324913893,
"grad_norm": 0.08372621983289719,
"learning_rate": 7.116959849603868e-05,
"loss": 0.9449873924255371,
"mean_token_accuracy": 0.7792889401316643,
"num_tokens": 20837136.0,
"step": 5190
},
{
"entropy": 0.8287422813475132,
"epoch": 1.9900497512437811,
"grad_norm": 0.0968240275979042,
"learning_rate": 7.090103397341211e-05,
"loss": 0.8873905181884766,
"mean_token_accuracy": 0.7976622357964516,
"num_tokens": 20877693.0,
"step": 5200
},
{
"entropy": 0.9188660819083452,
"epoch": 1.993876769996173,
"grad_norm": 0.09275626391172409,
"learning_rate": 7.063246945078555e-05,
"loss": 0.989016342163086,
"mean_token_accuracy": 0.7755422025918961,
"num_tokens": 20924885.0,
"step": 5210
},
{
"entropy": 0.9058490604162216,
"epoch": 1.997703788748565,
"grad_norm": 0.08644875138998032,
"learning_rate": 7.0363904928159e-05,
"loss": 0.9660470008850097,
"mean_token_accuracy": 0.7761533245444298,
"num_tokens": 20966342.0,
"step": 5220
},
{
"entropy": 0.7741431064903737,
"epoch": 2.0015308075009566,
"grad_norm": 0.07492107152938843,
"learning_rate": 7.009534040553243e-05,
"loss": 0.8241374015808105,
"mean_token_accuracy": 0.8149536207318306,
"num_tokens": 21004798.0,
"step": 5230
},
{
"entropy": 0.8813200116157531,
"epoch": 2.0053578262533485,
"grad_norm": 0.07805436849594116,
"learning_rate": 6.982677588290588e-05,
"loss": 0.921663761138916,
"mean_token_accuracy": 0.7912002876400948,
"num_tokens": 21049021.0,
"step": 5240
},
{
"entropy": 0.8896506872028113,
"epoch": 2.0091848450057403,
"grad_norm": 0.13928763568401337,
"learning_rate": 6.955821136027931e-05,
"loss": 0.9278170585632324,
"mean_token_accuracy": 0.7765205070376396,
"num_tokens": 21086531.0,
"step": 5250
},
{
"entropy": 0.9149777121841908,
"epoch": 2.0130118637581322,
"grad_norm": 0.06992843002080917,
"learning_rate": 6.928964683765274e-05,
"loss": 0.9667098045349121,
"mean_token_accuracy": 0.7750229969620704,
"num_tokens": 21127453.0,
"step": 5260
},
{
"entropy": 0.8076952576637269,
"epoch": 2.016838882510524,
"grad_norm": 0.12632791697978973,
"learning_rate": 6.902108231502619e-05,
"loss": 0.8237466812133789,
"mean_token_accuracy": 0.804887568950653,
"num_tokens": 21165297.0,
"step": 5270
},
{
"entropy": 0.8818444184958935,
"epoch": 2.020665901262916,
"grad_norm": 0.08924616128206253,
"learning_rate": 6.875251779239962e-05,
"loss": 0.9049506187438965,
"mean_token_accuracy": 0.7822276562452316,
"num_tokens": 21206219.0,
"step": 5280
},
{
"entropy": 0.7953705489635468,
"epoch": 2.024492920015308,
"grad_norm": 0.1111336424946785,
"learning_rate": 6.848395326977307e-05,
"loss": 0.8433744430541992,
"mean_token_accuracy": 0.8049945279955864,
"num_tokens": 21249239.0,
"step": 5290
},
{
"entropy": 0.904665675573051,
"epoch": 2.0283199387677,
"grad_norm": 0.09494993835687637,
"learning_rate": 6.82153887471465e-05,
"loss": 0.9693451881408691,
"mean_token_accuracy": 0.779350683093071,
"num_tokens": 21289639.0,
"step": 5300
},
{
"entropy": 0.7958274722099304,
"epoch": 2.0321469575200917,
"grad_norm": 0.10396509617567062,
"learning_rate": 6.794682422451995e-05,
"loss": 0.8559811592102051,
"mean_token_accuracy": 0.8057383120059967,
"num_tokens": 21329136.0,
"step": 5310
},
{
"entropy": 0.9416906848549843,
"epoch": 2.0359739762724836,
"grad_norm": 0.08166563510894775,
"learning_rate": 6.767825970189338e-05,
"loss": 0.9891387939453125,
"mean_token_accuracy": 0.7737650781869888,
"num_tokens": 21371300.0,
"step": 5320
},
{
"entropy": 0.9342201549559832,
"epoch": 2.0398009950248754,
"grad_norm": 0.09459090232849121,
"learning_rate": 6.740969517926682e-05,
"loss": 0.9509946823120117,
"mean_token_accuracy": 0.7751364663243294,
"num_tokens": 21412268.0,
"step": 5330
},
{
"entropy": 0.8397190041840077,
"epoch": 2.0436280137772673,
"grad_norm": 0.10005268454551697,
"learning_rate": 6.714113065664026e-05,
"loss": 0.9056560516357421,
"mean_token_accuracy": 0.79336898624897,
"num_tokens": 21451975.0,
"step": 5340
},
{
"entropy": 0.9148454248905182,
"epoch": 2.047455032529659,
"grad_norm": 0.10257065296173096,
"learning_rate": 6.68725661340137e-05,
"loss": 0.9611604690551758,
"mean_token_accuracy": 0.7737416908144951,
"num_tokens": 21491818.0,
"step": 5350
},
{
"entropy": 0.9010646104812622,
"epoch": 2.051282051282051,
"grad_norm": 0.11826229095458984,
"learning_rate": 6.660400161138713e-05,
"loss": 0.9446893692016601,
"mean_token_accuracy": 0.7851994633674622,
"num_tokens": 21528066.0,
"step": 5360
},
{
"entropy": 0.8987722039222718,
"epoch": 2.055109070034443,
"grad_norm": 0.10371451824903488,
"learning_rate": 6.633543708876058e-05,
"loss": 0.9595455169677735,
"mean_token_accuracy": 0.7833559066057205,
"num_tokens": 21562883.0,
"step": 5370
},
{
"entropy": 0.8856854721903801,
"epoch": 2.058936088786835,
"grad_norm": 0.1089499220252037,
"learning_rate": 6.606687256613403e-05,
"loss": 0.9219722747802734,
"mean_token_accuracy": 0.7822227373719215,
"num_tokens": 21600910.0,
"step": 5380
},
{
"entropy": 0.8720096081495285,
"epoch": 2.0627631075392268,
"grad_norm": 0.09962328523397446,
"learning_rate": 6.579830804350745e-05,
"loss": 0.9654089927673339,
"mean_token_accuracy": 0.7856920391321183,
"num_tokens": 21640445.0,
"step": 5390
},
{
"entropy": 0.9440382812172174,
"epoch": 2.0665901262916186,
"grad_norm": 0.08670477569103241,
"learning_rate": 6.552974352088089e-05,
"loss": 0.9934238433837891,
"mean_token_accuracy": 0.7687147289514542,
"num_tokens": 21682432.0,
"step": 5400
},
{
"entropy": 0.774172055721283,
"epoch": 2.0704171450440105,
"grad_norm": 0.11862040311098099,
"learning_rate": 6.526117899825434e-05,
"loss": 0.8106603622436523,
"mean_token_accuracy": 0.8135839134454728,
"num_tokens": 21721359.0,
"step": 5410
},
{
"entropy": 0.9194908868521452,
"epoch": 2.0742441637964024,
"grad_norm": 0.10227365791797638,
"learning_rate": 6.499261447562777e-05,
"loss": 0.9410523414611817,
"mean_token_accuracy": 0.7788734346628189,
"num_tokens": 21763700.0,
"step": 5420
},
{
"entropy": 0.7955736435949803,
"epoch": 2.0780711825487943,
"grad_norm": 0.09657785296440125,
"learning_rate": 6.472404995300121e-05,
"loss": 0.8665301322937011,
"mean_token_accuracy": 0.8067882195115089,
"num_tokens": 21804190.0,
"step": 5430
},
{
"entropy": 0.8065498791635036,
"epoch": 2.081898201301186,
"grad_norm": 0.11568085849285126,
"learning_rate": 6.445548543037465e-05,
"loss": 0.8515932083129882,
"mean_token_accuracy": 0.8035058185458184,
"num_tokens": 21839801.0,
"step": 5440
},
{
"entropy": 0.9087674509733915,
"epoch": 2.085725220053578,
"grad_norm": 0.09318574517965317,
"learning_rate": 6.418692090774809e-05,
"loss": 0.9387861251831054,
"mean_token_accuracy": 0.77939523011446,
"num_tokens": 21877125.0,
"step": 5450
},
{
"entropy": 0.86418566852808,
"epoch": 2.08955223880597,
"grad_norm": 0.08796729892492294,
"learning_rate": 6.391835638512152e-05,
"loss": 0.9152085304260253,
"mean_token_accuracy": 0.7899368211627007,
"num_tokens": 21921493.0,
"step": 5460
},
{
"entropy": 0.8593201294541359,
"epoch": 2.093379257558362,
"grad_norm": 0.14465564489364624,
"learning_rate": 6.364979186249497e-05,
"loss": 0.8955412864685058,
"mean_token_accuracy": 0.7898772984743119,
"num_tokens": 21961188.0,
"step": 5470
},
{
"entropy": 0.8998314358294011,
"epoch": 2.0972062763107537,
"grad_norm": 0.11634784191846848,
"learning_rate": 6.338122733986842e-05,
"loss": 0.9114861488342285,
"mean_token_accuracy": 0.7838647082448006,
"num_tokens": 22001738.0,
"step": 5480
},
{
"entropy": 0.8693659231066704,
"epoch": 2.1010332950631456,
"grad_norm": 0.11536803841590881,
"learning_rate": 6.311266281724184e-05,
"loss": 0.9232154846191406,
"mean_token_accuracy": 0.7881089702248574,
"num_tokens": 22039626.0,
"step": 5490
},
{
"entropy": 0.9556272588670254,
"epoch": 2.1048603138155375,
"grad_norm": 0.09614596515893936,
"learning_rate": 6.284409829461528e-05,
"loss": 1.0266177177429199,
"mean_token_accuracy": 0.7646962344646454,
"num_tokens": 22081971.0,
"step": 5500
},
{
"entropy": 0.7735307298600673,
"epoch": 2.1086873325679294,
"grad_norm": 0.10002073645591736,
"learning_rate": 6.257553377198873e-05,
"loss": 0.8011887550354004,
"mean_token_accuracy": 0.8088100135326386,
"num_tokens": 22117897.0,
"step": 5510
},
{
"entropy": 0.8981072999536991,
"epoch": 2.1125143513203213,
"grad_norm": 0.10524707287549973,
"learning_rate": 6.230696924936216e-05,
"loss": 0.9659936904907227,
"mean_token_accuracy": 0.7843907788395882,
"num_tokens": 22161049.0,
"step": 5520
},
{
"entropy": 0.8891891561448574,
"epoch": 2.116341370072713,
"grad_norm": 0.10095740854740143,
"learning_rate": 6.20384047267356e-05,
"loss": 0.9199987411499023,
"mean_token_accuracy": 0.7833669245243072,
"num_tokens": 22201183.0,
"step": 5530
},
{
"entropy": 0.9359986830502749,
"epoch": 2.120168388825105,
"grad_norm": 0.08723930269479752,
"learning_rate": 6.176984020410904e-05,
"loss": 0.9635790824890137,
"mean_token_accuracy": 0.7724878415465355,
"num_tokens": 22240779.0,
"step": 5540
},
{
"entropy": 0.8017430886626243,
"epoch": 2.123995407577497,
"grad_norm": 0.10579924285411835,
"learning_rate": 6.150127568148249e-05,
"loss": 0.842125129699707,
"mean_token_accuracy": 0.8020379558205605,
"num_tokens": 22279289.0,
"step": 5550
},
{
"entropy": 0.7666160762310028,
"epoch": 2.127822426329889,
"grad_norm": 0.09871628880500793,
"learning_rate": 6.123271115885591e-05,
"loss": 0.8378163337707519,
"mean_token_accuracy": 0.8119754999876022,
"num_tokens": 22316715.0,
"step": 5560
},
{
"entropy": 0.9505756117403508,
"epoch": 2.1316494450822807,
"grad_norm": 0.11093632131814957,
"learning_rate": 6.096414663622936e-05,
"loss": 0.9677371025085449,
"mean_token_accuracy": 0.7698320209980011,
"num_tokens": 22360112.0,
"step": 5570
},
{
"entropy": 0.7982158973813057,
"epoch": 2.1354764638346726,
"grad_norm": 0.11260368674993515,
"learning_rate": 6.06955821136028e-05,
"loss": 0.8571239471435547,
"mean_token_accuracy": 0.804571321606636,
"num_tokens": 22399114.0,
"step": 5580
},
{
"entropy": 0.8869463637471199,
"epoch": 2.1393034825870645,
"grad_norm": 0.08550643920898438,
"learning_rate": 6.042701759097623e-05,
"loss": 0.9476675033569336,
"mean_token_accuracy": 0.7807673364877701,
"num_tokens": 22440187.0,
"step": 5590
},
{
"entropy": 0.9491269618272782,
"epoch": 2.1431305013394564,
"grad_norm": 0.09019884467124939,
"learning_rate": 6.015845306834967e-05,
"loss": 1.0232599258422852,
"mean_token_accuracy": 0.7681664958596229,
"num_tokens": 22479682.0,
"step": 5600
},
{
"entropy": 0.8861779697239399,
"epoch": 2.1469575200918483,
"grad_norm": 0.11756031215190887,
"learning_rate": 5.988988854572312e-05,
"loss": 0.9251557350158691,
"mean_token_accuracy": 0.7849425792694091,
"num_tokens": 22520352.0,
"step": 5610
},
{
"entropy": 0.8735060147941113,
"epoch": 2.15078453884424,
"grad_norm": 0.0996679812669754,
"learning_rate": 5.9621324023096546e-05,
"loss": 0.9677264213562011,
"mean_token_accuracy": 0.7881714150309562,
"num_tokens": 22561677.0,
"step": 5620
},
{
"entropy": 0.991636025160551,
"epoch": 2.154611557596632,
"grad_norm": 0.10682649165391922,
"learning_rate": 5.935275950046999e-05,
"loss": 1.050811195373535,
"mean_token_accuracy": 0.7574850931763649,
"num_tokens": 22609671.0,
"step": 5630
},
{
"entropy": 0.9028345100581646,
"epoch": 2.158438576349024,
"grad_norm": 0.11249802261590958,
"learning_rate": 5.908419497784343e-05,
"loss": 0.9876343727111816,
"mean_token_accuracy": 0.783162035048008,
"num_tokens": 22650924.0,
"step": 5640
},
{
"entropy": 0.868353420495987,
"epoch": 2.162265595101416,
"grad_norm": 0.08846433460712433,
"learning_rate": 5.8815630455216867e-05,
"loss": 0.9271388053894043,
"mean_token_accuracy": 0.7898381799459457,
"num_tokens": 22691550.0,
"step": 5650
},
{
"entropy": 0.9247912406921387,
"epoch": 2.1660926138538077,
"grad_norm": 0.10013602674007416,
"learning_rate": 5.854706593259031e-05,
"loss": 1.0093653678894043,
"mean_token_accuracy": 0.7723490744829178,
"num_tokens": 22728956.0,
"step": 5660
},
{
"entropy": 0.82930968105793,
"epoch": 2.1699196326061996,
"grad_norm": 0.11004043370485306,
"learning_rate": 5.827850140996375e-05,
"loss": 0.8801467895507813,
"mean_token_accuracy": 0.798722094297409,
"num_tokens": 22765064.0,
"step": 5670
},
{
"entropy": 0.8950945638120175,
"epoch": 2.1737466513585915,
"grad_norm": 0.09994686394929886,
"learning_rate": 5.800993688733719e-05,
"loss": 0.9781051635742187,
"mean_token_accuracy": 0.7849533364176751,
"num_tokens": 22802213.0,
"step": 5680
},
{
"entropy": 0.8847132481634616,
"epoch": 2.1775736701109834,
"grad_norm": 0.09891512989997864,
"learning_rate": 5.774137236471062e-05,
"loss": 0.9338027954101562,
"mean_token_accuracy": 0.7867394030094147,
"num_tokens": 22839400.0,
"step": 5690
},
{
"entropy": 0.8212509788572788,
"epoch": 2.1814006888633752,
"grad_norm": 0.10451705008745193,
"learning_rate": 5.747280784208406e-05,
"loss": 0.8740688323974609,
"mean_token_accuracy": 0.7968196496367455,
"num_tokens": 22877771.0,
"step": 5700
},
{
"entropy": 0.7856742814183235,
"epoch": 2.185227707615767,
"grad_norm": 0.09351614862680435,
"learning_rate": 5.720424331945751e-05,
"loss": 0.8385543823242188,
"mean_token_accuracy": 0.8064358577132225,
"num_tokens": 22916159.0,
"step": 5710
},
{
"entropy": 0.9431014984846116,
"epoch": 2.189054726368159,
"grad_norm": 0.09432144463062286,
"learning_rate": 5.6935678796830935e-05,
"loss": 1.0021851539611817,
"mean_token_accuracy": 0.7693860113620759,
"num_tokens": 22958014.0,
"step": 5720
},
{
"entropy": 0.9080683786422015,
"epoch": 2.192881745120551,
"grad_norm": 0.08724278956651688,
"learning_rate": 5.666711427420438e-05,
"loss": 0.9878963470458985,
"mean_token_accuracy": 0.7802156403660774,
"num_tokens": 23003222.0,
"step": 5730
},
{
"entropy": 0.8772326201200485,
"epoch": 2.196708763872943,
"grad_norm": 0.1096489354968071,
"learning_rate": 5.639854975157782e-05,
"loss": 0.9326786041259766,
"mean_token_accuracy": 0.7881689593195915,
"num_tokens": 23039512.0,
"step": 5740
},
{
"entropy": 0.9084336057305336,
"epoch": 2.2005357826253347,
"grad_norm": 0.11137977987527847,
"learning_rate": 5.6129985228951256e-05,
"loss": 0.9574773788452149,
"mean_token_accuracy": 0.7860094889998436,
"num_tokens": 23078238.0,
"step": 5750
},
{
"entropy": 0.836103780195117,
"epoch": 2.2043628013777266,
"grad_norm": 0.11038387566804886,
"learning_rate": 5.5861420706324696e-05,
"loss": 0.88037109375,
"mean_token_accuracy": 0.7916925936937332,
"num_tokens": 23121089.0,
"step": 5760
},
{
"entropy": 0.9425606489181518,
"epoch": 2.2081898201301184,
"grad_norm": 0.10270453989505768,
"learning_rate": 5.5592856183698137e-05,
"loss": 0.983431339263916,
"mean_token_accuracy": 0.7715479463338852,
"num_tokens": 23158047.0,
"step": 5770
},
{
"entropy": 0.8212515480816365,
"epoch": 2.2120168388825103,
"grad_norm": 0.0880119651556015,
"learning_rate": 5.532429166107157e-05,
"loss": 0.887947940826416,
"mean_token_accuracy": 0.7997770145535469,
"num_tokens": 23204019.0,
"step": 5780
},
{
"entropy": 0.8668085850775242,
"epoch": 2.215843857634902,
"grad_norm": 0.11390146613121033,
"learning_rate": 5.505572713844501e-05,
"loss": 0.9010316848754882,
"mean_token_accuracy": 0.7880747586488723,
"num_tokens": 23241922.0,
"step": 5790
},
{
"entropy": 0.7907863073050976,
"epoch": 2.219670876387294,
"grad_norm": 0.11713080108165741,
"learning_rate": 5.478716261581846e-05,
"loss": 0.8595284461975098,
"mean_token_accuracy": 0.8068661123514176,
"num_tokens": 23280534.0,
"step": 5800
},
{
"entropy": 0.8358560226857662,
"epoch": 2.223497895139686,
"grad_norm": 0.11117064207792282,
"learning_rate": 5.45185980931919e-05,
"loss": 0.8745571136474609,
"mean_token_accuracy": 0.793362820148468,
"num_tokens": 23323119.0,
"step": 5810
},
{
"entropy": 0.8238232973963022,
"epoch": 2.227324913892078,
"grad_norm": 0.13185663521289825,
"learning_rate": 5.425003357056533e-05,
"loss": 0.8659845352172851,
"mean_token_accuracy": 0.8025152862071991,
"num_tokens": 23363749.0,
"step": 5820
},
{
"entropy": 0.8596846207976341,
"epoch": 2.2311519326444698,
"grad_norm": 0.09360291808843613,
"learning_rate": 5.398146904793877e-05,
"loss": 0.9118245124816895,
"mean_token_accuracy": 0.7882251426577568,
"num_tokens": 23402886.0,
"step": 5830
},
{
"entropy": 0.8035648860037327,
"epoch": 2.2349789513968616,
"grad_norm": 0.09347285330295563,
"learning_rate": 5.371290452531221e-05,
"loss": 0.8725827217102051,
"mean_token_accuracy": 0.8045972406864166,
"num_tokens": 23442339.0,
"step": 5840
},
{
"entropy": 0.9175308585166931,
"epoch": 2.2388059701492535,
"grad_norm": 0.12336985766887665,
"learning_rate": 5.3444340002685645e-05,
"loss": 0.9388077735900879,
"mean_token_accuracy": 0.7768721342086792,
"num_tokens": 23481344.0,
"step": 5850
},
{
"entropy": 0.868817687779665,
"epoch": 2.2426329889016454,
"grad_norm": 0.10311949998140335,
"learning_rate": 5.3175775480059086e-05,
"loss": 0.9337680816650391,
"mean_token_accuracy": 0.7877210825681686,
"num_tokens": 23520637.0,
"step": 5860
},
{
"entropy": 0.854228886961937,
"epoch": 2.2464600076540373,
"grad_norm": 0.10659918189048767,
"learning_rate": 5.2907210957432526e-05,
"loss": 0.9077530860900879,
"mean_token_accuracy": 0.7909654468297959,
"num_tokens": 23559877.0,
"step": 5870
},
{
"entropy": 0.8457217663526535,
"epoch": 2.250287026406429,
"grad_norm": 0.09633689373731613,
"learning_rate": 5.263864643480596e-05,
"loss": 0.8785475730895996,
"mean_token_accuracy": 0.7941769883036613,
"num_tokens": 23597033.0,
"step": 5880
},
{
"entropy": 0.8822055049240589,
"epoch": 2.254114045158821,
"grad_norm": 0.09562286734580994,
"learning_rate": 5.23700819121794e-05,
"loss": 0.8851138114929199,
"mean_token_accuracy": 0.7860250055789948,
"num_tokens": 23634788.0,
"step": 5890
},
{
"entropy": 0.8556318368762732,
"epoch": 2.257941063911213,
"grad_norm": 0.08814764767885208,
"learning_rate": 5.210151738955285e-05,
"loss": 0.8866415977478027,
"mean_token_accuracy": 0.7966004252433777,
"num_tokens": 23673283.0,
"step": 5900
},
{
"entropy": 0.7395530994981527,
"epoch": 2.261768082663605,
"grad_norm": 0.07671936601400375,
"learning_rate": 5.1832952866926274e-05,
"loss": 0.7680532455444335,
"mean_token_accuracy": 0.8190904691815376,
"num_tokens": 23711540.0,
"step": 5910
},
{
"entropy": 0.8898126773536206,
"epoch": 2.2655951014159967,
"grad_norm": 0.06960798799991608,
"learning_rate": 5.156438834429972e-05,
"loss": 1.026920700073242,
"mean_token_accuracy": 0.7816770374774933,
"num_tokens": 23756178.0,
"step": 5920
},
{
"entropy": 0.8902945756912232,
"epoch": 2.2694221201683886,
"grad_norm": 0.1114925891160965,
"learning_rate": 5.129582382167316e-05,
"loss": 0.9598423957824707,
"mean_token_accuracy": 0.784630736708641,
"num_tokens": 23792151.0,
"step": 5930
},
{
"entropy": 0.8439918398857117,
"epoch": 2.2732491389207805,
"grad_norm": 0.16730423271656036,
"learning_rate": 5.10272592990466e-05,
"loss": 0.851725959777832,
"mean_token_accuracy": 0.7940610617399215,
"num_tokens": 23830309.0,
"step": 5940
},
{
"entropy": 0.9178552135825158,
"epoch": 2.2770761576731724,
"grad_norm": 0.16359879076480865,
"learning_rate": 5.0758694776420035e-05,
"loss": 0.9417426109313964,
"mean_token_accuracy": 0.7781487166881561,
"num_tokens": 23874638.0,
"step": 5950
},
{
"entropy": 0.9053961969912052,
"epoch": 2.2809031764255643,
"grad_norm": 0.08877693116664886,
"learning_rate": 5.0490130253793475e-05,
"loss": 0.9975083351135254,
"mean_token_accuracy": 0.7837231978774071,
"num_tokens": 23918641.0,
"step": 5960
},
{
"entropy": 0.8590337552130223,
"epoch": 2.284730195177956,
"grad_norm": 0.1032002717256546,
"learning_rate": 5.022156573116692e-05,
"loss": 0.8895168304443359,
"mean_token_accuracy": 0.7937395930290222,
"num_tokens": 23964403.0,
"step": 5970
},
{
"entropy": 0.8678315542638302,
"epoch": 2.288557213930348,
"grad_norm": 0.12054577469825745,
"learning_rate": 4.9953001208540356e-05,
"loss": 0.9571179389953614,
"mean_token_accuracy": 0.7875312000513077,
"num_tokens": 24001736.0,
"step": 5980
},
{
"entropy": 0.8353918489068747,
"epoch": 2.29238423268274,
"grad_norm": 0.1126277968287468,
"learning_rate": 4.9684436685913796e-05,
"loss": 0.927174186706543,
"mean_token_accuracy": 0.7998543947935104,
"num_tokens": 24038494.0,
"step": 5990
},
{
"entropy": 0.7281714532524347,
"epoch": 2.296211251435132,
"grad_norm": 0.09404657036066055,
"learning_rate": 4.941587216328723e-05,
"loss": 0.7814407825469971,
"mean_token_accuracy": 0.8194777265191078,
"num_tokens": 24077404.0,
"step": 6000
},
{
"entropy": 0.8627386562526226,
"epoch": 2.3000382701875237,
"grad_norm": 0.07272294908761978,
"learning_rate": 4.914730764066067e-05,
"loss": 0.8920239448547364,
"mean_token_accuracy": 0.7905093863606453,
"num_tokens": 24123483.0,
"step": 6010
},
{
"entropy": 0.8679380901157856,
"epoch": 2.3038652889399156,
"grad_norm": 0.09443669021129608,
"learning_rate": 4.887874311803411e-05,
"loss": 0.874543571472168,
"mean_token_accuracy": 0.7891486629843711,
"num_tokens": 24165215.0,
"step": 6020
},
{
"entropy": 0.8942526787519455,
"epoch": 2.3076923076923075,
"grad_norm": 0.0953405573964119,
"learning_rate": 4.861017859540755e-05,
"loss": 0.9304584503173828,
"mean_token_accuracy": 0.7855148240923882,
"num_tokens": 24204454.0,
"step": 6030
},
{
"entropy": 0.7896301347762347,
"epoch": 2.3115193264446994,
"grad_norm": 0.11093971133232117,
"learning_rate": 4.834161407278099e-05,
"loss": 0.8957646369934082,
"mean_token_accuracy": 0.8066290900111198,
"num_tokens": 24245578.0,
"step": 6040
},
{
"entropy": 0.9012999664992094,
"epoch": 2.3153463451970913,
"grad_norm": 0.09953141212463379,
"learning_rate": 4.8073049550154424e-05,
"loss": 0.9699124336242676,
"mean_token_accuracy": 0.7792607560753823,
"num_tokens": 24286627.0,
"step": 6050
},
{
"entropy": 0.8553815156221389,
"epoch": 2.319173363949483,
"grad_norm": 0.09737669676542282,
"learning_rate": 4.7804485027527864e-05,
"loss": 0.9319831848144531,
"mean_token_accuracy": 0.7943563163280487,
"num_tokens": 24326050.0,
"step": 6060
},
{
"entropy": 0.8088245622813701,
"epoch": 2.323000382701875,
"grad_norm": 0.11754145473241806,
"learning_rate": 4.7535920504901305e-05,
"loss": 0.8612746238708496,
"mean_token_accuracy": 0.7998821645975113,
"num_tokens": 24365505.0,
"step": 6070
},
{
"entropy": 0.8720655493438244,
"epoch": 2.326827401454267,
"grad_norm": 0.10582665354013443,
"learning_rate": 4.726735598227474e-05,
"loss": 0.9663046836853028,
"mean_token_accuracy": 0.78773233294487,
"num_tokens": 24403619.0,
"step": 6080
},
{
"entropy": 0.814146314561367,
"epoch": 2.330654420206659,
"grad_norm": 0.10099766403436661,
"learning_rate": 4.6998791459648185e-05,
"loss": 0.8403602600097656,
"mean_token_accuracy": 0.8022790655493737,
"num_tokens": 24441133.0,
"step": 6090
},
{
"entropy": 0.8325122386217118,
"epoch": 2.3344814389590507,
"grad_norm": 0.0968555137515068,
"learning_rate": 4.673022693702162e-05,
"loss": 0.8952775955200195,
"mean_token_accuracy": 0.7972952157258988,
"num_tokens": 24487908.0,
"step": 6100
},
{
"entropy": 0.8313679326325655,
"epoch": 2.3383084577114426,
"grad_norm": 0.09856109321117401,
"learning_rate": 4.6461662414395066e-05,
"loss": 0.8740328788757324,
"mean_token_accuracy": 0.7973453208804131,
"num_tokens": 24528859.0,
"step": 6110
},
{
"entropy": 0.9734285809099674,
"epoch": 2.3421354764638345,
"grad_norm": 0.08564373850822449,
"learning_rate": 4.61930978917685e-05,
"loss": 1.0028407096862793,
"mean_token_accuracy": 0.761284664273262,
"num_tokens": 24574604.0,
"step": 6120
},
{
"entropy": 0.9015337243676186,
"epoch": 2.3459624952162264,
"grad_norm": 0.09626568853855133,
"learning_rate": 4.592453336914194e-05,
"loss": 0.9965445518493652,
"mean_token_accuracy": 0.7804829552769661,
"num_tokens": 24615926.0,
"step": 6130
},
{
"entropy": 0.8764280565083027,
"epoch": 2.3497895139686182,
"grad_norm": 0.09104456007480621,
"learning_rate": 4.565596884651538e-05,
"loss": 0.9158814430236817,
"mean_token_accuracy": 0.7859255224466324,
"num_tokens": 24656662.0,
"step": 6140
},
{
"entropy": 0.8626538865268231,
"epoch": 2.35361653272101,
"grad_norm": 0.10454346984624863,
"learning_rate": 4.5387404323888814e-05,
"loss": 0.9093445777893067,
"mean_token_accuracy": 0.7909897804260254,
"num_tokens": 24696048.0,
"step": 6150
},
{
"entropy": 0.9042750746011734,
"epoch": 2.357443551473402,
"grad_norm": 0.09976542741060257,
"learning_rate": 4.511883980126226e-05,
"loss": 0.9527711868286133,
"mean_token_accuracy": 0.7807446241378784,
"num_tokens": 24738856.0,
"step": 6160
},
{
"entropy": 0.892713101953268,
"epoch": 2.361270570225794,
"grad_norm": 0.09778838604688644,
"learning_rate": 4.4850275278635694e-05,
"loss": 0.9142132759094238,
"mean_token_accuracy": 0.7793798848986626,
"num_tokens": 24781940.0,
"step": 6170
},
{
"entropy": 0.8652282394468784,
"epoch": 2.365097588978186,
"grad_norm": 0.13737474381923676,
"learning_rate": 4.4581710756009134e-05,
"loss": 0.9030959129333496,
"mean_token_accuracy": 0.7882118329405785,
"num_tokens": 24818476.0,
"step": 6180
},
{
"entropy": 0.880942365527153,
"epoch": 2.3689246077305777,
"grad_norm": 0.09460416436195374,
"learning_rate": 4.4313146233382575e-05,
"loss": 0.9684123992919922,
"mean_token_accuracy": 0.7829654842615128,
"num_tokens": 24856950.0,
"step": 6190
},
{
"entropy": 0.9563789039850235,
"epoch": 2.3727516264829696,
"grad_norm": 0.10954713076353073,
"learning_rate": 4.404458171075601e-05,
"loss": 1.029030704498291,
"mean_token_accuracy": 0.7727080956101418,
"num_tokens": 24895606.0,
"step": 6200
},
{
"entropy": 0.827500730752945,
"epoch": 2.3765786452353614,
"grad_norm": 0.1212112084031105,
"learning_rate": 4.377601718812945e-05,
"loss": 0.8650990486145019,
"mean_token_accuracy": 0.7993797525763512,
"num_tokens": 24932482.0,
"step": 6210
},
{
"entropy": 0.8221234314143657,
"epoch": 2.3804056639877533,
"grad_norm": 0.10023710876703262,
"learning_rate": 4.350745266550289e-05,
"loss": 0.8777777671813964,
"mean_token_accuracy": 0.7987013593316078,
"num_tokens": 24975109.0,
"step": 6220
},
{
"entropy": 0.8734230428934098,
"epoch": 2.384232682740145,
"grad_norm": 0.09403553605079651,
"learning_rate": 4.323888814287633e-05,
"loss": 0.8978803634643555,
"mean_token_accuracy": 0.7872134670615196,
"num_tokens": 25020916.0,
"step": 6230
},
{
"entropy": 0.9003870271146297,
"epoch": 2.388059701492537,
"grad_norm": 0.09854581952095032,
"learning_rate": 4.297032362024977e-05,
"loss": 0.9225659370422363,
"mean_token_accuracy": 0.7807397484779358,
"num_tokens": 25061018.0,
"step": 6240
},
{
"entropy": 0.8118300527334213,
"epoch": 2.391886720244929,
"grad_norm": 0.11139514297246933,
"learning_rate": 4.27017590976232e-05,
"loss": 0.8876243591308594,
"mean_token_accuracy": 0.800039604306221,
"num_tokens": 25097954.0,
"step": 6250
},
{
"entropy": 0.8419897515326739,
"epoch": 2.395713738997321,
"grad_norm": 0.09123879671096802,
"learning_rate": 4.243319457499664e-05,
"loss": 0.86744384765625,
"mean_token_accuracy": 0.7919191718101501,
"num_tokens": 25134260.0,
"step": 6260
},
{
"entropy": 0.9123246632516384,
"epoch": 2.3995407577497128,
"grad_norm": 0.10300562530755997,
"learning_rate": 4.2164630052370084e-05,
"loss": 0.9368386268615723,
"mean_token_accuracy": 0.7797829449176789,
"num_tokens": 25176001.0,
"step": 6270
},
{
"entropy": 0.9066010326147079,
"epoch": 2.4033677765021046,
"grad_norm": 0.10231593996286392,
"learning_rate": 4.1896065529743524e-05,
"loss": 0.9637252807617187,
"mean_token_accuracy": 0.7807635113596916,
"num_tokens": 25214450.0,
"step": 6280
},
{
"entropy": 0.8680018067359925,
"epoch": 2.4071947952544965,
"grad_norm": 0.09813899546861649,
"learning_rate": 4.162750100711696e-05,
"loss": 0.9405930519104004,
"mean_token_accuracy": 0.7862071350216866,
"num_tokens": 25249019.0,
"step": 6290
},
{
"entropy": 0.8444254245609045,
"epoch": 2.4110218140068884,
"grad_norm": 0.09815159440040588,
"learning_rate": 4.1358936484490404e-05,
"loss": 0.9015726089477539,
"mean_token_accuracy": 0.7970604464411736,
"num_tokens": 25287466.0,
"step": 6300
},
{
"entropy": 0.9179269846528768,
"epoch": 2.4148488327592803,
"grad_norm": 0.1013285368680954,
"learning_rate": 4.109037196186384e-05,
"loss": 0.9629206657409668,
"mean_token_accuracy": 0.7756785362958908,
"num_tokens": 25325488.0,
"step": 6310
},
{
"entropy": 0.8627055402845144,
"epoch": 2.418675851511672,
"grad_norm": 0.09085863828659058,
"learning_rate": 4.082180743923728e-05,
"loss": 0.8825644493103028,
"mean_token_accuracy": 0.7927587017416954,
"num_tokens": 25362470.0,
"step": 6320
},
{
"entropy": 0.8909512132406234,
"epoch": 2.422502870264064,
"grad_norm": 0.12609654664993286,
"learning_rate": 4.055324291661072e-05,
"loss": 0.9005517959594727,
"mean_token_accuracy": 0.784729179739952,
"num_tokens": 25405399.0,
"step": 6330
},
{
"entropy": 0.8371693149209023,
"epoch": 2.426329889016456,
"grad_norm": 0.09511356055736542,
"learning_rate": 4.028467839398415e-05,
"loss": 0.8819235801696778,
"mean_token_accuracy": 0.7933985084295273,
"num_tokens": 25443537.0,
"step": 6340
},
{
"entropy": 0.8452706336975098,
"epoch": 2.430156907768848,
"grad_norm": 0.08440756797790527,
"learning_rate": 4.00161138713576e-05,
"loss": 0.9220956802368164,
"mean_token_accuracy": 0.791832709312439,
"num_tokens": 25482874.0,
"step": 6350
},
{
"entropy": 0.8533206440508365,
"epoch": 2.4339839265212397,
"grad_norm": 0.10529948770999908,
"learning_rate": 3.974754934873103e-05,
"loss": 0.8976041793823242,
"mean_token_accuracy": 0.7917203813791275,
"num_tokens": 25523091.0,
"step": 6360
},
{
"entropy": 0.8192368470132351,
"epoch": 2.4378109452736316,
"grad_norm": 0.08338342607021332,
"learning_rate": 3.947898482610447e-05,
"loss": 0.8657890319824219,
"mean_token_accuracy": 0.8002077579498291,
"num_tokens": 25566050.0,
"step": 6370
},
{
"entropy": 0.9303523369133473,
"epoch": 2.4416379640260235,
"grad_norm": 0.09010683745145798,
"learning_rate": 3.921042030347791e-05,
"loss": 0.9760264396667481,
"mean_token_accuracy": 0.7748634815216064,
"num_tokens": 25608936.0,
"step": 6380
},
{
"entropy": 0.7555282160639762,
"epoch": 2.4454649827784154,
"grad_norm": 0.11948851495981216,
"learning_rate": 3.894185578085135e-05,
"loss": 0.8005829811096191,
"mean_token_accuracy": 0.8136610746383667,
"num_tokens": 25647408.0,
"step": 6390
},
{
"entropy": 0.8959879912436008,
"epoch": 2.4492920015308073,
"grad_norm": 0.09189214557409286,
"learning_rate": 3.8673291258224794e-05,
"loss": 0.9070920944213867,
"mean_token_accuracy": 0.7838554188609124,
"num_tokens": 25690271.0,
"step": 6400
},
{
"entropy": 0.7601668298244476,
"epoch": 2.453119020283199,
"grad_norm": 0.11115460842847824,
"learning_rate": 3.840472673559823e-05,
"loss": 0.837701416015625,
"mean_token_accuracy": 0.8158529132604599,
"num_tokens": 25730098.0,
"step": 6410
},
{
"entropy": 0.9026189528405666,
"epoch": 2.456946039035591,
"grad_norm": 0.0951504036784172,
"learning_rate": 3.813616221297167e-05,
"loss": 0.9555998802185058,
"mean_token_accuracy": 0.7768774792551995,
"num_tokens": 25769649.0,
"step": 6420
},
{
"entropy": 0.8566267982125282,
"epoch": 2.460773057787983,
"grad_norm": 0.1477993279695511,
"learning_rate": 3.786759769034511e-05,
"loss": 0.901324462890625,
"mean_token_accuracy": 0.7918707326054573,
"num_tokens": 25805906.0,
"step": 6430
},
{
"entropy": 0.8576595298945904,
"epoch": 2.464600076540375,
"grad_norm": 0.08643563091754913,
"learning_rate": 3.759903316771854e-05,
"loss": 0.9027094841003418,
"mean_token_accuracy": 0.7925754263997078,
"num_tokens": 25847270.0,
"step": 6440
},
{
"entropy": 0.8848195761442185,
"epoch": 2.4684270952927667,
"grad_norm": 0.1148499846458435,
"learning_rate": 3.733046864509199e-05,
"loss": 0.9222222328186035,
"mean_token_accuracy": 0.7866752982139588,
"num_tokens": 25890454.0,
"step": 6450
},
{
"entropy": 0.8222585029900074,
"epoch": 2.4722541140451586,
"grad_norm": 0.1051439717411995,
"learning_rate": 3.706190412246542e-05,
"loss": 0.8674264907836914,
"mean_token_accuracy": 0.8014690011739731,
"num_tokens": 25927176.0,
"step": 6460
},
{
"entropy": 0.7895723138004541,
"epoch": 2.4760811327975505,
"grad_norm": 0.08904940634965897,
"learning_rate": 3.679333959983886e-05,
"loss": 0.8720718383789062,
"mean_token_accuracy": 0.8032544136047364,
"num_tokens": 25969008.0,
"step": 6470
},
{
"entropy": 0.8449521534144878,
"epoch": 2.4799081515499424,
"grad_norm": 0.09109736979007721,
"learning_rate": 3.65247750772123e-05,
"loss": 0.8994977951049805,
"mean_token_accuracy": 0.7939551532268524,
"num_tokens": 26008671.0,
"step": 6480
},
{
"entropy": 0.8769714809954167,
"epoch": 2.4837351703023343,
"grad_norm": 0.09221527725458145,
"learning_rate": 3.625621055458574e-05,
"loss": 0.9647493362426758,
"mean_token_accuracy": 0.7877351269125938,
"num_tokens": 26047583.0,
"step": 6490
},
{
"entropy": 0.840660959109664,
"epoch": 2.487562189054726,
"grad_norm": 0.0888860896229744,
"learning_rate": 3.598764603195918e-05,
"loss": 0.872824764251709,
"mean_token_accuracy": 0.7932088255882264,
"num_tokens": 26090690.0,
"step": 6500
},
{
"entropy": 0.9435165245085955,
"epoch": 2.491389207807118,
"grad_norm": 0.10055243968963623,
"learning_rate": 3.571908150933262e-05,
"loss": 1.008607769012451,
"mean_token_accuracy": 0.7684792190790176,
"num_tokens": 26134620.0,
"step": 6510
},
{
"entropy": 0.9596942149102687,
"epoch": 2.49521622655951,
"grad_norm": 0.11321604251861572,
"learning_rate": 3.545051698670606e-05,
"loss": 1.021597957611084,
"mean_token_accuracy": 0.7706323087215423,
"num_tokens": 26176850.0,
"step": 6520
},
{
"entropy": 0.9805667255073786,
"epoch": 2.499043245311902,
"grad_norm": 0.13084010779857635,
"learning_rate": 3.51819524640795e-05,
"loss": 1.0418537139892579,
"mean_token_accuracy": 0.763472905755043,
"num_tokens": 26220943.0,
"step": 6530
},
{
"entropy": 0.9104986634105444,
"epoch": 2.5028702640642937,
"grad_norm": 0.09176472574472427,
"learning_rate": 3.491338794145294e-05,
"loss": 0.972693920135498,
"mean_token_accuracy": 0.7809211134910583,
"num_tokens": 26262084.0,
"step": 6540
},
{
"entropy": 0.8316202580928802,
"epoch": 2.5066972828166856,
"grad_norm": 0.11009900271892548,
"learning_rate": 3.464482341882637e-05,
"loss": 0.8581557273864746,
"mean_token_accuracy": 0.7978575736284256,
"num_tokens": 26302790.0,
"step": 6550
},
{
"entropy": 0.9041007287800312,
"epoch": 2.5105243015690775,
"grad_norm": 0.12103740125894547,
"learning_rate": 3.437625889619981e-05,
"loss": 0.9546697616577149,
"mean_token_accuracy": 0.7800753250718117,
"num_tokens": 26347959.0,
"step": 6560
},
{
"entropy": 0.8139931574463845,
"epoch": 2.5143513203214694,
"grad_norm": 0.08679619431495667,
"learning_rate": 3.410769437357325e-05,
"loss": 0.8982272148132324,
"mean_token_accuracy": 0.8002956256270408,
"num_tokens": 26388946.0,
"step": 6570
},
{
"entropy": 0.838017127290368,
"epoch": 2.5181783390738612,
"grad_norm": 0.12066033482551575,
"learning_rate": 3.383912985094669e-05,
"loss": 0.8589006423950195,
"mean_token_accuracy": 0.7943052783608436,
"num_tokens": 26431191.0,
"step": 6580
},
{
"entropy": 0.8299121838063002,
"epoch": 2.522005357826253,
"grad_norm": 0.08988375216722488,
"learning_rate": 3.357056532832013e-05,
"loss": 0.9106943130493164,
"mean_token_accuracy": 0.7972570925951004,
"num_tokens": 26468346.0,
"step": 6590
},
{
"entropy": 1.0362544253468513,
"epoch": 2.525832376578645,
"grad_norm": 0.10034547746181488,
"learning_rate": 3.3302000805693566e-05,
"loss": 1.0991132736206055,
"mean_token_accuracy": 0.7502188056707382,
"num_tokens": 26508029.0,
"step": 6600
},
{
"entropy": 0.9098232574760914,
"epoch": 2.529659395331037,
"grad_norm": 0.12513861060142517,
"learning_rate": 3.303343628306701e-05,
"loss": 0.9807866096496582,
"mean_token_accuracy": 0.7815383434295654,
"num_tokens": 26549321.0,
"step": 6610
},
{
"entropy": 0.8234303712844848,
"epoch": 2.533486414083429,
"grad_norm": 0.08378947526216507,
"learning_rate": 3.2764871760440446e-05,
"loss": 0.8650754928588867,
"mean_token_accuracy": 0.7995569303631782,
"num_tokens": 26589472.0,
"step": 6620
},
{
"entropy": 0.769949347153306,
"epoch": 2.5373134328358207,
"grad_norm": 0.12056911736726761,
"learning_rate": 3.249630723781389e-05,
"loss": 0.8480927467346191,
"mean_token_accuracy": 0.818176555633545,
"num_tokens": 26627566.0,
"step": 6630
},
{
"entropy": 0.8099306054413319,
"epoch": 2.5411404515882126,
"grad_norm": 0.09869939833879471,
"learning_rate": 3.222774271518733e-05,
"loss": 0.8649662017822266,
"mean_token_accuracy": 0.7981634557247161,
"num_tokens": 26662566.0,
"step": 6640
},
{
"entropy": 0.8528701025992632,
"epoch": 2.5449674703406044,
"grad_norm": 0.10336704552173615,
"learning_rate": 3.195917819256076e-05,
"loss": 0.9127251625061035,
"mean_token_accuracy": 0.7928516089916229,
"num_tokens": 26705768.0,
"step": 6650
},
{
"entropy": 0.8498493686318398,
"epoch": 2.5487944890929963,
"grad_norm": 0.10704471170902252,
"learning_rate": 3.169061366993421e-05,
"loss": 0.863565731048584,
"mean_token_accuracy": 0.7932710304856301,
"num_tokens": 26743574.0,
"step": 6660
},
{
"entropy": 0.8566017836332321,
"epoch": 2.552621507845388,
"grad_norm": 0.12135261297225952,
"learning_rate": 3.142204914730764e-05,
"loss": 0.9187004089355468,
"mean_token_accuracy": 0.7913481816649437,
"num_tokens": 26784127.0,
"step": 6670
},
{
"entropy": 0.8302055161446333,
"epoch": 2.55644852659778,
"grad_norm": 0.1430647373199463,
"learning_rate": 3.115348462468108e-05,
"loss": 0.8857596397399903,
"mean_token_accuracy": 0.7965412393212319,
"num_tokens": 26823189.0,
"step": 6680
},
{
"entropy": 0.8327139757573605,
"epoch": 2.560275545350172,
"grad_norm": 0.09538804739713669,
"learning_rate": 3.088492010205452e-05,
"loss": 0.9255412101745606,
"mean_token_accuracy": 0.7939359977841377,
"num_tokens": 26861599.0,
"step": 6690
},
{
"entropy": 0.8530606523156166,
"epoch": 2.564102564102564,
"grad_norm": 0.09193538129329681,
"learning_rate": 3.0616355579427955e-05,
"loss": 0.9151040077209472,
"mean_token_accuracy": 0.7901859179139137,
"num_tokens": 26901064.0,
"step": 6700
},
{
"entropy": 0.794033832848072,
"epoch": 2.5679295828549558,
"grad_norm": 0.1283407062292099,
"learning_rate": 3.03477910568014e-05,
"loss": 0.8441056251525879,
"mean_token_accuracy": 0.8033816903829575,
"num_tokens": 26942161.0,
"step": 6710
},
{
"entropy": 0.9340717010200024,
"epoch": 2.5717566016073476,
"grad_norm": 0.09237734973430634,
"learning_rate": 3.0079226534174836e-05,
"loss": 0.9747485160827637,
"mean_token_accuracy": 0.7732965379953385,
"num_tokens": 26982759.0,
"step": 6720
},
{
"entropy": 0.8746799558401108,
"epoch": 2.5755836203597395,
"grad_norm": 0.1391710638999939,
"learning_rate": 2.9810662011548273e-05,
"loss": 0.9311764717102051,
"mean_token_accuracy": 0.7883311554789543,
"num_tokens": 27022926.0,
"step": 6730
},
{
"entropy": 0.8290158938616514,
"epoch": 2.5794106391121314,
"grad_norm": 0.10442391782999039,
"learning_rate": 2.9542097488921716e-05,
"loss": 0.8346040725708008,
"mean_token_accuracy": 0.7985544398427009,
"num_tokens": 27065028.0,
"step": 6740
},
{
"entropy": 0.8574424415826798,
"epoch": 2.5832376578645233,
"grad_norm": 0.13001689314842224,
"learning_rate": 2.9273532966295153e-05,
"loss": 0.906099510192871,
"mean_token_accuracy": 0.790866918861866,
"num_tokens": 27100867.0,
"step": 6750
},
{
"entropy": 0.840974472463131,
"epoch": 2.587064676616915,
"grad_norm": 0.1224556565284729,
"learning_rate": 2.9004968443668594e-05,
"loss": 0.8969048500061035,
"mean_token_accuracy": 0.7953185483813285,
"num_tokens": 27137338.0,
"step": 6760
},
{
"entropy": 0.8477607406675816,
"epoch": 2.590891695369307,
"grad_norm": 0.09641005098819733,
"learning_rate": 2.873640392104203e-05,
"loss": 0.9569526672363281,
"mean_token_accuracy": 0.7941199511289596,
"num_tokens": 27178308.0,
"step": 6770
},
{
"entropy": 0.8317056275904179,
"epoch": 2.594718714121699,
"grad_norm": 0.11853990703821182,
"learning_rate": 2.8467839398415468e-05,
"loss": 0.9125295639038086,
"mean_token_accuracy": 0.7960822626948356,
"num_tokens": 27216898.0,
"step": 6780
},
{
"entropy": 0.8558823302388191,
"epoch": 2.598545732874091,
"grad_norm": 0.10477570444345474,
"learning_rate": 2.819927487578891e-05,
"loss": 0.8844131469726563,
"mean_token_accuracy": 0.7940610870718956,
"num_tokens": 27254443.0,
"step": 6790
},
{
"entropy": 0.8210954669862985,
"epoch": 2.6023727516264827,
"grad_norm": 0.14100609719753265,
"learning_rate": 2.7930710353162348e-05,
"loss": 0.8684535980224609,
"mean_token_accuracy": 0.7988820597529411,
"num_tokens": 27290079.0,
"step": 6800
},
{
"entropy": 0.8657392464578152,
"epoch": 2.6061997703788746,
"grad_norm": 0.09813658148050308,
"learning_rate": 2.7662145830535785e-05,
"loss": 0.9158803939819335,
"mean_token_accuracy": 0.7908033922314643,
"num_tokens": 27328190.0,
"step": 6810
},
{
"entropy": 0.8866597019135952,
"epoch": 2.6100267891312665,
"grad_norm": 0.11115613579750061,
"learning_rate": 2.739358130790923e-05,
"loss": 0.9120420455932617,
"mean_token_accuracy": 0.7854148596525192,
"num_tokens": 27369945.0,
"step": 6820
},
{
"entropy": 0.7982962183654309,
"epoch": 2.6138538078836584,
"grad_norm": 0.1377696692943573,
"learning_rate": 2.7125016785282666e-05,
"loss": 0.8332090377807617,
"mean_token_accuracy": 0.8022376418113708,
"num_tokens": 27406302.0,
"step": 6830
},
{
"entropy": 0.8424798093736172,
"epoch": 2.6176808266360503,
"grad_norm": 0.11442425101995468,
"learning_rate": 2.6856452262656106e-05,
"loss": 0.8876424789428711,
"mean_token_accuracy": 0.7893706291913987,
"num_tokens": 27449733.0,
"step": 6840
},
{
"entropy": 0.9239407800137996,
"epoch": 2.621507845388442,
"grad_norm": 0.0799759030342102,
"learning_rate": 2.6587887740029543e-05,
"loss": 0.9658034324645997,
"mean_token_accuracy": 0.7757296651601792,
"num_tokens": 27492884.0,
"step": 6850
},
{
"entropy": 0.8720928959548473,
"epoch": 2.625334864140834,
"grad_norm": 0.11632338911294937,
"learning_rate": 2.631932321740298e-05,
"loss": 0.9089359283447266,
"mean_token_accuracy": 0.7913818553090095,
"num_tokens": 27531878.0,
"step": 6860
},
{
"entropy": 0.9302754916250706,
"epoch": 2.629161882893226,
"grad_norm": 0.11215951293706894,
"learning_rate": 2.6050758694776423e-05,
"loss": 1.0027677536010742,
"mean_token_accuracy": 0.7739164605736732,
"num_tokens": 27567970.0,
"step": 6870
},
{
"entropy": 0.9016003269702196,
"epoch": 2.632988901645618,
"grad_norm": 0.11951353400945663,
"learning_rate": 2.578219417214986e-05,
"loss": 0.9493217468261719,
"mean_token_accuracy": 0.7779877439141274,
"num_tokens": 27609840.0,
"step": 6880
},
{
"entropy": 0.8870487026870251,
"epoch": 2.6368159203980097,
"grad_norm": 0.1124744564294815,
"learning_rate": 2.55136296495233e-05,
"loss": 1.0031387329101562,
"mean_token_accuracy": 0.7866110280156136,
"num_tokens": 27649655.0,
"step": 6890
},
{
"entropy": 0.9296976864337921,
"epoch": 2.6406429391504016,
"grad_norm": 0.1161704882979393,
"learning_rate": 2.5245065126896738e-05,
"loss": 1.012251853942871,
"mean_token_accuracy": 0.7726465791463852,
"num_tokens": 27694105.0,
"step": 6900
},
{
"entropy": 0.8415393102914095,
"epoch": 2.6444699579027935,
"grad_norm": 0.0987096056342125,
"learning_rate": 2.4976500604270178e-05,
"loss": 0.9147520065307617,
"mean_token_accuracy": 0.7973951831459999,
"num_tokens": 27730663.0,
"step": 6910
},
{
"entropy": 0.8274203538894653,
"epoch": 2.6482969766551854,
"grad_norm": 0.1101188212633133,
"learning_rate": 2.4707936081643615e-05,
"loss": 0.8873770713806153,
"mean_token_accuracy": 0.7974746853113175,
"num_tokens": 27772881.0,
"step": 6920
},
{
"entropy": 0.7984559834003448,
"epoch": 2.6521239954075773,
"grad_norm": 0.10185439884662628,
"learning_rate": 2.4439371559017055e-05,
"loss": 0.8775921821594238,
"mean_token_accuracy": 0.807880648970604,
"num_tokens": 27809534.0,
"step": 6930
},
{
"entropy": 0.887981615960598,
"epoch": 2.655951014159969,
"grad_norm": 0.08309295773506165,
"learning_rate": 2.4170807036390495e-05,
"loss": 0.9466443061828613,
"mean_token_accuracy": 0.7859978228807449,
"num_tokens": 27852591.0,
"step": 6940
},
{
"entropy": 0.9378888584673405,
"epoch": 2.659778032912361,
"grad_norm": 0.136076882481575,
"learning_rate": 2.3902242513763932e-05,
"loss": 1.0269956588745117,
"mean_token_accuracy": 0.7709244459867477,
"num_tokens": 27892120.0,
"step": 6950
},
{
"entropy": 0.9220107842236758,
"epoch": 2.663605051664753,
"grad_norm": 0.08248933404684067,
"learning_rate": 2.363367799113737e-05,
"loss": 0.9726594924926758,
"mean_token_accuracy": 0.7753236919641495,
"num_tokens": 27935380.0,
"step": 6960
},
{
"entropy": 0.7793348811566829,
"epoch": 2.667432070417145,
"grad_norm": 0.08308061957359314,
"learning_rate": 2.336511346851081e-05,
"loss": 0.7947993278503418,
"mean_token_accuracy": 0.8088447406888009,
"num_tokens": 27973020.0,
"step": 6970
},
{
"entropy": 0.9587450519204139,
"epoch": 2.6712590891695367,
"grad_norm": 0.10263237357139587,
"learning_rate": 2.309654894588425e-05,
"loss": 0.9791707038879395,
"mean_token_accuracy": 0.7663016110658646,
"num_tokens": 28016389.0,
"step": 6980
},
{
"entropy": 0.8766636185348033,
"epoch": 2.6750861079219286,
"grad_norm": 0.09917714446783066,
"learning_rate": 2.282798442325769e-05,
"loss": 0.9187355041503906,
"mean_token_accuracy": 0.7864622801542283,
"num_tokens": 28058100.0,
"step": 6990
},
{
"entropy": 0.8623256701976061,
"epoch": 2.6789131266743205,
"grad_norm": 0.08802894502878189,
"learning_rate": 2.255941990063113e-05,
"loss": 0.9108509063720703,
"mean_token_accuracy": 0.7891170993447304,
"num_tokens": 28095166.0,
"step": 7000
},
{
"entropy": 0.919238954409957,
"epoch": 2.6827401454267124,
"grad_norm": 0.11916540563106537,
"learning_rate": 2.2290855378004567e-05,
"loss": 0.9972674369812011,
"mean_token_accuracy": 0.7765705808997154,
"num_tokens": 28137533.0,
"step": 7010
},
{
"entropy": 0.918128065392375,
"epoch": 2.6865671641791042,
"grad_norm": 0.09536208212375641,
"learning_rate": 2.2022290855378004e-05,
"loss": 0.9865476608276367,
"mean_token_accuracy": 0.7736267536878586,
"num_tokens": 28179301.0,
"step": 7020
},
{
"entropy": 0.8265572734177112,
"epoch": 2.690394182931496,
"grad_norm": 0.09432680904865265,
"learning_rate": 2.1753726332751444e-05,
"loss": 0.8995939254760742,
"mean_token_accuracy": 0.7947996065020562,
"num_tokens": 28223849.0,
"step": 7030
},
{
"entropy": 0.8321899034082889,
"epoch": 2.694221201683888,
"grad_norm": 0.1223755031824112,
"learning_rate": 2.1485161810124885e-05,
"loss": 0.9003139495849609,
"mean_token_accuracy": 0.7975824415683747,
"num_tokens": 28268485.0,
"step": 7040
},
{
"entropy": 0.9064472205936909,
"epoch": 2.69804822043628,
"grad_norm": 0.13409113883972168,
"learning_rate": 2.121659728749832e-05,
"loss": 0.9323970794677734,
"mean_token_accuracy": 0.7808707699179649,
"num_tokens": 28307792.0,
"step": 7050
},
{
"entropy": 0.9527742668986321,
"epoch": 2.701875239188672,
"grad_norm": 0.09863030910491943,
"learning_rate": 2.0948032764871762e-05,
"loss": 1.0056820869445802,
"mean_token_accuracy": 0.7673134744167328,
"num_tokens": 28355447.0,
"step": 7060
},
{
"entropy": 0.8202732041478157,
"epoch": 2.7057022579410637,
"grad_norm": 0.10251973569393158,
"learning_rate": 2.0679468242245202e-05,
"loss": 0.8743599891662598,
"mean_token_accuracy": 0.7957186102867126,
"num_tokens": 28397195.0,
"step": 7070
},
{
"entropy": 0.9328485410660505,
"epoch": 2.7095292766934556,
"grad_norm": 0.09044504910707474,
"learning_rate": 2.041090371961864e-05,
"loss": 0.9707870483398438,
"mean_token_accuracy": 0.7739486545324326,
"num_tokens": 28440070.0,
"step": 7080
},
{
"entropy": 0.9110265091061592,
"epoch": 2.7133562954458474,
"grad_norm": 0.10417858511209488,
"learning_rate": 2.0142339196992076e-05,
"loss": 0.9495024681091309,
"mean_token_accuracy": 0.7784481555223465,
"num_tokens": 28483039.0,
"step": 7090
},
{
"entropy": 0.907703897356987,
"epoch": 2.7171833141982393,
"grad_norm": 0.10365665704011917,
"learning_rate": 1.9873774674365516e-05,
"loss": 0.9539920806884765,
"mean_token_accuracy": 0.7803053423762322,
"num_tokens": 28524922.0,
"step": 7100
},
{
"entropy": 0.8090648584067821,
"epoch": 2.721010332950631,
"grad_norm": 0.13015250861644745,
"learning_rate": 1.9605210151738957e-05,
"loss": 0.8559967994689941,
"mean_token_accuracy": 0.7999090999364853,
"num_tokens": 28565638.0,
"step": 7110
},
{
"entropy": 0.832624789327383,
"epoch": 2.724837351703023,
"grad_norm": 0.12992241978645325,
"learning_rate": 1.9336645629112397e-05,
"loss": 0.886108112335205,
"mean_token_accuracy": 0.7986625626683235,
"num_tokens": 28603666.0,
"step": 7120
},
{
"entropy": 0.8167526118457318,
"epoch": 2.728664370455415,
"grad_norm": 0.0879233330488205,
"learning_rate": 1.9068081106485834e-05,
"loss": 0.8744274139404297,
"mean_token_accuracy": 0.8013173520565033,
"num_tokens": 28647331.0,
"step": 7130
},
{
"entropy": 0.8693740144371986,
"epoch": 2.732491389207807,
"grad_norm": 0.11505398899316788,
"learning_rate": 1.879951658385927e-05,
"loss": 0.9142866134643555,
"mean_token_accuracy": 0.7936322972178459,
"num_tokens": 28683073.0,
"step": 7140
},
{
"entropy": 0.7896613411605358,
"epoch": 2.7363184079601988,
"grad_norm": 0.10490158945322037,
"learning_rate": 1.853095206123271e-05,
"loss": 0.8762624740600586,
"mean_token_accuracy": 0.8044975116848946,
"num_tokens": 28722340.0,
"step": 7150
},
{
"entropy": 0.8261051677167416,
"epoch": 2.7401454267125906,
"grad_norm": 0.10280875116586685,
"learning_rate": 1.826238753860615e-05,
"loss": 0.888590145111084,
"mean_token_accuracy": 0.7989666223526001,
"num_tokens": 28757940.0,
"step": 7160
},
{
"entropy": 0.8630577899515629,
"epoch": 2.7439724454649825,
"grad_norm": 0.12757791578769684,
"learning_rate": 1.799382301597959e-05,
"loss": 0.9082697868347168,
"mean_token_accuracy": 0.7890564352273941,
"num_tokens": 28796985.0,
"step": 7170
},
{
"entropy": 0.8979216992855072,
"epoch": 2.7477994642173744,
"grad_norm": 0.13048897683620453,
"learning_rate": 1.772525849335303e-05,
"loss": 0.9468406677246094,
"mean_token_accuracy": 0.7829687342047691,
"num_tokens": 28838091.0,
"step": 7180
},
{
"entropy": 0.9002114910632372,
"epoch": 2.7516264829697663,
"grad_norm": 0.130500927567482,
"learning_rate": 1.745669397072647e-05,
"loss": 0.9897032737731933,
"mean_token_accuracy": 0.7817048847675323,
"num_tokens": 28879084.0,
"step": 7190
},
{
"entropy": 0.861878028512001,
"epoch": 2.755453501722158,
"grad_norm": 0.10523588210344315,
"learning_rate": 1.7188129448099906e-05,
"loss": 0.9628341674804688,
"mean_token_accuracy": 0.7882343173027039,
"num_tokens": 28918018.0,
"step": 7200
},
{
"entropy": 0.7814029835164547,
"epoch": 2.75928052047455,
"grad_norm": 0.14345957338809967,
"learning_rate": 1.6919564925473346e-05,
"loss": 0.8377615928649902,
"mean_token_accuracy": 0.8100636526942253,
"num_tokens": 28953674.0,
"step": 7210
},
{
"entropy": 0.8798072785139084,
"epoch": 2.763107539226942,
"grad_norm": 0.10911094397306442,
"learning_rate": 1.6651000402846783e-05,
"loss": 0.9405971527099609,
"mean_token_accuracy": 0.7843327835202217,
"num_tokens": 28995212.0,
"step": 7220
},
{
"entropy": 0.7432700909674168,
"epoch": 2.766934557979334,
"grad_norm": 0.09271088242530823,
"learning_rate": 1.6382435880220223e-05,
"loss": 0.7987990856170655,
"mean_token_accuracy": 0.8185402989387512,
"num_tokens": 29034878.0,
"step": 7230
},
{
"entropy": 0.7937459200620651,
"epoch": 2.7707615767317257,
"grad_norm": 0.11122163385152817,
"learning_rate": 1.6113871357593664e-05,
"loss": 0.8469036102294922,
"mean_token_accuracy": 0.8031805381178856,
"num_tokens": 29074372.0,
"step": 7240
},
{
"entropy": 0.8456454008817673,
"epoch": 2.7745885954841176,
"grad_norm": 0.11189702153205872,
"learning_rate": 1.5845306834967104e-05,
"loss": 0.8942484855651855,
"mean_token_accuracy": 0.7923400938510895,
"num_tokens": 29117619.0,
"step": 7250
},
{
"entropy": 0.885396859049797,
"epoch": 2.7784156142365095,
"grad_norm": 0.10170719027519226,
"learning_rate": 1.557674231234054e-05,
"loss": 0.9175837516784668,
"mean_token_accuracy": 0.7860854491591454,
"num_tokens": 29156601.0,
"step": 7260
},
{
"entropy": 0.8742636401206255,
"epoch": 2.7822426329889014,
"grad_norm": 0.11130956560373306,
"learning_rate": 1.5308177789713978e-05,
"loss": 0.9322646141052247,
"mean_token_accuracy": 0.7902692511677742,
"num_tokens": 29200295.0,
"step": 7270
},
{
"entropy": 0.8523757141083479,
"epoch": 2.7860696517412933,
"grad_norm": 0.08611233532428741,
"learning_rate": 1.5039613267087418e-05,
"loss": 0.9210372924804687,
"mean_token_accuracy": 0.7912763133645058,
"num_tokens": 29235323.0,
"step": 7280
},
{
"entropy": 0.7804547689855099,
"epoch": 2.789896670493685,
"grad_norm": 0.08091949671506882,
"learning_rate": 1.4771048744460858e-05,
"loss": 0.8202395439147949,
"mean_token_accuracy": 0.8117679923772811,
"num_tokens": 29270182.0,
"step": 7290
},
{
"entropy": 0.8199648998677731,
"epoch": 2.793723689246077,
"grad_norm": 0.07486634701490402,
"learning_rate": 1.4502484221834297e-05,
"loss": 0.8396285057067872,
"mean_token_accuracy": 0.8032143607735633,
"num_tokens": 29311588.0,
"step": 7300
},
{
"entropy": 0.9650515951216221,
"epoch": 2.797550707998469,
"grad_norm": 0.10391585528850555,
"learning_rate": 1.4233919699207734e-05,
"loss": 1.047046184539795,
"mean_token_accuracy": 0.7648886650800705,
"num_tokens": 29353979.0,
"step": 7310
},
{
"entropy": 0.7674700990319252,
"epoch": 2.801377726750861,
"grad_norm": 0.09043332189321518,
"learning_rate": 1.3965355176581174e-05,
"loss": 0.8154891014099122,
"mean_token_accuracy": 0.8105725541710853,
"num_tokens": 29393298.0,
"step": 7320
},
{
"entropy": 0.7795201197266579,
"epoch": 2.8052047455032527,
"grad_norm": 0.14624197781085968,
"learning_rate": 1.3696790653954614e-05,
"loss": 0.7968831062316895,
"mean_token_accuracy": 0.808569261431694,
"num_tokens": 29423547.0,
"step": 7330
},
{
"entropy": 0.9187626458704472,
"epoch": 2.8090317642556446,
"grad_norm": 0.1368781179189682,
"learning_rate": 1.3428226131328053e-05,
"loss": 0.9583258628845215,
"mean_token_accuracy": 0.7731027945876121,
"num_tokens": 29465593.0,
"step": 7340
},
{
"entropy": 0.9403511643409729,
"epoch": 2.8128587830080365,
"grad_norm": 0.10892713069915771,
"learning_rate": 1.315966160870149e-05,
"loss": 0.9621626853942871,
"mean_token_accuracy": 0.767315211892128,
"num_tokens": 29506888.0,
"step": 7350
},
{
"entropy": 0.842640140466392,
"epoch": 2.8166858017604284,
"grad_norm": 0.08862321823835373,
"learning_rate": 1.289109708607493e-05,
"loss": 0.9031145095825195,
"mean_token_accuracy": 0.7967306047677993,
"num_tokens": 29550811.0,
"step": 7360
},
{
"entropy": 0.8931968793272972,
"epoch": 2.8205128205128203,
"grad_norm": 0.0979296937584877,
"learning_rate": 1.2622532563448369e-05,
"loss": 0.9369117736816406,
"mean_token_accuracy": 0.785995215177536,
"num_tokens": 29587036.0,
"step": 7370
},
{
"entropy": 0.8621913805603981,
"epoch": 2.824339839265212,
"grad_norm": 0.08778136223554611,
"learning_rate": 1.2353968040821807e-05,
"loss": 0.884724235534668,
"mean_token_accuracy": 0.790358729660511,
"num_tokens": 29627992.0,
"step": 7380
},
{
"entropy": 0.8695362661033869,
"epoch": 2.828166858017604,
"grad_norm": 0.09141552448272705,
"learning_rate": 1.2085403518195248e-05,
"loss": 0.9539263725280762,
"mean_token_accuracy": 0.78631162494421,
"num_tokens": 29668509.0,
"step": 7390
},
{
"entropy": 0.8454725466668606,
"epoch": 2.831993876769996,
"grad_norm": 0.10090988874435425,
"learning_rate": 1.1816838995568685e-05,
"loss": 0.9256816864013672,
"mean_token_accuracy": 0.7941092774271965,
"num_tokens": 29706794.0,
"step": 7400
},
{
"entropy": 0.8406473740935325,
"epoch": 2.835820895522388,
"grad_norm": 0.12991519272327423,
"learning_rate": 1.1548274472942125e-05,
"loss": 0.8969921112060547,
"mean_token_accuracy": 0.7950825378298759,
"num_tokens": 29745883.0,
"step": 7410
},
{
"entropy": 0.8951507560908795,
"epoch": 2.8396479142747797,
"grad_norm": 0.14208164811134338,
"learning_rate": 1.1279709950315565e-05,
"loss": 0.9443653106689454,
"mean_token_accuracy": 0.7820898026227951,
"num_tokens": 29788428.0,
"step": 7420
},
{
"entropy": 0.859702505543828,
"epoch": 2.8434749330271716,
"grad_norm": 0.10485101491212845,
"learning_rate": 1.1011145427689002e-05,
"loss": 0.9106481552124024,
"mean_token_accuracy": 0.7910059571266175,
"num_tokens": 29829552.0,
"step": 7430
},
{
"entropy": 0.838575328886509,
"epoch": 2.8473019517795635,
"grad_norm": 0.09105801582336426,
"learning_rate": 1.0742580905062442e-05,
"loss": 0.9367799758911133,
"mean_token_accuracy": 0.7953649654984474,
"num_tokens": 29869380.0,
"step": 7440
},
{
"entropy": 0.9112126015126705,
"epoch": 2.8511289705319554,
"grad_norm": 0.09724974632263184,
"learning_rate": 1.0474016382435881e-05,
"loss": 0.9621581077575684,
"mean_token_accuracy": 0.7795066565275193,
"num_tokens": 29913977.0,
"step": 7450
},
{
"entropy": 0.7964273016899824,
"epoch": 2.8549559892843472,
"grad_norm": 0.09481512755155563,
"learning_rate": 1.020545185980932e-05,
"loss": 0.8208577156066894,
"mean_token_accuracy": 0.8045729547739029,
"num_tokens": 29949229.0,
"step": 7460
},
{
"entropy": 0.9103045649826527,
"epoch": 2.858783008036739,
"grad_norm": 0.08678591996431351,
"learning_rate": 9.936887337182758e-06,
"loss": 0.9599167823791503,
"mean_token_accuracy": 0.7792657531797886,
"num_tokens": 29999070.0,
"step": 7470
},
{
"entropy": 0.8333844318985939,
"epoch": 2.862610026789131,
"grad_norm": 0.07823742181062698,
"learning_rate": 9.668322814556198e-06,
"loss": 0.8832645416259766,
"mean_token_accuracy": 0.7986885383725166,
"num_tokens": 30041797.0,
"step": 7480
},
{
"entropy": 0.8970901295542717,
"epoch": 2.866437045541523,
"grad_norm": 0.11852974444627762,
"learning_rate": 9.399758291929635e-06,
"loss": 0.9755334854125977,
"mean_token_accuracy": 0.7814395651221275,
"num_tokens": 30080534.0,
"step": 7490
},
{
"entropy": 0.8733609687536955,
"epoch": 2.870264064293915,
"grad_norm": 0.08307944238185883,
"learning_rate": 9.131193769303076e-06,
"loss": 0.9116435050964355,
"mean_token_accuracy": 0.786429825425148,
"num_tokens": 30123488.0,
"step": 7500
},
{
"entropy": 0.7967244807630778,
"epoch": 2.8740910830463067,
"grad_norm": 0.121941938996315,
"learning_rate": 8.862629246676514e-06,
"loss": 0.8209601402282715,
"mean_token_accuracy": 0.8040247783064842,
"num_tokens": 30158076.0,
"step": 7510
},
{
"entropy": 0.8655086796730757,
"epoch": 2.8779181017986986,
"grad_norm": 0.10017320513725281,
"learning_rate": 8.594064724049953e-06,
"loss": 0.9246477127075196,
"mean_token_accuracy": 0.7905631095170975,
"num_tokens": 30198329.0,
"step": 7520
},
{
"entropy": 0.7916971929371357,
"epoch": 2.8817451205510904,
"grad_norm": 0.08822990953922272,
"learning_rate": 8.325500201423391e-06,
"loss": 0.8695680618286132,
"mean_token_accuracy": 0.8063599601387977,
"num_tokens": 30239990.0,
"step": 7530
},
{
"entropy": 0.7693583916872739,
"epoch": 2.8855721393034823,
"grad_norm": 0.1178632378578186,
"learning_rate": 8.056935678796832e-06,
"loss": 0.8029808044433594,
"mean_token_accuracy": 0.808713173866272,
"num_tokens": 30272583.0,
"step": 7540
},
{
"entropy": 0.9072235215455293,
"epoch": 2.889399158055874,
"grad_norm": 0.11368006467819214,
"learning_rate": 7.78837115617027e-06,
"loss": 0.9859001159667968,
"mean_token_accuracy": 0.7825366839766502,
"num_tokens": 30314370.0,
"step": 7550
},
{
"entropy": 0.909162075817585,
"epoch": 2.893226176808266,
"grad_norm": 0.10643935948610306,
"learning_rate": 7.519806633543709e-06,
"loss": 0.9263824462890625,
"mean_token_accuracy": 0.7813168540596962,
"num_tokens": 30362103.0,
"step": 7560
},
{
"entropy": 0.8779693342745304,
"epoch": 2.897053195560658,
"grad_norm": 0.12511365115642548,
"learning_rate": 7.2512421109171484e-06,
"loss": 0.9283166885375976,
"mean_token_accuracy": 0.7876154363155365,
"num_tokens": 30400468.0,
"step": 7570
},
{
"entropy": 0.9308112382888794,
"epoch": 2.90088021431305,
"grad_norm": 0.08942066878080368,
"learning_rate": 6.982677588290587e-06,
"loss": 0.9894198417663574,
"mean_token_accuracy": 0.7739586725831031,
"num_tokens": 30444628.0,
"step": 7580
},
{
"entropy": 0.8830183774232865,
"epoch": 2.9047072330654418,
"grad_norm": 0.08949998021125793,
"learning_rate": 6.7141130656640265e-06,
"loss": 0.9515928268432617,
"mean_token_accuracy": 0.7846902176737786,
"num_tokens": 30485845.0,
"step": 7590
},
{
"entropy": 0.8058773010969162,
"epoch": 2.9085342518178336,
"grad_norm": 0.1035229042172432,
"learning_rate": 6.445548543037465e-06,
"loss": 0.846186637878418,
"mean_token_accuracy": 0.8066700398921967,
"num_tokens": 30523979.0,
"step": 7600
},
{
"entropy": 0.9146121144294739,
"epoch": 2.9123612705702255,
"grad_norm": 0.09379884600639343,
"learning_rate": 6.176984020410904e-06,
"loss": 0.9735233306884765,
"mean_token_accuracy": 0.7774886921048164,
"num_tokens": 30564775.0,
"step": 7610
},
{
"entropy": 0.8396586284041405,
"epoch": 2.9161882893226174,
"grad_norm": 0.11920839548110962,
"learning_rate": 5.908419497784342e-06,
"loss": 0.9061779022216797,
"mean_token_accuracy": 0.7974281132221221,
"num_tokens": 30609113.0,
"step": 7620
},
{
"entropy": 0.8665836162865161,
"epoch": 2.9200153080750093,
"grad_norm": 0.10214731842279434,
"learning_rate": 5.639854975157783e-06,
"loss": 0.9333956718444825,
"mean_token_accuracy": 0.7912585958838463,
"num_tokens": 30652409.0,
"step": 7630
},
{
"entropy": 0.8082432024180889,
"epoch": 2.923842326827401,
"grad_norm": 0.09191566705703735,
"learning_rate": 5.371290452531221e-06,
"loss": 0.8443769454956055,
"mean_token_accuracy": 0.797667445242405,
"num_tokens": 30689299.0,
"step": 7640
},
{
"entropy": 0.8395522754639387,
"epoch": 2.927669345579793,
"grad_norm": 0.08281564712524414,
"learning_rate": 5.10272592990466e-06,
"loss": 0.8710539817810059,
"mean_token_accuracy": 0.7973509266972542,
"num_tokens": 30724619.0,
"step": 7650
},
{
"entropy": 0.8130493897944688,
"epoch": 2.931496364332185,
"grad_norm": 0.0996284931898117,
"learning_rate": 4.834161407278099e-06,
"loss": 0.8514342308044434,
"mean_token_accuracy": 0.800888329744339,
"num_tokens": 30764224.0,
"step": 7660
},
{
"entropy": 0.7793916609138251,
"epoch": 2.935323383084577,
"grad_norm": 0.09503267705440521,
"learning_rate": 4.565596884651538e-06,
"loss": 0.8305204391479493,
"mean_token_accuracy": 0.8106261268258095,
"num_tokens": 30800800.0,
"step": 7670
},
{
"entropy": 0.817446855083108,
"epoch": 2.9391504018369687,
"grad_norm": 0.13637053966522217,
"learning_rate": 4.2970323620249764e-06,
"loss": 0.839473819732666,
"mean_token_accuracy": 0.8018909886479377,
"num_tokens": 30841481.0,
"step": 7680
},
{
"entropy": 0.8140060313045978,
"epoch": 2.9429774205893606,
"grad_norm": 0.13390128314495087,
"learning_rate": 4.028467839398416e-06,
"loss": 0.8653444290161133,
"mean_token_accuracy": 0.8000675857067108,
"num_tokens": 30880001.0,
"step": 7690
},
{
"entropy": 0.7898532018065453,
"epoch": 2.9468044393417525,
"grad_norm": 0.11585478484630585,
"learning_rate": 3.7599033167718545e-06,
"loss": 0.8074365615844726,
"mean_token_accuracy": 0.8053972944617271,
"num_tokens": 30915563.0,
"step": 7700
},
{
"entropy": 0.8091453645378351,
"epoch": 2.9506314580941444,
"grad_norm": 0.09755035489797592,
"learning_rate": 3.4913387941452935e-06,
"loss": 0.8457134246826172,
"mean_token_accuracy": 0.8031114682555198,
"num_tokens": 30955410.0,
"step": 7710
},
{
"entropy": 0.8444364190101623,
"epoch": 2.9544584768465363,
"grad_norm": 0.1297679990530014,
"learning_rate": 3.2227742715187325e-06,
"loss": 0.910922622680664,
"mean_token_accuracy": 0.7976488128304482,
"num_tokens": 30997246.0,
"step": 7720
},
{
"entropy": 0.8454434804618358,
"epoch": 2.958285495598928,
"grad_norm": 0.15091662108898163,
"learning_rate": 2.954209748892171e-06,
"loss": 0.8977128982543945,
"mean_token_accuracy": 0.7951600447297096,
"num_tokens": 31042192.0,
"step": 7730
},
{
"entropy": 0.838621474429965,
"epoch": 2.96211251435132,
"grad_norm": 0.10101021081209183,
"learning_rate": 2.6856452262656106e-06,
"loss": 0.9142851829528809,
"mean_token_accuracy": 0.7966463148593903,
"num_tokens": 31082777.0,
"step": 7740
},
{
"entropy": 0.8021124713122845,
"epoch": 2.965939533103712,
"grad_norm": 0.11373798549175262,
"learning_rate": 2.4170807036390496e-06,
"loss": 0.845030403137207,
"mean_token_accuracy": 0.8039181783795357,
"num_tokens": 31122973.0,
"step": 7750
},
{
"entropy": 0.8570070005953312,
"epoch": 2.969766551856104,
"grad_norm": 0.0995812863111496,
"learning_rate": 2.1485161810124882e-06,
"loss": 0.8876262664794922,
"mean_token_accuracy": 0.7908932328224182,
"num_tokens": 31166313.0,
"step": 7760
},
{
"entropy": 0.9019658699631691,
"epoch": 2.9735935706084957,
"grad_norm": 0.10546575486660004,
"learning_rate": 1.8799516583859272e-06,
"loss": 0.9777070999145507,
"mean_token_accuracy": 0.7821963891386986,
"num_tokens": 31202060.0,
"step": 7770
},
{
"entropy": 0.9346055820584297,
"epoch": 2.9774205893608876,
"grad_norm": 0.11632298678159714,
"learning_rate": 1.6113871357593663e-06,
"loss": 1.017040729522705,
"mean_token_accuracy": 0.7751505836844444,
"num_tokens": 31241536.0,
"step": 7780
},
{
"entropy": 0.8882534563541412,
"epoch": 2.9812476081132795,
"grad_norm": 0.13064302504062653,
"learning_rate": 1.3428226131328053e-06,
"loss": 0.9505605697631836,
"mean_token_accuracy": 0.7848831593990326,
"num_tokens": 31278060.0,
"step": 7790
},
{
"entropy": 0.8854026839137077,
"epoch": 2.9850746268656714,
"grad_norm": 0.0977831557393074,
"learning_rate": 1.0742580905062441e-06,
"loss": 0.9311306953430176,
"mean_token_accuracy": 0.7847100362181664,
"num_tokens": 31325802.0,
"step": 7800
},
{
"entropy": 0.9448695838451385,
"epoch": 2.9889016456180633,
"grad_norm": 0.11724492162466049,
"learning_rate": 8.056935678796831e-07,
"loss": 0.983949089050293,
"mean_token_accuracy": 0.7636413291096688,
"num_tokens": 31367954.0,
"step": 7810
},
{
"entropy": 0.8787743166089058,
"epoch": 2.992728664370455,
"grad_norm": 0.09530383348464966,
"learning_rate": 5.371290452531221e-07,
"loss": 0.9605165481567383,
"mean_token_accuracy": 0.7847816556692123,
"num_tokens": 31410151.0,
"step": 7820
},
{
"entropy": 0.810061177611351,
"epoch": 2.996555683122847,
"grad_norm": 0.09042539447546005,
"learning_rate": 2.6856452262656103e-07,
"loss": 0.8766719818115234,
"mean_token_accuracy": 0.8047587737441063,
"num_tokens": 31451314.0,
"step": 7830
}
],
"logging_steps": 10,
"max_steps": 7839,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.17346463002948e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}