{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1245547831058502, "epoch": 0.0037313432835820895, "grad_norm": 1.6273682117462158, "learning_rate": 0.0002, "loss": 2.482689619064331, "mean_token_accuracy": 0.5370704382658005, "num_tokens": 16322.0, "step": 1 }, { "entropy": 1.2366806268692017, "epoch": 0.007462686567164179, "grad_norm": 1.4647141695022583, "learning_rate": 0.0002, "loss": 2.1726250648498535, "mean_token_accuracy": 0.5635550767183304, "num_tokens": 32624.0, "step": 2 }, { "entropy": 1.3885400295257568, "epoch": 0.011194029850746268, "grad_norm": 1.1605029106140137, "learning_rate": 0.0002, "loss": 1.7200348377227783, "mean_token_accuracy": 0.596715897321701, "num_tokens": 48781.0, "step": 3 }, { "entropy": 1.3746764063835144, "epoch": 0.014925373134328358, "grad_norm": 0.932724118232727, "learning_rate": 0.0002, "loss": 1.4033262729644775, "mean_token_accuracy": 0.6351611912250519, "num_tokens": 65119.0, "step": 4 }, { "entropy": 1.3346630930900574, "epoch": 0.018656716417910446, "grad_norm": 1.0168325901031494, "learning_rate": 0.0002, "loss": 1.2731056213378906, "mean_token_accuracy": 0.6540397107601166, "num_tokens": 81735.0, "step": 5 }, { "entropy": 1.2580328285694122, "epoch": 0.022388059701492536, "grad_norm": 0.5265628695487976, "learning_rate": 0.0002, "loss": 1.1689575910568237, "mean_token_accuracy": 0.6603054255247116, "num_tokens": 98081.0, "step": 6 }, { "entropy": 1.1583980917930603, "epoch": 0.026119402985074626, "grad_norm": 0.4118923842906952, "learning_rate": 0.0002, "loss": 1.078832983970642, "mean_token_accuracy": 0.6707835346460342, "num_tokens": 114185.0, "step": 7 }, { "entropy": 1.0589762330055237, "epoch": 0.029850746268656716, "grad_norm": 0.41156867146492004, "learning_rate": 0.0002, "loss": 1.0044282674789429, "mean_token_accuracy": 0.6823764145374298, "num_tokens": 130498.0, "step": 8 }, { "entropy": 0.9924780577421188, "epoch": 0.033582089552238806, "grad_norm": 0.5590541362762451, "learning_rate": 0.0002, "loss": 0.9619787931442261, "mean_token_accuracy": 0.6892934292554855, "num_tokens": 146820.0, "step": 9 }, { "entropy": 0.9725948423147202, "epoch": 0.03731343283582089, "grad_norm": 0.4368315637111664, "learning_rate": 0.0002, "loss": 0.8887773752212524, "mean_token_accuracy": 0.7022321075201035, "num_tokens": 163228.0, "step": 10 }, { "entropy": 0.9371236711740494, "epoch": 0.041044776119402986, "grad_norm": 0.43285107612609863, "learning_rate": 0.0002, "loss": 0.8475317358970642, "mean_token_accuracy": 0.706597164273262, "num_tokens": 179681.0, "step": 11 }, { "entropy": 0.8875125199556351, "epoch": 0.04477611940298507, "grad_norm": 6.3542633056640625, "learning_rate": 0.0002, "loss": 0.8327640295028687, "mean_token_accuracy": 0.7034512162208557, "num_tokens": 196348.0, "step": 12 }, { "entropy": 0.8179645836353302, "epoch": 0.048507462686567165, "grad_norm": 0.44303053617477417, "learning_rate": 0.0002, "loss": 0.7809244394302368, "mean_token_accuracy": 0.7242531627416611, "num_tokens": 213052.0, "step": 13 }, { "entropy": 0.7955248355865479, "epoch": 0.05223880597014925, "grad_norm": 0.8472722172737122, "learning_rate": 0.0002, "loss": 0.7439039945602417, "mean_token_accuracy": 0.7328712791204453, "num_tokens": 229644.0, "step": 14 }, { "entropy": 0.7496374696493149, "epoch": 0.055970149253731345, "grad_norm": 2.1060233116149902, "learning_rate": 0.0002, "loss": 0.7229201793670654, "mean_token_accuracy": 0.7347650229930878, "num_tokens": 246138.0, "step": 15 }, { "entropy": 0.6943426132202148, "epoch": 0.05970149253731343, "grad_norm": 0.4210701882839203, "learning_rate": 0.0002, "loss": 0.6997749209403992, "mean_token_accuracy": 0.7390953898429871, "num_tokens": 262489.0, "step": 16 }, { "entropy": 0.689127504825592, "epoch": 0.06343283582089553, "grad_norm": 0.3434777855873108, "learning_rate": 0.0002, "loss": 0.6818345189094543, "mean_token_accuracy": 0.7421105057001114, "num_tokens": 278800.0, "step": 17 }, { "entropy": 0.6688047796487808, "epoch": 0.06716417910447761, "grad_norm": 0.43096405267715454, "learning_rate": 0.0002, "loss": 0.65822833776474, "mean_token_accuracy": 0.7513366043567657, "num_tokens": 295153.0, "step": 18 }, { "entropy": 0.6683900207281113, "epoch": 0.0708955223880597, "grad_norm": 0.2875062823295593, "learning_rate": 0.0002, "loss": 0.6513902544975281, "mean_token_accuracy": 0.7488225400447845, "num_tokens": 311631.0, "step": 19 }, { "entropy": 0.6681984066963196, "epoch": 0.07462686567164178, "grad_norm": 0.34322109818458557, "learning_rate": 0.0002, "loss": 0.6516908407211304, "mean_token_accuracy": 0.7477276474237442, "num_tokens": 327810.0, "step": 20 }, { "entropy": 0.657578319311142, "epoch": 0.07835820895522388, "grad_norm": 0.3035106360912323, "learning_rate": 0.0002, "loss": 0.6391871571540833, "mean_token_accuracy": 0.7518605440855026, "num_tokens": 344148.0, "step": 21 }, { "entropy": 0.6416258066892624, "epoch": 0.08208955223880597, "grad_norm": 0.2896852493286133, "learning_rate": 0.0002, "loss": 0.6108838319778442, "mean_token_accuracy": 0.7639093101024628, "num_tokens": 360467.0, "step": 22 }, { "entropy": 0.6126270890235901, "epoch": 0.08582089552238806, "grad_norm": 0.28889304399490356, "learning_rate": 0.0002, "loss": 0.5967156887054443, "mean_token_accuracy": 0.7673086673021317, "num_tokens": 376740.0, "step": 23 }, { "entropy": 0.607315257191658, "epoch": 0.08955223880597014, "grad_norm": 0.26258257031440735, "learning_rate": 0.0002, "loss": 0.5931278467178345, "mean_token_accuracy": 0.7683079540729523, "num_tokens": 393035.0, "step": 24 }, { "entropy": 0.6071023941040039, "epoch": 0.09328358208955224, "grad_norm": 0.2627218961715698, "learning_rate": 0.0002, "loss": 0.5975178480148315, "mean_token_accuracy": 0.7655056416988373, "num_tokens": 409513.0, "step": 25 }, { "entropy": 0.6166605055332184, "epoch": 0.09701492537313433, "grad_norm": 0.2591419517993927, "learning_rate": 0.0002, "loss": 0.6048401594161987, "mean_token_accuracy": 0.7606765776872635, "num_tokens": 425838.0, "step": 26 }, { "entropy": 0.5888677388429642, "epoch": 0.10074626865671642, "grad_norm": 0.23267361521720886, "learning_rate": 0.0002, "loss": 0.5792773365974426, "mean_token_accuracy": 0.7714710682630539, "num_tokens": 442275.0, "step": 27 }, { "entropy": 0.6097696423530579, "epoch": 0.1044776119402985, "grad_norm": 0.25834810733795166, "learning_rate": 0.0002, "loss": 0.6025165915489197, "mean_token_accuracy": 0.7594742327928543, "num_tokens": 458633.0, "step": 28 }, { "entropy": 0.5876014679670334, "epoch": 0.10820895522388059, "grad_norm": 0.24802696704864502, "learning_rate": 0.0002, "loss": 0.577584445476532, "mean_token_accuracy": 0.7709765136241913, "num_tokens": 475114.0, "step": 29 }, { "entropy": 0.577396959066391, "epoch": 0.11194029850746269, "grad_norm": 0.24076423048973083, "learning_rate": 0.0002, "loss": 0.5727118849754333, "mean_token_accuracy": 0.7744314223527908, "num_tokens": 491389.0, "step": 30 }, { "entropy": 0.5895106196403503, "epoch": 0.11567164179104478, "grad_norm": 0.21412523090839386, "learning_rate": 0.0002, "loss": 0.5863120555877686, "mean_token_accuracy": 0.7693659514188766, "num_tokens": 507969.0, "step": 31 }, { "entropy": 0.5717187374830246, "epoch": 0.11940298507462686, "grad_norm": 0.1944267749786377, "learning_rate": 0.0002, "loss": 0.568047046661377, "mean_token_accuracy": 0.7752875536680222, "num_tokens": 524169.0, "step": 32 }, { "entropy": 0.5736564546823502, "epoch": 0.12313432835820895, "grad_norm": 0.23050418496131897, "learning_rate": 0.0002, "loss": 0.5761005282402039, "mean_token_accuracy": 0.7727629542350769, "num_tokens": 540463.0, "step": 33 }, { "entropy": 0.589300200343132, "epoch": 0.12686567164179105, "grad_norm": 0.21381224691867828, "learning_rate": 0.0002, "loss": 0.5865699052810669, "mean_token_accuracy": 0.7672912329435349, "num_tokens": 557025.0, "step": 34 }, { "entropy": 0.5663471221923828, "epoch": 0.13059701492537312, "grad_norm": 0.21070359647274017, "learning_rate": 0.0002, "loss": 0.5665886998176575, "mean_token_accuracy": 0.7742704451084137, "num_tokens": 573346.0, "step": 35 }, { "entropy": 0.5744731575250626, "epoch": 0.13432835820895522, "grad_norm": 0.2001814991235733, "learning_rate": 0.0002, "loss": 0.5742104649543762, "mean_token_accuracy": 0.7708545625209808, "num_tokens": 589678.0, "step": 36 }, { "entropy": 0.5785809606313705, "epoch": 0.13805970149253732, "grad_norm": 0.1615011990070343, "learning_rate": 0.0002, "loss": 0.5697225332260132, "mean_token_accuracy": 0.7719135135412216, "num_tokens": 606081.0, "step": 37 }, { "entropy": 0.5571976453065872, "epoch": 0.1417910447761194, "grad_norm": 0.1849016547203064, "learning_rate": 0.0002, "loss": 0.5493215322494507, "mean_token_accuracy": 0.7809059321880341, "num_tokens": 622168.0, "step": 38 }, { "entropy": 0.5916045606136322, "epoch": 0.1455223880597015, "grad_norm": 0.19314663112163544, "learning_rate": 0.0002, "loss": 0.5800106525421143, "mean_token_accuracy": 0.7677847892045975, "num_tokens": 638480.0, "step": 39 }, { "entropy": 0.5791963338851929, "epoch": 0.14925373134328357, "grad_norm": 0.18138627707958221, "learning_rate": 0.0002, "loss": 0.5779139399528503, "mean_token_accuracy": 0.767883911728859, "num_tokens": 654651.0, "step": 40 }, { "entropy": 0.5743307769298553, "epoch": 0.15298507462686567, "grad_norm": 0.17246870696544647, "learning_rate": 0.0002, "loss": 0.5706084370613098, "mean_token_accuracy": 0.7700994461774826, "num_tokens": 670948.0, "step": 41 }, { "entropy": 0.5432448089122772, "epoch": 0.15671641791044777, "grad_norm": 0.19110122323036194, "learning_rate": 0.0002, "loss": 0.5484994649887085, "mean_token_accuracy": 0.7811570912599564, "num_tokens": 687540.0, "step": 42 }, { "entropy": 0.5750848650932312, "epoch": 0.16044776119402984, "grad_norm": 0.1716981679201126, "learning_rate": 0.0002, "loss": 0.579657793045044, "mean_token_accuracy": 0.7663937658071518, "num_tokens": 704015.0, "step": 43 }, { "entropy": 0.561103492975235, "epoch": 0.16417910447761194, "grad_norm": 0.1821409910917282, "learning_rate": 0.0002, "loss": 0.5600441098213196, "mean_token_accuracy": 0.774185299873352, "num_tokens": 720451.0, "step": 44 }, { "entropy": 0.5737239718437195, "epoch": 0.16791044776119404, "grad_norm": 0.174806609749794, "learning_rate": 0.0002, "loss": 0.5676751732826233, "mean_token_accuracy": 0.770918071269989, "num_tokens": 736682.0, "step": 45 }, { "entropy": 0.5712144523859024, "epoch": 0.17164179104477612, "grad_norm": 0.18145714700222015, "learning_rate": 0.0002, "loss": 0.5659744143486023, "mean_token_accuracy": 0.7729035317897797, "num_tokens": 753217.0, "step": 46 }, { "entropy": 0.5745559930801392, "epoch": 0.17537313432835822, "grad_norm": 0.1639634072780609, "learning_rate": 0.0002, "loss": 0.5735749006271362, "mean_token_accuracy": 0.770696684718132, "num_tokens": 769822.0, "step": 47 }, { "entropy": 0.5605441480875015, "epoch": 0.1791044776119403, "grad_norm": 0.18234604597091675, "learning_rate": 0.0002, "loss": 0.5633875131607056, "mean_token_accuracy": 0.7749416828155518, "num_tokens": 786359.0, "step": 48 }, { "entropy": 0.5490550547838211, "epoch": 0.1828358208955224, "grad_norm": 0.18433044850826263, "learning_rate": 0.0002, "loss": 0.5567543506622314, "mean_token_accuracy": 0.7788835614919662, "num_tokens": 802963.0, "step": 49 }, { "entropy": 0.5616811364889145, "epoch": 0.1865671641791045, "grad_norm": 0.15450991690158844, "learning_rate": 0.0002, "loss": 0.5657309889793396, "mean_token_accuracy": 0.774708479642868, "num_tokens": 819668.0, "step": 50 }, { "entropy": 0.5582916140556335, "epoch": 0.19029850746268656, "grad_norm": 0.14035002887248993, "learning_rate": 0.0002, "loss": 0.551848828792572, "mean_token_accuracy": 0.7806462794542313, "num_tokens": 835858.0, "step": 51 }, { "entropy": 0.5508538037538528, "epoch": 0.19402985074626866, "grad_norm": 0.17560449242591858, "learning_rate": 0.0002, "loss": 0.5406010150909424, "mean_token_accuracy": 0.7840944528579712, "num_tokens": 852146.0, "step": 52 }, { "entropy": 0.5527998208999634, "epoch": 0.19776119402985073, "grad_norm": 0.15798722207546234, "learning_rate": 0.0002, "loss": 0.5423352718353271, "mean_token_accuracy": 0.782536968588829, "num_tokens": 868660.0, "step": 53 }, { "entropy": 0.5586383640766144, "epoch": 0.20149253731343283, "grad_norm": 0.15477648377418518, "learning_rate": 0.0002, "loss": 0.5521284937858582, "mean_token_accuracy": 0.7778433710336685, "num_tokens": 885133.0, "step": 54 }, { "entropy": 0.5694690942764282, "epoch": 0.20522388059701493, "grad_norm": 0.16944538056850433, "learning_rate": 0.0002, "loss": 0.5759178400039673, "mean_token_accuracy": 0.7684573978185654, "num_tokens": 901816.0, "step": 55 }, { "entropy": 0.5426557958126068, "epoch": 0.208955223880597, "grad_norm": 0.16989077627658844, "learning_rate": 0.0002, "loss": 0.5477243661880493, "mean_token_accuracy": 0.7811359614133835, "num_tokens": 918275.0, "step": 56 }, { "entropy": 0.5754421502351761, "epoch": 0.2126865671641791, "grad_norm": 0.15350034832954407, "learning_rate": 0.0002, "loss": 0.5865313410758972, "mean_token_accuracy": 0.7631517648696899, "num_tokens": 934630.0, "step": 57 }, { "entropy": 0.5742448717355728, "epoch": 0.21641791044776118, "grad_norm": 0.18639785051345825, "learning_rate": 0.0002, "loss": 0.575249433517456, "mean_token_accuracy": 0.7669856697320938, "num_tokens": 950844.0, "step": 58 }, { "entropy": 0.5708972364664078, "epoch": 0.22014925373134328, "grad_norm": 0.15229687094688416, "learning_rate": 0.0002, "loss": 0.5669128894805908, "mean_token_accuracy": 0.7711773067712784, "num_tokens": 966973.0, "step": 59 }, { "entropy": 0.5682551562786102, "epoch": 0.22388059701492538, "grad_norm": 0.1677161157131195, "learning_rate": 0.0002, "loss": 0.5593635439872742, "mean_token_accuracy": 0.7725416421890259, "num_tokens": 983221.0, "step": 60 }, { "entropy": 0.5679890364408493, "epoch": 0.22761194029850745, "grad_norm": 0.18057392537593842, "learning_rate": 0.0002, "loss": 0.5580260753631592, "mean_token_accuracy": 0.7754660546779633, "num_tokens": 999424.0, "step": 61 }, { "entropy": 0.5804609507322311, "epoch": 0.23134328358208955, "grad_norm": 0.143987238407135, "learning_rate": 0.0002, "loss": 0.570034384727478, "mean_token_accuracy": 0.7708772122859955, "num_tokens": 1015903.0, "step": 62 }, { "entropy": 0.5699467211961746, "epoch": 0.23507462686567165, "grad_norm": 0.15400487184524536, "learning_rate": 0.0002, "loss": 0.5733590126037598, "mean_token_accuracy": 0.7680967003107071, "num_tokens": 1032549.0, "step": 63 }, { "entropy": 0.5582360923290253, "epoch": 0.23880597014925373, "grad_norm": 0.17451652884483337, "learning_rate": 0.0002, "loss": 0.5732641220092773, "mean_token_accuracy": 0.7692582160234451, "num_tokens": 1048935.0, "step": 64 }, { "entropy": 0.5475955605506897, "epoch": 0.24253731343283583, "grad_norm": 0.1549489051103592, "learning_rate": 0.0002, "loss": 0.5526400804519653, "mean_token_accuracy": 0.7788676619529724, "num_tokens": 1065104.0, "step": 65 }, { "entropy": 0.5664391964673996, "epoch": 0.2462686567164179, "grad_norm": 0.14476634562015533, "learning_rate": 0.0002, "loss": 0.5617241263389587, "mean_token_accuracy": 0.7786661833524704, "num_tokens": 1081393.0, "step": 66 }, { "entropy": 0.5560042560100555, "epoch": 0.25, "grad_norm": 0.16752755641937256, "learning_rate": 0.0002, "loss": 0.5503427982330322, "mean_token_accuracy": 0.7781690061092377, "num_tokens": 1097575.0, "step": 67 }, { "entropy": 0.5609089732170105, "epoch": 0.2537313432835821, "grad_norm": 0.17903153598308563, "learning_rate": 0.0002, "loss": 0.5497362017631531, "mean_token_accuracy": 0.7771856188774109, "num_tokens": 1113937.0, "step": 68 }, { "entropy": 0.5642896294593811, "epoch": 0.2574626865671642, "grad_norm": 0.16974171996116638, "learning_rate": 0.0002, "loss": 0.563960611820221, "mean_token_accuracy": 0.7738614976406097, "num_tokens": 1130103.0, "step": 69 }, { "entropy": 0.5726548284292221, "epoch": 0.26119402985074625, "grad_norm": 0.14435403048992157, "learning_rate": 0.0002, "loss": 0.5712643265724182, "mean_token_accuracy": 0.7692683339118958, "num_tokens": 1146423.0, "step": 70 }, { "entropy": 0.5441250950098038, "epoch": 0.26492537313432835, "grad_norm": 0.14253664016723633, "learning_rate": 0.0002, "loss": 0.544674813747406, "mean_token_accuracy": 0.7780104726552963, "num_tokens": 1162733.0, "step": 71 }, { "entropy": 0.5444895774126053, "epoch": 0.26865671641791045, "grad_norm": 0.14379332959651947, "learning_rate": 0.0002, "loss": 0.5479044318199158, "mean_token_accuracy": 0.7788853794336319, "num_tokens": 1178848.0, "step": 72 }, { "entropy": 0.5541743487119675, "epoch": 0.27238805970149255, "grad_norm": 0.1346455216407776, "learning_rate": 0.0002, "loss": 0.5573484897613525, "mean_token_accuracy": 0.7779737412929535, "num_tokens": 1195357.0, "step": 73 }, { "entropy": 0.5649544596672058, "epoch": 0.27611940298507465, "grad_norm": 0.136294886469841, "learning_rate": 0.0002, "loss": 0.5603638291358948, "mean_token_accuracy": 0.7719381302595139, "num_tokens": 1211921.0, "step": 74 }, { "entropy": 0.5381972342729568, "epoch": 0.2798507462686567, "grad_norm": 0.12611278891563416, "learning_rate": 0.0002, "loss": 0.533305287361145, "mean_token_accuracy": 0.7839507907629013, "num_tokens": 1228381.0, "step": 75 }, { "entropy": 0.5607545524835587, "epoch": 0.2835820895522388, "grad_norm": 0.1318938434123993, "learning_rate": 0.0002, "loss": 0.5617884397506714, "mean_token_accuracy": 0.7753878086805344, "num_tokens": 1244769.0, "step": 76 }, { "entropy": 0.5631186813116074, "epoch": 0.2873134328358209, "grad_norm": 0.1374509632587433, "learning_rate": 0.0002, "loss": 0.5608174204826355, "mean_token_accuracy": 0.7753797173500061, "num_tokens": 1261197.0, "step": 77 }, { "entropy": 0.5789693742990494, "epoch": 0.291044776119403, "grad_norm": 0.1388232558965683, "learning_rate": 0.0002, "loss": 0.5779432058334351, "mean_token_accuracy": 0.7658645212650299, "num_tokens": 1277998.0, "step": 78 }, { "entropy": 0.5439933687448502, "epoch": 0.2947761194029851, "grad_norm": 0.15839162468910217, "learning_rate": 0.0002, "loss": 0.5506725311279297, "mean_token_accuracy": 0.7786760181188583, "num_tokens": 1294293.0, "step": 79 }, { "entropy": 0.5581207424402237, "epoch": 0.29850746268656714, "grad_norm": 0.16782821714878082, "learning_rate": 0.0002, "loss": 0.56475830078125, "mean_token_accuracy": 0.7746179699897766, "num_tokens": 1310588.0, "step": 80 }, { "entropy": 0.588770255446434, "epoch": 0.30223880597014924, "grad_norm": 0.17123626172542572, "learning_rate": 0.0002, "loss": 0.5832362174987793, "mean_token_accuracy": 0.7644577324390411, "num_tokens": 1327129.0, "step": 81 }, { "entropy": 0.5512869954109192, "epoch": 0.30597014925373134, "grad_norm": 0.12713028490543365, "learning_rate": 0.0002, "loss": 0.538611888885498, "mean_token_accuracy": 0.7855131775140762, "num_tokens": 1343481.0, "step": 82 }, { "entropy": 0.5826849788427353, "epoch": 0.30970149253731344, "grad_norm": 0.15148760378360748, "learning_rate": 0.0002, "loss": 0.580060601234436, "mean_token_accuracy": 0.7675654888153076, "num_tokens": 1359709.0, "step": 83 }, { "entropy": 0.581380233168602, "epoch": 0.31343283582089554, "grad_norm": 0.1486639529466629, "learning_rate": 0.0002, "loss": 0.5737113952636719, "mean_token_accuracy": 0.7694955766201019, "num_tokens": 1376209.0, "step": 84 }, { "entropy": 0.5577070415019989, "epoch": 0.31716417910447764, "grad_norm": 0.14268359541893005, "learning_rate": 0.0002, "loss": 0.5592327117919922, "mean_token_accuracy": 0.7741715162992477, "num_tokens": 1392271.0, "step": 85 }, { "entropy": 0.5519531518220901, "epoch": 0.3208955223880597, "grad_norm": 0.19115421175956726, "learning_rate": 0.0002, "loss": 0.5649857521057129, "mean_token_accuracy": 0.7735026627779007, "num_tokens": 1408680.0, "step": 86 }, { "entropy": 0.5389833152294159, "epoch": 0.3246268656716418, "grad_norm": 0.1511470526456833, "learning_rate": 0.0002, "loss": 0.5499240159988403, "mean_token_accuracy": 0.7795019447803497, "num_tokens": 1425241.0, "step": 87 }, { "entropy": 0.5535243153572083, "epoch": 0.3283582089552239, "grad_norm": 0.13003994524478912, "learning_rate": 0.0002, "loss": 0.5464329123497009, "mean_token_accuracy": 0.7804087400436401, "num_tokens": 1441530.0, "step": 88 }, { "entropy": 0.5626068562269211, "epoch": 0.332089552238806, "grad_norm": 0.1472884714603424, "learning_rate": 0.0002, "loss": 0.5579521656036377, "mean_token_accuracy": 0.7757730484008789, "num_tokens": 1457843.0, "step": 89 }, { "entropy": 0.5722664147615433, "epoch": 0.3358208955223881, "grad_norm": 0.14036864042282104, "learning_rate": 0.0002, "loss": 0.5636782050132751, "mean_token_accuracy": 0.7743526548147202, "num_tokens": 1474209.0, "step": 90 }, { "entropy": 0.5577493757009506, "epoch": 0.33955223880597013, "grad_norm": 0.12171963602304459, "learning_rate": 0.0002, "loss": 0.5502208471298218, "mean_token_accuracy": 0.7802051454782486, "num_tokens": 1490390.0, "step": 91 }, { "entropy": 0.547787681221962, "epoch": 0.34328358208955223, "grad_norm": 0.1525270640850067, "learning_rate": 0.0002, "loss": 0.5497896075248718, "mean_token_accuracy": 0.7809301018714905, "num_tokens": 1506675.0, "step": 92 }, { "entropy": 0.5554802119731903, "epoch": 0.34701492537313433, "grad_norm": 0.1502194106578827, "learning_rate": 0.0002, "loss": 0.5645507574081421, "mean_token_accuracy": 0.7722718119621277, "num_tokens": 1523263.0, "step": 93 }, { "entropy": 0.5594951659440994, "epoch": 0.35074626865671643, "grad_norm": 0.13331742584705353, "learning_rate": 0.0002, "loss": 0.5637622475624084, "mean_token_accuracy": 0.7736085057258606, "num_tokens": 1540004.0, "step": 94 }, { "entropy": 0.5551023185253143, "epoch": 0.35447761194029853, "grad_norm": 0.1213943138718605, "learning_rate": 0.0002, "loss": 0.5518482327461243, "mean_token_accuracy": 0.7777320593595505, "num_tokens": 1556547.0, "step": 95 }, { "entropy": 0.557207852602005, "epoch": 0.3582089552238806, "grad_norm": 0.1314304620027542, "learning_rate": 0.0002, "loss": 0.5546322464942932, "mean_token_accuracy": 0.7763337790966034, "num_tokens": 1572997.0, "step": 96 }, { "entropy": 0.556539997458458, "epoch": 0.3619402985074627, "grad_norm": 0.14363965392112732, "learning_rate": 0.0002, "loss": 0.5549654364585876, "mean_token_accuracy": 0.7731640189886093, "num_tokens": 1589289.0, "step": 97 }, { "entropy": 0.568042978644371, "epoch": 0.3656716417910448, "grad_norm": 0.11934816092252731, "learning_rate": 0.0002, "loss": 0.5679082274436951, "mean_token_accuracy": 0.768884465098381, "num_tokens": 1605516.0, "step": 98 }, { "entropy": 0.5484860688447952, "epoch": 0.3694029850746269, "grad_norm": 0.16246412694454193, "learning_rate": 0.0002, "loss": 0.5522934794425964, "mean_token_accuracy": 0.776402086019516, "num_tokens": 1622108.0, "step": 99 }, { "entropy": 0.5548600405454636, "epoch": 0.373134328358209, "grad_norm": 0.12589918076992035, "learning_rate": 0.0002, "loss": 0.5544294714927673, "mean_token_accuracy": 0.7768803536891937, "num_tokens": 1638659.0, "step": 100 }, { "entropy": 0.5692953765392303, "epoch": 0.376865671641791, "grad_norm": 0.12726213037967682, "learning_rate": 0.0002, "loss": 0.5662153363227844, "mean_token_accuracy": 0.7698657661676407, "num_tokens": 1654877.0, "step": 101 }, { "entropy": 0.560271605849266, "epoch": 0.3805970149253731, "grad_norm": 0.13260267674922943, "learning_rate": 0.0002, "loss": 0.5487651824951172, "mean_token_accuracy": 0.7778149247169495, "num_tokens": 1671436.0, "step": 102 }, { "entropy": 0.5644612163305283, "epoch": 0.3843283582089552, "grad_norm": 0.13504348695278168, "learning_rate": 0.0002, "loss": 0.5573433041572571, "mean_token_accuracy": 0.7781724482774734, "num_tokens": 1687817.0, "step": 103 }, { "entropy": 0.55845807492733, "epoch": 0.3880597014925373, "grad_norm": 0.1202038824558258, "learning_rate": 0.0002, "loss": 0.5552661418914795, "mean_token_accuracy": 0.7772795557975769, "num_tokens": 1704568.0, "step": 104 }, { "entropy": 0.5440086871385574, "epoch": 0.3917910447761194, "grad_norm": 0.12728044390678406, "learning_rate": 0.0002, "loss": 0.5538181662559509, "mean_token_accuracy": 0.7744371294975281, "num_tokens": 1720774.0, "step": 105 }, { "entropy": 0.5394178926944733, "epoch": 0.39552238805970147, "grad_norm": 0.14098908007144928, "learning_rate": 0.0002, "loss": 0.552955150604248, "mean_token_accuracy": 0.776681050658226, "num_tokens": 1737050.0, "step": 106 }, { "entropy": 0.5602739453315735, "epoch": 0.39925373134328357, "grad_norm": 0.1373777687549591, "learning_rate": 0.0002, "loss": 0.5666458010673523, "mean_token_accuracy": 0.7684379816055298, "num_tokens": 1753616.0, "step": 107 }, { "entropy": 0.5688735842704773, "epoch": 0.40298507462686567, "grad_norm": 0.12947675585746765, "learning_rate": 0.0002, "loss": 0.5618643760681152, "mean_token_accuracy": 0.7724806815385818, "num_tokens": 1770077.0, "step": 108 }, { "entropy": 0.569103866815567, "epoch": 0.40671641791044777, "grad_norm": 0.1482311338186264, "learning_rate": 0.0002, "loss": 0.5661442875862122, "mean_token_accuracy": 0.7717588543891907, "num_tokens": 1786557.0, "step": 109 }, { "entropy": 0.5550140291452408, "epoch": 0.41044776119402987, "grad_norm": 0.13066281378269196, "learning_rate": 0.0002, "loss": 0.5546547770500183, "mean_token_accuracy": 0.7755738943815231, "num_tokens": 1803029.0, "step": 110 }, { "entropy": 0.5526944696903229, "epoch": 0.4141791044776119, "grad_norm": 0.11755255609750748, "learning_rate": 0.0002, "loss": 0.5436115860939026, "mean_token_accuracy": 0.779561460018158, "num_tokens": 1819561.0, "step": 111 }, { "entropy": 0.5528556704521179, "epoch": 0.417910447761194, "grad_norm": 0.14607787132263184, "learning_rate": 0.0002, "loss": 0.5589385032653809, "mean_token_accuracy": 0.7751224488019943, "num_tokens": 1835992.0, "step": 112 }, { "entropy": 0.5393927693367004, "epoch": 0.4216417910447761, "grad_norm": 0.12512564659118652, "learning_rate": 0.0002, "loss": 0.5430585741996765, "mean_token_accuracy": 0.7801438719034195, "num_tokens": 1852545.0, "step": 113 }, { "entropy": 0.5346394777297974, "epoch": 0.4253731343283582, "grad_norm": 0.13879786431789398, "learning_rate": 0.0002, "loss": 0.5470178723335266, "mean_token_accuracy": 0.7800125926733017, "num_tokens": 1868767.0, "step": 114 }, { "entropy": 0.552959531545639, "epoch": 0.4291044776119403, "grad_norm": 0.13570789992809296, "learning_rate": 0.0002, "loss": 0.5606270432472229, "mean_token_accuracy": 0.7728203237056732, "num_tokens": 1885207.0, "step": 115 }, { "entropy": 0.5681584924459457, "epoch": 0.43283582089552236, "grad_norm": 0.13311345875263214, "learning_rate": 0.0002, "loss": 0.561408519744873, "mean_token_accuracy": 0.7729704976081848, "num_tokens": 1901670.0, "step": 116 }, { "entropy": 0.580392524600029, "epoch": 0.43656716417910446, "grad_norm": 0.15006045997142792, "learning_rate": 0.0002, "loss": 0.5710599422454834, "mean_token_accuracy": 0.7692873626947403, "num_tokens": 1918297.0, "step": 117 }, { "entropy": 0.5402243435382843, "epoch": 0.44029850746268656, "grad_norm": 0.13022655248641968, "learning_rate": 0.0002, "loss": 0.5290783047676086, "mean_token_accuracy": 0.7855078428983688, "num_tokens": 1934811.0, "step": 118 }, { "entropy": 0.5673187673091888, "epoch": 0.44402985074626866, "grad_norm": 0.1210206151008606, "learning_rate": 0.0002, "loss": 0.5625845193862915, "mean_token_accuracy": 0.771060049533844, "num_tokens": 1951276.0, "step": 119 }, { "entropy": 0.5444270074367523, "epoch": 0.44776119402985076, "grad_norm": 0.14453133940696716, "learning_rate": 0.0002, "loss": 0.5478600263595581, "mean_token_accuracy": 0.7782215029001236, "num_tokens": 1967851.0, "step": 120 }, { "entropy": 0.5516166985034943, "epoch": 0.45149253731343286, "grad_norm": 0.15330393612384796, "learning_rate": 0.0002, "loss": 0.5627217292785645, "mean_token_accuracy": 0.7735389173030853, "num_tokens": 1984175.0, "step": 121 }, { "entropy": 0.5447670072317123, "epoch": 0.4552238805970149, "grad_norm": 0.11896508932113647, "learning_rate": 0.0002, "loss": 0.5453386306762695, "mean_token_accuracy": 0.7792693227529526, "num_tokens": 2000419.0, "step": 122 }, { "entropy": 0.5593693852424622, "epoch": 0.458955223880597, "grad_norm": 0.14641404151916504, "learning_rate": 0.0002, "loss": 0.5527093410491943, "mean_token_accuracy": 0.7784133702516556, "num_tokens": 2016812.0, "step": 123 }, { "entropy": 0.5516424775123596, "epoch": 0.4626865671641791, "grad_norm": 0.13001076877117157, "learning_rate": 0.0002, "loss": 0.5495356917381287, "mean_token_accuracy": 0.7777290046215057, "num_tokens": 2032898.0, "step": 124 }, { "entropy": 0.5469458252191544, "epoch": 0.4664179104477612, "grad_norm": 0.12713271379470825, "learning_rate": 0.0002, "loss": 0.5466877222061157, "mean_token_accuracy": 0.7783260345458984, "num_tokens": 2049023.0, "step": 125 }, { "entropy": 0.5528912246227264, "epoch": 0.4701492537313433, "grad_norm": 0.13111256062984467, "learning_rate": 0.0002, "loss": 0.5582880973815918, "mean_token_accuracy": 0.7739576250314713, "num_tokens": 2065421.0, "step": 126 }, { "entropy": 0.536289632320404, "epoch": 0.47388059701492535, "grad_norm": 0.1449650228023529, "learning_rate": 0.0002, "loss": 0.5477018356323242, "mean_token_accuracy": 0.7764868587255478, "num_tokens": 2081738.0, "step": 127 }, { "entropy": 0.5412490218877792, "epoch": 0.47761194029850745, "grad_norm": 0.12087342143058777, "learning_rate": 0.0002, "loss": 0.5445610880851746, "mean_token_accuracy": 0.7799812257289886, "num_tokens": 2098128.0, "step": 128 }, { "entropy": 0.5749060362577438, "epoch": 0.48134328358208955, "grad_norm": 0.13593946397304535, "learning_rate": 0.0002, "loss": 0.5713242292404175, "mean_token_accuracy": 0.7683141082525253, "num_tokens": 2114660.0, "step": 129 }, { "entropy": 0.5624695718288422, "epoch": 0.48507462686567165, "grad_norm": 0.13926997780799866, "learning_rate": 0.0002, "loss": 0.5603138208389282, "mean_token_accuracy": 0.7724832147359848, "num_tokens": 2130850.0, "step": 130 }, { "entropy": 0.564590647816658, "epoch": 0.48880597014925375, "grad_norm": 0.1541988104581833, "learning_rate": 0.0002, "loss": 0.5548843145370483, "mean_token_accuracy": 0.7774635404348373, "num_tokens": 2147198.0, "step": 131 }, { "entropy": 0.5638516694307327, "epoch": 0.4925373134328358, "grad_norm": 0.14475074410438538, "learning_rate": 0.0002, "loss": 0.559626579284668, "mean_token_accuracy": 0.7742670625448227, "num_tokens": 2163592.0, "step": 132 }, { "entropy": 0.546675980091095, "epoch": 0.4962686567164179, "grad_norm": 0.14459353685379028, "learning_rate": 0.0002, "loss": 0.5525697469711304, "mean_token_accuracy": 0.7782329767942429, "num_tokens": 2179735.0, "step": 133 }, { "entropy": 0.5720339864492416, "epoch": 0.5, "grad_norm": 0.16138529777526855, "learning_rate": 0.0002, "loss": 0.5745345950126648, "mean_token_accuracy": 0.7678724527359009, "num_tokens": 2196300.0, "step": 134 }, { "entropy": 0.5302732288837433, "epoch": 0.503731343283582, "grad_norm": 0.13007810711860657, "learning_rate": 0.0002, "loss": 0.5221583843231201, "mean_token_accuracy": 0.786575123667717, "num_tokens": 2212703.0, "step": 135 }, { "entropy": 0.5611361563205719, "epoch": 0.5074626865671642, "grad_norm": 0.16084182262420654, "learning_rate": 0.0002, "loss": 0.557313084602356, "mean_token_accuracy": 0.7753567546606064, "num_tokens": 2229364.0, "step": 136 }, { "entropy": 0.5539422780275345, "epoch": 0.5111940298507462, "grad_norm": 0.1412162035703659, "learning_rate": 0.0002, "loss": 0.559614896774292, "mean_token_accuracy": 0.7726200222969055, "num_tokens": 2245576.0, "step": 137 }, { "entropy": 0.562326043844223, "epoch": 0.5149253731343284, "grad_norm": 0.12138223648071289, "learning_rate": 0.0002, "loss": 0.5638246536254883, "mean_token_accuracy": 0.7736532688140869, "num_tokens": 2261877.0, "step": 138 }, { "entropy": 0.5490357279777527, "epoch": 0.5186567164179104, "grad_norm": 0.13067315518856049, "learning_rate": 0.0002, "loss": 0.5565229654312134, "mean_token_accuracy": 0.7710774689912796, "num_tokens": 2278167.0, "step": 139 }, { "entropy": 0.5594187080860138, "epoch": 0.5223880597014925, "grad_norm": 0.15731613337993622, "learning_rate": 0.0002, "loss": 0.5585336089134216, "mean_token_accuracy": 0.7744586318731308, "num_tokens": 2294498.0, "step": 140 }, { "entropy": 0.5464736074209213, "epoch": 0.5261194029850746, "grad_norm": 0.11038337647914886, "learning_rate": 0.0002, "loss": 0.538608968257904, "mean_token_accuracy": 0.7829599231481552, "num_tokens": 2311130.0, "step": 141 }, { "entropy": 0.5605999529361725, "epoch": 0.5298507462686567, "grad_norm": 0.14088644087314606, "learning_rate": 0.0002, "loss": 0.552900493144989, "mean_token_accuracy": 0.7778186202049255, "num_tokens": 2327728.0, "step": 142 }, { "entropy": 0.5528270900249481, "epoch": 0.5335820895522388, "grad_norm": 0.1425020396709442, "learning_rate": 0.0002, "loss": 0.5515353083610535, "mean_token_accuracy": 0.7752819806337357, "num_tokens": 2343709.0, "step": 143 }, { "entropy": 0.548284262418747, "epoch": 0.5373134328358209, "grad_norm": 0.11753518134355545, "learning_rate": 0.0002, "loss": 0.5451334118843079, "mean_token_accuracy": 0.778195932507515, "num_tokens": 2360064.0, "step": 144 }, { "entropy": 0.5573805719614029, "epoch": 0.5410447761194029, "grad_norm": 0.16544298827648163, "learning_rate": 0.0002, "loss": 0.5645371675491333, "mean_token_accuracy": 0.774710014462471, "num_tokens": 2376625.0, "step": 145 }, { "entropy": 0.5539259165525436, "epoch": 0.5447761194029851, "grad_norm": 0.13032706081867218, "learning_rate": 0.0002, "loss": 0.5533608198165894, "mean_token_accuracy": 0.7761502712965012, "num_tokens": 2393124.0, "step": 146 }, { "entropy": 0.5611738562583923, "epoch": 0.5485074626865671, "grad_norm": 0.11081252992153168, "learning_rate": 0.0002, "loss": 0.5593815445899963, "mean_token_accuracy": 0.7766542136669159, "num_tokens": 2409745.0, "step": 147 }, { "entropy": 0.5696390718221664, "epoch": 0.5522388059701493, "grad_norm": 0.15060319006443024, "learning_rate": 0.0002, "loss": 0.5638480186462402, "mean_token_accuracy": 0.7716973423957825, "num_tokens": 2426282.0, "step": 148 }, { "entropy": 0.5485384464263916, "epoch": 0.5559701492537313, "grad_norm": 0.1222362369298935, "learning_rate": 0.0002, "loss": 0.5475510954856873, "mean_token_accuracy": 0.7770865708589554, "num_tokens": 2442853.0, "step": 149 }, { "entropy": 0.5401834696531296, "epoch": 0.5597014925373134, "grad_norm": 0.1280064433813095, "learning_rate": 0.0002, "loss": 0.546281099319458, "mean_token_accuracy": 0.777226597070694, "num_tokens": 2459134.0, "step": 150 }, { "entropy": 0.5523836761713028, "epoch": 0.5634328358208955, "grad_norm": 0.13370104134082794, "learning_rate": 0.0002, "loss": 0.5567190647125244, "mean_token_accuracy": 0.7742304503917694, "num_tokens": 2475612.0, "step": 151 }, { "entropy": 0.5323238670825958, "epoch": 0.5671641791044776, "grad_norm": 0.13501204550266266, "learning_rate": 0.0002, "loss": 0.5404109358787537, "mean_token_accuracy": 0.7807471007108688, "num_tokens": 2492038.0, "step": 152 }, { "entropy": 0.5367552191019058, "epoch": 0.5708955223880597, "grad_norm": 0.11861642450094223, "learning_rate": 0.0002, "loss": 0.5417584180831909, "mean_token_accuracy": 0.7794559895992279, "num_tokens": 2508568.0, "step": 153 }, { "entropy": 0.5438606441020966, "epoch": 0.5746268656716418, "grad_norm": 0.14000006020069122, "learning_rate": 0.0002, "loss": 0.5418928861618042, "mean_token_accuracy": 0.7817023396492004, "num_tokens": 2524812.0, "step": 154 }, { "entropy": 0.5425677746534348, "epoch": 0.5783582089552238, "grad_norm": 0.12695865333080292, "learning_rate": 0.0002, "loss": 0.5364310145378113, "mean_token_accuracy": 0.7822788208723068, "num_tokens": 2540971.0, "step": 155 }, { "entropy": 0.5774415135383606, "epoch": 0.582089552238806, "grad_norm": 0.13525983691215515, "learning_rate": 0.0002, "loss": 0.5755460858345032, "mean_token_accuracy": 0.7673929333686829, "num_tokens": 2557582.0, "step": 156 }, { "entropy": 0.5472007393836975, "epoch": 0.585820895522388, "grad_norm": 0.14802482724189758, "learning_rate": 0.0002, "loss": 0.5489597320556641, "mean_token_accuracy": 0.777190089225769, "num_tokens": 2573624.0, "step": 157 }, { "entropy": 0.5569610297679901, "epoch": 0.5895522388059702, "grad_norm": 0.12167536467313766, "learning_rate": 0.0002, "loss": 0.5526796579360962, "mean_token_accuracy": 0.7753524631261826, "num_tokens": 2590085.0, "step": 158 }, { "entropy": 0.5524294823408127, "epoch": 0.5932835820895522, "grad_norm": 0.11966220289468765, "learning_rate": 0.0002, "loss": 0.5499304533004761, "mean_token_accuracy": 0.7759323716163635, "num_tokens": 2606611.0, "step": 159 }, { "entropy": 0.5380967259407043, "epoch": 0.5970149253731343, "grad_norm": 0.12815536558628082, "learning_rate": 0.0002, "loss": 0.5423661470413208, "mean_token_accuracy": 0.7792660146951675, "num_tokens": 2623057.0, "step": 160 }, { "entropy": 0.5472327321767807, "epoch": 0.6007462686567164, "grad_norm": 0.1232324093580246, "learning_rate": 0.0002, "loss": 0.5512628555297852, "mean_token_accuracy": 0.7756103277206421, "num_tokens": 2639412.0, "step": 161 }, { "entropy": 0.53459233045578, "epoch": 0.6044776119402985, "grad_norm": 0.1279020607471466, "learning_rate": 0.0002, "loss": 0.530642569065094, "mean_token_accuracy": 0.784668356180191, "num_tokens": 2655725.0, "step": 162 }, { "entropy": 0.5487090200185776, "epoch": 0.6082089552238806, "grad_norm": 0.11489348113536835, "learning_rate": 0.0002, "loss": 0.5467615127563477, "mean_token_accuracy": 0.7774748206138611, "num_tokens": 2671780.0, "step": 163 }, { "entropy": 0.5611004680395126, "epoch": 0.6119402985074627, "grad_norm": 0.12106446921825409, "learning_rate": 0.0002, "loss": 0.5621192455291748, "mean_token_accuracy": 0.7757818549871445, "num_tokens": 2688187.0, "step": 164 }, { "entropy": 0.5655875951051712, "epoch": 0.6156716417910447, "grad_norm": 0.11722180247306824, "learning_rate": 0.0002, "loss": 0.5597223043441772, "mean_token_accuracy": 0.7729662656784058, "num_tokens": 2704679.0, "step": 165 }, { "entropy": 0.5630869567394257, "epoch": 0.6194029850746269, "grad_norm": 0.1220882460474968, "learning_rate": 0.0002, "loss": 0.5666179060935974, "mean_token_accuracy": 0.7716799974441528, "num_tokens": 2721384.0, "step": 166 }, { "entropy": 0.5498328506946564, "epoch": 0.6231343283582089, "grad_norm": 0.12011860311031342, "learning_rate": 0.0002, "loss": 0.5489162802696228, "mean_token_accuracy": 0.7789698839187622, "num_tokens": 2737648.0, "step": 167 }, { "entropy": 0.5477638095617294, "epoch": 0.6268656716417911, "grad_norm": 0.11750344932079315, "learning_rate": 0.0002, "loss": 0.5432245135307312, "mean_token_accuracy": 0.7796685546636581, "num_tokens": 2753735.0, "step": 168 }, { "entropy": 0.5453169494867325, "epoch": 0.6305970149253731, "grad_norm": 0.11574184149503708, "learning_rate": 0.0002, "loss": 0.5411070585250854, "mean_token_accuracy": 0.779533714056015, "num_tokens": 2770229.0, "step": 169 }, { "entropy": 0.545142874121666, "epoch": 0.6343283582089553, "grad_norm": 0.13359719514846802, "learning_rate": 0.0002, "loss": 0.5482118129730225, "mean_token_accuracy": 0.7763011008501053, "num_tokens": 2786644.0, "step": 170 }, { "entropy": 0.5370890945196152, "epoch": 0.6380597014925373, "grad_norm": 0.14816807210445404, "learning_rate": 0.0002, "loss": 0.5420677661895752, "mean_token_accuracy": 0.7803799211978912, "num_tokens": 2802914.0, "step": 171 }, { "entropy": 0.5518854707479477, "epoch": 0.6417910447761194, "grad_norm": 0.1388852596282959, "learning_rate": 0.0002, "loss": 0.5512416958808899, "mean_token_accuracy": 0.7771147638559341, "num_tokens": 2819398.0, "step": 172 }, { "entropy": 0.5400035530328751, "epoch": 0.6455223880597015, "grad_norm": 0.1363624781370163, "learning_rate": 0.0002, "loss": 0.5326176881790161, "mean_token_accuracy": 0.7852664589881897, "num_tokens": 2835742.0, "step": 173 }, { "entropy": 0.5528566986322403, "epoch": 0.6492537313432836, "grad_norm": 0.13000693917274475, "learning_rate": 0.0002, "loss": 0.5492731928825378, "mean_token_accuracy": 0.7760010659694672, "num_tokens": 2852099.0, "step": 174 }, { "entropy": 0.5556752383708954, "epoch": 0.6529850746268657, "grad_norm": 0.11847010999917984, "learning_rate": 0.0002, "loss": 0.5595160722732544, "mean_token_accuracy": 0.7731318473815918, "num_tokens": 2868521.0, "step": 175 }, { "entropy": 0.5382126122713089, "epoch": 0.6567164179104478, "grad_norm": 0.13996672630310059, "learning_rate": 0.0002, "loss": 0.5406076312065125, "mean_token_accuracy": 0.7809479385614395, "num_tokens": 2884940.0, "step": 176 }, { "entropy": 0.5601803660392761, "epoch": 0.6604477611940298, "grad_norm": 0.17110760509967804, "learning_rate": 0.0002, "loss": 0.5693113207817078, "mean_token_accuracy": 0.7711411267518997, "num_tokens": 2901255.0, "step": 177 }, { "entropy": 0.5570882558822632, "epoch": 0.664179104477612, "grad_norm": 0.13338999450206757, "learning_rate": 0.0002, "loss": 0.5597653388977051, "mean_token_accuracy": 0.7734159678220749, "num_tokens": 2917815.0, "step": 178 }, { "entropy": 0.5541604459285736, "epoch": 0.667910447761194, "grad_norm": 0.15003007650375366, "learning_rate": 0.0002, "loss": 0.550830066204071, "mean_token_accuracy": 0.773952454328537, "num_tokens": 2934029.0, "step": 179 }, { "entropy": 0.5483301132917404, "epoch": 0.6716417910447762, "grad_norm": 0.13809660077095032, "learning_rate": 0.0002, "loss": 0.544836163520813, "mean_token_accuracy": 0.7802225351333618, "num_tokens": 2950186.0, "step": 180 }, { "entropy": 0.563317745923996, "epoch": 0.6753731343283582, "grad_norm": 0.11954832822084427, "learning_rate": 0.0002, "loss": 0.5579479932785034, "mean_token_accuracy": 0.7754767686128616, "num_tokens": 2966696.0, "step": 181 }, { "entropy": 0.5388910472393036, "epoch": 0.6791044776119403, "grad_norm": 0.1495479792356491, "learning_rate": 0.0002, "loss": 0.5441924929618835, "mean_token_accuracy": 0.7800770252943039, "num_tokens": 2982704.0, "step": 182 }, { "entropy": 0.5419297218322754, "epoch": 0.6828358208955224, "grad_norm": 0.13201352953910828, "learning_rate": 0.0002, "loss": 0.5452746152877808, "mean_token_accuracy": 0.7787511199712753, "num_tokens": 2998931.0, "step": 183 }, { "entropy": 0.5475537180900574, "epoch": 0.6865671641791045, "grad_norm": 0.11876624077558517, "learning_rate": 0.0002, "loss": 0.5537864565849304, "mean_token_accuracy": 0.77639339864254, "num_tokens": 3015465.0, "step": 184 }, { "entropy": 0.5443734228610992, "epoch": 0.6902985074626866, "grad_norm": 0.142917662858963, "learning_rate": 0.0002, "loss": 0.5402485728263855, "mean_token_accuracy": 0.7805273532867432, "num_tokens": 3031848.0, "step": 185 }, { "entropy": 0.5626855194568634, "epoch": 0.6940298507462687, "grad_norm": 0.12896916270256042, "learning_rate": 0.0002, "loss": 0.5567379593849182, "mean_token_accuracy": 0.7732013463973999, "num_tokens": 3048160.0, "step": 186 }, { "entropy": 0.5523503571748734, "epoch": 0.6977611940298507, "grad_norm": 0.13464562594890594, "learning_rate": 0.0002, "loss": 0.5460264086723328, "mean_token_accuracy": 0.7796957343816757, "num_tokens": 3064378.0, "step": 187 }, { "entropy": 0.5515571534633636, "epoch": 0.7014925373134329, "grad_norm": 0.1277887523174286, "learning_rate": 0.0002, "loss": 0.5548107028007507, "mean_token_accuracy": 0.773384153842926, "num_tokens": 3080909.0, "step": 188 }, { "entropy": 0.5496191382408142, "epoch": 0.7052238805970149, "grad_norm": 0.1543433964252472, "learning_rate": 0.0002, "loss": 0.5634362101554871, "mean_token_accuracy": 0.7713208198547363, "num_tokens": 3097164.0, "step": 189 }, { "entropy": 0.533801332116127, "epoch": 0.7089552238805971, "grad_norm": 0.1185467317700386, "learning_rate": 0.0002, "loss": 0.5395026206970215, "mean_token_accuracy": 0.7796055674552917, "num_tokens": 3113434.0, "step": 190 }, { "entropy": 0.5635387450456619, "epoch": 0.7126865671641791, "grad_norm": 0.12236445397138596, "learning_rate": 0.0002, "loss": 0.5628854632377625, "mean_token_accuracy": 0.7733010798692703, "num_tokens": 3129906.0, "step": 191 }, { "entropy": 0.5444195717573166, "epoch": 0.7164179104477612, "grad_norm": 0.1353861391544342, "learning_rate": 0.0002, "loss": 0.5396167039871216, "mean_token_accuracy": 0.7793399095535278, "num_tokens": 3145901.0, "step": 192 }, { "entropy": 0.5682615637779236, "epoch": 0.7201492537313433, "grad_norm": 0.11948243528604507, "learning_rate": 0.0002, "loss": 0.5587157011032104, "mean_token_accuracy": 0.774067297577858, "num_tokens": 3162257.0, "step": 193 }, { "entropy": 0.5397479832172394, "epoch": 0.7238805970149254, "grad_norm": 0.14794877171516418, "learning_rate": 0.0002, "loss": 0.5473200678825378, "mean_token_accuracy": 0.7760735005140305, "num_tokens": 3178362.0, "step": 194 }, { "entropy": 0.5612514019012451, "epoch": 0.7276119402985075, "grad_norm": 0.12478621304035187, "learning_rate": 0.0002, "loss": 0.5709495544433594, "mean_token_accuracy": 0.771531730890274, "num_tokens": 3195003.0, "step": 195 }, { "entropy": 0.5640581250190735, "epoch": 0.7313432835820896, "grad_norm": 0.13103285431861877, "learning_rate": 0.0002, "loss": 0.5633752942085266, "mean_token_accuracy": 0.7763072997331619, "num_tokens": 3211488.0, "step": 196 }, { "entropy": 0.5409631133079529, "epoch": 0.7350746268656716, "grad_norm": 0.11954586207866669, "learning_rate": 0.0002, "loss": 0.5412945747375488, "mean_token_accuracy": 0.7807609885931015, "num_tokens": 3227872.0, "step": 197 }, { "entropy": 0.5516713857650757, "epoch": 0.7388059701492538, "grad_norm": 0.1291007399559021, "learning_rate": 0.0002, "loss": 0.5551599264144897, "mean_token_accuracy": 0.776901364326477, "num_tokens": 3244275.0, "step": 198 }, { "entropy": 0.5520838648080826, "epoch": 0.7425373134328358, "grad_norm": 0.1325356811285019, "learning_rate": 0.0002, "loss": 0.5542269945144653, "mean_token_accuracy": 0.7749388813972473, "num_tokens": 3260730.0, "step": 199 }, { "entropy": 0.5531659871339798, "epoch": 0.746268656716418, "grad_norm": 0.11382137984037399, "learning_rate": 0.0002, "loss": 0.5500154495239258, "mean_token_accuracy": 0.7769201993942261, "num_tokens": 3277054.0, "step": 200 }, { "entropy": 0.5739943087100983, "epoch": 0.75, "grad_norm": 0.116433285176754, "learning_rate": 0.0002, "loss": 0.5693427920341492, "mean_token_accuracy": 0.7700029015541077, "num_tokens": 3293536.0, "step": 201 }, { "entropy": 0.5410773009061813, "epoch": 0.753731343283582, "grad_norm": 0.12128517776727676, "learning_rate": 0.0002, "loss": 0.5383925437927246, "mean_token_accuracy": 0.7806861847639084, "num_tokens": 3310044.0, "step": 202 }, { "entropy": 0.5345109105110168, "epoch": 0.7574626865671642, "grad_norm": 0.11475860327482224, "learning_rate": 0.0002, "loss": 0.5396114587783813, "mean_token_accuracy": 0.7786486446857452, "num_tokens": 3326424.0, "step": 203 }, { "entropy": 0.5596074312925339, "epoch": 0.7611940298507462, "grad_norm": 0.1144401878118515, "learning_rate": 0.0002, "loss": 0.559008777141571, "mean_token_accuracy": 0.7744818329811096, "num_tokens": 3342803.0, "step": 204 }, { "entropy": 0.5440013706684113, "epoch": 0.7649253731343284, "grad_norm": 0.117170050740242, "learning_rate": 0.0002, "loss": 0.5520018935203552, "mean_token_accuracy": 0.7764452546834946, "num_tokens": 3359289.0, "step": 205 }, { "entropy": 0.5440059304237366, "epoch": 0.7686567164179104, "grad_norm": 0.12146680057048798, "learning_rate": 0.0002, "loss": 0.543918251991272, "mean_token_accuracy": 0.7812443971633911, "num_tokens": 3375680.0, "step": 206 }, { "entropy": 0.559204563498497, "epoch": 0.7723880597014925, "grad_norm": 0.11677462607622147, "learning_rate": 0.0002, "loss": 0.5479013323783875, "mean_token_accuracy": 0.7783834487199783, "num_tokens": 3392230.0, "step": 207 }, { "entropy": 0.5695496201515198, "epoch": 0.7761194029850746, "grad_norm": 0.12663210928440094, "learning_rate": 0.0002, "loss": 0.5560157895088196, "mean_token_accuracy": 0.7768621742725372, "num_tokens": 3408667.0, "step": 208 }, { "entropy": 0.5218568593263626, "epoch": 0.7798507462686567, "grad_norm": 0.13396473228931427, "learning_rate": 0.0002, "loss": 0.5200244784355164, "mean_token_accuracy": 0.7892128974199295, "num_tokens": 3424766.0, "step": 209 }, { "entropy": 0.5524403154850006, "epoch": 0.7835820895522388, "grad_norm": 0.11780054867267609, "learning_rate": 0.0002, "loss": 0.5549524426460266, "mean_token_accuracy": 0.7762513756752014, "num_tokens": 3441010.0, "step": 210 }, { "entropy": 0.5339344441890717, "epoch": 0.7873134328358209, "grad_norm": 0.13986989855766296, "learning_rate": 0.0002, "loss": 0.5432649254798889, "mean_token_accuracy": 0.7810570001602173, "num_tokens": 3457051.0, "step": 211 }, { "entropy": 0.5393660813570023, "epoch": 0.7910447761194029, "grad_norm": 0.14846238493919373, "learning_rate": 0.0002, "loss": 0.5462239980697632, "mean_token_accuracy": 0.7770469635725021, "num_tokens": 3473237.0, "step": 212 }, { "entropy": 0.5482676774263382, "epoch": 0.7947761194029851, "grad_norm": 0.1279968023300171, "learning_rate": 0.0002, "loss": 0.5470429062843323, "mean_token_accuracy": 0.7772368937730789, "num_tokens": 3489557.0, "step": 213 }, { "entropy": 0.5750377625226974, "epoch": 0.7985074626865671, "grad_norm": 0.1574614942073822, "learning_rate": 0.0002, "loss": 0.5681816339492798, "mean_token_accuracy": 0.7696330845355988, "num_tokens": 3506111.0, "step": 214 }, { "entropy": 0.5552468150854111, "epoch": 0.8022388059701493, "grad_norm": 0.11573337018489838, "learning_rate": 0.0002, "loss": 0.5513306260108948, "mean_token_accuracy": 0.7750436067581177, "num_tokens": 3522546.0, "step": 215 }, { "entropy": 0.5544361621141434, "epoch": 0.8059701492537313, "grad_norm": 0.11837700754404068, "learning_rate": 0.0002, "loss": 0.553516685962677, "mean_token_accuracy": 0.7765354365110397, "num_tokens": 3539207.0, "step": 216 }, { "entropy": 0.5567323267459869, "epoch": 0.8097014925373134, "grad_norm": 0.15473680198192596, "learning_rate": 0.0002, "loss": 0.5699406862258911, "mean_token_accuracy": 0.769306480884552, "num_tokens": 3555606.0, "step": 217 }, { "entropy": 0.55356065928936, "epoch": 0.8134328358208955, "grad_norm": 0.10959180444478989, "learning_rate": 0.0002, "loss": 0.5509120225906372, "mean_token_accuracy": 0.7775351405143738, "num_tokens": 3571937.0, "step": 218 }, { "entropy": 0.5506166815757751, "epoch": 0.8171641791044776, "grad_norm": 0.1107836365699768, "learning_rate": 0.0002, "loss": 0.5498772859573364, "mean_token_accuracy": 0.7781967967748642, "num_tokens": 3588147.0, "step": 219 }, { "entropy": 0.5483623296022415, "epoch": 0.8208955223880597, "grad_norm": 0.12760840356349945, "learning_rate": 0.0002, "loss": 0.5440163016319275, "mean_token_accuracy": 0.7794655859470367, "num_tokens": 3604413.0, "step": 220 }, { "entropy": 0.5516934990882874, "epoch": 0.8246268656716418, "grad_norm": 0.13432522118091583, "learning_rate": 0.0002, "loss": 0.5498266220092773, "mean_token_accuracy": 0.7779892683029175, "num_tokens": 3620667.0, "step": 221 }, { "entropy": 0.5583075881004333, "epoch": 0.8283582089552238, "grad_norm": 0.1205005794763565, "learning_rate": 0.0002, "loss": 0.5606446266174316, "mean_token_accuracy": 0.7730143070220947, "num_tokens": 3637160.0, "step": 222 }, { "entropy": 0.5281430184841156, "epoch": 0.832089552238806, "grad_norm": 0.11834297329187393, "learning_rate": 0.0002, "loss": 0.5331573486328125, "mean_token_accuracy": 0.7839753329753876, "num_tokens": 3653562.0, "step": 223 }, { "entropy": 0.5474057644605637, "epoch": 0.835820895522388, "grad_norm": 0.12258574366569519, "learning_rate": 0.0002, "loss": 0.5449813604354858, "mean_token_accuracy": 0.780377060174942, "num_tokens": 3669951.0, "step": 224 }, { "entropy": 0.5545710325241089, "epoch": 0.8395522388059702, "grad_norm": 0.1338793784379959, "learning_rate": 0.0002, "loss": 0.5493278503417969, "mean_token_accuracy": 0.7759524881839752, "num_tokens": 3686193.0, "step": 225 }, { "entropy": 0.5437184125185013, "epoch": 0.8432835820895522, "grad_norm": 0.11655160784721375, "learning_rate": 0.0002, "loss": 0.5418398380279541, "mean_token_accuracy": 0.7775491774082184, "num_tokens": 3702353.0, "step": 226 }, { "entropy": 0.5532678067684174, "epoch": 0.8470149253731343, "grad_norm": 0.1549050509929657, "learning_rate": 0.0002, "loss": 0.5550553798675537, "mean_token_accuracy": 0.7763772308826447, "num_tokens": 3719232.0, "step": 227 }, { "entropy": 0.5559423863887787, "epoch": 0.8507462686567164, "grad_norm": 0.14761976897716522, "learning_rate": 0.0002, "loss": 0.5570894479751587, "mean_token_accuracy": 0.772933155298233, "num_tokens": 3735537.0, "step": 228 }, { "entropy": 0.5467868000268936, "epoch": 0.8544776119402985, "grad_norm": 0.1289997398853302, "learning_rate": 0.0002, "loss": 0.5503818988800049, "mean_token_accuracy": 0.7735268622636795, "num_tokens": 3751761.0, "step": 229 }, { "entropy": 0.5500779002904892, "epoch": 0.8582089552238806, "grad_norm": 0.1492077112197876, "learning_rate": 0.0002, "loss": 0.5505205392837524, "mean_token_accuracy": 0.777638703584671, "num_tokens": 3768182.0, "step": 230 }, { "entropy": 0.539194718003273, "epoch": 0.8619402985074627, "grad_norm": 0.11280067265033722, "learning_rate": 0.0002, "loss": 0.5417665243148804, "mean_token_accuracy": 0.7794284075498581, "num_tokens": 3784647.0, "step": 231 }, { "entropy": 0.5511510968208313, "epoch": 0.8656716417910447, "grad_norm": 0.13110041618347168, "learning_rate": 0.0002, "loss": 0.5588247776031494, "mean_token_accuracy": 0.7747578173875809, "num_tokens": 3801072.0, "step": 232 }, { "entropy": 0.5328868925571442, "epoch": 0.8694029850746269, "grad_norm": 0.11132191121578217, "learning_rate": 0.0002, "loss": 0.5321682095527649, "mean_token_accuracy": 0.785084918141365, "num_tokens": 3817270.0, "step": 233 }, { "entropy": 0.5497525930404663, "epoch": 0.8731343283582089, "grad_norm": 0.12497328221797943, "learning_rate": 0.0002, "loss": 0.5490625500679016, "mean_token_accuracy": 0.7780804187059402, "num_tokens": 3833650.0, "step": 234 }, { "entropy": 0.5649874210357666, "epoch": 0.8768656716417911, "grad_norm": 0.10820397734642029, "learning_rate": 0.0002, "loss": 0.5612732172012329, "mean_token_accuracy": 0.7699918150901794, "num_tokens": 3849965.0, "step": 235 }, { "entropy": 0.5564968436956406, "epoch": 0.8805970149253731, "grad_norm": 0.11200150102376938, "learning_rate": 0.0002, "loss": 0.5574247241020203, "mean_token_accuracy": 0.7737843245267868, "num_tokens": 3866325.0, "step": 236 }, { "entropy": 0.5345783978700638, "epoch": 0.8843283582089553, "grad_norm": 0.11046700924634933, "learning_rate": 0.0002, "loss": 0.5353702902793884, "mean_token_accuracy": 0.7825029641389847, "num_tokens": 3882836.0, "step": 237 }, { "entropy": 0.5462570339441299, "epoch": 0.8880597014925373, "grad_norm": 0.13713142275810242, "learning_rate": 0.0002, "loss": 0.5531303286552429, "mean_token_accuracy": 0.775889053940773, "num_tokens": 3899019.0, "step": 238 }, { "entropy": 0.5346651673316956, "epoch": 0.8917910447761194, "grad_norm": 0.11298073828220367, "learning_rate": 0.0002, "loss": 0.5383750796318054, "mean_token_accuracy": 0.780723512172699, "num_tokens": 3915451.0, "step": 239 }, { "entropy": 0.5661043077707291, "epoch": 0.8955223880597015, "grad_norm": 0.12630173563957214, "learning_rate": 0.0002, "loss": 0.5633317232131958, "mean_token_accuracy": 0.7725178003311157, "num_tokens": 3931857.0, "step": 240 }, { "entropy": 0.5499769002199173, "epoch": 0.8992537313432836, "grad_norm": 0.10539573431015015, "learning_rate": 0.0002, "loss": 0.5443609356880188, "mean_token_accuracy": 0.7807674556970596, "num_tokens": 3948251.0, "step": 241 }, { "entropy": 0.5542334765195847, "epoch": 0.9029850746268657, "grad_norm": 0.10860421508550644, "learning_rate": 0.0002, "loss": 0.5467254519462585, "mean_token_accuracy": 0.7777283936738968, "num_tokens": 3964506.0, "step": 242 }, { "entropy": 0.5593715906143188, "epoch": 0.9067164179104478, "grad_norm": 0.11269830167293549, "learning_rate": 0.0002, "loss": 0.5568402409553528, "mean_token_accuracy": 0.7743813842535019, "num_tokens": 3980991.0, "step": 243 }, { "entropy": 0.5386274456977844, "epoch": 0.9104477611940298, "grad_norm": 0.12022864073514938, "learning_rate": 0.0002, "loss": 0.538654088973999, "mean_token_accuracy": 0.7814032137393951, "num_tokens": 3997541.0, "step": 244 }, { "entropy": 0.5274675115942955, "epoch": 0.914179104477612, "grad_norm": 0.14818064868450165, "learning_rate": 0.0002, "loss": 0.5381026268005371, "mean_token_accuracy": 0.7816068381071091, "num_tokens": 4013664.0, "step": 245 }, { "entropy": 0.5379235744476318, "epoch": 0.917910447761194, "grad_norm": 0.1228220984339714, "learning_rate": 0.0002, "loss": 0.5409340858459473, "mean_token_accuracy": 0.7790304571390152, "num_tokens": 4029963.0, "step": 246 }, { "entropy": 0.5446107536554337, "epoch": 0.9216417910447762, "grad_norm": 0.12891873717308044, "learning_rate": 0.0002, "loss": 0.5515777468681335, "mean_token_accuracy": 0.7764184921979904, "num_tokens": 4046258.0, "step": 247 }, { "entropy": 0.5525491833686829, "epoch": 0.9253731343283582, "grad_norm": 0.1355786919593811, "learning_rate": 0.0002, "loss": 0.5416724681854248, "mean_token_accuracy": 0.7802292257547379, "num_tokens": 4062506.0, "step": 248 }, { "entropy": 0.536956250667572, "epoch": 0.9291044776119403, "grad_norm": 0.12736709415912628, "learning_rate": 0.0002, "loss": 0.5312113761901855, "mean_token_accuracy": 0.783654510974884, "num_tokens": 4078661.0, "step": 249 }, { "entropy": 0.5549832433462143, "epoch": 0.9328358208955224, "grad_norm": 0.12017148733139038, "learning_rate": 0.0002, "loss": 0.5565866827964783, "mean_token_accuracy": 0.773817777633667, "num_tokens": 4095022.0, "step": 250 }, { "entropy": 0.5422243773937225, "epoch": 0.9365671641791045, "grad_norm": 0.13573786616325378, "learning_rate": 0.0002, "loss": 0.5521195530891418, "mean_token_accuracy": 0.7785970866680145, "num_tokens": 4111402.0, "step": 251 }, { "entropy": 0.5538443177938461, "epoch": 0.9402985074626866, "grad_norm": 0.11428782343864441, "learning_rate": 0.0002, "loss": 0.5559377670288086, "mean_token_accuracy": 0.7728682309389114, "num_tokens": 4127625.0, "step": 252 }, { "entropy": 0.5606874525547028, "epoch": 0.9440298507462687, "grad_norm": 0.11228293180465698, "learning_rate": 0.0002, "loss": 0.5537079572677612, "mean_token_accuracy": 0.7777886986732483, "num_tokens": 4144209.0, "step": 253 }, { "entropy": 0.5587089955806732, "epoch": 0.9477611940298507, "grad_norm": 0.11430441588163376, "learning_rate": 0.0002, "loss": 0.5511766672134399, "mean_token_accuracy": 0.7764836251735687, "num_tokens": 4160587.0, "step": 254 }, { "entropy": 0.5543984770774841, "epoch": 0.9514925373134329, "grad_norm": 0.11914564669132233, "learning_rate": 0.0002, "loss": 0.5457825064659119, "mean_token_accuracy": 0.7772367298603058, "num_tokens": 4177078.0, "step": 255 }, { "entropy": 0.5496934354305267, "epoch": 0.9552238805970149, "grad_norm": 0.11808159202337265, "learning_rate": 0.0002, "loss": 0.5523373484611511, "mean_token_accuracy": 0.7758414000272751, "num_tokens": 4193671.0, "step": 256 }, { "entropy": 0.5323416441679001, "epoch": 0.9589552238805971, "grad_norm": 0.12709033489227295, "learning_rate": 0.0002, "loss": 0.5384759902954102, "mean_token_accuracy": 0.7808651477098465, "num_tokens": 4210085.0, "step": 257 }, { "entropy": 0.5338983610272408, "epoch": 0.9626865671641791, "grad_norm": 0.13908886909484863, "learning_rate": 0.0002, "loss": 0.5462735891342163, "mean_token_accuracy": 0.7780435681343079, "num_tokens": 4226494.0, "step": 258 }, { "entropy": 0.5453044772148132, "epoch": 0.9664179104477612, "grad_norm": 0.12644866108894348, "learning_rate": 0.0002, "loss": 0.551929235458374, "mean_token_accuracy": 0.775839775800705, "num_tokens": 4242785.0, "step": 259 }, { "entropy": 0.5603075176477432, "epoch": 0.9701492537313433, "grad_norm": 0.12755440175533295, "learning_rate": 0.0002, "loss": 0.5524581670761108, "mean_token_accuracy": 0.7771914452314377, "num_tokens": 4259299.0, "step": 260 }, { "entropy": 0.5615698993206024, "epoch": 0.9738805970149254, "grad_norm": 0.12908904254436493, "learning_rate": 0.0002, "loss": 0.5537154078483582, "mean_token_accuracy": 0.7739745527505875, "num_tokens": 4275749.0, "step": 261 }, { "entropy": 0.5526564866304398, "epoch": 0.9776119402985075, "grad_norm": 0.10715582221746445, "learning_rate": 0.0002, "loss": 0.5478145480155945, "mean_token_accuracy": 0.7770287841558456, "num_tokens": 4291706.0, "step": 262 }, { "entropy": 0.5461979508399963, "epoch": 0.9813432835820896, "grad_norm": 0.14307166635990143, "learning_rate": 0.0002, "loss": 0.5454379916191101, "mean_token_accuracy": 0.7798766791820526, "num_tokens": 4308137.0, "step": 263 }, { "entropy": 0.5203245729207993, "epoch": 0.9850746268656716, "grad_norm": 0.15710005164146423, "learning_rate": 0.0002, "loss": 0.5299646258354187, "mean_token_accuracy": 0.7843145579099655, "num_tokens": 4324411.0, "step": 264 }, { "entropy": 0.5302061140537262, "epoch": 0.9888059701492538, "grad_norm": 0.1519300937652588, "learning_rate": 0.0002, "loss": 0.5403961539268494, "mean_token_accuracy": 0.7806786000728607, "num_tokens": 4340384.0, "step": 265 }, { "entropy": 0.5364599078893661, "epoch": 0.9925373134328358, "grad_norm": 0.13450899720191956, "learning_rate": 0.0002, "loss": 0.5356532335281372, "mean_token_accuracy": 0.7834792584180832, "num_tokens": 4356954.0, "step": 266 }, { "entropy": 0.5519508272409439, "epoch": 0.996268656716418, "grad_norm": 0.13190409541130066, "learning_rate": 0.0002, "loss": 0.5425809621810913, "mean_token_accuracy": 0.7814677059650421, "num_tokens": 4373557.0, "step": 267 }, { "entropy": 0.5717380940914154, "epoch": 1.0, "grad_norm": 0.13511350750923157, "learning_rate": 0.0002, "loss": 0.5594110488891602, "mean_token_accuracy": 0.7763755470514297, "num_tokens": 4390028.0, "step": 268 }, { "entropy": 0.5333094298839569, "epoch": 1.0037313432835822, "grad_norm": 0.11232882738113403, "learning_rate": 0.0002, "loss": 0.5279825925827026, "mean_token_accuracy": 0.7831753939390182, "num_tokens": 4406075.0, "step": 269 }, { "entropy": 0.5085988268256187, "epoch": 1.007462686567164, "grad_norm": 0.1554645448923111, "learning_rate": 0.0002, "loss": 0.516677737236023, "mean_token_accuracy": 0.7916137427091599, "num_tokens": 4422444.0, "step": 270 }, { "entropy": 0.5372590869665146, "epoch": 1.0111940298507462, "grad_norm": 0.14206163585186005, "learning_rate": 0.0002, "loss": 0.542325496673584, "mean_token_accuracy": 0.7813751995563507, "num_tokens": 4438619.0, "step": 271 }, { "entropy": 0.5327645987272263, "epoch": 1.0149253731343284, "grad_norm": 0.12639598548412323, "learning_rate": 0.0002, "loss": 0.5381733775138855, "mean_token_accuracy": 0.7798869907855988, "num_tokens": 4455013.0, "step": 272 }, { "entropy": 0.5318270623683929, "epoch": 1.0186567164179103, "grad_norm": 0.14597581326961517, "learning_rate": 0.0002, "loss": 0.5323677659034729, "mean_token_accuracy": 0.7859037518501282, "num_tokens": 4471596.0, "step": 273 }, { "entropy": 0.549939751625061, "epoch": 1.0223880597014925, "grad_norm": 0.14265935122966766, "learning_rate": 0.0002, "loss": 0.5377833247184753, "mean_token_accuracy": 0.7833307683467865, "num_tokens": 4487885.0, "step": 274 }, { "entropy": 0.549922838807106, "epoch": 1.0261194029850746, "grad_norm": 0.1281050145626068, "learning_rate": 0.0002, "loss": 0.5483719706535339, "mean_token_accuracy": 0.7763915956020355, "num_tokens": 4504279.0, "step": 275 }, { "entropy": 0.5519027858972549, "epoch": 1.0298507462686568, "grad_norm": 0.13199536502361298, "learning_rate": 0.0002, "loss": 0.5520401000976562, "mean_token_accuracy": 0.7754272371530533, "num_tokens": 4520877.0, "step": 276 }, { "entropy": 0.5326957255601883, "epoch": 1.0335820895522387, "grad_norm": 0.13716775178909302, "learning_rate": 0.0002, "loss": 0.5377839207649231, "mean_token_accuracy": 0.77959144115448, "num_tokens": 4537306.0, "step": 277 }, { "entropy": 0.5343386679887772, "epoch": 1.037313432835821, "grad_norm": 0.12250324338674545, "learning_rate": 0.0002, "loss": 0.5346370935440063, "mean_token_accuracy": 0.7819696217775345, "num_tokens": 4553694.0, "step": 278 }, { "entropy": 0.5221862643957138, "epoch": 1.041044776119403, "grad_norm": 0.14083418250083923, "learning_rate": 0.0002, "loss": 0.5204699039459229, "mean_token_accuracy": 0.7915231883525848, "num_tokens": 4569929.0, "step": 279 }, { "entropy": 0.5506787896156311, "epoch": 1.044776119402985, "grad_norm": 0.11459501832723618, "learning_rate": 0.0002, "loss": 0.5497503280639648, "mean_token_accuracy": 0.7762598097324371, "num_tokens": 4586327.0, "step": 280 }, { "entropy": 0.5387643724679947, "epoch": 1.0485074626865671, "grad_norm": 0.1149069145321846, "learning_rate": 0.0002, "loss": 0.536687970161438, "mean_token_accuracy": 0.7849635928869247, "num_tokens": 4602577.0, "step": 281 }, { "entropy": 0.5402974784374237, "epoch": 1.0522388059701493, "grad_norm": 0.13960953056812286, "learning_rate": 0.0002, "loss": 0.5357297658920288, "mean_token_accuracy": 0.782235711812973, "num_tokens": 4618829.0, "step": 282 }, { "entropy": 0.5379159897565842, "epoch": 1.0559701492537314, "grad_norm": 0.12440282106399536, "learning_rate": 0.0002, "loss": 0.5391443967819214, "mean_token_accuracy": 0.7829291224479675, "num_tokens": 4635167.0, "step": 283 }, { "entropy": 0.5129481852054596, "epoch": 1.0597014925373134, "grad_norm": 0.13519050180912018, "learning_rate": 0.0002, "loss": 0.5105025768280029, "mean_token_accuracy": 0.7926614433526993, "num_tokens": 4651165.0, "step": 284 }, { "entropy": 0.5542086809873581, "epoch": 1.0634328358208955, "grad_norm": 0.14323101937770844, "learning_rate": 0.0002, "loss": 0.5622052550315857, "mean_token_accuracy": 0.7727599292993546, "num_tokens": 4667347.0, "step": 285 }, { "entropy": 0.5243228375911713, "epoch": 1.0671641791044777, "grad_norm": 0.1330215483903885, "learning_rate": 0.0002, "loss": 0.5247523188591003, "mean_token_accuracy": 0.7867335379123688, "num_tokens": 4684015.0, "step": 286 }, { "entropy": 0.5412201136350632, "epoch": 1.0708955223880596, "grad_norm": 0.13448479771614075, "learning_rate": 0.0002, "loss": 0.54647296667099, "mean_token_accuracy": 0.7774277031421661, "num_tokens": 4700242.0, "step": 287 }, { "entropy": 0.5454149097204208, "epoch": 1.0746268656716418, "grad_norm": 0.13259278237819672, "learning_rate": 0.0002, "loss": 0.5461288690567017, "mean_token_accuracy": 0.7782861590385437, "num_tokens": 4716442.0, "step": 288 }, { "entropy": 0.526309534907341, "epoch": 1.078358208955224, "grad_norm": 0.12522561848163605, "learning_rate": 0.0002, "loss": 0.5221973061561584, "mean_token_accuracy": 0.789994552731514, "num_tokens": 4732742.0, "step": 289 }, { "entropy": 0.5411332100629807, "epoch": 1.0820895522388059, "grad_norm": 0.12081784009933472, "learning_rate": 0.0002, "loss": 0.5372704863548279, "mean_token_accuracy": 0.7822500914335251, "num_tokens": 4749084.0, "step": 290 }, { "entropy": 0.5575008988380432, "epoch": 1.085820895522388, "grad_norm": 0.11303576827049255, "learning_rate": 0.0002, "loss": 0.5508702397346497, "mean_token_accuracy": 0.7754259258508682, "num_tokens": 4765562.0, "step": 291 }, { "entropy": 0.5357666164636612, "epoch": 1.0895522388059702, "grad_norm": 0.12666599452495575, "learning_rate": 0.0002, "loss": 0.5432624220848083, "mean_token_accuracy": 0.7804068475961685, "num_tokens": 4781995.0, "step": 292 }, { "entropy": 0.5331733524799347, "epoch": 1.0932835820895523, "grad_norm": 0.12246809899806976, "learning_rate": 0.0002, "loss": 0.5331196784973145, "mean_token_accuracy": 0.7823672741651535, "num_tokens": 4798355.0, "step": 293 }, { "entropy": 0.531685009598732, "epoch": 1.0970149253731343, "grad_norm": 0.12172231823205948, "learning_rate": 0.0002, "loss": 0.5293748378753662, "mean_token_accuracy": 0.7843722105026245, "num_tokens": 4814357.0, "step": 294 }, { "entropy": 0.554166242480278, "epoch": 1.1007462686567164, "grad_norm": 0.14191463589668274, "learning_rate": 0.0002, "loss": 0.5532712936401367, "mean_token_accuracy": 0.7733844220638275, "num_tokens": 4830954.0, "step": 295 }, { "entropy": 0.5282094776630402, "epoch": 1.1044776119402986, "grad_norm": 0.14205436408519745, "learning_rate": 0.0002, "loss": 0.530907392501831, "mean_token_accuracy": 0.7830108106136322, "num_tokens": 4847654.0, "step": 296 }, { "entropy": 0.5379532426595688, "epoch": 1.1082089552238805, "grad_norm": 0.12750715017318726, "learning_rate": 0.0002, "loss": 0.5367629528045654, "mean_token_accuracy": 0.7796261459589005, "num_tokens": 4864209.0, "step": 297 }, { "entropy": 0.5312085449695587, "epoch": 1.1119402985074627, "grad_norm": 0.11801420152187347, "learning_rate": 0.0002, "loss": 0.5278028845787048, "mean_token_accuracy": 0.7856296449899673, "num_tokens": 4880489.0, "step": 298 }, { "entropy": 0.5340657457709312, "epoch": 1.1156716417910448, "grad_norm": 0.1341157853603363, "learning_rate": 0.0002, "loss": 0.5332481265068054, "mean_token_accuracy": 0.7815297544002533, "num_tokens": 4897040.0, "step": 299 }, { "entropy": 0.5495938658714294, "epoch": 1.1194029850746268, "grad_norm": 0.15130798518657684, "learning_rate": 0.0002, "loss": 0.5522593855857849, "mean_token_accuracy": 0.7767154276371002, "num_tokens": 4913499.0, "step": 300 }, { "entropy": 0.5539788007736206, "epoch": 1.123134328358209, "grad_norm": 0.16235828399658203, "learning_rate": 0.0002, "loss": 0.556696891784668, "mean_token_accuracy": 0.7743791192770004, "num_tokens": 4930129.0, "step": 301 }, { "entropy": 0.5188294276595116, "epoch": 1.126865671641791, "grad_norm": 0.15251989662647247, "learning_rate": 0.0002, "loss": 0.5240339040756226, "mean_token_accuracy": 0.7848995476961136, "num_tokens": 4946505.0, "step": 302 }, { "entropy": 0.5330336540937424, "epoch": 1.1305970149253732, "grad_norm": 0.12010055035352707, "learning_rate": 0.0002, "loss": 0.530551552772522, "mean_token_accuracy": 0.7852707505226135, "num_tokens": 4963130.0, "step": 303 }, { "entropy": 0.5485537797212601, "epoch": 1.1343283582089552, "grad_norm": 0.12690100073814392, "learning_rate": 0.0002, "loss": 0.5355115532875061, "mean_token_accuracy": 0.7832664847373962, "num_tokens": 4979396.0, "step": 304 }, { "entropy": 0.5363626033067703, "epoch": 1.1380597014925373, "grad_norm": 0.12670499086380005, "learning_rate": 0.0002, "loss": 0.5318777561187744, "mean_token_accuracy": 0.7821652144193649, "num_tokens": 4995808.0, "step": 305 }, { "entropy": 0.556913822889328, "epoch": 1.1417910447761195, "grad_norm": 0.1417754739522934, "learning_rate": 0.0002, "loss": 0.5632070899009705, "mean_token_accuracy": 0.7711838483810425, "num_tokens": 5012247.0, "step": 306 }, { "entropy": 0.531732589006424, "epoch": 1.1455223880597014, "grad_norm": 0.12725508213043213, "learning_rate": 0.0002, "loss": 0.5370599627494812, "mean_token_accuracy": 0.7827656418085098, "num_tokens": 5028592.0, "step": 307 }, { "entropy": 0.5216507539153099, "epoch": 1.1492537313432836, "grad_norm": 0.14518076181411743, "learning_rate": 0.0002, "loss": 0.5285972952842712, "mean_token_accuracy": 0.7866590619087219, "num_tokens": 5044691.0, "step": 308 }, { "entropy": 0.5357843339443207, "epoch": 1.1529850746268657, "grad_norm": 0.14331640303134918, "learning_rate": 0.0002, "loss": 0.5414748191833496, "mean_token_accuracy": 0.7796436995267868, "num_tokens": 5060981.0, "step": 309 }, { "entropy": 0.550069585442543, "epoch": 1.1567164179104479, "grad_norm": 0.1419994831085205, "learning_rate": 0.0002, "loss": 0.5494908690452576, "mean_token_accuracy": 0.774166613817215, "num_tokens": 5077445.0, "step": 310 }, { "entropy": 0.5334684997797012, "epoch": 1.1604477611940298, "grad_norm": 0.13464997708797455, "learning_rate": 0.0002, "loss": 0.5329424738883972, "mean_token_accuracy": 0.7852184623479843, "num_tokens": 5093959.0, "step": 311 }, { "entropy": 0.5384779423475266, "epoch": 1.164179104477612, "grad_norm": 0.12344568222761154, "learning_rate": 0.0002, "loss": 0.5393214821815491, "mean_token_accuracy": 0.783161386847496, "num_tokens": 5110114.0, "step": 312 }, { "entropy": 0.566596269607544, "epoch": 1.1679104477611941, "grad_norm": 0.13426469266414642, "learning_rate": 0.0002, "loss": 0.5611933469772339, "mean_token_accuracy": 0.7707538902759552, "num_tokens": 5126500.0, "step": 313 }, { "entropy": 0.5522208511829376, "epoch": 1.171641791044776, "grad_norm": 0.11628863960504532, "learning_rate": 0.0002, "loss": 0.544135332107544, "mean_token_accuracy": 0.7789785116910934, "num_tokens": 5143003.0, "step": 314 }, { "entropy": 0.5286403447389603, "epoch": 1.1753731343283582, "grad_norm": 0.1331920623779297, "learning_rate": 0.0002, "loss": 0.5280863046646118, "mean_token_accuracy": 0.7847232520580292, "num_tokens": 5159209.0, "step": 315 }, { "entropy": 0.5208230093121529, "epoch": 1.1791044776119404, "grad_norm": 0.16730330884456635, "learning_rate": 0.0002, "loss": 0.5261422395706177, "mean_token_accuracy": 0.7885824292898178, "num_tokens": 5175336.0, "step": 316 }, { "entropy": 0.5139501839876175, "epoch": 1.1828358208955223, "grad_norm": 0.17113769054412842, "learning_rate": 0.0002, "loss": 0.5231570601463318, "mean_token_accuracy": 0.7852117121219635, "num_tokens": 5191589.0, "step": 317 }, { "entropy": 0.5446046590805054, "epoch": 1.1865671641791045, "grad_norm": 0.13907761871814728, "learning_rate": 0.0002, "loss": 0.5399054288864136, "mean_token_accuracy": 0.7820506691932678, "num_tokens": 5207939.0, "step": 318 }, { "entropy": 0.5267596393823624, "epoch": 1.1902985074626866, "grad_norm": 0.1434536576271057, "learning_rate": 0.0002, "loss": 0.5265440344810486, "mean_token_accuracy": 0.7849590480327606, "num_tokens": 5224274.0, "step": 319 }, { "entropy": 0.5274358987808228, "epoch": 1.1940298507462686, "grad_norm": 0.1331617832183838, "learning_rate": 0.0002, "loss": 0.5201226472854614, "mean_token_accuracy": 0.7877639383077621, "num_tokens": 5240488.0, "step": 320 }, { "entropy": 0.5438350588083267, "epoch": 1.1977611940298507, "grad_norm": 0.13051791489124298, "learning_rate": 0.0002, "loss": 0.5417760610580444, "mean_token_accuracy": 0.7801128923892975, "num_tokens": 5256913.0, "step": 321 }, { "entropy": 0.5419559478759766, "epoch": 1.2014925373134329, "grad_norm": 0.1651846319437027, "learning_rate": 0.0002, "loss": 0.5418766140937805, "mean_token_accuracy": 0.78228460252285, "num_tokens": 5273335.0, "step": 322 }, { "entropy": 0.5415368527173996, "epoch": 1.205223880597015, "grad_norm": 0.16951487958431244, "learning_rate": 0.0002, "loss": 0.5506861209869385, "mean_token_accuracy": 0.7753586024045944, "num_tokens": 5289759.0, "step": 323 }, { "entropy": 0.5358785539865494, "epoch": 1.208955223880597, "grad_norm": 0.1276499480009079, "learning_rate": 0.0002, "loss": 0.536015510559082, "mean_token_accuracy": 0.7820306271314621, "num_tokens": 5305982.0, "step": 324 }, { "entropy": 0.5399276316165924, "epoch": 1.212686567164179, "grad_norm": 0.13910017907619476, "learning_rate": 0.0002, "loss": 0.5390846133232117, "mean_token_accuracy": 0.7822140157222748, "num_tokens": 5322089.0, "step": 325 }, { "entropy": 0.54273721575737, "epoch": 1.2164179104477613, "grad_norm": 0.14252571761608124, "learning_rate": 0.0002, "loss": 0.544661283493042, "mean_token_accuracy": 0.7795404642820358, "num_tokens": 5338453.0, "step": 326 }, { "entropy": 0.5249434560537338, "epoch": 1.2201492537313432, "grad_norm": 0.1477581411600113, "learning_rate": 0.0002, "loss": 0.5217203497886658, "mean_token_accuracy": 0.7876597344875336, "num_tokens": 5354700.0, "step": 327 }, { "entropy": 0.5396385788917542, "epoch": 1.2238805970149254, "grad_norm": 0.14778634905815125, "learning_rate": 0.0002, "loss": 0.5354180335998535, "mean_token_accuracy": 0.7824464589357376, "num_tokens": 5371063.0, "step": 328 }, { "entropy": 0.5529858469963074, "epoch": 1.2276119402985075, "grad_norm": 0.13042840361595154, "learning_rate": 0.0002, "loss": 0.5544819831848145, "mean_token_accuracy": 0.7761342972517014, "num_tokens": 5387332.0, "step": 329 }, { "entropy": 0.5454379618167877, "epoch": 1.2313432835820897, "grad_norm": 0.15361081063747406, "learning_rate": 0.0002, "loss": 0.5482691526412964, "mean_token_accuracy": 0.7785263955593109, "num_tokens": 5403888.0, "step": 330 }, { "entropy": 0.5411872565746307, "epoch": 1.2350746268656716, "grad_norm": 0.1457548439502716, "learning_rate": 0.0002, "loss": 0.5460063219070435, "mean_token_accuracy": 0.7781393676996231, "num_tokens": 5420504.0, "step": 331 }, { "entropy": 0.5440556704998016, "epoch": 1.2388059701492538, "grad_norm": 0.17071455717086792, "learning_rate": 0.0002, "loss": 0.5447981357574463, "mean_token_accuracy": 0.7792220860719681, "num_tokens": 5436983.0, "step": 332 }, { "entropy": 0.5312773138284683, "epoch": 1.242537313432836, "grad_norm": 0.15535041689872742, "learning_rate": 0.0002, "loss": 0.5284558534622192, "mean_token_accuracy": 0.7843498289585114, "num_tokens": 5453439.0, "step": 333 }, { "entropy": 0.5413801819086075, "epoch": 1.2462686567164178, "grad_norm": 0.12389594316482544, "learning_rate": 0.0002, "loss": 0.5376867651939392, "mean_token_accuracy": 0.7829112410545349, "num_tokens": 5470171.0, "step": 334 }, { "entropy": 0.5580787807703018, "epoch": 1.25, "grad_norm": 0.15255525708198547, "learning_rate": 0.0002, "loss": 0.5539383292198181, "mean_token_accuracy": 0.7776496410369873, "num_tokens": 5486721.0, "step": 335 }, { "entropy": 0.551739051938057, "epoch": 1.2537313432835822, "grad_norm": 0.14014676213264465, "learning_rate": 0.0002, "loss": 0.5544667840003967, "mean_token_accuracy": 0.7750911116600037, "num_tokens": 5502822.0, "step": 336 }, { "entropy": 0.5480811297893524, "epoch": 1.2574626865671643, "grad_norm": 0.1353754997253418, "learning_rate": 0.0002, "loss": 0.5507966876029968, "mean_token_accuracy": 0.7761414647102356, "num_tokens": 5519323.0, "step": 337 }, { "entropy": 0.5414211302995682, "epoch": 1.2611940298507462, "grad_norm": 0.1243680939078331, "learning_rate": 0.0002, "loss": 0.5453186631202698, "mean_token_accuracy": 0.7782161980867386, "num_tokens": 5535863.0, "step": 338 }, { "entropy": 0.527251847088337, "epoch": 1.2649253731343284, "grad_norm": 0.1459769904613495, "learning_rate": 0.0002, "loss": 0.5396205186843872, "mean_token_accuracy": 0.7795730829238892, "num_tokens": 5552171.0, "step": 339 }, { "entropy": 0.5239678472280502, "epoch": 1.2686567164179103, "grad_norm": 0.12427864223718643, "learning_rate": 0.0002, "loss": 0.5271449089050293, "mean_token_accuracy": 0.7882652282714844, "num_tokens": 5568175.0, "step": 340 }, { "entropy": 0.543644979596138, "epoch": 1.2723880597014925, "grad_norm": 0.11923787742853165, "learning_rate": 0.0002, "loss": 0.5382894277572632, "mean_token_accuracy": 0.7825156450271606, "num_tokens": 5584465.0, "step": 341 }, { "entropy": 0.5515155345201492, "epoch": 1.2761194029850746, "grad_norm": 0.11743160337209702, "learning_rate": 0.0002, "loss": 0.5425710082054138, "mean_token_accuracy": 0.7795869261026382, "num_tokens": 5601282.0, "step": 342 }, { "entropy": 0.556594654917717, "epoch": 1.2798507462686568, "grad_norm": 0.13206258416175842, "learning_rate": 0.0002, "loss": 0.553520679473877, "mean_token_accuracy": 0.7744052857160568, "num_tokens": 5617511.0, "step": 343 }, { "entropy": 0.5562093108892441, "epoch": 1.2835820895522387, "grad_norm": 0.1419561356306076, "learning_rate": 0.0002, "loss": 0.5573539733886719, "mean_token_accuracy": 0.7758442610502243, "num_tokens": 5634008.0, "step": 344 }, { "entropy": 0.5295949876308441, "epoch": 1.287313432835821, "grad_norm": 0.136697456240654, "learning_rate": 0.0002, "loss": 0.536439836025238, "mean_token_accuracy": 0.7857220619916916, "num_tokens": 5650510.0, "step": 345 }, { "entropy": 0.5379302501678467, "epoch": 1.291044776119403, "grad_norm": 0.12953169643878937, "learning_rate": 0.0002, "loss": 0.5420789122581482, "mean_token_accuracy": 0.7796627283096313, "num_tokens": 5667049.0, "step": 346 }, { "entropy": 0.5327381789684296, "epoch": 1.294776119402985, "grad_norm": 0.12574538588523865, "learning_rate": 0.0002, "loss": 0.5231812000274658, "mean_token_accuracy": 0.7879898250102997, "num_tokens": 5683103.0, "step": 347 }, { "entropy": 0.5485990345478058, "epoch": 1.2985074626865671, "grad_norm": 0.12788420915603638, "learning_rate": 0.0002, "loss": 0.5398032665252686, "mean_token_accuracy": 0.782793402671814, "num_tokens": 5699531.0, "step": 348 }, { "entropy": 0.533822700381279, "epoch": 1.3022388059701493, "grad_norm": 0.12131965160369873, "learning_rate": 0.0002, "loss": 0.5313589572906494, "mean_token_accuracy": 0.7867582440376282, "num_tokens": 5715578.0, "step": 349 }, { "entropy": 0.5322218984365463, "epoch": 1.3059701492537314, "grad_norm": 0.13636337220668793, "learning_rate": 0.0002, "loss": 0.5401290655136108, "mean_token_accuracy": 0.781011775135994, "num_tokens": 5731885.0, "step": 350 }, { "entropy": 0.5119979977607727, "epoch": 1.3097014925373134, "grad_norm": 0.1538715660572052, "learning_rate": 0.0002, "loss": 0.5197798013687134, "mean_token_accuracy": 0.787521705031395, "num_tokens": 5748165.0, "step": 351 }, { "entropy": 0.522780068218708, "epoch": 1.3134328358208955, "grad_norm": 0.16598650813102722, "learning_rate": 0.0002, "loss": 0.5323340892791748, "mean_token_accuracy": 0.7844688296318054, "num_tokens": 5764530.0, "step": 352 }, { "entropy": 0.5400198400020599, "epoch": 1.3171641791044777, "grad_norm": 0.13400353491306305, "learning_rate": 0.0002, "loss": 0.5443472266197205, "mean_token_accuracy": 0.7780963182449341, "num_tokens": 5780899.0, "step": 353 }, { "entropy": 0.556030884385109, "epoch": 1.3208955223880596, "grad_norm": 0.13756664097309113, "learning_rate": 0.0002, "loss": 0.5470365285873413, "mean_token_accuracy": 0.7808873951435089, "num_tokens": 5796973.0, "step": 354 }, { "entropy": 0.5455010533332825, "epoch": 1.3246268656716418, "grad_norm": 0.17140203714370728, "learning_rate": 0.0002, "loss": 0.534233808517456, "mean_token_accuracy": 0.7828006148338318, "num_tokens": 5813201.0, "step": 355 }, { "entropy": 0.5456499308347702, "epoch": 1.328358208955224, "grad_norm": 0.13772569596767426, "learning_rate": 0.0002, "loss": 0.5461813807487488, "mean_token_accuracy": 0.7786128669977188, "num_tokens": 5829457.0, "step": 356 }, { "entropy": 0.5223972797393799, "epoch": 1.332089552238806, "grad_norm": 0.22252066433429718, "learning_rate": 0.0002, "loss": 0.5330066084861755, "mean_token_accuracy": 0.7818692922592163, "num_tokens": 5845786.0, "step": 357 }, { "entropy": 0.5292713642120361, "epoch": 1.335820895522388, "grad_norm": 0.14202645421028137, "learning_rate": 0.0002, "loss": 0.5392715930938721, "mean_token_accuracy": 0.7805515229701996, "num_tokens": 5862226.0, "step": 358 }, { "entropy": 0.5300968736410141, "epoch": 1.3395522388059702, "grad_norm": 0.18332785367965698, "learning_rate": 0.0002, "loss": 0.5347115993499756, "mean_token_accuracy": 0.7835317403078079, "num_tokens": 5878683.0, "step": 359 }, { "entropy": 0.5431934744119644, "epoch": 1.3432835820895521, "grad_norm": 0.14532189071178436, "learning_rate": 0.0002, "loss": 0.5330429077148438, "mean_token_accuracy": 0.7804477661848068, "num_tokens": 5895049.0, "step": 360 }, { "entropy": 0.5435428023338318, "epoch": 1.3470149253731343, "grad_norm": 0.1675368696451187, "learning_rate": 0.0002, "loss": 0.5300995707511902, "mean_token_accuracy": 0.785721018910408, "num_tokens": 5911501.0, "step": 361 }, { "entropy": 0.5362260937690735, "epoch": 1.3507462686567164, "grad_norm": 0.12240255624055862, "learning_rate": 0.0002, "loss": 0.5256680846214294, "mean_token_accuracy": 0.7851513922214508, "num_tokens": 5927731.0, "step": 362 }, { "entropy": 0.5452938824892044, "epoch": 1.3544776119402986, "grad_norm": 0.15949903428554535, "learning_rate": 0.0002, "loss": 0.5495162010192871, "mean_token_accuracy": 0.7768245339393616, "num_tokens": 5944077.0, "step": 363 }, { "entropy": 0.5237463638186455, "epoch": 1.3582089552238805, "grad_norm": 0.2120627760887146, "learning_rate": 0.0002, "loss": 0.5346443057060242, "mean_token_accuracy": 0.7835520654916763, "num_tokens": 5960532.0, "step": 364 }, { "entropy": 0.5450356751680374, "epoch": 1.3619402985074627, "grad_norm": 0.12423616647720337, "learning_rate": 0.0002, "loss": 0.5510310530662537, "mean_token_accuracy": 0.7749469876289368, "num_tokens": 5976893.0, "step": 365 }, { "entropy": 0.5489538311958313, "epoch": 1.3656716417910448, "grad_norm": 0.17930445075035095, "learning_rate": 0.0002, "loss": 0.5512227416038513, "mean_token_accuracy": 0.7759018093347549, "num_tokens": 5993262.0, "step": 366 }, { "entropy": 0.5524207949638367, "epoch": 1.3694029850746268, "grad_norm": 0.12074736505746841, "learning_rate": 0.0002, "loss": 0.5450834631919861, "mean_token_accuracy": 0.7803297787904739, "num_tokens": 6009831.0, "step": 367 }, { "entropy": 0.5440987944602966, "epoch": 1.373134328358209, "grad_norm": 0.13452184200286865, "learning_rate": 0.0002, "loss": 0.5378953814506531, "mean_token_accuracy": 0.7820150256156921, "num_tokens": 6026331.0, "step": 368 }, { "entropy": 0.5413002520799637, "epoch": 1.376865671641791, "grad_norm": 0.1278562843799591, "learning_rate": 0.0002, "loss": 0.5359137654304504, "mean_token_accuracy": 0.783556342124939, "num_tokens": 6042945.0, "step": 369 }, { "entropy": 0.5525120049715042, "epoch": 1.3805970149253732, "grad_norm": 0.1208810955286026, "learning_rate": 0.0002, "loss": 0.5459328889846802, "mean_token_accuracy": 0.7781365811824799, "num_tokens": 6059427.0, "step": 370 }, { "entropy": 0.5276467949151993, "epoch": 1.3843283582089552, "grad_norm": 0.21167868375778198, "learning_rate": 0.0002, "loss": 0.5329975485801697, "mean_token_accuracy": 0.7855836153030396, "num_tokens": 6075868.0, "step": 371 }, { "entropy": 0.523284301161766, "epoch": 1.3880597014925373, "grad_norm": 0.13116827607154846, "learning_rate": 0.0002, "loss": 0.5309988260269165, "mean_token_accuracy": 0.7828356921672821, "num_tokens": 6092149.0, "step": 372 }, { "entropy": 0.5434711575508118, "epoch": 1.3917910447761195, "grad_norm": 0.3316550850868225, "learning_rate": 0.0002, "loss": 0.553439199924469, "mean_token_accuracy": 0.7766979038715363, "num_tokens": 6108567.0, "step": 373 }, { "entropy": 0.5287135094404221, "epoch": 1.3955223880597014, "grad_norm": 0.15037605166435242, "learning_rate": 0.0002, "loss": 0.5357441306114197, "mean_token_accuracy": 0.7817093282938004, "num_tokens": 6124527.0, "step": 374 }, { "entropy": 0.5508522838354111, "epoch": 1.3992537313432836, "grad_norm": 0.19524440169334412, "learning_rate": 0.0002, "loss": 0.5512291789054871, "mean_token_accuracy": 0.7776720374822617, "num_tokens": 6141075.0, "step": 375 }, { "entropy": 0.5336653590202332, "epoch": 1.4029850746268657, "grad_norm": 0.15542961657047272, "learning_rate": 0.0002, "loss": 0.5334641933441162, "mean_token_accuracy": 0.7813901156187057, "num_tokens": 6157438.0, "step": 376 }, { "entropy": 0.5536468476057053, "epoch": 1.4067164179104479, "grad_norm": 0.11985230445861816, "learning_rate": 0.0002, "loss": 0.5497922301292419, "mean_token_accuracy": 0.7766197621822357, "num_tokens": 6174052.0, "step": 377 }, { "entropy": 0.5455610156059265, "epoch": 1.4104477611940298, "grad_norm": 0.1377374231815338, "learning_rate": 0.0002, "loss": 0.5400494337081909, "mean_token_accuracy": 0.7812647223472595, "num_tokens": 6190741.0, "step": 378 }, { "entropy": 0.5355032831430435, "epoch": 1.414179104477612, "grad_norm": 0.12337534874677658, "learning_rate": 0.0002, "loss": 0.5313869118690491, "mean_token_accuracy": 0.7843705862760544, "num_tokens": 6207346.0, "step": 379 }, { "entropy": 0.5320865362882614, "epoch": 1.417910447761194, "grad_norm": 0.1453101485967636, "learning_rate": 0.0002, "loss": 0.5400369167327881, "mean_token_accuracy": 0.7805843502283096, "num_tokens": 6223644.0, "step": 380 }, { "entropy": 0.5373547524213791, "epoch": 1.421641791044776, "grad_norm": 0.19084329903125763, "learning_rate": 0.0002, "loss": 0.5499929785728455, "mean_token_accuracy": 0.7757923603057861, "num_tokens": 6239901.0, "step": 381 }, { "entropy": 0.5443465709686279, "epoch": 1.4253731343283582, "grad_norm": 0.11772217601537704, "learning_rate": 0.0002, "loss": 0.5418881773948669, "mean_token_accuracy": 0.7812986522912979, "num_tokens": 6256285.0, "step": 382 }, { "entropy": 0.5499950498342514, "epoch": 1.4291044776119404, "grad_norm": 0.1847136914730072, "learning_rate": 0.0002, "loss": 0.5488113760948181, "mean_token_accuracy": 0.7776869833469391, "num_tokens": 6272664.0, "step": 383 }, { "entropy": 0.5412472188472748, "epoch": 1.4328358208955223, "grad_norm": 0.1461949199438095, "learning_rate": 0.0002, "loss": 0.5365965366363525, "mean_token_accuracy": 0.7832726240158081, "num_tokens": 6289098.0, "step": 384 }, { "entropy": 0.5493346899747849, "epoch": 1.4365671641791045, "grad_norm": 0.17751483619213104, "learning_rate": 0.0002, "loss": 0.5465101003646851, "mean_token_accuracy": 0.7778099924325943, "num_tokens": 6305547.0, "step": 385 }, { "entropy": 0.5415252298116684, "epoch": 1.4402985074626866, "grad_norm": 0.13513009250164032, "learning_rate": 0.0002, "loss": 0.538934588432312, "mean_token_accuracy": 0.7832966297864914, "num_tokens": 6321844.0, "step": 386 }, { "entropy": 0.5470823347568512, "epoch": 1.4440298507462686, "grad_norm": 0.15616844594478607, "learning_rate": 0.0002, "loss": 0.5563836097717285, "mean_token_accuracy": 0.7730062156915665, "num_tokens": 6338401.0, "step": 387 }, { "entropy": 0.5151138752698898, "epoch": 1.4477611940298507, "grad_norm": 0.13514217734336853, "learning_rate": 0.0002, "loss": 0.5200275182723999, "mean_token_accuracy": 0.7898600101470947, "num_tokens": 6354762.0, "step": 388 }, { "entropy": 0.5174058377742767, "epoch": 1.4514925373134329, "grad_norm": 0.13703469932079315, "learning_rate": 0.0002, "loss": 0.5161208510398865, "mean_token_accuracy": 0.7918747067451477, "num_tokens": 6370840.0, "step": 389 }, { "entropy": 0.5557476729154587, "epoch": 1.455223880597015, "grad_norm": 0.11840767413377762, "learning_rate": 0.0002, "loss": 0.5515946745872498, "mean_token_accuracy": 0.7783915251493454, "num_tokens": 6387355.0, "step": 390 }, { "entropy": 0.5518558323383331, "epoch": 1.458955223880597, "grad_norm": 0.13202938437461853, "learning_rate": 0.0002, "loss": 0.5526413321495056, "mean_token_accuracy": 0.776582270860672, "num_tokens": 6403938.0, "step": 391 }, { "entropy": 0.5571378320455551, "epoch": 1.462686567164179, "grad_norm": 0.13269183039665222, "learning_rate": 0.0002, "loss": 0.5643842220306396, "mean_token_accuracy": 0.7722982317209244, "num_tokens": 6420250.0, "step": 392 }, { "entropy": 0.5537096560001373, "epoch": 1.4664179104477613, "grad_norm": 0.14151525497436523, "learning_rate": 0.0002, "loss": 0.553024411201477, "mean_token_accuracy": 0.7778746634721756, "num_tokens": 6436546.0, "step": 393 }, { "entropy": 0.5346309244632721, "epoch": 1.4701492537313432, "grad_norm": 0.13563434779644012, "learning_rate": 0.0002, "loss": 0.5249274969100952, "mean_token_accuracy": 0.7853583991527557, "num_tokens": 6453243.0, "step": 394 }, { "entropy": 0.5460333377122879, "epoch": 1.4738805970149254, "grad_norm": 0.14244568347930908, "learning_rate": 0.0002, "loss": 0.5472844243049622, "mean_token_accuracy": 0.7797000557184219, "num_tokens": 6469565.0, "step": 395 }, { "entropy": 0.5330733209848404, "epoch": 1.4776119402985075, "grad_norm": 0.15417160093784332, "learning_rate": 0.0002, "loss": 0.538681149482727, "mean_token_accuracy": 0.7821140140295029, "num_tokens": 6486038.0, "step": 396 }, { "entropy": 0.5275893434882164, "epoch": 1.4813432835820897, "grad_norm": 0.1634518802165985, "learning_rate": 0.0002, "loss": 0.5361412167549133, "mean_token_accuracy": 0.7828765362501144, "num_tokens": 6502376.0, "step": 397 }, { "entropy": 0.5401307940483093, "epoch": 1.4850746268656716, "grad_norm": 0.14567126333713531, "learning_rate": 0.0002, "loss": 0.5489403605461121, "mean_token_accuracy": 0.7781455963850021, "num_tokens": 6518668.0, "step": 398 }, { "entropy": 0.5669757276773453, "epoch": 1.4888059701492538, "grad_norm": 0.1354297697544098, "learning_rate": 0.0002, "loss": 0.5657601356506348, "mean_token_accuracy": 0.7712653428316116, "num_tokens": 6535182.0, "step": 399 }, { "entropy": 0.5363806635141373, "epoch": 1.4925373134328357, "grad_norm": 0.12377993017435074, "learning_rate": 0.0002, "loss": 0.529585599899292, "mean_token_accuracy": 0.7840481698513031, "num_tokens": 6551666.0, "step": 400 }, { "entropy": 0.5551501959562302, "epoch": 1.4962686567164178, "grad_norm": 0.14788372814655304, "learning_rate": 0.0002, "loss": 0.553497314453125, "mean_token_accuracy": 0.7757378667593002, "num_tokens": 6568256.0, "step": 401 }, { "entropy": 0.5353442132472992, "epoch": 1.5, "grad_norm": 0.12778371572494507, "learning_rate": 0.0002, "loss": 0.5333885550498962, "mean_token_accuracy": 0.7825479656457901, "num_tokens": 6584443.0, "step": 402 }, { "entropy": 0.5460584759712219, "epoch": 1.5037313432835822, "grad_norm": 0.1357504278421402, "learning_rate": 0.0002, "loss": 0.5496041774749756, "mean_token_accuracy": 0.7750886082649231, "num_tokens": 6600907.0, "step": 403 }, { "entropy": 0.5397640466690063, "epoch": 1.5074626865671643, "grad_norm": 0.13449276983737946, "learning_rate": 0.0002, "loss": 0.5374521017074585, "mean_token_accuracy": 0.783362939953804, "num_tokens": 6617309.0, "step": 404 }, { "entropy": 0.545674204826355, "epoch": 1.5111940298507462, "grad_norm": 0.12818823754787445, "learning_rate": 0.0002, "loss": 0.5414538383483887, "mean_token_accuracy": 0.7811758369207382, "num_tokens": 6633409.0, "step": 405 }, { "entropy": 0.5237551480531693, "epoch": 1.5149253731343284, "grad_norm": 0.1332634538412094, "learning_rate": 0.0002, "loss": 0.5288904905319214, "mean_token_accuracy": 0.7863495498895645, "num_tokens": 6649677.0, "step": 406 }, { "entropy": 0.5475018620491028, "epoch": 1.5186567164179103, "grad_norm": 0.1226048395037651, "learning_rate": 0.0002, "loss": 0.5457717180252075, "mean_token_accuracy": 0.7798316031694412, "num_tokens": 6665941.0, "step": 407 }, { "entropy": 0.5388360321521759, "epoch": 1.5223880597014925, "grad_norm": 0.11307930946350098, "learning_rate": 0.0002, "loss": 0.5332959294319153, "mean_token_accuracy": 0.7827007919549942, "num_tokens": 6682727.0, "step": 408 }, { "entropy": 0.5245520323514938, "epoch": 1.5261194029850746, "grad_norm": 0.13594341278076172, "learning_rate": 0.0002, "loss": 0.527988851070404, "mean_token_accuracy": 0.7841480374336243, "num_tokens": 6699061.0, "step": 409 }, { "entropy": 0.5443517565727234, "epoch": 1.5298507462686568, "grad_norm": 0.12875105440616608, "learning_rate": 0.0002, "loss": 0.5445384979248047, "mean_token_accuracy": 0.7800036072731018, "num_tokens": 6715276.0, "step": 410 }, { "entropy": 0.5312410593032837, "epoch": 1.533582089552239, "grad_norm": 0.14251653850078583, "learning_rate": 0.0002, "loss": 0.5363666415214539, "mean_token_accuracy": 0.7820229083299637, "num_tokens": 6731754.0, "step": 411 }, { "entropy": 0.5279273837804794, "epoch": 1.537313432835821, "grad_norm": 0.14002381265163422, "learning_rate": 0.0002, "loss": 0.533150851726532, "mean_token_accuracy": 0.7839628010988235, "num_tokens": 6748198.0, "step": 412 }, { "entropy": 0.5359641313552856, "epoch": 1.5410447761194028, "grad_norm": 0.12248595803976059, "learning_rate": 0.0002, "loss": 0.5377635359764099, "mean_token_accuracy": 0.7816402763128281, "num_tokens": 6764658.0, "step": 413 }, { "entropy": 0.5304668098688126, "epoch": 1.544776119402985, "grad_norm": 0.1455898880958557, "learning_rate": 0.0002, "loss": 0.527800440788269, "mean_token_accuracy": 0.7847253680229187, "num_tokens": 6780948.0, "step": 414 }, { "entropy": 0.5399336069822311, "epoch": 1.5485074626865671, "grad_norm": 0.1414983719587326, "learning_rate": 0.0002, "loss": 0.5367389917373657, "mean_token_accuracy": 0.7821487188339233, "num_tokens": 6797350.0, "step": 415 }, { "entropy": 0.5576040744781494, "epoch": 1.5522388059701493, "grad_norm": 0.12719132006168365, "learning_rate": 0.0002, "loss": 0.5524293780326843, "mean_token_accuracy": 0.7746585160493851, "num_tokens": 6813754.0, "step": 416 }, { "entropy": 0.5370134860277176, "epoch": 1.5559701492537314, "grad_norm": 0.1307905912399292, "learning_rate": 0.0002, "loss": 0.5359637141227722, "mean_token_accuracy": 0.7802634984254837, "num_tokens": 6829931.0, "step": 417 }, { "entropy": 0.5672536343336105, "epoch": 1.5597014925373134, "grad_norm": 0.14925286173820496, "learning_rate": 0.0002, "loss": 0.5706211924552917, "mean_token_accuracy": 0.7692793905735016, "num_tokens": 6846619.0, "step": 418 }, { "entropy": 0.5455258339643478, "epoch": 1.5634328358208955, "grad_norm": 0.13767075538635254, "learning_rate": 0.0002, "loss": 0.5497614145278931, "mean_token_accuracy": 0.7742694765329361, "num_tokens": 6862943.0, "step": 419 }, { "entropy": 0.5383682698011398, "epoch": 1.5671641791044775, "grad_norm": 0.14676761627197266, "learning_rate": 0.0002, "loss": 0.5352654457092285, "mean_token_accuracy": 0.7820954322814941, "num_tokens": 6879478.0, "step": 420 }, { "entropy": 0.5393406301736832, "epoch": 1.5708955223880596, "grad_norm": 0.14782963693141937, "learning_rate": 0.0002, "loss": 0.539406418800354, "mean_token_accuracy": 0.7811137288808823, "num_tokens": 6895819.0, "step": 421 }, { "entropy": 0.5472134947776794, "epoch": 1.5746268656716418, "grad_norm": 0.1328146755695343, "learning_rate": 0.0002, "loss": 0.5461377501487732, "mean_token_accuracy": 0.7797697186470032, "num_tokens": 6912305.0, "step": 422 }, { "entropy": 0.5397001504898071, "epoch": 1.578358208955224, "grad_norm": 0.12005209177732468, "learning_rate": 0.0002, "loss": 0.5396695137023926, "mean_token_accuracy": 0.7789896428585052, "num_tokens": 6928851.0, "step": 423 }, { "entropy": 0.5323083251714706, "epoch": 1.582089552238806, "grad_norm": 0.14206735789775848, "learning_rate": 0.0002, "loss": 0.5357058048248291, "mean_token_accuracy": 0.7814851403236389, "num_tokens": 6945117.0, "step": 424 }, { "entropy": 0.5220139473676682, "epoch": 1.585820895522388, "grad_norm": 0.13408760726451874, "learning_rate": 0.0002, "loss": 0.5282811522483826, "mean_token_accuracy": 0.7859802693128586, "num_tokens": 6961475.0, "step": 425 }, { "entropy": 0.5279606133699417, "epoch": 1.5895522388059702, "grad_norm": 0.1342962682247162, "learning_rate": 0.0002, "loss": 0.5310772061347961, "mean_token_accuracy": 0.7856840938329697, "num_tokens": 6977917.0, "step": 426 }, { "entropy": 0.5404426008462906, "epoch": 1.5932835820895521, "grad_norm": 0.11640056222677231, "learning_rate": 0.0002, "loss": 0.5350806713104248, "mean_token_accuracy": 0.7831773906946182, "num_tokens": 6994309.0, "step": 427 }, { "entropy": 0.546152800321579, "epoch": 1.5970149253731343, "grad_norm": 0.11648745834827423, "learning_rate": 0.0002, "loss": 0.5432876348495483, "mean_token_accuracy": 0.7806773632764816, "num_tokens": 7010651.0, "step": 428 }, { "entropy": 0.5330662578344345, "epoch": 1.6007462686567164, "grad_norm": 0.1201220154762268, "learning_rate": 0.0002, "loss": 0.5310200452804565, "mean_token_accuracy": 0.7844978868961334, "num_tokens": 7027129.0, "step": 429 }, { "entropy": 0.5318699181079865, "epoch": 1.6044776119402986, "grad_norm": 0.12328798323869705, "learning_rate": 0.0002, "loss": 0.5332854986190796, "mean_token_accuracy": 0.7820296734571457, "num_tokens": 7043492.0, "step": 430 }, { "entropy": 0.5330018848180771, "epoch": 1.6082089552238807, "grad_norm": 0.1538732498884201, "learning_rate": 0.0002, "loss": 0.5346086621284485, "mean_token_accuracy": 0.7841860204935074, "num_tokens": 7059825.0, "step": 431 }, { "entropy": 0.5369807183742523, "epoch": 1.6119402985074627, "grad_norm": 0.13523033261299133, "learning_rate": 0.0002, "loss": 0.543128490447998, "mean_token_accuracy": 0.779476061463356, "num_tokens": 7076083.0, "step": 432 }, { "entropy": 0.5597919672727585, "epoch": 1.6156716417910446, "grad_norm": 0.13593490421772003, "learning_rate": 0.0002, "loss": 0.56092369556427, "mean_token_accuracy": 0.7705628126859665, "num_tokens": 7092494.0, "step": 433 }, { "entropy": 0.5592869371175766, "epoch": 1.6194029850746268, "grad_norm": 0.13970784842967987, "learning_rate": 0.0002, "loss": 0.5588337182998657, "mean_token_accuracy": 0.7716414630413055, "num_tokens": 7108787.0, "step": 434 }, { "entropy": 0.5510755926370621, "epoch": 1.623134328358209, "grad_norm": 0.14515163004398346, "learning_rate": 0.0002, "loss": 0.5508431792259216, "mean_token_accuracy": 0.7757678478956223, "num_tokens": 7125326.0, "step": 435 }, { "entropy": 0.5493544340133667, "epoch": 1.626865671641791, "grad_norm": 0.13484683632850647, "learning_rate": 0.0002, "loss": 0.5357339382171631, "mean_token_accuracy": 0.7844331711530685, "num_tokens": 7141623.0, "step": 436 }, { "entropy": 0.5371888130903244, "epoch": 1.6305970149253732, "grad_norm": 0.12795639038085938, "learning_rate": 0.0002, "loss": 0.5337157249450684, "mean_token_accuracy": 0.7853695005178452, "num_tokens": 7158003.0, "step": 437 }, { "entropy": 0.5294598788022995, "epoch": 1.6343283582089554, "grad_norm": 0.13173329830169678, "learning_rate": 0.0002, "loss": 0.5329991579055786, "mean_token_accuracy": 0.7873143553733826, "num_tokens": 7174417.0, "step": 438 }, { "entropy": 0.5183067172765732, "epoch": 1.6380597014925373, "grad_norm": 0.14890097081661224, "learning_rate": 0.0002, "loss": 0.5276235938072205, "mean_token_accuracy": 0.7841698378324509, "num_tokens": 7190789.0, "step": 439 }, { "entropy": 0.5212598145008087, "epoch": 1.6417910447761193, "grad_norm": 0.1251063346862793, "learning_rate": 0.0002, "loss": 0.5228430032730103, "mean_token_accuracy": 0.7859450131654739, "num_tokens": 7207139.0, "step": 440 }, { "entropy": 0.5322405844926834, "epoch": 1.6455223880597014, "grad_norm": 0.13600069284439087, "learning_rate": 0.0002, "loss": 0.5263532996177673, "mean_token_accuracy": 0.7853893488645554, "num_tokens": 7223453.0, "step": 441 }, { "entropy": 0.5205891877412796, "epoch": 1.6492537313432836, "grad_norm": 0.13653913140296936, "learning_rate": 0.0002, "loss": 0.5208824872970581, "mean_token_accuracy": 0.7881260365247726, "num_tokens": 7240006.0, "step": 442 }, { "entropy": 0.5441347062587738, "epoch": 1.6529850746268657, "grad_norm": 0.14450038969516754, "learning_rate": 0.0002, "loss": 0.5436342358589172, "mean_token_accuracy": 0.7799146473407745, "num_tokens": 7256390.0, "step": 443 }, { "entropy": 0.5312005802989006, "epoch": 1.6567164179104479, "grad_norm": 0.12901286780834198, "learning_rate": 0.0002, "loss": 0.5335438847541809, "mean_token_accuracy": 0.78382308781147, "num_tokens": 7272830.0, "step": 444 }, { "entropy": 0.5523424595594406, "epoch": 1.6604477611940298, "grad_norm": 0.13704852759838104, "learning_rate": 0.0002, "loss": 0.5541114807128906, "mean_token_accuracy": 0.7756187319755554, "num_tokens": 7289085.0, "step": 445 }, { "entropy": 0.5462750494480133, "epoch": 1.664179104477612, "grad_norm": 0.1385122686624527, "learning_rate": 0.0002, "loss": 0.5408669114112854, "mean_token_accuracy": 0.7794688045978546, "num_tokens": 7305251.0, "step": 446 }, { "entropy": 0.5703910887241364, "epoch": 1.667910447761194, "grad_norm": 0.12344513088464737, "learning_rate": 0.0002, "loss": 0.5666346549987793, "mean_token_accuracy": 0.7705821841955185, "num_tokens": 7321796.0, "step": 447 }, { "entropy": 0.5504626631736755, "epoch": 1.671641791044776, "grad_norm": 0.12487871944904327, "learning_rate": 0.0002, "loss": 0.5492321848869324, "mean_token_accuracy": 0.7753137797117233, "num_tokens": 7338182.0, "step": 448 }, { "entropy": 0.5314936190843582, "epoch": 1.6753731343283582, "grad_norm": 0.1390916407108307, "learning_rate": 0.0002, "loss": 0.5342849493026733, "mean_token_accuracy": 0.7855862826108932, "num_tokens": 7354707.0, "step": 449 }, { "entropy": 0.5125585347414017, "epoch": 1.6791044776119404, "grad_norm": 0.13132618367671967, "learning_rate": 0.0002, "loss": 0.5202143788337708, "mean_token_accuracy": 0.7874000519514084, "num_tokens": 7370797.0, "step": 450 }, { "entropy": 0.5190107151865959, "epoch": 1.6828358208955225, "grad_norm": 0.15053601562976837, "learning_rate": 0.0002, "loss": 0.5218467116355896, "mean_token_accuracy": 0.7879750281572342, "num_tokens": 7387448.0, "step": 451 }, { "entropy": 0.5473128408193588, "epoch": 1.6865671641791045, "grad_norm": 0.14291800558567047, "learning_rate": 0.0002, "loss": 0.5459562540054321, "mean_token_accuracy": 0.7800840735435486, "num_tokens": 7403768.0, "step": 452 }, { "entropy": 0.5372306257486343, "epoch": 1.6902985074626866, "grad_norm": 0.14737331867218018, "learning_rate": 0.0002, "loss": 0.5391932725906372, "mean_token_accuracy": 0.7811848223209381, "num_tokens": 7420197.0, "step": 453 }, { "entropy": 0.5366326868534088, "epoch": 1.6940298507462686, "grad_norm": 0.13737186789512634, "learning_rate": 0.0002, "loss": 0.5392562747001648, "mean_token_accuracy": 0.7824465036392212, "num_tokens": 7436532.0, "step": 454 }, { "entropy": 0.5506515055894852, "epoch": 1.6977611940298507, "grad_norm": 0.15034589171409607, "learning_rate": 0.0002, "loss": 0.5501772165298462, "mean_token_accuracy": 0.7773263603448868, "num_tokens": 7452842.0, "step": 455 }, { "entropy": 0.5643105208873749, "epoch": 1.7014925373134329, "grad_norm": 0.14214570820331573, "learning_rate": 0.0002, "loss": 0.5492639541625977, "mean_token_accuracy": 0.7783908396959305, "num_tokens": 7469451.0, "step": 456 }, { "entropy": 0.5516497120261192, "epoch": 1.705223880597015, "grad_norm": 0.14590683579444885, "learning_rate": 0.0002, "loss": 0.5515267252922058, "mean_token_accuracy": 0.774686187505722, "num_tokens": 7485822.0, "step": 457 }, { "entropy": 0.5483950823545456, "epoch": 1.7089552238805972, "grad_norm": 0.15629805624485016, "learning_rate": 0.0002, "loss": 0.5422750115394592, "mean_token_accuracy": 0.7802471369504929, "num_tokens": 7502363.0, "step": 458 }, { "entropy": 0.5315360128879547, "epoch": 1.712686567164179, "grad_norm": 0.15466850996017456, "learning_rate": 0.0002, "loss": 0.5331098437309265, "mean_token_accuracy": 0.7842396944761276, "num_tokens": 7518672.0, "step": 459 }, { "entropy": 0.5366538316011429, "epoch": 1.716417910447761, "grad_norm": 0.15616163611412048, "learning_rate": 0.0002, "loss": 0.5455700755119324, "mean_token_accuracy": 0.7823781222105026, "num_tokens": 7534957.0, "step": 460 }, { "entropy": 0.5233009159564972, "epoch": 1.7201492537313432, "grad_norm": 0.1496264487504959, "learning_rate": 0.0002, "loss": 0.5298243761062622, "mean_token_accuracy": 0.7823347896337509, "num_tokens": 7551350.0, "step": 461 }, { "entropy": 0.5345755070447922, "epoch": 1.7238805970149254, "grad_norm": 0.15188711881637573, "learning_rate": 0.0002, "loss": 0.5339583158493042, "mean_token_accuracy": 0.7852912098169327, "num_tokens": 7567796.0, "step": 462 }, { "entropy": 0.525611899793148, "epoch": 1.7276119402985075, "grad_norm": 0.12338917702436447, "learning_rate": 0.0002, "loss": 0.5274109840393066, "mean_token_accuracy": 0.7858613133430481, "num_tokens": 7583895.0, "step": 463 }, { "entropy": 0.5306848883628845, "epoch": 1.7313432835820897, "grad_norm": 0.16974470019340515, "learning_rate": 0.0002, "loss": 0.5279258489608765, "mean_token_accuracy": 0.7865510731935501, "num_tokens": 7600124.0, "step": 464 }, { "entropy": 0.5408849269151688, "epoch": 1.7350746268656716, "grad_norm": 0.12648795545101166, "learning_rate": 0.0002, "loss": 0.5382460951805115, "mean_token_accuracy": 0.7846677452325821, "num_tokens": 7616438.0, "step": 465 }, { "entropy": 0.5429423898458481, "epoch": 1.7388059701492538, "grad_norm": 0.1650669425725937, "learning_rate": 0.0002, "loss": 0.549877941608429, "mean_token_accuracy": 0.7792258560657501, "num_tokens": 7632788.0, "step": 466 }, { "entropy": 0.5318955481052399, "epoch": 1.7425373134328357, "grad_norm": 0.12288089841604233, "learning_rate": 0.0002, "loss": 0.5323612093925476, "mean_token_accuracy": 0.7859359383583069, "num_tokens": 7649308.0, "step": 467 }, { "entropy": 0.548863410949707, "epoch": 1.7462686567164178, "grad_norm": 0.1326245218515396, "learning_rate": 0.0002, "loss": 0.5457996129989624, "mean_token_accuracy": 0.7799065709114075, "num_tokens": 7665793.0, "step": 468 }, { "entropy": 0.5389255881309509, "epoch": 1.75, "grad_norm": 0.12419410794973373, "learning_rate": 0.0002, "loss": 0.5312763452529907, "mean_token_accuracy": 0.7822507619857788, "num_tokens": 7682000.0, "step": 469 }, { "entropy": 0.5358720868825912, "epoch": 1.7537313432835822, "grad_norm": 0.13035476207733154, "learning_rate": 0.0002, "loss": 0.5321502685546875, "mean_token_accuracy": 0.7836209833621979, "num_tokens": 7698643.0, "step": 470 }, { "entropy": 0.5370121747255325, "epoch": 1.7574626865671643, "grad_norm": 0.1549667865037918, "learning_rate": 0.0002, "loss": 0.5385861396789551, "mean_token_accuracy": 0.7808156907558441, "num_tokens": 7714815.0, "step": 471 }, { "entropy": 0.5387648344039917, "epoch": 1.7611940298507462, "grad_norm": 0.14527052640914917, "learning_rate": 0.0002, "loss": 0.5470720529556274, "mean_token_accuracy": 0.7775331288576126, "num_tokens": 7731250.0, "step": 472 }, { "entropy": 0.5520026981830597, "epoch": 1.7649253731343284, "grad_norm": 0.19052588939666748, "learning_rate": 0.0002, "loss": 0.5578737854957581, "mean_token_accuracy": 0.7744869738817215, "num_tokens": 7747721.0, "step": 473 }, { "entropy": 0.5377953052520752, "epoch": 1.7686567164179103, "grad_norm": 0.13061052560806274, "learning_rate": 0.0002, "loss": 0.5413972735404968, "mean_token_accuracy": 0.7811722010374069, "num_tokens": 7763904.0, "step": 474 }, { "entropy": 0.5519908219575882, "epoch": 1.7723880597014925, "grad_norm": 0.1454058736562729, "learning_rate": 0.0002, "loss": 0.5414596796035767, "mean_token_accuracy": 0.7813711762428284, "num_tokens": 7780581.0, "step": 475 }, { "entropy": 0.5267625749111176, "epoch": 1.7761194029850746, "grad_norm": 0.1326485425233841, "learning_rate": 0.0002, "loss": 0.5213202834129333, "mean_token_accuracy": 0.7871652394533157, "num_tokens": 7796973.0, "step": 476 }, { "entropy": 0.553408294916153, "epoch": 1.7798507462686568, "grad_norm": 0.13312950730323792, "learning_rate": 0.0002, "loss": 0.5529948472976685, "mean_token_accuracy": 0.7743393182754517, "num_tokens": 7813279.0, "step": 477 }, { "entropy": 0.553880587220192, "epoch": 1.783582089552239, "grad_norm": 0.16114220023155212, "learning_rate": 0.0002, "loss": 0.5641807317733765, "mean_token_accuracy": 0.7722779810428619, "num_tokens": 7829823.0, "step": 478 }, { "entropy": 0.5241200774908066, "epoch": 1.787313432835821, "grad_norm": 0.15040791034698486, "learning_rate": 0.0002, "loss": 0.5346534252166748, "mean_token_accuracy": 0.7823406606912613, "num_tokens": 7845983.0, "step": 479 }, { "entropy": 0.5474425554275513, "epoch": 1.7910447761194028, "grad_norm": 0.13473069667816162, "learning_rate": 0.0002, "loss": 0.5514643788337708, "mean_token_accuracy": 0.775032564997673, "num_tokens": 7862179.0, "step": 480 }, { "entropy": 0.5494029968976974, "epoch": 1.794776119402985, "grad_norm": 0.14377883076667786, "learning_rate": 0.0002, "loss": 0.5433907508850098, "mean_token_accuracy": 0.7781640440225601, "num_tokens": 7878779.0, "step": 481 }, { "entropy": 0.5409138202667236, "epoch": 1.7985074626865671, "grad_norm": 0.14134465157985687, "learning_rate": 0.0002, "loss": 0.5372306704521179, "mean_token_accuracy": 0.7832998037338257, "num_tokens": 7895136.0, "step": 482 }, { "entropy": 0.5516301095485687, "epoch": 1.8022388059701493, "grad_norm": 0.13915129005908966, "learning_rate": 0.0002, "loss": 0.5529888272285461, "mean_token_accuracy": 0.7746001183986664, "num_tokens": 7911482.0, "step": 483 }, { "entropy": 0.5409607142210007, "epoch": 1.8059701492537314, "grad_norm": 0.1552349179983139, "learning_rate": 0.0002, "loss": 0.5396745204925537, "mean_token_accuracy": 0.7830557972192764, "num_tokens": 7927769.0, "step": 484 }, { "entropy": 0.5268412679433823, "epoch": 1.8097014925373134, "grad_norm": 0.16648107767105103, "learning_rate": 0.0002, "loss": 0.5397533178329468, "mean_token_accuracy": 0.782973125576973, "num_tokens": 7944237.0, "step": 485 }, { "entropy": 0.5383498221635818, "epoch": 1.8134328358208955, "grad_norm": 0.1299259066581726, "learning_rate": 0.0002, "loss": 0.5412971377372742, "mean_token_accuracy": 0.7789154797792435, "num_tokens": 7960404.0, "step": 486 }, { "entropy": 0.5497616678476334, "epoch": 1.8171641791044775, "grad_norm": 0.1571415513753891, "learning_rate": 0.0002, "loss": 0.5444965362548828, "mean_token_accuracy": 0.7790942490100861, "num_tokens": 7976843.0, "step": 487 }, { "entropy": 0.5411071628332138, "epoch": 1.8208955223880596, "grad_norm": 0.12472257018089294, "learning_rate": 0.0002, "loss": 0.5377678275108337, "mean_token_accuracy": 0.7812906056642532, "num_tokens": 7993308.0, "step": 488 }, { "entropy": 0.5332149565219879, "epoch": 1.8246268656716418, "grad_norm": 0.14515501260757446, "learning_rate": 0.0002, "loss": 0.532054603099823, "mean_token_accuracy": 0.7860440015792847, "num_tokens": 8009749.0, "step": 489 }, { "entropy": 0.5376683920621872, "epoch": 1.828358208955224, "grad_norm": 0.1362919807434082, "learning_rate": 0.0002, "loss": 0.5361682772636414, "mean_token_accuracy": 0.7828832864761353, "num_tokens": 8026107.0, "step": 490 }, { "entropy": 0.541684627532959, "epoch": 1.832089552238806, "grad_norm": 0.1390708088874817, "learning_rate": 0.0002, "loss": 0.5428534746170044, "mean_token_accuracy": 0.7796362638473511, "num_tokens": 8042519.0, "step": 491 }, { "entropy": 0.5491971075534821, "epoch": 1.835820895522388, "grad_norm": 0.18899311125278473, "learning_rate": 0.0002, "loss": 0.5468783378601074, "mean_token_accuracy": 0.7760737091302872, "num_tokens": 8058733.0, "step": 492 }, { "entropy": 0.5467192232608795, "epoch": 1.8395522388059702, "grad_norm": 0.12224384397268295, "learning_rate": 0.0002, "loss": 0.5412194728851318, "mean_token_accuracy": 0.7836457341909409, "num_tokens": 8075111.0, "step": 493 }, { "entropy": 0.5190225690603256, "epoch": 1.8432835820895521, "grad_norm": 0.17859016358852386, "learning_rate": 0.0002, "loss": 0.5287451148033142, "mean_token_accuracy": 0.7872583419084549, "num_tokens": 8091539.0, "step": 494 }, { "entropy": 0.5457055866718292, "epoch": 1.8470149253731343, "grad_norm": 0.14652208983898163, "learning_rate": 0.0002, "loss": 0.5511422157287598, "mean_token_accuracy": 0.7764985859394073, "num_tokens": 8107924.0, "step": 495 }, { "entropy": 0.5412308424711227, "epoch": 1.8507462686567164, "grad_norm": 0.14928752183914185, "learning_rate": 0.0002, "loss": 0.5386866331100464, "mean_token_accuracy": 0.7840718477964401, "num_tokens": 8124327.0, "step": 496 }, { "entropy": 0.5487564355134964, "epoch": 1.8544776119402986, "grad_norm": 0.14009299874305725, "learning_rate": 0.0002, "loss": 0.5402563810348511, "mean_token_accuracy": 0.781055673956871, "num_tokens": 8140629.0, "step": 497 }, { "entropy": 0.5530242621898651, "epoch": 1.8582089552238807, "grad_norm": 0.13880518078804016, "learning_rate": 0.0002, "loss": 0.5397564172744751, "mean_token_accuracy": 0.7810083031654358, "num_tokens": 8157176.0, "step": 498 }, { "entropy": 0.5339633226394653, "epoch": 1.8619402985074627, "grad_norm": 0.16541644930839539, "learning_rate": 0.0002, "loss": 0.5336776971817017, "mean_token_accuracy": 0.7829927057027817, "num_tokens": 8173382.0, "step": 499 }, { "entropy": 0.5558539777994156, "epoch": 1.8656716417910446, "grad_norm": 0.15278875827789307, "learning_rate": 0.0002, "loss": 0.5627698302268982, "mean_token_accuracy": 0.7725099176168442, "num_tokens": 8189820.0, "step": 500 }, { "entropy": 0.5367425978183746, "epoch": 1.8694029850746268, "grad_norm": 0.15401561558246613, "learning_rate": 0.0002, "loss": 0.546620786190033, "mean_token_accuracy": 0.7765664905309677, "num_tokens": 8205989.0, "step": 501 }, { "entropy": 0.5408999174833298, "epoch": 1.873134328358209, "grad_norm": 0.13051092624664307, "learning_rate": 0.0002, "loss": 0.5466805696487427, "mean_token_accuracy": 0.7781471610069275, "num_tokens": 8222509.0, "step": 502 }, { "entropy": 0.5321147739887238, "epoch": 1.876865671641791, "grad_norm": 0.13755947351455688, "learning_rate": 0.0002, "loss": 0.527456521987915, "mean_token_accuracy": 0.7872339636087418, "num_tokens": 8238911.0, "step": 503 }, { "entropy": 0.5611546188592911, "epoch": 1.8805970149253732, "grad_norm": 0.13492627441883087, "learning_rate": 0.0002, "loss": 0.548973798751831, "mean_token_accuracy": 0.7786827385425568, "num_tokens": 8255331.0, "step": 504 }, { "entropy": 0.5648814886808395, "epoch": 1.8843283582089554, "grad_norm": 0.13315370678901672, "learning_rate": 0.0002, "loss": 0.5626882314682007, "mean_token_accuracy": 0.7693315893411636, "num_tokens": 8271717.0, "step": 505 }, { "entropy": 0.528036579489708, "epoch": 1.8880597014925373, "grad_norm": 0.13826221227645874, "learning_rate": 0.0002, "loss": 0.5317479372024536, "mean_token_accuracy": 0.7865342795848846, "num_tokens": 8287916.0, "step": 506 }, { "entropy": 0.5300939381122589, "epoch": 1.8917910447761193, "grad_norm": 0.14022263884544373, "learning_rate": 0.0002, "loss": 0.5405997633934021, "mean_token_accuracy": 0.7812036871910095, "num_tokens": 8304453.0, "step": 507 }, { "entropy": 0.52273790538311, "epoch": 1.8955223880597014, "grad_norm": 0.1394582986831665, "learning_rate": 0.0002, "loss": 0.526207685470581, "mean_token_accuracy": 0.7882105112075806, "num_tokens": 8320635.0, "step": 508 }, { "entropy": 0.5376584082841873, "epoch": 1.8992537313432836, "grad_norm": 0.16204339265823364, "learning_rate": 0.0002, "loss": 0.5367757678031921, "mean_token_accuracy": 0.7841790616512299, "num_tokens": 8337016.0, "step": 509 }, { "entropy": 0.5457427948713303, "epoch": 1.9029850746268657, "grad_norm": 0.13758644461631775, "learning_rate": 0.0002, "loss": 0.5404728651046753, "mean_token_accuracy": 0.7789884358644485, "num_tokens": 8353374.0, "step": 510 }, { "entropy": 0.5548366904258728, "epoch": 1.9067164179104479, "grad_norm": 0.15079155564308167, "learning_rate": 0.0002, "loss": 0.5460405349731445, "mean_token_accuracy": 0.7766790390014648, "num_tokens": 8369864.0, "step": 511 }, { "entropy": 0.5432726740837097, "epoch": 1.9104477611940298, "grad_norm": 0.14672084152698517, "learning_rate": 0.0002, "loss": 0.5391443371772766, "mean_token_accuracy": 0.7813593149185181, "num_tokens": 8386310.0, "step": 512 }, { "entropy": 0.5469253212213516, "epoch": 1.914179104477612, "grad_norm": 0.12065178155899048, "learning_rate": 0.0002, "loss": 0.5509493350982666, "mean_token_accuracy": 0.7752728313207626, "num_tokens": 8402902.0, "step": 513 }, { "entropy": 0.5332511216402054, "epoch": 1.917910447761194, "grad_norm": 0.13797524571418762, "learning_rate": 0.0002, "loss": 0.5396395325660706, "mean_token_accuracy": 0.783454567193985, "num_tokens": 8418969.0, "step": 514 }, { "entropy": 0.5430255383253098, "epoch": 1.921641791044776, "grad_norm": 0.15779103338718414, "learning_rate": 0.0002, "loss": 0.5497632026672363, "mean_token_accuracy": 0.776575118303299, "num_tokens": 8435342.0, "step": 515 }, { "entropy": 0.541492372751236, "epoch": 1.9253731343283582, "grad_norm": 0.14993441104888916, "learning_rate": 0.0002, "loss": 0.5440635085105896, "mean_token_accuracy": 0.779094398021698, "num_tokens": 8451438.0, "step": 516 }, { "entropy": 0.5484725534915924, "epoch": 1.9291044776119404, "grad_norm": 0.12014457583427429, "learning_rate": 0.0002, "loss": 0.5494801998138428, "mean_token_accuracy": 0.7743937969207764, "num_tokens": 8467793.0, "step": 517 }, { "entropy": 0.5424629300832748, "epoch": 1.9328358208955225, "grad_norm": 0.1372799575328827, "learning_rate": 0.0002, "loss": 0.5402990579605103, "mean_token_accuracy": 0.7788502424955368, "num_tokens": 8484069.0, "step": 518 }, { "entropy": 0.544426254928112, "epoch": 1.9365671641791045, "grad_norm": 0.12580935657024384, "learning_rate": 0.0002, "loss": 0.5430607199668884, "mean_token_accuracy": 0.7801959961652756, "num_tokens": 8500603.0, "step": 519 }, { "entropy": 0.5405134111642838, "epoch": 1.9402985074626866, "grad_norm": 0.13943250477313995, "learning_rate": 0.0002, "loss": 0.5387794971466064, "mean_token_accuracy": 0.7797143012285233, "num_tokens": 8516792.0, "step": 520 }, { "entropy": 0.5363973081111908, "epoch": 1.9440298507462686, "grad_norm": 0.15255886316299438, "learning_rate": 0.0002, "loss": 0.5392638444900513, "mean_token_accuracy": 0.778968557715416, "num_tokens": 8533178.0, "step": 521 }, { "entropy": 0.5569429993629456, "epoch": 1.9477611940298507, "grad_norm": 0.14009712636470795, "learning_rate": 0.0002, "loss": 0.5554465055465698, "mean_token_accuracy": 0.7732362002134323, "num_tokens": 8549795.0, "step": 522 }, { "entropy": 0.560676708817482, "epoch": 1.9514925373134329, "grad_norm": 0.1429370492696762, "learning_rate": 0.0002, "loss": 0.5586832761764526, "mean_token_accuracy": 0.7744071185588837, "num_tokens": 8566708.0, "step": 523 }, { "entropy": 0.5566927641630173, "epoch": 1.955223880597015, "grad_norm": 0.1273992359638214, "learning_rate": 0.0002, "loss": 0.5483277440071106, "mean_token_accuracy": 0.7761266380548477, "num_tokens": 8582993.0, "step": 524 }, { "entropy": 0.5535138845443726, "epoch": 1.9589552238805972, "grad_norm": 0.15844318270683289, "learning_rate": 0.0002, "loss": 0.5520558953285217, "mean_token_accuracy": 0.7790683060884476, "num_tokens": 8599225.0, "step": 525 }, { "entropy": 0.5255821049213409, "epoch": 1.962686567164179, "grad_norm": 0.1505620777606964, "learning_rate": 0.0002, "loss": 0.5302370190620422, "mean_token_accuracy": 0.7846137434244156, "num_tokens": 8615790.0, "step": 526 }, { "entropy": 0.5364990532398224, "epoch": 1.966417910447761, "grad_norm": 0.18815594911575317, "learning_rate": 0.0002, "loss": 0.5442203283309937, "mean_token_accuracy": 0.7792959064245224, "num_tokens": 8632007.0, "step": 527 }, { "entropy": 0.5499100834131241, "epoch": 1.9701492537313432, "grad_norm": 0.12838681042194366, "learning_rate": 0.0002, "loss": 0.5423155426979065, "mean_token_accuracy": 0.77956822514534, "num_tokens": 8648517.0, "step": 528 }, { "entropy": 0.5600726753473282, "epoch": 1.9738805970149254, "grad_norm": 0.13670910894870758, "learning_rate": 0.0002, "loss": 0.5591787695884705, "mean_token_accuracy": 0.7713638991117477, "num_tokens": 8665136.0, "step": 529 }, { "entropy": 0.5376773029565811, "epoch": 1.9776119402985075, "grad_norm": 0.12114886194467545, "learning_rate": 0.0002, "loss": 0.5407379865646362, "mean_token_accuracy": 0.7814544290304184, "num_tokens": 8681529.0, "step": 530 }, { "entropy": 0.5403832793235779, "epoch": 1.9813432835820897, "grad_norm": 0.13908495008945465, "learning_rate": 0.0002, "loss": 0.5482066869735718, "mean_token_accuracy": 0.777704581618309, "num_tokens": 8697730.0, "step": 531 }, { "entropy": 0.5356862396001816, "epoch": 1.9850746268656716, "grad_norm": 0.13925939798355103, "learning_rate": 0.0002, "loss": 0.5371193289756775, "mean_token_accuracy": 0.783266693353653, "num_tokens": 8714219.0, "step": 532 }, { "entropy": 0.5331960469484329, "epoch": 1.9888059701492538, "grad_norm": 0.15995416045188904, "learning_rate": 0.0002, "loss": 0.5319101810455322, "mean_token_accuracy": 0.7843216061592102, "num_tokens": 8730525.0, "step": 533 }, { "entropy": 0.5409343987703323, "epoch": 1.9925373134328357, "grad_norm": 0.1330004185438156, "learning_rate": 0.0002, "loss": 0.5445230603218079, "mean_token_accuracy": 0.7773614227771759, "num_tokens": 8746950.0, "step": 534 }, { "entropy": 0.5394200682640076, "epoch": 1.9962686567164178, "grad_norm": 0.14103004336357117, "learning_rate": 0.0002, "loss": 0.5359162092208862, "mean_token_accuracy": 0.785576581954956, "num_tokens": 8763337.0, "step": 535 }, { "entropy": 0.5349156558513641, "epoch": 2.0, "grad_norm": 0.12837927043437958, "learning_rate": 0.0002, "loss": 0.5329214334487915, "mean_token_accuracy": 0.785938173532486, "num_tokens": 8779938.0, "step": 536 }, { "entropy": 0.5407280772924423, "epoch": 2.003731343283582, "grad_norm": 0.14622488617897034, "learning_rate": 0.0002, "loss": 0.5321956872940063, "mean_token_accuracy": 0.7852865755558014, "num_tokens": 8796464.0, "step": 537 }, { "entropy": 0.5337665975093842, "epoch": 2.0074626865671643, "grad_norm": 0.16594251990318298, "learning_rate": 0.0002, "loss": 0.5266042351722717, "mean_token_accuracy": 0.7868293672800064, "num_tokens": 8812777.0, "step": 538 }, { "entropy": 0.5268868803977966, "epoch": 2.0111940298507465, "grad_norm": 0.15608331561088562, "learning_rate": 0.0002, "loss": 0.5311114192008972, "mean_token_accuracy": 0.7839187681674957, "num_tokens": 8829112.0, "step": 539 }, { "entropy": 0.527610257267952, "epoch": 2.014925373134328, "grad_norm": 0.13121342658996582, "learning_rate": 0.0002, "loss": 0.5283110737800598, "mean_token_accuracy": 0.7851767688989639, "num_tokens": 8845686.0, "step": 540 }, { "entropy": 0.5114267989993095, "epoch": 2.0186567164179103, "grad_norm": 0.15982377529144287, "learning_rate": 0.0002, "loss": 0.5138009190559387, "mean_token_accuracy": 0.7923145592212677, "num_tokens": 8862042.0, "step": 541 }, { "entropy": 0.5179557651281357, "epoch": 2.0223880597014925, "grad_norm": 0.15685375034809113, "learning_rate": 0.0002, "loss": 0.5175086855888367, "mean_token_accuracy": 0.790000781416893, "num_tokens": 8878269.0, "step": 542 }, { "entropy": 0.5284497290849686, "epoch": 2.0261194029850746, "grad_norm": 0.155994713306427, "learning_rate": 0.0002, "loss": 0.5248953104019165, "mean_token_accuracy": 0.7887215316295624, "num_tokens": 8894744.0, "step": 543 }, { "entropy": 0.5114204958081245, "epoch": 2.029850746268657, "grad_norm": 0.1587519645690918, "learning_rate": 0.0002, "loss": 0.5146663784980774, "mean_token_accuracy": 0.7908709943294525, "num_tokens": 8911019.0, "step": 544 }, { "entropy": 0.5258788168430328, "epoch": 2.033582089552239, "grad_norm": 0.17405946552753448, "learning_rate": 0.0002, "loss": 0.5257717967033386, "mean_token_accuracy": 0.7857701331377029, "num_tokens": 8927423.0, "step": 545 }, { "entropy": 0.5308232307434082, "epoch": 2.0373134328358207, "grad_norm": 0.16010001301765442, "learning_rate": 0.0002, "loss": 0.5299814343452454, "mean_token_accuracy": 0.7874948382377625, "num_tokens": 8943802.0, "step": 546 }, { "entropy": 0.516572117805481, "epoch": 2.041044776119403, "grad_norm": 0.16816852986812592, "learning_rate": 0.0002, "loss": 0.5154708623886108, "mean_token_accuracy": 0.7876496762037277, "num_tokens": 8959993.0, "step": 547 }, { "entropy": 0.5281299874186516, "epoch": 2.044776119402985, "grad_norm": 0.14758102595806122, "learning_rate": 0.0002, "loss": 0.524406373500824, "mean_token_accuracy": 0.7861409038305283, "num_tokens": 8976245.0, "step": 548 }, { "entropy": 0.5246195495128632, "epoch": 2.048507462686567, "grad_norm": 0.16330084204673767, "learning_rate": 0.0002, "loss": 0.5244280099868774, "mean_token_accuracy": 0.7878082692623138, "num_tokens": 8992638.0, "step": 549 }, { "entropy": 0.514888346195221, "epoch": 2.0522388059701493, "grad_norm": 0.1649155467748642, "learning_rate": 0.0002, "loss": 0.5206322073936462, "mean_token_accuracy": 0.7888449877500534, "num_tokens": 9008736.0, "step": 550 }, { "entropy": 0.5066314935684204, "epoch": 2.0559701492537314, "grad_norm": 0.1575276404619217, "learning_rate": 0.0002, "loss": 0.5027191042900085, "mean_token_accuracy": 0.7947296053171158, "num_tokens": 9025125.0, "step": 551 }, { "entropy": 0.5268809348344803, "epoch": 2.0597014925373136, "grad_norm": 0.1932123601436615, "learning_rate": 0.0002, "loss": 0.526970386505127, "mean_token_accuracy": 0.7861645221710205, "num_tokens": 9041360.0, "step": 552 }, { "entropy": 0.5089156553149223, "epoch": 2.0634328358208953, "grad_norm": 0.17611229419708252, "learning_rate": 0.0002, "loss": 0.5170955061912537, "mean_token_accuracy": 0.7898762077093124, "num_tokens": 9057425.0, "step": 553 }, { "entropy": 0.5314554125070572, "epoch": 2.0671641791044775, "grad_norm": 0.16261620819568634, "learning_rate": 0.0002, "loss": 0.5317267775535583, "mean_token_accuracy": 0.7857931405305862, "num_tokens": 9073634.0, "step": 554 }, { "entropy": 0.5275600850582123, "epoch": 2.0708955223880596, "grad_norm": 0.1528756469488144, "learning_rate": 0.0002, "loss": 0.5216519832611084, "mean_token_accuracy": 0.784853920340538, "num_tokens": 9090072.0, "step": 555 }, { "entropy": 0.533121645450592, "epoch": 2.074626865671642, "grad_norm": 0.15978476405143738, "learning_rate": 0.0002, "loss": 0.5330748558044434, "mean_token_accuracy": 0.7852211892604828, "num_tokens": 9106310.0, "step": 556 }, { "entropy": 0.5289422124624252, "epoch": 2.078358208955224, "grad_norm": 0.18613378703594208, "learning_rate": 0.0002, "loss": 0.5246477127075195, "mean_token_accuracy": 0.7871279567480087, "num_tokens": 9122599.0, "step": 557 }, { "entropy": 0.5288784801959991, "epoch": 2.082089552238806, "grad_norm": 0.19494648277759552, "learning_rate": 0.0002, "loss": 0.5310162305831909, "mean_token_accuracy": 0.783275917172432, "num_tokens": 9138955.0, "step": 558 }, { "entropy": 0.5063241422176361, "epoch": 2.0858208955223883, "grad_norm": 0.17457328736782074, "learning_rate": 0.0002, "loss": 0.5103744268417358, "mean_token_accuracy": 0.7956038117408752, "num_tokens": 9155471.0, "step": 559 }, { "entropy": 0.5165305808186531, "epoch": 2.08955223880597, "grad_norm": 0.16135407984256744, "learning_rate": 0.0002, "loss": 0.5219785571098328, "mean_token_accuracy": 0.7876863032579422, "num_tokens": 9171894.0, "step": 560 }, { "entropy": 0.5188902914524078, "epoch": 2.093283582089552, "grad_norm": 0.16337014734745026, "learning_rate": 0.0002, "loss": 0.516549825668335, "mean_token_accuracy": 0.7918221950531006, "num_tokens": 9188463.0, "step": 561 }, { "entropy": 0.513557106256485, "epoch": 2.0970149253731343, "grad_norm": 0.1818535476922989, "learning_rate": 0.0002, "loss": 0.506076991558075, "mean_token_accuracy": 0.7936830073595047, "num_tokens": 9204870.0, "step": 562 }, { "entropy": 0.5341264307498932, "epoch": 2.1007462686567164, "grad_norm": 0.1677771359682083, "learning_rate": 0.0002, "loss": 0.530627965927124, "mean_token_accuracy": 0.7831838876008987, "num_tokens": 9221094.0, "step": 563 }, { "entropy": 0.5140577107667923, "epoch": 2.1044776119402986, "grad_norm": 0.17054656147956848, "learning_rate": 0.0002, "loss": 0.5144332647323608, "mean_token_accuracy": 0.7923785746097565, "num_tokens": 9237391.0, "step": 564 }, { "entropy": 0.497653529047966, "epoch": 2.1082089552238807, "grad_norm": 0.18110354244709015, "learning_rate": 0.0002, "loss": 0.5102217197418213, "mean_token_accuracy": 0.7931897193193436, "num_tokens": 9253611.0, "step": 565 }, { "entropy": 0.524284727871418, "epoch": 2.111940298507463, "grad_norm": 0.2005971521139145, "learning_rate": 0.0002, "loss": 0.5303030014038086, "mean_token_accuracy": 0.7885997593402863, "num_tokens": 9269952.0, "step": 566 }, { "entropy": 0.5399997532367706, "epoch": 2.1156716417910446, "grad_norm": 0.1460496038198471, "learning_rate": 0.0002, "loss": 0.5352809429168701, "mean_token_accuracy": 0.7851102352142334, "num_tokens": 9286381.0, "step": 567 }, { "entropy": 0.5403535813093185, "epoch": 2.1194029850746268, "grad_norm": 0.2164795845746994, "learning_rate": 0.0002, "loss": 0.5310791730880737, "mean_token_accuracy": 0.7864344716072083, "num_tokens": 9302619.0, "step": 568 }, { "entropy": 0.5281778201460838, "epoch": 2.123134328358209, "grad_norm": 0.14520607888698578, "learning_rate": 0.0002, "loss": 0.5214827060699463, "mean_token_accuracy": 0.7891172915697098, "num_tokens": 9319199.0, "step": 569 }, { "entropy": 0.5376487374305725, "epoch": 2.126865671641791, "grad_norm": 0.20075996220111847, "learning_rate": 0.0002, "loss": 0.5414179563522339, "mean_token_accuracy": 0.7825666964054108, "num_tokens": 9335645.0, "step": 570 }, { "entropy": 0.544133722782135, "epoch": 2.1305970149253732, "grad_norm": 0.17108148336410522, "learning_rate": 0.0002, "loss": 0.5474769473075867, "mean_token_accuracy": 0.778696671128273, "num_tokens": 9352250.0, "step": 571 }, { "entropy": 0.5139511153101921, "epoch": 2.1343283582089554, "grad_norm": 0.20305298268795013, "learning_rate": 0.0002, "loss": 0.5138852000236511, "mean_token_accuracy": 0.7916316092014313, "num_tokens": 9368581.0, "step": 572 }, { "entropy": 0.5336194783449173, "epoch": 2.138059701492537, "grad_norm": 0.17313581705093384, "learning_rate": 0.0002, "loss": 0.5371931195259094, "mean_token_accuracy": 0.7810296416282654, "num_tokens": 9385005.0, "step": 573 }, { "entropy": 0.5428188145160675, "epoch": 2.1417910447761193, "grad_norm": 0.18904267251491547, "learning_rate": 0.0002, "loss": 0.5414341688156128, "mean_token_accuracy": 0.7817030698060989, "num_tokens": 9401264.0, "step": 574 }, { "entropy": 0.5036500468850136, "epoch": 2.1455223880597014, "grad_norm": 0.16260603070259094, "learning_rate": 0.0002, "loss": 0.5049091577529907, "mean_token_accuracy": 0.7955426573753357, "num_tokens": 9417452.0, "step": 575 }, { "entropy": 0.5125822275876999, "epoch": 2.1492537313432836, "grad_norm": 0.18752527236938477, "learning_rate": 0.0002, "loss": 0.520676851272583, "mean_token_accuracy": 0.787801519036293, "num_tokens": 9433830.0, "step": 576 }, { "entropy": 0.5220265239477158, "epoch": 2.1529850746268657, "grad_norm": 0.17956171929836273, "learning_rate": 0.0002, "loss": 0.5259777903556824, "mean_token_accuracy": 0.7890594154596329, "num_tokens": 9449942.0, "step": 577 }, { "entropy": 0.5411542505025864, "epoch": 2.156716417910448, "grad_norm": 0.16276296973228455, "learning_rate": 0.0002, "loss": 0.5392127633094788, "mean_token_accuracy": 0.7827239036560059, "num_tokens": 9466361.0, "step": 578 }, { "entropy": 0.5376486927270889, "epoch": 2.16044776119403, "grad_norm": 0.18284423649311066, "learning_rate": 0.0002, "loss": 0.5354690551757812, "mean_token_accuracy": 0.7847119867801666, "num_tokens": 9482738.0, "step": 579 }, { "entropy": 0.527974009513855, "epoch": 2.1641791044776117, "grad_norm": 0.15606842935085297, "learning_rate": 0.0002, "loss": 0.5216515064239502, "mean_token_accuracy": 0.7893972098827362, "num_tokens": 9499285.0, "step": 580 }, { "entropy": 0.5080907642841339, "epoch": 2.167910447761194, "grad_norm": 0.19228458404541016, "learning_rate": 0.0002, "loss": 0.5062891840934753, "mean_token_accuracy": 0.7950604856014252, "num_tokens": 9515408.0, "step": 581 }, { "entropy": 0.5310265123844147, "epoch": 2.171641791044776, "grad_norm": 0.1585681140422821, "learning_rate": 0.0002, "loss": 0.5329898595809937, "mean_token_accuracy": 0.7825100123882294, "num_tokens": 9531802.0, "step": 582 }, { "entropy": 0.5163623988628387, "epoch": 2.175373134328358, "grad_norm": 0.16819821298122406, "learning_rate": 0.0002, "loss": 0.5175923109054565, "mean_token_accuracy": 0.7890376448631287, "num_tokens": 9548285.0, "step": 583 }, { "entropy": 0.5143009200692177, "epoch": 2.1791044776119404, "grad_norm": 0.16217826306819916, "learning_rate": 0.0002, "loss": 0.5155395865440369, "mean_token_accuracy": 0.7922197580337524, "num_tokens": 9564428.0, "step": 584 }, { "entropy": 0.5416625738143921, "epoch": 2.1828358208955225, "grad_norm": 0.15060050785541534, "learning_rate": 0.0002, "loss": 0.5370927453041077, "mean_token_accuracy": 0.7829685211181641, "num_tokens": 9580974.0, "step": 585 }, { "entropy": 0.5395999997854233, "epoch": 2.1865671641791047, "grad_norm": 0.17097517848014832, "learning_rate": 0.0002, "loss": 0.5385570526123047, "mean_token_accuracy": 0.7842200845479965, "num_tokens": 9597372.0, "step": 586 }, { "entropy": 0.5397211164236069, "epoch": 2.1902985074626864, "grad_norm": 0.1612662672996521, "learning_rate": 0.0002, "loss": 0.5392184257507324, "mean_token_accuracy": 0.7815093398094177, "num_tokens": 9613832.0, "step": 587 }, { "entropy": 0.5179775580763817, "epoch": 2.1940298507462686, "grad_norm": 0.17580583691596985, "learning_rate": 0.0002, "loss": 0.5214508771896362, "mean_token_accuracy": 0.7890152186155319, "num_tokens": 9630021.0, "step": 588 }, { "entropy": 0.5112824365496635, "epoch": 2.1977611940298507, "grad_norm": 0.2011307030916214, "learning_rate": 0.0002, "loss": 0.5203381180763245, "mean_token_accuracy": 0.7900092750787735, "num_tokens": 9646188.0, "step": 589 }, { "entropy": 0.5356829464435577, "epoch": 2.201492537313433, "grad_norm": 0.16764222085475922, "learning_rate": 0.0002, "loss": 0.5318949818611145, "mean_token_accuracy": 0.7853176593780518, "num_tokens": 9662704.0, "step": 590 }, { "entropy": 0.532988578081131, "epoch": 2.205223880597015, "grad_norm": 0.1625567525625229, "learning_rate": 0.0002, "loss": 0.5286852121353149, "mean_token_accuracy": 0.7845050990581512, "num_tokens": 9679126.0, "step": 591 }, { "entropy": 0.5083666741847992, "epoch": 2.208955223880597, "grad_norm": 0.17014159262180328, "learning_rate": 0.0002, "loss": 0.5085889101028442, "mean_token_accuracy": 0.7938840687274933, "num_tokens": 9695574.0, "step": 592 }, { "entropy": 0.5348383486270905, "epoch": 2.2126865671641793, "grad_norm": 0.15370626747608185, "learning_rate": 0.0002, "loss": 0.5363180041313171, "mean_token_accuracy": 0.7823249995708466, "num_tokens": 9711759.0, "step": 593 }, { "entropy": 0.521574854850769, "epoch": 2.216417910447761, "grad_norm": 0.1618925929069519, "learning_rate": 0.0002, "loss": 0.5165284872055054, "mean_token_accuracy": 0.7902027070522308, "num_tokens": 9728297.0, "step": 594 }, { "entropy": 0.5246837437152863, "epoch": 2.220149253731343, "grad_norm": 0.16403713822364807, "learning_rate": 0.0002, "loss": 0.5284984111785889, "mean_token_accuracy": 0.785593718290329, "num_tokens": 9745025.0, "step": 595 }, { "entropy": 0.5146933272480965, "epoch": 2.2238805970149254, "grad_norm": 0.16364289820194244, "learning_rate": 0.0002, "loss": 0.5155675411224365, "mean_token_accuracy": 0.7914301306009293, "num_tokens": 9761573.0, "step": 596 }, { "entropy": 0.5164592936635017, "epoch": 2.2276119402985075, "grad_norm": 0.16107001900672913, "learning_rate": 0.0002, "loss": 0.520284116268158, "mean_token_accuracy": 0.790960431098938, "num_tokens": 9777994.0, "step": 597 }, { "entropy": 0.5009781569242477, "epoch": 2.2313432835820897, "grad_norm": 0.17092035710811615, "learning_rate": 0.0002, "loss": 0.5013527870178223, "mean_token_accuracy": 0.7965078949928284, "num_tokens": 9794247.0, "step": 598 }, { "entropy": 0.5145166665315628, "epoch": 2.235074626865672, "grad_norm": 0.17742900550365448, "learning_rate": 0.0002, "loss": 0.5136178731918335, "mean_token_accuracy": 0.7902016937732697, "num_tokens": 9810623.0, "step": 599 }, { "entropy": 0.521144449710846, "epoch": 2.2388059701492535, "grad_norm": 0.1866447478532791, "learning_rate": 0.0002, "loss": 0.5256049633026123, "mean_token_accuracy": 0.7880899459123611, "num_tokens": 9827216.0, "step": 600 }, { "entropy": 0.5078264698386192, "epoch": 2.2425373134328357, "grad_norm": 0.18190419673919678, "learning_rate": 0.0002, "loss": 0.5107334852218628, "mean_token_accuracy": 0.7921731919050217, "num_tokens": 9843424.0, "step": 601 }, { "entropy": 0.5391242802143097, "epoch": 2.246268656716418, "grad_norm": 0.1664401739835739, "learning_rate": 0.0002, "loss": 0.5404478907585144, "mean_token_accuracy": 0.779574453830719, "num_tokens": 9859528.0, "step": 602 }, { "entropy": 0.5163552165031433, "epoch": 2.25, "grad_norm": 0.19338326156139374, "learning_rate": 0.0002, "loss": 0.5106169581413269, "mean_token_accuracy": 0.7929095774888992, "num_tokens": 9875496.0, "step": 603 }, { "entropy": 0.538531944155693, "epoch": 2.253731343283582, "grad_norm": 0.16355083882808685, "learning_rate": 0.0002, "loss": 0.5421521067619324, "mean_token_accuracy": 0.7775969356298447, "num_tokens": 9891706.0, "step": 604 }, { "entropy": 0.5201183184981346, "epoch": 2.2574626865671643, "grad_norm": 0.2061741203069687, "learning_rate": 0.0002, "loss": 0.5298879742622375, "mean_token_accuracy": 0.7839659005403519, "num_tokens": 9907901.0, "step": 605 }, { "entropy": 0.5299466401338577, "epoch": 2.2611940298507465, "grad_norm": 0.1585988998413086, "learning_rate": 0.0002, "loss": 0.5266643762588501, "mean_token_accuracy": 0.7857095748186111, "num_tokens": 9924584.0, "step": 606 }, { "entropy": 0.5331060588359833, "epoch": 2.264925373134328, "grad_norm": 0.22515474259853363, "learning_rate": 0.0002, "loss": 0.5281371474266052, "mean_token_accuracy": 0.7846943885087967, "num_tokens": 9940921.0, "step": 607 }, { "entropy": 0.5365794003009796, "epoch": 2.2686567164179103, "grad_norm": 0.14158517122268677, "learning_rate": 0.0002, "loss": 0.5241664052009583, "mean_token_accuracy": 0.7902594655752182, "num_tokens": 9957418.0, "step": 608 }, { "entropy": 0.5098173916339874, "epoch": 2.2723880597014925, "grad_norm": 0.19847925007343292, "learning_rate": 0.0002, "loss": 0.5109040141105652, "mean_token_accuracy": 0.7907959967851639, "num_tokens": 9973783.0, "step": 609 }, { "entropy": 0.507322758436203, "epoch": 2.2761194029850746, "grad_norm": 0.1904480904340744, "learning_rate": 0.0002, "loss": 0.5145297050476074, "mean_token_accuracy": 0.791220560669899, "num_tokens": 9990362.0, "step": 610 }, { "entropy": 0.5185896158218384, "epoch": 2.279850746268657, "grad_norm": 0.23211340606212616, "learning_rate": 0.0002, "loss": 0.524868905544281, "mean_token_accuracy": 0.7855911701917648, "num_tokens": 10006762.0, "step": 611 }, { "entropy": 0.5282359346747398, "epoch": 2.283582089552239, "grad_norm": 0.1768886297941208, "learning_rate": 0.0002, "loss": 0.5229817628860474, "mean_token_accuracy": 0.7895976901054382, "num_tokens": 10023191.0, "step": 612 }, { "entropy": 0.5275277346372604, "epoch": 2.2873134328358207, "grad_norm": 0.19380177557468414, "learning_rate": 0.0002, "loss": 0.5169612765312195, "mean_token_accuracy": 0.7907349169254303, "num_tokens": 10039350.0, "step": 613 }, { "entropy": 0.5204345509409904, "epoch": 2.291044776119403, "grad_norm": 0.15632414817810059, "learning_rate": 0.0002, "loss": 0.513292670249939, "mean_token_accuracy": 0.7925348877906799, "num_tokens": 10055872.0, "step": 614 }, { "entropy": 0.5112610086798668, "epoch": 2.294776119402985, "grad_norm": 0.18102124333381653, "learning_rate": 0.0002, "loss": 0.520767092704773, "mean_token_accuracy": 0.7886828035116196, "num_tokens": 10072419.0, "step": 615 }, { "entropy": 0.5232729762792587, "epoch": 2.298507462686567, "grad_norm": 0.25390854477882385, "learning_rate": 0.0002, "loss": 0.5408729314804077, "mean_token_accuracy": 0.7815985828638077, "num_tokens": 10088715.0, "step": 616 }, { "entropy": 0.529785230755806, "epoch": 2.3022388059701493, "grad_norm": 0.15947353839874268, "learning_rate": 0.0002, "loss": 0.5309044718742371, "mean_token_accuracy": 0.784679189324379, "num_tokens": 10105206.0, "step": 617 }, { "entropy": 0.5409619510173798, "epoch": 2.3059701492537314, "grad_norm": 0.21774348616600037, "learning_rate": 0.0002, "loss": 0.5331413745880127, "mean_token_accuracy": 0.7848716974258423, "num_tokens": 10121951.0, "step": 618 }, { "entropy": 0.5404030680656433, "epoch": 2.3097014925373136, "grad_norm": 0.17135120928287506, "learning_rate": 0.0002, "loss": 0.5320269465446472, "mean_token_accuracy": 0.7863317579030991, "num_tokens": 10138520.0, "step": 619 }, { "entropy": 0.543184906244278, "epoch": 2.3134328358208958, "grad_norm": 0.18270884454250336, "learning_rate": 0.0002, "loss": 0.5362977981567383, "mean_token_accuracy": 0.7825828939676285, "num_tokens": 10155242.0, "step": 620 }, { "entropy": 0.5144708007574081, "epoch": 2.3171641791044775, "grad_norm": 0.19776520133018494, "learning_rate": 0.0002, "loss": 0.5190030336380005, "mean_token_accuracy": 0.7893546521663666, "num_tokens": 10171493.0, "step": 621 }, { "entropy": 0.5012815147638321, "epoch": 2.3208955223880596, "grad_norm": 0.18417391180992126, "learning_rate": 0.0002, "loss": 0.5140509009361267, "mean_token_accuracy": 0.7917021214962006, "num_tokens": 10187924.0, "step": 622 }, { "entropy": 0.5291815996170044, "epoch": 2.324626865671642, "grad_norm": 0.18122002482414246, "learning_rate": 0.0002, "loss": 0.5308645367622375, "mean_token_accuracy": 0.7827988862991333, "num_tokens": 10204223.0, "step": 623 }, { "entropy": 0.5316928327083588, "epoch": 2.328358208955224, "grad_norm": 0.17393858730793, "learning_rate": 0.0002, "loss": 0.5351020097732544, "mean_token_accuracy": 0.7837810218334198, "num_tokens": 10220678.0, "step": 624 }, { "entropy": 0.5380063354969025, "epoch": 2.332089552238806, "grad_norm": 0.16641174256801605, "learning_rate": 0.0002, "loss": 0.5311377644538879, "mean_token_accuracy": 0.78605717420578, "num_tokens": 10236761.0, "step": 625 }, { "entropy": 0.5296464115381241, "epoch": 2.3358208955223883, "grad_norm": 0.16847732663154602, "learning_rate": 0.0002, "loss": 0.5290564894676208, "mean_token_accuracy": 0.7866681218147278, "num_tokens": 10253110.0, "step": 626 }, { "entropy": 0.5196742564439774, "epoch": 2.33955223880597, "grad_norm": 0.16526693105697632, "learning_rate": 0.0002, "loss": 0.516907811164856, "mean_token_accuracy": 0.7920583933591843, "num_tokens": 10269492.0, "step": 627 }, { "entropy": 0.541998103260994, "epoch": 2.343283582089552, "grad_norm": 0.18568557500839233, "learning_rate": 0.0002, "loss": 0.5372257828712463, "mean_token_accuracy": 0.7823797762393951, "num_tokens": 10285927.0, "step": 628 }, { "entropy": 0.5108761489391327, "epoch": 2.3470149253731343, "grad_norm": 0.1934242844581604, "learning_rate": 0.0002, "loss": 0.5139164924621582, "mean_token_accuracy": 0.7933155596256256, "num_tokens": 10302023.0, "step": 629 }, { "entropy": 0.5217199325561523, "epoch": 2.3507462686567164, "grad_norm": 0.17553211748600006, "learning_rate": 0.0002, "loss": 0.5230180025100708, "mean_token_accuracy": 0.7875964045524597, "num_tokens": 10318268.0, "step": 630 }, { "entropy": 0.5330761075019836, "epoch": 2.3544776119402986, "grad_norm": 0.15872074663639069, "learning_rate": 0.0002, "loss": 0.5290681719779968, "mean_token_accuracy": 0.7844167649745941, "num_tokens": 10334766.0, "step": 631 }, { "entropy": 0.5369035452604294, "epoch": 2.3582089552238807, "grad_norm": 0.1846853792667389, "learning_rate": 0.0002, "loss": 0.5329739451408386, "mean_token_accuracy": 0.7838435918092728, "num_tokens": 10351349.0, "step": 632 }, { "entropy": 0.5287653654813766, "epoch": 2.361940298507463, "grad_norm": 0.1996822953224182, "learning_rate": 0.0002, "loss": 0.5347191095352173, "mean_token_accuracy": 0.7811494767665863, "num_tokens": 10367871.0, "step": 633 }, { "entropy": 0.5239842683076859, "epoch": 2.3656716417910446, "grad_norm": 0.19435462355613708, "learning_rate": 0.0002, "loss": 0.530573308467865, "mean_token_accuracy": 0.7837476581335068, "num_tokens": 10384315.0, "step": 634 }, { "entropy": 0.5206383317708969, "epoch": 2.3694029850746268, "grad_norm": 0.19717657566070557, "learning_rate": 0.0002, "loss": 0.5275444388389587, "mean_token_accuracy": 0.7842705696821213, "num_tokens": 10400769.0, "step": 635 }, { "entropy": 0.5064749270677567, "epoch": 2.373134328358209, "grad_norm": 0.19260841608047485, "learning_rate": 0.0002, "loss": 0.51506507396698, "mean_token_accuracy": 0.789744108915329, "num_tokens": 10417006.0, "step": 636 }, { "entropy": 0.5361980348825455, "epoch": 2.376865671641791, "grad_norm": 0.17480432987213135, "learning_rate": 0.0002, "loss": 0.5336955189704895, "mean_token_accuracy": 0.7836211174726486, "num_tokens": 10433294.0, "step": 637 }, { "entropy": 0.5383089035749435, "epoch": 2.3805970149253732, "grad_norm": 0.18294544517993927, "learning_rate": 0.0002, "loss": 0.5289636254310608, "mean_token_accuracy": 0.7852412611246109, "num_tokens": 10449674.0, "step": 638 }, { "entropy": 0.5097021907567978, "epoch": 2.3843283582089554, "grad_norm": 0.16242100298404694, "learning_rate": 0.0002, "loss": 0.5021054148674011, "mean_token_accuracy": 0.7972816228866577, "num_tokens": 10465855.0, "step": 639 }, { "entropy": 0.5423515290021896, "epoch": 2.388059701492537, "grad_norm": 0.22227367758750916, "learning_rate": 0.0002, "loss": 0.548687756061554, "mean_token_accuracy": 0.776146799325943, "num_tokens": 10482179.0, "step": 640 }, { "entropy": 0.5074172541499138, "epoch": 2.3917910447761193, "grad_norm": 0.1631743311882019, "learning_rate": 0.0002, "loss": 0.5108535289764404, "mean_token_accuracy": 0.7928425967693329, "num_tokens": 10498617.0, "step": 641 }, { "entropy": 0.5141904726624489, "epoch": 2.3955223880597014, "grad_norm": 0.22901000082492828, "learning_rate": 0.0002, "loss": 0.5239617228507996, "mean_token_accuracy": 0.7894341051578522, "num_tokens": 10514855.0, "step": 642 }, { "entropy": 0.548003762960434, "epoch": 2.3992537313432836, "grad_norm": 0.1889556348323822, "learning_rate": 0.0002, "loss": 0.5518738627433777, "mean_token_accuracy": 0.7756821662187576, "num_tokens": 10531113.0, "step": 643 }, { "entropy": 0.5271116495132446, "epoch": 2.4029850746268657, "grad_norm": 0.15567590296268463, "learning_rate": 0.0002, "loss": 0.516383171081543, "mean_token_accuracy": 0.7933164685964584, "num_tokens": 10547691.0, "step": 644 }, { "entropy": 0.5330717116594315, "epoch": 2.406716417910448, "grad_norm": 0.17213337123394012, "learning_rate": 0.0002, "loss": 0.5231931209564209, "mean_token_accuracy": 0.7853028923273087, "num_tokens": 10563993.0, "step": 645 }, { "entropy": 0.542450025677681, "epoch": 2.41044776119403, "grad_norm": 0.16203731298446655, "learning_rate": 0.0002, "loss": 0.5375291109085083, "mean_token_accuracy": 0.7830152362585068, "num_tokens": 10580464.0, "step": 646 }, { "entropy": 0.5074228942394257, "epoch": 2.4141791044776117, "grad_norm": 0.16541871428489685, "learning_rate": 0.0002, "loss": 0.5123732089996338, "mean_token_accuracy": 0.7941079437732697, "num_tokens": 10596747.0, "step": 647 }, { "entropy": 0.5105165019631386, "epoch": 2.417910447761194, "grad_norm": 0.182412788271904, "learning_rate": 0.0002, "loss": 0.5217914581298828, "mean_token_accuracy": 0.7893105298280716, "num_tokens": 10612951.0, "step": 648 }, { "entropy": 0.5206151753664017, "epoch": 2.421641791044776, "grad_norm": 0.20678837597370148, "learning_rate": 0.0002, "loss": 0.5335655212402344, "mean_token_accuracy": 0.7840552628040314, "num_tokens": 10629467.0, "step": 649 }, { "entropy": 0.5416827350854874, "epoch": 2.425373134328358, "grad_norm": 0.16378135979175568, "learning_rate": 0.0002, "loss": 0.5401762127876282, "mean_token_accuracy": 0.782837986946106, "num_tokens": 10645981.0, "step": 650 }, { "entropy": 0.5352658033370972, "epoch": 2.4291044776119404, "grad_norm": 0.17120513319969177, "learning_rate": 0.0002, "loss": 0.5229877233505249, "mean_token_accuracy": 0.7894999831914902, "num_tokens": 10662599.0, "step": 651 }, { "entropy": 0.5378601551055908, "epoch": 2.4328358208955225, "grad_norm": 0.18634538352489471, "learning_rate": 0.0002, "loss": 0.5370844602584839, "mean_token_accuracy": 0.7834650576114655, "num_tokens": 10678905.0, "step": 652 }, { "entropy": 0.5139342248439789, "epoch": 2.4365671641791042, "grad_norm": 0.1823841780424118, "learning_rate": 0.0002, "loss": 0.5105010271072388, "mean_token_accuracy": 0.7942702323198318, "num_tokens": 10695354.0, "step": 653 }, { "entropy": 0.5001704916357994, "epoch": 2.4402985074626864, "grad_norm": 0.18246224522590637, "learning_rate": 0.0002, "loss": 0.5092322826385498, "mean_token_accuracy": 0.7953812628984451, "num_tokens": 10711419.0, "step": 654 }, { "entropy": 0.5088636800646782, "epoch": 2.4440298507462686, "grad_norm": 0.16581419110298157, "learning_rate": 0.0002, "loss": 0.5136841535568237, "mean_token_accuracy": 0.7919897437095642, "num_tokens": 10727853.0, "step": 655 }, { "entropy": 0.5198448672890663, "epoch": 2.4477611940298507, "grad_norm": 0.16655242443084717, "learning_rate": 0.0002, "loss": 0.5188886523246765, "mean_token_accuracy": 0.7890329360961914, "num_tokens": 10744204.0, "step": 656 }, { "entropy": 0.5168529972434044, "epoch": 2.451492537313433, "grad_norm": 0.18366754055023193, "learning_rate": 0.0002, "loss": 0.5171942114830017, "mean_token_accuracy": 0.7899800539016724, "num_tokens": 10760669.0, "step": 657 }, { "entropy": 0.5348050147294998, "epoch": 2.455223880597015, "grad_norm": 0.18297524750232697, "learning_rate": 0.0002, "loss": 0.5392665266990662, "mean_token_accuracy": 0.779433473944664, "num_tokens": 10777093.0, "step": 658 }, { "entropy": 0.5245852321386337, "epoch": 2.458955223880597, "grad_norm": 0.19149278104305267, "learning_rate": 0.0002, "loss": 0.5260974764823914, "mean_token_accuracy": 0.7873388528823853, "num_tokens": 10793455.0, "step": 659 }, { "entropy": 0.5311989635229111, "epoch": 2.4626865671641793, "grad_norm": 0.1547309309244156, "learning_rate": 0.0002, "loss": 0.5266692042350769, "mean_token_accuracy": 0.7839333266019821, "num_tokens": 10809788.0, "step": 660 }, { "entropy": 0.5379379391670227, "epoch": 2.466417910447761, "grad_norm": 0.15859338641166687, "learning_rate": 0.0002, "loss": 0.5321581363677979, "mean_token_accuracy": 0.7827870547771454, "num_tokens": 10825837.0, "step": 661 }, { "entropy": 0.5471830368041992, "epoch": 2.470149253731343, "grad_norm": 0.16068732738494873, "learning_rate": 0.0002, "loss": 0.5360886454582214, "mean_token_accuracy": 0.7848220616579056, "num_tokens": 10842037.0, "step": 662 }, { "entropy": 0.5252791494131088, "epoch": 2.4738805970149254, "grad_norm": 0.1590043157339096, "learning_rate": 0.0002, "loss": 0.5276464819908142, "mean_token_accuracy": 0.786907747387886, "num_tokens": 10858320.0, "step": 663 }, { "entropy": 0.525018036365509, "epoch": 2.4776119402985075, "grad_norm": 0.17438893020153046, "learning_rate": 0.0002, "loss": 0.5300197005271912, "mean_token_accuracy": 0.7852317094802856, "num_tokens": 10874855.0, "step": 664 }, { "entropy": 0.5394986271858215, "epoch": 2.4813432835820897, "grad_norm": 0.17128010094165802, "learning_rate": 0.0002, "loss": 0.5422081351280212, "mean_token_accuracy": 0.7800386846065521, "num_tokens": 10891526.0, "step": 665 }, { "entropy": 0.5076115503907204, "epoch": 2.485074626865672, "grad_norm": 0.1781933754682541, "learning_rate": 0.0002, "loss": 0.507164716720581, "mean_token_accuracy": 0.7957528084516525, "num_tokens": 10907862.0, "step": 666 }, { "entropy": 0.5271291732788086, "epoch": 2.4888059701492535, "grad_norm": 0.17105896770954132, "learning_rate": 0.0002, "loss": 0.5228562355041504, "mean_token_accuracy": 0.7889808863401413, "num_tokens": 10924235.0, "step": 667 }, { "entropy": 0.5363548994064331, "epoch": 2.4925373134328357, "grad_norm": 0.1583063155412674, "learning_rate": 0.0002, "loss": 0.5336060523986816, "mean_token_accuracy": 0.7860426157712936, "num_tokens": 10940599.0, "step": 668 }, { "entropy": 0.503924198448658, "epoch": 2.496268656716418, "grad_norm": 0.17252567410469055, "learning_rate": 0.0002, "loss": 0.5028519034385681, "mean_token_accuracy": 0.7955358028411865, "num_tokens": 10956649.0, "step": 669 }, { "entropy": 0.5256816297769547, "epoch": 2.5, "grad_norm": 0.1619226038455963, "learning_rate": 0.0002, "loss": 0.5266148447990417, "mean_token_accuracy": 0.787626251578331, "num_tokens": 10972977.0, "step": 670 }, { "entropy": 0.5120773613452911, "epoch": 2.503731343283582, "grad_norm": 0.16918344795703888, "learning_rate": 0.0002, "loss": 0.5207507610321045, "mean_token_accuracy": 0.7914620935916901, "num_tokens": 10989327.0, "step": 671 }, { "entropy": 0.5181663334369659, "epoch": 2.5074626865671643, "grad_norm": 0.19783611595630646, "learning_rate": 0.0002, "loss": 0.5268117189407349, "mean_token_accuracy": 0.7864458560943604, "num_tokens": 11005449.0, "step": 672 }, { "entropy": 0.5229259878396988, "epoch": 2.5111940298507465, "grad_norm": 0.1657666116952896, "learning_rate": 0.0002, "loss": 0.5208563208580017, "mean_token_accuracy": 0.7903305888175964, "num_tokens": 11021576.0, "step": 673 }, { "entropy": 0.5335699021816254, "epoch": 2.5149253731343286, "grad_norm": 0.1847028136253357, "learning_rate": 0.0002, "loss": 0.5323396921157837, "mean_token_accuracy": 0.7818653434514999, "num_tokens": 11038174.0, "step": 674 }, { "entropy": 0.5297135561704636, "epoch": 2.5186567164179103, "grad_norm": 0.17212164402008057, "learning_rate": 0.0002, "loss": 0.5294620990753174, "mean_token_accuracy": 0.7868784368038177, "num_tokens": 11054527.0, "step": 675 }, { "entropy": 0.5551169812679291, "epoch": 2.5223880597014925, "grad_norm": 0.19568513333797455, "learning_rate": 0.0002, "loss": 0.5539876222610474, "mean_token_accuracy": 0.775226280093193, "num_tokens": 11070805.0, "step": 676 }, { "entropy": 0.5319524109363556, "epoch": 2.5261194029850746, "grad_norm": 0.14972956478595734, "learning_rate": 0.0002, "loss": 0.5295209288597107, "mean_token_accuracy": 0.7860101461410522, "num_tokens": 11087510.0, "step": 677 }, { "entropy": 0.5265523195266724, "epoch": 2.529850746268657, "grad_norm": 0.16056260466575623, "learning_rate": 0.0002, "loss": 0.5248823761940002, "mean_token_accuracy": 0.7860508859157562, "num_tokens": 11103933.0, "step": 678 }, { "entropy": 0.5225390195846558, "epoch": 2.533582089552239, "grad_norm": 0.22218124568462372, "learning_rate": 0.0002, "loss": 0.5301728248596191, "mean_token_accuracy": 0.7851128876209259, "num_tokens": 11120292.0, "step": 679 }, { "entropy": 0.5265638679265976, "epoch": 2.5373134328358207, "grad_norm": 0.15814287960529327, "learning_rate": 0.0002, "loss": 0.5240415930747986, "mean_token_accuracy": 0.788665235042572, "num_tokens": 11136784.0, "step": 680 }, { "entropy": 0.5306698828935623, "epoch": 2.541044776119403, "grad_norm": 0.1664581149816513, "learning_rate": 0.0002, "loss": 0.5277557373046875, "mean_token_accuracy": 0.7860920429229736, "num_tokens": 11153320.0, "step": 681 }, { "entropy": 0.5291799604892731, "epoch": 2.544776119402985, "grad_norm": 0.1872314065694809, "learning_rate": 0.0002, "loss": 0.5320236086845398, "mean_token_accuracy": 0.7843979746103287, "num_tokens": 11169723.0, "step": 682 }, { "entropy": 0.53035868704319, "epoch": 2.548507462686567, "grad_norm": 0.20792965590953827, "learning_rate": 0.0002, "loss": 0.5358518362045288, "mean_token_accuracy": 0.7849173247814178, "num_tokens": 11186035.0, "step": 683 }, { "entropy": 0.5152866542339325, "epoch": 2.5522388059701493, "grad_norm": 0.20304447412490845, "learning_rate": 0.0002, "loss": 0.512556791305542, "mean_token_accuracy": 0.7908182591199875, "num_tokens": 11201972.0, "step": 684 }, { "entropy": 0.520212933421135, "epoch": 2.5559701492537314, "grad_norm": 0.19615566730499268, "learning_rate": 0.0002, "loss": 0.5241949558258057, "mean_token_accuracy": 0.7870226055383682, "num_tokens": 11218085.0, "step": 685 }, { "entropy": 0.523841142654419, "epoch": 2.5597014925373136, "grad_norm": 0.18903784453868866, "learning_rate": 0.0002, "loss": 0.5217975974082947, "mean_token_accuracy": 0.7914077341556549, "num_tokens": 11234466.0, "step": 686 }, { "entropy": 0.5006226599216461, "epoch": 2.5634328358208958, "grad_norm": 0.2238045483827591, "learning_rate": 0.0002, "loss": 0.503075122833252, "mean_token_accuracy": 0.7985939383506775, "num_tokens": 11250619.0, "step": 687 }, { "entropy": 0.522046685218811, "epoch": 2.5671641791044775, "grad_norm": 0.1861460655927658, "learning_rate": 0.0002, "loss": 0.5256574749946594, "mean_token_accuracy": 0.7879543006420135, "num_tokens": 11267052.0, "step": 688 }, { "entropy": 0.5404367446899414, "epoch": 2.5708955223880596, "grad_norm": 0.18886177241802216, "learning_rate": 0.0002, "loss": 0.5377542972564697, "mean_token_accuracy": 0.781608834862709, "num_tokens": 11283385.0, "step": 689 }, { "entropy": 0.526772603392601, "epoch": 2.574626865671642, "grad_norm": 0.16710662841796875, "learning_rate": 0.0002, "loss": 0.5189668536186218, "mean_token_accuracy": 0.7905929088592529, "num_tokens": 11299758.0, "step": 690 }, { "entropy": 0.528350904583931, "epoch": 2.578358208955224, "grad_norm": 0.17797508835792542, "learning_rate": 0.0002, "loss": 0.5194413661956787, "mean_token_accuracy": 0.7911931574344635, "num_tokens": 11316130.0, "step": 691 }, { "entropy": 0.52931809425354, "epoch": 2.582089552238806, "grad_norm": 0.21212708950042725, "learning_rate": 0.0002, "loss": 0.5379958152770996, "mean_token_accuracy": 0.7827763855457306, "num_tokens": 11332658.0, "step": 692 }, { "entropy": 0.5531658977270126, "epoch": 2.585820895522388, "grad_norm": 0.17241588234901428, "learning_rate": 0.0002, "loss": 0.5588712692260742, "mean_token_accuracy": 0.7756764441728592, "num_tokens": 11349446.0, "step": 693 }, { "entropy": 0.5219079852104187, "epoch": 2.58955223880597, "grad_norm": 0.15809156000614166, "learning_rate": 0.0002, "loss": 0.5210216045379639, "mean_token_accuracy": 0.7904610931873322, "num_tokens": 11366050.0, "step": 694 }, { "entropy": 0.5322331935167313, "epoch": 2.593283582089552, "grad_norm": 0.18396085500717163, "learning_rate": 0.0002, "loss": 0.5301384925842285, "mean_token_accuracy": 0.7841024845838547, "num_tokens": 11382491.0, "step": 695 }, { "entropy": 0.5307652056217194, "epoch": 2.5970149253731343, "grad_norm": 0.16308656334877014, "learning_rate": 0.0002, "loss": 0.5239346623420715, "mean_token_accuracy": 0.7880617082118988, "num_tokens": 11398802.0, "step": 696 }, { "entropy": 0.5340842455625534, "epoch": 2.6007462686567164, "grad_norm": 0.19761645793914795, "learning_rate": 0.0002, "loss": 0.5363891124725342, "mean_token_accuracy": 0.7838073074817657, "num_tokens": 11415128.0, "step": 697 }, { "entropy": 0.5340555012226105, "epoch": 2.6044776119402986, "grad_norm": 0.1661156415939331, "learning_rate": 0.0002, "loss": 0.5325526595115662, "mean_token_accuracy": 0.7847229689359665, "num_tokens": 11431318.0, "step": 698 }, { "entropy": 0.5427940785884857, "epoch": 2.6082089552238807, "grad_norm": 0.16063573956489563, "learning_rate": 0.0002, "loss": 0.5501501560211182, "mean_token_accuracy": 0.7748306840658188, "num_tokens": 11447713.0, "step": 699 }, { "entropy": 0.5213874280452728, "epoch": 2.611940298507463, "grad_norm": 0.1618213802576065, "learning_rate": 0.0002, "loss": 0.5210378170013428, "mean_token_accuracy": 0.787492960691452, "num_tokens": 11464142.0, "step": 700 }, { "entropy": 0.5329896062612534, "epoch": 2.6156716417910446, "grad_norm": 0.18406495451927185, "learning_rate": 0.0002, "loss": 0.5365204215049744, "mean_token_accuracy": 0.7818106710910797, "num_tokens": 11480468.0, "step": 701 }, { "entropy": 0.5018042698502541, "epoch": 2.6194029850746268, "grad_norm": 0.1559264361858368, "learning_rate": 0.0002, "loss": 0.507462203502655, "mean_token_accuracy": 0.7951454520225525, "num_tokens": 11496824.0, "step": 702 }, { "entropy": 0.5304955393075943, "epoch": 2.623134328358209, "grad_norm": 0.16140370070934296, "learning_rate": 0.0002, "loss": 0.5346159338951111, "mean_token_accuracy": 0.7851942926645279, "num_tokens": 11513567.0, "step": 703 }, { "entropy": 0.5185345709323883, "epoch": 2.626865671641791, "grad_norm": 0.16598905622959137, "learning_rate": 0.0002, "loss": 0.5121718645095825, "mean_token_accuracy": 0.7958889752626419, "num_tokens": 11530042.0, "step": 704 }, { "entropy": 0.5373921394348145, "epoch": 2.6305970149253732, "grad_norm": 0.18821974098682404, "learning_rate": 0.0002, "loss": 0.5302144289016724, "mean_token_accuracy": 0.7860950380563736, "num_tokens": 11546594.0, "step": 705 }, { "entropy": 0.5182069316506386, "epoch": 2.6343283582089554, "grad_norm": 0.17032590508460999, "learning_rate": 0.0002, "loss": 0.5235993266105652, "mean_token_accuracy": 0.7881369441747665, "num_tokens": 11562996.0, "step": 706 }, { "entropy": 0.5120366662740707, "epoch": 2.638059701492537, "grad_norm": 0.20226538181304932, "learning_rate": 0.0002, "loss": 0.5154089331626892, "mean_token_accuracy": 0.7893324643373489, "num_tokens": 11579247.0, "step": 707 }, { "entropy": 0.5271363854408264, "epoch": 2.6417910447761193, "grad_norm": 0.2367754727602005, "learning_rate": 0.0002, "loss": 0.529344916343689, "mean_token_accuracy": 0.7863059490919113, "num_tokens": 11595557.0, "step": 708 }, { "entropy": 0.5211906433105469, "epoch": 2.6455223880597014, "grad_norm": 0.17606736719608307, "learning_rate": 0.0002, "loss": 0.5162103176116943, "mean_token_accuracy": 0.7936627119779587, "num_tokens": 11612153.0, "step": 709 }, { "entropy": 0.5413748621940613, "epoch": 2.6492537313432836, "grad_norm": 0.16839931905269623, "learning_rate": 0.0002, "loss": 0.5375933051109314, "mean_token_accuracy": 0.7837605625391006, "num_tokens": 11628672.0, "step": 710 }, { "entropy": 0.5492138266563416, "epoch": 2.6529850746268657, "grad_norm": 0.1578325480222702, "learning_rate": 0.0002, "loss": 0.5387027263641357, "mean_token_accuracy": 0.7828567028045654, "num_tokens": 11645327.0, "step": 711 }, { "entropy": 0.5294462591409683, "epoch": 2.656716417910448, "grad_norm": 0.18846334517002106, "learning_rate": 0.0002, "loss": 0.5310033559799194, "mean_token_accuracy": 0.7850282490253448, "num_tokens": 11661886.0, "step": 712 }, { "entropy": 0.5195821523666382, "epoch": 2.66044776119403, "grad_norm": 0.1722957044839859, "learning_rate": 0.0002, "loss": 0.5247335433959961, "mean_token_accuracy": 0.7882849276065826, "num_tokens": 11678052.0, "step": 713 }, { "entropy": 0.5254689157009125, "epoch": 2.664179104477612, "grad_norm": 0.175649493932724, "learning_rate": 0.0002, "loss": 0.5303612947463989, "mean_token_accuracy": 0.7877318859100342, "num_tokens": 11694539.0, "step": 714 }, { "entropy": 0.5156526416540146, "epoch": 2.667910447761194, "grad_norm": 0.21296396851539612, "learning_rate": 0.0002, "loss": 0.5188760161399841, "mean_token_accuracy": 0.7886723130941391, "num_tokens": 11710806.0, "step": 715 }, { "entropy": 0.5304235517978668, "epoch": 2.671641791044776, "grad_norm": 0.1557040810585022, "learning_rate": 0.0002, "loss": 0.532120943069458, "mean_token_accuracy": 0.7845920920372009, "num_tokens": 11727178.0, "step": 716 }, { "entropy": 0.5396947711706161, "epoch": 2.675373134328358, "grad_norm": 0.23430386185646057, "learning_rate": 0.0002, "loss": 0.5410381555557251, "mean_token_accuracy": 0.7820145785808563, "num_tokens": 11743592.0, "step": 717 }, { "entropy": 0.5290116220712662, "epoch": 2.6791044776119404, "grad_norm": 0.18491677939891815, "learning_rate": 0.0002, "loss": 0.5220689177513123, "mean_token_accuracy": 0.7880972176790237, "num_tokens": 11759881.0, "step": 718 }, { "entropy": 0.5365530252456665, "epoch": 2.6828358208955225, "grad_norm": 0.20658747851848602, "learning_rate": 0.0002, "loss": 0.5274034738540649, "mean_token_accuracy": 0.7877165377140045, "num_tokens": 11776103.0, "step": 719 }, { "entropy": 0.5193691104650497, "epoch": 2.6865671641791042, "grad_norm": 0.15166765451431274, "learning_rate": 0.0002, "loss": 0.5179476737976074, "mean_token_accuracy": 0.7924929708242416, "num_tokens": 11792614.0, "step": 720 }, { "entropy": 0.5238720774650574, "epoch": 2.6902985074626864, "grad_norm": 0.2068144679069519, "learning_rate": 0.0002, "loss": 0.5365906953811646, "mean_token_accuracy": 0.7825643718242645, "num_tokens": 11808884.0, "step": 721 }, { "entropy": 0.5160530805587769, "epoch": 2.6940298507462686, "grad_norm": 0.1884981393814087, "learning_rate": 0.0002, "loss": 0.5255499482154846, "mean_token_accuracy": 0.785829171538353, "num_tokens": 11825190.0, "step": 722 }, { "entropy": 0.5381662398576736, "epoch": 2.6977611940298507, "grad_norm": 0.22528207302093506, "learning_rate": 0.0002, "loss": 0.5401077270507812, "mean_token_accuracy": 0.780912771821022, "num_tokens": 11841581.0, "step": 723 }, { "entropy": 0.5353066176176071, "epoch": 2.701492537313433, "grad_norm": 0.16518141329288483, "learning_rate": 0.0002, "loss": 0.5283069014549255, "mean_token_accuracy": 0.7859592884778976, "num_tokens": 11857924.0, "step": 724 }, { "entropy": 0.5316939651966095, "epoch": 2.705223880597015, "grad_norm": 0.1674748808145523, "learning_rate": 0.0002, "loss": 0.5228734016418457, "mean_token_accuracy": 0.7879570424556732, "num_tokens": 11874385.0, "step": 725 }, { "entropy": 0.5669917911291122, "epoch": 2.708955223880597, "grad_norm": 0.18983666598796844, "learning_rate": 0.0002, "loss": 0.5586099624633789, "mean_token_accuracy": 0.7734153866767883, "num_tokens": 11890893.0, "step": 726 }, { "entropy": 0.5250157564878464, "epoch": 2.7126865671641793, "grad_norm": 0.16966547071933746, "learning_rate": 0.0002, "loss": 0.5228544473648071, "mean_token_accuracy": 0.7863233536481857, "num_tokens": 11907436.0, "step": 727 }, { "entropy": 0.5265001058578491, "epoch": 2.716417910447761, "grad_norm": 0.21439625322818756, "learning_rate": 0.0002, "loss": 0.5315214991569519, "mean_token_accuracy": 0.7847255766391754, "num_tokens": 11923778.0, "step": 728 }, { "entropy": 0.5284342169761658, "epoch": 2.720149253731343, "grad_norm": 0.1824498325586319, "learning_rate": 0.0002, "loss": 0.5404508709907532, "mean_token_accuracy": 0.7798212766647339, "num_tokens": 11940075.0, "step": 729 }, { "entropy": 0.501299723982811, "epoch": 2.7238805970149254, "grad_norm": 0.2304428666830063, "learning_rate": 0.0002, "loss": 0.5122545957565308, "mean_token_accuracy": 0.791194960474968, "num_tokens": 11956336.0, "step": 730 }, { "entropy": 0.5443384349346161, "epoch": 2.7276119402985075, "grad_norm": 0.1537434458732605, "learning_rate": 0.0002, "loss": 0.5363157987594604, "mean_token_accuracy": 0.7845837771892548, "num_tokens": 11972840.0, "step": 731 }, { "entropy": 0.5315753519535065, "epoch": 2.7313432835820897, "grad_norm": 0.17106328904628754, "learning_rate": 0.0002, "loss": 0.5220600366592407, "mean_token_accuracy": 0.7875728458166122, "num_tokens": 11989350.0, "step": 732 }, { "entropy": 0.5302078127861023, "epoch": 2.7350746268656714, "grad_norm": 0.17003247141838074, "learning_rate": 0.0002, "loss": 0.5270202159881592, "mean_token_accuracy": 0.787715807557106, "num_tokens": 12005809.0, "step": 733 }, { "entropy": 0.527949333190918, "epoch": 2.7388059701492535, "grad_norm": 0.21327127516269684, "learning_rate": 0.0002, "loss": 0.5354670882225037, "mean_token_accuracy": 0.7835386097431183, "num_tokens": 12022336.0, "step": 734 }, { "entropy": 0.5089609026908875, "epoch": 2.7425373134328357, "grad_norm": 0.16088151931762695, "learning_rate": 0.0002, "loss": 0.5117763876914978, "mean_token_accuracy": 0.7938453704118729, "num_tokens": 12038779.0, "step": 735 }, { "entropy": 0.5126267448067665, "epoch": 2.746268656716418, "grad_norm": 0.1757761836051941, "learning_rate": 0.0002, "loss": 0.5135779976844788, "mean_token_accuracy": 0.7931608110666275, "num_tokens": 12054869.0, "step": 736 }, { "entropy": 0.5239577889442444, "epoch": 2.75, "grad_norm": 0.1817576140165329, "learning_rate": 0.0002, "loss": 0.5234410762786865, "mean_token_accuracy": 0.7875021547079086, "num_tokens": 12071361.0, "step": 737 }, { "entropy": 0.5307980924844742, "epoch": 2.753731343283582, "grad_norm": 0.1653635948896408, "learning_rate": 0.0002, "loss": 0.5298102498054504, "mean_token_accuracy": 0.7864446491003036, "num_tokens": 12087634.0, "step": 738 }, { "entropy": 0.5222239643335342, "epoch": 2.7574626865671643, "grad_norm": 0.18040236830711365, "learning_rate": 0.0002, "loss": 0.5258353352546692, "mean_token_accuracy": 0.7891390025615692, "num_tokens": 12103943.0, "step": 739 }, { "entropy": 0.5332596972584724, "epoch": 2.7611940298507465, "grad_norm": 0.15495066344738007, "learning_rate": 0.0002, "loss": 0.5282677412033081, "mean_token_accuracy": 0.785639688372612, "num_tokens": 12120325.0, "step": 740 }, { "entropy": 0.5371799468994141, "epoch": 2.7649253731343286, "grad_norm": 0.17130646109580994, "learning_rate": 0.0002, "loss": 0.5295438170433044, "mean_token_accuracy": 0.7828952521085739, "num_tokens": 12136761.0, "step": 741 }, { "entropy": 0.5405760109424591, "epoch": 2.7686567164179103, "grad_norm": 0.16763344407081604, "learning_rate": 0.0002, "loss": 0.5373218655586243, "mean_token_accuracy": 0.7816964089870453, "num_tokens": 12153043.0, "step": 742 }, { "entropy": 0.5118273198604584, "epoch": 2.7723880597014925, "grad_norm": 0.17398576438426971, "learning_rate": 0.0002, "loss": 0.5121888518333435, "mean_token_accuracy": 0.7949073165655136, "num_tokens": 12169387.0, "step": 743 }, { "entropy": 0.5252756625413895, "epoch": 2.7761194029850746, "grad_norm": 0.20275278389453888, "learning_rate": 0.0002, "loss": 0.5319023132324219, "mean_token_accuracy": 0.7827770113945007, "num_tokens": 12185773.0, "step": 744 }, { "entropy": 0.5281336456537247, "epoch": 2.779850746268657, "grad_norm": 0.16486869752407074, "learning_rate": 0.0002, "loss": 0.5282880663871765, "mean_token_accuracy": 0.7841639369726181, "num_tokens": 12202185.0, "step": 745 }, { "entropy": 0.5157778561115265, "epoch": 2.783582089552239, "grad_norm": 0.1883569210767746, "learning_rate": 0.0002, "loss": 0.5159796476364136, "mean_token_accuracy": 0.791821077466011, "num_tokens": 12218279.0, "step": 746 }, { "entropy": 0.5459621995687485, "epoch": 2.7873134328358207, "grad_norm": 0.15937039256095886, "learning_rate": 0.0002, "loss": 0.5399669408798218, "mean_token_accuracy": 0.7847357988357544, "num_tokens": 12234867.0, "step": 747 }, { "entropy": 0.52740877866745, "epoch": 2.791044776119403, "grad_norm": 0.14844611287117004, "learning_rate": 0.0002, "loss": 0.5260165929794312, "mean_token_accuracy": 0.7880454957485199, "num_tokens": 12251419.0, "step": 748 }, { "entropy": 0.5150434598326683, "epoch": 2.794776119402985, "grad_norm": 0.16429124772548676, "learning_rate": 0.0002, "loss": 0.5152871012687683, "mean_token_accuracy": 0.7888982892036438, "num_tokens": 12267583.0, "step": 749 }, { "entropy": 0.5261992961168289, "epoch": 2.798507462686567, "grad_norm": 0.18603260815143585, "learning_rate": 0.0002, "loss": 0.5299534201622009, "mean_token_accuracy": 0.7854207009077072, "num_tokens": 12284129.0, "step": 750 }, { "entropy": 0.529946893453598, "epoch": 2.8022388059701493, "grad_norm": 0.18355652689933777, "learning_rate": 0.0002, "loss": 0.5360465049743652, "mean_token_accuracy": 0.7842213064432144, "num_tokens": 12300683.0, "step": 751 }, { "entropy": 0.5377232730388641, "epoch": 2.8059701492537314, "grad_norm": 0.17548733949661255, "learning_rate": 0.0002, "loss": 0.5429165363311768, "mean_token_accuracy": 0.7822890281677246, "num_tokens": 12316833.0, "step": 752 }, { "entropy": 0.5407239943742752, "epoch": 2.8097014925373136, "grad_norm": 0.17476212978363037, "learning_rate": 0.0002, "loss": 0.5398030281066895, "mean_token_accuracy": 0.7804454267024994, "num_tokens": 12333283.0, "step": 753 }, { "entropy": 0.520610861480236, "epoch": 2.8134328358208958, "grad_norm": 0.15137535333633423, "learning_rate": 0.0002, "loss": 0.5157968401908875, "mean_token_accuracy": 0.7898696959018707, "num_tokens": 12349570.0, "step": 754 }, { "entropy": 0.5343620032072067, "epoch": 2.8171641791044775, "grad_norm": 0.16463439166545868, "learning_rate": 0.0002, "loss": 0.5255429148674011, "mean_token_accuracy": 0.7910490483045578, "num_tokens": 12366111.0, "step": 755 }, { "entropy": 0.5226383879780769, "epoch": 2.8208955223880596, "grad_norm": 0.17591623961925507, "learning_rate": 0.0002, "loss": 0.5295028686523438, "mean_token_accuracy": 0.7862412929534912, "num_tokens": 12382176.0, "step": 756 }, { "entropy": 0.5329883769154549, "epoch": 2.824626865671642, "grad_norm": 0.17046134173870087, "learning_rate": 0.0002, "loss": 0.5395819544792175, "mean_token_accuracy": 0.7815450727939606, "num_tokens": 12398954.0, "step": 757 }, { "entropy": 0.5189251601696014, "epoch": 2.828358208955224, "grad_norm": 0.17623355984687805, "learning_rate": 0.0002, "loss": 0.5211597681045532, "mean_token_accuracy": 0.7862699329853058, "num_tokens": 12415518.0, "step": 758 }, { "entropy": 0.5435206592082977, "epoch": 2.832089552238806, "grad_norm": 0.16461242735385895, "learning_rate": 0.0002, "loss": 0.5449641346931458, "mean_token_accuracy": 0.7772939503192902, "num_tokens": 12431840.0, "step": 759 }, { "entropy": 0.5242071002721786, "epoch": 2.835820895522388, "grad_norm": 0.16906797885894775, "learning_rate": 0.0002, "loss": 0.5236470103263855, "mean_token_accuracy": 0.7878623157739639, "num_tokens": 12447985.0, "step": 760 }, { "entropy": 0.5331535488367081, "epoch": 2.83955223880597, "grad_norm": 0.1613229662179947, "learning_rate": 0.0002, "loss": 0.5270719528198242, "mean_token_accuracy": 0.7869479656219482, "num_tokens": 12464369.0, "step": 761 }, { "entropy": 0.5153749734163284, "epoch": 2.843283582089552, "grad_norm": 0.1861318051815033, "learning_rate": 0.0002, "loss": 0.5134626626968384, "mean_token_accuracy": 0.7917421609163284, "num_tokens": 12480705.0, "step": 762 }, { "entropy": 0.5185382887721062, "epoch": 2.8470149253731343, "grad_norm": 0.15517400205135345, "learning_rate": 0.0002, "loss": 0.520057201385498, "mean_token_accuracy": 0.7887658178806305, "num_tokens": 12496768.0, "step": 763 }, { "entropy": 0.525531992316246, "epoch": 2.8507462686567164, "grad_norm": 0.2088494747877121, "learning_rate": 0.0002, "loss": 0.5236872434616089, "mean_token_accuracy": 0.7884621620178223, "num_tokens": 12513264.0, "step": 764 }, { "entropy": 0.516917809844017, "epoch": 2.8544776119402986, "grad_norm": 0.1747450977563858, "learning_rate": 0.0002, "loss": 0.5234484076499939, "mean_token_accuracy": 0.7843039780855179, "num_tokens": 12529856.0, "step": 765 }, { "entropy": 0.5171080678701401, "epoch": 2.8582089552238807, "grad_norm": 0.17318587005138397, "learning_rate": 0.0002, "loss": 0.520793080329895, "mean_token_accuracy": 0.7862659096717834, "num_tokens": 12546530.0, "step": 766 }, { "entropy": 0.540691614151001, "epoch": 2.861940298507463, "grad_norm": 0.15875069797039032, "learning_rate": 0.0002, "loss": 0.5400336384773254, "mean_token_accuracy": 0.7827646285295486, "num_tokens": 12563086.0, "step": 767 }, { "entropy": 0.5084429755806923, "epoch": 2.8656716417910446, "grad_norm": 0.14828889071941376, "learning_rate": 0.0002, "loss": 0.5024577379226685, "mean_token_accuracy": 0.7963315397500992, "num_tokens": 12579183.0, "step": 768 }, { "entropy": 0.5370931923389435, "epoch": 2.8694029850746268, "grad_norm": 0.14752823114395142, "learning_rate": 0.0002, "loss": 0.5261865854263306, "mean_token_accuracy": 0.7877734899520874, "num_tokens": 12596077.0, "step": 769 }, { "entropy": 0.5546486079692841, "epoch": 2.873134328358209, "grad_norm": 0.1517077535390854, "learning_rate": 0.0002, "loss": 0.5500649809837341, "mean_token_accuracy": 0.7785899043083191, "num_tokens": 12612620.0, "step": 770 }, { "entropy": 0.5144929736852646, "epoch": 2.876865671641791, "grad_norm": 0.18645553290843964, "learning_rate": 0.0002, "loss": 0.5184378623962402, "mean_token_accuracy": 0.7887341529130936, "num_tokens": 12628974.0, "step": 771 }, { "entropy": 0.5363174676895142, "epoch": 2.8805970149253732, "grad_norm": 0.173641175031662, "learning_rate": 0.0002, "loss": 0.5404868125915527, "mean_token_accuracy": 0.7838273793458939, "num_tokens": 12645473.0, "step": 772 }, { "entropy": 0.5220237821340561, "epoch": 2.8843283582089554, "grad_norm": 0.1810951977968216, "learning_rate": 0.0002, "loss": 0.5300620794296265, "mean_token_accuracy": 0.7870841026306152, "num_tokens": 12661871.0, "step": 773 }, { "entropy": 0.5215499252080917, "epoch": 2.888059701492537, "grad_norm": 0.17195403575897217, "learning_rate": 0.0002, "loss": 0.5228441953659058, "mean_token_accuracy": 0.7888252288103104, "num_tokens": 12678403.0, "step": 774 }, { "entropy": 0.5262960642576218, "epoch": 2.8917910447761193, "grad_norm": 0.16115020215511322, "learning_rate": 0.0002, "loss": 0.5279878973960876, "mean_token_accuracy": 0.7827633023262024, "num_tokens": 12694636.0, "step": 775 }, { "entropy": 0.5458672344684601, "epoch": 2.8955223880597014, "grad_norm": 0.18671803176403046, "learning_rate": 0.0002, "loss": 0.5379894971847534, "mean_token_accuracy": 0.7803581058979034, "num_tokens": 12711335.0, "step": 776 }, { "entropy": 0.5334444046020508, "epoch": 2.8992537313432836, "grad_norm": 0.16968129575252533, "learning_rate": 0.0002, "loss": 0.5301728248596191, "mean_token_accuracy": 0.7843312919139862, "num_tokens": 12727428.0, "step": 777 }, { "entropy": 0.5264092683792114, "epoch": 2.9029850746268657, "grad_norm": 0.17358112335205078, "learning_rate": 0.0002, "loss": 0.5304536819458008, "mean_token_accuracy": 0.7818145751953125, "num_tokens": 12743928.0, "step": 778 }, { "entropy": 0.521320641040802, "epoch": 2.906716417910448, "grad_norm": 0.19404703378677368, "learning_rate": 0.0002, "loss": 0.5308122038841248, "mean_token_accuracy": 0.7851481735706329, "num_tokens": 12760425.0, "step": 779 }, { "entropy": 0.5253891497850418, "epoch": 2.91044776119403, "grad_norm": 0.23603156208992004, "learning_rate": 0.0002, "loss": 0.537718653678894, "mean_token_accuracy": 0.7832214832305908, "num_tokens": 12776783.0, "step": 780 }, { "entropy": 0.5522697567939758, "epoch": 2.914179104477612, "grad_norm": 0.16655920445919037, "learning_rate": 0.0002, "loss": 0.5428380966186523, "mean_token_accuracy": 0.7817497551441193, "num_tokens": 12793260.0, "step": 781 }, { "entropy": 0.5386251360177994, "epoch": 2.917910447761194, "grad_norm": 0.17462746798992157, "learning_rate": 0.0002, "loss": 0.5273305773735046, "mean_token_accuracy": 0.7866194099187851, "num_tokens": 12809754.0, "step": 782 }, { "entropy": 0.5417182147502899, "epoch": 2.921641791044776, "grad_norm": 0.16420036554336548, "learning_rate": 0.0002, "loss": 0.5311017632484436, "mean_token_accuracy": 0.7847865968942642, "num_tokens": 12826135.0, "step": 783 }, { "entropy": 0.5094658881425858, "epoch": 2.925373134328358, "grad_norm": 0.209514319896698, "learning_rate": 0.0002, "loss": 0.5230738520622253, "mean_token_accuracy": 0.7901812642812729, "num_tokens": 12842378.0, "step": 784 }, { "entropy": 0.5122962892055511, "epoch": 2.9291044776119404, "grad_norm": 0.17986896634101868, "learning_rate": 0.0002, "loss": 0.5213406682014465, "mean_token_accuracy": 0.7899868190288544, "num_tokens": 12858715.0, "step": 785 }, { "entropy": 0.5239143073558807, "epoch": 2.9328358208955225, "grad_norm": 0.17349380254745483, "learning_rate": 0.0002, "loss": 0.5260440707206726, "mean_token_accuracy": 0.7880281209945679, "num_tokens": 12875134.0, "step": 786 }, { "entropy": 0.5183478370308876, "epoch": 2.9365671641791042, "grad_norm": 0.15738630294799805, "learning_rate": 0.0002, "loss": 0.5146017074584961, "mean_token_accuracy": 0.7944561541080475, "num_tokens": 12891435.0, "step": 787 }, { "entropy": 0.5321111530065536, "epoch": 2.9402985074626864, "grad_norm": 0.169599249958992, "learning_rate": 0.0002, "loss": 0.5332249402999878, "mean_token_accuracy": 0.7841628640890121, "num_tokens": 12907955.0, "step": 788 }, { "entropy": 0.5348423272371292, "epoch": 2.9440298507462686, "grad_norm": 0.1703958362340927, "learning_rate": 0.0002, "loss": 0.5319628715515137, "mean_token_accuracy": 0.7853727787733078, "num_tokens": 12924187.0, "step": 789 }, { "entropy": 0.5348647981882095, "epoch": 2.9477611940298507, "grad_norm": 0.16257572174072266, "learning_rate": 0.0002, "loss": 0.5274540185928345, "mean_token_accuracy": 0.7864417731761932, "num_tokens": 12940471.0, "step": 790 }, { "entropy": 0.5246876776218414, "epoch": 2.951492537313433, "grad_norm": 0.21989069879055023, "learning_rate": 0.0002, "loss": 0.532191276550293, "mean_token_accuracy": 0.7841058969497681, "num_tokens": 12956753.0, "step": 791 }, { "entropy": 0.5206954181194305, "epoch": 2.955223880597015, "grad_norm": 0.18530453741550446, "learning_rate": 0.0002, "loss": 0.5260450839996338, "mean_token_accuracy": 0.7853500992059708, "num_tokens": 12972983.0, "step": 792 }, { "entropy": 0.5218585133552551, "epoch": 2.958955223880597, "grad_norm": 0.19632470607757568, "learning_rate": 0.0002, "loss": 0.524539589881897, "mean_token_accuracy": 0.7870173752307892, "num_tokens": 12989538.0, "step": 793 }, { "entropy": 0.5301937758922577, "epoch": 2.9626865671641793, "grad_norm": 0.1759789139032364, "learning_rate": 0.0002, "loss": 0.5322460532188416, "mean_token_accuracy": 0.7846620082855225, "num_tokens": 13005865.0, "step": 794 }, { "entropy": 0.5316169708967209, "epoch": 2.966417910447761, "grad_norm": 0.18013249337673187, "learning_rate": 0.0002, "loss": 0.5267240405082703, "mean_token_accuracy": 0.7860967516899109, "num_tokens": 13022162.0, "step": 795 }, { "entropy": 0.5342477560043335, "epoch": 2.970149253731343, "grad_norm": 0.15967167913913727, "learning_rate": 0.0002, "loss": 0.531574010848999, "mean_token_accuracy": 0.7845140397548676, "num_tokens": 13038634.0, "step": 796 }, { "entropy": 0.5358534008264542, "epoch": 2.9738805970149254, "grad_norm": 0.18192364275455475, "learning_rate": 0.0002, "loss": 0.531234085559845, "mean_token_accuracy": 0.7822518199682236, "num_tokens": 13054913.0, "step": 797 }, { "entropy": 0.5332595482468605, "epoch": 2.9776119402985075, "grad_norm": 0.16098462045192719, "learning_rate": 0.0002, "loss": 0.5331971645355225, "mean_token_accuracy": 0.7841719388961792, "num_tokens": 13071687.0, "step": 798 }, { "entropy": 0.5196807980537415, "epoch": 2.9813432835820897, "grad_norm": 0.16396892070770264, "learning_rate": 0.0002, "loss": 0.5180687308311462, "mean_token_accuracy": 0.79112908244133, "num_tokens": 13088263.0, "step": 799 }, { "entropy": 0.5160314440727234, "epoch": 2.9850746268656714, "grad_norm": 0.18938018381595612, "learning_rate": 0.0002, "loss": 0.5278008580207825, "mean_token_accuracy": 0.7868732959032059, "num_tokens": 13104420.0, "step": 800 }, { "entropy": 0.5099834352731705, "epoch": 2.9888059701492535, "grad_norm": 0.18755869567394257, "learning_rate": 0.0002, "loss": 0.5147690176963806, "mean_token_accuracy": 0.790816992521286, "num_tokens": 13120862.0, "step": 801 }, { "entropy": 0.5440191924571991, "epoch": 2.9925373134328357, "grad_norm": 0.16148996353149414, "learning_rate": 0.0002, "loss": 0.5402988195419312, "mean_token_accuracy": 0.7817222625017166, "num_tokens": 13137523.0, "step": 802 }, { "entropy": 0.5369501113891602, "epoch": 2.996268656716418, "grad_norm": 0.17043927311897278, "learning_rate": 0.0002, "loss": 0.5288562178611755, "mean_token_accuracy": 0.7866682559251785, "num_tokens": 13153684.0, "step": 803 }, { "entropy": 0.5347233563661575, "epoch": 3.0, "grad_norm": 0.17972980439662933, "learning_rate": 0.0002, "loss": 0.5365173816680908, "mean_token_accuracy": 0.782272219657898, "num_tokens": 13170027.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2276685185818296e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }