{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1184664368629456, "epoch": 0.003734827264239029, "grad_norm": 0.411286324262619, "learning_rate": 0.0002, "loss": 2.457291841506958, "mean_token_accuracy": 0.5408388376235962, "num_tokens": 16491.0, "step": 1 }, { "entropy": 1.2453091144561768, "epoch": 0.007469654528478058, "grad_norm": 0.37089085578918457, "learning_rate": 0.0002, "loss": 2.1685681343078613, "mean_token_accuracy": 0.5649923086166382, "num_tokens": 32759.0, "step": 2 }, { "entropy": 1.4064331948757172, "epoch": 0.011204481792717087, "grad_norm": 0.2906820774078369, "learning_rate": 0.0002, "loss": 1.710010051727295, "mean_token_accuracy": 0.5920955091714859, "num_tokens": 49020.0, "step": 3 }, { "entropy": 1.364386886358261, "epoch": 0.014939309056956116, "grad_norm": 0.22797873616218567, "learning_rate": 0.0002, "loss": 1.3888747692108154, "mean_token_accuracy": 0.6421842128038406, "num_tokens": 65604.0, "step": 4 }, { "entropy": 1.3538264036178589, "epoch": 0.018674136321195144, "grad_norm": 0.2804432809352875, "learning_rate": 0.0002, "loss": 1.29875648021698, "mean_token_accuracy": 0.6417761594057083, "num_tokens": 81941.0, "step": 5 }, { "entropy": 1.2739673852920532, "epoch": 0.022408963585434174, "grad_norm": 0.15289267897605896, "learning_rate": 0.0002, "loss": 1.1843445301055908, "mean_token_accuracy": 0.6661720275878906, "num_tokens": 98022.0, "step": 6 }, { "entropy": 1.1963406801223755, "epoch": 0.026143790849673203, "grad_norm": 0.1057928279042244, "learning_rate": 0.0002, "loss": 1.089585304260254, "mean_token_accuracy": 0.6709173172712326, "num_tokens": 114552.0, "step": 7 }, { "entropy": 1.1228278279304504, "epoch": 0.029878618113912233, "grad_norm": 0.10864286869764328, "learning_rate": 0.0002, "loss": 1.028782844543457, "mean_token_accuracy": 0.6796794384717941, "num_tokens": 130943.0, "step": 8 }, { "entropy": 1.0330480933189392, "epoch": 0.03361344537815126, "grad_norm": 0.1194700375199318, "learning_rate": 0.0002, "loss": 0.978877067565918, "mean_token_accuracy": 0.6896098554134369, "num_tokens": 147432.0, "step": 9 }, { "entropy": 0.9659490436315536, "epoch": 0.03734827264239029, "grad_norm": 0.13075368106365204, "learning_rate": 0.0002, "loss": 0.93321692943573, "mean_token_accuracy": 0.6966541409492493, "num_tokens": 163753.0, "step": 10 }, { "entropy": 0.9611389189958572, "epoch": 0.04108309990662932, "grad_norm": 0.10369610041379929, "learning_rate": 0.0002, "loss": 0.8770816922187805, "mean_token_accuracy": 0.7034913301467896, "num_tokens": 180090.0, "step": 11 }, { "entropy": 0.9063249081373215, "epoch": 0.04481792717086835, "grad_norm": 0.10426584631204605, "learning_rate": 0.0002, "loss": 0.8171504139900208, "mean_token_accuracy": 0.7150022834539413, "num_tokens": 196381.0, "step": 12 }, { "entropy": 0.8290252089500427, "epoch": 0.04855275443510738, "grad_norm": 0.10911860316991806, "learning_rate": 0.0002, "loss": 0.7891132831573486, "mean_token_accuracy": 0.7208491563796997, "num_tokens": 212795.0, "step": 13 }, { "entropy": 0.7808938026428223, "epoch": 0.05228758169934641, "grad_norm": 0.10144662111997604, "learning_rate": 0.0002, "loss": 0.7427304983139038, "mean_token_accuracy": 0.7313003540039062, "num_tokens": 228936.0, "step": 14 }, { "entropy": 0.7421854734420776, "epoch": 0.056022408963585436, "grad_norm": 0.6942080855369568, "learning_rate": 0.0002, "loss": 0.7379668354988098, "mean_token_accuracy": 0.7287779599428177, "num_tokens": 245241.0, "step": 15 }, { "entropy": 0.7045212388038635, "epoch": 0.059757236227824466, "grad_norm": 0.16263937950134277, "learning_rate": 0.0002, "loss": 0.7117007374763489, "mean_token_accuracy": 0.7335064858198166, "num_tokens": 261386.0, "step": 16 }, { "entropy": 0.6911872327327728, "epoch": 0.06349206349206349, "grad_norm": 0.08423176407814026, "learning_rate": 0.0002, "loss": 0.6914121508598328, "mean_token_accuracy": 0.7408997714519501, "num_tokens": 278017.0, "step": 17 }, { "entropy": 0.6928284466266632, "epoch": 0.06722689075630252, "grad_norm": 0.08306165784597397, "learning_rate": 0.0002, "loss": 0.679314911365509, "mean_token_accuracy": 0.7417374551296234, "num_tokens": 294613.0, "step": 18 }, { "entropy": 0.6805895417928696, "epoch": 0.07096171802054155, "grad_norm": 0.7392253279685974, "learning_rate": 0.0002, "loss": 0.6667531728744507, "mean_token_accuracy": 0.7472580522298813, "num_tokens": 311040.0, "step": 19 }, { "entropy": 0.6846933215856552, "epoch": 0.07469654528478058, "grad_norm": 0.08478110283613205, "learning_rate": 0.0002, "loss": 0.6531012654304504, "mean_token_accuracy": 0.7482306957244873, "num_tokens": 327255.0, "step": 20 }, { "entropy": 0.6761725544929504, "epoch": 0.0784313725490196, "grad_norm": 0.07354654371738434, "learning_rate": 0.0002, "loss": 0.6507971882820129, "mean_token_accuracy": 0.7495593726634979, "num_tokens": 343726.0, "step": 21 }, { "entropy": 0.6475691944360733, "epoch": 0.08216619981325864, "grad_norm": 0.0701100155711174, "learning_rate": 0.0002, "loss": 0.6324924826622009, "mean_token_accuracy": 0.7519394010305405, "num_tokens": 360032.0, "step": 22 }, { "entropy": 0.6286474466323853, "epoch": 0.08590102707749767, "grad_norm": 0.07334811985492706, "learning_rate": 0.0002, "loss": 0.6221117377281189, "mean_token_accuracy": 0.7562299370765686, "num_tokens": 376211.0, "step": 23 }, { "entropy": 0.6444061696529388, "epoch": 0.0896358543417367, "grad_norm": 0.10214248299598694, "learning_rate": 0.0002, "loss": 0.6270927786827087, "mean_token_accuracy": 0.7587939649820328, "num_tokens": 392746.0, "step": 24 }, { "entropy": 0.6239012628793716, "epoch": 0.09337068160597572, "grad_norm": 0.07120268046855927, "learning_rate": 0.0002, "loss": 0.6152804493904114, "mean_token_accuracy": 0.7588517516851425, "num_tokens": 409085.0, "step": 25 }, { "entropy": 0.6276111602783203, "epoch": 0.09710550887021475, "grad_norm": 0.05954922363162041, "learning_rate": 0.0002, "loss": 0.6084893345832825, "mean_token_accuracy": 0.7613021731376648, "num_tokens": 425336.0, "step": 26 }, { "entropy": 0.6411866247653961, "epoch": 0.10084033613445378, "grad_norm": 0.05856655165553093, "learning_rate": 0.0002, "loss": 0.6222058534622192, "mean_token_accuracy": 0.7564119845628738, "num_tokens": 441729.0, "step": 27 }, { "entropy": 0.6264622807502747, "epoch": 0.10457516339869281, "grad_norm": 0.06027727574110031, "learning_rate": 0.0002, "loss": 0.6105791330337524, "mean_token_accuracy": 0.7609841376543045, "num_tokens": 457957.0, "step": 28 }, { "entropy": 0.6167244166135788, "epoch": 0.10830999066293184, "grad_norm": 0.07074937224388123, "learning_rate": 0.0002, "loss": 0.6111780405044556, "mean_token_accuracy": 0.7601886689662933, "num_tokens": 474399.0, "step": 29 }, { "entropy": 0.6115850210189819, "epoch": 0.11204481792717087, "grad_norm": 0.07707173377275467, "learning_rate": 0.0002, "loss": 0.6015152335166931, "mean_token_accuracy": 0.7627497315406799, "num_tokens": 490919.0, "step": 30 }, { "entropy": 0.6094368547201157, "epoch": 0.1157796451914099, "grad_norm": 0.059265896677970886, "learning_rate": 0.0002, "loss": 0.6023207902908325, "mean_token_accuracy": 0.758778989315033, "num_tokens": 507283.0, "step": 31 }, { "entropy": 0.6125481128692627, "epoch": 0.11951447245564893, "grad_norm": 0.07099295407533646, "learning_rate": 0.0002, "loss": 0.603573203086853, "mean_token_accuracy": 0.7601557075977325, "num_tokens": 523521.0, "step": 32 }, { "entropy": 0.6020256727933884, "epoch": 0.12324929971988796, "grad_norm": 0.05661124736070633, "learning_rate": 0.0002, "loss": 0.5916649103164673, "mean_token_accuracy": 0.7667604386806488, "num_tokens": 540024.0, "step": 33 }, { "entropy": 0.5748983919620514, "epoch": 0.12698412698412698, "grad_norm": 0.05405418947339058, "learning_rate": 0.0002, "loss": 0.5715272426605225, "mean_token_accuracy": 0.7717257738113403, "num_tokens": 555993.0, "step": 34 }, { "entropy": 0.5811779201030731, "epoch": 0.13071895424836602, "grad_norm": 0.04870233312249184, "learning_rate": 0.0002, "loss": 0.5783013701438904, "mean_token_accuracy": 0.7701490372419357, "num_tokens": 572358.0, "step": 35 }, { "entropy": 0.574293926358223, "epoch": 0.13445378151260504, "grad_norm": 0.05332570523023605, "learning_rate": 0.0002, "loss": 0.5724313259124756, "mean_token_accuracy": 0.7740762829780579, "num_tokens": 588766.0, "step": 36 }, { "entropy": 0.5665481090545654, "epoch": 0.13818860877684408, "grad_norm": 0.0575035996735096, "learning_rate": 0.0002, "loss": 0.5736980438232422, "mean_token_accuracy": 0.7706244140863419, "num_tokens": 604968.0, "step": 37 }, { "entropy": 0.5721801668405533, "epoch": 0.1419234360410831, "grad_norm": 0.07653734087944031, "learning_rate": 0.0002, "loss": 0.5833261013031006, "mean_token_accuracy": 0.7672377377748489, "num_tokens": 621192.0, "step": 38 }, { "entropy": 0.5661971271038055, "epoch": 0.14565826330532214, "grad_norm": 0.052845459431409836, "learning_rate": 0.0002, "loss": 0.5691311955451965, "mean_token_accuracy": 0.7725834846496582, "num_tokens": 637384.0, "step": 39 }, { "entropy": 0.5870122313499451, "epoch": 0.14939309056956115, "grad_norm": 0.05704643577337265, "learning_rate": 0.0002, "loss": 0.5838981866836548, "mean_token_accuracy": 0.7632379680871964, "num_tokens": 653697.0, "step": 40 }, { "entropy": 0.5808418691158295, "epoch": 0.1531279178338002, "grad_norm": 0.05715522915124893, "learning_rate": 0.0002, "loss": 0.5737625360488892, "mean_token_accuracy": 0.7728984951972961, "num_tokens": 670046.0, "step": 41 }, { "entropy": 0.5726363211870193, "epoch": 0.1568627450980392, "grad_norm": 0.053971655666828156, "learning_rate": 0.0002, "loss": 0.5629984736442566, "mean_token_accuracy": 0.7752888798713684, "num_tokens": 686076.0, "step": 42 }, { "entropy": 0.5652015507221222, "epoch": 0.16059757236227826, "grad_norm": 0.04180985689163208, "learning_rate": 0.0002, "loss": 0.5623056292533875, "mean_token_accuracy": 0.7748470306396484, "num_tokens": 702484.0, "step": 43 }, { "entropy": 0.5733779072761536, "epoch": 0.16433239962651727, "grad_norm": 0.050310708582401276, "learning_rate": 0.0002, "loss": 0.5759532451629639, "mean_token_accuracy": 0.7717497199773788, "num_tokens": 718709.0, "step": 44 }, { "entropy": 0.5682821422815323, "epoch": 0.16806722689075632, "grad_norm": 0.049945104867219925, "learning_rate": 0.0002, "loss": 0.5656522512435913, "mean_token_accuracy": 0.7735471576452255, "num_tokens": 735195.0, "step": 45 }, { "entropy": 0.5685591697692871, "epoch": 0.17180205415499533, "grad_norm": 0.044939614832401276, "learning_rate": 0.0002, "loss": 0.5674217939376831, "mean_token_accuracy": 0.7736205905675888, "num_tokens": 751212.0, "step": 46 }, { "entropy": 0.5851640552282333, "epoch": 0.17553688141923435, "grad_norm": 0.0478069968521595, "learning_rate": 0.0002, "loss": 0.5874634981155396, "mean_token_accuracy": 0.7659626305103302, "num_tokens": 767689.0, "step": 47 }, { "entropy": 0.5731439292430878, "epoch": 0.1792717086834734, "grad_norm": 0.046887464821338654, "learning_rate": 0.0002, "loss": 0.571601152420044, "mean_token_accuracy": 0.7696335017681122, "num_tokens": 784074.0, "step": 48 }, { "entropy": 0.5621766149997711, "epoch": 0.1830065359477124, "grad_norm": 0.04711559787392616, "learning_rate": 0.0002, "loss": 0.5606247782707214, "mean_token_accuracy": 0.7760322690010071, "num_tokens": 800292.0, "step": 49 }, { "entropy": 0.5671460330486298, "epoch": 0.18674136321195145, "grad_norm": 0.04404276981949806, "learning_rate": 0.0002, "loss": 0.5589705109596252, "mean_token_accuracy": 0.7788618206977844, "num_tokens": 816651.0, "step": 50 }, { "entropy": 0.5850909501314163, "epoch": 0.19047619047619047, "grad_norm": 0.04509448632597923, "learning_rate": 0.0002, "loss": 0.5727294683456421, "mean_token_accuracy": 0.7689620703458786, "num_tokens": 833150.0, "step": 51 }, { "entropy": 0.585056334733963, "epoch": 0.1942110177404295, "grad_norm": 0.04984965920448303, "learning_rate": 0.0002, "loss": 0.5666245818138123, "mean_token_accuracy": 0.771300658583641, "num_tokens": 849637.0, "step": 52 }, { "entropy": 0.5864798873662949, "epoch": 0.19794584500466852, "grad_norm": 0.03626571223139763, "learning_rate": 0.0002, "loss": 0.5745272636413574, "mean_token_accuracy": 0.7683106511831284, "num_tokens": 865989.0, "step": 53 }, { "entropy": 0.5780556201934814, "epoch": 0.20168067226890757, "grad_norm": 0.043707672506570816, "learning_rate": 0.0002, "loss": 0.5741198062896729, "mean_token_accuracy": 0.7700863778591156, "num_tokens": 882298.0, "step": 54 }, { "entropy": 0.5698854774236679, "epoch": 0.20541549953314658, "grad_norm": 0.04839429631829262, "learning_rate": 0.0002, "loss": 0.5747280120849609, "mean_token_accuracy": 0.7678831219673157, "num_tokens": 898608.0, "step": 55 }, { "entropy": 0.5627169758081436, "epoch": 0.20915032679738563, "grad_norm": 0.04472200199961662, "learning_rate": 0.0002, "loss": 0.5670843124389648, "mean_token_accuracy": 0.7717523276805878, "num_tokens": 914851.0, "step": 56 }, { "entropy": 0.5779636800289154, "epoch": 0.21288515406162464, "grad_norm": 0.040940672159194946, "learning_rate": 0.0002, "loss": 0.5778319239616394, "mean_token_accuracy": 0.7675311863422394, "num_tokens": 931487.0, "step": 57 }, { "entropy": 0.563320592045784, "epoch": 0.2166199813258637, "grad_norm": 0.0448877178132534, "learning_rate": 0.0002, "loss": 0.5575067400932312, "mean_token_accuracy": 0.7765846252441406, "num_tokens": 947878.0, "step": 58 }, { "entropy": 0.6058623939752579, "epoch": 0.2203548085901027, "grad_norm": 0.04985905811190605, "learning_rate": 0.0002, "loss": 0.6082996726036072, "mean_token_accuracy": 0.7539926767349243, "num_tokens": 964324.0, "step": 59 }, { "entropy": 0.5625719428062439, "epoch": 0.22408963585434175, "grad_norm": 0.038407351821660995, "learning_rate": 0.0002, "loss": 0.5598542094230652, "mean_token_accuracy": 0.7735666781663895, "num_tokens": 980437.0, "step": 60 }, { "entropy": 0.5738561451435089, "epoch": 0.22782446311858076, "grad_norm": 0.04555477574467659, "learning_rate": 0.0002, "loss": 0.5709559917449951, "mean_token_accuracy": 0.7690570503473282, "num_tokens": 996568.0, "step": 61 }, { "entropy": 0.5673829317092896, "epoch": 0.2315592903828198, "grad_norm": 0.04602229222655296, "learning_rate": 0.0002, "loss": 0.5713279843330383, "mean_token_accuracy": 0.7713401615619659, "num_tokens": 1012870.0, "step": 62 }, { "entropy": 0.5517095476388931, "epoch": 0.23529411764705882, "grad_norm": 0.043136853724718094, "learning_rate": 0.0002, "loss": 0.5557603240013123, "mean_token_accuracy": 0.7759266495704651, "num_tokens": 1029066.0, "step": 63 }, { "entropy": 0.5658771097660065, "epoch": 0.23902894491129786, "grad_norm": 0.04121146723628044, "learning_rate": 0.0002, "loss": 0.5609080791473389, "mean_token_accuracy": 0.7747898399829865, "num_tokens": 1045590.0, "step": 64 }, { "entropy": 0.549357607960701, "epoch": 0.24276377217553688, "grad_norm": 0.044083524495363235, "learning_rate": 0.0002, "loss": 0.5459793210029602, "mean_token_accuracy": 0.7811493426561356, "num_tokens": 1061874.0, "step": 65 }, { "entropy": 0.5573842078447342, "epoch": 0.24649859943977592, "grad_norm": 0.04087769240140915, "learning_rate": 0.0002, "loss": 0.5592548847198486, "mean_token_accuracy": 0.775547593832016, "num_tokens": 1078103.0, "step": 66 }, { "entropy": 0.5658538043498993, "epoch": 0.25023342670401494, "grad_norm": 0.03777799755334854, "learning_rate": 0.0002, "loss": 0.5519559979438782, "mean_token_accuracy": 0.776710718870163, "num_tokens": 1094650.0, "step": 67 }, { "entropy": 0.583881214261055, "epoch": 0.25396825396825395, "grad_norm": 0.044072795659303665, "learning_rate": 0.0002, "loss": 0.5742916464805603, "mean_token_accuracy": 0.7709541469812393, "num_tokens": 1110961.0, "step": 68 }, { "entropy": 0.5731556266546249, "epoch": 0.25770308123249297, "grad_norm": 0.045354213565588, "learning_rate": 0.0002, "loss": 0.5748150944709778, "mean_token_accuracy": 0.7677215486764908, "num_tokens": 1127571.0, "step": 69 }, { "entropy": 0.5605138093233109, "epoch": 0.26143790849673204, "grad_norm": 0.03672546148300171, "learning_rate": 0.0002, "loss": 0.5605238080024719, "mean_token_accuracy": 0.7723149508237839, "num_tokens": 1143932.0, "step": 70 }, { "entropy": 0.5381516218185425, "epoch": 0.26517273576097106, "grad_norm": 0.04045504331588745, "learning_rate": 0.0002, "loss": 0.5391750335693359, "mean_token_accuracy": 0.7822330445051193, "num_tokens": 1159972.0, "step": 71 }, { "entropy": 0.5469133257865906, "epoch": 0.2689075630252101, "grad_norm": 0.03917838633060455, "learning_rate": 0.0002, "loss": 0.552070140838623, "mean_token_accuracy": 0.776424303650856, "num_tokens": 1176122.0, "step": 72 }, { "entropy": 0.5674256831407547, "epoch": 0.2726423902894491, "grad_norm": 0.0378127247095108, "learning_rate": 0.0002, "loss": 0.5667495727539062, "mean_token_accuracy": 0.7705131769180298, "num_tokens": 1192483.0, "step": 73 }, { "entropy": 0.568048432469368, "epoch": 0.27637721755368816, "grad_norm": 0.035798948258161545, "learning_rate": 0.0002, "loss": 0.5668107867240906, "mean_token_accuracy": 0.7710251212120056, "num_tokens": 1209110.0, "step": 74 }, { "entropy": 0.5850978642702103, "epoch": 0.2801120448179272, "grad_norm": 0.03812864422798157, "learning_rate": 0.0002, "loss": 0.5801389217376709, "mean_token_accuracy": 0.7685801237821579, "num_tokens": 1225656.0, "step": 75 }, { "entropy": 0.5744365155696869, "epoch": 0.2838468720821662, "grad_norm": 0.03252263367176056, "learning_rate": 0.0002, "loss": 0.5715938806533813, "mean_token_accuracy": 0.7678718268871307, "num_tokens": 1241986.0, "step": 76 }, { "entropy": 0.5737413763999939, "epoch": 0.2875816993464052, "grad_norm": 0.03566081449389458, "learning_rate": 0.0002, "loss": 0.5768669843673706, "mean_token_accuracy": 0.768094465136528, "num_tokens": 1258437.0, "step": 77 }, { "entropy": 0.5403539538383484, "epoch": 0.2913165266106443, "grad_norm": 0.03335001692175865, "learning_rate": 0.0002, "loss": 0.5388357639312744, "mean_token_accuracy": 0.7831095159053802, "num_tokens": 1274706.0, "step": 78 }, { "entropy": 0.5797998905181885, "epoch": 0.2950513538748833, "grad_norm": 0.036791976541280746, "learning_rate": 0.0002, "loss": 0.5749024152755737, "mean_token_accuracy": 0.7673221081495285, "num_tokens": 1291375.0, "step": 79 }, { "entropy": 0.5663541257381439, "epoch": 0.2987861811391223, "grad_norm": 0.04374934732913971, "learning_rate": 0.0002, "loss": 0.5602323412895203, "mean_token_accuracy": 0.7732456177473068, "num_tokens": 1307621.0, "step": 80 }, { "entropy": 0.5841106921434402, "epoch": 0.3025210084033613, "grad_norm": 0.03585761412978172, "learning_rate": 0.0002, "loss": 0.5774515271186829, "mean_token_accuracy": 0.7695471197366714, "num_tokens": 1324292.0, "step": 81 }, { "entropy": 0.5769794583320618, "epoch": 0.3062558356676004, "grad_norm": 0.032680612057447433, "learning_rate": 0.0002, "loss": 0.5758101940155029, "mean_token_accuracy": 0.7648481875658035, "num_tokens": 1340714.0, "step": 82 }, { "entropy": 0.557876318693161, "epoch": 0.3099906629318394, "grad_norm": 0.036271534860134125, "learning_rate": 0.0002, "loss": 0.5576061010360718, "mean_token_accuracy": 0.7769448161125183, "num_tokens": 1357063.0, "step": 83 }, { "entropy": 0.5480719208717346, "epoch": 0.3137254901960784, "grad_norm": 0.04093662649393082, "learning_rate": 0.0002, "loss": 0.5554815530776978, "mean_token_accuracy": 0.7730589210987091, "num_tokens": 1373048.0, "step": 84 }, { "entropy": 0.5651550590991974, "epoch": 0.31746031746031744, "grad_norm": 0.03605310246348381, "learning_rate": 0.0002, "loss": 0.5752359628677368, "mean_token_accuracy": 0.767627626657486, "num_tokens": 1389533.0, "step": 85 }, { "entropy": 0.5644277483224869, "epoch": 0.3211951447245565, "grad_norm": 0.03757842630147934, "learning_rate": 0.0002, "loss": 0.5678563117980957, "mean_token_accuracy": 0.7691835910081863, "num_tokens": 1406026.0, "step": 86 }, { "entropy": 0.5682397186756134, "epoch": 0.32492997198879553, "grad_norm": 0.033709567040205, "learning_rate": 0.0002, "loss": 0.5628086924552917, "mean_token_accuracy": 0.7722707390785217, "num_tokens": 1422562.0, "step": 87 }, { "entropy": 0.5635691732168198, "epoch": 0.32866479925303455, "grad_norm": 0.03606971353292465, "learning_rate": 0.0002, "loss": 0.5536225438117981, "mean_token_accuracy": 0.7781998217105865, "num_tokens": 1438929.0, "step": 88 }, { "entropy": 0.5673100650310516, "epoch": 0.33239962651727356, "grad_norm": 0.03673219308257103, "learning_rate": 0.0002, "loss": 0.5621542930603027, "mean_token_accuracy": 0.7736853212118149, "num_tokens": 1455379.0, "step": 89 }, { "entropy": 0.5614307522773743, "epoch": 0.33613445378151263, "grad_norm": 0.037591755390167236, "learning_rate": 0.0002, "loss": 0.5566410422325134, "mean_token_accuracy": 0.7733979523181915, "num_tokens": 1471484.0, "step": 90 }, { "entropy": 0.5533501952886581, "epoch": 0.33986928104575165, "grad_norm": 0.03392329066991806, "learning_rate": 0.0002, "loss": 0.5534408092498779, "mean_token_accuracy": 0.7756673395633698, "num_tokens": 1487940.0, "step": 91 }, { "entropy": 0.5670682638883591, "epoch": 0.34360410830999066, "grad_norm": 0.038744084537029266, "learning_rate": 0.0002, "loss": 0.5757073760032654, "mean_token_accuracy": 0.7674537003040314, "num_tokens": 1504516.0, "step": 92 }, { "entropy": 0.5437405109405518, "epoch": 0.3473389355742297, "grad_norm": 0.03382673114538193, "learning_rate": 0.0002, "loss": 0.5484196543693542, "mean_token_accuracy": 0.7756420075893402, "num_tokens": 1520914.0, "step": 93 }, { "entropy": 0.5495916306972504, "epoch": 0.3510737628384687, "grad_norm": 0.03743721917271614, "learning_rate": 0.0002, "loss": 0.5565813183784485, "mean_token_accuracy": 0.7735388725996017, "num_tokens": 1537124.0, "step": 94 }, { "entropy": 0.568208858370781, "epoch": 0.35480859010270777, "grad_norm": 0.03229435160756111, "learning_rate": 0.0002, "loss": 0.5690167546272278, "mean_token_accuracy": 0.7696976512670517, "num_tokens": 1553562.0, "step": 95 }, { "entropy": 0.5612770318984985, "epoch": 0.3585434173669468, "grad_norm": 0.03424388915300369, "learning_rate": 0.0002, "loss": 0.5587109923362732, "mean_token_accuracy": 0.774835541844368, "num_tokens": 1569896.0, "step": 96 }, { "entropy": 0.5718783587217331, "epoch": 0.3622782446311858, "grad_norm": 0.033101778477430344, "learning_rate": 0.0002, "loss": 0.5643482208251953, "mean_token_accuracy": 0.7721461206674576, "num_tokens": 1586284.0, "step": 97 }, { "entropy": 0.5654337555170059, "epoch": 0.3660130718954248, "grad_norm": 0.035547658801078796, "learning_rate": 0.0002, "loss": 0.5555263757705688, "mean_token_accuracy": 0.7783078551292419, "num_tokens": 1602584.0, "step": 98 }, { "entropy": 0.5639571994543076, "epoch": 0.3697478991596639, "grad_norm": 0.03868361935019493, "learning_rate": 0.0002, "loss": 0.5630732178688049, "mean_token_accuracy": 0.773595780134201, "num_tokens": 1618810.0, "step": 99 }, { "entropy": 0.568704292178154, "epoch": 0.3734827264239029, "grad_norm": 0.03236787021160126, "learning_rate": 0.0002, "loss": 0.5669816732406616, "mean_token_accuracy": 0.7704071253538132, "num_tokens": 1635290.0, "step": 100 }, { "entropy": 0.551744356751442, "epoch": 0.3772175536881419, "grad_norm": 0.03913586586713791, "learning_rate": 0.0002, "loss": 0.5576678514480591, "mean_token_accuracy": 0.7771230936050415, "num_tokens": 1651818.0, "step": 101 }, { "entropy": 0.5260472893714905, "epoch": 0.38095238095238093, "grad_norm": 0.035290028899908066, "learning_rate": 0.0002, "loss": 0.5295023918151855, "mean_token_accuracy": 0.7862183749675751, "num_tokens": 1668252.0, "step": 102 }, { "entropy": 0.5585302114486694, "epoch": 0.38468720821662, "grad_norm": 0.03497280925512314, "learning_rate": 0.0002, "loss": 0.5631093978881836, "mean_token_accuracy": 0.7744487524032593, "num_tokens": 1684730.0, "step": 103 }, { "entropy": 0.5317506641149521, "epoch": 0.388422035480859, "grad_norm": 0.038267575204372406, "learning_rate": 0.0002, "loss": 0.5366777777671814, "mean_token_accuracy": 0.7837612628936768, "num_tokens": 1700724.0, "step": 104 }, { "entropy": 0.5369188189506531, "epoch": 0.39215686274509803, "grad_norm": 0.03429935500025749, "learning_rate": 0.0002, "loss": 0.5283028483390808, "mean_token_accuracy": 0.7885325402021408, "num_tokens": 1717105.0, "step": 105 }, { "entropy": 0.5693536698818207, "epoch": 0.39589169000933705, "grad_norm": 0.038153599947690964, "learning_rate": 0.0002, "loss": 0.5606598258018494, "mean_token_accuracy": 0.7737682908773422, "num_tokens": 1733363.0, "step": 106 }, { "entropy": 0.5737781524658203, "epoch": 0.3996265172735761, "grad_norm": 0.034137699753046036, "learning_rate": 0.0002, "loss": 0.5676036477088928, "mean_token_accuracy": 0.7725923210382462, "num_tokens": 1749928.0, "step": 107 }, { "entropy": 0.5680664926767349, "epoch": 0.40336134453781514, "grad_norm": 0.035801518708467484, "learning_rate": 0.0002, "loss": 0.5669195055961609, "mean_token_accuracy": 0.7720014601945877, "num_tokens": 1766520.0, "step": 108 }, { "entropy": 0.5640780180692673, "epoch": 0.40709617180205415, "grad_norm": 0.036836352199316025, "learning_rate": 0.0002, "loss": 0.5703918933868408, "mean_token_accuracy": 0.7716377079486847, "num_tokens": 1783002.0, "step": 109 }, { "entropy": 0.554967850446701, "epoch": 0.41083099906629317, "grad_norm": 0.03882612660527229, "learning_rate": 0.0002, "loss": 0.5642282962799072, "mean_token_accuracy": 0.7699488997459412, "num_tokens": 1799237.0, "step": 110 }, { "entropy": 0.5514571368694305, "epoch": 0.41456582633053224, "grad_norm": 0.03324515372514725, "learning_rate": 0.0002, "loss": 0.5484537482261658, "mean_token_accuracy": 0.7782372832298279, "num_tokens": 1815769.0, "step": 111 }, { "entropy": 0.573599174618721, "epoch": 0.41830065359477125, "grad_norm": 0.03034473955631256, "learning_rate": 0.0002, "loss": 0.5679251551628113, "mean_token_accuracy": 0.7719407975673676, "num_tokens": 1831989.0, "step": 112 }, { "entropy": 0.5896201282739639, "epoch": 0.42203548085901027, "grad_norm": 0.03557023033499718, "learning_rate": 0.0002, "loss": 0.5836873054504395, "mean_token_accuracy": 0.7634387165307999, "num_tokens": 1848590.0, "step": 113 }, { "entropy": 0.5535563677549362, "epoch": 0.4257703081232493, "grad_norm": 0.032203588634729385, "learning_rate": 0.0002, "loss": 0.5510682463645935, "mean_token_accuracy": 0.7764001041650772, "num_tokens": 1864862.0, "step": 114 }, { "entropy": 0.5557997226715088, "epoch": 0.4295051353874883, "grad_norm": 0.033370040357112885, "learning_rate": 0.0002, "loss": 0.5584062933921814, "mean_token_accuracy": 0.7749063074588776, "num_tokens": 1881168.0, "step": 115 }, { "entropy": 0.5543448776006699, "epoch": 0.4332399626517274, "grad_norm": 0.030230488628149033, "learning_rate": 0.0002, "loss": 0.5530171990394592, "mean_token_accuracy": 0.7758816778659821, "num_tokens": 1897482.0, "step": 116 }, { "entropy": 0.5602561086416245, "epoch": 0.4369747899159664, "grad_norm": 0.03355773538351059, "learning_rate": 0.0002, "loss": 0.5631951093673706, "mean_token_accuracy": 0.7723173201084137, "num_tokens": 1913520.0, "step": 117 }, { "entropy": 0.5448198318481445, "epoch": 0.4407096171802054, "grad_norm": 0.03538920357823372, "learning_rate": 0.0002, "loss": 0.5498956441879272, "mean_token_accuracy": 0.7779627591371536, "num_tokens": 1929827.0, "step": 118 }, { "entropy": 0.5492925643920898, "epoch": 0.4444444444444444, "grad_norm": 0.03334996476769447, "learning_rate": 0.0002, "loss": 0.5524949431419373, "mean_token_accuracy": 0.7753683775663376, "num_tokens": 1946145.0, "step": 119 }, { "entropy": 0.5578335374593735, "epoch": 0.4481792717086835, "grad_norm": 0.029814472422003746, "learning_rate": 0.0002, "loss": 0.5506975650787354, "mean_token_accuracy": 0.7767714560031891, "num_tokens": 1962460.0, "step": 120 }, { "entropy": 0.5471834242343903, "epoch": 0.4519140989729225, "grad_norm": 0.030702516436576843, "learning_rate": 0.0002, "loss": 0.5459597110748291, "mean_token_accuracy": 0.7779918015003204, "num_tokens": 1978468.0, "step": 121 }, { "entropy": 0.5746940076351166, "epoch": 0.4556489262371615, "grad_norm": 0.028086913749575615, "learning_rate": 0.0002, "loss": 0.5758755207061768, "mean_token_accuracy": 0.766986295580864, "num_tokens": 1994816.0, "step": 122 }, { "entropy": 0.5609753727912903, "epoch": 0.45938375350140054, "grad_norm": 0.027476167306303978, "learning_rate": 0.0002, "loss": 0.5596047639846802, "mean_token_accuracy": 0.7727872580289841, "num_tokens": 2011498.0, "step": 123 }, { "entropy": 0.5600833296775818, "epoch": 0.4631185807656396, "grad_norm": 0.03369581326842308, "learning_rate": 0.0002, "loss": 0.5641721487045288, "mean_token_accuracy": 0.7693867385387421, "num_tokens": 2027843.0, "step": 124 }, { "entropy": 0.5480703115463257, "epoch": 0.4668534080298786, "grad_norm": 0.029643159359693527, "learning_rate": 0.0002, "loss": 0.554192841053009, "mean_token_accuracy": 0.7775781005620956, "num_tokens": 2044099.0, "step": 125 }, { "entropy": 0.5571865439414978, "epoch": 0.47058823529411764, "grad_norm": 0.032963886857032776, "learning_rate": 0.0002, "loss": 0.5603472590446472, "mean_token_accuracy": 0.7727210968732834, "num_tokens": 2060417.0, "step": 126 }, { "entropy": 0.5587971061468124, "epoch": 0.47432306255835666, "grad_norm": 0.028774971142411232, "learning_rate": 0.0002, "loss": 0.5552476644515991, "mean_token_accuracy": 0.7738739997148514, "num_tokens": 2076710.0, "step": 127 }, { "entropy": 0.5658144652843475, "epoch": 0.4780578898225957, "grad_norm": 0.03230098634958267, "learning_rate": 0.0002, "loss": 0.557459831237793, "mean_token_accuracy": 0.7754161208868027, "num_tokens": 2093196.0, "step": 128 }, { "entropy": 0.5515187084674835, "epoch": 0.48179271708683474, "grad_norm": 0.03461001068353653, "learning_rate": 0.0002, "loss": 0.547848641872406, "mean_token_accuracy": 0.7798665314912796, "num_tokens": 2109091.0, "step": 129 }, { "entropy": 0.5527725219726562, "epoch": 0.48552754435107376, "grad_norm": 0.03391197323799133, "learning_rate": 0.0002, "loss": 0.5531637072563171, "mean_token_accuracy": 0.7753576338291168, "num_tokens": 2125292.0, "step": 130 }, { "entropy": 0.5310224145650864, "epoch": 0.4892623716153128, "grad_norm": 0.037288419902324677, "learning_rate": 0.0002, "loss": 0.5368673801422119, "mean_token_accuracy": 0.7833587974309921, "num_tokens": 2141768.0, "step": 131 }, { "entropy": 0.5471584349870682, "epoch": 0.49299719887955185, "grad_norm": 0.03433871641755104, "learning_rate": 0.0002, "loss": 0.5525721907615662, "mean_token_accuracy": 0.776105523109436, "num_tokens": 2158143.0, "step": 132 }, { "entropy": 0.5587402433156967, "epoch": 0.49673202614379086, "grad_norm": 0.03347739949822426, "learning_rate": 0.0002, "loss": 0.5661599636077881, "mean_token_accuracy": 0.7718635648488998, "num_tokens": 2174416.0, "step": 133 }, { "entropy": 0.5683765709400177, "epoch": 0.5004668534080299, "grad_norm": 0.03381507471203804, "learning_rate": 0.0002, "loss": 0.5622847080230713, "mean_token_accuracy": 0.7744656354188919, "num_tokens": 2190880.0, "step": 134 }, { "entropy": 0.5644540786743164, "epoch": 0.5042016806722689, "grad_norm": 0.03272015228867531, "learning_rate": 0.0002, "loss": 0.5552080869674683, "mean_token_accuracy": 0.7752301692962646, "num_tokens": 2207174.0, "step": 135 }, { "entropy": 0.5678849667310715, "epoch": 0.5079365079365079, "grad_norm": 0.031616441905498505, "learning_rate": 0.0002, "loss": 0.5582877993583679, "mean_token_accuracy": 0.7729764580726624, "num_tokens": 2223657.0, "step": 136 }, { "entropy": 0.560051366686821, "epoch": 0.5116713352007469, "grad_norm": 0.03558259457349777, "learning_rate": 0.0002, "loss": 0.5536358952522278, "mean_token_accuracy": 0.7764490097761154, "num_tokens": 2239931.0, "step": 137 }, { "entropy": 0.5550469309091568, "epoch": 0.5154061624649859, "grad_norm": 0.034295059740543365, "learning_rate": 0.0002, "loss": 0.5614034533500671, "mean_token_accuracy": 0.7718400210142136, "num_tokens": 2256301.0, "step": 138 }, { "entropy": 0.5675243437290192, "epoch": 0.5191409897292251, "grad_norm": 0.03538001328706741, "learning_rate": 0.0002, "loss": 0.5784004926681519, "mean_token_accuracy": 0.7684118300676346, "num_tokens": 2272718.0, "step": 139 }, { "entropy": 0.5533763766288757, "epoch": 0.5228758169934641, "grad_norm": 0.034997887909412384, "learning_rate": 0.0002, "loss": 0.563084602355957, "mean_token_accuracy": 0.7709241509437561, "num_tokens": 2289039.0, "step": 140 }, { "entropy": 0.5602118372917175, "epoch": 0.5266106442577031, "grad_norm": 0.033439598977565765, "learning_rate": 0.0002, "loss": 0.5643538236618042, "mean_token_accuracy": 0.7725736945867538, "num_tokens": 2305409.0, "step": 141 }, { "entropy": 0.572220578789711, "epoch": 0.5303454715219421, "grad_norm": 0.02899010293185711, "learning_rate": 0.0002, "loss": 0.56317138671875, "mean_token_accuracy": 0.7727230340242386, "num_tokens": 2321812.0, "step": 142 }, { "entropy": 0.5518327206373215, "epoch": 0.5340802987861811, "grad_norm": 0.03380458429455757, "learning_rate": 0.0002, "loss": 0.5400616526603699, "mean_token_accuracy": 0.7813573479652405, "num_tokens": 2338293.0, "step": 143 }, { "entropy": 0.59617879986763, "epoch": 0.5378151260504201, "grad_norm": 0.03466860204935074, "learning_rate": 0.0002, "loss": 0.58748859167099, "mean_token_accuracy": 0.7642232924699783, "num_tokens": 2354694.0, "step": 144 }, { "entropy": 0.5574633181095123, "epoch": 0.5415499533146592, "grad_norm": 0.030799690634012222, "learning_rate": 0.0002, "loss": 0.5586976408958435, "mean_token_accuracy": 0.774814635515213, "num_tokens": 2370998.0, "step": 145 }, { "entropy": 0.5298123508691788, "epoch": 0.5452847805788982, "grad_norm": 0.032734956592321396, "learning_rate": 0.0002, "loss": 0.5359174609184265, "mean_token_accuracy": 0.782838299870491, "num_tokens": 2387173.0, "step": 146 }, { "entropy": 0.5436026155948639, "epoch": 0.5490196078431373, "grad_norm": 0.03734711930155754, "learning_rate": 0.0002, "loss": 0.5544965267181396, "mean_token_accuracy": 0.7772063612937927, "num_tokens": 2403457.0, "step": 147 }, { "entropy": 0.5453614443540573, "epoch": 0.5527544351073763, "grad_norm": 0.030067089945077896, "learning_rate": 0.0002, "loss": 0.5510781407356262, "mean_token_accuracy": 0.7755871117115021, "num_tokens": 2419735.0, "step": 148 }, { "entropy": 0.55818210542202, "epoch": 0.5564892623716153, "grad_norm": 0.02786589413881302, "learning_rate": 0.0002, "loss": 0.5563390851020813, "mean_token_accuracy": 0.7738417237997055, "num_tokens": 2436098.0, "step": 149 }, { "entropy": 0.5619741082191467, "epoch": 0.5602240896358543, "grad_norm": 0.030777357518672943, "learning_rate": 0.0002, "loss": 0.5554664134979248, "mean_token_accuracy": 0.7789015769958496, "num_tokens": 2452471.0, "step": 150 }, { "entropy": 0.5570534616708755, "epoch": 0.5639589169000934, "grad_norm": 0.03233370929956436, "learning_rate": 0.0002, "loss": 0.5482333898544312, "mean_token_accuracy": 0.7772232443094254, "num_tokens": 2468628.0, "step": 151 }, { "entropy": 0.5588962733745575, "epoch": 0.5676937441643324, "grad_norm": 0.03047763742506504, "learning_rate": 0.0002, "loss": 0.5532917380332947, "mean_token_accuracy": 0.7753781825304031, "num_tokens": 2485072.0, "step": 152 }, { "entropy": 0.549691841006279, "epoch": 0.5714285714285714, "grad_norm": 0.02944052591919899, "learning_rate": 0.0002, "loss": 0.5515119433403015, "mean_token_accuracy": 0.7769780606031418, "num_tokens": 2501327.0, "step": 153 }, { "entropy": 0.5404879450798035, "epoch": 0.5751633986928104, "grad_norm": 0.032262854278087616, "learning_rate": 0.0002, "loss": 0.5476431846618652, "mean_token_accuracy": 0.7793239504098892, "num_tokens": 2517799.0, "step": 154 }, { "entropy": 0.5289865881204605, "epoch": 0.5788982259570495, "grad_norm": 0.03042609617114067, "learning_rate": 0.0002, "loss": 0.531823992729187, "mean_token_accuracy": 0.7862056195735931, "num_tokens": 2534300.0, "step": 155 }, { "entropy": 0.5359181612730026, "epoch": 0.5826330532212886, "grad_norm": 0.030735395848751068, "learning_rate": 0.0002, "loss": 0.5355162024497986, "mean_token_accuracy": 0.7830311506986618, "num_tokens": 2550561.0, "step": 156 }, { "entropy": 0.555221676826477, "epoch": 0.5863678804855276, "grad_norm": 0.03072836995124817, "learning_rate": 0.0002, "loss": 0.5626713037490845, "mean_token_accuracy": 0.7714420855045319, "num_tokens": 2566961.0, "step": 157 }, { "entropy": 0.553142175078392, "epoch": 0.5901027077497666, "grad_norm": 0.030098870396614075, "learning_rate": 0.0002, "loss": 0.5467352867126465, "mean_token_accuracy": 0.7787252068519592, "num_tokens": 2583507.0, "step": 158 }, { "entropy": 0.5665386617183685, "epoch": 0.5938375350140056, "grad_norm": 0.03258649259805679, "learning_rate": 0.0002, "loss": 0.5577669143676758, "mean_token_accuracy": 0.7736402750015259, "num_tokens": 2599944.0, "step": 159 }, { "entropy": 0.5569501370191574, "epoch": 0.5975723622782446, "grad_norm": 0.03186054900288582, "learning_rate": 0.0002, "loss": 0.5573895573616028, "mean_token_accuracy": 0.776360809803009, "num_tokens": 2616293.0, "step": 160 }, { "entropy": 0.5284514650702477, "epoch": 0.6013071895424836, "grad_norm": 0.029392873868346214, "learning_rate": 0.0002, "loss": 0.53591388463974, "mean_token_accuracy": 0.7802938669919968, "num_tokens": 2632542.0, "step": 161 }, { "entropy": 0.5517806857824326, "epoch": 0.6050420168067226, "grad_norm": 0.03547659516334534, "learning_rate": 0.0002, "loss": 0.5624344348907471, "mean_token_accuracy": 0.7713066786527634, "num_tokens": 2648855.0, "step": 162 }, { "entropy": 0.5444875061511993, "epoch": 0.6087768440709617, "grad_norm": 0.032323673367500305, "learning_rate": 0.0002, "loss": 0.5506078004837036, "mean_token_accuracy": 0.7763939499855042, "num_tokens": 2665389.0, "step": 163 }, { "entropy": 0.552508682012558, "epoch": 0.6125116713352008, "grad_norm": 0.029938260093331337, "learning_rate": 0.0002, "loss": 0.5556696653366089, "mean_token_accuracy": 0.774255782365799, "num_tokens": 2681574.0, "step": 164 }, { "entropy": 0.5732054561376572, "epoch": 0.6162464985994398, "grad_norm": 0.027899837121367455, "learning_rate": 0.0002, "loss": 0.5643041133880615, "mean_token_accuracy": 0.7738403379917145, "num_tokens": 2697956.0, "step": 165 }, { "entropy": 0.5575381070375443, "epoch": 0.6199813258636788, "grad_norm": 0.03164415806531906, "learning_rate": 0.0002, "loss": 0.5456005930900574, "mean_token_accuracy": 0.7768769711256027, "num_tokens": 2714390.0, "step": 166 }, { "entropy": 0.5516810864210129, "epoch": 0.6237161531279178, "grad_norm": 0.02569694072008133, "learning_rate": 0.0002, "loss": 0.5495009422302246, "mean_token_accuracy": 0.774631917476654, "num_tokens": 2730912.0, "step": 167 }, { "entropy": 0.5496233999729156, "epoch": 0.6274509803921569, "grad_norm": 0.03019907884299755, "learning_rate": 0.0002, "loss": 0.5496887564659119, "mean_token_accuracy": 0.7817335277795792, "num_tokens": 2747282.0, "step": 168 }, { "entropy": 0.5489860326051712, "epoch": 0.6311858076563959, "grad_norm": 0.03389516472816467, "learning_rate": 0.0002, "loss": 0.5572369694709778, "mean_token_accuracy": 0.7735096365213394, "num_tokens": 2763708.0, "step": 169 }, { "entropy": 0.5558005720376968, "epoch": 0.6349206349206349, "grad_norm": 0.02765459194779396, "learning_rate": 0.0002, "loss": 0.5571833848953247, "mean_token_accuracy": 0.7726074606180191, "num_tokens": 2780084.0, "step": 170 }, { "entropy": 0.5543476939201355, "epoch": 0.6386554621848739, "grad_norm": 0.0267086960375309, "learning_rate": 0.0002, "loss": 0.5579585433006287, "mean_token_accuracy": 0.7720465064048767, "num_tokens": 2796592.0, "step": 171 }, { "entropy": 0.5531720370054245, "epoch": 0.642390289449113, "grad_norm": 0.03003924898803234, "learning_rate": 0.0002, "loss": 0.5539361238479614, "mean_token_accuracy": 0.7745767682790756, "num_tokens": 2813004.0, "step": 172 }, { "entropy": 0.5696417987346649, "epoch": 0.646125116713352, "grad_norm": 0.030649833381175995, "learning_rate": 0.0002, "loss": 0.5720299482345581, "mean_token_accuracy": 0.7685467600822449, "num_tokens": 2829346.0, "step": 173 }, { "entropy": 0.5682009905576706, "epoch": 0.6498599439775911, "grad_norm": 0.028095850721001625, "learning_rate": 0.0002, "loss": 0.5576902627944946, "mean_token_accuracy": 0.7762027978897095, "num_tokens": 2845908.0, "step": 174 }, { "entropy": 0.5714679658412933, "epoch": 0.6535947712418301, "grad_norm": 0.028559835627675056, "learning_rate": 0.0002, "loss": 0.5658706426620483, "mean_token_accuracy": 0.7675664275884628, "num_tokens": 2862417.0, "step": 175 }, { "entropy": 0.5519525855779648, "epoch": 0.6573295985060691, "grad_norm": 0.034554384648799896, "learning_rate": 0.0002, "loss": 0.5615457892417908, "mean_token_accuracy": 0.7730480134487152, "num_tokens": 2878691.0, "step": 176 }, { "entropy": 0.5469972342252731, "epoch": 0.6610644257703081, "grad_norm": 0.038470808416604996, "learning_rate": 0.0002, "loss": 0.5615893006324768, "mean_token_accuracy": 0.7721795290708542, "num_tokens": 2894997.0, "step": 177 }, { "entropy": 0.5659243762493134, "epoch": 0.6647992530345471, "grad_norm": 0.028726449236273766, "learning_rate": 0.0002, "loss": 0.5627461671829224, "mean_token_accuracy": 0.7720647305250168, "num_tokens": 2911504.0, "step": 178 }, { "entropy": 0.5529140681028366, "epoch": 0.6685340802987861, "grad_norm": 0.02865666151046753, "learning_rate": 0.0002, "loss": 0.551171064376831, "mean_token_accuracy": 0.7765299677848816, "num_tokens": 2927890.0, "step": 179 }, { "entropy": 0.5597221851348877, "epoch": 0.6722689075630253, "grad_norm": 0.030919602140784264, "learning_rate": 0.0002, "loss": 0.5537790656089783, "mean_token_accuracy": 0.7759328931570053, "num_tokens": 2944242.0, "step": 180 }, { "entropy": 0.562122106552124, "epoch": 0.6760037348272643, "grad_norm": 0.03044375404715538, "learning_rate": 0.0002, "loss": 0.5568514466285706, "mean_token_accuracy": 0.7706819474697113, "num_tokens": 2960500.0, "step": 181 }, { "entropy": 0.5697348713874817, "epoch": 0.6797385620915033, "grad_norm": 0.031796056777238846, "learning_rate": 0.0002, "loss": 0.5688814520835876, "mean_token_accuracy": 0.7685033828020096, "num_tokens": 2976732.0, "step": 182 }, { "entropy": 0.5696271657943726, "epoch": 0.6834733893557423, "grad_norm": 0.034152235835790634, "learning_rate": 0.0002, "loss": 0.570652186870575, "mean_token_accuracy": 0.7676333039999008, "num_tokens": 2993011.0, "step": 183 }, { "entropy": 0.5509230494499207, "epoch": 0.6872082166199813, "grad_norm": 0.030170850455760956, "learning_rate": 0.0002, "loss": 0.5528304576873779, "mean_token_accuracy": 0.7786384671926498, "num_tokens": 3009475.0, "step": 184 }, { "entropy": 0.549485370516777, "epoch": 0.6909430438842203, "grad_norm": 0.03623858466744423, "learning_rate": 0.0002, "loss": 0.5553773045539856, "mean_token_accuracy": 0.7744152545928955, "num_tokens": 3025920.0, "step": 185 }, { "entropy": 0.5484632700681686, "epoch": 0.6946778711484594, "grad_norm": 0.033118441700935364, "learning_rate": 0.0002, "loss": 0.5544424653053284, "mean_token_accuracy": 0.7758429795503616, "num_tokens": 3042293.0, "step": 186 }, { "entropy": 0.5471510142087936, "epoch": 0.6984126984126984, "grad_norm": 0.027027102187275887, "learning_rate": 0.0002, "loss": 0.5416866540908813, "mean_token_accuracy": 0.7816910296678543, "num_tokens": 3058771.0, "step": 187 }, { "entropy": 0.5579911917448044, "epoch": 0.7021475256769374, "grad_norm": 0.03291584923863411, "learning_rate": 0.0002, "loss": 0.5471009016036987, "mean_token_accuracy": 0.7790512144565582, "num_tokens": 3075134.0, "step": 188 }, { "entropy": 0.5525984019041061, "epoch": 0.7058823529411765, "grad_norm": 0.029011745005846024, "learning_rate": 0.0002, "loss": 0.5483554005622864, "mean_token_accuracy": 0.7763502299785614, "num_tokens": 3091306.0, "step": 189 }, { "entropy": 0.5610422939062119, "epoch": 0.7096171802054155, "grad_norm": 0.02904326282441616, "learning_rate": 0.0002, "loss": 0.5617838501930237, "mean_token_accuracy": 0.7707021087408066, "num_tokens": 3107639.0, "step": 190 }, { "entropy": 0.5382349342107773, "epoch": 0.7133520074696545, "grad_norm": 0.027915941551327705, "learning_rate": 0.0002, "loss": 0.5406217575073242, "mean_token_accuracy": 0.7792213708162308, "num_tokens": 3123888.0, "step": 191 }, { "entropy": 0.5334387570619583, "epoch": 0.7170868347338936, "grad_norm": 0.024687422439455986, "learning_rate": 0.0002, "loss": 0.5337969660758972, "mean_token_accuracy": 0.7827744781970978, "num_tokens": 3140136.0, "step": 192 }, { "entropy": 0.5519388318061829, "epoch": 0.7208216619981326, "grad_norm": 0.03399450331926346, "learning_rate": 0.0002, "loss": 0.5664753317832947, "mean_token_accuracy": 0.7712263911962509, "num_tokens": 3156560.0, "step": 193 }, { "entropy": 0.5329768806695938, "epoch": 0.7245564892623716, "grad_norm": 0.03143489733338356, "learning_rate": 0.0002, "loss": 0.5424296259880066, "mean_token_accuracy": 0.7808002233505249, "num_tokens": 3172868.0, "step": 194 }, { "entropy": 0.5407986044883728, "epoch": 0.7282913165266106, "grad_norm": 0.02865898422896862, "learning_rate": 0.0002, "loss": 0.5426485538482666, "mean_token_accuracy": 0.7797252386808395, "num_tokens": 3188845.0, "step": 195 }, { "entropy": 0.5540356040000916, "epoch": 0.7320261437908496, "grad_norm": 0.031195135787129402, "learning_rate": 0.0002, "loss": 0.5537624359130859, "mean_token_accuracy": 0.772818997502327, "num_tokens": 3205059.0, "step": 196 }, { "entropy": 0.547016367316246, "epoch": 0.7357609710550888, "grad_norm": 0.026600942015647888, "learning_rate": 0.0002, "loss": 0.5409566164016724, "mean_token_accuracy": 0.7801954299211502, "num_tokens": 3221339.0, "step": 197 }, { "entropy": 0.5571199655532837, "epoch": 0.7394957983193278, "grad_norm": 0.027464795857667923, "learning_rate": 0.0002, "loss": 0.5505565404891968, "mean_token_accuracy": 0.7758535593748093, "num_tokens": 3237556.0, "step": 198 }, { "entropy": 0.5562743991613388, "epoch": 0.7432306255835668, "grad_norm": 0.029805589467287064, "learning_rate": 0.0002, "loss": 0.5526044368743896, "mean_token_accuracy": 0.7738559246063232, "num_tokens": 3253871.0, "step": 199 }, { "entropy": 0.5585610568523407, "epoch": 0.7469654528478058, "grad_norm": 0.03004448115825653, "learning_rate": 0.0002, "loss": 0.5598405599594116, "mean_token_accuracy": 0.7726627141237259, "num_tokens": 3269973.0, "step": 200 }, { "entropy": 0.5488641411066055, "epoch": 0.7507002801120448, "grad_norm": 0.027654554694890976, "learning_rate": 0.0002, "loss": 0.5513002276420593, "mean_token_accuracy": 0.7737944573163986, "num_tokens": 3286201.0, "step": 201 }, { "entropy": 0.5287523940205574, "epoch": 0.7544351073762838, "grad_norm": 0.03466613590717316, "learning_rate": 0.0002, "loss": 0.5311362743377686, "mean_token_accuracy": 0.7847718745470047, "num_tokens": 3302467.0, "step": 202 }, { "entropy": 0.5560965240001678, "epoch": 0.7581699346405228, "grad_norm": 0.034095581620931625, "learning_rate": 0.0002, "loss": 0.5613946914672852, "mean_token_accuracy": 0.7737453281879425, "num_tokens": 3318768.0, "step": 203 }, { "entropy": 0.5630687177181244, "epoch": 0.7619047619047619, "grad_norm": 0.03233996778726578, "learning_rate": 0.0002, "loss": 0.564468264579773, "mean_token_accuracy": 0.7691166549921036, "num_tokens": 3335233.0, "step": 204 }, { "entropy": 0.5651765614748001, "epoch": 0.765639589169001, "grad_norm": 0.030395060777664185, "learning_rate": 0.0002, "loss": 0.5597318410873413, "mean_token_accuracy": 0.7716515213251114, "num_tokens": 3351439.0, "step": 205 }, { "entropy": 0.5476003587245941, "epoch": 0.76937441643324, "grad_norm": 0.03382452204823494, "learning_rate": 0.0002, "loss": 0.5447301864624023, "mean_token_accuracy": 0.7816700637340546, "num_tokens": 3367520.0, "step": 206 }, { "entropy": 0.5484471321105957, "epoch": 0.773109243697479, "grad_norm": 0.02830951102077961, "learning_rate": 0.0002, "loss": 0.5454609394073486, "mean_token_accuracy": 0.7790801376104355, "num_tokens": 3383667.0, "step": 207 }, { "entropy": 0.5659755617380142, "epoch": 0.776844070961718, "grad_norm": 0.02530798688530922, "learning_rate": 0.0002, "loss": 0.5655397772789001, "mean_token_accuracy": 0.770569920539856, "num_tokens": 3400150.0, "step": 208 }, { "entropy": 0.5427214205265045, "epoch": 0.780578898225957, "grad_norm": 0.03361448645591736, "learning_rate": 0.0002, "loss": 0.5476981401443481, "mean_token_accuracy": 0.7780336290597916, "num_tokens": 3416165.0, "step": 209 }, { "entropy": 0.5494136065244675, "epoch": 0.7843137254901961, "grad_norm": 0.029303058981895447, "learning_rate": 0.0002, "loss": 0.555971086025238, "mean_token_accuracy": 0.7742915004491806, "num_tokens": 3432668.0, "step": 210 }, { "entropy": 0.5408063977956772, "epoch": 0.7880485527544351, "grad_norm": 0.024706227704882622, "learning_rate": 0.0002, "loss": 0.5423460006713867, "mean_token_accuracy": 0.7791419923305511, "num_tokens": 3449230.0, "step": 211 }, { "entropy": 0.5585084557533264, "epoch": 0.7917833800186741, "grad_norm": 0.031753819435834885, "learning_rate": 0.0002, "loss": 0.5534642934799194, "mean_token_accuracy": 0.7761369943618774, "num_tokens": 3465888.0, "step": 212 }, { "entropy": 0.5470991730690002, "epoch": 0.7955182072829131, "grad_norm": 0.02627946063876152, "learning_rate": 0.0002, "loss": 0.543735921382904, "mean_token_accuracy": 0.7773504257202148, "num_tokens": 3482200.0, "step": 213 }, { "entropy": 0.5522027462720871, "epoch": 0.7992530345471522, "grad_norm": 0.02693161368370056, "learning_rate": 0.0002, "loss": 0.5497567057609558, "mean_token_accuracy": 0.7760942578315735, "num_tokens": 3498472.0, "step": 214 }, { "entropy": 0.5438102185726166, "epoch": 0.8029878618113913, "grad_norm": 0.029677148908376694, "learning_rate": 0.0002, "loss": 0.5449556112289429, "mean_token_accuracy": 0.7757529467344284, "num_tokens": 3514748.0, "step": 215 }, { "entropy": 0.5440456867218018, "epoch": 0.8067226890756303, "grad_norm": 0.028825437650084496, "learning_rate": 0.0002, "loss": 0.5460283160209656, "mean_token_accuracy": 0.7805955857038498, "num_tokens": 3530910.0, "step": 216 }, { "entropy": 0.5444321483373642, "epoch": 0.8104575163398693, "grad_norm": 0.023829322308301926, "learning_rate": 0.0002, "loss": 0.5420593023300171, "mean_token_accuracy": 0.7787522822618484, "num_tokens": 3547036.0, "step": 217 }, { "entropy": 0.5498476177453995, "epoch": 0.8141923436041083, "grad_norm": 0.025729795917868614, "learning_rate": 0.0002, "loss": 0.5429421067237854, "mean_token_accuracy": 0.7785259187221527, "num_tokens": 3563357.0, "step": 218 }, { "entropy": 0.544920951128006, "epoch": 0.8179271708683473, "grad_norm": 0.027102749794721603, "learning_rate": 0.0002, "loss": 0.5383168458938599, "mean_token_accuracy": 0.7817831486463547, "num_tokens": 3579822.0, "step": 219 }, { "entropy": 0.5497463345527649, "epoch": 0.8216619981325863, "grad_norm": 0.0323423407971859, "learning_rate": 0.0002, "loss": 0.5521490573883057, "mean_token_accuracy": 0.7747017741203308, "num_tokens": 3596053.0, "step": 220 }, { "entropy": 0.5389717519283295, "epoch": 0.8253968253968254, "grad_norm": 0.027372388169169426, "learning_rate": 0.0002, "loss": 0.540256679058075, "mean_token_accuracy": 0.7825071215629578, "num_tokens": 3612271.0, "step": 221 }, { "entropy": 0.5472569465637207, "epoch": 0.8291316526610645, "grad_norm": 0.028159258887171745, "learning_rate": 0.0002, "loss": 0.5517306327819824, "mean_token_accuracy": 0.7758912444114685, "num_tokens": 3628658.0, "step": 222 }, { "entropy": 0.5441670119762421, "epoch": 0.8328664799253035, "grad_norm": 0.0357636883854866, "learning_rate": 0.0002, "loss": 0.5485156178474426, "mean_token_accuracy": 0.7771351188421249, "num_tokens": 3645179.0, "step": 223 }, { "entropy": 0.5535278022289276, "epoch": 0.8366013071895425, "grad_norm": 0.032881151884794235, "learning_rate": 0.0002, "loss": 0.5619751811027527, "mean_token_accuracy": 0.7715311944484711, "num_tokens": 3661296.0, "step": 224 }, { "entropy": 0.5683074444532394, "epoch": 0.8403361344537815, "grad_norm": 0.03166094422340393, "learning_rate": 0.0002, "loss": 0.5676220059394836, "mean_token_accuracy": 0.7721768617630005, "num_tokens": 3677506.0, "step": 225 }, { "entropy": 0.5672677755355835, "epoch": 0.8440709617180205, "grad_norm": 0.029754942283034325, "learning_rate": 0.0002, "loss": 0.5636724233627319, "mean_token_accuracy": 0.7715145349502563, "num_tokens": 3693949.0, "step": 226 }, { "entropy": 0.5544100105762482, "epoch": 0.8478057889822596, "grad_norm": 0.027808941900730133, "learning_rate": 0.0002, "loss": 0.5551161170005798, "mean_token_accuracy": 0.7762546241283417, "num_tokens": 3710403.0, "step": 227 }, { "entropy": 0.5428061038255692, "epoch": 0.8515406162464986, "grad_norm": 0.032082680612802505, "learning_rate": 0.0002, "loss": 0.5452495813369751, "mean_token_accuracy": 0.7784813046455383, "num_tokens": 3726407.0, "step": 228 }, { "entropy": 0.5473134368658066, "epoch": 0.8552754435107376, "grad_norm": 0.030095776543021202, "learning_rate": 0.0002, "loss": 0.5461854934692383, "mean_token_accuracy": 0.7758107632398605, "num_tokens": 3742861.0, "step": 229 }, { "entropy": 0.5752474963665009, "epoch": 0.8590102707749766, "grad_norm": 0.030156588181853294, "learning_rate": 0.0002, "loss": 0.5713618397712708, "mean_token_accuracy": 0.7695687711238861, "num_tokens": 3759464.0, "step": 230 }, { "entropy": 0.5410983264446259, "epoch": 0.8627450980392157, "grad_norm": 0.026288261637091637, "learning_rate": 0.0002, "loss": 0.5398176908493042, "mean_token_accuracy": 0.7807286381721497, "num_tokens": 3775673.0, "step": 231 }, { "entropy": 0.5493600815534592, "epoch": 0.8664799253034547, "grad_norm": 0.03065655194222927, "learning_rate": 0.0002, "loss": 0.5482011437416077, "mean_token_accuracy": 0.7772542536258698, "num_tokens": 3791787.0, "step": 232 }, { "entropy": 0.5542360842227936, "epoch": 0.8702147525676938, "grad_norm": 0.032031431794166565, "learning_rate": 0.0002, "loss": 0.5554019212722778, "mean_token_accuracy": 0.7739447802305222, "num_tokens": 3808316.0, "step": 233 }, { "entropy": 0.5599103569984436, "epoch": 0.8739495798319328, "grad_norm": 0.027463702484965324, "learning_rate": 0.0002, "loss": 0.5579502582550049, "mean_token_accuracy": 0.771759495139122, "num_tokens": 3824701.0, "step": 234 }, { "entropy": 0.5677217245101929, "epoch": 0.8776844070961718, "grad_norm": 0.03142165020108223, "learning_rate": 0.0002, "loss": 0.5663169622421265, "mean_token_accuracy": 0.7691013365983963, "num_tokens": 3841435.0, "step": 235 }, { "entropy": 0.5482347160577774, "epoch": 0.8814192343604108, "grad_norm": 0.031262561678886414, "learning_rate": 0.0002, "loss": 0.552081823348999, "mean_token_accuracy": 0.7783354371786118, "num_tokens": 3857866.0, "step": 236 }, { "entropy": 0.5391282737255096, "epoch": 0.8851540616246498, "grad_norm": 0.030781790614128113, "learning_rate": 0.0002, "loss": 0.5469714403152466, "mean_token_accuracy": 0.7780267000198364, "num_tokens": 3874216.0, "step": 237 }, { "entropy": 0.5443921983242035, "epoch": 0.8888888888888888, "grad_norm": 0.032567523419857025, "learning_rate": 0.0002, "loss": 0.549781084060669, "mean_token_accuracy": 0.7772793620824814, "num_tokens": 3890382.0, "step": 238 }, { "entropy": 0.5604461878538132, "epoch": 0.892623716153128, "grad_norm": 0.02667226456105709, "learning_rate": 0.0002, "loss": 0.5538907051086426, "mean_token_accuracy": 0.7770420461893082, "num_tokens": 3906697.0, "step": 239 }, { "entropy": 0.5541103631258011, "epoch": 0.896358543417367, "grad_norm": 0.027397198602557182, "learning_rate": 0.0002, "loss": 0.5516767501831055, "mean_token_accuracy": 0.7767754942178726, "num_tokens": 3922978.0, "step": 240 }, { "entropy": 0.5521068722009659, "epoch": 0.900093370681606, "grad_norm": 0.032886214554309845, "learning_rate": 0.0002, "loss": 0.5538557171821594, "mean_token_accuracy": 0.7769301533699036, "num_tokens": 3939282.0, "step": 241 }, { "entropy": 0.5449024885892868, "epoch": 0.903828197945845, "grad_norm": 0.026176048442721367, "learning_rate": 0.0002, "loss": 0.5478168725967407, "mean_token_accuracy": 0.7779200524091721, "num_tokens": 3955520.0, "step": 242 }, { "entropy": 0.5615669041872025, "epoch": 0.907563025210084, "grad_norm": 0.02917352132499218, "learning_rate": 0.0002, "loss": 0.5631118416786194, "mean_token_accuracy": 0.769850417971611, "num_tokens": 3971679.0, "step": 243 }, { "entropy": 0.5360025763511658, "epoch": 0.911297852474323, "grad_norm": 0.028804168105125427, "learning_rate": 0.0002, "loss": 0.5399969816207886, "mean_token_accuracy": 0.7786188125610352, "num_tokens": 3987832.0, "step": 244 }, { "entropy": 0.5670223534107208, "epoch": 0.9150326797385621, "grad_norm": 0.032523807138204575, "learning_rate": 0.0002, "loss": 0.568830668926239, "mean_token_accuracy": 0.7703544050455093, "num_tokens": 4004046.0, "step": 245 }, { "entropy": 0.5482122004032135, "epoch": 0.9187675070028011, "grad_norm": 0.024507107213139534, "learning_rate": 0.0002, "loss": 0.5461756587028503, "mean_token_accuracy": 0.7785715907812119, "num_tokens": 4020396.0, "step": 246 }, { "entropy": 0.5435233414173126, "epoch": 0.9225023342670402, "grad_norm": 0.026535481214523315, "learning_rate": 0.0002, "loss": 0.5347612500190735, "mean_token_accuracy": 0.7819430381059647, "num_tokens": 4036657.0, "step": 247 }, { "entropy": 0.5606936663389206, "epoch": 0.9262371615312792, "grad_norm": 0.03222998231649399, "learning_rate": 0.0002, "loss": 0.5588559508323669, "mean_token_accuracy": 0.7731847912073135, "num_tokens": 4052932.0, "step": 248 }, { "entropy": 0.5559582114219666, "epoch": 0.9299719887955182, "grad_norm": 0.027079764753580093, "learning_rate": 0.0002, "loss": 0.5551950931549072, "mean_token_accuracy": 0.7739483118057251, "num_tokens": 4069465.0, "step": 249 }, { "entropy": 0.5464590489864349, "epoch": 0.9337068160597572, "grad_norm": 0.025224287062883377, "learning_rate": 0.0002, "loss": 0.548494815826416, "mean_token_accuracy": 0.7777067720890045, "num_tokens": 4085793.0, "step": 250 }, { "entropy": 0.5697829127311707, "epoch": 0.9374416433239963, "grad_norm": 0.03149845451116562, "learning_rate": 0.0002, "loss": 0.5725698471069336, "mean_token_accuracy": 0.7667296230792999, "num_tokens": 4102389.0, "step": 251 }, { "entropy": 0.5524837523698807, "epoch": 0.9411764705882353, "grad_norm": 0.027573609724640846, "learning_rate": 0.0002, "loss": 0.5497711896896362, "mean_token_accuracy": 0.7749225348234177, "num_tokens": 4118604.0, "step": 252 }, { "entropy": 0.5428849905729294, "epoch": 0.9449112978524743, "grad_norm": 0.025667617097496986, "learning_rate": 0.0002, "loss": 0.5428351163864136, "mean_token_accuracy": 0.7771738916635513, "num_tokens": 4135001.0, "step": 253 }, { "entropy": 0.5520694851875305, "epoch": 0.9486461251167133, "grad_norm": 0.035842686891555786, "learning_rate": 0.0002, "loss": 0.550408661365509, "mean_token_accuracy": 0.7740647196769714, "num_tokens": 4151260.0, "step": 254 }, { "entropy": 0.5418593287467957, "epoch": 0.9523809523809523, "grad_norm": 0.0381033793091774, "learning_rate": 0.0002, "loss": 0.5492621660232544, "mean_token_accuracy": 0.7769514173269272, "num_tokens": 4167360.0, "step": 255 }, { "entropy": 0.5375488549470901, "epoch": 0.9561157796451915, "grad_norm": 0.029893534258008003, "learning_rate": 0.0002, "loss": 0.5434277057647705, "mean_token_accuracy": 0.7754911035299301, "num_tokens": 4183517.0, "step": 256 }, { "entropy": 0.5487121939659119, "epoch": 0.9598506069094305, "grad_norm": 0.03323543071746826, "learning_rate": 0.0002, "loss": 0.549543559551239, "mean_token_accuracy": 0.7791514545679092, "num_tokens": 4200020.0, "step": 257 }, { "entropy": 0.5533169955015182, "epoch": 0.9635854341736695, "grad_norm": 0.1564125418663025, "learning_rate": 0.0002, "loss": 0.5513023138046265, "mean_token_accuracy": 0.7750032246112823, "num_tokens": 4216280.0, "step": 258 }, { "entropy": 0.5475684553384781, "epoch": 0.9673202614379085, "grad_norm": 0.05765023082494736, "learning_rate": 0.0002, "loss": 0.5540170073509216, "mean_token_accuracy": 0.778236523270607, "num_tokens": 4232501.0, "step": 259 }, { "entropy": 0.5620233714580536, "epoch": 0.9710550887021475, "grad_norm": 0.046510934829711914, "learning_rate": 0.0002, "loss": 0.5589131712913513, "mean_token_accuracy": 0.7736849784851074, "num_tokens": 4248855.0, "step": 260 }, { "entropy": 0.565828487277031, "epoch": 0.9747899159663865, "grad_norm": 0.0395890548825264, "learning_rate": 0.0002, "loss": 0.5624877214431763, "mean_token_accuracy": 0.7722225338220596, "num_tokens": 4265077.0, "step": 261 }, { "entropy": 0.5551140010356903, "epoch": 0.9785247432306255, "grad_norm": 0.03330749273300171, "learning_rate": 0.0002, "loss": 0.5576150417327881, "mean_token_accuracy": 0.7741483747959137, "num_tokens": 4281357.0, "step": 262 }, { "entropy": 0.5746229141950607, "epoch": 0.9822595704948646, "grad_norm": 0.03519619628787041, "learning_rate": 0.0002, "loss": 0.582584023475647, "mean_token_accuracy": 0.7654829919338226, "num_tokens": 4297699.0, "step": 263 }, { "entropy": 0.5782353579998016, "epoch": 0.9859943977591037, "grad_norm": 0.03913693502545357, "learning_rate": 0.0002, "loss": 0.5755780339241028, "mean_token_accuracy": 0.7660959511995316, "num_tokens": 4314249.0, "step": 264 }, { "entropy": 0.5513299107551575, "epoch": 0.9897292250233427, "grad_norm": 0.030444784089922905, "learning_rate": 0.0002, "loss": 0.5514294505119324, "mean_token_accuracy": 0.7750695049762726, "num_tokens": 4330437.0, "step": 265 }, { "entropy": 0.5386128276586533, "epoch": 0.9934640522875817, "grad_norm": 0.03275322541594505, "learning_rate": 0.0002, "loss": 0.540998637676239, "mean_token_accuracy": 0.7796358019113541, "num_tokens": 4346677.0, "step": 266 }, { "entropy": 0.5513150691986084, "epoch": 0.9971988795518207, "grad_norm": 0.03458503261208534, "learning_rate": 0.0002, "loss": 0.5484628677368164, "mean_token_accuracy": 0.779531255364418, "num_tokens": 4363004.0, "step": 267 }, { "entropy": 0.5694002906481425, "epoch": 1.0, "grad_norm": 0.033372946083545685, "learning_rate": 0.0002, "loss": 0.5757001638412476, "mean_token_accuracy": 0.7725784182548523, "num_tokens": 4364721.0, "step": 268 }, { "entropy": 0.5490456074476242, "epoch": 1.003734827264239, "grad_norm": 0.030816873535513878, "learning_rate": 0.0002, "loss": 0.5466992855072021, "mean_token_accuracy": 0.7772593349218369, "num_tokens": 4380959.0, "step": 269 }, { "entropy": 0.5297957360744476, "epoch": 1.007469654528478, "grad_norm": 0.0300835482776165, "learning_rate": 0.0002, "loss": 0.5296781063079834, "mean_token_accuracy": 0.7851966172456741, "num_tokens": 4397319.0, "step": 270 }, { "entropy": 0.5426550507545471, "epoch": 1.011204481792717, "grad_norm": 0.0309379193931818, "learning_rate": 0.0002, "loss": 0.5401790142059326, "mean_token_accuracy": 0.7784202843904495, "num_tokens": 4413503.0, "step": 271 }, { "entropy": 0.536088228225708, "epoch": 1.014939309056956, "grad_norm": 0.030822666361927986, "learning_rate": 0.0002, "loss": 0.533880352973938, "mean_token_accuracy": 0.7821955978870392, "num_tokens": 4429731.0, "step": 272 }, { "entropy": 0.5376520156860352, "epoch": 1.018674136321195, "grad_norm": 0.03910338878631592, "learning_rate": 0.0002, "loss": 0.5515881776809692, "mean_token_accuracy": 0.7752164155244827, "num_tokens": 4445975.0, "step": 273 }, { "entropy": 0.5337154120206833, "epoch": 1.022408963585434, "grad_norm": 0.030765611678361893, "learning_rate": 0.0002, "loss": 0.5412506461143494, "mean_token_accuracy": 0.7780167758464813, "num_tokens": 4462105.0, "step": 274 }, { "entropy": 0.5487084090709686, "epoch": 1.026143790849673, "grad_norm": 0.03003527596592903, "learning_rate": 0.0002, "loss": 0.540929913520813, "mean_token_accuracy": 0.7784045934677124, "num_tokens": 4478591.0, "step": 275 }, { "entropy": 0.5385126918554306, "epoch": 1.0298786181139121, "grad_norm": 0.027475042268633842, "learning_rate": 0.0002, "loss": 0.5318593978881836, "mean_token_accuracy": 0.7862093448638916, "num_tokens": 4495044.0, "step": 276 }, { "entropy": 0.5600587129592896, "epoch": 1.0336134453781514, "grad_norm": 0.029431000351905823, "learning_rate": 0.0002, "loss": 0.5559869408607483, "mean_token_accuracy": 0.7744521498680115, "num_tokens": 4511459.0, "step": 277 }, { "entropy": 0.5381200164556503, "epoch": 1.0373482726423904, "grad_norm": 0.02848048508167267, "learning_rate": 0.0002, "loss": 0.5395113229751587, "mean_token_accuracy": 0.7798527628183365, "num_tokens": 4527903.0, "step": 278 }, { "entropy": 0.5346540361642838, "epoch": 1.0410830999066294, "grad_norm": 0.033454034477472305, "learning_rate": 0.0002, "loss": 0.5404960513114929, "mean_token_accuracy": 0.7793795019388199, "num_tokens": 4544182.0, "step": 279 }, { "entropy": 0.544955238699913, "epoch": 1.0448179271708684, "grad_norm": 0.02894734963774681, "learning_rate": 0.0002, "loss": 0.5436176061630249, "mean_token_accuracy": 0.7777452617883682, "num_tokens": 4560880.0, "step": 280 }, { "entropy": 0.5431416183710098, "epoch": 1.0485527544351074, "grad_norm": 0.02903336100280285, "learning_rate": 0.0002, "loss": 0.5436229109764099, "mean_token_accuracy": 0.7780826389789581, "num_tokens": 4577183.0, "step": 281 }, { "entropy": 0.5408187806606293, "epoch": 1.0522875816993464, "grad_norm": 0.029271787032485008, "learning_rate": 0.0002, "loss": 0.5370380282402039, "mean_token_accuracy": 0.7815099805593491, "num_tokens": 4593864.0, "step": 282 }, { "entropy": 0.5497590750455856, "epoch": 1.0560224089635855, "grad_norm": 0.028807660564780235, "learning_rate": 0.0002, "loss": 0.5504873991012573, "mean_token_accuracy": 0.777531310915947, "num_tokens": 4610349.0, "step": 283 }, { "entropy": 0.5368742346763611, "epoch": 1.0597572362278245, "grad_norm": 0.031959034502506256, "learning_rate": 0.0002, "loss": 0.5419926643371582, "mean_token_accuracy": 0.7784341871738434, "num_tokens": 4626437.0, "step": 284 }, { "entropy": 0.5532872825860977, "epoch": 1.0634920634920635, "grad_norm": 0.028826460242271423, "learning_rate": 0.0002, "loss": 0.5571930408477783, "mean_token_accuracy": 0.7746778875589371, "num_tokens": 4642633.0, "step": 285 }, { "entropy": 0.5407113283872604, "epoch": 1.0672268907563025, "grad_norm": 0.03065388835966587, "learning_rate": 0.0002, "loss": 0.5436424612998962, "mean_token_accuracy": 0.779659166932106, "num_tokens": 4658940.0, "step": 286 }, { "entropy": 0.5552934855222702, "epoch": 1.0709617180205415, "grad_norm": 0.03264114633202553, "learning_rate": 0.0002, "loss": 0.5482615232467651, "mean_token_accuracy": 0.7754945755004883, "num_tokens": 4675263.0, "step": 287 }, { "entropy": 0.5442743301391602, "epoch": 1.0746965452847805, "grad_norm": 0.031116079539060593, "learning_rate": 0.0002, "loss": 0.538812518119812, "mean_token_accuracy": 0.7806833982467651, "num_tokens": 4691415.0, "step": 288 }, { "entropy": 0.5530855804681778, "epoch": 1.0784313725490196, "grad_norm": 0.03077593445777893, "learning_rate": 0.0002, "loss": 0.548968493938446, "mean_token_accuracy": 0.7756039202213287, "num_tokens": 4707736.0, "step": 289 }, { "entropy": 0.5455960035324097, "epoch": 1.0821661998132586, "grad_norm": 0.028605274856090546, "learning_rate": 0.0002, "loss": 0.5435131788253784, "mean_token_accuracy": 0.7795460671186447, "num_tokens": 4724095.0, "step": 290 }, { "entropy": 0.5397526025772095, "epoch": 1.0859010270774976, "grad_norm": 0.03644070401787758, "learning_rate": 0.0002, "loss": 0.5488567352294922, "mean_token_accuracy": 0.7778657674789429, "num_tokens": 4740602.0, "step": 291 }, { "entropy": 0.5470818132162094, "epoch": 1.0896358543417366, "grad_norm": 0.033212918788194656, "learning_rate": 0.0002, "loss": 0.555572509765625, "mean_token_accuracy": 0.7734686136245728, "num_tokens": 4756842.0, "step": 292 }, { "entropy": 0.5398264974355698, "epoch": 1.0933706816059758, "grad_norm": 0.027302522212266922, "learning_rate": 0.0002, "loss": 0.5371235013008118, "mean_token_accuracy": 0.7826644480228424, "num_tokens": 4773499.0, "step": 293 }, { "entropy": 0.564954400062561, "epoch": 1.0971055088702149, "grad_norm": 0.02829107642173767, "learning_rate": 0.0002, "loss": 0.5558594465255737, "mean_token_accuracy": 0.7749541401863098, "num_tokens": 4790183.0, "step": 294 }, { "entropy": 0.5593573749065399, "epoch": 1.1008403361344539, "grad_norm": 0.027547527104616165, "learning_rate": 0.0002, "loss": 0.5560394525527954, "mean_token_accuracy": 0.7725719660520554, "num_tokens": 4806455.0, "step": 295 }, { "entropy": 0.5377779453992844, "epoch": 1.1045751633986929, "grad_norm": 0.03161724656820297, "learning_rate": 0.0002, "loss": 0.5370453596115112, "mean_token_accuracy": 0.782875582575798, "num_tokens": 4822731.0, "step": 296 }, { "entropy": 0.5386165231466293, "epoch": 1.108309990662932, "grad_norm": 0.03147651255130768, "learning_rate": 0.0002, "loss": 0.5423634648323059, "mean_token_accuracy": 0.7768422961235046, "num_tokens": 4839112.0, "step": 297 }, { "entropy": 0.5279396325349808, "epoch": 1.112044817927171, "grad_norm": 0.031283456832170486, "learning_rate": 0.0002, "loss": 0.5321308970451355, "mean_token_accuracy": 0.7849069982767105, "num_tokens": 4855229.0, "step": 298 }, { "entropy": 0.5327593311667442, "epoch": 1.11577964519141, "grad_norm": 0.03042989782989025, "learning_rate": 0.0002, "loss": 0.5393236875534058, "mean_token_accuracy": 0.7804521471261978, "num_tokens": 4871644.0, "step": 299 }, { "entropy": 0.560793936252594, "epoch": 1.119514472455649, "grad_norm": 0.029397251084446907, "learning_rate": 0.0002, "loss": 0.5557554960250854, "mean_token_accuracy": 0.7728655338287354, "num_tokens": 4887992.0, "step": 300 }, { "entropy": 0.5604539066553116, "epoch": 1.123249299719888, "grad_norm": 0.02948898635804653, "learning_rate": 0.0002, "loss": 0.5545894503593445, "mean_token_accuracy": 0.7743670493364334, "num_tokens": 4904384.0, "step": 301 }, { "entropy": 0.5394376814365387, "epoch": 1.126984126984127, "grad_norm": 0.029182471334934235, "learning_rate": 0.0002, "loss": 0.5341510772705078, "mean_token_accuracy": 0.7823253571987152, "num_tokens": 4920587.0, "step": 302 }, { "entropy": 0.5301040560007095, "epoch": 1.130718954248366, "grad_norm": 0.03680079057812691, "learning_rate": 0.0002, "loss": 0.5372604131698608, "mean_token_accuracy": 0.7793124318122864, "num_tokens": 4937055.0, "step": 303 }, { "entropy": 0.5290943831205368, "epoch": 1.134453781512605, "grad_norm": 0.03931280970573425, "learning_rate": 0.0002, "loss": 0.5391898155212402, "mean_token_accuracy": 0.7829029709100723, "num_tokens": 4953281.0, "step": 304 }, { "entropy": 0.5609545707702637, "epoch": 1.138188608776844, "grad_norm": 0.030014565214514732, "learning_rate": 0.0002, "loss": 0.5609763264656067, "mean_token_accuracy": 0.7726535797119141, "num_tokens": 4969665.0, "step": 305 }, { "entropy": 0.5205260962247849, "epoch": 1.141923436041083, "grad_norm": 0.03301642835140228, "learning_rate": 0.0002, "loss": 0.5286065340042114, "mean_token_accuracy": 0.7840328961610794, "num_tokens": 4985863.0, "step": 306 }, { "entropy": 0.5605068057775497, "epoch": 1.145658263305322, "grad_norm": 0.029299437999725342, "learning_rate": 0.0002, "loss": 0.5569101572036743, "mean_token_accuracy": 0.7721403539180756, "num_tokens": 5002543.0, "step": 307 }, { "entropy": 0.552753359079361, "epoch": 1.149393090569561, "grad_norm": 0.027307430282235146, "learning_rate": 0.0002, "loss": 0.5464527606964111, "mean_token_accuracy": 0.7777755260467529, "num_tokens": 5019035.0, "step": 308 }, { "entropy": 0.5565258711576462, "epoch": 1.1531279178338, "grad_norm": 0.028590641915798187, "learning_rate": 0.0002, "loss": 0.551773726940155, "mean_token_accuracy": 0.7753841280937195, "num_tokens": 5035778.0, "step": 309 }, { "entropy": 0.5335747301578522, "epoch": 1.156862745098039, "grad_norm": 0.02846100926399231, "learning_rate": 0.0002, "loss": 0.5332034826278687, "mean_token_accuracy": 0.7849084585905075, "num_tokens": 5052106.0, "step": 310 }, { "entropy": 0.5462342649698257, "epoch": 1.1605975723622783, "grad_norm": 0.03037341870367527, "learning_rate": 0.0002, "loss": 0.5533976554870605, "mean_token_accuracy": 0.7761731296777725, "num_tokens": 5068494.0, "step": 311 }, { "entropy": 0.5365739315748215, "epoch": 1.1643323996265174, "grad_norm": 0.0328284353017807, "learning_rate": 0.0002, "loss": 0.5443044900894165, "mean_token_accuracy": 0.7775984853506088, "num_tokens": 5084698.0, "step": 312 }, { "entropy": 0.5469802767038345, "epoch": 1.1680672268907564, "grad_norm": 0.029220817610621452, "learning_rate": 0.0002, "loss": 0.5449838638305664, "mean_token_accuracy": 0.7794362902641296, "num_tokens": 5101231.0, "step": 313 }, { "entropy": 0.5534107983112335, "epoch": 1.1718020541549954, "grad_norm": 0.03240218386054039, "learning_rate": 0.0002, "loss": 0.5596653819084167, "mean_token_accuracy": 0.7733468264341354, "num_tokens": 5117669.0, "step": 314 }, { "entropy": 0.5505286902189255, "epoch": 1.1755368814192344, "grad_norm": 0.030088460072875023, "learning_rate": 0.0002, "loss": 0.5460378527641296, "mean_token_accuracy": 0.7785163521766663, "num_tokens": 5134044.0, "step": 315 }, { "entropy": 0.5583444237709045, "epoch": 1.1792717086834734, "grad_norm": 0.03908608481287956, "learning_rate": 0.0002, "loss": 0.5499372482299805, "mean_token_accuracy": 0.7741111516952515, "num_tokens": 5150155.0, "step": 316 }, { "entropy": 0.5583514273166656, "epoch": 1.1830065359477124, "grad_norm": 0.03262948617339134, "learning_rate": 0.0002, "loss": 0.5514504909515381, "mean_token_accuracy": 0.7749726176261902, "num_tokens": 5166653.0, "step": 317 }, { "entropy": 0.54158616065979, "epoch": 1.1867413632119514, "grad_norm": 0.030375484377145767, "learning_rate": 0.0002, "loss": 0.535007119178772, "mean_token_accuracy": 0.78143410384655, "num_tokens": 5182849.0, "step": 318 }, { "entropy": 0.5355552136898041, "epoch": 1.1904761904761905, "grad_norm": 0.034217700362205505, "learning_rate": 0.0002, "loss": 0.5416175723075867, "mean_token_accuracy": 0.7821937054395676, "num_tokens": 5199310.0, "step": 319 }, { "entropy": 0.5375736951828003, "epoch": 1.1942110177404295, "grad_norm": 0.03742173686623573, "learning_rate": 0.0002, "loss": 0.5497441291809082, "mean_token_accuracy": 0.779162734746933, "num_tokens": 5215628.0, "step": 320 }, { "entropy": 0.5327057242393494, "epoch": 1.1979458450046685, "grad_norm": 0.03143603354692459, "learning_rate": 0.0002, "loss": 0.5377879738807678, "mean_token_accuracy": 0.7819731533527374, "num_tokens": 5232104.0, "step": 321 }, { "entropy": 0.5589822083711624, "epoch": 1.2016806722689075, "grad_norm": 0.030957849696278572, "learning_rate": 0.0002, "loss": 0.5600837469100952, "mean_token_accuracy": 0.772526428103447, "num_tokens": 5248228.0, "step": 322 }, { "entropy": 0.5267817825078964, "epoch": 1.2054154995331465, "grad_norm": 0.028181420639157295, "learning_rate": 0.0002, "loss": 0.5258863568305969, "mean_token_accuracy": 0.7852722406387329, "num_tokens": 5264722.0, "step": 323 }, { "entropy": 0.5596602708101273, "epoch": 1.2091503267973855, "grad_norm": 0.0294583011418581, "learning_rate": 0.0002, "loss": 0.5542659163475037, "mean_token_accuracy": 0.7757792323827744, "num_tokens": 5281102.0, "step": 324 }, { "entropy": 0.5631477683782578, "epoch": 1.2128851540616246, "grad_norm": 0.028790894895792007, "learning_rate": 0.0002, "loss": 0.5568723678588867, "mean_token_accuracy": 0.771973267197609, "num_tokens": 5297684.0, "step": 325 }, { "entropy": 0.5380028486251831, "epoch": 1.2166199813258638, "grad_norm": 0.031924713402986526, "learning_rate": 0.0002, "loss": 0.5376958847045898, "mean_token_accuracy": 0.7829422205686569, "num_tokens": 5313908.0, "step": 326 }, { "entropy": 0.5375301241874695, "epoch": 1.2203548085901028, "grad_norm": 0.03397483006119728, "learning_rate": 0.0002, "loss": 0.5478475093841553, "mean_token_accuracy": 0.7765705734491348, "num_tokens": 5329966.0, "step": 327 }, { "entropy": 0.5427165776491165, "epoch": 1.2240896358543418, "grad_norm": 0.035384900867938995, "learning_rate": 0.0002, "loss": 0.5524033308029175, "mean_token_accuracy": 0.7745779901742935, "num_tokens": 5346453.0, "step": 328 }, { "entropy": 0.5400120764970779, "epoch": 1.2278244631185808, "grad_norm": 0.030376868322491646, "learning_rate": 0.0002, "loss": 0.5346859097480774, "mean_token_accuracy": 0.7804136276245117, "num_tokens": 5362598.0, "step": 329 }, { "entropy": 0.5525883883237839, "epoch": 1.2315592903828199, "grad_norm": 0.029532834887504578, "learning_rate": 0.0002, "loss": 0.5460601449012756, "mean_token_accuracy": 0.7782909572124481, "num_tokens": 5378809.0, "step": 330 }, { "entropy": 0.5435810536146164, "epoch": 1.2352941176470589, "grad_norm": 0.02912810444831848, "learning_rate": 0.0002, "loss": 0.5412687659263611, "mean_token_accuracy": 0.7805328518152237, "num_tokens": 5394964.0, "step": 331 }, { "entropy": 0.5558127015829086, "epoch": 1.239028944911298, "grad_norm": 0.03399093821644783, "learning_rate": 0.0002, "loss": 0.5503210425376892, "mean_token_accuracy": 0.7771144658327103, "num_tokens": 5411296.0, "step": 332 }, { "entropy": 0.5612344145774841, "epoch": 1.242763772175537, "grad_norm": 0.028297265991568565, "learning_rate": 0.0002, "loss": 0.561404824256897, "mean_token_accuracy": 0.7735303044319153, "num_tokens": 5427522.0, "step": 333 }, { "entropy": 0.5317913144826889, "epoch": 1.246498599439776, "grad_norm": 0.03494315594434738, "learning_rate": 0.0002, "loss": 0.5433036684989929, "mean_token_accuracy": 0.7796971648931503, "num_tokens": 5443757.0, "step": 334 }, { "entropy": 0.542137622833252, "epoch": 1.250233426704015, "grad_norm": 0.02819279581308365, "learning_rate": 0.0002, "loss": 0.5451513528823853, "mean_token_accuracy": 0.7785246819257736, "num_tokens": 5460219.0, "step": 335 }, { "entropy": 0.5389015078544617, "epoch": 1.253968253968254, "grad_norm": 0.029153091832995415, "learning_rate": 0.0002, "loss": 0.5426021218299866, "mean_token_accuracy": 0.7783170789480209, "num_tokens": 5476465.0, "step": 336 }, { "entropy": 0.5529672205448151, "epoch": 1.257703081232493, "grad_norm": 0.03458336368203163, "learning_rate": 0.0002, "loss": 0.540812611579895, "mean_token_accuracy": 0.7807324081659317, "num_tokens": 5492565.0, "step": 337 }, { "entropy": 0.581393301486969, "epoch": 1.261437908496732, "grad_norm": 0.031111041083931923, "learning_rate": 0.0002, "loss": 0.5751311779022217, "mean_token_accuracy": 0.7666933685541153, "num_tokens": 5509003.0, "step": 338 }, { "entropy": 0.5588483065366745, "epoch": 1.265172735760971, "grad_norm": 0.030144309625029564, "learning_rate": 0.0002, "loss": 0.5589640140533447, "mean_token_accuracy": 0.7755171656608582, "num_tokens": 5525262.0, "step": 339 }, { "entropy": 0.5336481779813766, "epoch": 1.26890756302521, "grad_norm": 0.03417432680726051, "learning_rate": 0.0002, "loss": 0.5390788316726685, "mean_token_accuracy": 0.780031830072403, "num_tokens": 5541654.0, "step": 340 }, { "entropy": 0.5282999128103256, "epoch": 1.272642390289449, "grad_norm": 0.03498517721891403, "learning_rate": 0.0002, "loss": 0.5387616157531738, "mean_token_accuracy": 0.7800437808036804, "num_tokens": 5557983.0, "step": 341 }, { "entropy": 0.5369831025600433, "epoch": 1.276377217553688, "grad_norm": 0.029845617711544037, "learning_rate": 0.0002, "loss": 0.535378634929657, "mean_token_accuracy": 0.7823457568883896, "num_tokens": 5574311.0, "step": 342 }, { "entropy": 0.5538373440504074, "epoch": 1.280112044817927, "grad_norm": 0.027923226356506348, "learning_rate": 0.0002, "loss": 0.5500721335411072, "mean_token_accuracy": 0.7771336436271667, "num_tokens": 5590547.0, "step": 343 }, { "entropy": 0.5545977205038071, "epoch": 1.283846872082166, "grad_norm": 0.0305513683706522, "learning_rate": 0.0002, "loss": 0.5511223077774048, "mean_token_accuracy": 0.7757980972528458, "num_tokens": 5606717.0, "step": 344 }, { "entropy": 0.560431718826294, "epoch": 1.287581699346405, "grad_norm": 0.029267068952322006, "learning_rate": 0.0002, "loss": 0.5540031790733337, "mean_token_accuracy": 0.7738614529371262, "num_tokens": 5623238.0, "step": 345 }, { "entropy": 0.5598475635051727, "epoch": 1.2913165266106443, "grad_norm": 0.032441407442092896, "learning_rate": 0.0002, "loss": 0.5511676669120789, "mean_token_accuracy": 0.775727853178978, "num_tokens": 5639482.0, "step": 346 }, { "entropy": 0.532151535153389, "epoch": 1.2950513538748833, "grad_norm": 0.03496084734797478, "learning_rate": 0.0002, "loss": 0.5387351512908936, "mean_token_accuracy": 0.7811897695064545, "num_tokens": 5655745.0, "step": 347 }, { "entropy": 0.5362464487552643, "epoch": 1.2987861811391224, "grad_norm": 0.03774246945977211, "learning_rate": 0.0002, "loss": 0.5451931953430176, "mean_token_accuracy": 0.7775505632162094, "num_tokens": 5672305.0, "step": 348 }, { "entropy": 0.5285972878336906, "epoch": 1.3025210084033614, "grad_norm": 0.0332336500287056, "learning_rate": 0.0002, "loss": 0.5353838801383972, "mean_token_accuracy": 0.7838114500045776, "num_tokens": 5688630.0, "step": 349 }, { "entropy": 0.5421172678470612, "epoch": 1.3062558356676004, "grad_norm": 0.03457598015666008, "learning_rate": 0.0002, "loss": 0.5392417311668396, "mean_token_accuracy": 0.7807410657405853, "num_tokens": 5705054.0, "step": 350 }, { "entropy": 0.5382883250713348, "epoch": 1.3099906629318394, "grad_norm": 0.031050430610775948, "learning_rate": 0.0002, "loss": 0.5347834825515747, "mean_token_accuracy": 0.7828159481287003, "num_tokens": 5721382.0, "step": 351 }, { "entropy": 0.550368145108223, "epoch": 1.3137254901960784, "grad_norm": 0.03463875129818916, "learning_rate": 0.0002, "loss": 0.5514199137687683, "mean_token_accuracy": 0.7735539227724075, "num_tokens": 5737730.0, "step": 352 }, { "entropy": 0.538982629776001, "epoch": 1.3174603174603174, "grad_norm": 0.03956155851483345, "learning_rate": 0.0002, "loss": 0.5469655990600586, "mean_token_accuracy": 0.7747407406568527, "num_tokens": 5753795.0, "step": 353 }, { "entropy": 0.5339585244655609, "epoch": 1.3211951447245565, "grad_norm": 0.029367057606577873, "learning_rate": 0.0002, "loss": 0.536923348903656, "mean_token_accuracy": 0.7791249603033066, "num_tokens": 5770100.0, "step": 354 }, { "entropy": 0.5469655245542526, "epoch": 1.3249299719887955, "grad_norm": 0.044070687144994736, "learning_rate": 0.0002, "loss": 0.5485926270484924, "mean_token_accuracy": 0.7760020345449448, "num_tokens": 5786242.0, "step": 355 }, { "entropy": 0.5686767846345901, "epoch": 1.3286647992530345, "grad_norm": 0.0298174861818552, "learning_rate": 0.0002, "loss": 0.5646032691001892, "mean_token_accuracy": 0.7700935900211334, "num_tokens": 5802594.0, "step": 356 }, { "entropy": 0.5524211078882217, "epoch": 1.3323996265172735, "grad_norm": 0.03443749621510506, "learning_rate": 0.0002, "loss": 0.5538625717163086, "mean_token_accuracy": 0.7730942517518997, "num_tokens": 5818733.0, "step": 357 }, { "entropy": 0.5450694710016251, "epoch": 1.3361344537815127, "grad_norm": 0.042639389634132385, "learning_rate": 0.0002, "loss": 0.5457915663719177, "mean_token_accuracy": 0.7793462425470352, "num_tokens": 5834966.0, "step": 358 }, { "entropy": 0.5628755837678909, "epoch": 1.3398692810457518, "grad_norm": 0.031939953565597534, "learning_rate": 0.0002, "loss": 0.5615131855010986, "mean_token_accuracy": 0.7720433920621872, "num_tokens": 5851352.0, "step": 359 }, { "entropy": 0.5299947410821915, "epoch": 1.3436041083099908, "grad_norm": 0.03047833777964115, "learning_rate": 0.0002, "loss": 0.5295021533966064, "mean_token_accuracy": 0.7874699085950851, "num_tokens": 5867820.0, "step": 360 }, { "entropy": 0.5308109223842621, "epoch": 1.3473389355742298, "grad_norm": 0.032848697155714035, "learning_rate": 0.0002, "loss": 0.5431129336357117, "mean_token_accuracy": 0.7857107818126678, "num_tokens": 5883984.0, "step": 361 }, { "entropy": 0.5426601469516754, "epoch": 1.3510737628384688, "grad_norm": 0.033830493688583374, "learning_rate": 0.0002, "loss": 0.5514194965362549, "mean_token_accuracy": 0.77635657787323, "num_tokens": 5900290.0, "step": 362 }, { "entropy": 0.5411643236875534, "epoch": 1.3548085901027078, "grad_norm": 0.029694274067878723, "learning_rate": 0.0002, "loss": 0.5333205461502075, "mean_token_accuracy": 0.7832283675670624, "num_tokens": 5916469.0, "step": 363 }, { "entropy": 0.5501731634140015, "epoch": 1.3585434173669468, "grad_norm": 0.03007029928267002, "learning_rate": 0.0002, "loss": 0.5431393980979919, "mean_token_accuracy": 0.7804041355848312, "num_tokens": 5932693.0, "step": 364 }, { "entropy": 0.5419217795133591, "epoch": 1.3622782446311859, "grad_norm": 0.030986929312348366, "learning_rate": 0.0002, "loss": 0.5391764044761658, "mean_token_accuracy": 0.7810684144496918, "num_tokens": 5949053.0, "step": 365 }, { "entropy": 0.529257670044899, "epoch": 1.3660130718954249, "grad_norm": 0.0282028466463089, "learning_rate": 0.0002, "loss": 0.5282759666442871, "mean_token_accuracy": 0.7846860438585281, "num_tokens": 5965428.0, "step": 366 }, { "entropy": 0.5425796508789062, "epoch": 1.3697478991596639, "grad_norm": 0.03842358663678169, "learning_rate": 0.0002, "loss": 0.5492331981658936, "mean_token_accuracy": 0.7747556120157242, "num_tokens": 5981730.0, "step": 367 }, { "entropy": 0.5349410325288773, "epoch": 1.373482726423903, "grad_norm": 0.033598389476537704, "learning_rate": 0.0002, "loss": 0.5436474084854126, "mean_token_accuracy": 0.7797878831624985, "num_tokens": 5997949.0, "step": 368 }, { "entropy": 0.552407756447792, "epoch": 1.377217553688142, "grad_norm": 0.03342469781637192, "learning_rate": 0.0002, "loss": 0.5567049980163574, "mean_token_accuracy": 0.7723858207464218, "num_tokens": 6014178.0, "step": 369 }, { "entropy": 0.5454883426427841, "epoch": 1.380952380952381, "grad_norm": 0.03550714999437332, "learning_rate": 0.0002, "loss": 0.5418342351913452, "mean_token_accuracy": 0.7798961699008942, "num_tokens": 6030806.0, "step": 370 }, { "entropy": 0.552109032869339, "epoch": 1.38468720821662, "grad_norm": 0.03026903234422207, "learning_rate": 0.0002, "loss": 0.5456339120864868, "mean_token_accuracy": 0.7773927599191666, "num_tokens": 6046782.0, "step": 371 }, { "entropy": 0.5603116452693939, "epoch": 1.388422035480859, "grad_norm": 0.03449714556336403, "learning_rate": 0.0002, "loss": 0.5605192184448242, "mean_token_accuracy": 0.7709443271160126, "num_tokens": 6063178.0, "step": 372 }, { "entropy": 0.5442145317792892, "epoch": 1.392156862745098, "grad_norm": 0.03407449275255203, "learning_rate": 0.0002, "loss": 0.5482808947563171, "mean_token_accuracy": 0.7804455161094666, "num_tokens": 6079813.0, "step": 373 }, { "entropy": 0.5443685501813889, "epoch": 1.395891690009337, "grad_norm": 0.03118809685111046, "learning_rate": 0.0002, "loss": 0.5504392385482788, "mean_token_accuracy": 0.7759056687355042, "num_tokens": 6096208.0, "step": 374 }, { "entropy": 0.5544550269842148, "epoch": 1.399626517273576, "grad_norm": 0.03532007709145546, "learning_rate": 0.0002, "loss": 0.5569352507591248, "mean_token_accuracy": 0.7748352587223053, "num_tokens": 6112356.0, "step": 375 }, { "entropy": 0.5439307242631912, "epoch": 1.403361344537815, "grad_norm": 0.0334586501121521, "learning_rate": 0.0002, "loss": 0.542488694190979, "mean_token_accuracy": 0.777744397521019, "num_tokens": 6128800.0, "step": 376 }, { "entropy": 0.5407049357891083, "epoch": 1.407096171802054, "grad_norm": 0.029349738731980324, "learning_rate": 0.0002, "loss": 0.5370444655418396, "mean_token_accuracy": 0.7816447019577026, "num_tokens": 6145053.0, "step": 377 }, { "entropy": 0.5527060329914093, "epoch": 1.410830999066293, "grad_norm": 0.030373841524124146, "learning_rate": 0.0002, "loss": 0.5530543327331543, "mean_token_accuracy": 0.775768980383873, "num_tokens": 6161518.0, "step": 378 }, { "entropy": 0.5383686721324921, "epoch": 1.4145658263305323, "grad_norm": 0.033442895859479904, "learning_rate": 0.0002, "loss": 0.539923369884491, "mean_token_accuracy": 0.7825078517198563, "num_tokens": 6177817.0, "step": 379 }, { "entropy": 0.5557737052440643, "epoch": 1.4183006535947713, "grad_norm": 0.03396908566355705, "learning_rate": 0.0002, "loss": 0.5632482767105103, "mean_token_accuracy": 0.7692397683858871, "num_tokens": 6194312.0, "step": 380 }, { "entropy": 0.5457819253206253, "epoch": 1.4220354808590103, "grad_norm": 0.02866293303668499, "learning_rate": 0.0002, "loss": 0.5467988848686218, "mean_token_accuracy": 0.7775601893663406, "num_tokens": 6210818.0, "step": 381 }, { "entropy": 0.5640534311532974, "epoch": 1.4257703081232493, "grad_norm": 0.027476362884044647, "learning_rate": 0.0002, "loss": 0.5636141896247864, "mean_token_accuracy": 0.7717417329549789, "num_tokens": 6227080.0, "step": 382 }, { "entropy": 0.560546487569809, "epoch": 1.4295051353874884, "grad_norm": 0.030654683709144592, "learning_rate": 0.0002, "loss": 0.5566866397857666, "mean_token_accuracy": 0.7725766897201538, "num_tokens": 6243654.0, "step": 383 }, { "entropy": 0.5566196143627167, "epoch": 1.4332399626517274, "grad_norm": 0.03377790376543999, "learning_rate": 0.0002, "loss": 0.5511550903320312, "mean_token_accuracy": 0.7775295376777649, "num_tokens": 6259998.0, "step": 384 }, { "entropy": 0.5302538275718689, "epoch": 1.4369747899159664, "grad_norm": 0.028172362595796585, "learning_rate": 0.0002, "loss": 0.5359051823616028, "mean_token_accuracy": 0.7816868871450424, "num_tokens": 6276398.0, "step": 385 }, { "entropy": 0.543848991394043, "epoch": 1.4407096171802054, "grad_norm": 0.03123684599995613, "learning_rate": 0.0002, "loss": 0.5530490875244141, "mean_token_accuracy": 0.7756175249814987, "num_tokens": 6292623.0, "step": 386 }, { "entropy": 0.5351638197898865, "epoch": 1.4444444444444444, "grad_norm": 0.032041870057582855, "learning_rate": 0.0002, "loss": 0.5453383326530457, "mean_token_accuracy": 0.7787481844425201, "num_tokens": 6308980.0, "step": 387 }, { "entropy": 0.5499856919050217, "epoch": 1.4481792717086834, "grad_norm": 0.03275283798575401, "learning_rate": 0.0002, "loss": 0.5510199666023254, "mean_token_accuracy": 0.7770793437957764, "num_tokens": 6325352.0, "step": 388 }, { "entropy": 0.5473773032426834, "epoch": 1.4519140989729225, "grad_norm": 0.02793571725487709, "learning_rate": 0.0002, "loss": 0.540398120880127, "mean_token_accuracy": 0.7805086821317673, "num_tokens": 6341686.0, "step": 389 }, { "entropy": 0.553907573223114, "epoch": 1.4556489262371615, "grad_norm": 0.02763449028134346, "learning_rate": 0.0002, "loss": 0.5470324754714966, "mean_token_accuracy": 0.7763955593109131, "num_tokens": 6358367.0, "step": 390 }, { "entropy": 0.54300856590271, "epoch": 1.4593837535014005, "grad_norm": 0.0320272259414196, "learning_rate": 0.0002, "loss": 0.5394243001937866, "mean_token_accuracy": 0.7796929031610489, "num_tokens": 6374332.0, "step": 391 }, { "entropy": 0.5419201552867889, "epoch": 1.4631185807656397, "grad_norm": 0.029694141820073128, "learning_rate": 0.0002, "loss": 0.5459417104721069, "mean_token_accuracy": 0.7794879227876663, "num_tokens": 6390817.0, "step": 392 }, { "entropy": 0.533346489071846, "epoch": 1.4668534080298787, "grad_norm": 0.031921736896038055, "learning_rate": 0.0002, "loss": 0.5339134335517883, "mean_token_accuracy": 0.7845402210950851, "num_tokens": 6407105.0, "step": 393 }, { "entropy": 0.5490029752254486, "epoch": 1.4705882352941178, "grad_norm": 0.031292662024497986, "learning_rate": 0.0002, "loss": 0.5461300611495972, "mean_token_accuracy": 0.7792785912752151, "num_tokens": 6423432.0, "step": 394 }, { "entropy": 0.5407290160655975, "epoch": 1.4743230625583568, "grad_norm": 0.029509229585528374, "learning_rate": 0.0002, "loss": 0.5409979224205017, "mean_token_accuracy": 0.7798801958560944, "num_tokens": 6440111.0, "step": 395 }, { "entropy": 0.5352925509214401, "epoch": 1.4780578898225958, "grad_norm": 0.03132627159357071, "learning_rate": 0.0002, "loss": 0.5360226035118103, "mean_token_accuracy": 0.7835162281990051, "num_tokens": 6456553.0, "step": 396 }, { "entropy": 0.5409245789051056, "epoch": 1.4817927170868348, "grad_norm": 0.032262932509183884, "learning_rate": 0.0002, "loss": 0.5367339253425598, "mean_token_accuracy": 0.779682844877243, "num_tokens": 6472831.0, "step": 397 }, { "entropy": 0.5202168971300125, "epoch": 1.4855275443510738, "grad_norm": 0.033896930515766144, "learning_rate": 0.0002, "loss": 0.5268123149871826, "mean_token_accuracy": 0.7819826900959015, "num_tokens": 6488931.0, "step": 398 }, { "entropy": 0.5325956791639328, "epoch": 1.4892623716153128, "grad_norm": 0.03540036827325821, "learning_rate": 0.0002, "loss": 0.5433887839317322, "mean_token_accuracy": 0.778034120798111, "num_tokens": 6505354.0, "step": 399 }, { "entropy": 0.5327711254358292, "epoch": 1.4929971988795518, "grad_norm": 0.02958959899842739, "learning_rate": 0.0002, "loss": 0.5335476398468018, "mean_token_accuracy": 0.7828179448843002, "num_tokens": 6521544.0, "step": 400 }, { "entropy": 0.5357908606529236, "epoch": 1.4967320261437909, "grad_norm": 0.027617521584033966, "learning_rate": 0.0002, "loss": 0.5293720364570618, "mean_token_accuracy": 0.7868403792381287, "num_tokens": 6537889.0, "step": 401 }, { "entropy": 0.5473283380270004, "epoch": 1.5004668534080299, "grad_norm": 0.028360038995742798, "learning_rate": 0.0002, "loss": 0.5436528325080872, "mean_token_accuracy": 0.7810066491365433, "num_tokens": 6554149.0, "step": 402 }, { "entropy": 0.5518513321876526, "epoch": 1.504201680672269, "grad_norm": 0.031041931360960007, "learning_rate": 0.0002, "loss": 0.545119047164917, "mean_token_accuracy": 0.7779288738965988, "num_tokens": 6570521.0, "step": 403 }, { "entropy": 0.5428237915039062, "epoch": 1.507936507936508, "grad_norm": 0.032197825610637665, "learning_rate": 0.0002, "loss": 0.5472823977470398, "mean_token_accuracy": 0.7758528888225555, "num_tokens": 6587086.0, "step": 404 }, { "entropy": 0.5483403950929642, "epoch": 1.511671335200747, "grad_norm": 0.03174825757741928, "learning_rate": 0.0002, "loss": 0.5524789094924927, "mean_token_accuracy": 0.7772649824619293, "num_tokens": 6603513.0, "step": 405 }, { "entropy": 0.5337469726800919, "epoch": 1.515406162464986, "grad_norm": 0.03365413472056389, "learning_rate": 0.0002, "loss": 0.5418713688850403, "mean_token_accuracy": 0.7772432267665863, "num_tokens": 6619737.0, "step": 406 }, { "entropy": 0.5614880919456482, "epoch": 1.519140989729225, "grad_norm": 0.030781377106904984, "learning_rate": 0.0002, "loss": 0.5604795217514038, "mean_token_accuracy": 0.7718411535024643, "num_tokens": 6636097.0, "step": 407 }, { "entropy": 0.5390657633543015, "epoch": 1.522875816993464, "grad_norm": 0.02782733179628849, "learning_rate": 0.0002, "loss": 0.5329728126525879, "mean_token_accuracy": 0.7839234322309494, "num_tokens": 6652406.0, "step": 408 }, { "entropy": 0.5573919266462326, "epoch": 1.526610644257703, "grad_norm": 0.027401108294725418, "learning_rate": 0.0002, "loss": 0.5554807186126709, "mean_token_accuracy": 0.7726366519927979, "num_tokens": 6668812.0, "step": 409 }, { "entropy": 0.5391197204589844, "epoch": 1.530345471521942, "grad_norm": 0.03163023665547371, "learning_rate": 0.0002, "loss": 0.5407525897026062, "mean_token_accuracy": 0.7810121178627014, "num_tokens": 6685040.0, "step": 410 }, { "entropy": 0.5353195369243622, "epoch": 1.534080298786181, "grad_norm": 0.026917260140180588, "learning_rate": 0.0002, "loss": 0.5328407883644104, "mean_token_accuracy": 0.7829948961734772, "num_tokens": 6701433.0, "step": 411 }, { "entropy": 0.5223068818449974, "epoch": 1.53781512605042, "grad_norm": 0.03261617571115494, "learning_rate": 0.0002, "loss": 0.5255942344665527, "mean_token_accuracy": 0.785964623093605, "num_tokens": 6717710.0, "step": 412 }, { "entropy": 0.5453132838010788, "epoch": 1.541549953314659, "grad_norm": 0.03235824778676033, "learning_rate": 0.0002, "loss": 0.5518944263458252, "mean_token_accuracy": 0.7770064026117325, "num_tokens": 6733942.0, "step": 413 }, { "entropy": 0.5489854216575623, "epoch": 1.545284780578898, "grad_norm": 0.02913379855453968, "learning_rate": 0.0002, "loss": 0.5539657473564148, "mean_token_accuracy": 0.7730102986097336, "num_tokens": 6749978.0, "step": 414 }, { "entropy": 0.5504709929227829, "epoch": 1.5490196078431373, "grad_norm": 0.03497619554400444, "learning_rate": 0.0002, "loss": 0.5534422397613525, "mean_token_accuracy": 0.7738368958234787, "num_tokens": 6766386.0, "step": 415 }, { "entropy": 0.5360163599252701, "epoch": 1.5527544351073763, "grad_norm": 0.03147003799676895, "learning_rate": 0.0002, "loss": 0.5354920625686646, "mean_token_accuracy": 0.7844124883413315, "num_tokens": 6782497.0, "step": 416 }, { "entropy": 0.5680203884840012, "epoch": 1.5564892623716153, "grad_norm": 0.030537011101841927, "learning_rate": 0.0002, "loss": 0.5605371594429016, "mean_token_accuracy": 0.772536501288414, "num_tokens": 6799059.0, "step": 417 }, { "entropy": 0.5505528301000595, "epoch": 1.5602240896358543, "grad_norm": 0.028710143640637398, "learning_rate": 0.0002, "loss": 0.5522081255912781, "mean_token_accuracy": 0.7738733440637589, "num_tokens": 6815363.0, "step": 418 }, { "entropy": 0.5502945929765701, "epoch": 1.5639589169000934, "grad_norm": 0.0320894755423069, "learning_rate": 0.0002, "loss": 0.5519194006919861, "mean_token_accuracy": 0.775145635008812, "num_tokens": 6831823.0, "step": 419 }, { "entropy": 0.5572039783000946, "epoch": 1.5676937441643324, "grad_norm": 0.028658481314778328, "learning_rate": 0.0002, "loss": 0.5568941831588745, "mean_token_accuracy": 0.7728902250528336, "num_tokens": 6848346.0, "step": 420 }, { "entropy": 0.5431763082742691, "epoch": 1.5714285714285714, "grad_norm": 0.027273258194327354, "learning_rate": 0.0002, "loss": 0.5424181818962097, "mean_token_accuracy": 0.7814328521490097, "num_tokens": 6864537.0, "step": 421 }, { "entropy": 0.5466543883085251, "epoch": 1.5751633986928104, "grad_norm": 0.02875494956970215, "learning_rate": 0.0002, "loss": 0.5450119972229004, "mean_token_accuracy": 0.7765506953001022, "num_tokens": 6881053.0, "step": 422 }, { "entropy": 0.5499023944139481, "epoch": 1.5788982259570497, "grad_norm": 0.02958599291741848, "learning_rate": 0.0002, "loss": 0.5486996173858643, "mean_token_accuracy": 0.778396338224411, "num_tokens": 6897409.0, "step": 423 }, { "entropy": 0.5387710481882095, "epoch": 1.5826330532212887, "grad_norm": 0.030644621700048447, "learning_rate": 0.0002, "loss": 0.5404931306838989, "mean_token_accuracy": 0.7786550223827362, "num_tokens": 6913681.0, "step": 424 }, { "entropy": 0.5346106290817261, "epoch": 1.5863678804855277, "grad_norm": 0.028904983773827553, "learning_rate": 0.0002, "loss": 0.5413768887519836, "mean_token_accuracy": 0.7797856479883194, "num_tokens": 6930096.0, "step": 425 }, { "entropy": 0.5166824460029602, "epoch": 1.5901027077497667, "grad_norm": 0.03321892023086548, "learning_rate": 0.0002, "loss": 0.5238149166107178, "mean_token_accuracy": 0.7857634872198105, "num_tokens": 6946449.0, "step": 426 }, { "entropy": 0.5426425486803055, "epoch": 1.5938375350140057, "grad_norm": 0.030873097479343414, "learning_rate": 0.0002, "loss": 0.5491586923599243, "mean_token_accuracy": 0.7750476896762848, "num_tokens": 6962805.0, "step": 427 }, { "entropy": 0.555439367890358, "epoch": 1.5975723622782447, "grad_norm": 0.030430428683757782, "learning_rate": 0.0002, "loss": 0.5504173040390015, "mean_token_accuracy": 0.7780658453702927, "num_tokens": 6979378.0, "step": 428 }, { "entropy": 0.5425661355257034, "epoch": 1.6013071895424837, "grad_norm": 0.033183399587869644, "learning_rate": 0.0002, "loss": 0.5338144302368164, "mean_token_accuracy": 0.7815939337015152, "num_tokens": 6995576.0, "step": 429 }, { "entropy": 0.5580693334341049, "epoch": 1.6050420168067228, "grad_norm": 0.02936139702796936, "learning_rate": 0.0002, "loss": 0.5471250414848328, "mean_token_accuracy": 0.7805830985307693, "num_tokens": 7011887.0, "step": 430 }, { "entropy": 0.5445709973573685, "epoch": 1.6087768440709618, "grad_norm": 0.029686426743865013, "learning_rate": 0.0002, "loss": 0.5449705719947815, "mean_token_accuracy": 0.7791666090488434, "num_tokens": 7028245.0, "step": 431 }, { "entropy": 0.5352734625339508, "epoch": 1.6125116713352008, "grad_norm": 0.0335598923265934, "learning_rate": 0.0002, "loss": 0.5456029772758484, "mean_token_accuracy": 0.7778525203466415, "num_tokens": 7044490.0, "step": 432 }, { "entropy": 0.548936665058136, "epoch": 1.6162464985994398, "grad_norm": 0.03590673953294754, "learning_rate": 0.0002, "loss": 0.5520269870758057, "mean_token_accuracy": 0.7742140144109726, "num_tokens": 7060917.0, "step": 433 }, { "entropy": 0.5434507131576538, "epoch": 1.6199813258636788, "grad_norm": 0.028407955542206764, "learning_rate": 0.0002, "loss": 0.5414606332778931, "mean_token_accuracy": 0.778694823384285, "num_tokens": 7077100.0, "step": 434 }, { "entropy": 0.5490714907646179, "epoch": 1.6237161531279178, "grad_norm": 0.0324469618499279, "learning_rate": 0.0002, "loss": 0.5481012463569641, "mean_token_accuracy": 0.7763958275318146, "num_tokens": 7093665.0, "step": 435 }, { "entropy": 0.5379714071750641, "epoch": 1.6274509803921569, "grad_norm": 0.030424365773797035, "learning_rate": 0.0002, "loss": 0.5396856665611267, "mean_token_accuracy": 0.7815098166465759, "num_tokens": 7110174.0, "step": 436 }, { "entropy": 0.5480812042951584, "epoch": 1.6311858076563959, "grad_norm": 0.029105886816978455, "learning_rate": 0.0002, "loss": 0.5511510372161865, "mean_token_accuracy": 0.7754542678594589, "num_tokens": 7126486.0, "step": 437 }, { "entropy": 0.5540740191936493, "epoch": 1.6349206349206349, "grad_norm": 0.027599727734923363, "learning_rate": 0.0002, "loss": 0.5574399828910828, "mean_token_accuracy": 0.7723194360733032, "num_tokens": 7143064.0, "step": 438 }, { "entropy": 0.5382533967494965, "epoch": 1.638655462184874, "grad_norm": 0.02985025756061077, "learning_rate": 0.0002, "loss": 0.542414665222168, "mean_token_accuracy": 0.7797781080007553, "num_tokens": 7159194.0, "step": 439 }, { "entropy": 0.545093446969986, "epoch": 1.642390289449113, "grad_norm": 0.033221568912267685, "learning_rate": 0.0002, "loss": 0.5397443771362305, "mean_token_accuracy": 0.781465008854866, "num_tokens": 7175448.0, "step": 440 }, { "entropy": 0.547942727804184, "epoch": 1.646125116713352, "grad_norm": 0.030130675062537193, "learning_rate": 0.0002, "loss": 0.5471298098564148, "mean_token_accuracy": 0.7778923958539963, "num_tokens": 7191951.0, "step": 441 }, { "entropy": 0.5388812720775604, "epoch": 1.649859943977591, "grad_norm": 0.03608401492238045, "learning_rate": 0.0002, "loss": 0.5405545234680176, "mean_token_accuracy": 0.7795072197914124, "num_tokens": 7208082.0, "step": 442 }, { "entropy": 0.5480445921421051, "epoch": 1.65359477124183, "grad_norm": 0.03251367062330246, "learning_rate": 0.0002, "loss": 0.5486726760864258, "mean_token_accuracy": 0.7771764397621155, "num_tokens": 7224432.0, "step": 443 }, { "entropy": 0.5502856224775314, "epoch": 1.657329598506069, "grad_norm": 0.03557496517896652, "learning_rate": 0.0002, "loss": 0.5455541014671326, "mean_token_accuracy": 0.7788678556680679, "num_tokens": 7241112.0, "step": 444 }, { "entropy": 0.5650181323289871, "epoch": 1.661064425770308, "grad_norm": 0.036821287125349045, "learning_rate": 0.0002, "loss": 0.5659928321838379, "mean_token_accuracy": 0.7705142349004745, "num_tokens": 7257646.0, "step": 445 }, { "entropy": 0.5301887840032578, "epoch": 1.664799253034547, "grad_norm": 0.028849398717284203, "learning_rate": 0.0002, "loss": 0.5311304926872253, "mean_token_accuracy": 0.7853154540061951, "num_tokens": 7273883.0, "step": 446 }, { "entropy": 0.5287686139345169, "epoch": 1.668534080298786, "grad_norm": 0.027796290814876556, "learning_rate": 0.0002, "loss": 0.5300359129905701, "mean_token_accuracy": 0.7818829715251923, "num_tokens": 7290094.0, "step": 447 }, { "entropy": 0.5384389162063599, "epoch": 1.6722689075630253, "grad_norm": 0.03137550130486488, "learning_rate": 0.0002, "loss": 0.5358840227127075, "mean_token_accuracy": 0.7822984606027603, "num_tokens": 7306318.0, "step": 448 }, { "entropy": 0.5409219712018967, "epoch": 1.6760037348272643, "grad_norm": 0.03238392993807793, "learning_rate": 0.0002, "loss": 0.5490888357162476, "mean_token_accuracy": 0.7757006883621216, "num_tokens": 7322518.0, "step": 449 }, { "entropy": 0.5399473458528519, "epoch": 1.6797385620915033, "grad_norm": 0.03108685463666916, "learning_rate": 0.0002, "loss": 0.5397608876228333, "mean_token_accuracy": 0.7774724215269089, "num_tokens": 7338931.0, "step": 450 }, { "entropy": 0.5551822930574417, "epoch": 1.6834733893557423, "grad_norm": 0.02780800126492977, "learning_rate": 0.0002, "loss": 0.5481570959091187, "mean_token_accuracy": 0.7780963182449341, "num_tokens": 7355336.0, "step": 451 }, { "entropy": 0.54237399995327, "epoch": 1.6872082166199813, "grad_norm": 0.04012434557080269, "learning_rate": 0.0002, "loss": 0.5462750792503357, "mean_token_accuracy": 0.7741427570581436, "num_tokens": 7371655.0, "step": 452 }, { "entropy": 0.5476243197917938, "epoch": 1.6909430438842203, "grad_norm": 0.031238745898008347, "learning_rate": 0.0002, "loss": 0.5490629076957703, "mean_token_accuracy": 0.7778069078922272, "num_tokens": 7387779.0, "step": 453 }, { "entropy": 0.5370198786258698, "epoch": 1.6946778711484594, "grad_norm": 0.0672907754778862, "learning_rate": 0.0002, "loss": 0.5387383699417114, "mean_token_accuracy": 0.7835952490568161, "num_tokens": 7404160.0, "step": 454 }, { "entropy": 0.5476315915584564, "epoch": 1.6984126984126984, "grad_norm": 0.029196592047810555, "learning_rate": 0.0002, "loss": 0.5511754751205444, "mean_token_accuracy": 0.7767634838819504, "num_tokens": 7420779.0, "step": 455 }, { "entropy": 0.5495481044054031, "epoch": 1.7021475256769374, "grad_norm": 0.03591341897845268, "learning_rate": 0.0002, "loss": 0.5475634336471558, "mean_token_accuracy": 0.7761732786893845, "num_tokens": 7437268.0, "step": 456 }, { "entropy": 0.5471929609775543, "epoch": 1.7058823529411766, "grad_norm": 0.07272505015134811, "learning_rate": 0.0002, "loss": 0.5460875630378723, "mean_token_accuracy": 0.7771887481212616, "num_tokens": 7453407.0, "step": 457 }, { "entropy": 0.5470087379217148, "epoch": 1.7096171802054156, "grad_norm": 0.027592960745096207, "learning_rate": 0.0002, "loss": 0.544583797454834, "mean_token_accuracy": 0.7774143517017365, "num_tokens": 7469641.0, "step": 458 }, { "entropy": 0.5607744753360748, "epoch": 1.7133520074696547, "grad_norm": 0.031071651726961136, "learning_rate": 0.0002, "loss": 0.5542961955070496, "mean_token_accuracy": 0.7748319655656815, "num_tokens": 7486190.0, "step": 459 }, { "entropy": 0.5514983385801315, "epoch": 1.7170868347338937, "grad_norm": 0.03477690741419792, "learning_rate": 0.0002, "loss": 0.5511950254440308, "mean_token_accuracy": 0.7754039019346237, "num_tokens": 7502685.0, "step": 460 }, { "entropy": 0.5462844371795654, "epoch": 1.7208216619981327, "grad_norm": 0.02956387773156166, "learning_rate": 0.0002, "loss": 0.5578323602676392, "mean_token_accuracy": 0.7759933173656464, "num_tokens": 7518976.0, "step": 461 }, { "entropy": 0.5413178950548172, "epoch": 1.7245564892623717, "grad_norm": 0.03515993058681488, "learning_rate": 0.0002, "loss": 0.5494832992553711, "mean_token_accuracy": 0.7766997069120407, "num_tokens": 7535230.0, "step": 462 }, { "entropy": 0.5519613027572632, "epoch": 1.7282913165266107, "grad_norm": 0.03921071067452431, "learning_rate": 0.0002, "loss": 0.5593541860580444, "mean_token_accuracy": 0.7729771286249161, "num_tokens": 7551766.0, "step": 463 }, { "entropy": 0.5483202934265137, "epoch": 1.7320261437908497, "grad_norm": 0.02950095944106579, "learning_rate": 0.0002, "loss": 0.5464847683906555, "mean_token_accuracy": 0.7769839763641357, "num_tokens": 7568028.0, "step": 464 }, { "entropy": 0.5524065643548965, "epoch": 1.7357609710550888, "grad_norm": 0.038918618112802505, "learning_rate": 0.0002, "loss": 0.5422624945640564, "mean_token_accuracy": 0.7797468602657318, "num_tokens": 7584397.0, "step": 465 }, { "entropy": 0.546732097864151, "epoch": 1.7394957983193278, "grad_norm": 0.03082694672048092, "learning_rate": 0.0002, "loss": 0.5352342128753662, "mean_token_accuracy": 0.78376704454422, "num_tokens": 7600719.0, "step": 466 }, { "entropy": 0.557578444480896, "epoch": 1.7432306255835668, "grad_norm": 0.031017586588859558, "learning_rate": 0.0002, "loss": 0.54631108045578, "mean_token_accuracy": 0.7787049263715744, "num_tokens": 7617277.0, "step": 467 }, { "entropy": 0.5322857201099396, "epoch": 1.7469654528478058, "grad_norm": 0.0356813408434391, "learning_rate": 0.0002, "loss": 0.5350920557975769, "mean_token_accuracy": 0.7820670753717422, "num_tokens": 7633468.0, "step": 468 }, { "entropy": 0.5373670607805252, "epoch": 1.7507002801120448, "grad_norm": 0.0339689627289772, "learning_rate": 0.0002, "loss": 0.5516907572746277, "mean_token_accuracy": 0.7766174525022507, "num_tokens": 7649778.0, "step": 469 }, { "entropy": 0.522003561258316, "epoch": 1.7544351073762838, "grad_norm": 0.034353625029325485, "learning_rate": 0.0002, "loss": 0.533075749874115, "mean_token_accuracy": 0.7833420485258102, "num_tokens": 7666182.0, "step": 470 }, { "entropy": 0.5592000931501389, "epoch": 1.7581699346405228, "grad_norm": 0.029966510832309723, "learning_rate": 0.0002, "loss": 0.5585059523582458, "mean_token_accuracy": 0.7732168585062027, "num_tokens": 7682572.0, "step": 471 }, { "entropy": 0.5302631109952927, "epoch": 1.7619047619047619, "grad_norm": 0.030881982296705246, "learning_rate": 0.0002, "loss": 0.5330703854560852, "mean_token_accuracy": 0.7825835943222046, "num_tokens": 7698564.0, "step": 472 }, { "entropy": 0.5615632385015488, "epoch": 1.7656395891690009, "grad_norm": 0.03000018559396267, "learning_rate": 0.0002, "loss": 0.5536789298057556, "mean_token_accuracy": 0.7739888280630112, "num_tokens": 7714922.0, "step": 473 }, { "entropy": 0.5522587448358536, "epoch": 1.76937441643324, "grad_norm": 0.031349826604127884, "learning_rate": 0.0002, "loss": 0.551250696182251, "mean_token_accuracy": 0.7755384594202042, "num_tokens": 7731301.0, "step": 474 }, { "entropy": 0.5275092422962189, "epoch": 1.773109243697479, "grad_norm": 0.026553746312856674, "learning_rate": 0.0002, "loss": 0.5240329504013062, "mean_token_accuracy": 0.7870848327875137, "num_tokens": 7747693.0, "step": 475 }, { "entropy": 0.5298073589801788, "epoch": 1.776844070961718, "grad_norm": 0.03024754300713539, "learning_rate": 0.0002, "loss": 0.5267937183380127, "mean_token_accuracy": 0.7867465615272522, "num_tokens": 7763990.0, "step": 476 }, { "entropy": 0.5466170459985733, "epoch": 1.780578898225957, "grad_norm": 0.03677600622177124, "learning_rate": 0.0002, "loss": 0.5455999374389648, "mean_token_accuracy": 0.7789721339941025, "num_tokens": 7780428.0, "step": 477 }, { "entropy": 0.5342886596918106, "epoch": 1.784313725490196, "grad_norm": 0.03470218554139137, "learning_rate": 0.0002, "loss": 0.5434668660163879, "mean_token_accuracy": 0.7787842005491257, "num_tokens": 7796524.0, "step": 478 }, { "entropy": 0.5427644997835159, "epoch": 1.788048552754435, "grad_norm": 0.026957696303725243, "learning_rate": 0.0002, "loss": 0.5418925285339355, "mean_token_accuracy": 0.7785145193338394, "num_tokens": 7813105.0, "step": 479 }, { "entropy": 0.528566911816597, "epoch": 1.791783380018674, "grad_norm": 0.037975575774908066, "learning_rate": 0.0002, "loss": 0.5284658074378967, "mean_token_accuracy": 0.7871547490358353, "num_tokens": 7829398.0, "step": 480 }, { "entropy": 0.5551463812589645, "epoch": 1.795518207282913, "grad_norm": 0.028514336794614792, "learning_rate": 0.0002, "loss": 0.556096076965332, "mean_token_accuracy": 0.7756756544113159, "num_tokens": 7845626.0, "step": 481 }, { "entropy": 0.5317743271589279, "epoch": 1.7992530345471522, "grad_norm": 0.03154602646827698, "learning_rate": 0.0002, "loss": 0.5321435332298279, "mean_token_accuracy": 0.7815971374511719, "num_tokens": 7861817.0, "step": 482 }, { "entropy": 0.547456681728363, "epoch": 1.8029878618113913, "grad_norm": 0.03746788948774338, "learning_rate": 0.0002, "loss": 0.5512088537216187, "mean_token_accuracy": 0.7785567492246628, "num_tokens": 7878075.0, "step": 483 }, { "entropy": 0.5500114560127258, "epoch": 1.8067226890756303, "grad_norm": 0.030493978410959244, "learning_rate": 0.0002, "loss": 0.5513818264007568, "mean_token_accuracy": 0.773513063788414, "num_tokens": 7894502.0, "step": 484 }, { "entropy": 0.543748289346695, "epoch": 1.8104575163398693, "grad_norm": 0.036304932087659836, "learning_rate": 0.0002, "loss": 0.5411792993545532, "mean_token_accuracy": 0.7785163670778275, "num_tokens": 7910890.0, "step": 485 }, { "entropy": 0.5393827706575394, "epoch": 1.8141923436041083, "grad_norm": 0.03712041303515434, "learning_rate": 0.0002, "loss": 0.540428876876831, "mean_token_accuracy": 0.7790835350751877, "num_tokens": 7927094.0, "step": 486 }, { "entropy": 0.5430537164211273, "epoch": 1.8179271708683473, "grad_norm": 0.03853759169578552, "learning_rate": 0.0002, "loss": 0.5471268892288208, "mean_token_accuracy": 0.7801193594932556, "num_tokens": 7943326.0, "step": 487 }, { "entropy": 0.5636092722415924, "epoch": 1.8216619981325863, "grad_norm": 0.0457291305065155, "learning_rate": 0.0002, "loss": 0.5627254843711853, "mean_token_accuracy": 0.7725824415683746, "num_tokens": 7959760.0, "step": 488 }, { "entropy": 0.543666809797287, "epoch": 1.8253968253968254, "grad_norm": 0.02919071726500988, "learning_rate": 0.0002, "loss": 0.5421757102012634, "mean_token_accuracy": 0.7801477611064911, "num_tokens": 7975860.0, "step": 489 }, { "entropy": 0.5545783638954163, "epoch": 1.8291316526610646, "grad_norm": 0.03340514004230499, "learning_rate": 0.0002, "loss": 0.5518795251846313, "mean_token_accuracy": 0.7791309058666229, "num_tokens": 7992340.0, "step": 490 }, { "entropy": 0.5608565956354141, "epoch": 1.8328664799253036, "grad_norm": 0.03725928068161011, "learning_rate": 0.0002, "loss": 0.5564695596694946, "mean_token_accuracy": 0.7749106585979462, "num_tokens": 8008783.0, "step": 491 }, { "entropy": 0.5600428581237793, "epoch": 1.8366013071895426, "grad_norm": 0.030761808156967163, "learning_rate": 0.0002, "loss": 0.5595075488090515, "mean_token_accuracy": 0.7733119577169418, "num_tokens": 8025204.0, "step": 492 }, { "entropy": 0.5233868211507797, "epoch": 1.8403361344537816, "grad_norm": 0.030873069539666176, "learning_rate": 0.0002, "loss": 0.5303994417190552, "mean_token_accuracy": 0.784132570028305, "num_tokens": 8041524.0, "step": 493 }, { "entropy": 0.5531543642282486, "epoch": 1.8440709617180207, "grad_norm": 0.037785280495882034, "learning_rate": 0.0002, "loss": 0.5541731119155884, "mean_token_accuracy": 0.7754590958356857, "num_tokens": 8057944.0, "step": 494 }, { "entropy": 0.542868971824646, "epoch": 1.8478057889822597, "grad_norm": 0.03054802305996418, "learning_rate": 0.0002, "loss": 0.5407766699790955, "mean_token_accuracy": 0.7781128138303757, "num_tokens": 8074585.0, "step": 495 }, { "entropy": 0.5384076237678528, "epoch": 1.8515406162464987, "grad_norm": 0.024639198556542397, "learning_rate": 0.0002, "loss": 0.5381752848625183, "mean_token_accuracy": 0.7817619889974594, "num_tokens": 8091097.0, "step": 496 }, { "entropy": 0.5398432165384293, "epoch": 1.8552754435107377, "grad_norm": 0.04202251881361008, "learning_rate": 0.0002, "loss": 0.5468040704727173, "mean_token_accuracy": 0.7771125733852386, "num_tokens": 8107370.0, "step": 497 }, { "entropy": 0.5353098064661026, "epoch": 1.8590102707749767, "grad_norm": 0.03730052337050438, "learning_rate": 0.0002, "loss": 0.5450741052627563, "mean_token_accuracy": 0.7791319042444229, "num_tokens": 8123388.0, "step": 498 }, { "entropy": 0.537789598107338, "epoch": 1.8627450980392157, "grad_norm": 0.02861681580543518, "learning_rate": 0.0002, "loss": 0.5363599061965942, "mean_token_accuracy": 0.7793796509504318, "num_tokens": 8139491.0, "step": 499 }, { "entropy": 0.5609306395053864, "epoch": 1.8664799253034547, "grad_norm": 0.04193006083369255, "learning_rate": 0.0002, "loss": 0.5556061267852783, "mean_token_accuracy": 0.7729236781597137, "num_tokens": 8155893.0, "step": 500 }, { "entropy": 0.5393400639295578, "epoch": 1.8702147525676938, "grad_norm": 0.030415907502174377, "learning_rate": 0.0002, "loss": 0.5372475385665894, "mean_token_accuracy": 0.7827758193016052, "num_tokens": 8172021.0, "step": 501 }, { "entropy": 0.5631109476089478, "epoch": 1.8739495798319328, "grad_norm": 0.030597561970353127, "learning_rate": 0.0002, "loss": 0.56128990650177, "mean_token_accuracy": 0.7722143828868866, "num_tokens": 8188761.0, "step": 502 }, { "entropy": 0.540698915719986, "epoch": 1.8776844070961718, "grad_norm": 0.03197801113128662, "learning_rate": 0.0002, "loss": 0.5419467687606812, "mean_token_accuracy": 0.7789603024721146, "num_tokens": 8205080.0, "step": 503 }, { "entropy": 0.5343400835990906, "epoch": 1.8814192343604108, "grad_norm": 0.03577344864606857, "learning_rate": 0.0002, "loss": 0.5340043306350708, "mean_token_accuracy": 0.7837951481342316, "num_tokens": 8221164.0, "step": 504 }, { "entropy": 0.5417536497116089, "epoch": 1.8851540616246498, "grad_norm": 0.029083728790283203, "learning_rate": 0.0002, "loss": 0.5438728332519531, "mean_token_accuracy": 0.7775007486343384, "num_tokens": 8237535.0, "step": 505 }, { "entropy": 0.5649835765361786, "epoch": 1.8888888888888888, "grad_norm": 0.03408566117286682, "learning_rate": 0.0002, "loss": 0.5633872151374817, "mean_token_accuracy": 0.7726111114025116, "num_tokens": 8253827.0, "step": 506 }, { "entropy": 0.5582909733057022, "epoch": 1.8926237161531279, "grad_norm": 0.028437087312340736, "learning_rate": 0.0002, "loss": 0.556007981300354, "mean_token_accuracy": 0.7727185785770416, "num_tokens": 8270404.0, "step": 507 }, { "entropy": 0.5577380061149597, "epoch": 1.8963585434173669, "grad_norm": 0.029986968263983727, "learning_rate": 0.0002, "loss": 0.5514963865280151, "mean_token_accuracy": 0.7755957692861557, "num_tokens": 8286963.0, "step": 508 }, { "entropy": 0.5398396402597427, "epoch": 1.9000933706816059, "grad_norm": 0.030943697318434715, "learning_rate": 0.0002, "loss": 0.5466131567955017, "mean_token_accuracy": 0.7787002176046371, "num_tokens": 8303122.0, "step": 509 }, { "entropy": 0.536215141415596, "epoch": 1.903828197945845, "grad_norm": 0.03370903804898262, "learning_rate": 0.0002, "loss": 0.5468170046806335, "mean_token_accuracy": 0.7753093987703323, "num_tokens": 8319505.0, "step": 510 }, { "entropy": 0.5411160290241241, "epoch": 1.907563025210084, "grad_norm": 0.028430534526705742, "learning_rate": 0.0002, "loss": 0.5434973835945129, "mean_token_accuracy": 0.7790606617927551, "num_tokens": 8335861.0, "step": 511 }, { "entropy": 0.5555713921785355, "epoch": 1.911297852474323, "grad_norm": 0.029101036489009857, "learning_rate": 0.0002, "loss": 0.5541608929634094, "mean_token_accuracy": 0.7740498781204224, "num_tokens": 8352413.0, "step": 512 }, { "entropy": 0.5440339744091034, "epoch": 1.915032679738562, "grad_norm": 0.029705537483096123, "learning_rate": 0.0002, "loss": 0.5449399352073669, "mean_token_accuracy": 0.7799241691827774, "num_tokens": 8368524.0, "step": 513 }, { "entropy": 0.5385466068983078, "epoch": 1.918767507002801, "grad_norm": 0.02762160450220108, "learning_rate": 0.0002, "loss": 0.5408512353897095, "mean_token_accuracy": 0.7800593823194504, "num_tokens": 8384881.0, "step": 514 }, { "entropy": 0.5469230860471725, "epoch": 1.9225023342670402, "grad_norm": 0.02923613414168358, "learning_rate": 0.0002, "loss": 0.5409518480300903, "mean_token_accuracy": 0.7801833301782608, "num_tokens": 8401135.0, "step": 515 }, { "entropy": 0.5446400791406631, "epoch": 1.9262371615312792, "grad_norm": 0.031235719099640846, "learning_rate": 0.0002, "loss": 0.5424818992614746, "mean_token_accuracy": 0.7767819166183472, "num_tokens": 8417485.0, "step": 516 }, { "entropy": 0.5608149170875549, "epoch": 1.9299719887955182, "grad_norm": 0.027529114857316017, "learning_rate": 0.0002, "loss": 0.5587472915649414, "mean_token_accuracy": 0.7713829576969147, "num_tokens": 8433808.0, "step": 517 }, { "entropy": 0.560915470123291, "epoch": 1.9337068160597572, "grad_norm": 0.03099709376692772, "learning_rate": 0.0002, "loss": 0.5625618100166321, "mean_token_accuracy": 0.7697023302316666, "num_tokens": 8450212.0, "step": 518 }, { "entropy": 0.5411669313907623, "epoch": 1.9374416433239963, "grad_norm": 0.03581510856747627, "learning_rate": 0.0002, "loss": 0.5449709892272949, "mean_token_accuracy": 0.779253363609314, "num_tokens": 8466650.0, "step": 519 }, { "entropy": 0.5495533496141434, "epoch": 1.9411764705882353, "grad_norm": 0.02863345853984356, "learning_rate": 0.0002, "loss": 0.5461183786392212, "mean_token_accuracy": 0.7790695428848267, "num_tokens": 8482819.0, "step": 520 }, { "entropy": 0.5496646910905838, "epoch": 1.9449112978524743, "grad_norm": 0.028455862775444984, "learning_rate": 0.0002, "loss": 0.5562914609909058, "mean_token_accuracy": 0.7747850865125656, "num_tokens": 8499201.0, "step": 521 }, { "entropy": 0.5566077679395676, "epoch": 1.9486461251167133, "grad_norm": 0.030010810121893883, "learning_rate": 0.0002, "loss": 0.551722526550293, "mean_token_accuracy": 0.7771954238414764, "num_tokens": 8515798.0, "step": 522 }, { "entropy": 0.5467117130756378, "epoch": 1.9523809523809523, "grad_norm": 0.027012262493371964, "learning_rate": 0.0002, "loss": 0.5425857305526733, "mean_token_accuracy": 0.7798562794923782, "num_tokens": 8531958.0, "step": 523 }, { "entropy": 0.5346378833055496, "epoch": 1.9561157796451916, "grad_norm": 0.028377590700984, "learning_rate": 0.0002, "loss": 0.5295640230178833, "mean_token_accuracy": 0.7838203459978104, "num_tokens": 8548384.0, "step": 524 }, { "entropy": 0.5571393668651581, "epoch": 1.9598506069094306, "grad_norm": 0.02818567305803299, "learning_rate": 0.0002, "loss": 0.5521214008331299, "mean_token_accuracy": 0.7728232592344284, "num_tokens": 8564872.0, "step": 525 }, { "entropy": 0.5285107642412186, "epoch": 1.9635854341736696, "grad_norm": 0.03457087650895119, "learning_rate": 0.0002, "loss": 0.5370362401008606, "mean_token_accuracy": 0.7813837081193924, "num_tokens": 8581245.0, "step": 526 }, { "entropy": 0.5266488045454025, "epoch": 1.9673202614379086, "grad_norm": 0.030525686219334602, "learning_rate": 0.0002, "loss": 0.5345274806022644, "mean_token_accuracy": 0.7815807163715363, "num_tokens": 8597625.0, "step": 527 }, { "entropy": 0.5280887708067894, "epoch": 1.9710550887021476, "grad_norm": 0.03248651325702667, "learning_rate": 0.0002, "loss": 0.536238431930542, "mean_token_accuracy": 0.781073585152626, "num_tokens": 8613792.0, "step": 528 }, { "entropy": 0.5472559034824371, "epoch": 1.9747899159663866, "grad_norm": 0.029427766799926758, "learning_rate": 0.0002, "loss": 0.5451797842979431, "mean_token_accuracy": 0.7770361602306366, "num_tokens": 8629870.0, "step": 529 }, { "entropy": 0.5381799042224884, "epoch": 1.9785247432306257, "grad_norm": 0.028413154184818268, "learning_rate": 0.0002, "loss": 0.5342366695404053, "mean_token_accuracy": 0.7803498655557632, "num_tokens": 8646077.0, "step": 530 }, { "entropy": 0.5565104633569717, "epoch": 1.9822595704948647, "grad_norm": 0.031074564903974533, "learning_rate": 0.0002, "loss": 0.5515958666801453, "mean_token_accuracy": 0.7759076505899429, "num_tokens": 8662535.0, "step": 531 }, { "entropy": 0.5381414890289307, "epoch": 1.9859943977591037, "grad_norm": 0.027250438928604126, "learning_rate": 0.0002, "loss": 0.534949004650116, "mean_token_accuracy": 0.7819445878267288, "num_tokens": 8679064.0, "step": 532 }, { "entropy": 0.550770565867424, "epoch": 1.9897292250233427, "grad_norm": 0.03366328775882721, "learning_rate": 0.0002, "loss": 0.560295045375824, "mean_token_accuracy": 0.7720893323421478, "num_tokens": 8695198.0, "step": 533 }, { "entropy": 0.5551019459962845, "epoch": 1.9934640522875817, "grad_norm": 0.03133872151374817, "learning_rate": 0.0002, "loss": 0.5596403479576111, "mean_token_accuracy": 0.7717682421207428, "num_tokens": 8711690.0, "step": 534 }, { "entropy": 0.5346082448959351, "epoch": 1.9971988795518207, "grad_norm": 0.027525832876563072, "learning_rate": 0.0002, "loss": 0.5321208834648132, "mean_token_accuracy": 0.7810429036617279, "num_tokens": 8727828.0, "step": 535 }, { "entropy": 0.5438209176063538, "epoch": 2.0, "grad_norm": 0.03134825825691223, "learning_rate": 0.0002, "loss": 0.5398504734039307, "mean_token_accuracy": 0.7799929777781168, "num_tokens": 8729600.0, "step": 536 }, { "entropy": 0.5402208417654037, "epoch": 2.003734827264239, "grad_norm": 0.03922782838344574, "learning_rate": 0.0002, "loss": 0.5171674489974976, "mean_token_accuracy": 0.7900938540697098, "num_tokens": 8745809.0, "step": 537 }, { "entropy": 0.5227422267198563, "epoch": 2.007469654528478, "grad_norm": 0.032982293516397476, "learning_rate": 0.0002, "loss": 0.5183929204940796, "mean_token_accuracy": 0.7890477329492569, "num_tokens": 8762197.0, "step": 538 }, { "entropy": 0.5411823242902756, "epoch": 2.011204481792717, "grad_norm": 0.043377745896577835, "learning_rate": 0.0002, "loss": 0.5554962158203125, "mean_token_accuracy": 0.7736512869596481, "num_tokens": 8778400.0, "step": 539 }, { "entropy": 0.5156290903687477, "epoch": 2.014939309056956, "grad_norm": 0.05257771536707878, "learning_rate": 0.0002, "loss": 0.5335375666618347, "mean_token_accuracy": 0.7833946198225021, "num_tokens": 8794851.0, "step": 540 }, { "entropy": 0.5122585743665695, "epoch": 2.018674136321195, "grad_norm": 0.03504469618201256, "learning_rate": 0.0002, "loss": 0.5155843496322632, "mean_token_accuracy": 0.7894317060709, "num_tokens": 8811019.0, "step": 541 }, { "entropy": 0.5448772013187408, "epoch": 2.022408963585434, "grad_norm": 0.0317138209939003, "learning_rate": 0.0002, "loss": 0.5368859171867371, "mean_token_accuracy": 0.7842776328325272, "num_tokens": 8827258.0, "step": 542 }, { "entropy": 0.5527419149875641, "epoch": 2.026143790849673, "grad_norm": 0.03476279601454735, "learning_rate": 0.0002, "loss": 0.5388182997703552, "mean_token_accuracy": 0.7820452451705933, "num_tokens": 8843634.0, "step": 543 }, { "entropy": 0.5452789962291718, "epoch": 2.029878618113912, "grad_norm": 0.036034028977155685, "learning_rate": 0.0002, "loss": 0.5357140302658081, "mean_token_accuracy": 0.7825983464717865, "num_tokens": 8859977.0, "step": 544 }, { "entropy": 0.5304267108440399, "epoch": 2.033613445378151, "grad_norm": 0.02969290129840374, "learning_rate": 0.0002, "loss": 0.5306066870689392, "mean_token_accuracy": 0.7856841534376144, "num_tokens": 8876314.0, "step": 545 }, { "entropy": 0.5096816495060921, "epoch": 2.03734827264239, "grad_norm": 0.043957311660051346, "learning_rate": 0.0002, "loss": 0.5185045599937439, "mean_token_accuracy": 0.7923233062028885, "num_tokens": 8892568.0, "step": 546 }, { "entropy": 0.5319265872240067, "epoch": 2.041083099906629, "grad_norm": 0.035869866609573364, "learning_rate": 0.0002, "loss": 0.5334051847457886, "mean_token_accuracy": 0.7822142988443375, "num_tokens": 8909094.0, "step": 547 }, { "entropy": 0.527954563498497, "epoch": 2.044817927170868, "grad_norm": 0.034570369869470596, "learning_rate": 0.0002, "loss": 0.5274232029914856, "mean_token_accuracy": 0.7842770516872406, "num_tokens": 8925492.0, "step": 548 }, { "entropy": 0.5419287383556366, "epoch": 2.048552754435107, "grad_norm": 0.03259408101439476, "learning_rate": 0.0002, "loss": 0.5387848615646362, "mean_token_accuracy": 0.7804249227046967, "num_tokens": 8941717.0, "step": 549 }, { "entropy": 0.5271529629826546, "epoch": 2.052287581699346, "grad_norm": 0.03245944157242775, "learning_rate": 0.0002, "loss": 0.5230631828308105, "mean_token_accuracy": 0.785658523440361, "num_tokens": 8957856.0, "step": 550 }, { "entropy": 0.5363311916589737, "epoch": 2.0560224089635852, "grad_norm": 0.035185229033231735, "learning_rate": 0.0002, "loss": 0.5378749370574951, "mean_token_accuracy": 0.781972661614418, "num_tokens": 8974161.0, "step": 551 }, { "entropy": 0.513224758207798, "epoch": 2.0597572362278243, "grad_norm": 0.032956283539533615, "learning_rate": 0.0002, "loss": 0.5172683000564575, "mean_token_accuracy": 0.7906839847564697, "num_tokens": 8990304.0, "step": 552 }, { "entropy": 0.5387901067733765, "epoch": 2.0634920634920633, "grad_norm": 0.03281653672456741, "learning_rate": 0.0002, "loss": 0.5435392260551453, "mean_token_accuracy": 0.7788385599851608, "num_tokens": 9006661.0, "step": 553 }, { "entropy": 0.5324967205524445, "epoch": 2.0672268907563027, "grad_norm": 0.03808191418647766, "learning_rate": 0.0002, "loss": 0.5265247225761414, "mean_token_accuracy": 0.7839124202728271, "num_tokens": 9022887.0, "step": 554 }, { "entropy": 0.513438269495964, "epoch": 2.0709617180205417, "grad_norm": 0.033963609486818314, "learning_rate": 0.0002, "loss": 0.5105268955230713, "mean_token_accuracy": 0.7911703735589981, "num_tokens": 9039478.0, "step": 555 }, { "entropy": 0.526850014925003, "epoch": 2.0746965452847808, "grad_norm": 0.03211839497089386, "learning_rate": 0.0002, "loss": 0.5205508470535278, "mean_token_accuracy": 0.7859012186527252, "num_tokens": 9055612.0, "step": 556 }, { "entropy": 0.5130272284150124, "epoch": 2.0784313725490198, "grad_norm": 0.03543682396411896, "learning_rate": 0.0002, "loss": 0.5140283703804016, "mean_token_accuracy": 0.7925766706466675, "num_tokens": 9072145.0, "step": 557 }, { "entropy": 0.540324792265892, "epoch": 2.082166199813259, "grad_norm": 0.037342023104429245, "learning_rate": 0.0002, "loss": 0.5462511777877808, "mean_token_accuracy": 0.7783039510250092, "num_tokens": 9088571.0, "step": 558 }, { "entropy": 0.5168541818857193, "epoch": 2.085901027077498, "grad_norm": 0.03552469611167908, "learning_rate": 0.0002, "loss": 0.5188402533531189, "mean_token_accuracy": 0.7880326211452484, "num_tokens": 9104869.0, "step": 559 }, { "entropy": 0.5319818705320358, "epoch": 2.089635854341737, "grad_norm": 0.03719151020050049, "learning_rate": 0.0002, "loss": 0.5254620313644409, "mean_token_accuracy": 0.7848033308982849, "num_tokens": 9121231.0, "step": 560 }, { "entropy": 0.5197737812995911, "epoch": 2.093370681605976, "grad_norm": 0.03636628016829491, "learning_rate": 0.0002, "loss": 0.5256960988044739, "mean_token_accuracy": 0.7841715961694717, "num_tokens": 9137375.0, "step": 561 }, { "entropy": 0.5284384936094284, "epoch": 2.097105508870215, "grad_norm": 0.04217526316642761, "learning_rate": 0.0002, "loss": 0.5343865752220154, "mean_token_accuracy": 0.7833265513181686, "num_tokens": 9153783.0, "step": 562 }, { "entropy": 0.541428878903389, "epoch": 2.100840336134454, "grad_norm": 0.035067781805992126, "learning_rate": 0.0002, "loss": 0.532885730266571, "mean_token_accuracy": 0.7817672491073608, "num_tokens": 9170090.0, "step": 563 }, { "entropy": 0.5429966300725937, "epoch": 2.104575163398693, "grad_norm": 0.0392267219722271, "learning_rate": 0.0002, "loss": 0.541841447353363, "mean_token_accuracy": 0.7772143185138702, "num_tokens": 9186453.0, "step": 564 }, { "entropy": 0.5249236822128296, "epoch": 2.108309990662932, "grad_norm": 0.036935608834028244, "learning_rate": 0.0002, "loss": 0.5244463086128235, "mean_token_accuracy": 0.7873810976743698, "num_tokens": 9202852.0, "step": 565 }, { "entropy": 0.5256126970052719, "epoch": 2.112044817927171, "grad_norm": 0.03337714821100235, "learning_rate": 0.0002, "loss": 0.5264843702316284, "mean_token_accuracy": 0.7856813371181488, "num_tokens": 9219197.0, "step": 566 }, { "entropy": 0.5338774845004082, "epoch": 2.11577964519141, "grad_norm": 0.03405802696943283, "learning_rate": 0.0002, "loss": 0.5289718508720398, "mean_token_accuracy": 0.7856559157371521, "num_tokens": 9235376.0, "step": 567 }, { "entropy": 0.5203371495008469, "epoch": 2.119514472455649, "grad_norm": 0.0316944345831871, "learning_rate": 0.0002, "loss": 0.5218056440353394, "mean_token_accuracy": 0.7897576838731766, "num_tokens": 9251814.0, "step": 568 }, { "entropy": 0.5288322418928146, "epoch": 2.123249299719888, "grad_norm": 0.03991817682981491, "learning_rate": 0.0002, "loss": 0.538847029209137, "mean_token_accuracy": 0.7831858396530151, "num_tokens": 9268205.0, "step": 569 }, { "entropy": 0.5342643857002258, "epoch": 2.126984126984127, "grad_norm": 0.030493905767798424, "learning_rate": 0.0002, "loss": 0.5326664447784424, "mean_token_accuracy": 0.7838881760835648, "num_tokens": 9284834.0, "step": 570 }, { "entropy": 0.5280385613441467, "epoch": 2.130718954248366, "grad_norm": 0.033054206520318985, "learning_rate": 0.0002, "loss": 0.5298633575439453, "mean_token_accuracy": 0.7841726392507553, "num_tokens": 9301172.0, "step": 571 }, { "entropy": 0.5260151326656342, "epoch": 2.134453781512605, "grad_norm": 0.03629712015390396, "learning_rate": 0.0002, "loss": 0.5276213884353638, "mean_token_accuracy": 0.7840316295623779, "num_tokens": 9317580.0, "step": 572 }, { "entropy": 0.5471695214509964, "epoch": 2.138188608776844, "grad_norm": 0.036552250385284424, "learning_rate": 0.0002, "loss": 0.5418487787246704, "mean_token_accuracy": 0.7832918912172318, "num_tokens": 9333967.0, "step": 573 }, { "entropy": 0.5421722680330276, "epoch": 2.141923436041083, "grad_norm": 0.03261527791619301, "learning_rate": 0.0002, "loss": 0.535561203956604, "mean_token_accuracy": 0.7817313969135284, "num_tokens": 9350202.0, "step": 574 }, { "entropy": 0.5255165547132492, "epoch": 2.145658263305322, "grad_norm": 0.04084421694278717, "learning_rate": 0.0002, "loss": 0.531385064125061, "mean_token_accuracy": 0.7830993682146072, "num_tokens": 9366633.0, "step": 575 }, { "entropy": 0.5387750118970871, "epoch": 2.149393090569561, "grad_norm": 0.03884339705109596, "learning_rate": 0.0002, "loss": 0.5440813302993774, "mean_token_accuracy": 0.7815608382225037, "num_tokens": 9382903.0, "step": 576 }, { "entropy": 0.5333912819623947, "epoch": 2.1531279178338, "grad_norm": 0.03951586037874222, "learning_rate": 0.0002, "loss": 0.5371235013008118, "mean_token_accuracy": 0.7820965349674225, "num_tokens": 9399354.0, "step": 577 }, { "entropy": 0.5337669998407364, "epoch": 2.156862745098039, "grad_norm": 0.03831348195672035, "learning_rate": 0.0002, "loss": 0.5345415472984314, "mean_token_accuracy": 0.7816608846187592, "num_tokens": 9415670.0, "step": 578 }, { "entropy": 0.5471907705068588, "epoch": 2.160597572362278, "grad_norm": 0.04574183374643326, "learning_rate": 0.0002, "loss": 0.5423465967178345, "mean_token_accuracy": 0.7815522998571396, "num_tokens": 9432026.0, "step": 579 }, { "entropy": 0.5296851545572281, "epoch": 2.164332399626517, "grad_norm": 0.036245960742235184, "learning_rate": 0.0002, "loss": 0.5283267498016357, "mean_token_accuracy": 0.782764196395874, "num_tokens": 9448243.0, "step": 580 }, { "entropy": 0.5230330973863602, "epoch": 2.168067226890756, "grad_norm": 0.042745113372802734, "learning_rate": 0.0002, "loss": 0.5258357524871826, "mean_token_accuracy": 0.7882087379693985, "num_tokens": 9464651.0, "step": 581 }, { "entropy": 0.527550533413887, "epoch": 2.171802054154995, "grad_norm": 0.037547484040260315, "learning_rate": 0.0002, "loss": 0.5317714214324951, "mean_token_accuracy": 0.7830660939216614, "num_tokens": 9480840.0, "step": 582 }, { "entropy": 0.5365846008062363, "epoch": 2.175536881419234, "grad_norm": 0.031849246472120285, "learning_rate": 0.0002, "loss": 0.5385018587112427, "mean_token_accuracy": 0.7798628509044647, "num_tokens": 9497134.0, "step": 583 }, { "entropy": 0.5381672978401184, "epoch": 2.179271708683473, "grad_norm": 0.03450456261634827, "learning_rate": 0.0002, "loss": 0.534706711769104, "mean_token_accuracy": 0.7828627675771713, "num_tokens": 9513638.0, "step": 584 }, { "entropy": 0.5432828962802887, "epoch": 2.183006535947712, "grad_norm": 0.03337936848402023, "learning_rate": 0.0002, "loss": 0.537329375743866, "mean_token_accuracy": 0.7817478477954865, "num_tokens": 9530154.0, "step": 585 }, { "entropy": 0.5273857861757278, "epoch": 2.1867413632119517, "grad_norm": 0.03686324506998062, "learning_rate": 0.0002, "loss": 0.5241349339485168, "mean_token_accuracy": 0.7857643216848373, "num_tokens": 9546371.0, "step": 586 }, { "entropy": 0.5210235714912415, "epoch": 2.1904761904761907, "grad_norm": 0.036837268620729446, "learning_rate": 0.0002, "loss": 0.52490234375, "mean_token_accuracy": 0.7855679392814636, "num_tokens": 9562751.0, "step": 587 }, { "entropy": 0.5189210176467896, "epoch": 2.1942110177404297, "grad_norm": 0.034773845225572586, "learning_rate": 0.0002, "loss": 0.5269665718078613, "mean_token_accuracy": 0.7840563803911209, "num_tokens": 9579070.0, "step": 588 }, { "entropy": 0.5213501304388046, "epoch": 2.1979458450046687, "grad_norm": 0.03593657165765762, "learning_rate": 0.0002, "loss": 0.5271462798118591, "mean_token_accuracy": 0.7872640639543533, "num_tokens": 9595317.0, "step": 589 }, { "entropy": 0.5206883400678635, "epoch": 2.2016806722689077, "grad_norm": 0.04565085843205452, "learning_rate": 0.0002, "loss": 0.5203741192817688, "mean_token_accuracy": 0.7881180793046951, "num_tokens": 9611799.0, "step": 590 }, { "entropy": 0.5511275231838226, "epoch": 2.2054154995331468, "grad_norm": 0.03642827644944191, "learning_rate": 0.0002, "loss": 0.543250322341919, "mean_token_accuracy": 0.7804217487573624, "num_tokens": 9628251.0, "step": 591 }, { "entropy": 0.5495842546224594, "epoch": 2.2091503267973858, "grad_norm": 0.03284912183880806, "learning_rate": 0.0002, "loss": 0.5446897745132446, "mean_token_accuracy": 0.7783705443143845, "num_tokens": 9644703.0, "step": 592 }, { "entropy": 0.5297017693519592, "epoch": 2.212885154061625, "grad_norm": 0.04696131870150566, "learning_rate": 0.0002, "loss": 0.5325087308883667, "mean_token_accuracy": 0.7825257629156113, "num_tokens": 9660837.0, "step": 593 }, { "entropy": 0.5143487304449081, "epoch": 2.216619981325864, "grad_norm": 0.03802449256181717, "learning_rate": 0.0002, "loss": 0.5171544551849365, "mean_token_accuracy": 0.7884373366832733, "num_tokens": 9676767.0, "step": 594 }, { "entropy": 0.5370919853448868, "epoch": 2.220354808590103, "grad_norm": 0.0338297039270401, "learning_rate": 0.0002, "loss": 0.5430178046226501, "mean_token_accuracy": 0.7808142453432083, "num_tokens": 9693155.0, "step": 595 }, { "entropy": 0.5210085138678551, "epoch": 2.224089635854342, "grad_norm": 0.04106014966964722, "learning_rate": 0.0002, "loss": 0.5282027125358582, "mean_token_accuracy": 0.7855826020240784, "num_tokens": 9709374.0, "step": 596 }, { "entropy": 0.516735278069973, "epoch": 2.227824463118581, "grad_norm": 0.03890896216034889, "learning_rate": 0.0002, "loss": 0.5173130035400391, "mean_token_accuracy": 0.7897009253501892, "num_tokens": 9725684.0, "step": 597 }, { "entropy": 0.5427378565073013, "epoch": 2.23155929038282, "grad_norm": 0.038357146084308624, "learning_rate": 0.0002, "loss": 0.5421530604362488, "mean_token_accuracy": 0.7789111882448196, "num_tokens": 9742139.0, "step": 598 }, { "entropy": 0.5454076677560806, "epoch": 2.235294117647059, "grad_norm": 0.037645429372787476, "learning_rate": 0.0002, "loss": 0.5428951978683472, "mean_token_accuracy": 0.7789873778820038, "num_tokens": 9758607.0, "step": 599 }, { "entropy": 0.5404632985591888, "epoch": 2.239028944911298, "grad_norm": 0.039499301463365555, "learning_rate": 0.0002, "loss": 0.5404936075210571, "mean_token_accuracy": 0.7793222069740295, "num_tokens": 9775018.0, "step": 600 }, { "entropy": 0.530501589179039, "epoch": 2.242763772175537, "grad_norm": 0.040064238011837006, "learning_rate": 0.0002, "loss": 0.5247729420661926, "mean_token_accuracy": 0.7874402105808258, "num_tokens": 9791324.0, "step": 601 }, { "entropy": 0.5369330644607544, "epoch": 2.246498599439776, "grad_norm": 0.037321336567401886, "learning_rate": 0.0002, "loss": 0.5377377271652222, "mean_token_accuracy": 0.782483384013176, "num_tokens": 9807623.0, "step": 602 }, { "entropy": 0.5428077727556229, "epoch": 2.250233426704015, "grad_norm": 0.03844759240746498, "learning_rate": 0.0002, "loss": 0.5476452112197876, "mean_token_accuracy": 0.776596188545227, "num_tokens": 9824163.0, "step": 603 }, { "entropy": 0.5409123748540878, "epoch": 2.253968253968254, "grad_norm": 0.03608860820531845, "learning_rate": 0.0002, "loss": 0.5434892177581787, "mean_token_accuracy": 0.7794551849365234, "num_tokens": 9840911.0, "step": 604 }, { "entropy": 0.5327287763357162, "epoch": 2.257703081232493, "grad_norm": 0.037285350263118744, "learning_rate": 0.0002, "loss": 0.5284507274627686, "mean_token_accuracy": 0.7831137478351593, "num_tokens": 9857296.0, "step": 605 }, { "entropy": 0.5373975485563278, "epoch": 2.261437908496732, "grad_norm": 0.03957006335258484, "learning_rate": 0.0002, "loss": 0.5341996550559998, "mean_token_accuracy": 0.7810620963573456, "num_tokens": 9873850.0, "step": 606 }, { "entropy": 0.5290782749652863, "epoch": 2.265172735760971, "grad_norm": 0.040026333183050156, "learning_rate": 0.0002, "loss": 0.5341078042984009, "mean_token_accuracy": 0.780807763338089, "num_tokens": 9890140.0, "step": 607 }, { "entropy": 0.5333269834518433, "epoch": 2.26890756302521, "grad_norm": 0.03453996032476425, "learning_rate": 0.0002, "loss": 0.5351240634918213, "mean_token_accuracy": 0.7807765603065491, "num_tokens": 9906713.0, "step": 608 }, { "entropy": 0.5285785049200058, "epoch": 2.272642390289449, "grad_norm": 0.04334354028105736, "learning_rate": 0.0002, "loss": 0.5339541435241699, "mean_token_accuracy": 0.7852969169616699, "num_tokens": 9922956.0, "step": 609 }, { "entropy": 0.5360069870948792, "epoch": 2.276377217553688, "grad_norm": 0.03924287483096123, "learning_rate": 0.0002, "loss": 0.5359759330749512, "mean_token_accuracy": 0.7819220721721649, "num_tokens": 9939069.0, "step": 610 }, { "entropy": 0.5435689836740494, "epoch": 2.280112044817927, "grad_norm": 0.037971340119838715, "learning_rate": 0.0002, "loss": 0.5404746532440186, "mean_token_accuracy": 0.779410719871521, "num_tokens": 9955662.0, "step": 611 }, { "entropy": 0.5445673018693924, "epoch": 2.283846872082166, "grad_norm": 0.03730984404683113, "learning_rate": 0.0002, "loss": 0.5418494939804077, "mean_token_accuracy": 0.7813813239336014, "num_tokens": 9971957.0, "step": 612 }, { "entropy": 0.5419572293758392, "epoch": 2.287581699346405, "grad_norm": 0.041550587862730026, "learning_rate": 0.0002, "loss": 0.5388907194137573, "mean_token_accuracy": 0.7824303805828094, "num_tokens": 9988368.0, "step": 613 }, { "entropy": 0.5267663449048996, "epoch": 2.291316526610644, "grad_norm": 0.03576701879501343, "learning_rate": 0.0002, "loss": 0.5305144786834717, "mean_token_accuracy": 0.784039780497551, "num_tokens": 10004679.0, "step": 614 }, { "entropy": 0.5274334847927094, "epoch": 2.295051353874883, "grad_norm": 0.03758349269628525, "learning_rate": 0.0002, "loss": 0.5307276844978333, "mean_token_accuracy": 0.7834599912166595, "num_tokens": 10021146.0, "step": 615 }, { "entropy": 0.5317062586545944, "epoch": 2.298786181139122, "grad_norm": 0.04096253216266632, "learning_rate": 0.0002, "loss": 0.5370841026306152, "mean_token_accuracy": 0.7823186218738556, "num_tokens": 10037433.0, "step": 616 }, { "entropy": 0.5429483950138092, "epoch": 2.302521008403361, "grad_norm": 0.04739284887909889, "learning_rate": 0.0002, "loss": 0.5423600673675537, "mean_token_accuracy": 0.7782929539680481, "num_tokens": 10053809.0, "step": 617 }, { "entropy": 0.5375621318817139, "epoch": 2.3062558356676, "grad_norm": 0.03586879000067711, "learning_rate": 0.0002, "loss": 0.5351011157035828, "mean_token_accuracy": 0.7827189415693283, "num_tokens": 10070403.0, "step": 618 }, { "entropy": 0.5332229882478714, "epoch": 2.309990662931839, "grad_norm": 0.039749447256326675, "learning_rate": 0.0002, "loss": 0.5316674113273621, "mean_token_accuracy": 0.7804222106933594, "num_tokens": 10086520.0, "step": 619 }, { "entropy": 0.5367073863744736, "epoch": 2.313725490196078, "grad_norm": 0.03144790232181549, "learning_rate": 0.0002, "loss": 0.5297679305076599, "mean_token_accuracy": 0.7887496650218964, "num_tokens": 10102990.0, "step": 620 }, { "entropy": 0.5359086692333221, "epoch": 2.317460317460317, "grad_norm": 0.03297298401594162, "learning_rate": 0.0002, "loss": 0.5307391881942749, "mean_token_accuracy": 0.7840029001235962, "num_tokens": 10119527.0, "step": 621 }, { "entropy": 0.5245398730039597, "epoch": 2.3211951447245567, "grad_norm": 0.04077174887061119, "learning_rate": 0.0002, "loss": 0.5315594673156738, "mean_token_accuracy": 0.7830156534910202, "num_tokens": 10135668.0, "step": 622 }, { "entropy": 0.5435569882392883, "epoch": 2.3249299719887957, "grad_norm": 0.037014640867710114, "learning_rate": 0.0002, "loss": 0.5420113801956177, "mean_token_accuracy": 0.7790663093328476, "num_tokens": 10152032.0, "step": 623 }, { "entropy": 0.5408807992935181, "epoch": 2.3286647992530347, "grad_norm": 0.040016841143369675, "learning_rate": 0.0002, "loss": 0.5407621264457703, "mean_token_accuracy": 0.7807969450950623, "num_tokens": 10168548.0, "step": 624 }, { "entropy": 0.5394706726074219, "epoch": 2.3323996265172737, "grad_norm": 0.038603588938713074, "learning_rate": 0.0002, "loss": 0.5371181964874268, "mean_token_accuracy": 0.7830179631710052, "num_tokens": 10185087.0, "step": 625 }, { "entropy": 0.5331766307353973, "epoch": 2.3361344537815127, "grad_norm": 0.03732473403215408, "learning_rate": 0.0002, "loss": 0.5403282046318054, "mean_token_accuracy": 0.7811668664216995, "num_tokens": 10201643.0, "step": 626 }, { "entropy": 0.5270423144102097, "epoch": 2.3398692810457518, "grad_norm": 0.039125435054302216, "learning_rate": 0.0002, "loss": 0.5314643383026123, "mean_token_accuracy": 0.7847079634666443, "num_tokens": 10218028.0, "step": 627 }, { "entropy": 0.5217768847942352, "epoch": 2.3436041083099908, "grad_norm": 0.031856924295425415, "learning_rate": 0.0002, "loss": 0.5211607813835144, "mean_token_accuracy": 0.787055104970932, "num_tokens": 10234302.0, "step": 628 }, { "entropy": 0.5297789797186852, "epoch": 2.34733893557423, "grad_norm": 0.044731732457876205, "learning_rate": 0.0002, "loss": 0.5366175174713135, "mean_token_accuracy": 0.7815698832273483, "num_tokens": 10250527.0, "step": 629 }, { "entropy": 0.5372533053159714, "epoch": 2.351073762838469, "grad_norm": 0.03578559309244156, "learning_rate": 0.0002, "loss": 0.5398249626159668, "mean_token_accuracy": 0.782914400100708, "num_tokens": 10266845.0, "step": 630 }, { "entropy": 0.5397268682718277, "epoch": 2.354808590102708, "grad_norm": 0.04053846001625061, "learning_rate": 0.0002, "loss": 0.5417327880859375, "mean_token_accuracy": 0.7793487906455994, "num_tokens": 10283134.0, "step": 631 }, { "entropy": 0.5457513332366943, "epoch": 2.358543417366947, "grad_norm": 0.039855144917964935, "learning_rate": 0.0002, "loss": 0.5377854704856873, "mean_token_accuracy": 0.7803535759449005, "num_tokens": 10299673.0, "step": 632 }, { "entropy": 0.5374201238155365, "epoch": 2.362278244631186, "grad_norm": 0.03583669289946556, "learning_rate": 0.0002, "loss": 0.5346733331680298, "mean_token_accuracy": 0.7818557769060135, "num_tokens": 10316146.0, "step": 633 }, { "entropy": 0.5228708907961845, "epoch": 2.366013071895425, "grad_norm": 0.0356278158724308, "learning_rate": 0.0002, "loss": 0.5220701694488525, "mean_token_accuracy": 0.7894868850708008, "num_tokens": 10332482.0, "step": 634 }, { "entropy": 0.5448856949806213, "epoch": 2.369747899159664, "grad_norm": 0.045307550579309464, "learning_rate": 0.0002, "loss": 0.555870771408081, "mean_token_accuracy": 0.7770739197731018, "num_tokens": 10348970.0, "step": 635 }, { "entropy": 0.5384282767772675, "epoch": 2.373482726423903, "grad_norm": 0.03949993476271629, "learning_rate": 0.0002, "loss": 0.5424531102180481, "mean_token_accuracy": 0.7787595987319946, "num_tokens": 10365074.0, "step": 636 }, { "entropy": 0.532962828874588, "epoch": 2.377217553688142, "grad_norm": 0.0345122404396534, "learning_rate": 0.0002, "loss": 0.5286644697189331, "mean_token_accuracy": 0.7851764559745789, "num_tokens": 10381036.0, "step": 637 }, { "entropy": 0.5396641790866852, "epoch": 2.380952380952381, "grad_norm": 0.038070570677518845, "learning_rate": 0.0002, "loss": 0.5350325703620911, "mean_token_accuracy": 0.783138781785965, "num_tokens": 10397441.0, "step": 638 }, { "entropy": 0.5453281551599503, "epoch": 2.38468720821662, "grad_norm": 0.03477659448981285, "learning_rate": 0.0002, "loss": 0.5431845188140869, "mean_token_accuracy": 0.7779907435178757, "num_tokens": 10413843.0, "step": 639 }, { "entropy": 0.5235247910022736, "epoch": 2.388422035480859, "grad_norm": 0.04054819047451019, "learning_rate": 0.0002, "loss": 0.5272566080093384, "mean_token_accuracy": 0.7897930145263672, "num_tokens": 10430041.0, "step": 640 }, { "entropy": 0.5263708084821701, "epoch": 2.392156862745098, "grad_norm": 0.042338334023952484, "learning_rate": 0.0002, "loss": 0.5340385437011719, "mean_token_accuracy": 0.7824059575796127, "num_tokens": 10446100.0, "step": 641 }, { "entropy": 0.543594166636467, "epoch": 2.395891690009337, "grad_norm": 0.04357817769050598, "learning_rate": 0.0002, "loss": 0.5377992391586304, "mean_token_accuracy": 0.781853511929512, "num_tokens": 10462519.0, "step": 642 }, { "entropy": 0.5444612801074982, "epoch": 2.399626517273576, "grad_norm": 0.03883645310997963, "learning_rate": 0.0002, "loss": 0.5423793196678162, "mean_token_accuracy": 0.7786720097064972, "num_tokens": 10478807.0, "step": 643 }, { "entropy": 0.5298498719930649, "epoch": 2.403361344537815, "grad_norm": 0.03690332546830177, "learning_rate": 0.0002, "loss": 0.5272641181945801, "mean_token_accuracy": 0.782812237739563, "num_tokens": 10494864.0, "step": 644 }, { "entropy": 0.528311550617218, "epoch": 2.407096171802054, "grad_norm": 0.04098167642951012, "learning_rate": 0.0002, "loss": 0.5349369049072266, "mean_token_accuracy": 0.7804581671953201, "num_tokens": 10511211.0, "step": 645 }, { "entropy": 0.5355981737375259, "epoch": 2.410830999066293, "grad_norm": 0.040713947266340256, "learning_rate": 0.0002, "loss": 0.5427882075309753, "mean_token_accuracy": 0.7789115309715271, "num_tokens": 10527252.0, "step": 646 }, { "entropy": 0.5335679203271866, "epoch": 2.414565826330532, "grad_norm": 0.03578624129295349, "learning_rate": 0.0002, "loss": 0.5237961411476135, "mean_token_accuracy": 0.7891885042190552, "num_tokens": 10543508.0, "step": 647 }, { "entropy": 0.550647184252739, "epoch": 2.418300653594771, "grad_norm": 0.041548412293195724, "learning_rate": 0.0002, "loss": 0.5482417941093445, "mean_token_accuracy": 0.7743094116449356, "num_tokens": 10559883.0, "step": 648 }, { "entropy": 0.5099608227610588, "epoch": 2.42203548085901, "grad_norm": 0.035532381385564804, "learning_rate": 0.0002, "loss": 0.5146307349205017, "mean_token_accuracy": 0.7876965999603271, "num_tokens": 10576102.0, "step": 649 }, { "entropy": 0.5289439111948013, "epoch": 2.425770308123249, "grad_norm": 0.03995847702026367, "learning_rate": 0.0002, "loss": 0.529523491859436, "mean_token_accuracy": 0.7849718630313873, "num_tokens": 10592278.0, "step": 650 }, { "entropy": 0.5271874070167542, "epoch": 2.429505135387488, "grad_norm": 0.038978736847639084, "learning_rate": 0.0002, "loss": 0.5379216074943542, "mean_token_accuracy": 0.7814365327358246, "num_tokens": 10608707.0, "step": 651 }, { "entropy": 0.5211434736847878, "epoch": 2.4332399626517276, "grad_norm": 0.04277133196592331, "learning_rate": 0.0002, "loss": 0.5224626660346985, "mean_token_accuracy": 0.7893835753202438, "num_tokens": 10625209.0, "step": 652 }, { "entropy": 0.5353395342826843, "epoch": 2.4369747899159666, "grad_norm": 0.03804321959614754, "learning_rate": 0.0002, "loss": 0.5317578911781311, "mean_token_accuracy": 0.7827101796865463, "num_tokens": 10641678.0, "step": 653 }, { "entropy": 0.5419681817293167, "epoch": 2.4407096171802056, "grad_norm": 0.03237481042742729, "learning_rate": 0.0002, "loss": 0.5347220301628113, "mean_token_accuracy": 0.7828710377216339, "num_tokens": 10658020.0, "step": 654 }, { "entropy": 0.54988232254982, "epoch": 2.4444444444444446, "grad_norm": 0.0367792509496212, "learning_rate": 0.0002, "loss": 0.548277735710144, "mean_token_accuracy": 0.7793003767728806, "num_tokens": 10674273.0, "step": 655 }, { "entropy": 0.5270714908838272, "epoch": 2.4481792717086837, "grad_norm": 0.04078115150332451, "learning_rate": 0.0002, "loss": 0.5275436639785767, "mean_token_accuracy": 0.7857778370380402, "num_tokens": 10690682.0, "step": 656 }, { "entropy": 0.5229745805263519, "epoch": 2.4519140989729227, "grad_norm": 0.03635413572192192, "learning_rate": 0.0002, "loss": 0.5259315967559814, "mean_token_accuracy": 0.7876160591840744, "num_tokens": 10706935.0, "step": 657 }, { "entropy": 0.520149365067482, "epoch": 2.4556489262371617, "grad_norm": 0.04523176699876785, "learning_rate": 0.0002, "loss": 0.5284128189086914, "mean_token_accuracy": 0.7826286852359772, "num_tokens": 10723130.0, "step": 658 }, { "entropy": 0.5226980745792389, "epoch": 2.4593837535014007, "grad_norm": 0.04385685920715332, "learning_rate": 0.0002, "loss": 0.5277330279350281, "mean_token_accuracy": 0.7842638790607452, "num_tokens": 10739706.0, "step": 659 }, { "entropy": 0.5327855497598648, "epoch": 2.4631185807656397, "grad_norm": 0.03833289071917534, "learning_rate": 0.0002, "loss": 0.529242753982544, "mean_token_accuracy": 0.784681499004364, "num_tokens": 10756135.0, "step": 660 }, { "entropy": 0.5270693749189377, "epoch": 2.4668534080298787, "grad_norm": 0.04420669376850128, "learning_rate": 0.0002, "loss": 0.5234766602516174, "mean_token_accuracy": 0.7881586104631424, "num_tokens": 10772473.0, "step": 661 }, { "entropy": 0.5432615429162979, "epoch": 2.4705882352941178, "grad_norm": 0.03388570621609688, "learning_rate": 0.0002, "loss": 0.5459257364273071, "mean_token_accuracy": 0.7780051380395889, "num_tokens": 10788831.0, "step": 662 }, { "entropy": 0.5488771200180054, "epoch": 2.4743230625583568, "grad_norm": 0.04762876406311989, "learning_rate": 0.0002, "loss": 0.5534912943840027, "mean_token_accuracy": 0.7749715596437454, "num_tokens": 10805527.0, "step": 663 }, { "entropy": 0.5422950983047485, "epoch": 2.478057889822596, "grad_norm": 0.03591262549161911, "learning_rate": 0.0002, "loss": 0.5398073792457581, "mean_token_accuracy": 0.779995933175087, "num_tokens": 10821915.0, "step": 664 }, { "entropy": 0.567908450961113, "epoch": 2.481792717086835, "grad_norm": 0.04293651878833771, "learning_rate": 0.0002, "loss": 0.5645220875740051, "mean_token_accuracy": 0.771768257021904, "num_tokens": 10838601.0, "step": 665 }, { "entropy": 0.534419909119606, "epoch": 2.485527544351074, "grad_norm": 0.036424651741981506, "learning_rate": 0.0002, "loss": 0.5309603214263916, "mean_token_accuracy": 0.786093220114708, "num_tokens": 10854981.0, "step": 666 }, { "entropy": 0.5380399525165558, "epoch": 2.489262371615313, "grad_norm": 0.04585183784365654, "learning_rate": 0.0002, "loss": 0.5384916067123413, "mean_token_accuracy": 0.7809207290410995, "num_tokens": 10871328.0, "step": 667 }, { "entropy": 0.5118337720632553, "epoch": 2.492997198879552, "grad_norm": 0.03870607912540436, "learning_rate": 0.0002, "loss": 0.5148553252220154, "mean_token_accuracy": 0.7906211614608765, "num_tokens": 10887543.0, "step": 668 }, { "entropy": 0.539421871304512, "epoch": 2.496732026143791, "grad_norm": 0.04092569276690483, "learning_rate": 0.0002, "loss": 0.5474343299865723, "mean_token_accuracy": 0.7810823172330856, "num_tokens": 10904063.0, "step": 669 }, { "entropy": 0.5357869118452072, "epoch": 2.50046685340803, "grad_norm": 0.03857175633311272, "learning_rate": 0.0002, "loss": 0.5365599393844604, "mean_token_accuracy": 0.7816625684499741, "num_tokens": 10920474.0, "step": 670 }, { "entropy": 0.5330220460891724, "epoch": 2.504201680672269, "grad_norm": 0.03685252368450165, "learning_rate": 0.0002, "loss": 0.5331542491912842, "mean_token_accuracy": 0.7820776700973511, "num_tokens": 10936663.0, "step": 671 }, { "entropy": 0.524094969034195, "epoch": 2.507936507936508, "grad_norm": 0.03893151134252548, "learning_rate": 0.0002, "loss": 0.5277613997459412, "mean_token_accuracy": 0.7860450148582458, "num_tokens": 10952950.0, "step": 672 }, { "entropy": 0.5463172346353531, "epoch": 2.511671335200747, "grad_norm": 0.039967626333236694, "learning_rate": 0.0002, "loss": 0.5425282716751099, "mean_token_accuracy": 0.7801816016435623, "num_tokens": 10969412.0, "step": 673 }, { "entropy": 0.5239230394363403, "epoch": 2.515406162464986, "grad_norm": 0.046231936663389206, "learning_rate": 0.0002, "loss": 0.5241309404373169, "mean_token_accuracy": 0.787441685795784, "num_tokens": 10985869.0, "step": 674 }, { "entropy": 0.5359321981668472, "epoch": 2.519140989729225, "grad_norm": 0.040779855102300644, "learning_rate": 0.0002, "loss": 0.536766767501831, "mean_token_accuracy": 0.7817385196685791, "num_tokens": 11002074.0, "step": 675 }, { "entropy": 0.5319357812404633, "epoch": 2.522875816993464, "grad_norm": 0.03476366400718689, "learning_rate": 0.0002, "loss": 0.5311717391014099, "mean_token_accuracy": 0.7856648862361908, "num_tokens": 11018648.0, "step": 676 }, { "entropy": 0.5231706351041794, "epoch": 2.526610644257703, "grad_norm": 0.03785642236471176, "learning_rate": 0.0002, "loss": 0.5269960165023804, "mean_token_accuracy": 0.7866117358207703, "num_tokens": 11034686.0, "step": 677 }, { "entropy": 0.5381273478269577, "epoch": 2.530345471521942, "grad_norm": 0.03976747393608093, "learning_rate": 0.0002, "loss": 0.5381407141685486, "mean_token_accuracy": 0.7849810570478439, "num_tokens": 11050922.0, "step": 678 }, { "entropy": 0.5456480979919434, "epoch": 2.534080298786181, "grad_norm": 0.039225250482559204, "learning_rate": 0.0002, "loss": 0.5425232648849487, "mean_token_accuracy": 0.7785615175962448, "num_tokens": 11067148.0, "step": 679 }, { "entropy": 0.5407412797212601, "epoch": 2.53781512605042, "grad_norm": 0.03705086559057236, "learning_rate": 0.0002, "loss": 0.536932110786438, "mean_token_accuracy": 0.7821008861064911, "num_tokens": 11083363.0, "step": 680 }, { "entropy": 0.5263440012931824, "epoch": 2.541549953314659, "grad_norm": 0.0353594608604908, "learning_rate": 0.0002, "loss": 0.5256474018096924, "mean_token_accuracy": 0.7849348187446594, "num_tokens": 11099785.0, "step": 681 }, { "entropy": 0.5354757159948349, "epoch": 2.545284780578898, "grad_norm": 0.04532964155077934, "learning_rate": 0.0002, "loss": 0.5450004935264587, "mean_token_accuracy": 0.7807293385267258, "num_tokens": 11115892.0, "step": 682 }, { "entropy": 0.5281579941511154, "epoch": 2.549019607843137, "grad_norm": 0.03604253754019737, "learning_rate": 0.0002, "loss": 0.5311046838760376, "mean_token_accuracy": 0.7845126688480377, "num_tokens": 11132189.0, "step": 683 }, { "entropy": 0.5354526489973068, "epoch": 2.552754435107376, "grad_norm": 0.03747657313942909, "learning_rate": 0.0002, "loss": 0.5361717343330383, "mean_token_accuracy": 0.7801252007484436, "num_tokens": 11148681.0, "step": 684 }, { "entropy": 0.5386267453432083, "epoch": 2.556489262371615, "grad_norm": 0.037825409322977066, "learning_rate": 0.0002, "loss": 0.5390512347221375, "mean_token_accuracy": 0.7815877050161362, "num_tokens": 11165013.0, "step": 685 }, { "entropy": 0.530585527420044, "epoch": 2.560224089635854, "grad_norm": 0.03970746695995331, "learning_rate": 0.0002, "loss": 0.5291422009468079, "mean_token_accuracy": 0.7839111536741257, "num_tokens": 11181301.0, "step": 686 }, { "entropy": 0.5292850136756897, "epoch": 2.563958916900093, "grad_norm": 0.03387298434972763, "learning_rate": 0.0002, "loss": 0.5319269299507141, "mean_token_accuracy": 0.7840193659067154, "num_tokens": 11197537.0, "step": 687 }, { "entropy": 0.5399095267057419, "epoch": 2.567693744164332, "grad_norm": 0.038681600242853165, "learning_rate": 0.0002, "loss": 0.5435532331466675, "mean_token_accuracy": 0.7806709408760071, "num_tokens": 11213896.0, "step": 688 }, { "entropy": 0.5498056858778, "epoch": 2.571428571428571, "grad_norm": 0.03758297860622406, "learning_rate": 0.0002, "loss": 0.5467256307601929, "mean_token_accuracy": 0.7782751470804214, "num_tokens": 11230383.0, "step": 689 }, { "entropy": 0.5476771891117096, "epoch": 2.57516339869281, "grad_norm": 0.03605665639042854, "learning_rate": 0.0002, "loss": 0.541588544845581, "mean_token_accuracy": 0.7791445404291153, "num_tokens": 11246749.0, "step": 690 }, { "entropy": 0.542407214641571, "epoch": 2.5788982259570497, "grad_norm": 0.04616822302341461, "learning_rate": 0.0002, "loss": 0.535969614982605, "mean_token_accuracy": 0.7812883108854294, "num_tokens": 11263093.0, "step": 691 }, { "entropy": 0.5215721130371094, "epoch": 2.5826330532212887, "grad_norm": 0.040278688073158264, "learning_rate": 0.0002, "loss": 0.5306443572044373, "mean_token_accuracy": 0.783096119761467, "num_tokens": 11279295.0, "step": 692 }, { "entropy": 0.5300876200199127, "epoch": 2.5863678804855277, "grad_norm": 0.04465034604072571, "learning_rate": 0.0002, "loss": 0.5408331751823425, "mean_token_accuracy": 0.781616821885109, "num_tokens": 11295488.0, "step": 693 }, { "entropy": 0.529060423374176, "epoch": 2.5901027077497667, "grad_norm": 0.03697149083018303, "learning_rate": 0.0002, "loss": 0.5315713286399841, "mean_token_accuracy": 0.784434586763382, "num_tokens": 11311910.0, "step": 694 }, { "entropy": 0.5421274900436401, "epoch": 2.5938375350140057, "grad_norm": 0.03769063949584961, "learning_rate": 0.0002, "loss": 0.5342295169830322, "mean_token_accuracy": 0.7821343541145325, "num_tokens": 11328227.0, "step": 695 }, { "entropy": 0.5521349608898163, "epoch": 2.5975723622782447, "grad_norm": 0.037369053810834885, "learning_rate": 0.0002, "loss": 0.5406404733657837, "mean_token_accuracy": 0.7816728502511978, "num_tokens": 11344754.0, "step": 696 }, { "entropy": 0.5276040434837341, "epoch": 2.6013071895424837, "grad_norm": 0.04295807331800461, "learning_rate": 0.0002, "loss": 0.531209409236908, "mean_token_accuracy": 0.781694307923317, "num_tokens": 11360846.0, "step": 697 }, { "entropy": 0.5329545885324478, "epoch": 2.6050420168067228, "grad_norm": 0.04680144414305687, "learning_rate": 0.0002, "loss": 0.5448673963546753, "mean_token_accuracy": 0.7803032696247101, "num_tokens": 11376984.0, "step": 698 }, { "entropy": 0.5330372750759125, "epoch": 2.6087768440709618, "grad_norm": 0.038128506392240524, "learning_rate": 0.0002, "loss": 0.5345317125320435, "mean_token_accuracy": 0.7848279774188995, "num_tokens": 11393192.0, "step": 699 }, { "entropy": 0.5620173513889313, "epoch": 2.612511671335201, "grad_norm": 0.0405871607363224, "learning_rate": 0.0002, "loss": 0.5558884143829346, "mean_token_accuracy": 0.7717028856277466, "num_tokens": 11409571.0, "step": 700 }, { "entropy": 0.5401062965393066, "epoch": 2.61624649859944, "grad_norm": 0.033952489495277405, "learning_rate": 0.0002, "loss": 0.5324668884277344, "mean_token_accuracy": 0.7836252152919769, "num_tokens": 11426157.0, "step": 701 }, { "entropy": 0.5401272624731064, "epoch": 2.619981325863679, "grad_norm": 0.03486888110637665, "learning_rate": 0.0002, "loss": 0.5405600666999817, "mean_token_accuracy": 0.780670240521431, "num_tokens": 11442706.0, "step": 702 }, { "entropy": 0.5286990851163864, "epoch": 2.623716153127918, "grad_norm": 0.03971569985151291, "learning_rate": 0.0002, "loss": 0.5301419496536255, "mean_token_accuracy": 0.7845329642295837, "num_tokens": 11459059.0, "step": 703 }, { "entropy": 0.5408699810504913, "epoch": 2.627450980392157, "grad_norm": 0.03566860780119896, "learning_rate": 0.0002, "loss": 0.5422340631484985, "mean_token_accuracy": 0.7786179780960083, "num_tokens": 11475473.0, "step": 704 }, { "entropy": 0.5306770950555801, "epoch": 2.631185807656396, "grad_norm": 0.038531865924596786, "learning_rate": 0.0002, "loss": 0.5311087965965271, "mean_token_accuracy": 0.784186452627182, "num_tokens": 11491765.0, "step": 705 }, { "entropy": 0.5391299277544022, "epoch": 2.634920634920635, "grad_norm": 0.036147549748420715, "learning_rate": 0.0002, "loss": 0.5403758883476257, "mean_token_accuracy": 0.7817845791578293, "num_tokens": 11508291.0, "step": 706 }, { "entropy": 0.5316940769553185, "epoch": 2.638655462184874, "grad_norm": 0.036513980478048325, "learning_rate": 0.0002, "loss": 0.5340716242790222, "mean_token_accuracy": 0.7847382575273514, "num_tokens": 11524811.0, "step": 707 }, { "entropy": 0.5237598121166229, "epoch": 2.642390289449113, "grad_norm": 0.03360476344823837, "learning_rate": 0.0002, "loss": 0.5258880257606506, "mean_token_accuracy": 0.7865117788314819, "num_tokens": 11541335.0, "step": 708 }, { "entropy": 0.5325336754322052, "epoch": 2.646125116713352, "grad_norm": 0.03501066192984581, "learning_rate": 0.0002, "loss": 0.5358341336250305, "mean_token_accuracy": 0.7841547876596451, "num_tokens": 11557859.0, "step": 709 }, { "entropy": 0.5220260694622993, "epoch": 2.649859943977591, "grad_norm": 0.038072340190410614, "learning_rate": 0.0002, "loss": 0.5222914814949036, "mean_token_accuracy": 0.7870404571294785, "num_tokens": 11574116.0, "step": 710 }, { "entropy": 0.5257419422268867, "epoch": 2.65359477124183, "grad_norm": 0.03713792935013771, "learning_rate": 0.0002, "loss": 0.5267120599746704, "mean_token_accuracy": 0.7852788418531418, "num_tokens": 11590295.0, "step": 711 }, { "entropy": 0.5383759438991547, "epoch": 2.657329598506069, "grad_norm": 0.04603256285190582, "learning_rate": 0.0002, "loss": 0.5421494841575623, "mean_token_accuracy": 0.781136080622673, "num_tokens": 11606581.0, "step": 712 }, { "entropy": 0.5336297005414963, "epoch": 2.661064425770308, "grad_norm": 0.03931435942649841, "learning_rate": 0.0002, "loss": 0.5313882231712341, "mean_token_accuracy": 0.7825400978326797, "num_tokens": 11622793.0, "step": 713 }, { "entropy": 0.5316190719604492, "epoch": 2.664799253034547, "grad_norm": 0.03564710542559624, "learning_rate": 0.0002, "loss": 0.530137836933136, "mean_token_accuracy": 0.7861842215061188, "num_tokens": 11638909.0, "step": 714 }, { "entropy": 0.529007188975811, "epoch": 2.668534080298786, "grad_norm": 0.03671964257955551, "learning_rate": 0.0002, "loss": 0.5294506549835205, "mean_token_accuracy": 0.7843856066465378, "num_tokens": 11655048.0, "step": 715 }, { "entropy": 0.5391807407140732, "epoch": 2.6722689075630255, "grad_norm": 0.043020427227020264, "learning_rate": 0.0002, "loss": 0.5463923215866089, "mean_token_accuracy": 0.7798423320055008, "num_tokens": 11671616.0, "step": 716 }, { "entropy": 0.5369218289852142, "epoch": 2.6760037348272645, "grad_norm": 0.04039768502116203, "learning_rate": 0.0002, "loss": 0.5428373217582703, "mean_token_accuracy": 0.7810155898332596, "num_tokens": 11687981.0, "step": 717 }, { "entropy": 0.5410373359918594, "epoch": 2.6797385620915035, "grad_norm": 0.032212115824222565, "learning_rate": 0.0002, "loss": 0.538726806640625, "mean_token_accuracy": 0.7823937833309174, "num_tokens": 11704497.0, "step": 718 }, { "entropy": 0.5408433228731155, "epoch": 2.6834733893557425, "grad_norm": 0.04190416634082794, "learning_rate": 0.0002, "loss": 0.5312804579734802, "mean_token_accuracy": 0.7858607023954391, "num_tokens": 11720759.0, "step": 719 }, { "entropy": 0.52065759152174, "epoch": 2.6872082166199815, "grad_norm": 0.03749416023492813, "learning_rate": 0.0002, "loss": 0.5172442197799683, "mean_token_accuracy": 0.7908001989126205, "num_tokens": 11736897.0, "step": 720 }, { "entropy": 0.5223864614963531, "epoch": 2.6909430438842206, "grad_norm": 0.03889421746134758, "learning_rate": 0.0002, "loss": 0.5262103080749512, "mean_token_accuracy": 0.7863954603672028, "num_tokens": 11753026.0, "step": 721 }, { "entropy": 0.5417105704545975, "epoch": 2.6946778711484596, "grad_norm": 0.03900585323572159, "learning_rate": 0.0002, "loss": 0.548478901386261, "mean_token_accuracy": 0.7769544124603271, "num_tokens": 11769364.0, "step": 722 }, { "entropy": 0.5348965376615524, "epoch": 2.6984126984126986, "grad_norm": 0.040531598031520844, "learning_rate": 0.0002, "loss": 0.5366338491439819, "mean_token_accuracy": 0.7824574261903763, "num_tokens": 11785662.0, "step": 723 }, { "entropy": 0.5472202748060226, "epoch": 2.7021475256769376, "grad_norm": 0.03544607013463974, "learning_rate": 0.0002, "loss": 0.546108067035675, "mean_token_accuracy": 0.7778937071561813, "num_tokens": 11802091.0, "step": 724 }, { "entropy": 0.5445298254489899, "epoch": 2.7058823529411766, "grad_norm": 0.045996710658073425, "learning_rate": 0.0002, "loss": 0.5458025336265564, "mean_token_accuracy": 0.7784214168787003, "num_tokens": 11818307.0, "step": 725 }, { "entropy": 0.5437731146812439, "epoch": 2.7096171802054156, "grad_norm": 0.040692199021577835, "learning_rate": 0.0002, "loss": 0.5425392389297485, "mean_token_accuracy": 0.7800037413835526, "num_tokens": 11834733.0, "step": 726 }, { "entropy": 0.5586313903331757, "epoch": 2.7133520074696547, "grad_norm": 0.05102645978331566, "learning_rate": 0.0002, "loss": 0.5646232962608337, "mean_token_accuracy": 0.7713905870914459, "num_tokens": 11851346.0, "step": 727 }, { "entropy": 0.5276175439357758, "epoch": 2.7170868347338937, "grad_norm": 0.04199473559856415, "learning_rate": 0.0002, "loss": 0.5330867767333984, "mean_token_accuracy": 0.7841922342777252, "num_tokens": 11867709.0, "step": 728 }, { "entropy": 0.5365078300237656, "epoch": 2.7208216619981327, "grad_norm": 0.038084954023361206, "learning_rate": 0.0002, "loss": 0.5328811407089233, "mean_token_accuracy": 0.7830130755901337, "num_tokens": 11884172.0, "step": 729 }, { "entropy": 0.5306914746761322, "epoch": 2.7245564892623717, "grad_norm": 0.04009576886892319, "learning_rate": 0.0002, "loss": 0.5335056185722351, "mean_token_accuracy": 0.784161165356636, "num_tokens": 11900524.0, "step": 730 }, { "entropy": 0.5325679033994675, "epoch": 2.7282913165266107, "grad_norm": 0.0398661270737648, "learning_rate": 0.0002, "loss": 0.5311678051948547, "mean_token_accuracy": 0.7866542786359787, "num_tokens": 11916696.0, "step": 731 }, { "entropy": 0.5234319120645523, "epoch": 2.7320261437908497, "grad_norm": 0.03887765109539032, "learning_rate": 0.0002, "loss": 0.5243536233901978, "mean_token_accuracy": 0.786685973405838, "num_tokens": 11933375.0, "step": 732 }, { "entropy": 0.5323622822761536, "epoch": 2.7357609710550888, "grad_norm": 0.041390158236026764, "learning_rate": 0.0002, "loss": 0.5382110476493835, "mean_token_accuracy": 0.7813025563955307, "num_tokens": 11949641.0, "step": 733 }, { "entropy": 0.5282771736383438, "epoch": 2.7394957983193278, "grad_norm": 0.03821795806288719, "learning_rate": 0.0002, "loss": 0.5237923860549927, "mean_token_accuracy": 0.7858958840370178, "num_tokens": 11965904.0, "step": 734 }, { "entropy": 0.5336133688688278, "epoch": 2.743230625583567, "grad_norm": 0.040790773928165436, "learning_rate": 0.0002, "loss": 0.5322080850601196, "mean_token_accuracy": 0.7814221978187561, "num_tokens": 11982242.0, "step": 735 }, { "entropy": 0.5447276085615158, "epoch": 2.746965452847806, "grad_norm": 0.03733038902282715, "learning_rate": 0.0002, "loss": 0.5435236096382141, "mean_token_accuracy": 0.7783806473016739, "num_tokens": 11998525.0, "step": 736 }, { "entropy": 0.5370974391698837, "epoch": 2.750700280112045, "grad_norm": 0.035691265016794205, "learning_rate": 0.0002, "loss": 0.5391957759857178, "mean_token_accuracy": 0.7787430435419083, "num_tokens": 12014726.0, "step": 737 }, { "entropy": 0.5190877616405487, "epoch": 2.754435107376284, "grad_norm": 0.037242453545331955, "learning_rate": 0.0002, "loss": 0.5239222645759583, "mean_token_accuracy": 0.7867171913385391, "num_tokens": 12030648.0, "step": 738 }, { "entropy": 0.5201060324907303, "epoch": 2.758169934640523, "grad_norm": 0.03840528428554535, "learning_rate": 0.0002, "loss": 0.5264686942100525, "mean_token_accuracy": 0.7854082137346268, "num_tokens": 12046824.0, "step": 739 }, { "entropy": 0.5208890736103058, "epoch": 2.761904761904762, "grad_norm": 0.038443028926849365, "learning_rate": 0.0002, "loss": 0.5207111239433289, "mean_token_accuracy": 0.7860049307346344, "num_tokens": 12063182.0, "step": 740 }, { "entropy": 0.5337280184030533, "epoch": 2.765639589169001, "grad_norm": 0.0408535934984684, "learning_rate": 0.0002, "loss": 0.5295891165733337, "mean_token_accuracy": 0.7857932895421982, "num_tokens": 12079411.0, "step": 741 }, { "entropy": 0.5374506562948227, "epoch": 2.76937441643324, "grad_norm": 0.04354558512568474, "learning_rate": 0.0002, "loss": 0.5414345860481262, "mean_token_accuracy": 0.7807870209217072, "num_tokens": 12095874.0, "step": 742 }, { "entropy": 0.5360343009233475, "epoch": 2.773109243697479, "grad_norm": 0.03928976133465767, "learning_rate": 0.0002, "loss": 0.5380703210830688, "mean_token_accuracy": 0.7798075079917908, "num_tokens": 12112215.0, "step": 743 }, { "entropy": 0.5264292061328888, "epoch": 2.776844070961718, "grad_norm": 0.03775021806359291, "learning_rate": 0.0002, "loss": 0.5281617045402527, "mean_token_accuracy": 0.7842919081449509, "num_tokens": 12128361.0, "step": 744 }, { "entropy": 0.5419831871986389, "epoch": 2.780578898225957, "grad_norm": 0.032331038266420364, "learning_rate": 0.0002, "loss": 0.5362944602966309, "mean_token_accuracy": 0.7816326916217804, "num_tokens": 12144755.0, "step": 745 }, { "entropy": 0.5174460113048553, "epoch": 2.784313725490196, "grad_norm": 0.03798742592334747, "learning_rate": 0.0002, "loss": 0.515007495880127, "mean_token_accuracy": 0.7882062345743179, "num_tokens": 12161034.0, "step": 746 }, { "entropy": 0.5355328992009163, "epoch": 2.788048552754435, "grad_norm": 0.036557331681251526, "learning_rate": 0.0002, "loss": 0.5344611406326294, "mean_token_accuracy": 0.7847500294446945, "num_tokens": 12177479.0, "step": 747 }, { "entropy": 0.538584902882576, "epoch": 2.791783380018674, "grad_norm": 0.039520300924777985, "learning_rate": 0.0002, "loss": 0.5427792072296143, "mean_token_accuracy": 0.7786386609077454, "num_tokens": 12193830.0, "step": 748 }, { "entropy": 0.51973095536232, "epoch": 2.795518207282913, "grad_norm": 0.04126165434718132, "learning_rate": 0.0002, "loss": 0.5279180407524109, "mean_token_accuracy": 0.784518226981163, "num_tokens": 12210022.0, "step": 749 }, { "entropy": 0.5385647118091583, "epoch": 2.799253034547152, "grad_norm": 0.03742329403758049, "learning_rate": 0.0002, "loss": 0.5358390808105469, "mean_token_accuracy": 0.7814119607210159, "num_tokens": 12226184.0, "step": 750 }, { "entropy": 0.5483904033899307, "epoch": 2.802987861811391, "grad_norm": 0.03444087877869606, "learning_rate": 0.0002, "loss": 0.5442800521850586, "mean_token_accuracy": 0.7782953381538391, "num_tokens": 12242564.0, "step": 751 }, { "entropy": 0.5447859466075897, "epoch": 2.80672268907563, "grad_norm": 0.037425972521305084, "learning_rate": 0.0002, "loss": 0.5376838445663452, "mean_token_accuracy": 0.7805659919977188, "num_tokens": 12259077.0, "step": 752 }, { "entropy": 0.526421070098877, "epoch": 2.810457516339869, "grad_norm": 0.039544545114040375, "learning_rate": 0.0002, "loss": 0.5272819399833679, "mean_token_accuracy": 0.7836880385875702, "num_tokens": 12275297.0, "step": 753 }, { "entropy": 0.538783460855484, "epoch": 2.814192343604108, "grad_norm": 0.035788971930742264, "learning_rate": 0.0002, "loss": 0.5417999625205994, "mean_token_accuracy": 0.7819748818874359, "num_tokens": 12291643.0, "step": 754 }, { "entropy": 0.5367716252803802, "epoch": 2.817927170868347, "grad_norm": 0.040753189474344254, "learning_rate": 0.0002, "loss": 0.5376288294792175, "mean_token_accuracy": 0.7829637825489044, "num_tokens": 12307987.0, "step": 755 }, { "entropy": 0.5418078452348709, "epoch": 2.821661998132586, "grad_norm": 0.036726806312799454, "learning_rate": 0.0002, "loss": 0.5469898581504822, "mean_token_accuracy": 0.7801835685968399, "num_tokens": 12324503.0, "step": 756 }, { "entropy": 0.525896355509758, "epoch": 2.825396825396825, "grad_norm": 0.034559980034828186, "learning_rate": 0.0002, "loss": 0.5265108942985535, "mean_token_accuracy": 0.7867930829524994, "num_tokens": 12340881.0, "step": 757 }, { "entropy": 0.5369487851858139, "epoch": 2.8291316526610646, "grad_norm": 0.03595944494009018, "learning_rate": 0.0002, "loss": 0.5396771430969238, "mean_token_accuracy": 0.7813677042722702, "num_tokens": 12357352.0, "step": 758 }, { "entropy": 0.5467210859060287, "epoch": 2.8328664799253036, "grad_norm": 0.03524104505777359, "learning_rate": 0.0002, "loss": 0.544916570186615, "mean_token_accuracy": 0.7771721184253693, "num_tokens": 12373526.0, "step": 759 }, { "entropy": 0.5246351063251495, "epoch": 2.8366013071895426, "grad_norm": 0.036806508898735046, "learning_rate": 0.0002, "loss": 0.5221924781799316, "mean_token_accuracy": 0.7871624380350113, "num_tokens": 12389771.0, "step": 760 }, { "entropy": 0.530710369348526, "epoch": 2.8403361344537816, "grad_norm": 0.04332499951124191, "learning_rate": 0.0002, "loss": 0.5322965383529663, "mean_token_accuracy": 0.7832685261964798, "num_tokens": 12406028.0, "step": 761 }, { "entropy": 0.5254833996295929, "epoch": 2.8440709617180207, "grad_norm": 0.038304176181554794, "learning_rate": 0.0002, "loss": 0.5253804922103882, "mean_token_accuracy": 0.7873952239751816, "num_tokens": 12422639.0, "step": 762 }, { "entropy": 0.5236704498529434, "epoch": 2.8478057889822597, "grad_norm": 0.03660830482840538, "learning_rate": 0.0002, "loss": 0.5286169052124023, "mean_token_accuracy": 0.7816056311130524, "num_tokens": 12438922.0, "step": 763 }, { "entropy": 0.5321139246225357, "epoch": 2.8515406162464987, "grad_norm": 0.04276243969798088, "learning_rate": 0.0002, "loss": 0.5400298237800598, "mean_token_accuracy": 0.7802720963954926, "num_tokens": 12455234.0, "step": 764 }, { "entropy": 0.5383250862360001, "epoch": 2.8552754435107377, "grad_norm": 0.04291578382253647, "learning_rate": 0.0002, "loss": 0.5375620722770691, "mean_token_accuracy": 0.7810464948415756, "num_tokens": 12471352.0, "step": 765 }, { "entropy": 0.5423205345869064, "epoch": 2.8590102707749767, "grad_norm": 0.04575496166944504, "learning_rate": 0.0002, "loss": 0.5404216647148132, "mean_token_accuracy": 0.7788951247930527, "num_tokens": 12487810.0, "step": 766 }, { "entropy": 0.5412723869085312, "epoch": 2.8627450980392157, "grad_norm": 0.03895537182688713, "learning_rate": 0.0002, "loss": 0.5416159629821777, "mean_token_accuracy": 0.7791194468736649, "num_tokens": 12504261.0, "step": 767 }, { "entropy": 0.551712304353714, "epoch": 2.8664799253034547, "grad_norm": 0.04248276725411415, "learning_rate": 0.0002, "loss": 0.5512599945068359, "mean_token_accuracy": 0.7787346094846725, "num_tokens": 12520594.0, "step": 768 }, { "entropy": 0.5365375429391861, "epoch": 2.8702147525676938, "grad_norm": 0.0429382361471653, "learning_rate": 0.0002, "loss": 0.5369971990585327, "mean_token_accuracy": 0.7795698195695877, "num_tokens": 12537097.0, "step": 769 }, { "entropy": 0.5311344265937805, "epoch": 2.8739495798319328, "grad_norm": 0.03710220381617546, "learning_rate": 0.0002, "loss": 0.5327049493789673, "mean_token_accuracy": 0.784042477607727, "num_tokens": 12553319.0, "step": 770 }, { "entropy": 0.5425883233547211, "epoch": 2.877684407096172, "grad_norm": 0.04352175444364548, "learning_rate": 0.0002, "loss": 0.5457234382629395, "mean_token_accuracy": 0.7795119434595108, "num_tokens": 12569370.0, "step": 771 }, { "entropy": 0.5384223312139511, "epoch": 2.881419234360411, "grad_norm": 0.046248357743024826, "learning_rate": 0.0002, "loss": 0.5449962615966797, "mean_token_accuracy": 0.7777050882577896, "num_tokens": 12585550.0, "step": 772 }, { "entropy": 0.5304270684719086, "epoch": 2.88515406162465, "grad_norm": 0.03803584724664688, "learning_rate": 0.0002, "loss": 0.5308764576911926, "mean_token_accuracy": 0.7852406352758408, "num_tokens": 12601869.0, "step": 773 }, { "entropy": 0.5238187685608864, "epoch": 2.888888888888889, "grad_norm": 0.04374956712126732, "learning_rate": 0.0002, "loss": 0.5296017527580261, "mean_token_accuracy": 0.7867107540369034, "num_tokens": 12618133.0, "step": 774 }, { "entropy": 0.545166626572609, "epoch": 2.892623716153128, "grad_norm": 0.04235200583934784, "learning_rate": 0.0002, "loss": 0.5444045066833496, "mean_token_accuracy": 0.7811264097690582, "num_tokens": 12634590.0, "step": 775 }, { "entropy": 0.552961677312851, "epoch": 2.896358543417367, "grad_norm": 0.04033121094107628, "learning_rate": 0.0002, "loss": 0.5423647165298462, "mean_token_accuracy": 0.7789802700281143, "num_tokens": 12650990.0, "step": 776 }, { "entropy": 0.5362664610147476, "epoch": 2.900093370681606, "grad_norm": 0.039799049496650696, "learning_rate": 0.0002, "loss": 0.5340068340301514, "mean_token_accuracy": 0.7801271975040436, "num_tokens": 12667374.0, "step": 777 }, { "entropy": 0.540292888879776, "epoch": 2.903828197945845, "grad_norm": 0.04687785729765892, "learning_rate": 0.0002, "loss": 0.5417227149009705, "mean_token_accuracy": 0.7800564914941788, "num_tokens": 12683778.0, "step": 778 }, { "entropy": 0.5580530762672424, "epoch": 2.907563025210084, "grad_norm": 0.04104934632778168, "learning_rate": 0.0002, "loss": 0.553903341293335, "mean_token_accuracy": 0.7754019796848297, "num_tokens": 12700259.0, "step": 779 }, { "entropy": 0.5188224613666534, "epoch": 2.911297852474323, "grad_norm": 0.04876643791794777, "learning_rate": 0.0002, "loss": 0.525776207447052, "mean_token_accuracy": 0.7853571325540543, "num_tokens": 12716566.0, "step": 780 }, { "entropy": 0.5420665293931961, "epoch": 2.915032679738562, "grad_norm": 0.04760121926665306, "learning_rate": 0.0002, "loss": 0.5495279431343079, "mean_token_accuracy": 0.7769062519073486, "num_tokens": 12732949.0, "step": 781 }, { "entropy": 0.5393791049718857, "epoch": 2.918767507002801, "grad_norm": 0.0337008535861969, "learning_rate": 0.0002, "loss": 0.5375462174415588, "mean_token_accuracy": 0.7824095785617828, "num_tokens": 12749208.0, "step": 782 }, { "entropy": 0.5315912365913391, "epoch": 2.9225023342670404, "grad_norm": 0.04428756982088089, "learning_rate": 0.0002, "loss": 0.5206541419029236, "mean_token_accuracy": 0.7908456176519394, "num_tokens": 12765331.0, "step": 783 }, { "entropy": 0.5318206250667572, "epoch": 2.9262371615312794, "grad_norm": 0.04391348361968994, "learning_rate": 0.0002, "loss": 0.5263054370880127, "mean_token_accuracy": 0.7842861711978912, "num_tokens": 12781575.0, "step": 784 }, { "entropy": 0.5414671450853348, "epoch": 2.9299719887955185, "grad_norm": 0.03392143175005913, "learning_rate": 0.0002, "loss": 0.5417372584342957, "mean_token_accuracy": 0.779655933380127, "num_tokens": 12797804.0, "step": 785 }, { "entropy": 0.5150401219725609, "epoch": 2.9337068160597575, "grad_norm": 0.04989241063594818, "learning_rate": 0.0002, "loss": 0.5268764495849609, "mean_token_accuracy": 0.7849253863096237, "num_tokens": 12814387.0, "step": 786 }, { "entropy": 0.5104701817035675, "epoch": 2.9374416433239965, "grad_norm": 0.04267291724681854, "learning_rate": 0.0002, "loss": 0.5144373178482056, "mean_token_accuracy": 0.7921061366796494, "num_tokens": 12830547.0, "step": 787 }, { "entropy": 0.5301306545734406, "epoch": 2.9411764705882355, "grad_norm": 0.041861243546009064, "learning_rate": 0.0002, "loss": 0.5351182818412781, "mean_token_accuracy": 0.7849584370851517, "num_tokens": 12846796.0, "step": 788 }, { "entropy": 0.5566616058349609, "epoch": 2.9449112978524745, "grad_norm": 0.04726849123835564, "learning_rate": 0.0002, "loss": 0.5562955737113953, "mean_token_accuracy": 0.7750595211982727, "num_tokens": 12863231.0, "step": 789 }, { "entropy": 0.5550259649753571, "epoch": 2.9486461251167135, "grad_norm": 0.04144451022148132, "learning_rate": 0.0002, "loss": 0.5501708388328552, "mean_token_accuracy": 0.7760492265224457, "num_tokens": 12879599.0, "step": 790 }, { "entropy": 0.5439048856496811, "epoch": 2.9523809523809526, "grad_norm": 0.038411688059568405, "learning_rate": 0.0002, "loss": 0.5328619480133057, "mean_token_accuracy": 0.7869621217250824, "num_tokens": 12895954.0, "step": 791 }, { "entropy": 0.5426651537418365, "epoch": 2.9561157796451916, "grad_norm": 0.035909172147512436, "learning_rate": 0.0002, "loss": 0.5376070141792297, "mean_token_accuracy": 0.7810229063034058, "num_tokens": 12912468.0, "step": 792 }, { "entropy": 0.5385068506002426, "epoch": 2.9598506069094306, "grad_norm": 0.04422811418771744, "learning_rate": 0.0002, "loss": 0.5405643582344055, "mean_token_accuracy": 0.7827010452747345, "num_tokens": 12929047.0, "step": 793 }, { "entropy": 0.5246873497962952, "epoch": 2.9635854341736696, "grad_norm": 0.042685672640800476, "learning_rate": 0.0002, "loss": 0.537744402885437, "mean_token_accuracy": 0.7845292538404465, "num_tokens": 12945498.0, "step": 794 }, { "entropy": 0.534453883767128, "epoch": 2.9673202614379086, "grad_norm": 0.04630210995674133, "learning_rate": 0.0002, "loss": 0.5448824763298035, "mean_token_accuracy": 0.7790633589029312, "num_tokens": 12961911.0, "step": 795 }, { "entropy": 0.551120862364769, "epoch": 2.9710550887021476, "grad_norm": 0.038833893835544586, "learning_rate": 0.0002, "loss": 0.5517142415046692, "mean_token_accuracy": 0.7771248668432236, "num_tokens": 12978275.0, "step": 796 }, { "entropy": 0.540284737944603, "epoch": 2.9747899159663866, "grad_norm": 0.034402430057525635, "learning_rate": 0.0002, "loss": 0.5354663133621216, "mean_token_accuracy": 0.7817137837409973, "num_tokens": 12994610.0, "step": 797 }, { "entropy": 0.5466310381889343, "epoch": 2.9785247432306257, "grad_norm": 0.07181618362665176, "learning_rate": 0.0002, "loss": 0.5540565848350525, "mean_token_accuracy": 0.7755098789930344, "num_tokens": 13011180.0, "step": 798 }, { "entropy": 0.5366263538599014, "epoch": 2.9822595704948647, "grad_norm": 0.038452569395303726, "learning_rate": 0.0002, "loss": 0.5375447869300842, "mean_token_accuracy": 0.7817091047763824, "num_tokens": 13027553.0, "step": 799 }, { "entropy": 0.5117043852806091, "epoch": 2.9859943977591037, "grad_norm": 0.040419358760118484, "learning_rate": 0.0002, "loss": 0.5115300416946411, "mean_token_accuracy": 0.7910782992839813, "num_tokens": 13043466.0, "step": 800 }, { "entropy": 0.5549824833869934, "epoch": 2.9897292250233427, "grad_norm": 0.04015415534377098, "learning_rate": 0.0002, "loss": 0.5516586303710938, "mean_token_accuracy": 0.7774178683757782, "num_tokens": 13059980.0, "step": 801 }, { "entropy": 0.5470731258392334, "epoch": 2.9934640522875817, "grad_norm": 0.03732411563396454, "learning_rate": 0.0002, "loss": 0.5440268516540527, "mean_token_accuracy": 0.7784831672906876, "num_tokens": 13076305.0, "step": 802 }, { "entropy": 0.5496807992458344, "epoch": 2.9971988795518207, "grad_norm": 0.042060188949108124, "learning_rate": 0.0002, "loss": 0.5516492128372192, "mean_token_accuracy": 0.7782593071460724, "num_tokens": 13092596.0, "step": 803 }, { "entropy": 0.5623628298441569, "epoch": 3.0, "grad_norm": 0.04183833301067352, "learning_rate": 0.0002, "loss": 0.5470706820487976, "mean_token_accuracy": 0.7766743898391724, "num_tokens": 13094419.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2222761209723617e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }