{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 554, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.358862280845642, "epoch": 0.0036199095022624436, "grad_norm": 2.292628288269043, "learning_rate": 0.0, "loss": 0.7311, "mean_token_accuracy": 0.8534883409738541, "num_tokens": 9316.0, "step": 1 }, { "entropy": 2.674945294857025, "epoch": 0.007239819004524887, "grad_norm": 3.8950836658477783, "learning_rate": 1.0219999999999999e-05, "loss": 1.0621, "mean_token_accuracy": 0.8183160275220871, "num_tokens": 17707.0, "step": 2 }, { "entropy": 2.4915525913238525, "epoch": 0.01085972850678733, "grad_norm": 2.792142868041992, "learning_rate": 2.0439999999999997e-05, "loss": 0.8448, "mean_token_accuracy": 0.8489587754011154, "num_tokens": 26783.0, "step": 3 }, { "entropy": 2.525622010231018, "epoch": 0.014479638009049774, "grad_norm": 2.7071900367736816, "learning_rate": 3.0659999999999994e-05, "loss": 0.8847, "mean_token_accuracy": 0.8486668318510056, "num_tokens": 35947.0, "step": 4 }, { "entropy": 2.588509976863861, "epoch": 0.01809954751131222, "grad_norm": 2.981574773788452, "learning_rate": 4.0879999999999995e-05, "loss": 1.0783, "mean_token_accuracy": 0.8135111033916473, "num_tokens": 44505.0, "step": 5 }, { "entropy": 2.662865400314331, "epoch": 0.02171945701357466, "grad_norm": 2.629283905029297, "learning_rate": 5.1099999999999995e-05, "loss": 0.9485, "mean_token_accuracy": 0.8152717798948288, "num_tokens": 53140.0, "step": 6 }, { "entropy": 2.6662243604660034, "epoch": 0.025339366515837104, "grad_norm": 2.730058431625366, "learning_rate": 6.131999999999999e-05, "loss": 0.6982, "mean_token_accuracy": 0.8552135527133942, "num_tokens": 61932.0, "step": 7 }, { "entropy": 2.661384105682373, "epoch": 0.02895927601809955, "grad_norm": 2.562839984893799, "learning_rate": 7.154e-05, "loss": 0.7296, "mean_token_accuracy": 0.8579540699720383, "num_tokens": 70973.0, "step": 8 }, { "entropy": 2.7889368534088135, "epoch": 0.03257918552036199, "grad_norm": 2.8640544414520264, "learning_rate": 8.175999999999999e-05, "loss": 0.5965, "mean_token_accuracy": 0.8638457208871841, "num_tokens": 79977.0, "step": 9 }, { "entropy": 2.811532199382782, "epoch": 0.03619909502262444, "grad_norm": 2.6199426651000977, "learning_rate": 9.197999999999998e-05, "loss": 0.4819, "mean_token_accuracy": 0.8786454051733017, "num_tokens": 88915.0, "step": 10 }, { "entropy": 2.941167712211609, "epoch": 0.039819004524886875, "grad_norm": 1.2497272491455078, "learning_rate": 0.00010219999999999999, "loss": 0.7192, "mean_token_accuracy": 0.841494083404541, "num_tokens": 97749.0, "step": 11 }, { "entropy": 3.0547962188720703, "epoch": 0.04343891402714932, "grad_norm": 1.436136245727539, "learning_rate": 0.00011241999999999998, "loss": 0.5908, "mean_token_accuracy": 0.8657624870538712, "num_tokens": 106048.0, "step": 12 }, { "entropy": 2.9914053082466125, "epoch": 0.047058823529411764, "grad_norm": 0.9903654456138611, "learning_rate": 0.00012263999999999998, "loss": 0.4008, "mean_token_accuracy": 0.8985499292612076, "num_tokens": 115216.0, "step": 13 }, { "entropy": 3.1867465376853943, "epoch": 0.05067873303167421, "grad_norm": 1.019572377204895, "learning_rate": 0.00013286, "loss": 0.5062, "mean_token_accuracy": 0.8893097043037415, "num_tokens": 124040.0, "step": 14 }, { "entropy": 3.2431325912475586, "epoch": 0.05429864253393665, "grad_norm": 1.2394084930419922, "learning_rate": 0.00014308, "loss": 0.361, "mean_token_accuracy": 0.9009967148303986, "num_tokens": 132447.0, "step": 15 }, { "entropy": 3.1858643889427185, "epoch": 0.0579185520361991, "grad_norm": 0.9859603643417358, "learning_rate": 0.00015329999999999999, "loss": 0.4498, "mean_token_accuracy": 0.887280747294426, "num_tokens": 141228.0, "step": 16 }, { "entropy": 3.5029141902923584, "epoch": 0.06153846153846154, "grad_norm": 1.453957438468933, "learning_rate": 0.00016351999999999998, "loss": 0.4949, "mean_token_accuracy": 0.888081505894661, "num_tokens": 149789.0, "step": 17 }, { "entropy": 3.4572895765304565, "epoch": 0.06515837104072399, "grad_norm": 1.390377402305603, "learning_rate": 0.00017374, "loss": 0.5449, "mean_token_accuracy": 0.8745045810937881, "num_tokens": 157813.0, "step": 18 }, { "entropy": 3.3081750869750977, "epoch": 0.06877828054298643, "grad_norm": 1.1171791553497314, "learning_rate": 0.00018395999999999997, "loss": 0.4786, "mean_token_accuracy": 0.8893420845270157, "num_tokens": 166315.0, "step": 19 }, { "entropy": 3.3776715993881226, "epoch": 0.07239819004524888, "grad_norm": 1.5567998886108398, "learning_rate": 0.00019418, "loss": 0.3669, "mean_token_accuracy": 0.9146632701158524, "num_tokens": 175207.0, "step": 20 }, { "entropy": 3.2677870988845825, "epoch": 0.0760180995475113, "grad_norm": 1.7404611110687256, "learning_rate": 0.00020439999999999998, "loss": 0.5287, "mean_token_accuracy": 0.8777483552694321, "num_tokens": 183833.0, "step": 21 }, { "entropy": 3.313201069831848, "epoch": 0.07963800904977375, "grad_norm": 1.0836979150772095, "learning_rate": 0.00021461999999999997, "loss": 0.3014, "mean_token_accuracy": 0.9215261936187744, "num_tokens": 192591.0, "step": 22 }, { "entropy": 3.208672881126404, "epoch": 0.0832579185520362, "grad_norm": 1.2197301387786865, "learning_rate": 0.00022483999999999997, "loss": 0.4401, "mean_token_accuracy": 0.9031257778406143, "num_tokens": 201372.0, "step": 23 }, { "entropy": 3.1830995082855225, "epoch": 0.08687782805429864, "grad_norm": 1.2422229051589966, "learning_rate": 0.00023506, "loss": 0.5144, "mean_token_accuracy": 0.8915928155183792, "num_tokens": 210348.0, "step": 24 }, { "entropy": 3.085207223892212, "epoch": 0.09049773755656108, "grad_norm": 0.8987624049186707, "learning_rate": 0.00024527999999999996, "loss": 0.3253, "mean_token_accuracy": 0.9221627116203308, "num_tokens": 219131.0, "step": 25 }, { "entropy": 3.026031017303467, "epoch": 0.09411764705882353, "grad_norm": 1.0273475646972656, "learning_rate": 0.0002555, "loss": 0.3495, "mean_token_accuracy": 0.9147634357213974, "num_tokens": 228292.0, "step": 26 }, { "entropy": 3.0420032739639282, "epoch": 0.09773755656108597, "grad_norm": 1.0590945482254028, "learning_rate": 0.00026572, "loss": 0.4495, "mean_token_accuracy": 0.9019353687763214, "num_tokens": 236942.0, "step": 27 }, { "entropy": 3.0469263792037964, "epoch": 0.10135746606334842, "grad_norm": 0.9584959745407104, "learning_rate": 0.00027594, "loss": 0.405, "mean_token_accuracy": 0.9216890782117844, "num_tokens": 245543.0, "step": 28 }, { "entropy": 2.92683744430542, "epoch": 0.10497737556561086, "grad_norm": 0.8826628923416138, "learning_rate": 0.00028616, "loss": 0.4004, "mean_token_accuracy": 0.9173285663127899, "num_tokens": 254264.0, "step": 29 }, { "entropy": 3.0086968541145325, "epoch": 0.1085972850678733, "grad_norm": 0.8521863222122192, "learning_rate": 0.00029637999999999995, "loss": 0.2876, "mean_token_accuracy": 0.9335231184959412, "num_tokens": 263143.0, "step": 30 }, { "entropy": 2.9086623191833496, "epoch": 0.11221719457013575, "grad_norm": 0.7830919623374939, "learning_rate": 0.00030659999999999997, "loss": 0.548, "mean_token_accuracy": 0.8831343650817871, "num_tokens": 272055.0, "step": 31 }, { "entropy": 2.9730575680732727, "epoch": 0.1158371040723982, "grad_norm": 0.7217472195625305, "learning_rate": 0.00031682, "loss": 0.3564, "mean_token_accuracy": 0.9119151830673218, "num_tokens": 280971.0, "step": 32 }, { "entropy": 3.081720530986786, "epoch": 0.11945701357466064, "grad_norm": 0.8697704076766968, "learning_rate": 0.00032703999999999996, "loss": 0.334, "mean_token_accuracy": 0.9234935492277145, "num_tokens": 289449.0, "step": 33 }, { "entropy": 3.1043431162834167, "epoch": 0.12307692307692308, "grad_norm": 0.7962514758110046, "learning_rate": 0.00033726, "loss": 0.1602, "mean_token_accuracy": 0.9554370939731598, "num_tokens": 297804.0, "step": 34 }, { "entropy": 3.0275490283966064, "epoch": 0.12669683257918551, "grad_norm": 0.5887104272842407, "learning_rate": 0.00034748, "loss": 0.2254, "mean_token_accuracy": 0.9491932094097137, "num_tokens": 306589.0, "step": 35 }, { "entropy": 3.099652886390686, "epoch": 0.13031674208144797, "grad_norm": 0.894397497177124, "learning_rate": 0.00035769999999999997, "loss": 0.6397, "mean_token_accuracy": 0.8802188038825989, "num_tokens": 315534.0, "step": 36 }, { "entropy": 3.0312134623527527, "epoch": 0.1339366515837104, "grad_norm": 0.6374682188034058, "learning_rate": 0.00036791999999999993, "loss": 0.2183, "mean_token_accuracy": 0.9478497952222824, "num_tokens": 324492.0, "step": 37 }, { "entropy": 3.28497713804245, "epoch": 0.13755656108597286, "grad_norm": 0.6740968823432922, "learning_rate": 0.00037813999999999995, "loss": 0.3619, "mean_token_accuracy": 0.9288723170757294, "num_tokens": 333195.0, "step": 38 }, { "entropy": 3.1478323340415955, "epoch": 0.1411764705882353, "grad_norm": 0.7235494256019592, "learning_rate": 0.00038836, "loss": 0.324, "mean_token_accuracy": 0.9179254025220871, "num_tokens": 342028.0, "step": 39 }, { "entropy": 3.279879152774811, "epoch": 0.14479638009049775, "grad_norm": 0.7512595653533936, "learning_rate": 0.00039858, "loss": 0.4804, "mean_token_accuracy": 0.889826312661171, "num_tokens": 350902.0, "step": 40 }, { "entropy": 3.173546612262726, "epoch": 0.14841628959276018, "grad_norm": 0.6978861689567566, "learning_rate": 0.00040879999999999996, "loss": 0.3442, "mean_token_accuracy": 0.9205169230699539, "num_tokens": 359787.0, "step": 41 }, { "entropy": 3.2385765314102173, "epoch": 0.1520361990950226, "grad_norm": 0.8108944892883301, "learning_rate": 0.00041901999999999993, "loss": 0.4223, "mean_token_accuracy": 0.8979178965091705, "num_tokens": 368426.0, "step": 42 }, { "entropy": 3.146568477153778, "epoch": 0.15565610859728507, "grad_norm": 0.5847787261009216, "learning_rate": 0.00042923999999999995, "loss": 0.1953, "mean_token_accuracy": 0.9556037336587906, "num_tokens": 377349.0, "step": 43 }, { "entropy": 3.066233277320862, "epoch": 0.1592760180995475, "grad_norm": 0.7887329459190369, "learning_rate": 0.00043945999999999997, "loss": 0.6815, "mean_token_accuracy": 0.8654293268918991, "num_tokens": 386603.0, "step": 44 }, { "entropy": 3.1745981574058533, "epoch": 0.16289592760180996, "grad_norm": 0.7280165553092957, "learning_rate": 0.00044967999999999994, "loss": 0.1932, "mean_token_accuracy": 0.9479279220104218, "num_tokens": 395070.0, "step": 45 }, { "entropy": 3.1094446182250977, "epoch": 0.1665158371040724, "grad_norm": 0.6453448534011841, "learning_rate": 0.00045989999999999996, "loss": 0.2608, "mean_token_accuracy": 0.9249396026134491, "num_tokens": 403651.0, "step": 46 }, { "entropy": 2.9050925970077515, "epoch": 0.17013574660633485, "grad_norm": 0.6689278483390808, "learning_rate": 0.00047012, "loss": 0.4489, "mean_token_accuracy": 0.898686870932579, "num_tokens": 412898.0, "step": 47 }, { "entropy": 3.2239145040512085, "epoch": 0.17375565610859728, "grad_norm": 1.0014020204544067, "learning_rate": 0.00048033999999999994, "loss": 0.3234, "mean_token_accuracy": 0.9231891483068466, "num_tokens": 421420.0, "step": 48 }, { "entropy": 3.035899817943573, "epoch": 0.17737556561085974, "grad_norm": 0.6415768265724182, "learning_rate": 0.0004905599999999999, "loss": 0.2259, "mean_token_accuracy": 0.9447792917490005, "num_tokens": 430258.0, "step": 49 }, { "entropy": 3.057477653026581, "epoch": 0.18099547511312217, "grad_norm": 0.6042271256446838, "learning_rate": 0.0005007799999999999, "loss": 0.2228, "mean_token_accuracy": 0.9473378211259842, "num_tokens": 439593.0, "step": 50 }, { "entropy": 2.8375911116600037, "epoch": 0.18461538461538463, "grad_norm": 0.739811897277832, "learning_rate": 0.000511, "loss": 0.3623, "mean_token_accuracy": 0.9050924181938171, "num_tokens": 449056.0, "step": 51 }, { "entropy": 2.9926682114601135, "epoch": 0.18823529411764706, "grad_norm": 0.6637321710586548, "learning_rate": 0.0005109995633102972, "loss": 0.2924, "mean_token_accuracy": 0.9397273659706116, "num_tokens": 457677.0, "step": 52 }, { "entropy": 2.7932987809181213, "epoch": 0.19185520361990951, "grad_norm": 0.5666584372520447, "learning_rate": 0.0005109982532428477, "loss": 0.2055, "mean_token_accuracy": 0.9385408014059067, "num_tokens": 466969.0, "step": 53 }, { "entropy": 2.765812337398529, "epoch": 0.19547511312217195, "grad_norm": 0.7875120639801025, "learning_rate": 0.0005109960698026271, "loss": 0.4549, "mean_token_accuracy": 0.9052814990282059, "num_tokens": 476285.0, "step": 54 }, { "entropy": 2.884207248687744, "epoch": 0.19909502262443438, "grad_norm": 0.7538661956787109, "learning_rate": 0.0005109930129979285, "loss": 0.3751, "mean_token_accuracy": 0.9210246652364731, "num_tokens": 484668.0, "step": 55 }, { "entropy": 2.779718518257141, "epoch": 0.20271493212669683, "grad_norm": 0.8069296479225159, "learning_rate": 0.0005109890828403621, "loss": 0.3664, "mean_token_accuracy": 0.9219843596220016, "num_tokens": 493292.0, "step": 56 }, { "entropy": 2.841543674468994, "epoch": 0.20633484162895926, "grad_norm": 0.5545904636383057, "learning_rate": 0.0005109842793448548, "loss": 0.1973, "mean_token_accuracy": 0.9547395706176758, "num_tokens": 501973.0, "step": 57 }, { "entropy": 2.8180030584335327, "epoch": 0.20995475113122172, "grad_norm": 1.015456199645996, "learning_rate": 0.0005109786025296513, "loss": 0.6019, "mean_token_accuracy": 0.88613361120224, "num_tokens": 510840.0, "step": 58 }, { "entropy": 2.7450912594795227, "epoch": 0.21357466063348415, "grad_norm": 0.6784740686416626, "learning_rate": 0.0005109720524163127, "loss": 0.2868, "mean_token_accuracy": 0.9295425117015839, "num_tokens": 519656.0, "step": 59 }, { "entropy": 2.822400987148285, "epoch": 0.2171945701357466, "grad_norm": 0.8780149817466736, "learning_rate": 0.000510964629029717, "loss": 0.4371, "mean_token_accuracy": 0.9089596569538116, "num_tokens": 528105.0, "step": 60 }, { "entropy": 2.522100865840912, "epoch": 0.22081447963800904, "grad_norm": 0.51394122838974, "learning_rate": 0.0005109563323980594, "loss": 0.2509, "mean_token_accuracy": 0.941976860165596, "num_tokens": 537707.0, "step": 61 }, { "entropy": 2.6596657633781433, "epoch": 0.2244343891402715, "grad_norm": 0.6359816789627075, "learning_rate": 0.0005109471625528516, "loss": 0.3685, "mean_token_accuracy": 0.9191890209913254, "num_tokens": 546517.0, "step": 62 }, { "entropy": 2.800311803817749, "epoch": 0.22805429864253393, "grad_norm": 0.6862941980361938, "learning_rate": 0.0005109371195289215, "loss": 0.2457, "mean_token_accuracy": 0.9330879002809525, "num_tokens": 555493.0, "step": 63 }, { "entropy": 2.7235344648361206, "epoch": 0.2316742081447964, "grad_norm": 1.0464682579040527, "learning_rate": 0.0005109262033644142, "loss": 0.4417, "mean_token_accuracy": 0.8957678377628326, "num_tokens": 564255.0, "step": 64 }, { "entropy": 2.6643534302711487, "epoch": 0.23529411764705882, "grad_norm": 1.0790019035339355, "learning_rate": 0.0005109144141007903, "loss": 0.4947, "mean_token_accuracy": 0.8889007717370987, "num_tokens": 573401.0, "step": 65 }, { "entropy": 2.760925054550171, "epoch": 0.23891402714932128, "grad_norm": 0.7957189679145813, "learning_rate": 0.0005109017517828273, "loss": 0.2259, "mean_token_accuracy": 0.944578230381012, "num_tokens": 581905.0, "step": 66 }, { "entropy": 2.7048792839050293, "epoch": 0.2425339366515837, "grad_norm": 0.9530714750289917, "learning_rate": 0.0005108882164586181, "loss": 0.3122, "mean_token_accuracy": 0.9257418513298035, "num_tokens": 590802.0, "step": 67 }, { "entropy": 2.6733291149139404, "epoch": 0.24615384615384617, "grad_norm": 0.8295993208885193, "learning_rate": 0.0005108738081795716, "loss": 0.3701, "mean_token_accuracy": 0.898589238524437, "num_tokens": 599279.0, "step": 68 }, { "entropy": 2.5613606572151184, "epoch": 0.2497737556561086, "grad_norm": 0.6205935478210449, "learning_rate": 0.0005108585270004123, "loss": 0.4372, "mean_token_accuracy": 0.9116007685661316, "num_tokens": 608107.0, "step": 69 }, { "entropy": 2.458296835422516, "epoch": 0.25339366515837103, "grad_norm": 0.7629838585853577, "learning_rate": 0.0005108423729791799, "loss": 0.2307, "mean_token_accuracy": 0.9386163502931595, "num_tokens": 616881.0, "step": 70 }, { "entropy": 2.4176695346832275, "epoch": 0.25701357466063346, "grad_norm": 0.902400016784668, "learning_rate": 0.0005108253461772298, "loss": 0.2853, "mean_token_accuracy": 0.9237343072891235, "num_tokens": 625323.0, "step": 71 }, { "entropy": 2.2265281677246094, "epoch": 0.26063348416289595, "grad_norm": 0.7744383811950684, "learning_rate": 0.0005108074466592316, "loss": 0.2435, "mean_token_accuracy": 0.9508260935544968, "num_tokens": 634260.0, "step": 72 }, { "entropy": 2.1855952441692352, "epoch": 0.2642533936651584, "grad_norm": 0.8615190386772156, "learning_rate": 0.0005107886744931702, "loss": 0.3323, "mean_token_accuracy": 0.9276078194379807, "num_tokens": 643235.0, "step": 73 }, { "entropy": 2.179121494293213, "epoch": 0.2678733031674208, "grad_norm": 0.8953279256820679, "learning_rate": 0.0005107690297503444, "loss": 0.2384, "mean_token_accuracy": 0.9425230622291565, "num_tokens": 652032.0, "step": 74 }, { "entropy": 2.1565526127815247, "epoch": 0.27149321266968324, "grad_norm": 0.6830486059188843, "learning_rate": 0.0005107485125053678, "loss": 0.2759, "mean_token_accuracy": 0.9360661953687668, "num_tokens": 660978.0, "step": 75 }, { "entropy": 2.0900665521621704, "epoch": 0.2751131221719457, "grad_norm": 0.786665141582489, "learning_rate": 0.0005107271228361672, "loss": 0.4061, "mean_token_accuracy": 0.910009115934372, "num_tokens": 669817.0, "step": 76 }, { "entropy": 2.1311859488487244, "epoch": 0.27873303167420815, "grad_norm": 0.6399909853935242, "learning_rate": 0.0005107048608239836, "loss": 0.272, "mean_token_accuracy": 0.9424714297056198, "num_tokens": 678469.0, "step": 77 }, { "entropy": 2.059997320175171, "epoch": 0.2823529411764706, "grad_norm": 0.8114754557609558, "learning_rate": 0.0005106817265533706, "loss": 0.4029, "mean_token_accuracy": 0.9037660360336304, "num_tokens": 687261.0, "step": 78 }, { "entropy": 1.9725019037723541, "epoch": 0.285972850678733, "grad_norm": 0.9420941472053528, "learning_rate": 0.0005106577201121952, "loss": 0.535, "mean_token_accuracy": 0.8996377140283585, "num_tokens": 695941.0, "step": 79 }, { "entropy": 1.9951164424419403, "epoch": 0.2895927601809955, "grad_norm": 0.6476142406463623, "learning_rate": 0.0005106328415916372, "loss": 0.2242, "mean_token_accuracy": 0.941379725933075, "num_tokens": 704643.0, "step": 80 }, { "entropy": 1.8962564170360565, "epoch": 0.29321266968325793, "grad_norm": 0.5974630117416382, "learning_rate": 0.0005106070910861881, "loss": 0.2934, "mean_token_accuracy": 0.9217697530984879, "num_tokens": 713605.0, "step": 81 }, { "entropy": 1.9781515896320343, "epoch": 0.29683257918552036, "grad_norm": 0.8755478262901306, "learning_rate": 0.0005105804686936518, "loss": 0.4551, "mean_token_accuracy": 0.9051328897476196, "num_tokens": 722385.0, "step": 82 }, { "entropy": 1.9892418384552002, "epoch": 0.3004524886877828, "grad_norm": 0.6887345314025879, "learning_rate": 0.0005105529745151433, "loss": 0.244, "mean_token_accuracy": 0.9261117279529572, "num_tokens": 730962.0, "step": 83 }, { "entropy": 2.0053181648254395, "epoch": 0.3040723981900452, "grad_norm": 0.6930885910987854, "learning_rate": 0.0005105246086550893, "loss": 0.3155, "mean_token_accuracy": 0.9206147193908691, "num_tokens": 739499.0, "step": 84 }, { "entropy": 1.9716475903987885, "epoch": 0.3076923076923077, "grad_norm": 0.5049461722373962, "learning_rate": 0.0005104953712212266, "loss": 0.2215, "mean_token_accuracy": 0.9608763605356216, "num_tokens": 748604.0, "step": 85 }, { "entropy": 1.9186978042125702, "epoch": 0.31131221719457014, "grad_norm": 0.5756685733795166, "learning_rate": 0.000510465262324603, "loss": 0.2658, "mean_token_accuracy": 0.9372887462377548, "num_tokens": 757919.0, "step": 86 }, { "entropy": 1.9738290905952454, "epoch": 0.31493212669683257, "grad_norm": 0.6163789629936218, "learning_rate": 0.0005104342820795758, "loss": 0.2472, "mean_token_accuracy": 0.9430449157953262, "num_tokens": 766708.0, "step": 87 }, { "entropy": 2.1927571892738342, "epoch": 0.318552036199095, "grad_norm": 0.7953162789344788, "learning_rate": 0.0005104024306038119, "loss": 0.261, "mean_token_accuracy": 0.9425829648971558, "num_tokens": 774601.0, "step": 88 }, { "entropy": 2.043731451034546, "epoch": 0.3221719457013575, "grad_norm": 0.8098088502883911, "learning_rate": 0.0005103697080182872, "loss": 0.3126, "mean_token_accuracy": 0.9158089309930801, "num_tokens": 783170.0, "step": 89 }, { "entropy": 1.9801572561264038, "epoch": 0.3257918552036199, "grad_norm": 0.5227240920066833, "learning_rate": 0.0005103361144472864, "loss": 0.1291, "mean_token_accuracy": 0.9666071832180023, "num_tokens": 791769.0, "step": 90 }, { "entropy": 1.9553790986537933, "epoch": 0.32941176470588235, "grad_norm": 0.7819464206695557, "learning_rate": 0.0005103016500184022, "loss": 0.531, "mean_token_accuracy": 0.8817111849784851, "num_tokens": 800824.0, "step": 91 }, { "entropy": 1.9291303753852844, "epoch": 0.3330316742081448, "grad_norm": 0.7178757190704346, "learning_rate": 0.0005102663148625347, "loss": 0.3301, "mean_token_accuracy": 0.9357631802558899, "num_tokens": 809347.0, "step": 92 }, { "entropy": 1.9846041798591614, "epoch": 0.33665158371040727, "grad_norm": 1.316636085510254, "learning_rate": 0.0005102301091138916, "loss": 0.4241, "mean_token_accuracy": 0.8993304669857025, "num_tokens": 817174.0, "step": 93 }, { "entropy": 1.814637303352356, "epoch": 0.3402714932126697, "grad_norm": 0.5486414432525635, "learning_rate": 0.0005101930329099865, "loss": 0.116, "mean_token_accuracy": 0.9674727618694305, "num_tokens": 826177.0, "step": 94 }, { "entropy": 1.9128066003322601, "epoch": 0.3438914027149321, "grad_norm": 0.620303750038147, "learning_rate": 0.00051015508639164, "loss": 0.1833, "mean_token_accuracy": 0.9569521993398666, "num_tokens": 835409.0, "step": 95 }, { "entropy": 1.7541870176792145, "epoch": 0.34751131221719456, "grad_norm": 0.8337438702583313, "learning_rate": 0.0005101162697029776, "loss": 0.3327, "mean_token_accuracy": 0.9193180054426193, "num_tokens": 844692.0, "step": 96 }, { "entropy": 1.8255240619182587, "epoch": 0.351131221719457, "grad_norm": 0.877780556678772, "learning_rate": 0.00051007658299143, "loss": 0.2106, "mean_token_accuracy": 0.9527023881673813, "num_tokens": 853309.0, "step": 97 }, { "entropy": 1.8611579239368439, "epoch": 0.3547511312217195, "grad_norm": 1.0667716264724731, "learning_rate": 0.0005100360264077325, "loss": 0.3196, "mean_token_accuracy": 0.9195879399776459, "num_tokens": 861859.0, "step": 98 }, { "entropy": 1.821915864944458, "epoch": 0.3583710407239819, "grad_norm": 0.8400309681892395, "learning_rate": 0.0005099946001059241, "loss": 0.4036, "mean_token_accuracy": 0.8951036781072617, "num_tokens": 871060.0, "step": 99 }, { "entropy": 1.7648265063762665, "epoch": 0.36199095022624433, "grad_norm": 1.1391404867172241, "learning_rate": 0.0005099523042433472, "loss": 0.389, "mean_token_accuracy": 0.901309460401535, "num_tokens": 880593.0, "step": 100 }, { "entropy": 1.8506875336170197, "epoch": 0.36561085972850677, "grad_norm": 0.6923297643661499, "learning_rate": 0.000509909138980647, "loss": 0.2504, "mean_token_accuracy": 0.9384842216968536, "num_tokens": 889739.0, "step": 101 }, { "entropy": 1.9311015605926514, "epoch": 0.36923076923076925, "grad_norm": 0.9677391052246094, "learning_rate": 0.0005098651044817704, "loss": 0.6953, "mean_token_accuracy": 0.8752655684947968, "num_tokens": 898992.0, "step": 102 }, { "entropy": 1.9590983986854553, "epoch": 0.3728506787330317, "grad_norm": 0.6364567279815674, "learning_rate": 0.0005098202009139663, "loss": 0.4318, "mean_token_accuracy": 0.9056479930877686, "num_tokens": 908225.0, "step": 103 }, { "entropy": 1.9455370008945465, "epoch": 0.3764705882352941, "grad_norm": 0.6747863292694092, "learning_rate": 0.0005097744284477839, "loss": 0.244, "mean_token_accuracy": 0.9428392052650452, "num_tokens": 917134.0, "step": 104 }, { "entropy": 1.8632825911045074, "epoch": 0.38009049773755654, "grad_norm": 0.5705651044845581, "learning_rate": 0.0005097277872570731, "loss": 0.2508, "mean_token_accuracy": 0.9325222969055176, "num_tokens": 926573.0, "step": 105 }, { "entropy": 1.9370323717594147, "epoch": 0.38371040723981903, "grad_norm": 0.6298627853393555, "learning_rate": 0.000509680277518983, "loss": 0.2481, "mean_token_accuracy": 0.9281332045793533, "num_tokens": 935853.0, "step": 106 }, { "entropy": 2.0217572450637817, "epoch": 0.38733031674208146, "grad_norm": 0.5434353947639465, "learning_rate": 0.0005096318994139617, "loss": 0.1809, "mean_token_accuracy": 0.9592084139585495, "num_tokens": 944279.0, "step": 107 }, { "entropy": 1.9619770646095276, "epoch": 0.3909502262443439, "grad_norm": 0.6959638595581055, "learning_rate": 0.0005095826531257552, "loss": 0.1376, "mean_token_accuracy": 0.9608310014009476, "num_tokens": 953336.0, "step": 108 }, { "entropy": 2.12511146068573, "epoch": 0.3945701357466063, "grad_norm": 1.0152848958969116, "learning_rate": 0.0005095325388414074, "loss": 0.4382, "mean_token_accuracy": 0.915201798081398, "num_tokens": 962002.0, "step": 109 }, { "entropy": 2.0171878039836884, "epoch": 0.39819004524886875, "grad_norm": 0.8337467312812805, "learning_rate": 0.0005094815567512587, "loss": 0.2672, "mean_token_accuracy": 0.9313560128211975, "num_tokens": 970954.0, "step": 110 }, { "entropy": 2.1024146378040314, "epoch": 0.40180995475113124, "grad_norm": 0.8214333057403564, "learning_rate": 0.0005094297070489455, "loss": 0.3146, "mean_token_accuracy": 0.9289091974496841, "num_tokens": 979929.0, "step": 111 }, { "entropy": 2.260519325733185, "epoch": 0.40542986425339367, "grad_norm": 1.1298810243606567, "learning_rate": 0.0005093769899313996, "loss": 0.3055, "mean_token_accuracy": 0.9213490188121796, "num_tokens": 988477.0, "step": 112 }, { "entropy": 2.2228699326515198, "epoch": 0.4090497737556561, "grad_norm": 0.8601953983306885, "learning_rate": 0.0005093234055988475, "loss": 0.2738, "mean_token_accuracy": 0.920888364315033, "num_tokens": 997091.0, "step": 113 }, { "entropy": 2.2165185809135437, "epoch": 0.41266968325791853, "grad_norm": 0.6331561803817749, "learning_rate": 0.0005092689542548091, "loss": 0.2241, "mean_token_accuracy": 0.9408514499664307, "num_tokens": 1005866.0, "step": 114 }, { "entropy": 2.324040472507477, "epoch": 0.416289592760181, "grad_norm": 0.680496096611023, "learning_rate": 0.0005092136361060975, "loss": 0.2454, "mean_token_accuracy": 0.9433349967002869, "num_tokens": 1014277.0, "step": 115 }, { "entropy": 2.413789749145508, "epoch": 0.41990950226244345, "grad_norm": 0.7489557862281799, "learning_rate": 0.0005091574513628183, "loss": 0.2856, "mean_token_accuracy": 0.934124082326889, "num_tokens": 1023032.0, "step": 116 }, { "entropy": 2.4693005681037903, "epoch": 0.4235294117647059, "grad_norm": 0.6842612624168396, "learning_rate": 0.0005091004002383682, "loss": 0.2778, "mean_token_accuracy": 0.9386793673038483, "num_tokens": 1031883.0, "step": 117 }, { "entropy": 2.4351969361305237, "epoch": 0.4271493212669683, "grad_norm": 0.9150674343109131, "learning_rate": 0.0005090424829494347, "loss": 0.3151, "mean_token_accuracy": 0.9177709072828293, "num_tokens": 1040985.0, "step": 118 }, { "entropy": 2.5141562819480896, "epoch": 0.4307692307692308, "grad_norm": 1.0200655460357666, "learning_rate": 0.000508983699715995, "loss": 0.5134, "mean_token_accuracy": 0.8835459351539612, "num_tokens": 1049949.0, "step": 119 }, { "entropy": 2.479240596294403, "epoch": 0.4343891402714932, "grad_norm": 0.783278226852417, "learning_rate": 0.0005089240507613151, "loss": 0.2745, "mean_token_accuracy": 0.9389322698116302, "num_tokens": 1058953.0, "step": 120 }, { "entropy": 2.457803785800934, "epoch": 0.43800904977375565, "grad_norm": 0.7620834112167358, "learning_rate": 0.0005088635363119497, "loss": 0.3394, "mean_token_accuracy": 0.9145695865154266, "num_tokens": 1068624.0, "step": 121 }, { "entropy": 2.4909247756004333, "epoch": 0.4416289592760181, "grad_norm": 0.5868712067604065, "learning_rate": 0.0005088021565977403, "loss": 0.1726, "mean_token_accuracy": 0.9567564129829407, "num_tokens": 1077686.0, "step": 122 }, { "entropy": 2.5540462732315063, "epoch": 0.4452488687782805, "grad_norm": 1.1467291116714478, "learning_rate": 0.0005087399118518148, "loss": 0.2617, "mean_token_accuracy": 0.9329706132411957, "num_tokens": 1086230.0, "step": 123 }, { "entropy": 2.377680242061615, "epoch": 0.448868778280543, "grad_norm": 0.7021825909614563, "learning_rate": 0.0005086768023105866, "loss": 0.4124, "mean_token_accuracy": 0.9093360006809235, "num_tokens": 1095867.0, "step": 124 }, { "entropy": 2.55239599943161, "epoch": 0.45248868778280543, "grad_norm": 0.5947801470756531, "learning_rate": 0.0005086128282137538, "loss": 0.2752, "mean_token_accuracy": 0.9248816668987274, "num_tokens": 1105003.0, "step": 125 }, { "entropy": 2.4695483446121216, "epoch": 0.45610859728506786, "grad_norm": 1.345604658126831, "learning_rate": 0.0005085479898042985, "loss": 0.2577, "mean_token_accuracy": 0.9318550229072571, "num_tokens": 1114162.0, "step": 126 }, { "entropy": 2.4898732900619507, "epoch": 0.4597285067873303, "grad_norm": 0.8534179329872131, "learning_rate": 0.0005084822873284848, "loss": 0.3013, "mean_token_accuracy": 0.9195661097764969, "num_tokens": 1123457.0, "step": 127 }, { "entropy": 2.5951223969459534, "epoch": 0.4633484162895928, "grad_norm": 1.1677368879318237, "learning_rate": 0.0005084157210358592, "loss": 0.1612, "mean_token_accuracy": 0.9599333852529526, "num_tokens": 1131774.0, "step": 128 }, { "entropy": 2.7315847873687744, "epoch": 0.4669683257918552, "grad_norm": 0.7633224129676819, "learning_rate": 0.0005083482911792492, "loss": 0.2437, "mean_token_accuracy": 0.9487509876489639, "num_tokens": 1140301.0, "step": 129 }, { "entropy": 2.6348633766174316, "epoch": 0.47058823529411764, "grad_norm": 0.7573317885398865, "learning_rate": 0.0005082799980147617, "loss": 0.2426, "mean_token_accuracy": 0.947308748960495, "num_tokens": 1148929.0, "step": 130 }, { "entropy": 2.60002738237381, "epoch": 0.47420814479638007, "grad_norm": 1.8195319175720215, "learning_rate": 0.0005082108418017829, "loss": 0.1792, "mean_token_accuracy": 0.9512491375207901, "num_tokens": 1157682.0, "step": 131 }, { "entropy": 2.5319923162460327, "epoch": 0.47782805429864256, "grad_norm": 0.6342993378639221, "learning_rate": 0.0005081408228029771, "loss": 0.1843, "mean_token_accuracy": 0.9440758228302002, "num_tokens": 1166687.0, "step": 132 }, { "entropy": 2.5666881799697876, "epoch": 0.481447963800905, "grad_norm": 0.8979415893554688, "learning_rate": 0.0005080699412842852, "loss": 0.4824, "mean_token_accuracy": 0.8837443292140961, "num_tokens": 1175746.0, "step": 133 }, { "entropy": 2.6854636669158936, "epoch": 0.4850678733031674, "grad_norm": 0.8302125334739685, "learning_rate": 0.0005079981975149243, "loss": 0.267, "mean_token_accuracy": 0.9279022663831711, "num_tokens": 1184196.0, "step": 134 }, { "entropy": 2.564552128314972, "epoch": 0.48868778280542985, "grad_norm": 0.6785959005355835, "learning_rate": 0.0005079255917673863, "loss": 0.2031, "mean_token_accuracy": 0.9463823586702347, "num_tokens": 1192982.0, "step": 135 }, { "entropy": 2.673682928085327, "epoch": 0.49230769230769234, "grad_norm": 1.4760410785675049, "learning_rate": 0.0005078521243174371, "loss": 0.4791, "mean_token_accuracy": 0.8969505727291107, "num_tokens": 1201454.0, "step": 136 }, { "entropy": 2.6232714653015137, "epoch": 0.49592760180995477, "grad_norm": 0.7845668792724609, "learning_rate": 0.0005077777954441157, "loss": 0.2472, "mean_token_accuracy": 0.9404618591070175, "num_tokens": 1210182.0, "step": 137 }, { "entropy": 2.5614060163497925, "epoch": 0.4995475113122172, "grad_norm": 0.725419819355011, "learning_rate": 0.0005077026054297322, "loss": 0.3643, "mean_token_accuracy": 0.9193316847085953, "num_tokens": 1219487.0, "step": 138 }, { "entropy": 2.5907246470451355, "epoch": 0.5031674208144796, "grad_norm": 0.7741782665252686, "learning_rate": 0.0005076265545598682, "loss": 0.276, "mean_token_accuracy": 0.9447730481624603, "num_tokens": 1228066.0, "step": 139 }, { "entropy": 2.531104028224945, "epoch": 0.5067873303167421, "grad_norm": 0.680992603302002, "learning_rate": 0.0005075496431233745, "loss": 0.2004, "mean_token_accuracy": 0.9470729678869247, "num_tokens": 1236980.0, "step": 140 }, { "entropy": 2.590231478214264, "epoch": 0.5104072398190045, "grad_norm": 0.8260406255722046, "learning_rate": 0.0005074718714123704, "loss": 0.2756, "mean_token_accuracy": 0.9301882535219193, "num_tokens": 1245565.0, "step": 141 }, { "entropy": 2.4858668446540833, "epoch": 0.5140271493212669, "grad_norm": 0.8085922598838806, "learning_rate": 0.0005073932397222429, "loss": 0.2314, "mean_token_accuracy": 0.9449103325605392, "num_tokens": 1254366.0, "step": 142 }, { "entropy": 2.5374304056167603, "epoch": 0.5176470588235295, "grad_norm": 0.7858129143714905, "learning_rate": 0.0005073137483516452, "loss": 0.1622, "mean_token_accuracy": 0.9510673582553864, "num_tokens": 1263197.0, "step": 143 }, { "entropy": 2.608425199985504, "epoch": 0.5212669683257919, "grad_norm": 1.2698506116867065, "learning_rate": 0.0005072333976024957, "loss": 0.1729, "mean_token_accuracy": 0.9509973376989365, "num_tokens": 1271725.0, "step": 144 }, { "entropy": 2.437038242816925, "epoch": 0.5248868778280543, "grad_norm": 1.0788538455963135, "learning_rate": 0.0005071521877799765, "loss": 0.3344, "mean_token_accuracy": 0.9166721999645233, "num_tokens": 1280963.0, "step": 145 }, { "entropy": 2.589951515197754, "epoch": 0.5285067873303168, "grad_norm": 0.9228294491767883, "learning_rate": 0.0005070701191925332, "loss": 0.3095, "mean_token_accuracy": 0.9239777624607086, "num_tokens": 1289683.0, "step": 146 }, { "entropy": 2.575794994831085, "epoch": 0.5321266968325792, "grad_norm": 1.359767198562622, "learning_rate": 0.0005069871921518726, "loss": 0.2447, "mean_token_accuracy": 0.9374738186597824, "num_tokens": 1298397.0, "step": 147 }, { "entropy": 2.5628358721733093, "epoch": 0.5357466063348416, "grad_norm": 0.9870713353157043, "learning_rate": 0.000506903406972962, "loss": 0.4824, "mean_token_accuracy": 0.9027767181396484, "num_tokens": 1307191.0, "step": 148 }, { "entropy": 2.5513240098953247, "epoch": 0.539366515837104, "grad_norm": 0.7921387553215027, "learning_rate": 0.0005068187639740286, "loss": 0.3278, "mean_token_accuracy": 0.9161934554576874, "num_tokens": 1315878.0, "step": 149 }, { "entropy": 2.526439070701599, "epoch": 0.5429864253393665, "grad_norm": 0.6320391297340393, "learning_rate": 0.000506733263476557, "loss": 0.1701, "mean_token_accuracy": 0.9575318098068237, "num_tokens": 1324786.0, "step": 150 }, { "entropy": 2.4837265014648438, "epoch": 0.5466063348416289, "grad_norm": 0.5369354486465454, "learning_rate": 0.000506646905805289, "loss": 0.1328, "mean_token_accuracy": 0.9636050164699554, "num_tokens": 1333766.0, "step": 151 }, { "entropy": 2.5264737010002136, "epoch": 0.5502262443438914, "grad_norm": 0.7346852421760559, "learning_rate": 0.0005065596912882222, "loss": 0.2012, "mean_token_accuracy": 0.9448132663965225, "num_tokens": 1343004.0, "step": 152 }, { "entropy": 2.569309651851654, "epoch": 0.5538461538461539, "grad_norm": 0.9926508069038391, "learning_rate": 0.0005064716202566082, "loss": 0.2831, "mean_token_accuracy": 0.9332023113965988, "num_tokens": 1351561.0, "step": 153 }, { "entropy": 2.3148274421691895, "epoch": 0.5574660633484163, "grad_norm": 0.6301954984664917, "learning_rate": 0.0005063826930449523, "loss": 0.3622, "mean_token_accuracy": 0.9349419325590134, "num_tokens": 1360997.0, "step": 154 }, { "entropy": 2.497675657272339, "epoch": 0.5610859728506787, "grad_norm": 0.8846175670623779, "learning_rate": 0.000506292909991011, "loss": 0.2314, "mean_token_accuracy": 0.9468862265348434, "num_tokens": 1369600.0, "step": 155 }, { "entropy": 2.313987612724304, "epoch": 0.5647058823529412, "grad_norm": 0.5701894164085388, "learning_rate": 0.0005062022714357922, "loss": 0.2154, "mean_token_accuracy": 0.945093959569931, "num_tokens": 1379125.0, "step": 156 }, { "entropy": 2.4019755125045776, "epoch": 0.5683257918552036, "grad_norm": 0.8769335746765137, "learning_rate": 0.0005061107777235524, "loss": 0.3565, "mean_token_accuracy": 0.9133864492177963, "num_tokens": 1388111.0, "step": 157 }, { "entropy": 2.3127577900886536, "epoch": 0.571945701357466, "grad_norm": 1.1026453971862793, "learning_rate": 0.0005060184292017965, "loss": 0.2897, "mean_token_accuracy": 0.899736076593399, "num_tokens": 1397528.0, "step": 158 }, { "entropy": 2.2682697772979736, "epoch": 0.5755656108597285, "grad_norm": 0.5426591038703918, "learning_rate": 0.000505925226221276, "loss": 0.167, "mean_token_accuracy": 0.9609879851341248, "num_tokens": 1406809.0, "step": 159 }, { "entropy": 2.4639336466789246, "epoch": 0.579185520361991, "grad_norm": 0.6552363038063049, "learning_rate": 0.0005058311691359875, "loss": 0.2511, "mean_token_accuracy": 0.9355164766311646, "num_tokens": 1415498.0, "step": 160 }, { "entropy": 2.467900663614273, "epoch": 0.5828054298642534, "grad_norm": 0.7168154120445251, "learning_rate": 0.000505736258303172, "loss": 0.234, "mean_token_accuracy": 0.9450509995222092, "num_tokens": 1424524.0, "step": 161 }, { "entropy": 2.3683157563209534, "epoch": 0.5864253393665159, "grad_norm": 0.6433501839637756, "learning_rate": 0.0005056404940833128, "loss": 0.3441, "mean_token_accuracy": 0.9261108189821243, "num_tokens": 1434194.0, "step": 162 }, { "entropy": 2.4686295986175537, "epoch": 0.5900452488687783, "grad_norm": 0.9615177512168884, "learning_rate": 0.0005055438768401348, "loss": 0.1492, "mean_token_accuracy": 0.966903567314148, "num_tokens": 1442972.0, "step": 163 }, { "entropy": 2.5551892518997192, "epoch": 0.5936651583710407, "grad_norm": 0.4957484006881714, "learning_rate": 0.0005054464069406023, "loss": 0.1242, "mean_token_accuracy": 0.969713419675827, "num_tokens": 1451324.0, "step": 164 }, { "entropy": 2.554121434688568, "epoch": 0.5972850678733032, "grad_norm": 0.7399498224258423, "learning_rate": 0.0005053480847549187, "loss": 0.206, "mean_token_accuracy": 0.9498797357082367, "num_tokens": 1459698.0, "step": 165 }, { "entropy": 2.5181015729904175, "epoch": 0.6009049773755656, "grad_norm": 0.7433251142501831, "learning_rate": 0.0005052489106565241, "loss": 0.2883, "mean_token_accuracy": 0.9419967085123062, "num_tokens": 1468460.0, "step": 166 }, { "entropy": 2.3073930144309998, "epoch": 0.604524886877828, "grad_norm": 0.5920398831367493, "learning_rate": 0.0005051488850220941, "loss": 0.197, "mean_token_accuracy": 0.952111005783081, "num_tokens": 1477579.0, "step": 167 }, { "entropy": 2.532376289367676, "epoch": 0.6081447963800904, "grad_norm": 0.7033098936080933, "learning_rate": 0.0005050480082315392, "loss": 0.2122, "mean_token_accuracy": 0.9488633275032043, "num_tokens": 1486307.0, "step": 168 }, { "entropy": 2.397290349006653, "epoch": 0.611764705882353, "grad_norm": 0.8026869893074036, "learning_rate": 0.0005049462806680021, "loss": 0.2541, "mean_token_accuracy": 0.9427233040332794, "num_tokens": 1495152.0, "step": 169 }, { "entropy": 2.464823842048645, "epoch": 0.6153846153846154, "grad_norm": 0.6508225798606873, "learning_rate": 0.0005048437027178571, "loss": 0.2639, "mean_token_accuracy": 0.9391255974769592, "num_tokens": 1503903.0, "step": 170 }, { "entropy": 2.520734131336212, "epoch": 0.6190045248868778, "grad_norm": 0.8373616337776184, "learning_rate": 0.0005047402747707084, "loss": 0.3078, "mean_token_accuracy": 0.9302930980920792, "num_tokens": 1512588.0, "step": 171 }, { "entropy": 2.388108015060425, "epoch": 0.6226244343891403, "grad_norm": 0.6334089636802673, "learning_rate": 0.0005046359972193884, "loss": 0.1372, "mean_token_accuracy": 0.9666119515895844, "num_tokens": 1522011.0, "step": 172 }, { "entropy": 2.537126660346985, "epoch": 0.6262443438914027, "grad_norm": 0.7665116190910339, "learning_rate": 0.0005045308704599566, "loss": 0.2603, "mean_token_accuracy": 0.9350012242794037, "num_tokens": 1530767.0, "step": 173 }, { "entropy": 2.567205488681793, "epoch": 0.6298642533936651, "grad_norm": 0.8043875098228455, "learning_rate": 0.0005044248948916977, "loss": 0.2497, "mean_token_accuracy": 0.9400482773780823, "num_tokens": 1539971.0, "step": 174 }, { "entropy": 2.585887610912323, "epoch": 0.6334841628959276, "grad_norm": 0.5282150506973267, "learning_rate": 0.0005043180709171206, "loss": 0.1126, "mean_token_accuracy": 0.9680279046297073, "num_tokens": 1548971.0, "step": 175 }, { "entropy": 2.4289392232894897, "epoch": 0.63710407239819, "grad_norm": 0.6838382482528687, "learning_rate": 0.0005042103989419563, "loss": 0.2076, "mean_token_accuracy": 0.9468046277761459, "num_tokens": 1558403.0, "step": 176 }, { "entropy": 2.6080575585365295, "epoch": 0.6407239819004525, "grad_norm": 0.9058650732040405, "learning_rate": 0.0005041018793751566, "loss": 0.1781, "mean_token_accuracy": 0.9432647377252579, "num_tokens": 1567209.0, "step": 177 }, { "entropy": 2.5212480425834656, "epoch": 0.644343891402715, "grad_norm": 0.796381950378418, "learning_rate": 0.0005039925126288929, "loss": 0.2286, "mean_token_accuracy": 0.9305787235498428, "num_tokens": 1576255.0, "step": 178 }, { "entropy": 2.588195264339447, "epoch": 0.6479638009049774, "grad_norm": 0.6489388942718506, "learning_rate": 0.0005038822991185536, "loss": 0.1717, "mean_token_accuracy": 0.9572225511074066, "num_tokens": 1585335.0, "step": 179 }, { "entropy": 2.609215259552002, "epoch": 0.6515837104072398, "grad_norm": 0.8551130294799805, "learning_rate": 0.0005037712392627441, "loss": 0.2358, "mean_token_accuracy": 0.9529621452093124, "num_tokens": 1594354.0, "step": 180 }, { "entropy": 2.4199504256248474, "epoch": 0.6552036199095023, "grad_norm": 0.5775637030601501, "learning_rate": 0.0005036593334832836, "loss": 0.2402, "mean_token_accuracy": 0.9437069743871689, "num_tokens": 1603750.0, "step": 181 }, { "entropy": 2.516424596309662, "epoch": 0.6588235294117647, "grad_norm": 0.6967942118644714, "learning_rate": 0.0005035465822052047, "loss": 0.1624, "mean_token_accuracy": 0.9518167823553085, "num_tokens": 1612474.0, "step": 182 }, { "entropy": 2.463354170322418, "epoch": 0.6624434389140271, "grad_norm": 0.49672600626945496, "learning_rate": 0.000503432985856751, "loss": 0.1654, "mean_token_accuracy": 0.9564716964960098, "num_tokens": 1621563.0, "step": 183 }, { "entropy": 2.4456416964530945, "epoch": 0.6660633484162896, "grad_norm": 0.6207183003425598, "learning_rate": 0.000503318544869376, "loss": 0.1918, "mean_token_accuracy": 0.9476529806852341, "num_tokens": 1630801.0, "step": 184 }, { "entropy": 2.641440451145172, "epoch": 0.669683257918552, "grad_norm": 1.220821499824524, "learning_rate": 0.000503203259677741, "loss": 0.4019, "mean_token_accuracy": 0.9172120243310928, "num_tokens": 1639522.0, "step": 185 }, { "entropy": 2.6447275280952454, "epoch": 0.6733031674208145, "grad_norm": 0.7546490430831909, "learning_rate": 0.000503087130719714, "loss": 0.2484, "mean_token_accuracy": 0.9387800246477127, "num_tokens": 1647964.0, "step": 186 }, { "entropy": 2.4657886028289795, "epoch": 0.676923076923077, "grad_norm": 0.7679230570793152, "learning_rate": 0.0005029701584363675, "loss": 0.2659, "mean_token_accuracy": 0.930300235748291, "num_tokens": 1657181.0, "step": 187 }, { "entropy": 2.37973552942276, "epoch": 0.6805429864253394, "grad_norm": 0.7473414540290833, "learning_rate": 0.0005028523432719772, "loss": 0.32, "mean_token_accuracy": 0.9233052879571915, "num_tokens": 1666477.0, "step": 188 }, { "entropy": 2.5238219499588013, "epoch": 0.6841628959276018, "grad_norm": 0.5573673248291016, "learning_rate": 0.0005027336856740201, "loss": 0.1846, "mean_token_accuracy": 0.9445535093545914, "num_tokens": 1675002.0, "step": 189 }, { "entropy": 2.456815242767334, "epoch": 0.6877828054298643, "grad_norm": 0.47237634658813477, "learning_rate": 0.0005026141860931728, "loss": 0.1065, "mean_token_accuracy": 0.964375838637352, "num_tokens": 1683623.0, "step": 190 }, { "entropy": 2.548456132411957, "epoch": 0.6914027149321267, "grad_norm": 0.7699162364006042, "learning_rate": 0.00050249384498331, "loss": 0.1985, "mean_token_accuracy": 0.9438774734735489, "num_tokens": 1691718.0, "step": 191 }, { "entropy": 2.4514941573143005, "epoch": 0.6950226244343891, "grad_norm": 1.4113538265228271, "learning_rate": 0.0005023726628015027, "loss": 0.4541, "mean_token_accuracy": 0.9207872897386551, "num_tokens": 1699824.0, "step": 192 }, { "entropy": 2.2560824751853943, "epoch": 0.6986425339366515, "grad_norm": 0.6007948517799377, "learning_rate": 0.0005022506400080161, "loss": 0.1871, "mean_token_accuracy": 0.9502484053373337, "num_tokens": 1708722.0, "step": 193 }, { "entropy": 2.1833614110946655, "epoch": 0.702262443438914, "grad_norm": 0.7005489468574524, "learning_rate": 0.0005021277770663082, "loss": 0.2222, "mean_token_accuracy": 0.9386974722146988, "num_tokens": 1717592.0, "step": 194 }, { "entropy": 2.2031923830509186, "epoch": 0.7058823529411765, "grad_norm": 0.5830584764480591, "learning_rate": 0.0005020040744430284, "loss": 0.1106, "mean_token_accuracy": 0.9719562232494354, "num_tokens": 1726149.0, "step": 195 }, { "entropy": 2.199785351753235, "epoch": 0.709502262443439, "grad_norm": 0.7465847134590149, "learning_rate": 0.0005018795326080149, "loss": 0.1935, "mean_token_accuracy": 0.9497270882129669, "num_tokens": 1734541.0, "step": 196 }, { "entropy": 2.1103186309337616, "epoch": 0.7131221719457014, "grad_norm": 1.0782264471054077, "learning_rate": 0.0005017541520342934, "loss": 0.2895, "mean_token_accuracy": 0.9274258464574814, "num_tokens": 1743722.0, "step": 197 }, { "entropy": 2.2248528599739075, "epoch": 0.7167420814479638, "grad_norm": 0.6409780979156494, "learning_rate": 0.0005016279331980754, "loss": 0.1425, "mean_token_accuracy": 0.96550352871418, "num_tokens": 1752156.0, "step": 198 }, { "entropy": 2.19924658536911, "epoch": 0.7203619909502262, "grad_norm": 0.7019934058189392, "learning_rate": 0.0005015008765787561, "loss": 0.1969, "mean_token_accuracy": 0.9429282248020172, "num_tokens": 1760978.0, "step": 199 }, { "entropy": 2.297484815120697, "epoch": 0.7239819004524887, "grad_norm": 0.7826490998268127, "learning_rate": 0.0005013729826589127, "loss": 0.2399, "mean_token_accuracy": 0.9416657984256744, "num_tokens": 1769533.0, "step": 200 }, { "entropy": 2.2471498548984528, "epoch": 0.7276018099547511, "grad_norm": 0.621566891670227, "learning_rate": 0.0005012442519243027, "loss": 0.1876, "mean_token_accuracy": 0.9460793286561966, "num_tokens": 1778286.0, "step": 201 }, { "entropy": 2.2212815284729004, "epoch": 0.7312217194570135, "grad_norm": 0.622283935546875, "learning_rate": 0.0005011146848638616, "loss": 0.1617, "mean_token_accuracy": 0.9482609927654266, "num_tokens": 1787392.0, "step": 202 }, { "entropy": 2.308752655982971, "epoch": 0.7348416289592761, "grad_norm": 0.7263973355293274, "learning_rate": 0.0005009842819697018, "loss": 0.2043, "mean_token_accuracy": 0.9378403723239899, "num_tokens": 1796133.0, "step": 203 }, { "entropy": 2.3376497626304626, "epoch": 0.7384615384615385, "grad_norm": 0.5493630766868591, "learning_rate": 0.0005008530437371101, "loss": 0.1145, "mean_token_accuracy": 0.970586434006691, "num_tokens": 1804769.0, "step": 204 }, { "entropy": 2.373005509376526, "epoch": 0.7420814479638009, "grad_norm": 0.6313483119010925, "learning_rate": 0.0005007209706645461, "loss": 0.2183, "mean_token_accuracy": 0.9472708404064178, "num_tokens": 1813364.0, "step": 205 }, { "entropy": 2.468949854373932, "epoch": 0.7457013574660634, "grad_norm": 1.0125588178634644, "learning_rate": 0.00050058806325364, "loss": 0.2225, "mean_token_accuracy": 0.9351322948932648, "num_tokens": 1822149.0, "step": 206 }, { "entropy": 2.2420623898506165, "epoch": 0.7493212669683258, "grad_norm": 0.913761556148529, "learning_rate": 0.0005004543220091911, "loss": 0.2386, "mean_token_accuracy": 0.9453927427530289, "num_tokens": 1831533.0, "step": 207 }, { "entropy": 2.2966006994247437, "epoch": 0.7529411764705882, "grad_norm": 0.7386876940727234, "learning_rate": 0.0005003197474391658, "loss": 0.1768, "mean_token_accuracy": 0.949826255440712, "num_tokens": 1840157.0, "step": 208 }, { "entropy": 2.306001305580139, "epoch": 0.7565610859728507, "grad_norm": 0.8900741338729858, "learning_rate": 0.0005001843400546955, "loss": 0.2899, "mean_token_accuracy": 0.9241485595703125, "num_tokens": 1848898.0, "step": 209 }, { "entropy": 2.117514967918396, "epoch": 0.7601809954751131, "grad_norm": 0.644622802734375, "learning_rate": 0.0005000481003700746, "loss": 0.2714, "mean_token_accuracy": 0.9299416691064835, "num_tokens": 1858330.0, "step": 210 }, { "entropy": 2.3768392205238342, "epoch": 0.7638009049773755, "grad_norm": 0.9724471569061279, "learning_rate": 0.0004999110289027587, "loss": 0.1633, "mean_token_accuracy": 0.9550061523914337, "num_tokens": 1866806.0, "step": 211 }, { "entropy": 2.090679556131363, "epoch": 0.7674208144796381, "grad_norm": 0.5419518351554871, "learning_rate": 0.0004997731261733628, "loss": 0.1369, "mean_token_accuracy": 0.9619670957326889, "num_tokens": 1875937.0, "step": 212 }, { "entropy": 2.099909245967865, "epoch": 0.7710407239819005, "grad_norm": 0.6858121752738953, "learning_rate": 0.0004996343927056592, "loss": 0.1633, "mean_token_accuracy": 0.9528832882642746, "num_tokens": 1885145.0, "step": 213 }, { "entropy": 2.130059242248535, "epoch": 0.7746606334841629, "grad_norm": 0.7691065073013306, "learning_rate": 0.000499494829026575, "loss": 0.348, "mean_token_accuracy": 0.9162366837263107, "num_tokens": 1894255.0, "step": 214 }, { "entropy": 2.191373586654663, "epoch": 0.7782805429864253, "grad_norm": 0.7427324652671814, "learning_rate": 0.000499354435666191, "loss": 0.3373, "mean_token_accuracy": 0.9311849176883698, "num_tokens": 1902981.0, "step": 215 }, { "entropy": 2.1425398886203766, "epoch": 0.7819004524886878, "grad_norm": 0.6410383582115173, "learning_rate": 0.0004992132131577392, "loss": 0.2079, "mean_token_accuracy": 0.949742391705513, "num_tokens": 1912253.0, "step": 216 }, { "entropy": 2.1396586298942566, "epoch": 0.7855203619909502, "grad_norm": 0.5689850449562073, "learning_rate": 0.0004990711620376003, "loss": 0.1999, "mean_token_accuracy": 0.946034774184227, "num_tokens": 1921409.0, "step": 217 }, { "entropy": 2.2237865328788757, "epoch": 0.7891402714932126, "grad_norm": 0.6408923864364624, "learning_rate": 0.0004989282828453029, "loss": 0.2452, "mean_token_accuracy": 0.9510752111673355, "num_tokens": 1930397.0, "step": 218 }, { "entropy": 2.234771251678467, "epoch": 0.7927601809954751, "grad_norm": 0.751447856426239, "learning_rate": 0.0004987845761235203, "loss": 0.3057, "mean_token_accuracy": 0.9217256307601929, "num_tokens": 1939172.0, "step": 219 }, { "entropy": 2.2653815746307373, "epoch": 0.7963800904977375, "grad_norm": 0.751455545425415, "learning_rate": 0.0004986400424180688, "loss": 0.3245, "mean_token_accuracy": 0.9256318956613541, "num_tokens": 1947979.0, "step": 220 }, { "entropy": 2.3123483061790466, "epoch": 0.8, "grad_norm": 0.5939492583274841, "learning_rate": 0.0004984946822779061, "loss": 0.2429, "mean_token_accuracy": 0.9333402067422867, "num_tokens": 1956814.0, "step": 221 }, { "entropy": 2.3289234042167664, "epoch": 0.8036199095022625, "grad_norm": 0.5591994524002075, "learning_rate": 0.0004983484962551284, "loss": 0.1507, "mean_token_accuracy": 0.96376833319664, "num_tokens": 1965641.0, "step": 222 }, { "entropy": 2.4314023852348328, "epoch": 0.8072398190045249, "grad_norm": 0.5805783271789551, "learning_rate": 0.0004982014849049687, "loss": 0.2049, "mean_token_accuracy": 0.9586948156356812, "num_tokens": 1974180.0, "step": 223 }, { "entropy": 2.3639765977859497, "epoch": 0.8108597285067873, "grad_norm": 0.6924490332603455, "learning_rate": 0.0004980536487857951, "loss": 0.2137, "mean_token_accuracy": 0.9441423565149307, "num_tokens": 1982744.0, "step": 224 }, { "entropy": 2.3361759781837463, "epoch": 0.8144796380090498, "grad_norm": 0.4579620361328125, "learning_rate": 0.0004979049884591077, "loss": 0.1041, "mean_token_accuracy": 0.9753208309412003, "num_tokens": 1991583.0, "step": 225 }, { "entropy": 2.286989688873291, "epoch": 0.8180995475113122, "grad_norm": 0.6489312052726746, "learning_rate": 0.0004977555044895377, "loss": 0.2131, "mean_token_accuracy": 0.9520440250635147, "num_tokens": 2000193.0, "step": 226 }, { "entropy": 2.288672834634781, "epoch": 0.8217194570135746, "grad_norm": 0.7738961577415466, "learning_rate": 0.0004976051974448441, "loss": 0.325, "mean_token_accuracy": 0.9060750156641006, "num_tokens": 2009233.0, "step": 227 }, { "entropy": 2.288076102733612, "epoch": 0.8253393665158371, "grad_norm": 0.7042292356491089, "learning_rate": 0.0004974540678959123, "loss": 0.2206, "mean_token_accuracy": 0.94980289041996, "num_tokens": 2018417.0, "step": 228 }, { "entropy": 2.217707335948944, "epoch": 0.8289592760180996, "grad_norm": 0.6834208369255066, "learning_rate": 0.0004973021164167515, "loss": 0.2907, "mean_token_accuracy": 0.951058641076088, "num_tokens": 2027822.0, "step": 229 }, { "entropy": 2.1610691249370575, "epoch": 0.832579185520362, "grad_norm": 0.665044903755188, "learning_rate": 0.0004971493435844928, "loss": 0.2387, "mean_token_accuracy": 0.9506549835205078, "num_tokens": 2036983.0, "step": 230 }, { "entropy": 2.321135401725769, "epoch": 0.8361990950226245, "grad_norm": 0.8208273649215698, "learning_rate": 0.0004969957499793869, "loss": 0.2399, "mean_token_accuracy": 0.9435176253318787, "num_tokens": 2045574.0, "step": 231 }, { "entropy": 2.1943611800670624, "epoch": 0.8398190045248869, "grad_norm": 0.6293840408325195, "learning_rate": 0.0004968413361848019, "loss": 0.1784, "mean_token_accuracy": 0.9559669345617294, "num_tokens": 2054336.0, "step": 232 }, { "entropy": 2.2722273468971252, "epoch": 0.8434389140271493, "grad_norm": 0.6535817980766296, "learning_rate": 0.0004966861027872211, "loss": 0.1675, "mean_token_accuracy": 0.9532535970211029, "num_tokens": 2063225.0, "step": 233 }, { "entropy": 2.3278334736824036, "epoch": 0.8470588235294118, "grad_norm": 1.1610206365585327, "learning_rate": 0.0004965300503762406, "loss": 0.1588, "mean_token_accuracy": 0.9641145765781403, "num_tokens": 2071738.0, "step": 234 }, { "entropy": 2.202972888946533, "epoch": 0.8506787330316742, "grad_norm": 0.4811885356903076, "learning_rate": 0.0004963731795445675, "loss": 0.0813, "mean_token_accuracy": 0.9766911715269089, "num_tokens": 2080375.0, "step": 235 }, { "entropy": 2.2433705925941467, "epoch": 0.8542986425339366, "grad_norm": 0.8113318681716919, "learning_rate": 0.0004962154908880171, "loss": 0.2965, "mean_token_accuracy": 0.9290606826543808, "num_tokens": 2089522.0, "step": 236 }, { "entropy": 2.2168884873390198, "epoch": 0.857918552036199, "grad_norm": 0.6128959655761719, "learning_rate": 0.0004960569850055111, "loss": 0.1724, "mean_token_accuracy": 0.9603384286165237, "num_tokens": 2098162.0, "step": 237 }, { "entropy": 2.2738255858421326, "epoch": 0.8615384615384616, "grad_norm": 0.8557195663452148, "learning_rate": 0.0004958976624990749, "loss": 0.2596, "mean_token_accuracy": 0.9487071484327316, "num_tokens": 2106984.0, "step": 238 }, { "entropy": 2.2031425833702087, "epoch": 0.865158371040724, "grad_norm": 0.6621816158294678, "learning_rate": 0.0004957375239738359, "loss": 0.232, "mean_token_accuracy": 0.9525040090084076, "num_tokens": 2116040.0, "step": 239 }, { "entropy": 2.374737858772278, "epoch": 0.8687782805429864, "grad_norm": 0.8481062054634094, "learning_rate": 0.0004955765700380204, "loss": 0.2516, "mean_token_accuracy": 0.9396061599254608, "num_tokens": 2124862.0, "step": 240 }, { "entropy": 2.266704559326172, "epoch": 0.8723981900452489, "grad_norm": 0.6284282803535461, "learning_rate": 0.0004954148013029521, "loss": 0.3244, "mean_token_accuracy": 0.9381244331598282, "num_tokens": 2134018.0, "step": 241 }, { "entropy": 2.3935859203338623, "epoch": 0.8760180995475113, "grad_norm": 1.1564176082611084, "learning_rate": 0.0004952522183830493, "loss": 0.2706, "mean_token_accuracy": 0.9297053664922714, "num_tokens": 2142745.0, "step": 242 }, { "entropy": 2.281618118286133, "epoch": 0.8796380090497737, "grad_norm": 0.5324040055274963, "learning_rate": 0.0004950888218958225, "loss": 0.1573, "mean_token_accuracy": 0.9568462073802948, "num_tokens": 2151607.0, "step": 243 }, { "entropy": 2.230749189853668, "epoch": 0.8832579185520362, "grad_norm": 0.680780291557312, "learning_rate": 0.0004949246124618726, "loss": 0.1956, "mean_token_accuracy": 0.9479999989271164, "num_tokens": 2160904.0, "step": 244 }, { "entropy": 2.21382600069046, "epoch": 0.8868778280542986, "grad_norm": 0.6321626305580139, "learning_rate": 0.0004947595907048877, "loss": 0.2444, "mean_token_accuracy": 0.9376699328422546, "num_tokens": 2170021.0, "step": 245 }, { "entropy": 2.3659472465515137, "epoch": 0.890497737556561, "grad_norm": 0.9778954982757568, "learning_rate": 0.0004945937572516417, "loss": 0.3783, "mean_token_accuracy": 0.9104805737733841, "num_tokens": 2178995.0, "step": 246 }, { "entropy": 2.3233078718185425, "epoch": 0.8941176470588236, "grad_norm": 0.53229820728302, "learning_rate": 0.0004944271127319909, "loss": 0.0759, "mean_token_accuracy": 0.9791453778743744, "num_tokens": 2187823.0, "step": 247 }, { "entropy": 2.2469444274902344, "epoch": 0.897737556561086, "grad_norm": 0.6367197632789612, "learning_rate": 0.0004942596577788728, "loss": 0.2677, "mean_token_accuracy": 0.9392691254615784, "num_tokens": 2196923.0, "step": 248 }, { "entropy": 2.4508965611457825, "epoch": 0.9013574660633484, "grad_norm": 0.6042234897613525, "learning_rate": 0.0004940913930283024, "loss": 0.1102, "mean_token_accuracy": 0.9762090593576431, "num_tokens": 2205400.0, "step": 249 }, { "entropy": 2.365670144557953, "epoch": 0.9049773755656109, "grad_norm": 0.6490639448165894, "learning_rate": 0.0004939223191193707, "loss": 0.1532, "mean_token_accuracy": 0.9489114433526993, "num_tokens": 2214201.0, "step": 250 }, { "entropy": 2.4013625383377075, "epoch": 0.9085972850678733, "grad_norm": 0.5969854593276978, "learning_rate": 0.0004937524366942419, "loss": 0.1273, "mean_token_accuracy": 0.9682519882917404, "num_tokens": 2222979.0, "step": 251 }, { "entropy": 2.4402357935905457, "epoch": 0.9122171945701357, "grad_norm": 0.7559595704078674, "learning_rate": 0.0004935817463981513, "loss": 0.1979, "mean_token_accuracy": 0.9483373910188675, "num_tokens": 2231169.0, "step": 252 }, { "entropy": 2.4673256874084473, "epoch": 0.9158371040723982, "grad_norm": 0.8663308620452881, "learning_rate": 0.0004934102488794023, "loss": 0.2453, "mean_token_accuracy": 0.9408974200487137, "num_tokens": 2240099.0, "step": 253 }, { "entropy": 2.426262080669403, "epoch": 0.9194570135746606, "grad_norm": 0.7920467257499695, "learning_rate": 0.0004932379447893643, "loss": 0.2828, "mean_token_accuracy": 0.9319239109754562, "num_tokens": 2249088.0, "step": 254 }, { "entropy": 2.5018852949142456, "epoch": 0.9230769230769231, "grad_norm": 0.7216617465019226, "learning_rate": 0.0004930648347824701, "loss": 0.1647, "mean_token_accuracy": 0.9551804810762405, "num_tokens": 2257710.0, "step": 255 }, { "entropy": 2.43031644821167, "epoch": 0.9266968325791856, "grad_norm": 0.646794319152832, "learning_rate": 0.0004928909195162138, "loss": 0.1328, "mean_token_accuracy": 0.9663553237915039, "num_tokens": 2266883.0, "step": 256 }, { "entropy": 2.5406370759010315, "epoch": 0.930316742081448, "grad_norm": 0.5482825040817261, "learning_rate": 0.0004927161996511474, "loss": 0.1872, "mean_token_accuracy": 0.9557004272937775, "num_tokens": 2275728.0, "step": 257 }, { "entropy": 2.636320471763611, "epoch": 0.9339366515837104, "grad_norm": 0.7454632520675659, "learning_rate": 0.0004925406758508797, "loss": 0.1461, "mean_token_accuracy": 0.9578974395990372, "num_tokens": 2284319.0, "step": 258 }, { "entropy": 2.6067575812339783, "epoch": 0.9375565610859729, "grad_norm": 0.8695769309997559, "learning_rate": 0.000492364348782072, "loss": 0.1712, "mean_token_accuracy": 0.9652896523475647, "num_tokens": 2293035.0, "step": 259 }, { "entropy": 2.5837162137031555, "epoch": 0.9411764705882353, "grad_norm": 0.5752995014190674, "learning_rate": 0.0004921872191144371, "loss": 0.1398, "mean_token_accuracy": 0.9553333520889282, "num_tokens": 2301802.0, "step": 260 }, { "entropy": 2.713033616542816, "epoch": 0.9447963800904977, "grad_norm": 0.85626620054245, "learning_rate": 0.0004920092875207363, "loss": 0.2207, "mean_token_accuracy": 0.9468346834182739, "num_tokens": 2309981.0, "step": 261 }, { "entropy": 2.400112509727478, "epoch": 0.9484162895927601, "grad_norm": 0.6766608953475952, "learning_rate": 0.0004918305546767764, "loss": 0.1644, "mean_token_accuracy": 0.9502440094947815, "num_tokens": 2319212.0, "step": 262 }, { "entropy": 2.503827154636383, "epoch": 0.9520361990950226, "grad_norm": 0.789470911026001, "learning_rate": 0.0004916510212614072, "loss": 0.2117, "mean_token_accuracy": 0.9454390555620193, "num_tokens": 2328234.0, "step": 263 }, { "entropy": 2.669040560722351, "epoch": 0.9556561085972851, "grad_norm": 0.9579212069511414, "learning_rate": 0.0004914706879565197, "loss": 0.2193, "mean_token_accuracy": 0.9321542829275131, "num_tokens": 2336543.0, "step": 264 }, { "entropy": 2.507073998451233, "epoch": 0.9592760180995475, "grad_norm": 0.5315744876861572, "learning_rate": 0.000491289555447043, "loss": 0.0851, "mean_token_accuracy": 0.9771326780319214, "num_tokens": 2345292.0, "step": 265 }, { "entropy": 2.4205283522605896, "epoch": 0.96289592760181, "grad_norm": 0.5441373586654663, "learning_rate": 0.000491107624420941, "loss": 0.1323, "mean_token_accuracy": 0.9541790336370468, "num_tokens": 2354242.0, "step": 266 }, { "entropy": 2.3817258477211, "epoch": 0.9665158371040724, "grad_norm": 0.5946238040924072, "learning_rate": 0.0004909248955692111, "loss": 0.1708, "mean_token_accuracy": 0.947738841176033, "num_tokens": 2363183.0, "step": 267 }, { "entropy": 2.5073485374450684, "epoch": 0.9701357466063348, "grad_norm": 0.6979324817657471, "learning_rate": 0.0004907413695858812, "loss": 0.2099, "mean_token_accuracy": 0.9423733651638031, "num_tokens": 2371885.0, "step": 268 }, { "entropy": 2.5705007910728455, "epoch": 0.9737556561085973, "grad_norm": 0.8203943967819214, "learning_rate": 0.0004905570471680057, "loss": 0.217, "mean_token_accuracy": 0.9511639326810837, "num_tokens": 2380316.0, "step": 269 }, { "entropy": 2.2677993774414062, "epoch": 0.9773755656108597, "grad_norm": 0.5840432047843933, "learning_rate": 0.0004903719290156649, "loss": 0.2364, "mean_token_accuracy": 0.9407180696725845, "num_tokens": 2389723.0, "step": 270 }, { "entropy": 2.477886915206909, "epoch": 0.9809954751131221, "grad_norm": 0.818929135799408, "learning_rate": 0.0004901860158319612, "loss": 0.1707, "mean_token_accuracy": 0.9579566866159439, "num_tokens": 2398388.0, "step": 271 }, { "entropy": 2.549662232398987, "epoch": 0.9846153846153847, "grad_norm": 0.7804781198501587, "learning_rate": 0.0004899993083230166, "loss": 0.2944, "mean_token_accuracy": 0.9381812512874603, "num_tokens": 2406929.0, "step": 272 }, { "entropy": 2.4465304017066956, "epoch": 0.9882352941176471, "grad_norm": 0.5218799114227295, "learning_rate": 0.0004898118071979699, "loss": 0.1661, "mean_token_accuracy": 0.9500218778848648, "num_tokens": 2415631.0, "step": 273 }, { "entropy": 2.5852283239364624, "epoch": 0.9918552036199095, "grad_norm": 0.591163158416748, "learning_rate": 0.0004896235131689743, "loss": 0.2005, "mean_token_accuracy": 0.9455285370349884, "num_tokens": 2424091.0, "step": 274 }, { "entropy": 2.478701651096344, "epoch": 0.995475113122172, "grad_norm": 1.0615383386611938, "learning_rate": 0.0004894344269511945, "loss": 0.2864, "mean_token_accuracy": 0.9306265562772751, "num_tokens": 2432705.0, "step": 275 }, { "entropy": 2.600062847137451, "epoch": 0.9990950226244344, "grad_norm": 0.7011683583259583, "learning_rate": 0.0004892445492628043, "loss": 0.1664, "mean_token_accuracy": 0.9547821134328842, "num_tokens": 2440992.0, "step": 276 }, { "entropy": 2.3411240577697754, "epoch": 1.0, "grad_norm": 0.4944029450416565, "learning_rate": 0.000489053880824983, "loss": 0.022, "mean_token_accuracy": 0.9929078221321106, "num_tokens": 2441725.0, "step": 277 }, { "epoch": 1.0, "eval_entropy": 2.5467925265552553, "eval_loss": 0.21274714171886444, "eval_mean_token_accuracy": 0.9444630068492114, "eval_num_tokens": 2441725.0, "eval_runtime": 116.0434, "eval_samples_per_second": 3.18, "eval_steps_per_second": 1.06, "step": 277 }, { "entropy": 2.609170138835907, "epoch": 1.0036199095022624, "grad_norm": 1.0785081386566162, "learning_rate": 0.0004888624223619136, "loss": 0.3167, "mean_token_accuracy": 0.9296800643205643, "num_tokens": 2450193.0, "step": 278 }, { "entropy": 2.497025430202484, "epoch": 1.0072398190045249, "grad_norm": 0.5221985578536987, "learning_rate": 0.0004886701746007801, "loss": 0.0854, "mean_token_accuracy": 0.9753399342298508, "num_tokens": 2459309.0, "step": 279 }, { "entropy": 2.5487362146377563, "epoch": 1.0108597285067873, "grad_norm": 0.5161958336830139, "learning_rate": 0.0004884771382717638, "loss": 0.0819, "mean_token_accuracy": 0.9748431146144867, "num_tokens": 2467844.0, "step": 280 }, { "entropy": 2.5276209115982056, "epoch": 1.0144796380090497, "grad_norm": 0.5731730461120605, "learning_rate": 0.0004882833141080412, "loss": 0.1541, "mean_token_accuracy": 0.9567564427852631, "num_tokens": 2476894.0, "step": 281 }, { "entropy": 2.4442760348320007, "epoch": 1.0180995475113122, "grad_norm": 0.7120366096496582, "learning_rate": 0.0004880887028457813, "loss": 0.1945, "mean_token_accuracy": 0.9465379565954208, "num_tokens": 2485971.0, "step": 282 }, { "entropy": 2.4069360494613647, "epoch": 1.0217194570135746, "grad_norm": 0.7468647360801697, "learning_rate": 0.00048789330522414244, "loss": 0.2345, "mean_token_accuracy": 0.9446765780448914, "num_tokens": 2495043.0, "step": 283 }, { "entropy": 2.468382716178894, "epoch": 1.025339366515837, "grad_norm": 0.666231632232666, "learning_rate": 0.0004876971219852697, "loss": 0.1779, "mean_token_accuracy": 0.9534575343132019, "num_tokens": 2503672.0, "step": 284 }, { "entropy": 2.4362316727638245, "epoch": 1.0289592760180994, "grad_norm": 0.8445858955383301, "learning_rate": 0.000487500153874292, "loss": 0.1698, "mean_token_accuracy": 0.953661322593689, "num_tokens": 2512322.0, "step": 285 }, { "entropy": 2.364333391189575, "epoch": 1.032579185520362, "grad_norm": 0.4805246591567993, "learning_rate": 0.0004873024016393193, "loss": 0.0778, "mean_token_accuracy": 0.9824571758508682, "num_tokens": 2520791.0, "step": 286 }, { "entropy": 2.223461151123047, "epoch": 1.0361990950226245, "grad_norm": 0.648465096950531, "learning_rate": 0.0004871038660314399, "loss": 0.2593, "mean_token_accuracy": 0.9419913589954376, "num_tokens": 2530082.0, "step": 287 }, { "entropy": 2.3313387036323547, "epoch": 1.039819004524887, "grad_norm": 0.6912294626235962, "learning_rate": 0.00048690454780471725, "loss": 0.1354, "mean_token_accuracy": 0.9561934620141983, "num_tokens": 2538728.0, "step": 288 }, { "entropy": 2.191806375980377, "epoch": 1.0434389140271494, "grad_norm": 0.8620694279670715, "learning_rate": 0.0004867044477161874, "loss": 0.1103, "mean_token_accuracy": 0.968692272901535, "num_tokens": 2547219.0, "step": 289 }, { "entropy": 2.167125165462494, "epoch": 1.0470588235294118, "grad_norm": 0.6192149519920349, "learning_rate": 0.0004865035665258559, "loss": 0.1288, "mean_token_accuracy": 0.9643534421920776, "num_tokens": 2555940.0, "step": 290 }, { "entropy": 2.2750985622406006, "epoch": 1.0506787330316743, "grad_norm": 1.7459602355957031, "learning_rate": 0.0004863019049966953, "loss": 0.393, "mean_token_accuracy": 0.9146681725978851, "num_tokens": 2564362.0, "step": 291 }, { "entropy": 2.236129105091095, "epoch": 1.0542986425339367, "grad_norm": 0.6311184167861938, "learning_rate": 0.0004860994638946416, "loss": 0.1536, "mean_token_accuracy": 0.9636097103357315, "num_tokens": 2573316.0, "step": 292 }, { "entropy": 2.2642418146133423, "epoch": 1.0579185520361991, "grad_norm": 0.6023411154747009, "learning_rate": 0.000485896243988592, "loss": 0.191, "mean_token_accuracy": 0.9476015418767929, "num_tokens": 2581835.0, "step": 293 }, { "entropy": 2.3589024543762207, "epoch": 1.0615384615384615, "grad_norm": 0.48049232363700867, "learning_rate": 0.0004856922460504016, "loss": 0.1017, "mean_token_accuracy": 0.9713075459003448, "num_tokens": 2590317.0, "step": 294 }, { "entropy": 2.4141315817832947, "epoch": 1.065158371040724, "grad_norm": 0.8456616997718811, "learning_rate": 0.0004854874708548806, "loss": 0.1422, "mean_token_accuracy": 0.9622762501239777, "num_tokens": 2598538.0, "step": 295 }, { "entropy": 2.069903999567032, "epoch": 1.0687782805429864, "grad_norm": 0.7641116380691528, "learning_rate": 0.0004852819191797912, "loss": 0.2185, "mean_token_accuracy": 0.9464851468801498, "num_tokens": 2608219.0, "step": 296 }, { "entropy": 2.163217008113861, "epoch": 1.0723981900452488, "grad_norm": 0.546085000038147, "learning_rate": 0.0004850755918058449, "loss": 0.1035, "mean_token_accuracy": 0.9708487540483475, "num_tokens": 2617261.0, "step": 297 }, { "entropy": 2.2678662836551666, "epoch": 1.0760180995475113, "grad_norm": 0.8699386119842529, "learning_rate": 0.0004848684895166994, "loss": 0.2384, "mean_token_accuracy": 0.9486480504274368, "num_tokens": 2626144.0, "step": 298 }, { "entropy": 2.13065105676651, "epoch": 1.0796380090497737, "grad_norm": 0.44323107600212097, "learning_rate": 0.00048466061309895554, "loss": 0.0818, "mean_token_accuracy": 0.9722468554973602, "num_tokens": 2635626.0, "step": 299 }, { "entropy": 2.184772551059723, "epoch": 1.0832579185520361, "grad_norm": 0.7928256988525391, "learning_rate": 0.0004844519633421545, "loss": 0.2378, "mean_token_accuracy": 0.9477885961532593, "num_tokens": 2644674.0, "step": 300 }, { "entropy": 2.1669145822525024, "epoch": 1.0868778280542986, "grad_norm": 0.5570158362388611, "learning_rate": 0.00048424254103877456, "loss": 0.1434, "mean_token_accuracy": 0.9587411731481552, "num_tokens": 2653658.0, "step": 301 }, { "entropy": 2.3057579398155212, "epoch": 1.090497737556561, "grad_norm": 0.9084392189979553, "learning_rate": 0.00048403234698422837, "loss": 0.3831, "mean_token_accuracy": 0.8896283358335495, "num_tokens": 2662350.0, "step": 302 }, { "entropy": 2.1741657853126526, "epoch": 1.0941176470588236, "grad_norm": 0.6791238784790039, "learning_rate": 0.0004838213819768597, "loss": 0.1648, "mean_token_accuracy": 0.9576362520456314, "num_tokens": 2671450.0, "step": 303 }, { "entropy": 2.089864045381546, "epoch": 1.097737556561086, "grad_norm": 0.5696312189102173, "learning_rate": 0.0004836096468179406, "loss": 0.1269, "mean_token_accuracy": 0.9658148884773254, "num_tokens": 2680581.0, "step": 304 }, { "entropy": 2.2657605409622192, "epoch": 1.1013574660633485, "grad_norm": 1.605503797531128, "learning_rate": 0.0004833971423116682, "loss": 0.1027, "mean_token_accuracy": 0.9762597978115082, "num_tokens": 2689001.0, "step": 305 }, { "entropy": 2.079287111759186, "epoch": 1.104977375565611, "grad_norm": 0.5804780721664429, "learning_rate": 0.00048318386926516157, "loss": 0.1137, "mean_token_accuracy": 0.9633719325065613, "num_tokens": 2698050.0, "step": 306 }, { "entropy": 2.201345145702362, "epoch": 1.1085972850678734, "grad_norm": 0.8606241941452026, "learning_rate": 0.000482969828488459, "loss": 0.2124, "mean_token_accuracy": 0.9472681730985641, "num_tokens": 2706704.0, "step": 307 }, { "entropy": 2.095236599445343, "epoch": 1.1122171945701358, "grad_norm": 0.7078782320022583, "learning_rate": 0.0004827550207945147, "loss": 0.1957, "mean_token_accuracy": 0.9564679116010666, "num_tokens": 2715745.0, "step": 308 }, { "entropy": 2.186302363872528, "epoch": 1.1158371040723982, "grad_norm": 0.7166503667831421, "learning_rate": 0.0004825394469991956, "loss": 0.1539, "mean_token_accuracy": 0.9662427455186844, "num_tokens": 2724296.0, "step": 309 }, { "entropy": 2.052559405565262, "epoch": 1.1194570135746607, "grad_norm": 0.6510501503944397, "learning_rate": 0.00048232310792127846, "loss": 0.1831, "mean_token_accuracy": 0.9533994495868683, "num_tokens": 2733482.0, "step": 310 }, { "entropy": 2.093154102563858, "epoch": 1.123076923076923, "grad_norm": 0.711121678352356, "learning_rate": 0.0004821060043824466, "loss": 0.2315, "mean_token_accuracy": 0.9381555914878845, "num_tokens": 2742912.0, "step": 311 }, { "entropy": 2.188497006893158, "epoch": 1.1266968325791855, "grad_norm": 0.6782490015029907, "learning_rate": 0.00048188813720728707, "loss": 0.2, "mean_token_accuracy": 0.9501812607049942, "num_tokens": 2751808.0, "step": 312 }, { "entropy": 2.0495824217796326, "epoch": 1.130316742081448, "grad_norm": 0.7644634246826172, "learning_rate": 0.00048166950722328697, "loss": 0.2152, "mean_token_accuracy": 0.9440928995609283, "num_tokens": 2761066.0, "step": 313 }, { "entropy": 2.1707025468349457, "epoch": 1.1339366515837104, "grad_norm": 0.655131459236145, "learning_rate": 0.00048145011526083106, "loss": 0.1637, "mean_token_accuracy": 0.9500558227300644, "num_tokens": 2769870.0, "step": 314 }, { "entropy": 2.1047372221946716, "epoch": 1.1375565610859728, "grad_norm": 0.5353516936302185, "learning_rate": 0.0004812299621531979, "loss": 0.1705, "mean_token_accuracy": 0.9455999433994293, "num_tokens": 2779383.0, "step": 315 }, { "entropy": 2.1921610236167908, "epoch": 1.1411764705882352, "grad_norm": 0.8998016119003296, "learning_rate": 0.00048100904873655696, "loss": 0.3918, "mean_token_accuracy": 0.9382697492837906, "num_tokens": 2788386.0, "step": 316 }, { "entropy": 2.0850723683834076, "epoch": 1.1447963800904977, "grad_norm": 0.867432713508606, "learning_rate": 0.0004807873758499656, "loss": 0.2196, "mean_token_accuracy": 0.9498324394226074, "num_tokens": 2797496.0, "step": 317 }, { "entropy": 2.1980925798416138, "epoch": 1.14841628959276, "grad_norm": 0.6076980233192444, "learning_rate": 0.00048056494433536577, "loss": 0.1086, "mean_token_accuracy": 0.9642161130905151, "num_tokens": 2805836.0, "step": 318 }, { "entropy": 2.15611070394516, "epoch": 1.1520361990950225, "grad_norm": 0.6276211738586426, "learning_rate": 0.0004803417550375806, "loss": 0.1463, "mean_token_accuracy": 0.9622830748558044, "num_tokens": 2814404.0, "step": 319 }, { "entropy": 2.0017230808734894, "epoch": 1.155656108597285, "grad_norm": 0.5840948820114136, "learning_rate": 0.0004801178088043115, "loss": 0.1869, "mean_token_accuracy": 0.9506777077913284, "num_tokens": 2823786.0, "step": 320 }, { "entropy": 2.1539418697357178, "epoch": 1.1592760180995474, "grad_norm": 1.074331283569336, "learning_rate": 0.0004798931064861349, "loss": 0.2797, "mean_token_accuracy": 0.9271649420261383, "num_tokens": 2832374.0, "step": 321 }, { "entropy": 1.930726408958435, "epoch": 1.16289592760181, "grad_norm": 0.5121958255767822, "learning_rate": 0.0004796676489364988, "loss": 0.1579, "mean_token_accuracy": 0.9582571685314178, "num_tokens": 2841561.0, "step": 322 }, { "entropy": 2.0205810368061066, "epoch": 1.1665158371040725, "grad_norm": 0.6360969543457031, "learning_rate": 0.00047944143701171966, "loss": 0.1582, "mean_token_accuracy": 0.9620308429002762, "num_tokens": 2850171.0, "step": 323 }, { "entropy": 1.9655758142471313, "epoch": 1.170135746606335, "grad_norm": 0.6647385358810425, "learning_rate": 0.0004792144715709792, "loss": 0.1594, "mean_token_accuracy": 0.954497441649437, "num_tokens": 2858905.0, "step": 324 }, { "entropy": 1.9725223183631897, "epoch": 1.1737556561085973, "grad_norm": 0.6429229974746704, "learning_rate": 0.0004789867534763211, "loss": 0.1407, "mean_token_accuracy": 0.9645214527845383, "num_tokens": 2867533.0, "step": 325 }, { "entropy": 1.9473685026168823, "epoch": 1.1773755656108598, "grad_norm": 0.811651349067688, "learning_rate": 0.0004787582835926477, "loss": 0.1608, "mean_token_accuracy": 0.9479968994855881, "num_tokens": 2876286.0, "step": 326 }, { "entropy": 1.8863109350204468, "epoch": 1.1809954751131222, "grad_norm": 0.5587059855461121, "learning_rate": 0.00047852906278771686, "loss": 0.131, "mean_token_accuracy": 0.9684520065784454, "num_tokens": 2885667.0, "step": 327 }, { "entropy": 1.8288891315460205, "epoch": 1.1846153846153846, "grad_norm": 0.8450536131858826, "learning_rate": 0.0004782990919321383, "loss": 0.2224, "mean_token_accuracy": 0.9377491921186447, "num_tokens": 2894765.0, "step": 328 }, { "entropy": 1.9347718358039856, "epoch": 1.188235294117647, "grad_norm": 0.7665867209434509, "learning_rate": 0.0004780683718993705, "loss": 0.167, "mean_token_accuracy": 0.9583602845668793, "num_tokens": 2903551.0, "step": 329 }, { "entropy": 1.9097798764705658, "epoch": 1.1918552036199095, "grad_norm": 0.7705667018890381, "learning_rate": 0.00047783690356571784, "loss": 0.2115, "mean_token_accuracy": 0.9526428133249283, "num_tokens": 2912197.0, "step": 330 }, { "entropy": 1.9174850285053253, "epoch": 1.195475113122172, "grad_norm": 0.5695499181747437, "learning_rate": 0.00047760468781032634, "loss": 0.1033, "mean_token_accuracy": 0.969958484172821, "num_tokens": 2920579.0, "step": 331 }, { "entropy": 1.8578442931175232, "epoch": 1.1990950226244343, "grad_norm": 0.7843735814094543, "learning_rate": 0.000477371725515181, "loss": 0.1664, "mean_token_accuracy": 0.9545005410909653, "num_tokens": 2929352.0, "step": 332 }, { "entropy": 1.8509328961372375, "epoch": 1.2027149321266968, "grad_norm": 0.5951048135757446, "learning_rate": 0.0004771380175651026, "loss": 0.1566, "mean_token_accuracy": 0.9551403075456619, "num_tokens": 2938387.0, "step": 333 }, { "entropy": 1.8236390948295593, "epoch": 1.2063348416289592, "grad_norm": 0.4988223910331726, "learning_rate": 0.0004769035648477434, "loss": 0.1242, "mean_token_accuracy": 0.966319814324379, "num_tokens": 2947741.0, "step": 334 }, { "entropy": 1.9594822525978088, "epoch": 1.2099547511312216, "grad_norm": 0.7550755143165588, "learning_rate": 0.00047666836825358477, "loss": 0.1591, "mean_token_accuracy": 0.9666347652673721, "num_tokens": 2956313.0, "step": 335 }, { "entropy": 1.9148444533348083, "epoch": 1.213574660633484, "grad_norm": 0.5889077186584473, "learning_rate": 0.00047643242867593345, "loss": 0.1343, "mean_token_accuracy": 0.9611433297395706, "num_tokens": 2964928.0, "step": 336 }, { "entropy": 1.8126957714557648, "epoch": 1.2171945701357467, "grad_norm": 0.5447750091552734, "learning_rate": 0.0004761957470109179, "loss": 0.1659, "mean_token_accuracy": 0.9552300125360489, "num_tokens": 2974160.0, "step": 337 }, { "entropy": 1.7981431782245636, "epoch": 1.2208144796380092, "grad_norm": 0.5400761365890503, "learning_rate": 0.0004759583241574854, "loss": 0.1339, "mean_token_accuracy": 0.9620136916637421, "num_tokens": 2982900.0, "step": 338 }, { "entropy": 1.8613979518413544, "epoch": 1.2244343891402716, "grad_norm": 0.7452914714813232, "learning_rate": 0.0004757201610173981, "loss": 0.4, "mean_token_accuracy": 0.9068266004323959, "num_tokens": 2991783.0, "step": 339 }, { "entropy": 1.8654026687145233, "epoch": 1.228054298642534, "grad_norm": 1.7142685651779175, "learning_rate": 0.00047548125849523, "loss": 0.3168, "mean_token_accuracy": 0.9308896362781525, "num_tokens": 3000530.0, "step": 340 }, { "entropy": 1.7702704071998596, "epoch": 1.2316742081447964, "grad_norm": 0.6687431931495667, "learning_rate": 0.0004752416174983633, "loss": 0.1697, "mean_token_accuracy": 0.9530515670776367, "num_tokens": 3009355.0, "step": 341 }, { "entropy": 1.735857516527176, "epoch": 1.2352941176470589, "grad_norm": 0.6127599477767944, "learning_rate": 0.00047500123893698507, "loss": 0.1706, "mean_token_accuracy": 0.9593266248703003, "num_tokens": 3018518.0, "step": 342 }, { "entropy": 1.7076368927955627, "epoch": 1.2389140271493213, "grad_norm": 0.6973987817764282, "learning_rate": 0.0004747601237240836, "loss": 0.1615, "mean_token_accuracy": 0.9539438933134079, "num_tokens": 3027752.0, "step": 343 }, { "entropy": 1.7353227138519287, "epoch": 1.2425339366515837, "grad_norm": 0.8406392335891724, "learning_rate": 0.00047451827277544546, "loss": 0.2063, "mean_token_accuracy": 0.9488435834646225, "num_tokens": 3036383.0, "step": 344 }, { "entropy": 1.6597246527671814, "epoch": 1.2461538461538462, "grad_norm": 0.5971431732177734, "learning_rate": 0.00047427568700965107, "loss": 0.1013, "mean_token_accuracy": 0.9721864312887192, "num_tokens": 3045375.0, "step": 345 }, { "entropy": 1.7100033462047577, "epoch": 1.2497737556561086, "grad_norm": 0.5883470773696899, "learning_rate": 0.00047403236734807225, "loss": 0.1164, "mean_token_accuracy": 0.9664830714464188, "num_tokens": 3054084.0, "step": 346 }, { "entropy": 1.7402609288692474, "epoch": 1.253393665158371, "grad_norm": 0.7355862855911255, "learning_rate": 0.00047378831471486815, "loss": 0.2007, "mean_token_accuracy": 0.9560511559247971, "num_tokens": 3062727.0, "step": 347 }, { "entropy": 1.79518261551857, "epoch": 1.2570135746606335, "grad_norm": 0.6006518006324768, "learning_rate": 0.00047354353003698163, "loss": 0.1085, "mean_token_accuracy": 0.9598321914672852, "num_tokens": 3071178.0, "step": 348 }, { "entropy": 1.7328391373157501, "epoch": 1.260633484162896, "grad_norm": 0.560342013835907, "learning_rate": 0.0004732980142441362, "loss": 0.1593, "mean_token_accuracy": 0.9579409211874008, "num_tokens": 3079927.0, "step": 349 }, { "entropy": 1.7356511652469635, "epoch": 1.2642533936651583, "grad_norm": 0.9149975776672363, "learning_rate": 0.00047305176826883206, "loss": 0.4064, "mean_token_accuracy": 0.9265118837356567, "num_tokens": 3089314.0, "step": 350 }, { "entropy": 1.8573569357395172, "epoch": 1.2678733031674208, "grad_norm": 0.8300670981407166, "learning_rate": 0.0004728047930463428, "loss": 0.195, "mean_token_accuracy": 0.9453776180744171, "num_tokens": 3097702.0, "step": 351 }, { "entropy": 1.7906217575073242, "epoch": 1.2714932126696832, "grad_norm": 0.5668906569480896, "learning_rate": 0.0004725570895147118, "loss": 0.1572, "mean_token_accuracy": 0.962067037820816, "num_tokens": 3106379.0, "step": 352 }, { "entropy": 1.6957395374774933, "epoch": 1.2751131221719456, "grad_norm": 0.4048328399658203, "learning_rate": 0.0004723086586147487, "loss": 0.0944, "mean_token_accuracy": 0.9716819673776627, "num_tokens": 3115622.0, "step": 353 }, { "entropy": 1.8158144056797028, "epoch": 1.278733031674208, "grad_norm": 0.6396092772483826, "learning_rate": 0.00047205950129002564, "loss": 0.1011, "mean_token_accuracy": 0.9698463827371597, "num_tokens": 3124016.0, "step": 354 }, { "entropy": 1.730194479227066, "epoch": 1.2823529411764705, "grad_norm": 0.662876307964325, "learning_rate": 0.000471809618486874, "loss": 0.1641, "mean_token_accuracy": 0.9520179778337479, "num_tokens": 3132712.0, "step": 355 }, { "entropy": 1.6776110529899597, "epoch": 1.285972850678733, "grad_norm": 0.868507981300354, "learning_rate": 0.0004715590111543804, "loss": 0.3374, "mean_token_accuracy": 0.9303739666938782, "num_tokens": 3142103.0, "step": 356 }, { "entropy": 1.6501678824424744, "epoch": 1.2895927601809956, "grad_norm": 0.5433686971664429, "learning_rate": 0.0004713076802443834, "loss": 0.1237, "mean_token_accuracy": 0.9653612226247787, "num_tokens": 3151192.0, "step": 357 }, { "entropy": 1.6524465382099152, "epoch": 1.293212669683258, "grad_norm": 0.6145523190498352, "learning_rate": 0.00047105562671147, "loss": 0.1204, "mean_token_accuracy": 0.9690534323453903, "num_tokens": 3159839.0, "step": 358 }, { "entropy": 1.5339214205741882, "epoch": 1.2968325791855204, "grad_norm": 0.500477135181427, "learning_rate": 0.00047080285151297144, "loss": 0.1295, "mean_token_accuracy": 0.9571033865213394, "num_tokens": 3169047.0, "step": 359 }, { "entropy": 1.6765435338020325, "epoch": 1.3004524886877828, "grad_norm": 0.6697553396224976, "learning_rate": 0.00047054935560896026, "loss": 0.135, "mean_token_accuracy": 0.9672541171312332, "num_tokens": 3177062.0, "step": 360 }, { "entropy": 1.5932062566280365, "epoch": 1.3040723981900453, "grad_norm": 0.706957221031189, "learning_rate": 0.0004702951399622462, "loss": 0.1229, "mean_token_accuracy": 0.9634416699409485, "num_tokens": 3185829.0, "step": 361 }, { "entropy": 1.5623145997524261, "epoch": 1.3076923076923077, "grad_norm": 0.6199461221694946, "learning_rate": 0.00047004020553837275, "loss": 0.1449, "mean_token_accuracy": 0.9620065689086914, "num_tokens": 3194426.0, "step": 362 }, { "entropy": 1.5226828753948212, "epoch": 1.3113122171945701, "grad_norm": 0.8962509036064148, "learning_rate": 0.0004697845533056132, "loss": 0.2207, "mean_token_accuracy": 0.9403344839811325, "num_tokens": 3203655.0, "step": 363 }, { "entropy": 1.5395641326904297, "epoch": 1.3149321266968326, "grad_norm": 0.5993619561195374, "learning_rate": 0.00046952818423496727, "loss": 0.1486, "mean_token_accuracy": 0.9614185988903046, "num_tokens": 3212069.0, "step": 364 }, { "entropy": 1.5738630294799805, "epoch": 1.318552036199095, "grad_norm": 0.7393983602523804, "learning_rate": 0.00046927109930015756, "loss": 0.1812, "mean_token_accuracy": 0.9535021334886551, "num_tokens": 3220482.0, "step": 365 }, { "entropy": 1.5462632775306702, "epoch": 1.3221719457013574, "grad_norm": 0.7453555464744568, "learning_rate": 0.0004690132994776253, "loss": 0.164, "mean_token_accuracy": 0.9585814625024796, "num_tokens": 3229505.0, "step": 366 }, { "entropy": 1.5241961777210236, "epoch": 1.3257918552036199, "grad_norm": 0.7553415298461914, "learning_rate": 0.00046875478574652713, "loss": 0.1445, "mean_token_accuracy": 0.9682841598987579, "num_tokens": 3238326.0, "step": 367 }, { "entropy": 1.5344699025154114, "epoch": 1.3294117647058823, "grad_norm": 0.8565949201583862, "learning_rate": 0.0004684955590887311, "loss": 0.2521, "mean_token_accuracy": 0.920401468873024, "num_tokens": 3247482.0, "step": 368 }, { "entropy": 1.5109277665615082, "epoch": 1.3330316742081447, "grad_norm": 0.5170580148696899, "learning_rate": 0.00046823562048881295, "loss": 0.1393, "mean_token_accuracy": 0.9584086239337921, "num_tokens": 3256464.0, "step": 369 }, { "entropy": 1.4666939079761505, "epoch": 1.3366515837104074, "grad_norm": 0.6995373368263245, "learning_rate": 0.0004679749709340529, "loss": 0.1726, "mean_token_accuracy": 0.9477890431880951, "num_tokens": 3265853.0, "step": 370 }, { "entropy": 1.4208430051803589, "epoch": 1.3402714932126698, "grad_norm": 1.1363991498947144, "learning_rate": 0.000467713611414431, "loss": 0.196, "mean_token_accuracy": 0.9495431333780289, "num_tokens": 3275367.0, "step": 371 }, { "entropy": 1.5009459853172302, "epoch": 1.3438914027149322, "grad_norm": 0.7883325219154358, "learning_rate": 0.00046745154292262414, "loss": 0.2526, "mean_token_accuracy": 0.9334618002176285, "num_tokens": 3284772.0, "step": 372 }, { "entropy": 1.5485479533672333, "epoch": 1.3475113122171947, "grad_norm": 0.6516429781913757, "learning_rate": 0.00046718876645400156, "loss": 0.2057, "mean_token_accuracy": 0.9546459317207336, "num_tokens": 3293493.0, "step": 373 }, { "entropy": 1.6237249970436096, "epoch": 1.351131221719457, "grad_norm": 0.8916263580322266, "learning_rate": 0.00046692528300662213, "loss": 0.2123, "mean_token_accuracy": 0.9456845372915268, "num_tokens": 3302063.0, "step": 374 }, { "entropy": 1.561572015285492, "epoch": 1.3547511312217195, "grad_norm": 0.7527791857719421, "learning_rate": 0.00046666109358122935, "loss": 0.2113, "mean_token_accuracy": 0.9537477940320969, "num_tokens": 3311037.0, "step": 375 }, { "entropy": 1.5594256818294525, "epoch": 1.358371040723982, "grad_norm": 1.25638747215271, "learning_rate": 0.0004663961991812485, "loss": 0.1629, "mean_token_accuracy": 0.9508458077907562, "num_tokens": 3319635.0, "step": 376 }, { "entropy": 1.6909976303577423, "epoch": 1.3619909502262444, "grad_norm": 0.7627813220024109, "learning_rate": 0.00046613060081278194, "loss": 0.2303, "mean_token_accuracy": 0.9425801336765289, "num_tokens": 3328043.0, "step": 377 }, { "entropy": 1.6074829697608948, "epoch": 1.3656108597285068, "grad_norm": 0.6584346294403076, "learning_rate": 0.00046586429948460646, "loss": 0.1815, "mean_token_accuracy": 0.9536214470863342, "num_tokens": 3337143.0, "step": 378 }, { "entropy": 1.7382183969020844, "epoch": 1.3692307692307693, "grad_norm": 1.37154221534729, "learning_rate": 0.0004655972962081684, "loss": 0.1849, "mean_token_accuracy": 0.948440819978714, "num_tokens": 3346033.0, "step": 379 }, { "entropy": 1.7148900926113129, "epoch": 1.3728506787330317, "grad_norm": 0.9487980604171753, "learning_rate": 0.00046532959199758, "loss": 0.2521, "mean_token_accuracy": 0.9344504028558731, "num_tokens": 3354849.0, "step": 380 }, { "entropy": 1.7164019346237183, "epoch": 1.3764705882352941, "grad_norm": 0.5609025359153748, "learning_rate": 0.00046506118786961614, "loss": 0.1425, "mean_token_accuracy": 0.9571309834718704, "num_tokens": 3363674.0, "step": 381 }, { "entropy": 1.894619107246399, "epoch": 1.3800904977375565, "grad_norm": 0.9811336994171143, "learning_rate": 0.00046479208484370997, "loss": 0.2522, "mean_token_accuracy": 0.9424156546592712, "num_tokens": 3372325.0, "step": 382 }, { "entropy": 1.78870290517807, "epoch": 1.383710407239819, "grad_norm": 0.5707085132598877, "learning_rate": 0.00046452228394194893, "loss": 0.1354, "mean_token_accuracy": 0.9613165706396103, "num_tokens": 3381270.0, "step": 383 }, { "entropy": 1.803922712802887, "epoch": 1.3873303167420814, "grad_norm": 0.5655364394187927, "learning_rate": 0.0004642517861890713, "loss": 0.0818, "mean_token_accuracy": 0.9776160269975662, "num_tokens": 3390363.0, "step": 384 }, { "entropy": 1.8172507882118225, "epoch": 1.3909502262443438, "grad_norm": 0.6950513124465942, "learning_rate": 0.00046398059261246205, "loss": 0.1145, "mean_token_accuracy": 0.963288351893425, "num_tokens": 3399176.0, "step": 385 }, { "entropy": 1.9182518422603607, "epoch": 1.3945701357466063, "grad_norm": 0.5900619029998779, "learning_rate": 0.0004637087042421489, "loss": 0.108, "mean_token_accuracy": 0.9723307639360428, "num_tokens": 3407978.0, "step": 386 }, { "entropy": 1.8558574616909027, "epoch": 1.3981900452488687, "grad_norm": 0.6279832124710083, "learning_rate": 0.00046343612211079843, "loss": 0.1471, "mean_token_accuracy": 0.9603912532329559, "num_tokens": 3416856.0, "step": 387 }, { "entropy": 1.8146779537200928, "epoch": 1.4018099547511311, "grad_norm": 0.6171274781227112, "learning_rate": 0.0004631628472537125, "loss": 0.1872, "mean_token_accuracy": 0.9447146654129028, "num_tokens": 3426044.0, "step": 388 }, { "entropy": 1.9342225790023804, "epoch": 1.4054298642533936, "grad_norm": 0.9947887659072876, "learning_rate": 0.00046288888070882374, "loss": 0.2966, "mean_token_accuracy": 0.9279204607009888, "num_tokens": 3435154.0, "step": 389 }, { "entropy": 1.9391801953315735, "epoch": 1.409049773755656, "grad_norm": 0.7155653834342957, "learning_rate": 0.000462614223516692, "loss": 0.1847, "mean_token_accuracy": 0.9475171864032745, "num_tokens": 3444563.0, "step": 390 }, { "entropy": 2.0716978013515472, "epoch": 1.4126696832579184, "grad_norm": 0.8198989629745483, "learning_rate": 0.0004623388767205004, "loss": 0.1317, "mean_token_accuracy": 0.9608721435070038, "num_tokens": 3453410.0, "step": 391 }, { "entropy": 2.1060431599617004, "epoch": 1.416289592760181, "grad_norm": 1.025406002998352, "learning_rate": 0.00046206284136605106, "loss": 0.2146, "mean_token_accuracy": 0.9414294511079788, "num_tokens": 3461958.0, "step": 392 }, { "entropy": 2.1459922194480896, "epoch": 1.4199095022624435, "grad_norm": 0.9209627509117126, "learning_rate": 0.00046178611850176146, "loss": 0.2137, "mean_token_accuracy": 0.956874743103981, "num_tokens": 3470547.0, "step": 393 }, { "entropy": 2.0233450531959534, "epoch": 1.423529411764706, "grad_norm": 0.5777944922447205, "learning_rate": 0.00046150870917866025, "loss": 0.122, "mean_token_accuracy": 0.9672323018312454, "num_tokens": 3479618.0, "step": 394 }, { "entropy": 2.035937190055847, "epoch": 1.4271493212669684, "grad_norm": 0.7945542931556702, "learning_rate": 0.0004612306144503835, "loss": 0.2879, "mean_token_accuracy": 0.946587473154068, "num_tokens": 3488533.0, "step": 395 }, { "entropy": 2.155315637588501, "epoch": 1.4307692307692308, "grad_norm": 0.6385292410850525, "learning_rate": 0.00046095183537317035, "loss": 0.1008, "mean_token_accuracy": 0.9655124247074127, "num_tokens": 3496686.0, "step": 396 }, { "entropy": 2.186827063560486, "epoch": 1.4343891402714932, "grad_norm": 0.4759826958179474, "learning_rate": 0.0004606723730058593, "loss": 0.0768, "mean_token_accuracy": 0.9783597737550735, "num_tokens": 3504958.0, "step": 397 }, { "entropy": 1.974392294883728, "epoch": 1.4380090497737557, "grad_norm": 0.6250292062759399, "learning_rate": 0.00046039222840988406, "loss": 0.1381, "mean_token_accuracy": 0.9586146324872971, "num_tokens": 3513694.0, "step": 398 }, { "entropy": 2.045738846063614, "epoch": 1.441628959276018, "grad_norm": 0.5517769455909729, "learning_rate": 0.0004601114026492695, "loss": 0.1312, "mean_token_accuracy": 0.9682512134313583, "num_tokens": 3522395.0, "step": 399 }, { "entropy": 2.105030357837677, "epoch": 1.4452488687782805, "grad_norm": 0.6748242974281311, "learning_rate": 0.0004598298967906276, "loss": 0.1056, "mean_token_accuracy": 0.9701305478811264, "num_tokens": 3530838.0, "step": 400 }, { "entropy": 2.024325281381607, "epoch": 1.448868778280543, "grad_norm": 0.6320233941078186, "learning_rate": 0.00045954771190315344, "loss": 0.1129, "mean_token_accuracy": 0.9633017927408218, "num_tokens": 3540184.0, "step": 401 }, { "entropy": 2.1561593413352966, "epoch": 1.4524886877828054, "grad_norm": 0.7380363941192627, "learning_rate": 0.0004592648490586213, "loss": 0.1304, "mean_token_accuracy": 0.9599586874246597, "num_tokens": 3548727.0, "step": 402 }, { "entropy": 2.2986454367637634, "epoch": 1.4561085972850678, "grad_norm": 0.669114351272583, "learning_rate": 0.00045898130933138024, "loss": 0.1005, "mean_token_accuracy": 0.9724964797496796, "num_tokens": 3556780.0, "step": 403 }, { "entropy": 2.103136509656906, "epoch": 1.4597285067873302, "grad_norm": 0.6677402853965759, "learning_rate": 0.0004586970937983504, "loss": 0.1177, "mean_token_accuracy": 0.9597653448581696, "num_tokens": 3565427.0, "step": 404 }, { "entropy": 2.112696200609207, "epoch": 1.463348416289593, "grad_norm": 0.4597342014312744, "learning_rate": 0.0004584122035390185, "loss": 0.0695, "mean_token_accuracy": 0.9763098359107971, "num_tokens": 3573902.0, "step": 405 }, { "entropy": 2.0472628474235535, "epoch": 1.4669683257918553, "grad_norm": 0.7842056751251221, "learning_rate": 0.0004581266396354339, "loss": 0.1981, "mean_token_accuracy": 0.9521032422780991, "num_tokens": 3582913.0, "step": 406 }, { "entropy": 2.236558735370636, "epoch": 1.4705882352941178, "grad_norm": 0.7634767293930054, "learning_rate": 0.000457840403172205, "loss": 0.1956, "mean_token_accuracy": 0.9602932929992676, "num_tokens": 3591197.0, "step": 407 }, { "entropy": 2.182949125766754, "epoch": 1.4742081447963802, "grad_norm": 0.7084661722183228, "learning_rate": 0.00045755349523649415, "loss": 0.2463, "mean_token_accuracy": 0.9392582327127457, "num_tokens": 3600134.0, "step": 408 }, { "entropy": 2.135133147239685, "epoch": 1.4778280542986426, "grad_norm": 0.8172940015792847, "learning_rate": 0.00045726591691801433, "loss": 0.2375, "mean_token_accuracy": 0.9458330571651459, "num_tokens": 3608945.0, "step": 409 }, { "entropy": 2.157473146915436, "epoch": 1.481447963800905, "grad_norm": 0.6165594458580017, "learning_rate": 0.0004569776693090246, "loss": 0.1628, "mean_token_accuracy": 0.9586529731750488, "num_tokens": 3617790.0, "step": 410 }, { "entropy": 2.15165376663208, "epoch": 1.4850678733031675, "grad_norm": 0.6619407534599304, "learning_rate": 0.0004566887535043263, "loss": 0.1866, "mean_token_accuracy": 0.9545126557350159, "num_tokens": 3626937.0, "step": 411 }, { "entropy": 2.271161735057831, "epoch": 1.48868778280543, "grad_norm": 0.5861835479736328, "learning_rate": 0.0004563991706012582, "loss": 0.1409, "mean_token_accuracy": 0.9595955163240433, "num_tokens": 3636025.0, "step": 412 }, { "entropy": 2.277799427509308, "epoch": 1.4923076923076923, "grad_norm": 0.6464956402778625, "learning_rate": 0.00045610892169969323, "loss": 0.0792, "mean_token_accuracy": 0.9806316941976547, "num_tokens": 3644746.0, "step": 413 }, { "entropy": 2.2143171429634094, "epoch": 1.4959276018099548, "grad_norm": 0.7531687021255493, "learning_rate": 0.00045581800790203366, "loss": 0.2584, "mean_token_accuracy": 0.9225966930389404, "num_tokens": 3654064.0, "step": 414 }, { "entropy": 2.231681764125824, "epoch": 1.4995475113122172, "grad_norm": 0.6902768015861511, "learning_rate": 0.00045552643031320726, "loss": 0.232, "mean_token_accuracy": 0.9433842301368713, "num_tokens": 3663130.0, "step": 415 }, { "entropy": 2.2672717571258545, "epoch": 1.5031674208144796, "grad_norm": 0.5134314894676208, "learning_rate": 0.00045523419004066273, "loss": 0.0874, "mean_token_accuracy": 0.9708191752433777, "num_tokens": 3671981.0, "step": 416 }, { "entropy": 2.3302834033966064, "epoch": 1.506787330316742, "grad_norm": 0.885969340801239, "learning_rate": 0.0004549412881943659, "loss": 0.0723, "mean_token_accuracy": 0.9791463166475296, "num_tokens": 3680525.0, "step": 417 }, { "entropy": 2.2693899869918823, "epoch": 1.5104072398190045, "grad_norm": 0.7424856424331665, "learning_rate": 0.00045464772588679547, "loss": 0.1509, "mean_token_accuracy": 0.9600907415151596, "num_tokens": 3689430.0, "step": 418 }, { "entropy": 2.4042725563049316, "epoch": 1.514027149321267, "grad_norm": 0.8968034982681274, "learning_rate": 0.0004543535042329382, "loss": 0.1984, "mean_token_accuracy": 0.9488537162542343, "num_tokens": 3697836.0, "step": 419 }, { "entropy": 2.2518428564071655, "epoch": 1.5176470588235293, "grad_norm": 0.5963534712791443, "learning_rate": 0.0004540586243502858, "loss": 0.1214, "mean_token_accuracy": 0.9711381644010544, "num_tokens": 3706675.0, "step": 420 }, { "entropy": 2.275522291660309, "epoch": 1.5212669683257918, "grad_norm": 1.0797090530395508, "learning_rate": 0.0004537630873588293, "loss": 0.2508, "mean_token_accuracy": 0.9247037768363953, "num_tokens": 3715631.0, "step": 421 }, { "entropy": 2.249617278575897, "epoch": 1.5248868778280542, "grad_norm": 0.7636313438415527, "learning_rate": 0.000453466894381056, "loss": 0.1112, "mean_token_accuracy": 0.9681926071643829, "num_tokens": 3724579.0, "step": 422 }, { "entropy": 2.280571699142456, "epoch": 1.5285067873303166, "grad_norm": 0.9915648698806763, "learning_rate": 0.00045317004654194464, "loss": 0.3532, "mean_token_accuracy": 0.9360047876834869, "num_tokens": 3733607.0, "step": 423 }, { "entropy": 2.241512656211853, "epoch": 1.532126696832579, "grad_norm": 0.924977719783783, "learning_rate": 0.0004528725449689611, "loss": 0.1997, "mean_token_accuracy": 0.9475428760051727, "num_tokens": 3742611.0, "step": 424 }, { "entropy": 2.201731503009796, "epoch": 1.5357466063348415, "grad_norm": 0.7018861770629883, "learning_rate": 0.0004525743907920542, "loss": 0.1683, "mean_token_accuracy": 0.9465018659830093, "num_tokens": 3751737.0, "step": 425 }, { "entropy": 2.28944593667984, "epoch": 1.539366515837104, "grad_norm": 0.5893452763557434, "learning_rate": 0.00045227558514365166, "loss": 0.0969, "mean_token_accuracy": 0.9711766839027405, "num_tokens": 3761245.0, "step": 426 }, { "entropy": 2.3497202396392822, "epoch": 1.5429864253393664, "grad_norm": 0.685279130935669, "learning_rate": 0.0004519761291586551, "loss": 0.106, "mean_token_accuracy": 0.9663016647100449, "num_tokens": 3769854.0, "step": 427 }, { "entropy": 2.308362066745758, "epoch": 1.5466063348416288, "grad_norm": 0.5116177797317505, "learning_rate": 0.00045167602397443694, "loss": 0.1132, "mean_token_accuracy": 0.9700013697147369, "num_tokens": 3778996.0, "step": 428 }, { "entropy": 2.238637685775757, "epoch": 1.5502262443438914, "grad_norm": 0.8374833464622498, "learning_rate": 0.00045137527073083457, "loss": 0.2539, "mean_token_accuracy": 0.9407305717468262, "num_tokens": 3787835.0, "step": 429 }, { "entropy": 2.3406758308410645, "epoch": 1.5538461538461539, "grad_norm": 0.5140913724899292, "learning_rate": 0.0004510738705701473, "loss": 0.1113, "mean_token_accuracy": 0.9635641574859619, "num_tokens": 3796498.0, "step": 430 }, { "entropy": 2.2642539143562317, "epoch": 1.5574660633484163, "grad_norm": 0.5750702023506165, "learning_rate": 0.0004507718246371313, "loss": 0.1127, "mean_token_accuracy": 0.9660817235708237, "num_tokens": 3805464.0, "step": 431 }, { "entropy": 2.2058264315128326, "epoch": 1.5610859728506787, "grad_norm": 0.6448659300804138, "learning_rate": 0.0004504691340789955, "loss": 0.0994, "mean_token_accuracy": 0.96739861369133, "num_tokens": 3814309.0, "step": 432 }, { "entropy": 2.330399215221405, "epoch": 1.5647058823529412, "grad_norm": 0.8432528376579285, "learning_rate": 0.0004501658000453973, "loss": 0.1999, "mean_token_accuracy": 0.9510775059461594, "num_tokens": 3823126.0, "step": 433 }, { "entropy": 2.4211326837539673, "epoch": 1.5683257918552036, "grad_norm": 0.8101194500923157, "learning_rate": 0.00044986182368843806, "loss": 0.144, "mean_token_accuracy": 0.9656328558921814, "num_tokens": 3831274.0, "step": 434 }, { "entropy": 2.2594956755638123, "epoch": 1.571945701357466, "grad_norm": 0.6753663420677185, "learning_rate": 0.0004495572061626585, "loss": 0.1433, "mean_token_accuracy": 0.9572386592626572, "num_tokens": 3840206.0, "step": 435 }, { "entropy": 2.1233682930469513, "epoch": 1.5755656108597285, "grad_norm": 0.48616713285446167, "learning_rate": 0.000449251948625035, "loss": 0.0934, "mean_token_accuracy": 0.9740773588418961, "num_tokens": 3849363.0, "step": 436 }, { "entropy": 2.325556695461273, "epoch": 1.5791855203619911, "grad_norm": 0.7744045853614807, "learning_rate": 0.00044894605223497446, "loss": 0.127, "mean_token_accuracy": 0.9687052518129349, "num_tokens": 3857733.0, "step": 437 }, { "entropy": 2.266542673110962, "epoch": 1.5828054298642535, "grad_norm": 2.373530387878418, "learning_rate": 0.00044863951815431045, "loss": 0.2404, "mean_token_accuracy": 0.9437267184257507, "num_tokens": 3866374.0, "step": 438 }, { "entropy": 2.1757248640060425, "epoch": 1.586425339366516, "grad_norm": 0.5588560700416565, "learning_rate": 0.00044833234754729847, "loss": 0.142, "mean_token_accuracy": 0.9601300358772278, "num_tokens": 3875520.0, "step": 439 }, { "entropy": 2.124377518892288, "epoch": 1.5900452488687784, "grad_norm": 0.5602438449859619, "learning_rate": 0.0004480245415806116, "loss": 0.1556, "mean_token_accuracy": 0.9561446160078049, "num_tokens": 3884345.0, "step": 440 }, { "entropy": 2.1571075320243835, "epoch": 1.5936651583710408, "grad_norm": 0.472598671913147, "learning_rate": 0.0004477161014233361, "loss": 0.0848, "mean_token_accuracy": 0.9742853343486786, "num_tokens": 3893129.0, "step": 441 }, { "entropy": 2.0434057414531708, "epoch": 1.5972850678733033, "grad_norm": 0.7104448676109314, "learning_rate": 0.00044740702824696703, "loss": 0.1524, "mean_token_accuracy": 0.9542464315891266, "num_tokens": 3902120.0, "step": 442 }, { "entropy": 2.1118403673171997, "epoch": 1.6009049773755657, "grad_norm": 0.6632394194602966, "learning_rate": 0.0004470973232254037, "loss": 0.3001, "mean_token_accuracy": 0.928197592496872, "num_tokens": 3910974.0, "step": 443 }, { "entropy": 2.0292475819587708, "epoch": 1.6045248868778281, "grad_norm": 1.050956130027771, "learning_rate": 0.00044678698753494527, "loss": 0.2226, "mean_token_accuracy": 0.9448522627353668, "num_tokens": 3920005.0, "step": 444 }, { "entropy": 1.991033524274826, "epoch": 1.6081447963800906, "grad_norm": 0.670244038105011, "learning_rate": 0.00044647602235428624, "loss": 0.2158, "mean_token_accuracy": 0.9551118016242981, "num_tokens": 3929334.0, "step": 445 }, { "entropy": 2.04949289560318, "epoch": 1.611764705882353, "grad_norm": 0.6321494579315186, "learning_rate": 0.00044616442886451197, "loss": 0.1743, "mean_token_accuracy": 0.9494802355766296, "num_tokens": 3938211.0, "step": 446 }, { "entropy": 2.1101951897144318, "epoch": 1.6153846153846154, "grad_norm": 0.6970012187957764, "learning_rate": 0.0004458522082490943, "loss": 0.1228, "mean_token_accuracy": 0.9624926447868347, "num_tokens": 3946534.0, "step": 447 }, { "entropy": 1.9337081909179688, "epoch": 1.6190045248868778, "grad_norm": 0.5971657633781433, "learning_rate": 0.0004455393616938868, "loss": 0.1431, "mean_token_accuracy": 0.9635348320007324, "num_tokens": 3955694.0, "step": 448 }, { "entropy": 1.9635128676891327, "epoch": 1.6226244343891403, "grad_norm": 0.8510827422142029, "learning_rate": 0.00044522589038712074, "loss": 0.2446, "mean_token_accuracy": 0.9457641988992691, "num_tokens": 3964907.0, "step": 449 }, { "entropy": 2.0336360335350037, "epoch": 1.6262443438914027, "grad_norm": 0.5803818106651306, "learning_rate": 0.00044491179551939985, "loss": 0.0872, "mean_token_accuracy": 0.9734505414962769, "num_tokens": 3973584.0, "step": 450 }, { "entropy": 2.0668878853321075, "epoch": 1.6298642533936651, "grad_norm": 0.6990496516227722, "learning_rate": 0.0004445970782836967, "loss": 0.1138, "mean_token_accuracy": 0.9702571034431458, "num_tokens": 3982632.0, "step": 451 }, { "entropy": 2.1481760144233704, "epoch": 1.6334841628959276, "grad_norm": 0.6156729459762573, "learning_rate": 0.00044428173987534733, "loss": 0.0936, "mean_token_accuracy": 0.9739355593919754, "num_tokens": 3991147.0, "step": 452 }, { "entropy": 2.0678701996803284, "epoch": 1.63710407239819, "grad_norm": 0.5441684126853943, "learning_rate": 0.0004439657814920472, "loss": 0.123, "mean_token_accuracy": 0.9693446308374405, "num_tokens": 3999990.0, "step": 453 }, { "entropy": 1.9867055118083954, "epoch": 1.6407239819004524, "grad_norm": 0.9218093156814575, "learning_rate": 0.00044364920433384656, "loss": 0.1997, "mean_token_accuracy": 0.9564195573329926, "num_tokens": 4009097.0, "step": 454 }, { "entropy": 2.145586997270584, "epoch": 1.6443438914027149, "grad_norm": 0.77643883228302, "learning_rate": 0.0004433320096031458, "loss": 0.1491, "mean_token_accuracy": 0.9602408111095428, "num_tokens": 4018059.0, "step": 455 }, { "entropy": 2.071108251810074, "epoch": 1.6479638009049773, "grad_norm": 0.5267088413238525, "learning_rate": 0.0004430141985046909, "loss": 0.0875, "mean_token_accuracy": 0.9764399826526642, "num_tokens": 4027089.0, "step": 456 }, { "entropy": 2.1659318804740906, "epoch": 1.6515837104072397, "grad_norm": 1.0642318725585938, "learning_rate": 0.000442695772245569, "loss": 0.2623, "mean_token_accuracy": 0.9307756721973419, "num_tokens": 4035719.0, "step": 457 }, { "entropy": 2.0232724249362946, "epoch": 1.6552036199095022, "grad_norm": 0.6213289499282837, "learning_rate": 0.0004423767320352035, "loss": 0.1597, "mean_token_accuracy": 0.9599647223949432, "num_tokens": 4045088.0, "step": 458 }, { "entropy": 2.047410547733307, "epoch": 1.6588235294117646, "grad_norm": 0.6346105933189392, "learning_rate": 0.0004420570790853498, "loss": 0.1422, "mean_token_accuracy": 0.9649711549282074, "num_tokens": 4054262.0, "step": 459 }, { "entropy": 2.0923012793064117, "epoch": 1.662443438914027, "grad_norm": 0.46477749943733215, "learning_rate": 0.0004417368146100907, "loss": 0.079, "mean_token_accuracy": 0.9777993708848953, "num_tokens": 4063107.0, "step": 460 }, { "entropy": 2.168913394212723, "epoch": 1.6660633484162894, "grad_norm": 0.5164734721183777, "learning_rate": 0.0004414159398258312, "loss": 0.0941, "mean_token_accuracy": 0.9725133627653122, "num_tokens": 4071656.0, "step": 461 }, { "entropy": 2.152670443058014, "epoch": 1.6696832579185519, "grad_norm": 0.8985757231712341, "learning_rate": 0.00044109445595129495, "loss": 0.2142, "mean_token_accuracy": 0.9387252777814865, "num_tokens": 4080023.0, "step": 462 }, { "entropy": 2.111784875392914, "epoch": 1.6733031674208145, "grad_norm": 0.47521084547042847, "learning_rate": 0.0004407723642075184, "loss": 0.0581, "mean_token_accuracy": 0.9821985810995102, "num_tokens": 4088469.0, "step": 463 }, { "entropy": 1.9784683287143707, "epoch": 1.676923076923077, "grad_norm": 0.5552536249160767, "learning_rate": 0.0004404496658178472, "loss": 0.1353, "mean_token_accuracy": 0.9619844257831573, "num_tokens": 4097737.0, "step": 464 }, { "entropy": 2.015674114227295, "epoch": 1.6805429864253394, "grad_norm": 0.6078305244445801, "learning_rate": 0.0004401263620079309, "loss": 0.1916, "mean_token_accuracy": 0.9506707191467285, "num_tokens": 4107156.0, "step": 465 }, { "entropy": 2.0832217931747437, "epoch": 1.6841628959276018, "grad_norm": 0.6618755459785461, "learning_rate": 0.0004398024540057186, "loss": 0.1671, "mean_token_accuracy": 0.9617152661085129, "num_tokens": 4116019.0, "step": 466 }, { "entropy": 2.0383114516735077, "epoch": 1.6877828054298643, "grad_norm": 0.5774693489074707, "learning_rate": 0.0004394779430414541, "loss": 0.2647, "mean_token_accuracy": 0.9387127161026001, "num_tokens": 4125001.0, "step": 467 }, { "entropy": 2.201409190893173, "epoch": 1.6914027149321267, "grad_norm": 0.7600311636924744, "learning_rate": 0.0004391528303476715, "loss": 0.073, "mean_token_accuracy": 0.979825034737587, "num_tokens": 4133467.0, "step": 468 }, { "entropy": 2.168666422367096, "epoch": 1.6950226244343891, "grad_norm": 0.7801902294158936, "learning_rate": 0.00043882711715919015, "loss": 0.2406, "mean_token_accuracy": 0.9451306313276291, "num_tokens": 4141765.0, "step": 469 }, { "entropy": 2.1429262161254883, "epoch": 1.6986425339366515, "grad_norm": 0.5192358493804932, "learning_rate": 0.0004385008047131104, "loss": 0.1052, "mean_token_accuracy": 0.9749262481927872, "num_tokens": 4150732.0, "step": 470 }, { "entropy": 2.1387495696544647, "epoch": 1.702262443438914, "grad_norm": 0.6219777464866638, "learning_rate": 0.0004381738942488083, "loss": 0.2127, "mean_token_accuracy": 0.9398418068885803, "num_tokens": 4159715.0, "step": 471 }, { "entropy": 2.1718398332595825, "epoch": 1.7058823529411766, "grad_norm": 0.5738123655319214, "learning_rate": 0.0004378463870079316, "loss": 0.1703, "mean_token_accuracy": 0.9520847648382187, "num_tokens": 4168526.0, "step": 472 }, { "entropy": 2.2768235206604004, "epoch": 1.709502262443439, "grad_norm": 0.662564754486084, "learning_rate": 0.00043751828423439456, "loss": 0.138, "mean_token_accuracy": 0.9581841826438904, "num_tokens": 4177189.0, "step": 473 }, { "entropy": 2.29143089056015, "epoch": 1.7131221719457015, "grad_norm": 0.8638074398040771, "learning_rate": 0.00043718958717437324, "loss": 0.1432, "mean_token_accuracy": 0.9645630270242691, "num_tokens": 4185367.0, "step": 474 }, { "entropy": 2.2810245156288147, "epoch": 1.716742081447964, "grad_norm": 0.6139346957206726, "learning_rate": 0.00043686029707630097, "loss": 0.173, "mean_token_accuracy": 0.9592728316783905, "num_tokens": 4194418.0, "step": 475 }, { "entropy": 2.1307725310325623, "epoch": 1.7203619909502263, "grad_norm": 0.5192779302597046, "learning_rate": 0.00043653041519086354, "loss": 0.1025, "mean_token_accuracy": 0.970764696598053, "num_tokens": 4203705.0, "step": 476 }, { "entropy": 2.160595118999481, "epoch": 1.7239819004524888, "grad_norm": 0.7398526668548584, "learning_rate": 0.0004361999427709943, "loss": 0.229, "mean_token_accuracy": 0.9352773874998093, "num_tokens": 4212648.0, "step": 477 }, { "entropy": 2.1865442991256714, "epoch": 1.7276018099547512, "grad_norm": 0.6227203011512756, "learning_rate": 0.0004358688810718699, "loss": 0.1118, "mean_token_accuracy": 0.9689576476812363, "num_tokens": 4221208.0, "step": 478 }, { "entropy": 2.086527943611145, "epoch": 1.7312217194570136, "grad_norm": 0.722144603729248, "learning_rate": 0.00043553723135090447, "loss": 0.1656, "mean_token_accuracy": 0.9537550210952759, "num_tokens": 4230810.0, "step": 479 }, { "entropy": 2.068355441093445, "epoch": 1.734841628959276, "grad_norm": 0.5781517028808594, "learning_rate": 0.0004352049948677462, "loss": 0.1497, "mean_token_accuracy": 0.9600837379693985, "num_tokens": 4240394.0, "step": 480 }, { "entropy": 2.185140371322632, "epoch": 1.7384615384615385, "grad_norm": 0.7261873483657837, "learning_rate": 0.0004348721728842715, "loss": 0.1582, "mean_token_accuracy": 0.9584025889635086, "num_tokens": 4249205.0, "step": 481 }, { "entropy": 2.21835720539093, "epoch": 1.742081447963801, "grad_norm": 0.5321667194366455, "learning_rate": 0.0004345387666645807, "loss": 0.1344, "mean_token_accuracy": 0.9659005403518677, "num_tokens": 4257808.0, "step": 482 }, { "entropy": 2.078131854534149, "epoch": 1.7457013574660634, "grad_norm": 0.5598498582839966, "learning_rate": 0.00043420477747499307, "loss": 0.1347, "mean_token_accuracy": 0.9678008407354355, "num_tokens": 4266728.0, "step": 483 }, { "entropy": 2.060504525899887, "epoch": 1.7493212669683258, "grad_norm": 0.5017166137695312, "learning_rate": 0.0004338702065840422, "loss": 0.0722, "mean_token_accuracy": 0.9762782007455826, "num_tokens": 4275514.0, "step": 484 }, { "entropy": 2.165244698524475, "epoch": 1.7529411764705882, "grad_norm": 0.4664002060890198, "learning_rate": 0.00043353505526247084, "loss": 0.1206, "mean_token_accuracy": 0.9696767777204514, "num_tokens": 4284013.0, "step": 485 }, { "entropy": 2.103049159049988, "epoch": 1.7565610859728507, "grad_norm": 0.6669000387191772, "learning_rate": 0.0004331993247832265, "loss": 0.1052, "mean_token_accuracy": 0.9665459096431732, "num_tokens": 4293011.0, "step": 486 }, { "entropy": 2.1286613941192627, "epoch": 1.760180995475113, "grad_norm": 0.7821269631385803, "learning_rate": 0.00043286301642145634, "loss": 0.3669, "mean_token_accuracy": 0.9062697291374207, "num_tokens": 4301965.0, "step": 487 }, { "entropy": 2.098009169101715, "epoch": 1.7638009049773755, "grad_norm": 0.5720731616020203, "learning_rate": 0.0004325261314545024, "loss": 0.1324, "mean_token_accuracy": 0.9650943875312805, "num_tokens": 4310914.0, "step": 488 }, { "entropy": 2.164614498615265, "epoch": 1.767420814479638, "grad_norm": 1.0500473976135254, "learning_rate": 0.0004321886711618967, "loss": 0.1182, "mean_token_accuracy": 0.9720661342144012, "num_tokens": 4319072.0, "step": 489 }, { "entropy": 2.2015402913093567, "epoch": 1.7710407239819004, "grad_norm": 0.5770253539085388, "learning_rate": 0.00043185063682535634, "loss": 0.1226, "mean_token_accuracy": 0.9615659862756729, "num_tokens": 4327539.0, "step": 490 }, { "entropy": 2.075456440448761, "epoch": 1.7746606334841628, "grad_norm": 0.6456925272941589, "learning_rate": 0.0004315120297287789, "loss": 0.1123, "mean_token_accuracy": 0.9628709554672241, "num_tokens": 4336523.0, "step": 491 }, { "entropy": 2.158169150352478, "epoch": 1.7782805429864252, "grad_norm": 0.8282069563865662, "learning_rate": 0.00043117285115823733, "loss": 0.2146, "mean_token_accuracy": 0.9413971602916718, "num_tokens": 4345294.0, "step": 492 }, { "entropy": 2.02735897898674, "epoch": 1.7819004524886877, "grad_norm": 0.783597469329834, "learning_rate": 0.000430833102401975, "loss": 0.1376, "mean_token_accuracy": 0.964630737900734, "num_tokens": 4354107.0, "step": 493 }, { "entropy": 2.138492166996002, "epoch": 1.78552036199095, "grad_norm": 0.6317175030708313, "learning_rate": 0.000430492784750401, "loss": 0.1005, "mean_token_accuracy": 0.9734214246273041, "num_tokens": 4362560.0, "step": 494 }, { "entropy": 2.0253217220306396, "epoch": 1.7891402714932125, "grad_norm": 0.5523395538330078, "learning_rate": 0.000430151899496085, "loss": 0.1633, "mean_token_accuracy": 0.9558031558990479, "num_tokens": 4371698.0, "step": 495 }, { "entropy": 2.160472810268402, "epoch": 1.792760180995475, "grad_norm": 0.6557935476303101, "learning_rate": 0.00042981044793375295, "loss": 0.1154, "mean_token_accuracy": 0.9722230583429337, "num_tokens": 4380612.0, "step": 496 }, { "entropy": 2.0284159183502197, "epoch": 1.7963800904977374, "grad_norm": 0.7357863187789917, "learning_rate": 0.00042946843136028117, "loss": 0.1166, "mean_token_accuracy": 0.9629471153020859, "num_tokens": 4389521.0, "step": 497 }, { "entropy": 2.1544791162014008, "epoch": 1.8, "grad_norm": 0.5604898929595947, "learning_rate": 0.00042912585107469226, "loss": 0.0834, "mean_token_accuracy": 0.9783036410808563, "num_tokens": 4398059.0, "step": 498 }, { "entropy": 2.1051094830036163, "epoch": 1.8036199095022625, "grad_norm": 0.4598539173603058, "learning_rate": 0.0004287827083781497, "loss": 0.0411, "mean_token_accuracy": 0.9868490546941757, "num_tokens": 4406453.0, "step": 499 }, { "entropy": 2.0219272077083588, "epoch": 1.807239819004525, "grad_norm": 0.8164628744125366, "learning_rate": 0.00042843900457395343, "loss": 0.1988, "mean_token_accuracy": 0.9502352625131607, "num_tokens": 4415440.0, "step": 500 }, { "entropy": 1.980013906955719, "epoch": 1.8108597285067873, "grad_norm": 0.572798490524292, "learning_rate": 0.0004280947409675341, "loss": 0.1148, "mean_token_accuracy": 0.966580331325531, "num_tokens": 4424532.0, "step": 501 }, { "entropy": 2.0646563172340393, "epoch": 1.8144796380090498, "grad_norm": 0.769386351108551, "learning_rate": 0.00042774991886644875, "loss": 0.1592, "mean_token_accuracy": 0.9553463608026505, "num_tokens": 4432913.0, "step": 502 }, { "entropy": 2.040877491235733, "epoch": 1.8180995475113122, "grad_norm": 0.7467371821403503, "learning_rate": 0.0004274045395803758, "loss": 0.2247, "mean_token_accuracy": 0.9526964277029037, "num_tokens": 4441425.0, "step": 503 }, { "entropy": 1.9934698939323425, "epoch": 1.8217194570135746, "grad_norm": 0.6602952480316162, "learning_rate": 0.00042705860442110964, "loss": 0.1681, "mean_token_accuracy": 0.9594631940126419, "num_tokens": 4450383.0, "step": 504 }, { "entropy": 2.0858289897441864, "epoch": 1.825339366515837, "grad_norm": 0.684380829334259, "learning_rate": 0.0004267121147025562, "loss": 0.1154, "mean_token_accuracy": 0.9638111293315887, "num_tokens": 4458862.0, "step": 505 }, { "entropy": 2.0886995792388916, "epoch": 1.8289592760180997, "grad_norm": 0.5784837007522583, "learning_rate": 0.00042636507174072756, "loss": 0.1026, "mean_token_accuracy": 0.9676834791898727, "num_tokens": 4467386.0, "step": 506 }, { "entropy": 2.0236063301563263, "epoch": 1.8325791855203621, "grad_norm": 0.5101180672645569, "learning_rate": 0.00042601747685373716, "loss": 0.1031, "mean_token_accuracy": 0.9734093993902206, "num_tokens": 4476054.0, "step": 507 }, { "entropy": 1.9801031053066254, "epoch": 1.8361990950226246, "grad_norm": 0.6581607460975647, "learning_rate": 0.00042566933136179455, "loss": 0.1548, "mean_token_accuracy": 0.9581006914377213, "num_tokens": 4484895.0, "step": 508 }, { "entropy": 2.0244787633419037, "epoch": 1.839819004524887, "grad_norm": 0.8100608587265015, "learning_rate": 0.0004253206365872008, "loss": 0.196, "mean_token_accuracy": 0.9532899260520935, "num_tokens": 4493737.0, "step": 509 }, { "entropy": 1.9108119010925293, "epoch": 1.8434389140271494, "grad_norm": 0.4903942048549652, "learning_rate": 0.00042497139385434314, "loss": 0.1313, "mean_token_accuracy": 0.9667337089776993, "num_tokens": 4502840.0, "step": 510 }, { "entropy": 2.009468197822571, "epoch": 1.8470588235294119, "grad_norm": 0.6010113954544067, "learning_rate": 0.0004246216044896897, "loss": 0.1013, "mean_token_accuracy": 0.9692314714193344, "num_tokens": 4511407.0, "step": 511 }, { "entropy": 2.0337170362472534, "epoch": 1.8506787330316743, "grad_norm": 0.7906802892684937, "learning_rate": 0.00042427126982178546, "loss": 0.1682, "mean_token_accuracy": 0.9550099819898605, "num_tokens": 4520018.0, "step": 512 }, { "entropy": 1.8813888728618622, "epoch": 1.8542986425339367, "grad_norm": 0.5353080034255981, "learning_rate": 0.00042392039118124586, "loss": 0.1228, "mean_token_accuracy": 0.9624074995517731, "num_tokens": 4529270.0, "step": 513 }, { "entropy": 2.012698233127594, "epoch": 1.8579185520361992, "grad_norm": 0.6713843941688538, "learning_rate": 0.00042356896990075285, "loss": 0.2225, "mean_token_accuracy": 0.9417333751916885, "num_tokens": 4538008.0, "step": 514 }, { "entropy": 1.880586564540863, "epoch": 1.8615384615384616, "grad_norm": 0.5821724534034729, "learning_rate": 0.00042321700731504916, "loss": 0.1144, "mean_token_accuracy": 0.9677341282367706, "num_tokens": 4546950.0, "step": 515 }, { "entropy": 2.0066279470920563, "epoch": 1.865158371040724, "grad_norm": 0.4095056354999542, "learning_rate": 0.0004228645047609335, "loss": 0.0424, "mean_token_accuracy": 0.9854962974786758, "num_tokens": 4555452.0, "step": 516 }, { "entropy": 2.042815536260605, "epoch": 1.8687782805429864, "grad_norm": 0.5398769974708557, "learning_rate": 0.0004225114635772555, "loss": 0.1343, "mean_token_accuracy": 0.9615450948476791, "num_tokens": 4564386.0, "step": 517 }, { "entropy": 2.0948933362960815, "epoch": 1.8723981900452489, "grad_norm": 0.6738974452018738, "learning_rate": 0.0004221578851049107, "loss": 0.1541, "mean_token_accuracy": 0.9526563137769699, "num_tokens": 4573041.0, "step": 518 }, { "entropy": 2.102545380592346, "epoch": 1.8760180995475113, "grad_norm": 0.7769943475723267, "learning_rate": 0.00042180377068683504, "loss": 0.2362, "mean_token_accuracy": 0.9472651779651642, "num_tokens": 4581666.0, "step": 519 }, { "entropy": 2.087820291519165, "epoch": 1.8796380090497737, "grad_norm": 0.5722424983978271, "learning_rate": 0.0004214491216680004, "loss": 0.1657, "mean_token_accuracy": 0.9537082612514496, "num_tokens": 4590238.0, "step": 520 }, { "entropy": 2.0093430876731873, "epoch": 1.8832579185520362, "grad_norm": 0.5844932198524475, "learning_rate": 0.00042109393939540867, "loss": 0.1485, "mean_token_accuracy": 0.9624215811491013, "num_tokens": 4599352.0, "step": 521 }, { "entropy": 1.9117147326469421, "epoch": 1.8868778280542986, "grad_norm": 0.46085676550865173, "learning_rate": 0.0004207382252180876, "loss": 0.0853, "mean_token_accuracy": 0.9769327491521835, "num_tokens": 4608571.0, "step": 522 }, { "entropy": 2.0205602943897247, "epoch": 1.890497737556561, "grad_norm": 0.5571608543395996, "learning_rate": 0.000420381980487085, "loss": 0.1517, "mean_token_accuracy": 0.9646699875593185, "num_tokens": 4617445.0, "step": 523 }, { "entropy": 1.9571953415870667, "epoch": 1.8941176470588235, "grad_norm": 0.470630943775177, "learning_rate": 0.0004200252065554636, "loss": 0.1005, "mean_token_accuracy": 0.9750025719404221, "num_tokens": 4626756.0, "step": 524 }, { "entropy": 2.063209116458893, "epoch": 1.897737556561086, "grad_norm": 0.6447069644927979, "learning_rate": 0.00041966790477829637, "loss": 0.113, "mean_token_accuracy": 0.9695079624652863, "num_tokens": 4635378.0, "step": 525 }, { "entropy": 1.9232109785079956, "epoch": 1.9013574660633483, "grad_norm": 0.5114295482635498, "learning_rate": 0.000419310076512661, "loss": 0.1492, "mean_token_accuracy": 0.9653338938951492, "num_tokens": 4644769.0, "step": 526 }, { "entropy": 2.1691197752952576, "epoch": 1.9049773755656108, "grad_norm": 0.7630137205123901, "learning_rate": 0.00041895172311763476, "loss": 0.212, "mean_token_accuracy": 0.9533941894769669, "num_tokens": 4652857.0, "step": 527 }, { "entropy": 2.04753240942955, "epoch": 1.9085972850678732, "grad_norm": 0.6423042416572571, "learning_rate": 0.00041859284595428955, "loss": 0.1455, "mean_token_accuracy": 0.956505224108696, "num_tokens": 4661591.0, "step": 528 }, { "entropy": 1.9440338611602783, "epoch": 1.9122171945701356, "grad_norm": 0.5011327266693115, "learning_rate": 0.00041823344638568656, "loss": 0.1255, "mean_token_accuracy": 0.965131089091301, "num_tokens": 4670594.0, "step": 529 }, { "entropy": 2.0554805397987366, "epoch": 1.915837104072398, "grad_norm": 0.5821590423583984, "learning_rate": 0.0004178735257768713, "loss": 0.0486, "mean_token_accuracy": 0.9875282496213913, "num_tokens": 4679344.0, "step": 530 }, { "entropy": 2.130349576473236, "epoch": 1.9194570135746605, "grad_norm": 0.5332052111625671, "learning_rate": 0.0004175130854948679, "loss": 0.0915, "mean_token_accuracy": 0.9737034440040588, "num_tokens": 4687922.0, "step": 531 }, { "entropy": 2.146788775920868, "epoch": 1.9230769230769231, "grad_norm": 0.5016877055168152, "learning_rate": 0.00041715212690867455, "loss": 0.1281, "mean_token_accuracy": 0.9681432545185089, "num_tokens": 4696593.0, "step": 532 }, { "entropy": 2.041268438100815, "epoch": 1.9266968325791856, "grad_norm": 0.5257729887962341, "learning_rate": 0.00041679065138925807, "loss": 0.1272, "mean_token_accuracy": 0.9649266451597214, "num_tokens": 4705792.0, "step": 533 }, { "entropy": 2.114819645881653, "epoch": 1.930316742081448, "grad_norm": 0.7085135579109192, "learning_rate": 0.0004164286603095484, "loss": 0.1545, "mean_token_accuracy": 0.9581228941679001, "num_tokens": 4714599.0, "step": 534 }, { "entropy": 2.022280514240265, "epoch": 1.9339366515837104, "grad_norm": 0.5309014320373535, "learning_rate": 0.00041606615504443387, "loss": 0.1933, "mean_token_accuracy": 0.9562340676784515, "num_tokens": 4724062.0, "step": 535 }, { "entropy": 2.0959260165691376, "epoch": 1.9375565610859729, "grad_norm": 0.6528061628341675, "learning_rate": 0.0004157031369707557, "loss": 0.1306, "mean_token_accuracy": 0.9612343460321426, "num_tokens": 4733077.0, "step": 536 }, { "entropy": 2.2772948145866394, "epoch": 1.9411764705882353, "grad_norm": 0.7351471185684204, "learning_rate": 0.0004153396074673028, "loss": 0.1494, "mean_token_accuracy": 0.9608108699321747, "num_tokens": 4741201.0, "step": 537 }, { "entropy": 2.0935052037239075, "epoch": 1.9447963800904977, "grad_norm": 0.5435840487480164, "learning_rate": 0.0004149755679148065, "loss": 0.0884, "mean_token_accuracy": 0.9745689779520035, "num_tokens": 4750306.0, "step": 538 }, { "entropy": 2.2082818746566772, "epoch": 1.9484162895927601, "grad_norm": 0.3780331611633301, "learning_rate": 0.00041461101969593537, "loss": 0.0739, "mean_token_accuracy": 0.9777179658412933, "num_tokens": 4758954.0, "step": 539 }, { "entropy": 2.1683040261268616, "epoch": 1.9520361990950226, "grad_norm": 0.4637961685657501, "learning_rate": 0.00041424596419529017, "loss": 0.0632, "mean_token_accuracy": 0.9834533184766769, "num_tokens": 4767615.0, "step": 540 }, { "entropy": 2.075555235147476, "epoch": 1.9556561085972852, "grad_norm": 0.7603118419647217, "learning_rate": 0.00041388040279939804, "loss": 0.2835, "mean_token_accuracy": 0.9364205300807953, "num_tokens": 4776714.0, "step": 541 }, { "entropy": 2.18926739692688, "epoch": 1.9592760180995477, "grad_norm": 0.8895708918571472, "learning_rate": 0.0004135143368967079, "loss": 0.2514, "mean_token_accuracy": 0.9361050724983215, "num_tokens": 4785402.0, "step": 542 }, { "entropy": 2.2387169003486633, "epoch": 1.96289592760181, "grad_norm": 0.6013544797897339, "learning_rate": 0.00041314776787758454, "loss": 0.1502, "mean_token_accuracy": 0.9594238847494125, "num_tokens": 4793928.0, "step": 543 }, { "entropy": 2.208383619785309, "epoch": 1.9665158371040725, "grad_norm": 0.6934756636619568, "learning_rate": 0.00041278069713430386, "loss": 0.1777, "mean_token_accuracy": 0.9619583487510681, "num_tokens": 4802612.0, "step": 544 }, { "entropy": 2.2621757984161377, "epoch": 1.970135746606335, "grad_norm": 0.6920077800750732, "learning_rate": 0.00041241312606104743, "loss": 0.1689, "mean_token_accuracy": 0.9594835937023163, "num_tokens": 4811332.0, "step": 545 }, { "entropy": 2.2654454112052917, "epoch": 1.9737556561085974, "grad_norm": 0.6259592771530151, "learning_rate": 0.000412045056053897, "loss": 0.142, "mean_token_accuracy": 0.9648078680038452, "num_tokens": 4820441.0, "step": 546 }, { "entropy": 2.218056857585907, "epoch": 1.9773755656108598, "grad_norm": 0.5390617847442627, "learning_rate": 0.0004116764885108292, "loss": 0.1737, "mean_token_accuracy": 0.9595656991004944, "num_tokens": 4829437.0, "step": 547 }, { "entropy": 2.2571592330932617, "epoch": 1.9809954751131222, "grad_norm": 0.3656528890132904, "learning_rate": 0.0004113074248317108, "loss": 0.0545, "mean_token_accuracy": 0.9825418293476105, "num_tokens": 4838118.0, "step": 548 }, { "entropy": 2.1890549659729004, "epoch": 1.9846153846153847, "grad_norm": 0.5716155767440796, "learning_rate": 0.00041093786641829247, "loss": 0.0997, "mean_token_accuracy": 0.9715700745582581, "num_tokens": 4847073.0, "step": 549 }, { "entropy": 2.2726192474365234, "epoch": 1.988235294117647, "grad_norm": 0.4709530770778656, "learning_rate": 0.0004105678146742042, "loss": 0.0746, "mean_token_accuracy": 0.9799739569425583, "num_tokens": 4855755.0, "step": 550 }, { "entropy": 2.2328362464904785, "epoch": 1.9918552036199095, "grad_norm": 0.6773779392242432, "learning_rate": 0.0004101972710049498, "loss": 0.1418, "mean_token_accuracy": 0.9629421681165695, "num_tokens": 4864601.0, "step": 551 }, { "entropy": 2.199812740087509, "epoch": 1.995475113122172, "grad_norm": 0.717012882232666, "learning_rate": 0.00040982623681790113, "loss": 0.2948, "mean_token_accuracy": 0.9432803690433502, "num_tokens": 4873630.0, "step": 552 }, { "entropy": 2.2102787494659424, "epoch": 1.9990950226244344, "grad_norm": 0.6925314664840698, "learning_rate": 0.00040945471352229346, "loss": 0.2579, "mean_token_accuracy": 0.9435124397277832, "num_tokens": 4882714.0, "step": 553 }, { "entropy": 2.3318979740142822, "epoch": 2.0, "grad_norm": 2.688188314437866, "learning_rate": 0.0004090827025292197, "loss": 0.0283, "mean_token_accuracy": 0.9918032884597778, "num_tokens": 4883450.0, "step": 554 }, { "epoch": 2.0, "eval_entropy": 2.2165925522160723, "eval_loss": 0.16817161440849304, "eval_mean_token_accuracy": 0.9567220133494555, "eval_num_tokens": 4883450.0, "eval_runtime": 116.1556, "eval_samples_per_second": 3.177, "eval_steps_per_second": 1.059, "step": 554 } ], "logging_steps": 1, "max_steps": 1662, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.634384518674615e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }