{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 554,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.358862280845642,
      "epoch": 0.0036199095022624436,
      "grad_norm": 2.292628288269043,
      "learning_rate": 0.0,
      "loss": 0.7311,
      "mean_token_accuracy": 0.8534883409738541,
      "num_tokens": 9316.0,
      "step": 1
    },
    {
      "entropy": 2.674945294857025,
      "epoch": 0.007239819004524887,
      "grad_norm": 3.8950836658477783,
      "learning_rate": 1.0219999999999999e-05,
      "loss": 1.0621,
      "mean_token_accuracy": 0.8183160275220871,
      "num_tokens": 17707.0,
      "step": 2
    },
    {
      "entropy": 2.4915525913238525,
      "epoch": 0.01085972850678733,
      "grad_norm": 2.792142868041992,
      "learning_rate": 2.0439999999999997e-05,
      "loss": 0.8448,
      "mean_token_accuracy": 0.8489587754011154,
      "num_tokens": 26783.0,
      "step": 3
    },
    {
      "entropy": 2.525622010231018,
      "epoch": 0.014479638009049774,
      "grad_norm": 2.7071900367736816,
      "learning_rate": 3.0659999999999994e-05,
      "loss": 0.8847,
      "mean_token_accuracy": 0.8486668318510056,
      "num_tokens": 35947.0,
      "step": 4
    },
    {
      "entropy": 2.588509976863861,
      "epoch": 0.01809954751131222,
      "grad_norm": 2.981574773788452,
      "learning_rate": 4.0879999999999995e-05,
      "loss": 1.0783,
      "mean_token_accuracy": 0.8135111033916473,
      "num_tokens": 44505.0,
      "step": 5
    },
    {
      "entropy": 2.662865400314331,
      "epoch": 0.02171945701357466,
      "grad_norm": 2.629283905029297,
      "learning_rate": 5.1099999999999995e-05,
      "loss": 0.9485,
      "mean_token_accuracy": 0.8152717798948288,
      "num_tokens": 53140.0,
      "step": 6
    },
    {
      "entropy": 2.6662243604660034,
      "epoch": 0.025339366515837104,
      "grad_norm": 2.730058431625366,
      "learning_rate": 6.131999999999999e-05,
      "loss": 0.6982,
      "mean_token_accuracy": 0.8552135527133942,
      "num_tokens": 61932.0,
      "step": 7
    },
    {
      "entropy": 2.661384105682373,
      "epoch": 0.02895927601809955,
      "grad_norm": 2.562839984893799,
      "learning_rate": 7.154e-05,
      "loss": 0.7296,
      "mean_token_accuracy": 0.8579540699720383,
      "num_tokens": 70973.0,
      "step": 8
    },
    {
      "entropy": 2.7889368534088135,
      "epoch": 0.03257918552036199,
      "grad_norm": 2.8640544414520264,
      "learning_rate": 8.175999999999999e-05,
      "loss": 0.5965,
      "mean_token_accuracy": 0.8638457208871841,
      "num_tokens": 79977.0,
      "step": 9
    },
    {
      "entropy": 2.811532199382782,
      "epoch": 0.03619909502262444,
      "grad_norm": 2.6199426651000977,
      "learning_rate": 9.197999999999998e-05,
      "loss": 0.4819,
      "mean_token_accuracy": 0.8786454051733017,
      "num_tokens": 88915.0,
      "step": 10
    },
    {
      "entropy": 2.941167712211609,
      "epoch": 0.039819004524886875,
      "grad_norm": 1.2497272491455078,
      "learning_rate": 0.00010219999999999999,
      "loss": 0.7192,
      "mean_token_accuracy": 0.841494083404541,
      "num_tokens": 97749.0,
      "step": 11
    },
    {
      "entropy": 3.0547962188720703,
      "epoch": 0.04343891402714932,
      "grad_norm": 1.436136245727539,
      "learning_rate": 0.00011241999999999998,
      "loss": 0.5908,
      "mean_token_accuracy": 0.8657624870538712,
      "num_tokens": 106048.0,
      "step": 12
    },
    {
      "entropy": 2.9914053082466125,
      "epoch": 0.047058823529411764,
      "grad_norm": 0.9903654456138611,
      "learning_rate": 0.00012263999999999998,
      "loss": 0.4008,
      "mean_token_accuracy": 0.8985499292612076,
      "num_tokens": 115216.0,
      "step": 13
    },
    {
      "entropy": 3.1867465376853943,
      "epoch": 0.05067873303167421,
      "grad_norm": 1.019572377204895,
      "learning_rate": 0.00013286,
      "loss": 0.5062,
      "mean_token_accuracy": 0.8893097043037415,
      "num_tokens": 124040.0,
      "step": 14
    },
    {
      "entropy": 3.2431325912475586,
      "epoch": 0.05429864253393665,
      "grad_norm": 1.2394084930419922,
      "learning_rate": 0.00014308,
      "loss": 0.361,
      "mean_token_accuracy": 0.9009967148303986,
      "num_tokens": 132447.0,
      "step": 15
    },
    {
      "entropy": 3.1858643889427185,
      "epoch": 0.0579185520361991,
      "grad_norm": 0.9859603643417358,
      "learning_rate": 0.00015329999999999999,
      "loss": 0.4498,
      "mean_token_accuracy": 0.887280747294426,
      "num_tokens": 141228.0,
      "step": 16
    },
    {
      "entropy": 3.5029141902923584,
      "epoch": 0.06153846153846154,
      "grad_norm": 1.453957438468933,
      "learning_rate": 0.00016351999999999998,
      "loss": 0.4949,
      "mean_token_accuracy": 0.888081505894661,
      "num_tokens": 149789.0,
      "step": 17
    },
    {
      "entropy": 3.4572895765304565,
      "epoch": 0.06515837104072399,
      "grad_norm": 1.390377402305603,
      "learning_rate": 0.00017374,
      "loss": 0.5449,
      "mean_token_accuracy": 0.8745045810937881,
      "num_tokens": 157813.0,
      "step": 18
    },
    {
      "entropy": 3.3081750869750977,
      "epoch": 0.06877828054298643,
      "grad_norm": 1.1171791553497314,
      "learning_rate": 0.00018395999999999997,
      "loss": 0.4786,
      "mean_token_accuracy": 0.8893420845270157,
      "num_tokens": 166315.0,
      "step": 19
    },
    {
      "entropy": 3.3776715993881226,
      "epoch": 0.07239819004524888,
      "grad_norm": 1.5567998886108398,
      "learning_rate": 0.00019418,
      "loss": 0.3669,
      "mean_token_accuracy": 0.9146632701158524,
      "num_tokens": 175207.0,
      "step": 20
    },
    {
      "entropy": 3.2677870988845825,
      "epoch": 0.0760180995475113,
      "grad_norm": 1.7404611110687256,
      "learning_rate": 0.00020439999999999998,
      "loss": 0.5287,
      "mean_token_accuracy": 0.8777483552694321,
      "num_tokens": 183833.0,
      "step": 21
    },
    {
      "entropy": 3.313201069831848,
      "epoch": 0.07963800904977375,
      "grad_norm": 1.0836979150772095,
      "learning_rate": 0.00021461999999999997,
      "loss": 0.3014,
      "mean_token_accuracy": 0.9215261936187744,
      "num_tokens": 192591.0,
      "step": 22
    },
    {
      "entropy": 3.208672881126404,
      "epoch": 0.0832579185520362,
      "grad_norm": 1.2197301387786865,
      "learning_rate": 0.00022483999999999997,
      "loss": 0.4401,
      "mean_token_accuracy": 0.9031257778406143,
      "num_tokens": 201372.0,
      "step": 23
    },
    {
      "entropy": 3.1830995082855225,
      "epoch": 0.08687782805429864,
      "grad_norm": 1.2422229051589966,
      "learning_rate": 0.00023506,
      "loss": 0.5144,
      "mean_token_accuracy": 0.8915928155183792,
      "num_tokens": 210348.0,
      "step": 24
    },
    {
      "entropy": 3.085207223892212,
      "epoch": 0.09049773755656108,
      "grad_norm": 0.8987624049186707,
      "learning_rate": 0.00024527999999999996,
      "loss": 0.3253,
      "mean_token_accuracy": 0.9221627116203308,
      "num_tokens": 219131.0,
      "step": 25
    },
    {
      "entropy": 3.026031017303467,
      "epoch": 0.09411764705882353,
      "grad_norm": 1.0273475646972656,
      "learning_rate": 0.0002555,
      "loss": 0.3495,
      "mean_token_accuracy": 0.9147634357213974,
      "num_tokens": 228292.0,
      "step": 26
    },
    {
      "entropy": 3.0420032739639282,
      "epoch": 0.09773755656108597,
      "grad_norm": 1.0590945482254028,
      "learning_rate": 0.00026572,
      "loss": 0.4495,
      "mean_token_accuracy": 0.9019353687763214,
      "num_tokens": 236942.0,
      "step": 27
    },
    {
      "entropy": 3.0469263792037964,
      "epoch": 0.10135746606334842,
      "grad_norm": 0.9584959745407104,
      "learning_rate": 0.00027594,
      "loss": 0.405,
      "mean_token_accuracy": 0.9216890782117844,
      "num_tokens": 245543.0,
      "step": 28
    },
    {
      "entropy": 2.92683744430542,
      "epoch": 0.10497737556561086,
      "grad_norm": 0.8826628923416138,
      "learning_rate": 0.00028616,
      "loss": 0.4004,
      "mean_token_accuracy": 0.9173285663127899,
      "num_tokens": 254264.0,
      "step": 29
    },
    {
      "entropy": 3.0086968541145325,
      "epoch": 0.1085972850678733,
      "grad_norm": 0.8521863222122192,
      "learning_rate": 0.00029637999999999995,
      "loss": 0.2876,
      "mean_token_accuracy": 0.9335231184959412,
      "num_tokens": 263143.0,
      "step": 30
    },
    {
      "entropy": 2.9086623191833496,
      "epoch": 0.11221719457013575,
      "grad_norm": 0.7830919623374939,
      "learning_rate": 0.00030659999999999997,
      "loss": 0.548,
      "mean_token_accuracy": 0.8831343650817871,
      "num_tokens": 272055.0,
      "step": 31
    },
    {
      "entropy": 2.9730575680732727,
      "epoch": 0.1158371040723982,
      "grad_norm": 0.7217472195625305,
      "learning_rate": 0.00031682,
      "loss": 0.3564,
      "mean_token_accuracy": 0.9119151830673218,
      "num_tokens": 280971.0,
      "step": 32
    },
    {
      "entropy": 3.081720530986786,
      "epoch": 0.11945701357466064,
      "grad_norm": 0.8697704076766968,
      "learning_rate": 0.00032703999999999996,
      "loss": 0.334,
      "mean_token_accuracy": 0.9234935492277145,
      "num_tokens": 289449.0,
      "step": 33
    },
    {
      "entropy": 3.1043431162834167,
      "epoch": 0.12307692307692308,
      "grad_norm": 0.7962514758110046,
      "learning_rate": 0.00033726,
      "loss": 0.1602,
      "mean_token_accuracy": 0.9554370939731598,
      "num_tokens": 297804.0,
      "step": 34
    },
    {
      "entropy": 3.0275490283966064,
      "epoch": 0.12669683257918551,
      "grad_norm": 0.5887104272842407,
      "learning_rate": 0.00034748,
      "loss": 0.2254,
      "mean_token_accuracy": 0.9491932094097137,
      "num_tokens": 306589.0,
      "step": 35
    },
    {
      "entropy": 3.099652886390686,
      "epoch": 0.13031674208144797,
      "grad_norm": 0.894397497177124,
      "learning_rate": 0.00035769999999999997,
      "loss": 0.6397,
      "mean_token_accuracy": 0.8802188038825989,
      "num_tokens": 315534.0,
      "step": 36
    },
    {
      "entropy": 3.0312134623527527,
      "epoch": 0.1339366515837104,
      "grad_norm": 0.6374682188034058,
      "learning_rate": 0.00036791999999999993,
      "loss": 0.2183,
      "mean_token_accuracy": 0.9478497952222824,
      "num_tokens": 324492.0,
      "step": 37
    },
    {
      "entropy": 3.28497713804245,
      "epoch": 0.13755656108597286,
      "grad_norm": 0.6740968823432922,
      "learning_rate": 0.00037813999999999995,
      "loss": 0.3619,
      "mean_token_accuracy": 0.9288723170757294,
      "num_tokens": 333195.0,
      "step": 38
    },
    {
      "entropy": 3.1478323340415955,
      "epoch": 0.1411764705882353,
      "grad_norm": 0.7235494256019592,
      "learning_rate": 0.00038836,
      "loss": 0.324,
      "mean_token_accuracy": 0.9179254025220871,
      "num_tokens": 342028.0,
      "step": 39
    },
    {
      "entropy": 3.279879152774811,
      "epoch": 0.14479638009049775,
      "grad_norm": 0.7512595653533936,
      "learning_rate": 0.00039858,
      "loss": 0.4804,
      "mean_token_accuracy": 0.889826312661171,
      "num_tokens": 350902.0,
      "step": 40
    },
    {
      "entropy": 3.173546612262726,
      "epoch": 0.14841628959276018,
      "grad_norm": 0.6978861689567566,
      "learning_rate": 0.00040879999999999996,
      "loss": 0.3442,
      "mean_token_accuracy": 0.9205169230699539,
      "num_tokens": 359787.0,
      "step": 41
    },
    {
      "entropy": 3.2385765314102173,
      "epoch": 0.1520361990950226,
      "grad_norm": 0.8108944892883301,
      "learning_rate": 0.00041901999999999993,
      "loss": 0.4223,
      "mean_token_accuracy": 0.8979178965091705,
      "num_tokens": 368426.0,
      "step": 42
    },
    {
      "entropy": 3.146568477153778,
      "epoch": 0.15565610859728507,
      "grad_norm": 0.5847787261009216,
      "learning_rate": 0.00042923999999999995,
      "loss": 0.1953,
      "mean_token_accuracy": 0.9556037336587906,
      "num_tokens": 377349.0,
      "step": 43
    },
    {
      "entropy": 3.066233277320862,
      "epoch": 0.1592760180995475,
      "grad_norm": 0.7887329459190369,
      "learning_rate": 0.00043945999999999997,
      "loss": 0.6815,
      "mean_token_accuracy": 0.8654293268918991,
      "num_tokens": 386603.0,
      "step": 44
    },
    {
      "entropy": 3.1745981574058533,
      "epoch": 0.16289592760180996,
      "grad_norm": 0.7280165553092957,
      "learning_rate": 0.00044967999999999994,
      "loss": 0.1932,
      "mean_token_accuracy": 0.9479279220104218,
      "num_tokens": 395070.0,
      "step": 45
    },
    {
      "entropy": 3.1094446182250977,
      "epoch": 0.1665158371040724,
      "grad_norm": 0.6453448534011841,
      "learning_rate": 0.00045989999999999996,
      "loss": 0.2608,
      "mean_token_accuracy": 0.9249396026134491,
      "num_tokens": 403651.0,
      "step": 46
    },
    {
      "entropy": 2.9050925970077515,
      "epoch": 0.17013574660633485,
      "grad_norm": 0.6689278483390808,
      "learning_rate": 0.00047012,
      "loss": 0.4489,
      "mean_token_accuracy": 0.898686870932579,
      "num_tokens": 412898.0,
      "step": 47
    },
    {
      "entropy": 3.2239145040512085,
      "epoch": 0.17375565610859728,
      "grad_norm": 1.0014020204544067,
      "learning_rate": 0.00048033999999999994,
      "loss": 0.3234,
      "mean_token_accuracy": 0.9231891483068466,
      "num_tokens": 421420.0,
      "step": 48
    },
    {
      "entropy": 3.035899817943573,
      "epoch": 0.17737556561085974,
      "grad_norm": 0.6415768265724182,
      "learning_rate": 0.0004905599999999999,
      "loss": 0.2259,
      "mean_token_accuracy": 0.9447792917490005,
      "num_tokens": 430258.0,
      "step": 49
    },
    {
      "entropy": 3.057477653026581,
      "epoch": 0.18099547511312217,
      "grad_norm": 0.6042271256446838,
      "learning_rate": 0.0005007799999999999,
      "loss": 0.2228,
      "mean_token_accuracy": 0.9473378211259842,
      "num_tokens": 439593.0,
      "step": 50
    },
    {
      "entropy": 2.8375911116600037,
      "epoch": 0.18461538461538463,
      "grad_norm": 0.739811897277832,
      "learning_rate": 0.000511,
      "loss": 0.3623,
      "mean_token_accuracy": 0.9050924181938171,
      "num_tokens": 449056.0,
      "step": 51
    },
    {
      "entropy": 2.9926682114601135,
      "epoch": 0.18823529411764706,
      "grad_norm": 0.6637321710586548,
      "learning_rate": 0.0005109995633102972,
      "loss": 0.2924,
      "mean_token_accuracy": 0.9397273659706116,
      "num_tokens": 457677.0,
      "step": 52
    },
    {
      "entropy": 2.7932987809181213,
      "epoch": 0.19185520361990951,
      "grad_norm": 0.5666584372520447,
      "learning_rate": 0.0005109982532428477,
      "loss": 0.2055,
      "mean_token_accuracy": 0.9385408014059067,
      "num_tokens": 466969.0,
      "step": 53
    },
    {
      "entropy": 2.765812337398529,
      "epoch": 0.19547511312217195,
      "grad_norm": 0.7875120639801025,
      "learning_rate": 0.0005109960698026271,
      "loss": 0.4549,
      "mean_token_accuracy": 0.9052814990282059,
      "num_tokens": 476285.0,
      "step": 54
    },
    {
      "entropy": 2.884207248687744,
      "epoch": 0.19909502262443438,
      "grad_norm": 0.7538661956787109,
      "learning_rate": 0.0005109930129979285,
      "loss": 0.3751,
      "mean_token_accuracy": 0.9210246652364731,
      "num_tokens": 484668.0,
      "step": 55
    },
    {
      "entropy": 2.779718518257141,
      "epoch": 0.20271493212669683,
      "grad_norm": 0.8069296479225159,
      "learning_rate": 0.0005109890828403621,
      "loss": 0.3664,
      "mean_token_accuracy": 0.9219843596220016,
      "num_tokens": 493292.0,
      "step": 56
    },
    {
      "entropy": 2.841543674468994,
      "epoch": 0.20633484162895926,
      "grad_norm": 0.5545904636383057,
      "learning_rate": 0.0005109842793448548,
      "loss": 0.1973,
      "mean_token_accuracy": 0.9547395706176758,
      "num_tokens": 501973.0,
      "step": 57
    },
    {
      "entropy": 2.8180030584335327,
      "epoch": 0.20995475113122172,
      "grad_norm": 1.015456199645996,
      "learning_rate": 0.0005109786025296513,
      "loss": 0.6019,
      "mean_token_accuracy": 0.88613361120224,
      "num_tokens": 510840.0,
      "step": 58
    },
    {
      "entropy": 2.7450912594795227,
      "epoch": 0.21357466063348415,
      "grad_norm": 0.6784740686416626,
      "learning_rate": 0.0005109720524163127,
      "loss": 0.2868,
      "mean_token_accuracy": 0.9295425117015839,
      "num_tokens": 519656.0,
      "step": 59
    },
    {
      "entropy": 2.822400987148285,
      "epoch": 0.2171945701357466,
      "grad_norm": 0.8780149817466736,
      "learning_rate": 0.000510964629029717,
      "loss": 0.4371,
      "mean_token_accuracy": 0.9089596569538116,
      "num_tokens": 528105.0,
      "step": 60
    },
    {
      "entropy": 2.522100865840912,
      "epoch": 0.22081447963800904,
      "grad_norm": 0.51394122838974,
      "learning_rate": 0.0005109563323980594,
      "loss": 0.2509,
      "mean_token_accuracy": 0.941976860165596,
      "num_tokens": 537707.0,
      "step": 61
    },
    {
      "entropy": 2.6596657633781433,
      "epoch": 0.2244343891402715,
      "grad_norm": 0.6359816789627075,
      "learning_rate": 0.0005109471625528516,
      "loss": 0.3685,
      "mean_token_accuracy": 0.9191890209913254,
      "num_tokens": 546517.0,
      "step": 62
    },
    {
      "entropy": 2.800311803817749,
      "epoch": 0.22805429864253393,
      "grad_norm": 0.6862941980361938,
      "learning_rate": 0.0005109371195289215,
      "loss": 0.2457,
      "mean_token_accuracy": 0.9330879002809525,
      "num_tokens": 555493.0,
      "step": 63
    },
    {
      "entropy": 2.7235344648361206,
      "epoch": 0.2316742081447964,
      "grad_norm": 1.0464682579040527,
      "learning_rate": 0.0005109262033644142,
      "loss": 0.4417,
      "mean_token_accuracy": 0.8957678377628326,
      "num_tokens": 564255.0,
      "step": 64
    },
    {
      "entropy": 2.6643534302711487,
      "epoch": 0.23529411764705882,
      "grad_norm": 1.0790019035339355,
      "learning_rate": 0.0005109144141007903,
      "loss": 0.4947,
      "mean_token_accuracy": 0.8889007717370987,
      "num_tokens": 573401.0,
      "step": 65
    },
    {
      "entropy": 2.760925054550171,
      "epoch": 0.23891402714932128,
      "grad_norm": 0.7957189679145813,
      "learning_rate": 0.0005109017517828273,
      "loss": 0.2259,
      "mean_token_accuracy": 0.944578230381012,
      "num_tokens": 581905.0,
      "step": 66
    },
    {
      "entropy": 2.7048792839050293,
      "epoch": 0.2425339366515837,
      "grad_norm": 0.9530714750289917,
      "learning_rate": 0.0005108882164586181,
      "loss": 0.3122,
      "mean_token_accuracy": 0.9257418513298035,
      "num_tokens": 590802.0,
      "step": 67
    },
    {
      "entropy": 2.6733291149139404,
      "epoch": 0.24615384615384617,
      "grad_norm": 0.8295993208885193,
      "learning_rate": 0.0005108738081795716,
      "loss": 0.3701,
      "mean_token_accuracy": 0.898589238524437,
      "num_tokens": 599279.0,
      "step": 68
    },
    {
      "entropy": 2.5613606572151184,
      "epoch": 0.2497737556561086,
      "grad_norm": 0.6205935478210449,
      "learning_rate": 0.0005108585270004123,
      "loss": 0.4372,
      "mean_token_accuracy": 0.9116007685661316,
      "num_tokens": 608107.0,
      "step": 69
    },
    {
      "entropy": 2.458296835422516,
      "epoch": 0.25339366515837103,
      "grad_norm": 0.7629838585853577,
      "learning_rate": 0.0005108423729791799,
      "loss": 0.2307,
      "mean_token_accuracy": 0.9386163502931595,
      "num_tokens": 616881.0,
      "step": 70
    },
    {
      "entropy": 2.4176695346832275,
      "epoch": 0.25701357466063346,
      "grad_norm": 0.902400016784668,
      "learning_rate": 0.0005108253461772298,
      "loss": 0.2853,
      "mean_token_accuracy": 0.9237343072891235,
      "num_tokens": 625323.0,
      "step": 71
    },
    {
      "entropy": 2.2265281677246094,
      "epoch": 0.26063348416289595,
      "grad_norm": 0.7744383811950684,
      "learning_rate": 0.0005108074466592316,
      "loss": 0.2435,
      "mean_token_accuracy": 0.9508260935544968,
      "num_tokens": 634260.0,
      "step": 72
    },
    {
      "entropy": 2.1855952441692352,
      "epoch": 0.2642533936651584,
      "grad_norm": 0.8615190386772156,
      "learning_rate": 0.0005107886744931702,
      "loss": 0.3323,
      "mean_token_accuracy": 0.9276078194379807,
      "num_tokens": 643235.0,
      "step": 73
    },
    {
      "entropy": 2.179121494293213,
      "epoch": 0.2678733031674208,
      "grad_norm": 0.8953279256820679,
      "learning_rate": 0.0005107690297503444,
      "loss": 0.2384,
      "mean_token_accuracy": 0.9425230622291565,
      "num_tokens": 652032.0,
      "step": 74
    },
    {
      "entropy": 2.1565526127815247,
      "epoch": 0.27149321266968324,
      "grad_norm": 0.6830486059188843,
      "learning_rate": 0.0005107485125053678,
      "loss": 0.2759,
      "mean_token_accuracy": 0.9360661953687668,
      "num_tokens": 660978.0,
      "step": 75
    },
    {
      "entropy": 2.0900665521621704,
      "epoch": 0.2751131221719457,
      "grad_norm": 0.786665141582489,
      "learning_rate": 0.0005107271228361672,
      "loss": 0.4061,
      "mean_token_accuracy": 0.910009115934372,
      "num_tokens": 669817.0,
      "step": 76
    },
    {
      "entropy": 2.1311859488487244,
      "epoch": 0.27873303167420815,
      "grad_norm": 0.6399909853935242,
      "learning_rate": 0.0005107048608239836,
      "loss": 0.272,
      "mean_token_accuracy": 0.9424714297056198,
      "num_tokens": 678469.0,
      "step": 77
    },
    {
      "entropy": 2.059997320175171,
      "epoch": 0.2823529411764706,
      "grad_norm": 0.8114754557609558,
      "learning_rate": 0.0005106817265533706,
      "loss": 0.4029,
      "mean_token_accuracy": 0.9037660360336304,
      "num_tokens": 687261.0,
      "step": 78
    },
    {
      "entropy": 1.9725019037723541,
      "epoch": 0.285972850678733,
      "grad_norm": 0.9420941472053528,
      "learning_rate": 0.0005106577201121952,
      "loss": 0.535,
      "mean_token_accuracy": 0.8996377140283585,
      "num_tokens": 695941.0,
      "step": 79
    },
    {
      "entropy": 1.9951164424419403,
      "epoch": 0.2895927601809955,
      "grad_norm": 0.6476142406463623,
      "learning_rate": 0.0005106328415916372,
      "loss": 0.2242,
      "mean_token_accuracy": 0.941379725933075,
      "num_tokens": 704643.0,
      "step": 80
    },
    {
      "entropy": 1.8962564170360565,
      "epoch": 0.29321266968325793,
      "grad_norm": 0.5974630117416382,
      "learning_rate": 0.0005106070910861881,
      "loss": 0.2934,
      "mean_token_accuracy": 0.9217697530984879,
      "num_tokens": 713605.0,
      "step": 81
    },
    {
      "entropy": 1.9781515896320343,
      "epoch": 0.29683257918552036,
      "grad_norm": 0.8755478262901306,
      "learning_rate": 0.0005105804686936518,
      "loss": 0.4551,
      "mean_token_accuracy": 0.9051328897476196,
      "num_tokens": 722385.0,
      "step": 82
    },
    {
      "entropy": 1.9892418384552002,
      "epoch": 0.3004524886877828,
      "grad_norm": 0.6887345314025879,
      "learning_rate": 0.0005105529745151433,
      "loss": 0.244,
      "mean_token_accuracy": 0.9261117279529572,
      "num_tokens": 730962.0,
      "step": 83
    },
    {
      "entropy": 2.0053181648254395,
      "epoch": 0.3040723981900452,
      "grad_norm": 0.6930885910987854,
      "learning_rate": 0.0005105246086550893,
      "loss": 0.3155,
      "mean_token_accuracy": 0.9206147193908691,
      "num_tokens": 739499.0,
      "step": 84
    },
    {
      "entropy": 1.9716475903987885,
      "epoch": 0.3076923076923077,
      "grad_norm": 0.5049461722373962,
      "learning_rate": 0.0005104953712212266,
      "loss": 0.2215,
      "mean_token_accuracy": 0.9608763605356216,
      "num_tokens": 748604.0,
      "step": 85
    },
    {
      "entropy": 1.9186978042125702,
      "epoch": 0.31131221719457014,
      "grad_norm": 0.5756685733795166,
      "learning_rate": 0.000510465262324603,
      "loss": 0.2658,
      "mean_token_accuracy": 0.9372887462377548,
      "num_tokens": 757919.0,
      "step": 86
    },
    {
      "entropy": 1.9738290905952454,
      "epoch": 0.31493212669683257,
      "grad_norm": 0.6163789629936218,
      "learning_rate": 0.0005104342820795758,
      "loss": 0.2472,
      "mean_token_accuracy": 0.9430449157953262,
      "num_tokens": 766708.0,
      "step": 87
    },
    {
      "entropy": 2.1927571892738342,
      "epoch": 0.318552036199095,
      "grad_norm": 0.7953162789344788,
      "learning_rate": 0.0005104024306038119,
      "loss": 0.261,
      "mean_token_accuracy": 0.9425829648971558,
      "num_tokens": 774601.0,
      "step": 88
    },
    {
      "entropy": 2.043731451034546,
      "epoch": 0.3221719457013575,
      "grad_norm": 0.8098088502883911,
      "learning_rate": 0.0005103697080182872,
      "loss": 0.3126,
      "mean_token_accuracy": 0.9158089309930801,
      "num_tokens": 783170.0,
      "step": 89
    },
    {
      "entropy": 1.9801572561264038,
      "epoch": 0.3257918552036199,
      "grad_norm": 0.5227240920066833,
      "learning_rate": 0.0005103361144472864,
      "loss": 0.1291,
      "mean_token_accuracy": 0.9666071832180023,
      "num_tokens": 791769.0,
      "step": 90
    },
    {
      "entropy": 1.9553790986537933,
      "epoch": 0.32941176470588235,
      "grad_norm": 0.7819464206695557,
      "learning_rate": 0.0005103016500184022,
      "loss": 0.531,
      "mean_token_accuracy": 0.8817111849784851,
      "num_tokens": 800824.0,
      "step": 91
    },
    {
      "entropy": 1.9291303753852844,
      "epoch": 0.3330316742081448,
      "grad_norm": 0.7178757190704346,
      "learning_rate": 0.0005102663148625347,
      "loss": 0.3301,
      "mean_token_accuracy": 0.9357631802558899,
      "num_tokens": 809347.0,
      "step": 92
    },
    {
      "entropy": 1.9846041798591614,
      "epoch": 0.33665158371040727,
      "grad_norm": 1.316636085510254,
      "learning_rate": 0.0005102301091138916,
      "loss": 0.4241,
      "mean_token_accuracy": 0.8993304669857025,
      "num_tokens": 817174.0,
      "step": 93
    },
    {
      "entropy": 1.814637303352356,
      "epoch": 0.3402714932126697,
      "grad_norm": 0.5486414432525635,
      "learning_rate": 0.0005101930329099865,
      "loss": 0.116,
      "mean_token_accuracy": 0.9674727618694305,
      "num_tokens": 826177.0,
      "step": 94
    },
    {
      "entropy": 1.9128066003322601,
      "epoch": 0.3438914027149321,
      "grad_norm": 0.620303750038147,
      "learning_rate": 0.00051015508639164,
      "loss": 0.1833,
      "mean_token_accuracy": 0.9569521993398666,
      "num_tokens": 835409.0,
      "step": 95
    },
    {
      "entropy": 1.7541870176792145,
      "epoch": 0.34751131221719456,
      "grad_norm": 0.8337438702583313,
      "learning_rate": 0.0005101162697029776,
      "loss": 0.3327,
      "mean_token_accuracy": 0.9193180054426193,
      "num_tokens": 844692.0,
      "step": 96
    },
    {
      "entropy": 1.8255240619182587,
      "epoch": 0.351131221719457,
      "grad_norm": 0.877780556678772,
      "learning_rate": 0.00051007658299143,
      "loss": 0.2106,
      "mean_token_accuracy": 0.9527023881673813,
      "num_tokens": 853309.0,
      "step": 97
    },
    {
      "entropy": 1.8611579239368439,
      "epoch": 0.3547511312217195,
      "grad_norm": 1.0667716264724731,
      "learning_rate": 0.0005100360264077325,
      "loss": 0.3196,
      "mean_token_accuracy": 0.9195879399776459,
      "num_tokens": 861859.0,
      "step": 98
    },
    {
      "entropy": 1.821915864944458,
      "epoch": 0.3583710407239819,
      "grad_norm": 0.8400309681892395,
      "learning_rate": 0.0005099946001059241,
      "loss": 0.4036,
      "mean_token_accuracy": 0.8951036781072617,
      "num_tokens": 871060.0,
      "step": 99
    },
    {
      "entropy": 1.7648265063762665,
      "epoch": 0.36199095022624433,
      "grad_norm": 1.1391404867172241,
      "learning_rate": 0.0005099523042433472,
      "loss": 0.389,
      "mean_token_accuracy": 0.901309460401535,
      "num_tokens": 880593.0,
      "step": 100
    },
    {
      "entropy": 1.8506875336170197,
      "epoch": 0.36561085972850677,
      "grad_norm": 0.6923297643661499,
      "learning_rate": 0.000509909138980647,
      "loss": 0.2504,
      "mean_token_accuracy": 0.9384842216968536,
      "num_tokens": 889739.0,
      "step": 101
    },
    {
      "entropy": 1.9311015605926514,
      "epoch": 0.36923076923076925,
      "grad_norm": 0.9677391052246094,
      "learning_rate": 0.0005098651044817704,
      "loss": 0.6953,
      "mean_token_accuracy": 0.8752655684947968,
      "num_tokens": 898992.0,
      "step": 102
    },
    {
      "entropy": 1.9590983986854553,
      "epoch": 0.3728506787330317,
      "grad_norm": 0.6364567279815674,
      "learning_rate": 0.0005098202009139663,
      "loss": 0.4318,
      "mean_token_accuracy": 0.9056479930877686,
      "num_tokens": 908225.0,
      "step": 103
    },
    {
      "entropy": 1.9455370008945465,
      "epoch": 0.3764705882352941,
      "grad_norm": 0.6747863292694092,
      "learning_rate": 0.0005097744284477839,
      "loss": 0.244,
      "mean_token_accuracy": 0.9428392052650452,
      "num_tokens": 917134.0,
      "step": 104
    },
    {
      "entropy": 1.8632825911045074,
      "epoch": 0.38009049773755654,
      "grad_norm": 0.5705651044845581,
      "learning_rate": 0.0005097277872570731,
      "loss": 0.2508,
      "mean_token_accuracy": 0.9325222969055176,
      "num_tokens": 926573.0,
      "step": 105
    },
    {
      "entropy": 1.9370323717594147,
      "epoch": 0.38371040723981903,
      "grad_norm": 0.6298627853393555,
      "learning_rate": 0.000509680277518983,
      "loss": 0.2481,
      "mean_token_accuracy": 0.9281332045793533,
      "num_tokens": 935853.0,
      "step": 106
    },
    {
      "entropy": 2.0217572450637817,
      "epoch": 0.38733031674208146,
      "grad_norm": 0.5434353947639465,
      "learning_rate": 0.0005096318994139617,
      "loss": 0.1809,
      "mean_token_accuracy": 0.9592084139585495,
      "num_tokens": 944279.0,
      "step": 107
    },
    {
      "entropy": 1.9619770646095276,
      "epoch": 0.3909502262443439,
      "grad_norm": 0.6959638595581055,
      "learning_rate": 0.0005095826531257552,
      "loss": 0.1376,
      "mean_token_accuracy": 0.9608310014009476,
      "num_tokens": 953336.0,
      "step": 108
    },
    {
      "entropy": 2.12511146068573,
      "epoch": 0.3945701357466063,
      "grad_norm": 1.0152848958969116,
      "learning_rate": 0.0005095325388414074,
      "loss": 0.4382,
      "mean_token_accuracy": 0.915201798081398,
      "num_tokens": 962002.0,
      "step": 109
    },
    {
      "entropy": 2.0171878039836884,
      "epoch": 0.39819004524886875,
      "grad_norm": 0.8337467312812805,
      "learning_rate": 0.0005094815567512587,
      "loss": 0.2672,
      "mean_token_accuracy": 0.9313560128211975,
      "num_tokens": 970954.0,
      "step": 110
    },
    {
      "entropy": 2.1024146378040314,
      "epoch": 0.40180995475113124,
      "grad_norm": 0.8214333057403564,
      "learning_rate": 0.0005094297070489455,
      "loss": 0.3146,
      "mean_token_accuracy": 0.9289091974496841,
      "num_tokens": 979929.0,
      "step": 111
    },
    {
      "entropy": 2.260519325733185,
      "epoch": 0.40542986425339367,
      "grad_norm": 1.1298810243606567,
      "learning_rate": 0.0005093769899313996,
      "loss": 0.3055,
      "mean_token_accuracy": 0.9213490188121796,
      "num_tokens": 988477.0,
      "step": 112
    },
    {
      "entropy": 2.2228699326515198,
      "epoch": 0.4090497737556561,
      "grad_norm": 0.8601953983306885,
      "learning_rate": 0.0005093234055988475,
      "loss": 0.2738,
      "mean_token_accuracy": 0.920888364315033,
      "num_tokens": 997091.0,
      "step": 113
    },
    {
      "entropy": 2.2165185809135437,
      "epoch": 0.41266968325791853,
      "grad_norm": 0.6331561803817749,
      "learning_rate": 0.0005092689542548091,
      "loss": 0.2241,
      "mean_token_accuracy": 0.9408514499664307,
      "num_tokens": 1005866.0,
      "step": 114
    },
    {
      "entropy": 2.324040472507477,
      "epoch": 0.416289592760181,
      "grad_norm": 0.680496096611023,
      "learning_rate": 0.0005092136361060975,
      "loss": 0.2454,
      "mean_token_accuracy": 0.9433349967002869,
      "num_tokens": 1014277.0,
      "step": 115
    },
    {
      "entropy": 2.413789749145508,
      "epoch": 0.41990950226244345,
      "grad_norm": 0.7489557862281799,
      "learning_rate": 0.0005091574513628183,
      "loss": 0.2856,
      "mean_token_accuracy": 0.934124082326889,
      "num_tokens": 1023032.0,
      "step": 116
    },
    {
      "entropy": 2.4693005681037903,
      "epoch": 0.4235294117647059,
      "grad_norm": 0.6842612624168396,
      "learning_rate": 0.0005091004002383682,
      "loss": 0.2778,
      "mean_token_accuracy": 0.9386793673038483,
      "num_tokens": 1031883.0,
      "step": 117
    },
    {
      "entropy": 2.4351969361305237,
      "epoch": 0.4271493212669683,
      "grad_norm": 0.9150674343109131,
      "learning_rate": 0.0005090424829494347,
      "loss": 0.3151,
      "mean_token_accuracy": 0.9177709072828293,
      "num_tokens": 1040985.0,
      "step": 118
    },
    {
      "entropy": 2.5141562819480896,
      "epoch": 0.4307692307692308,
      "grad_norm": 1.0200655460357666,
      "learning_rate": 0.000508983699715995,
      "loss": 0.5134,
      "mean_token_accuracy": 0.8835459351539612,
      "num_tokens": 1049949.0,
      "step": 119
    },
    {
      "entropy": 2.479240596294403,
      "epoch": 0.4343891402714932,
      "grad_norm": 0.783278226852417,
      "learning_rate": 0.0005089240507613151,
      "loss": 0.2745,
      "mean_token_accuracy": 0.9389322698116302,
      "num_tokens": 1058953.0,
      "step": 120
    },
    {
      "entropy": 2.457803785800934,
      "epoch": 0.43800904977375565,
      "grad_norm": 0.7620834112167358,
      "learning_rate": 0.0005088635363119497,
      "loss": 0.3394,
      "mean_token_accuracy": 0.9145695865154266,
      "num_tokens": 1068624.0,
      "step": 121
    },
    {
      "entropy": 2.4909247756004333,
      "epoch": 0.4416289592760181,
      "grad_norm": 0.5868712067604065,
      "learning_rate": 0.0005088021565977403,
      "loss": 0.1726,
      "mean_token_accuracy": 0.9567564129829407,
      "num_tokens": 1077686.0,
      "step": 122
    },
    {
      "entropy": 2.5540462732315063,
      "epoch": 0.4452488687782805,
      "grad_norm": 1.1467291116714478,
      "learning_rate": 0.0005087399118518148,
      "loss": 0.2617,
      "mean_token_accuracy": 0.9329706132411957,
      "num_tokens": 1086230.0,
      "step": 123
    },
    {
      "entropy": 2.377680242061615,
      "epoch": 0.448868778280543,
      "grad_norm": 0.7021825909614563,
      "learning_rate": 0.0005086768023105866,
      "loss": 0.4124,
      "mean_token_accuracy": 0.9093360006809235,
      "num_tokens": 1095867.0,
      "step": 124
    },
    {
      "entropy": 2.55239599943161,
      "epoch": 0.45248868778280543,
      "grad_norm": 0.5947801470756531,
      "learning_rate": 0.0005086128282137538,
      "loss": 0.2752,
      "mean_token_accuracy": 0.9248816668987274,
      "num_tokens": 1105003.0,
      "step": 125
    },
    {
      "entropy": 2.4695483446121216,
      "epoch": 0.45610859728506786,
      "grad_norm": 1.345604658126831,
      "learning_rate": 0.0005085479898042985,
      "loss": 0.2577,
      "mean_token_accuracy": 0.9318550229072571,
      "num_tokens": 1114162.0,
      "step": 126
    },
    {
      "entropy": 2.4898732900619507,
      "epoch": 0.4597285067873303,
      "grad_norm": 0.8534179329872131,
      "learning_rate": 0.0005084822873284848,
      "loss": 0.3013,
      "mean_token_accuracy": 0.9195661097764969,
      "num_tokens": 1123457.0,
      "step": 127
    },
    {
      "entropy": 2.5951223969459534,
      "epoch": 0.4633484162895928,
      "grad_norm": 1.1677368879318237,
      "learning_rate": 0.0005084157210358592,
      "loss": 0.1612,
      "mean_token_accuracy": 0.9599333852529526,
      "num_tokens": 1131774.0,
      "step": 128
    },
    {
      "entropy": 2.7315847873687744,
      "epoch": 0.4669683257918552,
      "grad_norm": 0.7633224129676819,
      "learning_rate": 0.0005083482911792492,
      "loss": 0.2437,
      "mean_token_accuracy": 0.9487509876489639,
      "num_tokens": 1140301.0,
      "step": 129
    },
    {
      "entropy": 2.6348633766174316,
      "epoch": 0.47058823529411764,
      "grad_norm": 0.7573317885398865,
      "learning_rate": 0.0005082799980147617,
      "loss": 0.2426,
      "mean_token_accuracy": 0.947308748960495,
      "num_tokens": 1148929.0,
      "step": 130
    },
    {
      "entropy": 2.60002738237381,
      "epoch": 0.47420814479638007,
      "grad_norm": 1.8195319175720215,
      "learning_rate": 0.0005082108418017829,
      "loss": 0.1792,
      "mean_token_accuracy": 0.9512491375207901,
      "num_tokens": 1157682.0,
      "step": 131
    },
    {
      "entropy": 2.5319923162460327,
      "epoch": 0.47782805429864256,
      "grad_norm": 0.6342993378639221,
      "learning_rate": 0.0005081408228029771,
      "loss": 0.1843,
      "mean_token_accuracy": 0.9440758228302002,
      "num_tokens": 1166687.0,
      "step": 132
    },
    {
      "entropy": 2.5666881799697876,
      "epoch": 0.481447963800905,
      "grad_norm": 0.8979415893554688,
      "learning_rate": 0.0005080699412842852,
      "loss": 0.4824,
      "mean_token_accuracy": 0.8837443292140961,
      "num_tokens": 1175746.0,
      "step": 133
    },
    {
      "entropy": 2.6854636669158936,
      "epoch": 0.4850678733031674,
      "grad_norm": 0.8302125334739685,
      "learning_rate": 0.0005079981975149243,
      "loss": 0.267,
      "mean_token_accuracy": 0.9279022663831711,
      "num_tokens": 1184196.0,
      "step": 134
    },
    {
      "entropy": 2.564552128314972,
      "epoch": 0.48868778280542985,
      "grad_norm": 0.6785959005355835,
      "learning_rate": 0.0005079255917673863,
      "loss": 0.2031,
      "mean_token_accuracy": 0.9463823586702347,
      "num_tokens": 1192982.0,
      "step": 135
    },
    {
      "entropy": 2.673682928085327,
      "epoch": 0.49230769230769234,
      "grad_norm": 1.4760410785675049,
      "learning_rate": 0.0005078521243174371,
      "loss": 0.4791,
      "mean_token_accuracy": 0.8969505727291107,
      "num_tokens": 1201454.0,
      "step": 136
    },
    {
      "entropy": 2.6232714653015137,
      "epoch": 0.49592760180995477,
      "grad_norm": 0.7845668792724609,
      "learning_rate": 0.0005077777954441157,
      "loss": 0.2472,
      "mean_token_accuracy": 0.9404618591070175,
      "num_tokens": 1210182.0,
      "step": 137
    },
    {
      "entropy": 2.5614060163497925,
      "epoch": 0.4995475113122172,
      "grad_norm": 0.725419819355011,
      "learning_rate": 0.0005077026054297322,
      "loss": 0.3643,
      "mean_token_accuracy": 0.9193316847085953,
      "num_tokens": 1219487.0,
      "step": 138
    },
    {
      "entropy": 2.5907246470451355,
      "epoch": 0.5031674208144796,
      "grad_norm": 0.7741782665252686,
      "learning_rate": 0.0005076265545598682,
      "loss": 0.276,
      "mean_token_accuracy": 0.9447730481624603,
      "num_tokens": 1228066.0,
      "step": 139
    },
    {
      "entropy": 2.531104028224945,
      "epoch": 0.5067873303167421,
      "grad_norm": 0.680992603302002,
      "learning_rate": 0.0005075496431233745,
      "loss": 0.2004,
      "mean_token_accuracy": 0.9470729678869247,
      "num_tokens": 1236980.0,
      "step": 140
    },
    {
      "entropy": 2.590231478214264,
      "epoch": 0.5104072398190045,
      "grad_norm": 0.8260406255722046,
      "learning_rate": 0.0005074718714123704,
      "loss": 0.2756,
      "mean_token_accuracy": 0.9301882535219193,
      "num_tokens": 1245565.0,
      "step": 141
    },
    {
      "entropy": 2.4858668446540833,
      "epoch": 0.5140271493212669,
      "grad_norm": 0.8085922598838806,
      "learning_rate": 0.0005073932397222429,
      "loss": 0.2314,
      "mean_token_accuracy": 0.9449103325605392,
      "num_tokens": 1254366.0,
      "step": 142
    },
    {
      "entropy": 2.5374304056167603,
      "epoch": 0.5176470588235295,
      "grad_norm": 0.7858129143714905,
      "learning_rate": 0.0005073137483516452,
      "loss": 0.1622,
      "mean_token_accuracy": 0.9510673582553864,
      "num_tokens": 1263197.0,
      "step": 143
    },
    {
      "entropy": 2.608425199985504,
      "epoch": 0.5212669683257919,
      "grad_norm": 1.2698506116867065,
      "learning_rate": 0.0005072333976024957,
      "loss": 0.1729,
      "mean_token_accuracy": 0.9509973376989365,
      "num_tokens": 1271725.0,
      "step": 144
    },
    {
      "entropy": 2.437038242816925,
      "epoch": 0.5248868778280543,
      "grad_norm": 1.0788538455963135,
      "learning_rate": 0.0005071521877799765,
      "loss": 0.3344,
      "mean_token_accuracy": 0.9166721999645233,
      "num_tokens": 1280963.0,
      "step": 145
    },
    {
      "entropy": 2.589951515197754,
      "epoch": 0.5285067873303168,
      "grad_norm": 0.9228294491767883,
      "learning_rate": 0.0005070701191925332,
      "loss": 0.3095,
      "mean_token_accuracy": 0.9239777624607086,
      "num_tokens": 1289683.0,
      "step": 146
    },
    {
      "entropy": 2.575794994831085,
      "epoch": 0.5321266968325792,
      "grad_norm": 1.359767198562622,
      "learning_rate": 0.0005069871921518726,
      "loss": 0.2447,
      "mean_token_accuracy": 0.9374738186597824,
      "num_tokens": 1298397.0,
      "step": 147
    },
    {
      "entropy": 2.5628358721733093,
      "epoch": 0.5357466063348416,
      "grad_norm": 0.9870713353157043,
      "learning_rate": 0.000506903406972962,
      "loss": 0.4824,
      "mean_token_accuracy": 0.9027767181396484,
      "num_tokens": 1307191.0,
      "step": 148
    },
    {
      "entropy": 2.5513240098953247,
      "epoch": 0.539366515837104,
      "grad_norm": 0.7921387553215027,
      "learning_rate": 0.0005068187639740286,
      "loss": 0.3278,
      "mean_token_accuracy": 0.9161934554576874,
      "num_tokens": 1315878.0,
      "step": 149
    },
    {
      "entropy": 2.526439070701599,
      "epoch": 0.5429864253393665,
      "grad_norm": 0.6320391297340393,
      "learning_rate": 0.000506733263476557,
      "loss": 0.1701,
      "mean_token_accuracy": 0.9575318098068237,
      "num_tokens": 1324786.0,
      "step": 150
    },
    {
      "entropy": 2.4837265014648438,
      "epoch": 0.5466063348416289,
      "grad_norm": 0.5369354486465454,
      "learning_rate": 0.000506646905805289,
      "loss": 0.1328,
      "mean_token_accuracy": 0.9636050164699554,
      "num_tokens": 1333766.0,
      "step": 151
    },
    {
      "entropy": 2.5264737010002136,
      "epoch": 0.5502262443438914,
      "grad_norm": 0.7346852421760559,
      "learning_rate": 0.0005065596912882222,
      "loss": 0.2012,
      "mean_token_accuracy": 0.9448132663965225,
      "num_tokens": 1343004.0,
      "step": 152
    },
    {
      "entropy": 2.569309651851654,
      "epoch": 0.5538461538461539,
      "grad_norm": 0.9926508069038391,
      "learning_rate": 0.0005064716202566082,
      "loss": 0.2831,
      "mean_token_accuracy": 0.9332023113965988,
      "num_tokens": 1351561.0,
      "step": 153
    },
    {
      "entropy": 2.3148274421691895,
      "epoch": 0.5574660633484163,
      "grad_norm": 0.6301954984664917,
      "learning_rate": 0.0005063826930449523,
      "loss": 0.3622,
      "mean_token_accuracy": 0.9349419325590134,
      "num_tokens": 1360997.0,
      "step": 154
    },
    {
      "entropy": 2.497675657272339,
      "epoch": 0.5610859728506787,
      "grad_norm": 0.8846175670623779,
      "learning_rate": 0.000506292909991011,
      "loss": 0.2314,
      "mean_token_accuracy": 0.9468862265348434,
      "num_tokens": 1369600.0,
      "step": 155
    },
    {
      "entropy": 2.313987612724304,
      "epoch": 0.5647058823529412,
      "grad_norm": 0.5701894164085388,
      "learning_rate": 0.0005062022714357922,
      "loss": 0.2154,
      "mean_token_accuracy": 0.945093959569931,
      "num_tokens": 1379125.0,
      "step": 156
    },
    {
      "entropy": 2.4019755125045776,
      "epoch": 0.5683257918552036,
      "grad_norm": 0.8769335746765137,
      "learning_rate": 0.0005061107777235524,
      "loss": 0.3565,
      "mean_token_accuracy": 0.9133864492177963,
      "num_tokens": 1388111.0,
      "step": 157
    },
    {
      "entropy": 2.3127577900886536,
      "epoch": 0.571945701357466,
      "grad_norm": 1.1026453971862793,
      "learning_rate": 0.0005060184292017965,
      "loss": 0.2897,
      "mean_token_accuracy": 0.899736076593399,
      "num_tokens": 1397528.0,
      "step": 158
    },
    {
      "entropy": 2.2682697772979736,
      "epoch": 0.5755656108597285,
      "grad_norm": 0.5426591038703918,
      "learning_rate": 0.000505925226221276,
      "loss": 0.167,
      "mean_token_accuracy": 0.9609879851341248,
      "num_tokens": 1406809.0,
      "step": 159
    },
    {
      "entropy": 2.4639336466789246,
      "epoch": 0.579185520361991,
      "grad_norm": 0.6552363038063049,
      "learning_rate": 0.0005058311691359875,
      "loss": 0.2511,
      "mean_token_accuracy": 0.9355164766311646,
      "num_tokens": 1415498.0,
      "step": 160
    },
    {
      "entropy": 2.467900663614273,
      "epoch": 0.5828054298642534,
      "grad_norm": 0.7168154120445251,
      "learning_rate": 0.000505736258303172,
      "loss": 0.234,
      "mean_token_accuracy": 0.9450509995222092,
      "num_tokens": 1424524.0,
      "step": 161
    },
    {
      "entropy": 2.3683157563209534,
      "epoch": 0.5864253393665159,
      "grad_norm": 0.6433501839637756,
      "learning_rate": 0.0005056404940833128,
      "loss": 0.3441,
      "mean_token_accuracy": 0.9261108189821243,
      "num_tokens": 1434194.0,
      "step": 162
    },
    {
      "entropy": 2.4686295986175537,
      "epoch": 0.5900452488687783,
      "grad_norm": 0.9615177512168884,
      "learning_rate": 0.0005055438768401348,
      "loss": 0.1492,
      "mean_token_accuracy": 0.966903567314148,
      "num_tokens": 1442972.0,
      "step": 163
    },
    {
      "entropy": 2.5551892518997192,
      "epoch": 0.5936651583710407,
      "grad_norm": 0.4957484006881714,
      "learning_rate": 0.0005054464069406023,
      "loss": 0.1242,
      "mean_token_accuracy": 0.969713419675827,
      "num_tokens": 1451324.0,
      "step": 164
    },
    {
      "entropy": 2.554121434688568,
      "epoch": 0.5972850678733032,
      "grad_norm": 0.7399498224258423,
      "learning_rate": 0.0005053480847549187,
      "loss": 0.206,
      "mean_token_accuracy": 0.9498797357082367,
      "num_tokens": 1459698.0,
      "step": 165
    },
    {
      "entropy": 2.5181015729904175,
      "epoch": 0.6009049773755656,
      "grad_norm": 0.7433251142501831,
      "learning_rate": 0.0005052489106565241,
      "loss": 0.2883,
      "mean_token_accuracy": 0.9419967085123062,
      "num_tokens": 1468460.0,
      "step": 166
    },
    {
      "entropy": 2.3073930144309998,
      "epoch": 0.604524886877828,
      "grad_norm": 0.5920398831367493,
      "learning_rate": 0.0005051488850220941,
      "loss": 0.197,
      "mean_token_accuracy": 0.952111005783081,
      "num_tokens": 1477579.0,
      "step": 167
    },
    {
      "entropy": 2.532376289367676,
      "epoch": 0.6081447963800904,
      "grad_norm": 0.7033098936080933,
      "learning_rate": 0.0005050480082315392,
      "loss": 0.2122,
      "mean_token_accuracy": 0.9488633275032043,
      "num_tokens": 1486307.0,
      "step": 168
    },
    {
      "entropy": 2.397290349006653,
      "epoch": 0.611764705882353,
      "grad_norm": 0.8026869893074036,
      "learning_rate": 0.0005049462806680021,
      "loss": 0.2541,
      "mean_token_accuracy": 0.9427233040332794,
      "num_tokens": 1495152.0,
      "step": 169
    },
    {
      "entropy": 2.464823842048645,
      "epoch": 0.6153846153846154,
      "grad_norm": 0.6508225798606873,
      "learning_rate": 0.0005048437027178571,
      "loss": 0.2639,
      "mean_token_accuracy": 0.9391255974769592,
      "num_tokens": 1503903.0,
      "step": 170
    },
    {
      "entropy": 2.520734131336212,
      "epoch": 0.6190045248868778,
      "grad_norm": 0.8373616337776184,
      "learning_rate": 0.0005047402747707084,
      "loss": 0.3078,
      "mean_token_accuracy": 0.9302930980920792,
      "num_tokens": 1512588.0,
      "step": 171
    },
    {
      "entropy": 2.388108015060425,
      "epoch": 0.6226244343891403,
      "grad_norm": 0.6334089636802673,
      "learning_rate": 0.0005046359972193884,
      "loss": 0.1372,
      "mean_token_accuracy": 0.9666119515895844,
      "num_tokens": 1522011.0,
      "step": 172
    },
    {
      "entropy": 2.537126660346985,
      "epoch": 0.6262443438914027,
      "grad_norm": 0.7665116190910339,
      "learning_rate": 0.0005045308704599566,
      "loss": 0.2603,
      "mean_token_accuracy": 0.9350012242794037,
      "num_tokens": 1530767.0,
      "step": 173
    },
    {
      "entropy": 2.567205488681793,
      "epoch": 0.6298642533936651,
      "grad_norm": 0.8043875098228455,
      "learning_rate": 0.0005044248948916977,
      "loss": 0.2497,
      "mean_token_accuracy": 0.9400482773780823,
      "num_tokens": 1539971.0,
      "step": 174
    },
    {
      "entropy": 2.585887610912323,
      "epoch": 0.6334841628959276,
      "grad_norm": 0.5282150506973267,
      "learning_rate": 0.0005043180709171206,
      "loss": 0.1126,
      "mean_token_accuracy": 0.9680279046297073,
      "num_tokens": 1548971.0,
      "step": 175
    },
    {
      "entropy": 2.4289392232894897,
      "epoch": 0.63710407239819,
      "grad_norm": 0.6838382482528687,
      "learning_rate": 0.0005042103989419563,
      "loss": 0.2076,
      "mean_token_accuracy": 0.9468046277761459,
      "num_tokens": 1558403.0,
      "step": 176
    },
    {
      "entropy": 2.6080575585365295,
      "epoch": 0.6407239819004525,
      "grad_norm": 0.9058650732040405,
      "learning_rate": 0.0005041018793751566,
      "loss": 0.1781,
      "mean_token_accuracy": 0.9432647377252579,
      "num_tokens": 1567209.0,
      "step": 177
    },
    {
      "entropy": 2.5212480425834656,
      "epoch": 0.644343891402715,
      "grad_norm": 0.796381950378418,
      "learning_rate": 0.0005039925126288929,
      "loss": 0.2286,
      "mean_token_accuracy": 0.9305787235498428,
      "num_tokens": 1576255.0,
      "step": 178
    },
    {
      "entropy": 2.588195264339447,
      "epoch": 0.6479638009049774,
      "grad_norm": 0.6489388942718506,
      "learning_rate": 0.0005038822991185536,
      "loss": 0.1717,
      "mean_token_accuracy": 0.9572225511074066,
      "num_tokens": 1585335.0,
      "step": 179
    },
    {
      "entropy": 2.609215259552002,
      "epoch": 0.6515837104072398,
      "grad_norm": 0.8551130294799805,
      "learning_rate": 0.0005037712392627441,
      "loss": 0.2358,
      "mean_token_accuracy": 0.9529621452093124,
      "num_tokens": 1594354.0,
      "step": 180
    },
    {
      "entropy": 2.4199504256248474,
      "epoch": 0.6552036199095023,
      "grad_norm": 0.5775637030601501,
      "learning_rate": 0.0005036593334832836,
      "loss": 0.2402,
      "mean_token_accuracy": 0.9437069743871689,
      "num_tokens": 1603750.0,
      "step": 181
    },
    {
      "entropy": 2.516424596309662,
      "epoch": 0.6588235294117647,
      "grad_norm": 0.6967942118644714,
      "learning_rate": 0.0005035465822052047,
      "loss": 0.1624,
      "mean_token_accuracy": 0.9518167823553085,
      "num_tokens": 1612474.0,
      "step": 182
    },
    {
      "entropy": 2.463354170322418,
      "epoch": 0.6624434389140271,
      "grad_norm": 0.49672600626945496,
      "learning_rate": 0.000503432985856751,
      "loss": 0.1654,
      "mean_token_accuracy": 0.9564716964960098,
      "num_tokens": 1621563.0,
      "step": 183
    },
    {
      "entropy": 2.4456416964530945,
      "epoch": 0.6660633484162896,
      "grad_norm": 0.6207183003425598,
      "learning_rate": 0.000503318544869376,
      "loss": 0.1918,
      "mean_token_accuracy": 0.9476529806852341,
      "num_tokens": 1630801.0,
      "step": 184
    },
    {
      "entropy": 2.641440451145172,
      "epoch": 0.669683257918552,
      "grad_norm": 1.220821499824524,
      "learning_rate": 0.000503203259677741,
      "loss": 0.4019,
      "mean_token_accuracy": 0.9172120243310928,
      "num_tokens": 1639522.0,
      "step": 185
    },
    {
      "entropy": 2.6447275280952454,
      "epoch": 0.6733031674208145,
      "grad_norm": 0.7546490430831909,
      "learning_rate": 0.000503087130719714,
      "loss": 0.2484,
      "mean_token_accuracy": 0.9387800246477127,
      "num_tokens": 1647964.0,
      "step": 186
    },
    {
      "entropy": 2.4657886028289795,
      "epoch": 0.676923076923077,
      "grad_norm": 0.7679230570793152,
      "learning_rate": 0.0005029701584363675,
      "loss": 0.2659,
      "mean_token_accuracy": 0.930300235748291,
      "num_tokens": 1657181.0,
      "step": 187
    },
    {
      "entropy": 2.37973552942276,
      "epoch": 0.6805429864253394,
      "grad_norm": 0.7473414540290833,
      "learning_rate": 0.0005028523432719772,
      "loss": 0.32,
      "mean_token_accuracy": 0.9233052879571915,
      "num_tokens": 1666477.0,
      "step": 188
    },
    {
      "entropy": 2.5238219499588013,
      "epoch": 0.6841628959276018,
      "grad_norm": 0.5573673248291016,
      "learning_rate": 0.0005027336856740201,
      "loss": 0.1846,
      "mean_token_accuracy": 0.9445535093545914,
      "num_tokens": 1675002.0,
      "step": 189
    },
    {
      "entropy": 2.456815242767334,
      "epoch": 0.6877828054298643,
      "grad_norm": 0.47237634658813477,
      "learning_rate": 0.0005026141860931728,
      "loss": 0.1065,
      "mean_token_accuracy": 0.964375838637352,
      "num_tokens": 1683623.0,
      "step": 190
    },
    {
      "entropy": 2.548456132411957,
      "epoch": 0.6914027149321267,
      "grad_norm": 0.7699162364006042,
      "learning_rate": 0.00050249384498331,
      "loss": 0.1985,
      "mean_token_accuracy": 0.9438774734735489,
      "num_tokens": 1691718.0,
      "step": 191
    },
    {
      "entropy": 2.4514941573143005,
      "epoch": 0.6950226244343891,
      "grad_norm": 1.4113538265228271,
      "learning_rate": 0.0005023726628015027,
      "loss": 0.4541,
      "mean_token_accuracy": 0.9207872897386551,
      "num_tokens": 1699824.0,
      "step": 192
    },
    {
      "entropy": 2.2560824751853943,
      "epoch": 0.6986425339366515,
      "grad_norm": 0.6007948517799377,
      "learning_rate": 0.0005022506400080161,
      "loss": 0.1871,
      "mean_token_accuracy": 0.9502484053373337,
      "num_tokens": 1708722.0,
      "step": 193
    },
    {
      "entropy": 2.1833614110946655,
      "epoch": 0.702262443438914,
      "grad_norm": 0.7005489468574524,
      "learning_rate": 0.0005021277770663082,
      "loss": 0.2222,
      "mean_token_accuracy": 0.9386974722146988,
      "num_tokens": 1717592.0,
      "step": 194
    },
    {
      "entropy": 2.2031923830509186,
      "epoch": 0.7058823529411765,
      "grad_norm": 0.5830584764480591,
      "learning_rate": 0.0005020040744430284,
      "loss": 0.1106,
      "mean_token_accuracy": 0.9719562232494354,
      "num_tokens": 1726149.0,
      "step": 195
    },
    {
      "entropy": 2.199785351753235,
      "epoch": 0.709502262443439,
      "grad_norm": 0.7465847134590149,
      "learning_rate": 0.0005018795326080149,
      "loss": 0.1935,
      "mean_token_accuracy": 0.9497270882129669,
      "num_tokens": 1734541.0,
      "step": 196
    },
    {
      "entropy": 2.1103186309337616,
      "epoch": 0.7131221719457014,
      "grad_norm": 1.0782264471054077,
      "learning_rate": 0.0005017541520342934,
      "loss": 0.2895,
      "mean_token_accuracy": 0.9274258464574814,
      "num_tokens": 1743722.0,
      "step": 197
    },
    {
      "entropy": 2.2248528599739075,
      "epoch": 0.7167420814479638,
      "grad_norm": 0.6409780979156494,
      "learning_rate": 0.0005016279331980754,
      "loss": 0.1425,
      "mean_token_accuracy": 0.96550352871418,
      "num_tokens": 1752156.0,
      "step": 198
    },
    {
      "entropy": 2.19924658536911,
      "epoch": 0.7203619909502262,
      "grad_norm": 0.7019934058189392,
      "learning_rate": 0.0005015008765787561,
      "loss": 0.1969,
      "mean_token_accuracy": 0.9429282248020172,
      "num_tokens": 1760978.0,
      "step": 199
    },
    {
      "entropy": 2.297484815120697,
      "epoch": 0.7239819004524887,
      "grad_norm": 0.7826490998268127,
      "learning_rate": 0.0005013729826589127,
      "loss": 0.2399,
      "mean_token_accuracy": 0.9416657984256744,
      "num_tokens": 1769533.0,
      "step": 200
    },
    {
      "entropy": 2.2471498548984528,
      "epoch": 0.7276018099547511,
      "grad_norm": 0.621566891670227,
      "learning_rate": 0.0005012442519243027,
      "loss": 0.1876,
      "mean_token_accuracy": 0.9460793286561966,
      "num_tokens": 1778286.0,
      "step": 201
    },
    {
      "entropy": 2.2212815284729004,
      "epoch": 0.7312217194570135,
      "grad_norm": 0.622283935546875,
      "learning_rate": 0.0005011146848638616,
      "loss": 0.1617,
      "mean_token_accuracy": 0.9482609927654266,
      "num_tokens": 1787392.0,
      "step": 202
    },
    {
      "entropy": 2.308752655982971,
      "epoch": 0.7348416289592761,
      "grad_norm": 0.7263973355293274,
      "learning_rate": 0.0005009842819697018,
      "loss": 0.2043,
      "mean_token_accuracy": 0.9378403723239899,
      "num_tokens": 1796133.0,
      "step": 203
    },
    {
      "entropy": 2.3376497626304626,
      "epoch": 0.7384615384615385,
      "grad_norm": 0.5493630766868591,
      "learning_rate": 0.0005008530437371101,
      "loss": 0.1145,
      "mean_token_accuracy": 0.970586434006691,
      "num_tokens": 1804769.0,
      "step": 204
    },
    {
      "entropy": 2.373005509376526,
      "epoch": 0.7420814479638009,
      "grad_norm": 0.6313483119010925,
      "learning_rate": 0.0005007209706645461,
      "loss": 0.2183,
      "mean_token_accuracy": 0.9472708404064178,
      "num_tokens": 1813364.0,
      "step": 205
    },
    {
      "entropy": 2.468949854373932,
      "epoch": 0.7457013574660634,
      "grad_norm": 1.0125588178634644,
      "learning_rate": 0.00050058806325364,
      "loss": 0.2225,
      "mean_token_accuracy": 0.9351322948932648,
      "num_tokens": 1822149.0,
      "step": 206
    },
    {
      "entropy": 2.2420623898506165,
      "epoch": 0.7493212669683258,
      "grad_norm": 0.913761556148529,
      "learning_rate": 0.0005004543220091911,
| "loss": 0.2386, | |
| "mean_token_accuracy": 0.9453927427530289, | |
| "num_tokens": 1831533.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 2.2966006994247437, | |
| "epoch": 0.7529411764705882, | |
| "grad_norm": 0.7386876940727234, | |
| "learning_rate": 0.0005003197474391658, | |
| "loss": 0.1768, | |
| "mean_token_accuracy": 0.949826255440712, | |
| "num_tokens": 1840157.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 2.306001305580139, | |
| "epoch": 0.7565610859728507, | |
| "grad_norm": 0.8900741338729858, | |
| "learning_rate": 0.0005001843400546955, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.9241485595703125, | |
| "num_tokens": 1848898.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 2.117514967918396, | |
| "epoch": 0.7601809954751131, | |
| "grad_norm": 0.644622802734375, | |
| "learning_rate": 0.0005000481003700746, | |
| "loss": 0.2714, | |
| "mean_token_accuracy": 0.9299416691064835, | |
| "num_tokens": 1858330.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 2.3768392205238342, | |
| "epoch": 0.7638009049773755, | |
| "grad_norm": 0.9724471569061279, | |
| "learning_rate": 0.0004999110289027587, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9550061523914337, | |
| "num_tokens": 1866806.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 2.090679556131363, | |
| "epoch": 0.7674208144796381, | |
| "grad_norm": 0.5419518351554871, | |
| "learning_rate": 0.0004997731261733628, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.9619670957326889, | |
| "num_tokens": 1875937.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 2.099909245967865, | |
| "epoch": 0.7710407239819005, | |
| "grad_norm": 0.6858121752738953, | |
| "learning_rate": 0.0004996343927056592, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9528832882642746, | |
| "num_tokens": 1885145.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 2.130059242248535, | |
| "epoch": 0.7746606334841629, | |
| "grad_norm": 0.7691065073013306, | |
| "learning_rate": 0.000499494829026575, | |
| "loss": 0.348, | |
| "mean_token_accuracy": 0.9162366837263107, | |
| "num_tokens": 1894255.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 2.191373586654663, | |
| "epoch": 0.7782805429864253, | |
| "grad_norm": 0.7427324652671814, | |
| "learning_rate": 0.000499354435666191, | |
| "loss": 0.3373, | |
| "mean_token_accuracy": 0.9311849176883698, | |
| "num_tokens": 1902981.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 2.1425398886203766, | |
| "epoch": 0.7819004524886878, | |
| "grad_norm": 0.6410383582115173, | |
| "learning_rate": 0.0004992132131577392, | |
| "loss": 0.2079, | |
| "mean_token_accuracy": 0.949742391705513, | |
| "num_tokens": 1912253.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 2.1396586298942566, | |
| "epoch": 0.7855203619909502, | |
| "grad_norm": 0.5689850449562073, | |
| "learning_rate": 0.0004990711620376003, | |
| "loss": 0.1999, | |
| "mean_token_accuracy": 0.946034774184227, | |
| "num_tokens": 1921409.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 2.2237865328788757, | |
| "epoch": 0.7891402714932126, | |
| "grad_norm": 0.6408923864364624, | |
| "learning_rate": 0.0004989282828453029, | |
| "loss": 0.2452, | |
| "mean_token_accuracy": 0.9510752111673355, | |
| "num_tokens": 1930397.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 2.234771251678467, | |
| "epoch": 0.7927601809954751, | |
| "grad_norm": 0.751447856426239, | |
| "learning_rate": 0.0004987845761235203, | |
| "loss": 0.3057, | |
| "mean_token_accuracy": 0.9217256307601929, | |
| "num_tokens": 1939172.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 2.2653815746307373, | |
| "epoch": 0.7963800904977375, | |
| "grad_norm": 0.751455545425415, | |
| "learning_rate": 0.0004986400424180688, | |
| "loss": 0.3245, | |
| "mean_token_accuracy": 0.9256318956613541, | |
| "num_tokens": 1947979.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.3123483061790466, | |
| "epoch": 0.8, | |
| "grad_norm": 0.5939492583274841, | |
| "learning_rate": 0.0004984946822779061, | |
| "loss": 0.2429, | |
| "mean_token_accuracy": 0.9333402067422867, | |
| "num_tokens": 1956814.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 2.3289234042167664, | |
| "epoch": 0.8036199095022625, | |
| "grad_norm": 0.5591994524002075, | |
| "learning_rate": 0.0004983484962551284, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.96376833319664, | |
| "num_tokens": 1965641.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 2.4314023852348328, | |
| "epoch": 0.8072398190045249, | |
| "grad_norm": 0.5805783271789551, | |
| "learning_rate": 0.0004982014849049687, | |
| "loss": 0.2049, | |
| "mean_token_accuracy": 0.9586948156356812, | |
| "num_tokens": 1974180.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 2.3639765977859497, | |
| "epoch": 0.8108597285067873, | |
| "grad_norm": 0.6924490332603455, | |
| "learning_rate": 0.0004980536487857951, | |
| "loss": 0.2137, | |
| "mean_token_accuracy": 0.9441423565149307, | |
| "num_tokens": 1982744.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 2.3361759781837463, | |
| "epoch": 0.8144796380090498, | |
| "grad_norm": 0.4579620361328125, | |
| "learning_rate": 0.0004979049884591077, | |
| "loss": 0.1041, | |
| "mean_token_accuracy": 0.9753208309412003, | |
| "num_tokens": 1991583.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 2.286989688873291, | |
| "epoch": 0.8180995475113122, | |
| "grad_norm": 0.6489312052726746, | |
| "learning_rate": 0.0004977555044895377, | |
| "loss": 0.2131, | |
| "mean_token_accuracy": 0.9520440250635147, | |
| "num_tokens": 2000193.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 2.288672834634781, | |
| "epoch": 0.8217194570135746, | |
| "grad_norm": 0.7738961577415466, | |
| "learning_rate": 0.0004976051974448441, | |
| "loss": 0.325, | |
| "mean_token_accuracy": 0.9060750156641006, | |
| "num_tokens": 2009233.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 2.288076102733612, | |
| "epoch": 0.8253393665158371, | |
| "grad_norm": 0.7042292356491089, | |
| "learning_rate": 0.0004974540678959123, | |
| "loss": 0.2206, | |
| "mean_token_accuracy": 0.94980289041996, | |
| "num_tokens": 2018417.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 2.217707335948944, | |
| "epoch": 0.8289592760180996, | |
| "grad_norm": 0.6834208369255066, | |
| "learning_rate": 0.0004973021164167515, | |
| "loss": 0.2907, | |
| "mean_token_accuracy": 0.951058641076088, | |
| "num_tokens": 2027822.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 2.1610691249370575, | |
| "epoch": 0.832579185520362, | |
| "grad_norm": 0.665044903755188, | |
| "learning_rate": 0.0004971493435844928, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.9506549835205078, | |
| "num_tokens": 2036983.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 2.321135401725769, | |
| "epoch": 0.8361990950226245, | |
| "grad_norm": 0.8208273649215698, | |
| "learning_rate": 0.0004969957499793869, | |
| "loss": 0.2399, | |
| "mean_token_accuracy": 0.9435176253318787, | |
| "num_tokens": 2045574.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 2.1943611800670624, | |
| "epoch": 0.8398190045248869, | |
| "grad_norm": 0.6293840408325195, | |
| "learning_rate": 0.0004968413361848019, | |
| "loss": 0.1784, | |
| "mean_token_accuracy": 0.9559669345617294, | |
| "num_tokens": 2054336.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 2.2722273468971252, | |
| "epoch": 0.8434389140271493, | |
| "grad_norm": 0.6535817980766296, | |
| "learning_rate": 0.0004966861027872211, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9532535970211029, | |
| "num_tokens": 2063225.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 2.3278334736824036, | |
| "epoch": 0.8470588235294118, | |
| "grad_norm": 1.1610206365585327, | |
| "learning_rate": 0.0004965300503762406, | |
| "loss": 0.1588, | |
| "mean_token_accuracy": 0.9641145765781403, | |
| "num_tokens": 2071738.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 2.202972888946533, | |
| "epoch": 0.8506787330316742, | |
| "grad_norm": 0.4811885356903076, | |
| "learning_rate": 0.0004963731795445675, | |
| "loss": 0.0813, | |
| "mean_token_accuracy": 0.9766911715269089, | |
| "num_tokens": 2080375.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 2.2433705925941467, | |
| "epoch": 0.8542986425339366, | |
| "grad_norm": 0.8113318681716919, | |
| "learning_rate": 0.0004962154908880171, | |
| "loss": 0.2965, | |
| "mean_token_accuracy": 0.9290606826543808, | |
| "num_tokens": 2089522.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 2.2168884873390198, | |
| "epoch": 0.857918552036199, | |
| "grad_norm": 0.6128959655761719, | |
| "learning_rate": 0.0004960569850055111, | |
| "loss": 0.1724, | |
| "mean_token_accuracy": 0.9603384286165237, | |
| "num_tokens": 2098162.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 2.2738255858421326, | |
| "epoch": 0.8615384615384616, | |
| "grad_norm": 0.8557195663452148, | |
| "learning_rate": 0.0004958976624990749, | |
| "loss": 0.2596, | |
| "mean_token_accuracy": 0.9487071484327316, | |
| "num_tokens": 2106984.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 2.2031425833702087, | |
| "epoch": 0.865158371040724, | |
| "grad_norm": 0.6621816158294678, | |
| "learning_rate": 0.0004957375239738359, | |
| "loss": 0.232, | |
| "mean_token_accuracy": 0.9525040090084076, | |
| "num_tokens": 2116040.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 2.374737858772278, | |
| "epoch": 0.8687782805429864, | |
| "grad_norm": 0.8481062054634094, | |
| "learning_rate": 0.0004955765700380204, | |
| "loss": 0.2516, | |
| "mean_token_accuracy": 0.9396061599254608, | |
| "num_tokens": 2124862.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 2.266704559326172, | |
| "epoch": 0.8723981900452489, | |
| "grad_norm": 0.6284282803535461, | |
| "learning_rate": 0.0004954148013029521, | |
| "loss": 0.3244, | |
| "mean_token_accuracy": 0.9381244331598282, | |
| "num_tokens": 2134018.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 2.3935859203338623, | |
| "epoch": 0.8760180995475113, | |
| "grad_norm": 1.1564176082611084, | |
| "learning_rate": 0.0004952522183830493, | |
| "loss": 0.2706, | |
| "mean_token_accuracy": 0.9297053664922714, | |
| "num_tokens": 2142745.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 2.281618118286133, | |
| "epoch": 0.8796380090497737, | |
| "grad_norm": 0.5324040055274963, | |
| "learning_rate": 0.0004950888218958225, | |
| "loss": 0.1573, | |
| "mean_token_accuracy": 0.9568462073802948, | |
| "num_tokens": 2151607.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 2.230749189853668, | |
| "epoch": 0.8832579185520362, | |
| "grad_norm": 0.680780291557312, | |
| "learning_rate": 0.0004949246124618726, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9479999989271164, | |
| "num_tokens": 2160904.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 2.21382600069046, | |
| "epoch": 0.8868778280542986, | |
| "grad_norm": 0.6321626305580139, | |
| "learning_rate": 0.0004947595907048877, | |
| "loss": 0.2444, | |
| "mean_token_accuracy": 0.9376699328422546, | |
| "num_tokens": 2170021.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 2.3659472465515137, | |
| "epoch": 0.890497737556561, | |
| "grad_norm": 0.9778954982757568, | |
| "learning_rate": 0.0004945937572516417, | |
| "loss": 0.3783, | |
| "mean_token_accuracy": 0.9104805737733841, | |
| "num_tokens": 2178995.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 2.3233078718185425, | |
| "epoch": 0.8941176470588236, | |
| "grad_norm": 0.53229820728302, | |
| "learning_rate": 0.0004944271127319909, | |
| "loss": 0.0759, | |
| "mean_token_accuracy": 0.9791453778743744, | |
| "num_tokens": 2187823.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 2.2469444274902344, | |
| "epoch": 0.897737556561086, | |
| "grad_norm": 0.6367197632789612, | |
| "learning_rate": 0.0004942596577788728, | |
| "loss": 0.2677, | |
| "mean_token_accuracy": 0.9392691254615784, | |
| "num_tokens": 2196923.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 2.4508965611457825, | |
| "epoch": 0.9013574660633484, | |
| "grad_norm": 0.6042234897613525, | |
| "learning_rate": 0.0004940913930283024, | |
| "loss": 0.1102, | |
| "mean_token_accuracy": 0.9762090593576431, | |
| "num_tokens": 2205400.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 2.365670144557953, | |
| "epoch": 0.9049773755656109, | |
| "grad_norm": 0.6490639448165894, | |
| "learning_rate": 0.0004939223191193707, | |
| "loss": 0.1532, | |
| "mean_token_accuracy": 0.9489114433526993, | |
| "num_tokens": 2214201.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.4013625383377075, | |
| "epoch": 0.9085972850678733, | |
| "grad_norm": 0.5969854593276978, | |
| "learning_rate": 0.0004937524366942419, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9682519882917404, | |
| "num_tokens": 2222979.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 2.4402357935905457, | |
| "epoch": 0.9122171945701357, | |
| "grad_norm": 0.7559595704078674, | |
| "learning_rate": 0.0004935817463981513, | |
| "loss": 0.1979, | |
| "mean_token_accuracy": 0.9483373910188675, | |
| "num_tokens": 2231169.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 2.4673256874084473, | |
| "epoch": 0.9158371040723982, | |
| "grad_norm": 0.8663308620452881, | |
| "learning_rate": 0.0004934102488794023, | |
| "loss": 0.2453, | |
| "mean_token_accuracy": 0.9408974200487137, | |
| "num_tokens": 2240099.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 2.426262080669403, | |
| "epoch": 0.9194570135746606, | |
| "grad_norm": 0.7920467257499695, | |
| "learning_rate": 0.0004932379447893643, | |
| "loss": 0.2828, | |
| "mean_token_accuracy": 0.9319239109754562, | |
| "num_tokens": 2249088.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 2.5018852949142456, | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 0.7216617465019226, | |
| "learning_rate": 0.0004930648347824701, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9551804810762405, | |
| "num_tokens": 2257710.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 2.43031644821167, | |
| "epoch": 0.9266968325791856, | |
| "grad_norm": 0.646794319152832, | |
| "learning_rate": 0.0004928909195162138, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9663553237915039, | |
| "num_tokens": 2266883.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 2.5406370759010315, | |
| "epoch": 0.930316742081448, | |
| "grad_norm": 0.5482825040817261, | |
| "learning_rate": 0.0004927161996511474, | |
| "loss": 0.1872, | |
| "mean_token_accuracy": 0.9557004272937775, | |
| "num_tokens": 2275728.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 2.636320471763611, | |
| "epoch": 0.9339366515837104, | |
| "grad_norm": 0.7454632520675659, | |
| "learning_rate": 0.0004925406758508797, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.9578974395990372, | |
| "num_tokens": 2284319.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 2.6067575812339783, | |
| "epoch": 0.9375565610859729, | |
| "grad_norm": 0.8695769309997559, | |
| "learning_rate": 0.000492364348782072, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.9652896523475647, | |
| "num_tokens": 2293035.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 2.5837162137031555, | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.5752995014190674, | |
| "learning_rate": 0.0004921872191144371, | |
| "loss": 0.1398, | |
| "mean_token_accuracy": 0.9553333520889282, | |
| "num_tokens": 2301802.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 2.713033616542816, | |
| "epoch": 0.9447963800904977, | |
| "grad_norm": 0.85626620054245, | |
| "learning_rate": 0.0004920092875207363, | |
| "loss": 0.2207, | |
| "mean_token_accuracy": 0.9468346834182739, | |
| "num_tokens": 2309981.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 2.400112509727478, | |
| "epoch": 0.9484162895927601, | |
| "grad_norm": 0.6766608953475952, | |
| "learning_rate": 0.0004918305546767764, | |
| "loss": 0.1644, | |
| "mean_token_accuracy": 0.9502440094947815, | |
| "num_tokens": 2319212.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 2.503827154636383, | |
| "epoch": 0.9520361990950226, | |
| "grad_norm": 0.789470911026001, | |
| "learning_rate": 0.0004916510212614072, | |
| "loss": 0.2117, | |
| "mean_token_accuracy": 0.9454390555620193, | |
| "num_tokens": 2328234.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 2.669040560722351, | |
| "epoch": 0.9556561085972851, | |
| "grad_norm": 0.9579212069511414, | |
| "learning_rate": 0.0004914706879565197, | |
| "loss": 0.2193, | |
| "mean_token_accuracy": 0.9321542829275131, | |
| "num_tokens": 2336543.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 2.507073998451233, | |
| "epoch": 0.9592760180995475, | |
| "grad_norm": 0.5315744876861572, | |
| "learning_rate": 0.000491289555447043, | |
| "loss": 0.0851, | |
| "mean_token_accuracy": 0.9771326780319214, | |
| "num_tokens": 2345292.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 2.4205283522605896, | |
| "epoch": 0.96289592760181, | |
| "grad_norm": 0.5441373586654663, | |
| "learning_rate": 0.000491107624420941, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9541790336370468, | |
| "num_tokens": 2354242.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 2.3817258477211, | |
| "epoch": 0.9665158371040724, | |
| "grad_norm": 0.5946238040924072, | |
| "learning_rate": 0.0004909248955692111, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.947738841176033, | |
| "num_tokens": 2363183.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 2.5073485374450684, | |
| "epoch": 0.9701357466063348, | |
| "grad_norm": 0.6979324817657471, | |
| "learning_rate": 0.0004907413695858812, | |
| "loss": 0.2099, | |
| "mean_token_accuracy": 0.9423733651638031, | |
| "num_tokens": 2371885.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 2.5705007910728455, | |
| "epoch": 0.9737556561085973, | |
| "grad_norm": 0.8203943967819214, | |
| "learning_rate": 0.0004905570471680057, | |
| "loss": 0.217, | |
| "mean_token_accuracy": 0.9511639326810837, | |
| "num_tokens": 2380316.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 2.2677993774414062, | |
| "epoch": 0.9773755656108597, | |
| "grad_norm": 0.5840432047843933, | |
| "learning_rate": 0.0004903719290156649, | |
| "loss": 0.2364, | |
| "mean_token_accuracy": 0.9407180696725845, | |
| "num_tokens": 2389723.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 2.477886915206909, | |
| "epoch": 0.9809954751131221, | |
| "grad_norm": 0.818929135799408, | |
| "learning_rate": 0.0004901860158319612, | |
| "loss": 0.1707, | |
| "mean_token_accuracy": 0.9579566866159439, | |
| "num_tokens": 2398388.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 2.549662232398987, | |
| "epoch": 0.9846153846153847, | |
| "grad_norm": 0.7804781198501587, | |
| "learning_rate": 0.0004899993083230166, | |
| "loss": 0.2944, | |
| "mean_token_accuracy": 0.9381812512874603, | |
| "num_tokens": 2406929.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 2.4465304017066956, | |
| "epoch": 0.9882352941176471, | |
| "grad_norm": 0.5218799114227295, | |
| "learning_rate": 0.0004898118071979699, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9500218778848648, | |
| "num_tokens": 2415631.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 2.5852283239364624, | |
| "epoch": 0.9918552036199095, | |
| "grad_norm": 0.591163158416748, | |
| "learning_rate": 0.0004896235131689743, | |
| "loss": 0.2005, | |
| "mean_token_accuracy": 0.9455285370349884, | |
| "num_tokens": 2424091.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 2.478701651096344, | |
| "epoch": 0.995475113122172, | |
| "grad_norm": 1.0615383386611938, | |
| "learning_rate": 0.0004894344269511945, | |
| "loss": 0.2864, | |
| "mean_token_accuracy": 0.9306265562772751, | |
| "num_tokens": 2432705.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 2.600062847137451, | |
| "epoch": 0.9990950226244344, | |
| "grad_norm": 0.7011683583259583, | |
| "learning_rate": 0.0004892445492628043, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9547821134328842, | |
| "num_tokens": 2440992.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 2.3411240577697754, | |
| "epoch": 1.0, | |
| "grad_norm": 0.4944029450416565, | |
| "learning_rate": 0.000489053880824983, | |
| "loss": 0.022, | |
| "mean_token_accuracy": 0.9929078221321106, | |
| "num_tokens": 2441725.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_entropy": 2.5467925265552553, | |
| "eval_loss": 0.21274714171886444, | |
| "eval_mean_token_accuracy": 0.9444630068492114, | |
| "eval_num_tokens": 2441725.0, | |
| "eval_runtime": 116.0434, | |
| "eval_samples_per_second": 3.18, | |
| "eval_steps_per_second": 1.06, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 2.609170138835907, | |
| "epoch": 1.0036199095022624, | |
| "grad_norm": 1.0785081386566162, | |
| "learning_rate": 0.0004888624223619136, | |
| "loss": 0.3167, | |
| "mean_token_accuracy": 0.9296800643205643, | |
| "num_tokens": 2450193.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 2.497025430202484, | |
| "epoch": 1.0072398190045249, | |
| "grad_norm": 0.5221985578536987, | |
| "learning_rate": 0.0004886701746007801, | |
| "loss": 0.0854, | |
| "mean_token_accuracy": 0.9753399342298508, | |
| "num_tokens": 2459309.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 2.5487362146377563, | |
| "epoch": 1.0108597285067873, | |
| "grad_norm": 0.5161958336830139, | |
| "learning_rate": 0.0004884771382717638, | |
| "loss": 0.0819, | |
| "mean_token_accuracy": 0.9748431146144867, | |
| "num_tokens": 2467844.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 2.5276209115982056, | |
| "epoch": 1.0144796380090497, | |
| "grad_norm": 0.5731730461120605, | |
| "learning_rate": 0.0004882833141080412, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9567564427852631, | |
| "num_tokens": 2476894.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 2.4442760348320007, | |
| "epoch": 1.0180995475113122, | |
| "grad_norm": 0.7120366096496582, | |
| "learning_rate": 0.0004880887028457813, | |
| "loss": 0.1945, | |
| "mean_token_accuracy": 0.9465379565954208, | |
| "num_tokens": 2485971.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 2.4069360494613647, | |
| "epoch": 1.0217194570135746, | |
| "grad_norm": 0.7468647360801697, | |
| "learning_rate": 0.00048789330522414244, | |
| "loss": 0.2345, | |
| "mean_token_accuracy": 0.9446765780448914, | |
| "num_tokens": 2495043.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 2.468382716178894, | |
| "epoch": 1.025339366515837, | |
| "grad_norm": 0.666231632232666, | |
| "learning_rate": 0.0004876971219852697, | |
| "loss": 0.1779, | |
| "mean_token_accuracy": 0.9534575343132019, | |
| "num_tokens": 2503672.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 2.4362316727638245, | |
| "epoch": 1.0289592760180994, | |
| "grad_norm": 0.8445858955383301, | |
| "learning_rate": 0.000487500153874292, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.953661322593689, | |
| "num_tokens": 2512322.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 2.364333391189575, | |
| "epoch": 1.032579185520362, | |
| "grad_norm": 0.4805246591567993, | |
| "learning_rate": 0.0004873024016393193, | |
| "loss": 0.0778, | |
| "mean_token_accuracy": 0.9824571758508682, | |
| "num_tokens": 2520791.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 2.223461151123047, | |
| "epoch": 1.0361990950226245, | |
| "grad_norm": 0.648465096950531, | |
| "learning_rate": 0.0004871038660314399, | |
| "loss": 0.2593, | |
| "mean_token_accuracy": 0.9419913589954376, | |
| "num_tokens": 2530082.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 2.3313387036323547, | |
| "epoch": 1.039819004524887, | |
| "grad_norm": 0.6912294626235962, | |
| "learning_rate": 0.00048690454780471725, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9561934620141983, | |
| "num_tokens": 2538728.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 2.191806375980377, | |
| "epoch": 1.0434389140271494, | |
| "grad_norm": 0.8620694279670715, | |
| "learning_rate": 0.0004867044477161874, | |
| "loss": 0.1103, | |
| "mean_token_accuracy": 0.968692272901535, | |
| "num_tokens": 2547219.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 2.167125165462494, | |
| "epoch": 1.0470588235294118, | |
| "grad_norm": 0.6192149519920349, | |
| "learning_rate": 0.0004865035665258559, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9643534421920776, | |
| "num_tokens": 2555940.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 2.2750985622406006, | |
| "epoch": 1.0506787330316743, | |
| "grad_norm": 1.7459602355957031, | |
| "learning_rate": 0.0004863019049966953, | |
| "loss": 0.393, | |
| "mean_token_accuracy": 0.9146681725978851, | |
| "num_tokens": 2564362.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 2.236129105091095, | |
| "epoch": 1.0542986425339367, | |
| "grad_norm": 0.6311184167861938, | |
| "learning_rate": 0.0004860994638946416, | |
| "loss": 0.1536, | |
| "mean_token_accuracy": 0.9636097103357315, | |
| "num_tokens": 2573316.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 2.2642418146133423, | |
| "epoch": 1.0579185520361991, | |
| "grad_norm": 0.6023411154747009, | |
| "learning_rate": 0.000485896243988592, | |
| "loss": 0.191, | |
| "mean_token_accuracy": 0.9476015418767929, | |
| "num_tokens": 2581835.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 2.3589024543762207, | |
| "epoch": 1.0615384615384615, | |
| "grad_norm": 0.48049232363700867, | |
| "learning_rate": 0.0004856922460504016, | |
| "loss": 0.1017, | |
| "mean_token_accuracy": 0.9713075459003448, | |
| "num_tokens": 2590317.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 2.4141315817832947, | |
| "epoch": 1.065158371040724, | |
| "grad_norm": 0.8456616997718811, | |
| "learning_rate": 0.0004854874708548806, | |
| "loss": 0.1422, | |
| "mean_token_accuracy": 0.9622762501239777, | |
| "num_tokens": 2598538.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 2.069903999567032, | |
| "epoch": 1.0687782805429864, | |
| "grad_norm": 0.7641116380691528, | |
| "learning_rate": 0.0004852819191797912, | |
| "loss": 0.2185, | |
| "mean_token_accuracy": 0.9464851468801498, | |
| "num_tokens": 2608219.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 2.163217008113861, | |
| "epoch": 1.0723981900452488, | |
| "grad_norm": 0.546085000038147, | |
| "learning_rate": 0.0004850755918058449, | |
| "loss": 0.1035, | |
| "mean_token_accuracy": 0.9708487540483475, | |
| "num_tokens": 2617261.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 2.2678662836551666, | |
| "epoch": 1.0760180995475113, | |
| "grad_norm": 0.8699386119842529, | |
| "learning_rate": 0.0004848684895166994, | |
| "loss": 0.2384, | |
| "mean_token_accuracy": 0.9486480504274368, | |
| "num_tokens": 2626144.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 2.13065105676651, | |
| "epoch": 1.0796380090497737, | |
| "grad_norm": 0.44323107600212097, | |
| "learning_rate": 0.00048466061309895554, | |
| "loss": 0.0818, | |
| "mean_token_accuracy": 0.9722468554973602, | |
| "num_tokens": 2635626.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 2.184772551059723, | |
| "epoch": 1.0832579185520361, | |
| "grad_norm": 0.7928256988525391, | |
| "learning_rate": 0.0004844519633421545, | |
| "loss": 0.2378, | |
| "mean_token_accuracy": 0.9477885961532593, | |
| "num_tokens": 2644674.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 2.1669145822525024, | |
| "epoch": 1.0868778280542986, | |
| "grad_norm": 0.5570158362388611, | |
| "learning_rate": 0.00048424254103877456, | |
| "loss": 0.1434, | |
| "mean_token_accuracy": 0.9587411731481552, | |
| "num_tokens": 2653658.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 2.3057579398155212, | |
| "epoch": 1.090497737556561, | |
| "grad_norm": 0.9084392189979553, | |
| "learning_rate": 0.00048403234698422837, | |
| "loss": 0.3831, | |
| "mean_token_accuracy": 0.8896283358335495, | |
| "num_tokens": 2662350.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 2.1741657853126526, | |
| "epoch": 1.0941176470588236, | |
| "grad_norm": 0.6791238784790039, | |
| "learning_rate": 0.0004838213819768597, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9576362520456314, | |
| "num_tokens": 2671450.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 2.089864045381546, | |
| "epoch": 1.097737556561086, | |
| "grad_norm": 0.5696312189102173, | |
| "learning_rate": 0.0004836096468179406, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.9658148884773254, | |
| "num_tokens": 2680581.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 2.2657605409622192, | |
| "epoch": 1.1013574660633485, | |
| "grad_norm": 1.605503797531128, | |
| "learning_rate": 0.0004833971423116682, | |
| "loss": 0.1027, | |
| "mean_token_accuracy": 0.9762597978115082, | |
| "num_tokens": 2689001.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 2.079287111759186, | |
| "epoch": 1.104977375565611, | |
| "grad_norm": 0.5804780721664429, | |
| "learning_rate": 0.00048318386926516157, | |
| "loss": 0.1137, | |
| "mean_token_accuracy": 0.9633719325065613, | |
| "num_tokens": 2698050.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 2.201345145702362, | |
| "epoch": 1.1085972850678734, | |
| "grad_norm": 0.8606241941452026, | |
| "learning_rate": 0.000482969828488459, | |
| "loss": 0.2124, | |
| "mean_token_accuracy": 0.9472681730985641, | |
| "num_tokens": 2706704.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 2.095236599445343, | |
| "epoch": 1.1122171945701358, | |
| "grad_norm": 0.7078782320022583, | |
| "learning_rate": 0.0004827550207945147, | |
| "loss": 0.1957, | |
| "mean_token_accuracy": 0.9564679116010666, | |
| "num_tokens": 2715745.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 2.186302363872528, | |
| "epoch": 1.1158371040723982, | |
| "grad_norm": 0.7166503667831421, | |
| "learning_rate": 0.0004825394469991956, | |
| "loss": 0.1539, | |
| "mean_token_accuracy": 0.9662427455186844, | |
| "num_tokens": 2724296.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 2.052559405565262, | |
| "epoch": 1.1194570135746607, | |
| "grad_norm": 0.6510501503944397, | |
| "learning_rate": 0.00048232310792127846, | |
| "loss": 0.1831, | |
| "mean_token_accuracy": 0.9533994495868683, | |
| "num_tokens": 2733482.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 2.093154102563858, | |
| "epoch": 1.123076923076923, | |
| "grad_norm": 0.711121678352356, | |
| "learning_rate": 0.0004821060043824466, | |
| "loss": 0.2315, | |
| "mean_token_accuracy": 0.9381555914878845, | |
| "num_tokens": 2742912.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 2.188497006893158, | |
| "epoch": 1.1266968325791855, | |
| "grad_norm": 0.6782490015029907, | |
| "learning_rate": 0.00048188813720728707, | |
| "loss": 0.2, | |
| "mean_token_accuracy": 0.9501812607049942, | |
| "num_tokens": 2751808.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 2.0495824217796326, | |
| "epoch": 1.130316742081448, | |
| "grad_norm": 0.7644634246826172, | |
| "learning_rate": 0.00048166950722328697, | |
| "loss": 0.2152, | |
| "mean_token_accuracy": 0.9440928995609283, | |
| "num_tokens": 2761066.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 2.1707025468349457, | |
| "epoch": 1.1339366515837104, | |
| "grad_norm": 0.655131459236145, | |
| "learning_rate": 0.00048145011526083106, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9500558227300644, | |
| "num_tokens": 2769870.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 2.1047372221946716, | |
| "epoch": 1.1375565610859728, | |
| "grad_norm": 0.5353516936302185, | |
| "learning_rate": 0.0004812299621531979, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.9455999433994293, | |
| "num_tokens": 2779383.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 2.1921610236167908, | |
| "epoch": 1.1411764705882352, | |
| "grad_norm": 0.8998016119003296, | |
| "learning_rate": 0.00048100904873655696, | |
| "loss": 0.3918, | |
| "mean_token_accuracy": 0.9382697492837906, | |
| "num_tokens": 2788386.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 2.0850723683834076, | |
| "epoch": 1.1447963800904977, | |
| "grad_norm": 0.867432713508606, | |
| "learning_rate": 0.0004807873758499656, | |
| "loss": 0.2196, | |
| "mean_token_accuracy": 0.9498324394226074, | |
| "num_tokens": 2797496.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 2.1980925798416138, | |
| "epoch": 1.14841628959276, | |
| "grad_norm": 0.6076980233192444, | |
| "learning_rate": 0.00048056494433536577, | |
| "loss": 0.1086, | |
| "mean_token_accuracy": 0.9642161130905151, | |
| "num_tokens": 2805836.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 2.15611070394516, | |
| "epoch": 1.1520361990950225, | |
| "grad_norm": 0.6276211738586426, | |
| "learning_rate": 0.0004803417550375806, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9622830748558044, | |
| "num_tokens": 2814404.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 2.0017230808734894, | |
| "epoch": 1.155656108597285, | |
| "grad_norm": 0.5840948820114136, | |
| "learning_rate": 0.0004801178088043115, | |
| "loss": 0.1869, | |
| "mean_token_accuracy": 0.9506777077913284, | |
| "num_tokens": 2823786.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 2.1539418697357178, | |
| "epoch": 1.1592760180995474, | |
| "grad_norm": 1.074331283569336, | |
| "learning_rate": 0.0004798931064861349, | |
| "loss": 0.2797, | |
| "mean_token_accuracy": 0.9271649420261383, | |
| "num_tokens": 2832374.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.930726408958435, | |
| "epoch": 1.16289592760181, | |
| "grad_norm": 0.5121958255767822, | |
| "learning_rate": 0.0004796676489364988, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9582571685314178, | |
| "num_tokens": 2841561.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 2.0205810368061066, | |
| "epoch": 1.1665158371040725, | |
| "grad_norm": 0.6360969543457031, | |
| "learning_rate": 0.00047944143701171966, | |
| "loss": 0.1582, | |
| "mean_token_accuracy": 0.9620308429002762, | |
| "num_tokens": 2850171.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.9655758142471313, | |
| "epoch": 1.170135746606335, | |
| "grad_norm": 0.6647385358810425, | |
| "learning_rate": 0.0004792144715709792, | |
| "loss": 0.1594, | |
| "mean_token_accuracy": 0.954497441649437, | |
| "num_tokens": 2858905.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.9725223183631897, | |
| "epoch": 1.1737556561085973, | |
| "grad_norm": 0.6429229974746704, | |
| "learning_rate": 0.0004789867534763211, | |
| "loss": 0.1407, | |
| "mean_token_accuracy": 0.9645214527845383, | |
| "num_tokens": 2867533.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.9473685026168823, | |
| "epoch": 1.1773755656108598, | |
| "grad_norm": 0.811651349067688, | |
| "learning_rate": 0.0004787582835926477, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9479968994855881, | |
| "num_tokens": 2876286.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.8863109350204468, | |
| "epoch": 1.1809954751131222, | |
| "grad_norm": 0.5587059855461121, | |
| "learning_rate": 0.00047852906278771686, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9684520065784454, | |
| "num_tokens": 2885667.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.8288891315460205, | |
| "epoch": 1.1846153846153846, | |
| "grad_norm": 0.8450536131858826, | |
| "learning_rate": 0.0004782990919321383, | |
| "loss": 0.2224, | |
| "mean_token_accuracy": 0.9377491921186447, | |
| "num_tokens": 2894765.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.9347718358039856, | |
| "epoch": 1.188235294117647, | |
| "grad_norm": 0.7665867209434509, | |
| "learning_rate": 0.0004780683718993705, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9583602845668793, | |
| "num_tokens": 2903551.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.9097798764705658, | |
| "epoch": 1.1918552036199095, | |
| "grad_norm": 0.7705667018890381, | |
| "learning_rate": 0.00047783690356571784, | |
| "loss": 0.2115, | |
| "mean_token_accuracy": 0.9526428133249283, | |
| "num_tokens": 2912197.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.9174850285053253, | |
| "epoch": 1.195475113122172, | |
| "grad_norm": 0.5695499181747437, | |
| "learning_rate": 0.00047760468781032634, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.969958484172821, | |
| "num_tokens": 2920579.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.8578442931175232, | |
| "epoch": 1.1990950226244343, | |
| "grad_norm": 0.7843735814094543, | |
| "learning_rate": 0.000477371725515181, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9545005410909653, | |
| "num_tokens": 2929352.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.8509328961372375, | |
| "epoch": 1.2027149321266968, | |
| "grad_norm": 0.5951048135757446, | |
| "learning_rate": 0.0004771380175651026, | |
| "loss": 0.1566, | |
| "mean_token_accuracy": 0.9551403075456619, | |
| "num_tokens": 2938387.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.8236390948295593, | |
| "epoch": 1.2063348416289592, | |
| "grad_norm": 0.4988223910331726, | |
| "learning_rate": 0.0004769035648477434, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.966319814324379, | |
| "num_tokens": 2947741.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.9594822525978088, | |
| "epoch": 1.2099547511312216, | |
| "grad_norm": 0.7550755143165588, | |
| "learning_rate": 0.00047666836825358477, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.9666347652673721, | |
| "num_tokens": 2956313.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.9148444533348083, | |
| "epoch": 1.213574660633484, | |
| "grad_norm": 0.5889077186584473, | |
| "learning_rate": 0.00047643242867593345, | |
| "loss": 0.1343, | |
| "mean_token_accuracy": 0.9611433297395706, | |
| "num_tokens": 2964928.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.8126957714557648, | |
| "epoch": 1.2171945701357467, | |
| "grad_norm": 0.5447750091552734, | |
| "learning_rate": 0.0004761957470109179, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9552300125360489, | |
| "num_tokens": 2974160.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.7981431782245636, | |
| "epoch": 1.2208144796380092, | |
| "grad_norm": 0.5400761365890503, | |
| "learning_rate": 0.0004759583241574854, | |
| "loss": 0.1339, | |
| "mean_token_accuracy": 0.9620136916637421, | |
| "num_tokens": 2982900.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.8613979518413544, | |
| "epoch": 1.2244343891402716, | |
| "grad_norm": 0.7452914714813232, | |
| "learning_rate": 0.0004757201610173981, | |
| "loss": 0.4, | |
| "mean_token_accuracy": 0.9068266004323959, | |
| "num_tokens": 2991783.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.8654026687145233, | |
| "epoch": 1.228054298642534, | |
| "grad_norm": 1.7142685651779175, | |
| "learning_rate": 0.00047548125849523, | |
| "loss": 0.3168, | |
| "mean_token_accuracy": 0.9308896362781525, | |
| "num_tokens": 3000530.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.7702704071998596, | |
| "epoch": 1.2316742081447964, | |
| "grad_norm": 0.6687431931495667, | |
| "learning_rate": 0.0004752416174983633, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9530515670776367, | |
| "num_tokens": 3009355.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.735857516527176, | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 0.6127599477767944, | |
| "learning_rate": 0.00047500123893698507, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9593266248703003, | |
| "num_tokens": 3018518.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.7076368927955627, | |
| "epoch": 1.2389140271493213, | |
| "grad_norm": 0.6973987817764282, | |
| "learning_rate": 0.0004747601237240836, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9539438933134079, | |
| "num_tokens": 3027752.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.7353227138519287, | |
| "epoch": 1.2425339366515837, | |
| "grad_norm": 0.8406392335891724, | |
| "learning_rate": 0.00047451827277544546, | |
| "loss": 0.2063, | |
| "mean_token_accuracy": 0.9488435834646225, | |
| "num_tokens": 3036383.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.6597246527671814, | |
| "epoch": 1.2461538461538462, | |
| "grad_norm": 0.5971431732177734, | |
| "learning_rate": 0.00047427568700965107, | |
| "loss": 0.1013, | |
| "mean_token_accuracy": 0.9721864312887192, | |
| "num_tokens": 3045375.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.7100033462047577, | |
| "epoch": 1.2497737556561086, | |
| "grad_norm": 0.5883470773696899, | |
| "learning_rate": 0.00047403236734807225, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9664830714464188, | |
| "num_tokens": 3054084.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.7402609288692474, | |
| "epoch": 1.253393665158371, | |
| "grad_norm": 0.7355862855911255, | |
| "learning_rate": 0.00047378831471486815, | |
| "loss": 0.2007, | |
| "mean_token_accuracy": 0.9560511559247971, | |
| "num_tokens": 3062727.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.79518261551857, | |
| "epoch": 1.2570135746606335, | |
| "grad_norm": 0.6006518006324768, | |
| "learning_rate": 0.00047354353003698163, | |
| "loss": 0.1085, | |
| "mean_token_accuracy": 0.9598321914672852, | |
| "num_tokens": 3071178.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.7328391373157501, | |
| "epoch": 1.260633484162896, | |
| "grad_norm": 0.560342013835907, | |
| "learning_rate": 0.0004732980142441362, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9579409211874008, | |
| "num_tokens": 3079927.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.7356511652469635, | |
| "epoch": 1.2642533936651583, | |
| "grad_norm": 0.9149975776672363, | |
| "learning_rate": 0.00047305176826883206, | |
| "loss": 0.4064, | |
| "mean_token_accuracy": 0.9265118837356567, | |
| "num_tokens": 3089314.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.8573569357395172, | |
| "epoch": 1.2678733031674208, | |
| "grad_norm": 0.8300670981407166, | |
| "learning_rate": 0.0004728047930463428, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9453776180744171, | |
| "num_tokens": 3097702.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.7906217575073242, | |
| "epoch": 1.2714932126696832, | |
| "grad_norm": 0.5668906569480896, | |
| "learning_rate": 0.0004725570895147118, | |
| "loss": 0.1572, | |
| "mean_token_accuracy": 0.962067037820816, | |
| "num_tokens": 3106379.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.6957395374774933, | |
| "epoch": 1.2751131221719456, | |
| "grad_norm": 0.4048328399658203, | |
| "learning_rate": 0.0004723086586147487, | |
| "loss": 0.0944, | |
| "mean_token_accuracy": 0.9716819673776627, | |
| "num_tokens": 3115622.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.8158144056797028, | |
| "epoch": 1.278733031674208, | |
| "grad_norm": 0.6396092772483826, | |
| "learning_rate": 0.00047205950129002564, | |
| "loss": 0.1011, | |
| "mean_token_accuracy": 0.9698463827371597, | |
| "num_tokens": 3124016.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.730194479227066, | |
| "epoch": 1.2823529411764705, | |
| "grad_norm": 0.662876307964325, | |
| "learning_rate": 0.000471809618486874, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.9520179778337479, | |
| "num_tokens": 3132712.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.6776110529899597, | |
| "epoch": 1.285972850678733, | |
| "grad_norm": 0.868507981300354, | |
| "learning_rate": 0.0004715590111543804, | |
| "loss": 0.3374, | |
| "mean_token_accuracy": 0.9303739666938782, | |
| "num_tokens": 3142103.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.6501678824424744, | |
| "epoch": 1.2895927601809956, | |
| "grad_norm": 0.5433686971664429, | |
| "learning_rate": 0.0004713076802443834, | |
| "loss": 0.1237, | |
| "mean_token_accuracy": 0.9653612226247787, | |
| "num_tokens": 3151192.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.6524465382099152, | |
| "epoch": 1.293212669683258, | |
| "grad_norm": 0.6145523190498352, | |
| "learning_rate": 0.00047105562671147, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9690534323453903, | |
| "num_tokens": 3159839.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.5339214205741882, | |
| "epoch": 1.2968325791855204, | |
| "grad_norm": 0.500477135181427, | |
| "learning_rate": 0.00047080285151297144, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9571033865213394, | |
| "num_tokens": 3169047.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.6765435338020325, | |
| "epoch": 1.3004524886877828, | |
| "grad_norm": 0.6697553396224976, | |
| "learning_rate": 0.00047054935560896026, | |
| "loss": 0.135, | |
| "mean_token_accuracy": 0.9672541171312332, | |
| "num_tokens": 3177062.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.5932062566280365, | |
| "epoch": 1.3040723981900453, | |
| "grad_norm": 0.706957221031189, | |
| "learning_rate": 0.0004702951399622462, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9634416699409485, | |
| "num_tokens": 3185829.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.5623145997524261, | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 0.6199461221694946, | |
| "learning_rate": 0.00047004020553837275, | |
| "loss": 0.1449, | |
| "mean_token_accuracy": 0.9620065689086914, | |
| "num_tokens": 3194426.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.5226828753948212, | |
| "epoch": 1.3113122171945701, | |
| "grad_norm": 0.8962509036064148, | |
| "learning_rate": 0.0004697845533056132, | |
| "loss": 0.2207, | |
| "mean_token_accuracy": 0.9403344839811325, | |
| "num_tokens": 3203655.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.5395641326904297, | |
| "epoch": 1.3149321266968326, | |
| "grad_norm": 0.5993619561195374, | |
| "learning_rate": 0.00046952818423496727, | |
| "loss": 0.1486, | |
| "mean_token_accuracy": 0.9614185988903046, | |
| "num_tokens": 3212069.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.5738630294799805, | |
| "epoch": 1.318552036199095, | |
| "grad_norm": 0.7393983602523804, | |
| "learning_rate": 0.00046927109930015756, | |
| "loss": 0.1812, | |
| "mean_token_accuracy": 0.9535021334886551, | |
| "num_tokens": 3220482.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.5462632775306702, | |
| "epoch": 1.3221719457013574, | |
| "grad_norm": 0.7453555464744568, | |
| "learning_rate": 0.0004690132994776253, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9585814625024796, | |
| "num_tokens": 3229505.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.5241961777210236, | |
| "epoch": 1.3257918552036199, | |
| "grad_norm": 0.7553415298461914, | |
| "learning_rate": 0.00046875478574652713, | |
| "loss": 0.1445, | |
| "mean_token_accuracy": 0.9682841598987579, | |
| "num_tokens": 3238326.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.5344699025154114, | |
| "epoch": 1.3294117647058823, | |
| "grad_norm": 0.8565949201583862, | |
| "learning_rate": 0.0004684955590887311, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 0.920401468873024, | |
| "num_tokens": 3247482.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.5109277665615082, | |
| "epoch": 1.3330316742081447, | |
| "grad_norm": 0.5170580148696899, | |
| "learning_rate": 0.00046823562048881295, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.9584086239337921, | |
| "num_tokens": 3256464.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.4666939079761505, | |
| "epoch": 1.3366515837104074, | |
| "grad_norm": 0.6995373368263245, | |
| "learning_rate": 0.0004679749709340529, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9477890431880951, | |
| "num_tokens": 3265853.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.4208430051803589, | |
| "epoch": 1.3402714932126698, | |
| "grad_norm": 1.1363991498947144, | |
| "learning_rate": 0.000467713611414431, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9495431333780289, | |
| "num_tokens": 3275367.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.5009459853172302, | |
| "epoch": 1.3438914027149322, | |
| "grad_norm": 0.7883325219154358, | |
| "learning_rate": 0.00046745154292262414, | |
| "loss": 0.2526, | |
| "mean_token_accuracy": 0.9334618002176285, | |
| "num_tokens": 3284772.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.5485479533672333, | |
| "epoch": 1.3475113122171947, | |
| "grad_norm": 0.6516429781913757, | |
| "learning_rate": 0.00046718876645400156, | |
| "loss": 0.2057, | |
| "mean_token_accuracy": 0.9546459317207336, | |
| "num_tokens": 3293493.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.6237249970436096, | |
| "epoch": 1.351131221719457, | |
| "grad_norm": 0.8916263580322266, | |
| "learning_rate": 0.00046692528300662213, | |
| "loss": 0.2123, | |
| "mean_token_accuracy": 0.9456845372915268, | |
| "num_tokens": 3302063.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.561572015285492, | |
| "epoch": 1.3547511312217195, | |
| "grad_norm": 0.7527791857719421, | |
| "learning_rate": 0.00046666109358122935, | |
| "loss": 0.2113, | |
| "mean_token_accuracy": 0.9537477940320969, | |
| "num_tokens": 3311037.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.5594256818294525, | |
| "epoch": 1.358371040723982, | |
| "grad_norm": 1.25638747215271, | |
| "learning_rate": 0.0004663961991812485, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9508458077907562, | |
| "num_tokens": 3319635.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.6909976303577423, | |
| "epoch": 1.3619909502262444, | |
| "grad_norm": 0.7627813220024109, | |
| "learning_rate": 0.00046613060081278194, | |
| "loss": 0.2303, | |
| "mean_token_accuracy": 0.9425801336765289, | |
| "num_tokens": 3328043.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.6074829697608948, | |
| "epoch": 1.3656108597285068, | |
| "grad_norm": 0.6584346294403076, | |
| "learning_rate": 0.00046586429948460646, | |
| "loss": 0.1815, | |
| "mean_token_accuracy": 0.9536214470863342, | |
| "num_tokens": 3337143.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.7382183969020844, | |
| "epoch": 1.3692307692307693, | |
| "grad_norm": 1.37154221534729, | |
| "learning_rate": 0.0004655972962081684, | |
| "loss": 0.1849, | |
| "mean_token_accuracy": 0.948440819978714, | |
| "num_tokens": 3346033.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.7148900926113129, | |
| "epoch": 1.3728506787330317, | |
| "grad_norm": 0.9487980604171753, | |
| "learning_rate": 0.00046532959199758, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 0.9344504028558731, | |
| "num_tokens": 3354849.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.7164019346237183, | |
| "epoch": 1.3764705882352941, | |
| "grad_norm": 0.5609025359153748, | |
| "learning_rate": 0.00046506118786961614, | |
| "loss": 0.1425, | |
| "mean_token_accuracy": 0.9571309834718704, | |
| "num_tokens": 3363674.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.894619107246399, | |
| "epoch": 1.3800904977375565, | |
| "grad_norm": 0.9811336994171143, | |
| "learning_rate": 0.00046479208484370997, | |
| "loss": 0.2522, | |
| "mean_token_accuracy": 0.9424156546592712, | |
| "num_tokens": 3372325.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.78870290517807, | |
| "epoch": 1.383710407239819, | |
| "grad_norm": 0.5707085132598877, | |
| "learning_rate": 0.00046452228394194893, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9613165706396103, | |
| "num_tokens": 3381270.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.803922712802887, | |
| "epoch": 1.3873303167420814, | |
| "grad_norm": 0.5655364394187927, | |
| "learning_rate": 0.0004642517861890713, | |
| "loss": 0.0818, | |
| "mean_token_accuracy": 0.9776160269975662, | |
| "num_tokens": 3390363.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.8172507882118225, | |
| "epoch": 1.3909502262443438, | |
| "grad_norm": 0.6950513124465942, | |
| "learning_rate": 0.00046398059261246205, | |
| "loss": 0.1145, | |
| "mean_token_accuracy": 0.963288351893425, | |
| "num_tokens": 3399176.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.9182518422603607, | |
| "epoch": 1.3945701357466063, | |
| "grad_norm": 0.5900619029998779, | |
| "learning_rate": 0.0004637087042421489, | |
| "loss": 0.108, | |
| "mean_token_accuracy": 0.9723307639360428, | |
| "num_tokens": 3407978.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.8558574616909027, | |
| "epoch": 1.3981900452488687, | |
| "grad_norm": 0.6279832124710083, | |
| "learning_rate": 0.00046343612211079843, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9603912532329559, | |
| "num_tokens": 3416856.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.8146779537200928, | |
| "epoch": 1.4018099547511311, | |
| "grad_norm": 0.6171274781227112, | |
| "learning_rate": 0.0004631628472537125, | |
| "loss": 0.1872, | |
| "mean_token_accuracy": 0.9447146654129028, | |
| "num_tokens": 3426044.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.9342225790023804, | |
| "epoch": 1.4054298642533936, | |
| "grad_norm": 0.9947887659072876, | |
| "learning_rate": 0.00046288888070882374, | |
| "loss": 0.2966, | |
| "mean_token_accuracy": 0.9279204607009888, | |
| "num_tokens": 3435154.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.9391801953315735, | |
| "epoch": 1.409049773755656, | |
| "grad_norm": 0.7155653834342957, | |
| "learning_rate": 0.000462614223516692, | |
| "loss": 0.1847, | |
| "mean_token_accuracy": 0.9475171864032745, | |
| "num_tokens": 3444563.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 2.0716978013515472, | |
| "epoch": 1.4126696832579184, | |
| "grad_norm": 0.8198989629745483, | |
| "learning_rate": 0.0004623388767205004, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9608721435070038, | |
| "num_tokens": 3453410.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 2.1060431599617004, | |
| "epoch": 1.416289592760181, | |
| "grad_norm": 1.025406002998352, | |
| "learning_rate": 0.00046206284136605106, | |
| "loss": 0.2146, | |
| "mean_token_accuracy": 0.9414294511079788, | |
| "num_tokens": 3461958.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 2.1459922194480896, | |
| "epoch": 1.4199095022624435, | |
| "grad_norm": 0.9209627509117126, | |
| "learning_rate": 0.00046178611850176146, | |
| "loss": 0.2137, | |
| "mean_token_accuracy": 0.956874743103981, | |
| "num_tokens": 3470547.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 2.0233450531959534, | |
| "epoch": 1.423529411764706, | |
| "grad_norm": 0.5777944922447205, | |
| "learning_rate": 0.00046150870917866025, | |
| "loss": 0.122, | |
| "mean_token_accuracy": 0.9672323018312454, | |
| "num_tokens": 3479618.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 2.035937190055847, | |
| "epoch": 1.4271493212669684, | |
| "grad_norm": 0.7945542931556702, | |
| "learning_rate": 0.0004612306144503835, | |
| "loss": 0.2879, | |
| "mean_token_accuracy": 0.946587473154068, | |
| "num_tokens": 3488533.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 2.155315637588501, | |
| "epoch": 1.4307692307692308, | |
| "grad_norm": 0.6385292410850525, | |
| "learning_rate": 0.00046095183537317035, | |
| "loss": 0.1008, | |
| "mean_token_accuracy": 0.9655124247074127, | |
| "num_tokens": 3496686.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 2.186827063560486, | |
| "epoch": 1.4343891402714932, | |
| "grad_norm": 0.4759826958179474, | |
| "learning_rate": 0.0004606723730058593, | |
| "loss": 0.0768, | |
| "mean_token_accuracy": 0.9783597737550735, | |
| "num_tokens": 3504958.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.974392294883728, | |
| "epoch": 1.4380090497737557, | |
| "grad_norm": 0.6250292062759399, | |
| "learning_rate": 0.00046039222840988406, | |
| "loss": 0.1381, | |
| "mean_token_accuracy": 0.9586146324872971, | |
| "num_tokens": 3513694.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 2.045738846063614, | |
| "epoch": 1.441628959276018, | |
| "grad_norm": 0.5517769455909729, | |
| "learning_rate": 0.0004601114026492695, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9682512134313583, | |
| "num_tokens": 3522395.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 2.105030357837677, | |
| "epoch": 1.4452488687782805, | |
| "grad_norm": 0.6748242974281311, | |
| "learning_rate": 0.0004598298967906276, | |
| "loss": 0.1056, | |
| "mean_token_accuracy": 0.9701305478811264, | |
| "num_tokens": 3530838.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 2.024325281381607, | |
| "epoch": 1.448868778280543, | |
| "grad_norm": 0.6320233941078186, | |
| "learning_rate": 0.00045954771190315344, | |
| "loss": 0.1129, | |
| "mean_token_accuracy": 0.9633017927408218, | |
| "num_tokens": 3540184.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 2.1561593413352966, | |
| "epoch": 1.4524886877828054, | |
| "grad_norm": 0.7380363941192627, | |
| "learning_rate": 0.0004592648490586213, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9599586874246597, | |
| "num_tokens": 3548727.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 2.2986454367637634, | |
| "epoch": 1.4561085972850678, | |
| "grad_norm": 0.669114351272583, | |
| "learning_rate": 0.00045898130933138024, | |
| "loss": 0.1005, | |
| "mean_token_accuracy": 0.9724964797496796, | |
| "num_tokens": 3556780.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 2.103136509656906, | |
| "epoch": 1.4597285067873302, | |
| "grad_norm": 0.6677402853965759, | |
| "learning_rate": 0.0004586970937983504, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9597653448581696, | |
| "num_tokens": 3565427.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 2.112696200609207, | |
| "epoch": 1.463348416289593, | |
| "grad_norm": 0.4597342014312744, | |
| "learning_rate": 0.0004584122035390185, | |
| "loss": 0.0695, | |
| "mean_token_accuracy": 0.9763098359107971, | |
| "num_tokens": 3573902.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 2.0472628474235535, | |
| "epoch": 1.4669683257918553, | |
| "grad_norm": 0.7842056751251221, | |
| "learning_rate": 0.0004581266396354339, | |
| "loss": 0.1981, | |
| "mean_token_accuracy": 0.9521032422780991, | |
| "num_tokens": 3582913.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 2.236558735370636, | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 0.7634767293930054, | |
| "learning_rate": 0.000457840403172205, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9602932929992676, | |
| "num_tokens": 3591197.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 2.182949125766754, | |
| "epoch": 1.4742081447963802, | |
| "grad_norm": 0.7084661722183228, | |
| "learning_rate": 0.00045755349523649415, | |
| "loss": 0.2463, | |
| "mean_token_accuracy": 0.9392582327127457, | |
| "num_tokens": 3600134.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 2.135133147239685, | |
| "epoch": 1.4778280542986426, | |
| "grad_norm": 0.8172940015792847, | |
| "learning_rate": 0.00045726591691801433, | |
| "loss": 0.2375, | |
| "mean_token_accuracy": 0.9458330571651459, | |
| "num_tokens": 3608945.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 2.157473146915436, | |
| "epoch": 1.481447963800905, | |
| "grad_norm": 0.6165594458580017, | |
| "learning_rate": 0.0004569776693090246, | |
| "loss": 0.1628, | |
| "mean_token_accuracy": 0.9586529731750488, | |
| "num_tokens": 3617790.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 2.15165376663208, | |
| "epoch": 1.4850678733031675, | |
| "grad_norm": 0.6619407534599304, | |
| "learning_rate": 0.0004566887535043263, | |
| "loss": 0.1866, | |
| "mean_token_accuracy": 0.9545126557350159, | |
| "num_tokens": 3626937.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 2.271161735057831, | |
| "epoch": 1.48868778280543, | |
| "grad_norm": 0.5861835479736328, | |
| "learning_rate": 0.0004563991706012582, | |
| "loss": 0.1409, | |
| "mean_token_accuracy": 0.9595955163240433, | |
| "num_tokens": 3636025.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 2.277799427509308, | |
| "epoch": 1.4923076923076923, | |
| "grad_norm": 0.6464956402778625, | |
| "learning_rate": 0.00045610892169969323, | |
| "loss": 0.0792, | |
| "mean_token_accuracy": 0.9806316941976547, | |
| "num_tokens": 3644746.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 2.2143171429634094, | |
| "epoch": 1.4959276018099548, | |
| "grad_norm": 0.7531687021255493, | |
| "learning_rate": 0.00045581800790203366, | |
| "loss": 0.2584, | |
| "mean_token_accuracy": 0.9225966930389404, | |
| "num_tokens": 3654064.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 2.231681764125824, | |
| "epoch": 1.4995475113122172, | |
| "grad_norm": 0.6902768015861511, | |
| "learning_rate": 0.00045552643031320726, | |
| "loss": 0.232, | |
| "mean_token_accuracy": 0.9433842301368713, | |
| "num_tokens": 3663130.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 2.2672717571258545, | |
| "epoch": 1.5031674208144796, | |
| "grad_norm": 0.5134314894676208, | |
| "learning_rate": 0.00045523419004066273, | |
| "loss": 0.0874, | |
| "mean_token_accuracy": 0.9708191752433777, | |
| "num_tokens": 3671981.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 2.3302834033966064, | |
| "epoch": 1.506787330316742, | |
| "grad_norm": 0.885969340801239, | |
| "learning_rate": 0.0004549412881943659, | |
| "loss": 0.0723, | |
| "mean_token_accuracy": 0.9791463166475296, | |
| "num_tokens": 3680525.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 2.2693899869918823, | |
| "epoch": 1.5104072398190045, | |
| "grad_norm": 0.7424856424331665, | |
| "learning_rate": 0.00045464772588679547, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9600907415151596, | |
| "num_tokens": 3689430.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 2.4042725563049316, | |
| "epoch": 1.514027149321267, | |
| "grad_norm": 0.8968034982681274, | |
| "learning_rate": 0.0004543535042329382, | |
| "loss": 0.1984, | |
| "mean_token_accuracy": 0.9488537162542343, | |
| "num_tokens": 3697836.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 2.2518428564071655, | |
| "epoch": 1.5176470588235293, | |
| "grad_norm": 0.5963534712791443, | |
| "learning_rate": 0.0004540586243502858, | |
| "loss": 0.1214, | |
| "mean_token_accuracy": 0.9711381644010544, | |
| "num_tokens": 3706675.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 2.275522291660309, | |
| "epoch": 1.5212669683257918, | |
| "grad_norm": 1.0797090530395508, | |
| "learning_rate": 0.0004537630873588293, | |
| "loss": 0.2508, | |
| "mean_token_accuracy": 0.9247037768363953, | |
| "num_tokens": 3715631.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 2.249617278575897, | |
| "epoch": 1.5248868778280542, | |
| "grad_norm": 0.7636313438415527, | |
| "learning_rate": 0.000453466894381056, | |
| "loss": 0.1112, | |
| "mean_token_accuracy": 0.9681926071643829, | |
| "num_tokens": 3724579.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 2.280571699142456, | |
| "epoch": 1.5285067873303166, | |
| "grad_norm": 0.9915648698806763, | |
| "learning_rate": 0.00045317004654194464, | |
| "loss": 0.3532, | |
| "mean_token_accuracy": 0.9360047876834869, | |
| "num_tokens": 3733607.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 2.241512656211853, | |
| "epoch": 1.532126696832579, | |
| "grad_norm": 0.924977719783783, | |
| "learning_rate": 0.0004528725449689611, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9475428760051727, | |
| "num_tokens": 3742611.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 2.201731503009796, | |
| "epoch": 1.5357466063348415, | |
| "grad_norm": 0.7018861770629883, | |
| "learning_rate": 0.0004525743907920542, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9465018659830093, | |
| "num_tokens": 3751737.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 2.28944593667984, | |
| "epoch": 1.539366515837104, | |
| "grad_norm": 0.5893452763557434, | |
| "learning_rate": 0.00045227558514365166, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9711766839027405, | |
| "num_tokens": 3761245.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 2.3497202396392822, | |
| "epoch": 1.5429864253393664, | |
| "grad_norm": 0.685279130935669, | |
| "learning_rate": 0.0004519761291586551, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9663016647100449, | |
| "num_tokens": 3769854.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 2.308362066745758, | |
| "epoch": 1.5466063348416288, | |
| "grad_norm": 0.5116177797317505, | |
| "learning_rate": 0.00045167602397443694, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9700013697147369, | |
| "num_tokens": 3778996.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 2.238637685775757, | |
| "epoch": 1.5502262443438914, | |
| "grad_norm": 0.8374833464622498, | |
| "learning_rate": 0.00045137527073083457, | |
| "loss": 0.2539, | |
| "mean_token_accuracy": 0.9407305717468262, | |
| "num_tokens": 3787835.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 2.3406758308410645, | |
| "epoch": 1.5538461538461539, | |
| "grad_norm": 0.5140913724899292, | |
| "learning_rate": 0.0004510738705701473, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9635641574859619, | |
| "num_tokens": 3796498.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 2.2642539143562317, | |
| "epoch": 1.5574660633484163, | |
| "grad_norm": 0.5750702023506165, | |
| "learning_rate": 0.0004507718246371313, | |
| "loss": 0.1127, | |
| "mean_token_accuracy": 0.9660817235708237, | |
| "num_tokens": 3805464.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 2.2058264315128326, | |
| "epoch": 1.5610859728506787, | |
| "grad_norm": 0.6448659300804138, | |
| "learning_rate": 0.0004504691340789955, | |
| "loss": 0.0994, | |
| "mean_token_accuracy": 0.96739861369133, | |
| "num_tokens": 3814309.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 2.330399215221405, | |
| "epoch": 1.5647058823529412, | |
| "grad_norm": 0.8432528376579285, | |
| "learning_rate": 0.0004501658000453973, | |
| "loss": 0.1999, | |
| "mean_token_accuracy": 0.9510775059461594, | |
| "num_tokens": 3823126.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 2.4211326837539673, | |
| "epoch": 1.5683257918552036, | |
| "grad_norm": 0.8101194500923157, | |
| "learning_rate": 0.00044986182368843806, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9656328558921814, | |
| "num_tokens": 3831274.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 2.2594956755638123, | |
| "epoch": 1.571945701357466, | |
| "grad_norm": 0.6753663420677185, | |
| "learning_rate": 0.0004495572061626585, | |
| "loss": 0.1433, | |
| "mean_token_accuracy": 0.9572386592626572, | |
| "num_tokens": 3840206.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 2.1233682930469513, | |
| "epoch": 1.5755656108597285, | |
| "grad_norm": 0.48616713285446167, | |
| "learning_rate": 0.000449251948625035, | |
| "loss": 0.0934, | |
| "mean_token_accuracy": 0.9740773588418961, | |
| "num_tokens": 3849363.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 2.325556695461273, | |
| "epoch": 1.5791855203619911, | |
| "grad_norm": 0.7744045853614807, | |
| "learning_rate": 0.00044894605223497446, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.9687052518129349, | |
| "num_tokens": 3857733.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 2.266542673110962, | |
| "epoch": 1.5828054298642535, | |
| "grad_norm": 2.373530387878418, | |
| "learning_rate": 0.00044863951815431045, | |
| "loss": 0.2404, | |
| "mean_token_accuracy": 0.9437267184257507, | |
| "num_tokens": 3866374.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 2.1757248640060425, | |
| "epoch": 1.586425339366516, | |
| "grad_norm": 0.5588560700416565, | |
| "learning_rate": 0.00044833234754729847, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.9601300358772278, | |
| "num_tokens": 3875520.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 2.124377518892288, | |
| "epoch": 1.5900452488687784, | |
| "grad_norm": 0.5602438449859619, | |
| "learning_rate": 0.0004480245415806116, | |
| "loss": 0.1556, | |
| "mean_token_accuracy": 0.9561446160078049, | |
| "num_tokens": 3884345.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 2.1571075320243835, | |
| "epoch": 1.5936651583710408, | |
| "grad_norm": 0.472598671913147, | |
| "learning_rate": 0.0004477161014233361, | |
| "loss": 0.0848, | |
| "mean_token_accuracy": 0.9742853343486786, | |
| "num_tokens": 3893129.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 2.0434057414531708, | |
| "epoch": 1.5972850678733033, | |
| "grad_norm": 0.7104448676109314, | |
| "learning_rate": 0.00044740702824696703, | |
| "loss": 0.1524, | |
| "mean_token_accuracy": 0.9542464315891266, | |
| "num_tokens": 3902120.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 2.1118403673171997, | |
| "epoch": 1.6009049773755657, | |
| "grad_norm": 0.6632394194602966, | |
| "learning_rate": 0.0004470973232254037, | |
| "loss": 0.3001, | |
| "mean_token_accuracy": 0.928197592496872, | |
| "num_tokens": 3910974.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 2.0292475819587708, | |
| "epoch": 1.6045248868778281, | |
| "grad_norm": 1.050956130027771, | |
| "learning_rate": 0.00044678698753494527, | |
| "loss": 0.2226, | |
| "mean_token_accuracy": 0.9448522627353668, | |
| "num_tokens": 3920005.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.991033524274826, | |
| "epoch": 1.6081447963800906, | |
| "grad_norm": 0.670244038105011, | |
| "learning_rate": 0.00044647602235428624, | |
| "loss": 0.2158, | |
| "mean_token_accuracy": 0.9551118016242981, | |
| "num_tokens": 3929334.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 2.04949289560318, | |
| "epoch": 1.611764705882353, | |
| "grad_norm": 0.6321494579315186, | |
| "learning_rate": 0.00044616442886451197, | |
| "loss": 0.1743, | |
| "mean_token_accuracy": 0.9494802355766296, | |
| "num_tokens": 3938211.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 2.1101951897144318, | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 0.6970012187957764, | |
| "learning_rate": 0.0004458522082490943, | |
| "loss": 0.1228, | |
| "mean_token_accuracy": 0.9624926447868347, | |
| "num_tokens": 3946534.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.9337081909179688, | |
| "epoch": 1.6190045248868778, | |
| "grad_norm": 0.5971657633781433, | |
| "learning_rate": 0.0004455393616938868, | |
| "loss": 0.1431, | |
| "mean_token_accuracy": 0.9635348320007324, | |
| "num_tokens": 3955694.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.9635128676891327, | |
| "epoch": 1.6226244343891403, | |
| "grad_norm": 0.8510827422142029, | |
| "learning_rate": 0.00044522589038712074, | |
| "loss": 0.2446, | |
| "mean_token_accuracy": 0.9457641988992691, | |
| "num_tokens": 3964907.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 2.0336360335350037, | |
| "epoch": 1.6262443438914027, | |
| "grad_norm": 0.5803818106651306, | |
| "learning_rate": 0.00044491179551939985, | |
| "loss": 0.0872, | |
| "mean_token_accuracy": 0.9734505414962769, | |
| "num_tokens": 3973584.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 2.0668878853321075, | |
| "epoch": 1.6298642533936651, | |
| "grad_norm": 0.6990496516227722, | |
| "learning_rate": 0.0004445970782836967, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9702571034431458, | |
| "num_tokens": 3982632.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 2.1481760144233704, | |
| "epoch": 1.6334841628959276, | |
| "grad_norm": 0.6156729459762573, | |
| "learning_rate": 0.00044428173987534733, | |
| "loss": 0.0936, | |
| "mean_token_accuracy": 0.9739355593919754, | |
| "num_tokens": 3991147.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 2.0678701996803284, | |
| "epoch": 1.63710407239819, | |
| "grad_norm": 0.5441684126853943, | |
| "learning_rate": 0.0004439657814920472, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9693446308374405, | |
| "num_tokens": 3999990.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.9867055118083954, | |
| "epoch": 1.6407239819004524, | |
| "grad_norm": 0.9218093156814575, | |
| "learning_rate": 0.00044364920433384656, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9564195573329926, | |
| "num_tokens": 4009097.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 2.145586997270584, | |
| "epoch": 1.6443438914027149, | |
| "grad_norm": 0.77643883228302, | |
| "learning_rate": 0.0004433320096031458, | |
| "loss": 0.1491, | |
| "mean_token_accuracy": 0.9602408111095428, | |
| "num_tokens": 4018059.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 2.071108251810074, | |
| "epoch": 1.6479638009049773, | |
| "grad_norm": 0.5267088413238525, | |
| "learning_rate": 0.0004430141985046909, | |
| "loss": 0.0875, | |
| "mean_token_accuracy": 0.9764399826526642, | |
| "num_tokens": 4027089.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 2.1659318804740906, | |
| "epoch": 1.6515837104072397, | |
| "grad_norm": 1.0642318725585938, | |
| "learning_rate": 0.000442695772245569, | |
| "loss": 0.2623, | |
| "mean_token_accuracy": 0.9307756721973419, | |
| "num_tokens": 4035719.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 2.0232724249362946, | |
| "epoch": 1.6552036199095022, | |
| "grad_norm": 0.6213289499282837, | |
| "learning_rate": 0.0004423767320352035, | |
| "loss": 0.1597, | |
| "mean_token_accuracy": 0.9599647223949432, | |
| "num_tokens": 4045088.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 2.047410547733307, | |
| "epoch": 1.6588235294117646, | |
| "grad_norm": 0.6346105933189392, | |
| "learning_rate": 0.0004420570790853498, | |
| "loss": 0.1422, | |
| "mean_token_accuracy": 0.9649711549282074, | |
| "num_tokens": 4054262.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 2.0923012793064117, | |
| "epoch": 1.662443438914027, | |
| "grad_norm": 0.46477749943733215, | |
| "learning_rate": 0.0004417368146100907, | |
| "loss": 0.079, | |
| "mean_token_accuracy": 0.9777993708848953, | |
| "num_tokens": 4063107.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 2.168913394212723, | |
| "epoch": 1.6660633484162894, | |
| "grad_norm": 0.5164734721183777, | |
| "learning_rate": 0.0004414159398258312, | |
| "loss": 0.0941, | |
| "mean_token_accuracy": 0.9725133627653122, | |
| "num_tokens": 4071656.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 2.152670443058014, | |
| "epoch": 1.6696832579185519, | |
| "grad_norm": 0.8985757231712341, | |
| "learning_rate": 0.00044109445595129495, | |
| "loss": 0.2142, | |
| "mean_token_accuracy": 0.9387252777814865, | |
| "num_tokens": 4080023.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 2.111784875392914, | |
| "epoch": 1.6733031674208145, | |
| "grad_norm": 0.47521084547042847, | |
| "learning_rate": 0.0004407723642075184, | |
| "loss": 0.0581, | |
| "mean_token_accuracy": 0.9821985810995102, | |
| "num_tokens": 4088469.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.9784683287143707, | |
| "epoch": 1.676923076923077, | |
| "grad_norm": 0.5552536249160767, | |
| "learning_rate": 0.0004404496658178472, | |
| "loss": 0.1353, | |
| "mean_token_accuracy": 0.9619844257831573, | |
| "num_tokens": 4097737.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 2.015674114227295, | |
| "epoch": 1.6805429864253394, | |
| "grad_norm": 0.6078305244445801, | |
| "learning_rate": 0.0004401263620079309, | |
| "loss": 0.1916, | |
| "mean_token_accuracy": 0.9506707191467285, | |
| "num_tokens": 4107156.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 2.0832217931747437, | |
| "epoch": 1.6841628959276018, | |
| "grad_norm": 0.6618755459785461, | |
| "learning_rate": 0.0004398024540057186, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9617152661085129, | |
| "num_tokens": 4116019.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 2.0383114516735077, | |
| "epoch": 1.6877828054298643, | |
| "grad_norm": 0.5774693489074707, | |
| "learning_rate": 0.0004394779430414541, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9387127161026001, | |
| "num_tokens": 4125001.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 2.201409190893173, | |
| "epoch": 1.6914027149321267, | |
| "grad_norm": 0.7600311636924744, | |
| "learning_rate": 0.0004391528303476715, | |
| "loss": 0.073, | |
| "mean_token_accuracy": 0.979825034737587, | |
| "num_tokens": 4133467.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 2.168666422367096, | |
| "epoch": 1.6950226244343891, | |
| "grad_norm": 0.7801902294158936, | |
| "learning_rate": 0.00043882711715919015, | |
| "loss": 0.2406, | |
| "mean_token_accuracy": 0.9451306313276291, | |
| "num_tokens": 4141765.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 2.1429262161254883, | |
| "epoch": 1.6986425339366515, | |
| "grad_norm": 0.5192358493804932, | |
| "learning_rate": 0.0004385008047131104, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9749262481927872, | |
| "num_tokens": 4150732.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 2.1387495696544647, | |
| "epoch": 1.702262443438914, | |
| "grad_norm": 0.6219777464866638, | |
| "learning_rate": 0.0004381738942488083, | |
| "loss": 0.2127, | |
| "mean_token_accuracy": 0.9398418068885803, | |
| "num_tokens": 4159715.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 2.1718398332595825, | |
| "epoch": 1.7058823529411766, | |
| "grad_norm": 0.5738123655319214, | |
| "learning_rate": 0.0004378463870079316, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9520847648382187, | |
| "num_tokens": 4168526.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 2.2768235206604004, | |
| "epoch": 1.709502262443439, | |
| "grad_norm": 0.662564754486084, | |
| "learning_rate": 0.00043751828423439456, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9581841826438904, | |
| "num_tokens": 4177189.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 2.29143089056015, | |
| "epoch": 1.7131221719457015, | |
| "grad_norm": 0.8638074398040771, | |
| "learning_rate": 0.00043718958717437324, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9645630270242691, | |
| "num_tokens": 4185367.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 2.2810245156288147, | |
| "epoch": 1.716742081447964, | |
| "grad_norm": 0.6139346957206726, | |
| "learning_rate": 0.00043686029707630097, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9592728316783905, | |
| "num_tokens": 4194418.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 2.1307725310325623, | |
| "epoch": 1.7203619909502263, | |
| "grad_norm": 0.5192779302597046, | |
| "learning_rate": 0.00043653041519086354, | |
| "loss": 0.1025, | |
| "mean_token_accuracy": 0.970764696598053, | |
| "num_tokens": 4203705.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 2.160595118999481, | |
| "epoch": 1.7239819004524888, | |
| "grad_norm": 0.7398526668548584, | |
| "learning_rate": 0.0004361999427709943, | |
| "loss": 0.229, | |
| "mean_token_accuracy": 0.9352773874998093, | |
| "num_tokens": 4212648.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 2.1865442991256714, | |
| "epoch": 1.7276018099547512, | |
| "grad_norm": 0.6227203011512756, | |
| "learning_rate": 0.0004358688810718699, | |
| "loss": 0.1118, | |
| "mean_token_accuracy": 0.9689576476812363, | |
| "num_tokens": 4221208.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 2.086527943611145, | |
| "epoch": 1.7312217194570136, | |
| "grad_norm": 0.722144603729248, | |
| "learning_rate": 0.00043553723135090447, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9537550210952759, | |
| "num_tokens": 4230810.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 2.068355441093445, | |
| "epoch": 1.734841628959276, | |
| "grad_norm": 0.5781517028808594, | |
| "learning_rate": 0.0004352049948677462, | |
| "loss": 0.1497, | |
| "mean_token_accuracy": 0.9600837379693985, | |
| "num_tokens": 4240394.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 2.185140371322632, | |
| "epoch": 1.7384615384615385, | |
| "grad_norm": 0.7261873483657837, | |
| "learning_rate": 0.0004348721728842715, | |
| "loss": 0.1582, | |
| "mean_token_accuracy": 0.9584025889635086, | |
| "num_tokens": 4249205.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 2.21835720539093, | |
| "epoch": 1.742081447963801, | |
| "grad_norm": 0.5321667194366455, | |
| "learning_rate": 0.0004345387666645807, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9659005403518677, | |
| "num_tokens": 4257808.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 2.078131854534149, | |
| "epoch": 1.7457013574660634, | |
| "grad_norm": 0.5598498582839966, | |
| "learning_rate": 0.00043420477747499307, | |
| "loss": 0.1347, | |
| "mean_token_accuracy": 0.9678008407354355, | |
| "num_tokens": 4266728.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 2.060504525899887, | |
| "epoch": 1.7493212669683258, | |
| "grad_norm": 0.5017166137695312, | |
| "learning_rate": 0.0004338702065840422, | |
| "loss": 0.0722, | |
| "mean_token_accuracy": 0.9762782007455826, | |
| "num_tokens": 4275514.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 2.165244698524475, | |
| "epoch": 1.7529411764705882, | |
| "grad_norm": 0.4664002060890198, | |
| "learning_rate": 0.00043353505526247084, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9696767777204514, | |
| "num_tokens": 4284013.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 2.103049159049988, | |
| "epoch": 1.7565610859728507, | |
| "grad_norm": 0.6669000387191772, | |
| "learning_rate": 0.0004331993247832265, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9665459096431732, | |
| "num_tokens": 4293011.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 2.1286613941192627, | |
| "epoch": 1.760180995475113, | |
| "grad_norm": 0.7821269631385803, | |
| "learning_rate": 0.00043286301642145634, | |
| "loss": 0.3669, | |
| "mean_token_accuracy": 0.9062697291374207, | |
| "num_tokens": 4301965.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 2.098009169101715, | |
| "epoch": 1.7638009049773755, | |
| "grad_norm": 0.5720731616020203, | |
| "learning_rate": 0.0004325261314545024, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9650943875312805, | |
| "num_tokens": 4310914.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 2.164614498615265, | |
| "epoch": 1.767420814479638, | |
| "grad_norm": 1.0500473976135254, | |
| "learning_rate": 0.0004321886711618967, | |
| "loss": 0.1182, | |
| "mean_token_accuracy": 0.9720661342144012, | |
| "num_tokens": 4319072.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 2.2015402913093567, | |
| "epoch": 1.7710407239819004, | |
| "grad_norm": 0.5770253539085388, | |
| "learning_rate": 0.00043185063682535634, | |
| "loss": 0.1226, | |
| "mean_token_accuracy": 0.9615659862756729, | |
| "num_tokens": 4327539.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 2.075456440448761, | |
| "epoch": 1.7746606334841628, | |
| "grad_norm": 0.6456925272941589, | |
| "learning_rate": 0.0004315120297287789, | |
| "loss": 0.1123, | |
| "mean_token_accuracy": 0.9628709554672241, | |
| "num_tokens": 4336523.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 2.158169150352478, | |
| "epoch": 1.7782805429864252, | |
| "grad_norm": 0.8282069563865662, | |
| "learning_rate": 0.00043117285115823733, | |
| "loss": 0.2146, | |
| "mean_token_accuracy": 0.9413971602916718, | |
| "num_tokens": 4345294.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 2.02735897898674, | |
| "epoch": 1.7819004524886877, | |
| "grad_norm": 0.783597469329834, | |
| "learning_rate": 0.000430833102401975, | |
| "loss": 0.1376, | |
| "mean_token_accuracy": 0.964630737900734, | |
| "num_tokens": 4354107.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 2.138492166996002, | |
| "epoch": 1.78552036199095, | |
| "grad_norm": 0.6317175030708313, | |
| "learning_rate": 0.000430492784750401, | |
| "loss": 0.1005, | |
| "mean_token_accuracy": 0.9734214246273041, | |
| "num_tokens": 4362560.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 2.0253217220306396, | |
| "epoch": 1.7891402714932125, | |
| "grad_norm": 0.5523395538330078, | |
| "learning_rate": 0.000430151899496085, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9558031558990479, | |
| "num_tokens": 4371698.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 2.160472810268402, | |
| "epoch": 1.792760180995475, | |
| "grad_norm": 0.6557935476303101, | |
| "learning_rate": 0.00042981044793375295, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.9722230583429337, | |
| "num_tokens": 4380612.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 2.0284159183502197, | |
| "epoch": 1.7963800904977374, | |
| "grad_norm": 0.7357863187789917, | |
| "learning_rate": 0.00042946843136028117, | |
| "loss": 0.1166, | |
| "mean_token_accuracy": 0.9629471153020859, | |
| "num_tokens": 4389521.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 2.1544791162014008, | |
| "epoch": 1.8, | |
| "grad_norm": 0.5604898929595947, | |
| "learning_rate": 0.00042912585107469226, | |
| "loss": 0.0834, | |
| "mean_token_accuracy": 0.9783036410808563, | |
| "num_tokens": 4398059.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 2.1051094830036163, | |
| "epoch": 1.8036199095022625, | |
| "grad_norm": 0.4598539173603058, | |
| "learning_rate": 0.0004287827083781497, | |
| "loss": 0.0411, | |
| "mean_token_accuracy": 0.9868490546941757, | |
| "num_tokens": 4406453.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 2.0219272077083588, | |
| "epoch": 1.807239819004525, | |
| "grad_norm": 0.8164628744125366, | |
| "learning_rate": 0.00042843900457395343, | |
| "loss": 0.1988, | |
| "mean_token_accuracy": 0.9502352625131607, | |
| "num_tokens": 4415440.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.980013906955719, | |
| "epoch": 1.8108597285067873, | |
| "grad_norm": 0.572798490524292, | |
| "learning_rate": 0.0004280947409675341, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.966580331325531, | |
| "num_tokens": 4424532.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 2.0646563172340393, | |
| "epoch": 1.8144796380090498, | |
| "grad_norm": 0.769386351108551, | |
| "learning_rate": 0.00042774991886644875, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9553463608026505, | |
| "num_tokens": 4432913.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 2.040877491235733, | |
| "epoch": 1.8180995475113122, | |
| "grad_norm": 0.7467371821403503, | |
| "learning_rate": 0.0004274045395803758, | |
| "loss": 0.2247, | |
| "mean_token_accuracy": 0.9526964277029037, | |
| "num_tokens": 4441425.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 1.9934698939323425, | |
| "epoch": 1.8217194570135746, | |
| "grad_norm": 0.6602952480316162, | |
| "learning_rate": 0.00042705860442110964, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9594631940126419, | |
| "num_tokens": 4450383.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 2.0858289897441864, | |
| "epoch": 1.825339366515837, | |
| "grad_norm": 0.684380829334259, | |
| "learning_rate": 0.0004267121147025562, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.9638111293315887, | |
| "num_tokens": 4458862.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 2.0886995792388916, | |
| "epoch": 1.8289592760180997, | |
| "grad_norm": 0.5784837007522583, | |
| "learning_rate": 0.00042636507174072756, | |
| "loss": 0.1026, | |
| "mean_token_accuracy": 0.9676834791898727, | |
| "num_tokens": 4467386.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 2.0236063301563263, | |
| "epoch": 1.8325791855203621, | |
| "grad_norm": 0.5101180672645569, | |
| "learning_rate": 0.00042601747685373716, | |
| "loss": 0.1031, | |
| "mean_token_accuracy": 0.9734093993902206, | |
| "num_tokens": 4476054.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 1.9801031053066254, | |
| "epoch": 1.8361990950226246, | |
| "grad_norm": 0.6581607460975647, | |
| "learning_rate": 0.00042566933136179455, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9581006914377213, | |
| "num_tokens": 4484895.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 2.0244787633419037, | |
| "epoch": 1.839819004524887, | |
| "grad_norm": 0.8100608587265015, | |
| "learning_rate": 0.0004253206365872008, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9532899260520935, | |
| "num_tokens": 4493737.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 1.9108119010925293, | |
| "epoch": 1.8434389140271494, | |
| "grad_norm": 0.4903942048549652, | |
| "learning_rate": 0.00042497139385434314, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9667337089776993, | |
| "num_tokens": 4502840.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 2.009468197822571, | |
| "epoch": 1.8470588235294119, | |
| "grad_norm": 0.6010113954544067, | |
| "learning_rate": 0.0004246216044896897, | |
| "loss": 0.1013, | |
| "mean_token_accuracy": 0.9692314714193344, | |
| "num_tokens": 4511407.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 2.0337170362472534, | |
| "epoch": 1.8506787330316743, | |
| "grad_norm": 0.7906802892684937, | |
| "learning_rate": 0.00042427126982178546, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9550099819898605, | |
| "num_tokens": 4520018.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 1.8813888728618622, | |
| "epoch": 1.8542986425339367, | |
| "grad_norm": 0.5353080034255981, | |
| "learning_rate": 0.00042392039118124586, | |
| "loss": 0.1228, | |
| "mean_token_accuracy": 0.9624074995517731, | |
| "num_tokens": 4529270.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 2.012698233127594, | |
| "epoch": 1.8579185520361992, | |
| "grad_norm": 0.6713843941688538, | |
| "learning_rate": 0.00042356896990075285, | |
| "loss": 0.2225, | |
| "mean_token_accuracy": 0.9417333751916885, | |
| "num_tokens": 4538008.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 1.880586564540863, | |
| "epoch": 1.8615384615384616, | |
| "grad_norm": 0.5821724534034729, | |
| "learning_rate": 0.00042321700731504916, | |
| "loss": 0.1144, | |
| "mean_token_accuracy": 0.9677341282367706, | |
| "num_tokens": 4546950.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 2.0066279470920563, | |
| "epoch": 1.865158371040724, | |
| "grad_norm": 0.4095056354999542, | |
| "learning_rate": 0.0004228645047609335, | |
| "loss": 0.0424, | |
| "mean_token_accuracy": 0.9854962974786758, | |
| "num_tokens": 4555452.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 2.042815536260605, | |
| "epoch": 1.8687782805429864, | |
| "grad_norm": 0.5398769974708557, | |
| "learning_rate": 0.0004225114635772555, | |
| "loss": 0.1343, | |
| "mean_token_accuracy": 0.9615450948476791, | |
| "num_tokens": 4564386.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 2.0948933362960815, | |
| "epoch": 1.8723981900452489, | |
| "grad_norm": 0.6738974452018738, | |
| "learning_rate": 0.0004221578851049107, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9526563137769699, | |
| "num_tokens": 4573041.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 2.102545380592346, | |
| "epoch": 1.8760180995475113, | |
| "grad_norm": 0.7769943475723267, | |
| "learning_rate": 0.00042180377068683504, | |
| "loss": 0.2362, | |
| "mean_token_accuracy": 0.9472651779651642, | |
| "num_tokens": 4581666.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 2.087820291519165, | |
| "epoch": 1.8796380090497737, | |
| "grad_norm": 0.5722424983978271, | |
| "learning_rate": 0.0004214491216680004, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9537082612514496, | |
| "num_tokens": 4590238.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 2.0093430876731873, | |
| "epoch": 1.8832579185520362, | |
| "grad_norm": 0.5844932198524475, | |
| "learning_rate": 0.00042109393939540867, | |
| "loss": 0.1485, | |
| "mean_token_accuracy": 0.9624215811491013, | |
| "num_tokens": 4599352.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 1.9117147326469421, | |
| "epoch": 1.8868778280542986, | |
| "grad_norm": 0.46085676550865173, | |
| "learning_rate": 0.0004207382252180876, | |
| "loss": 0.0853, | |
| "mean_token_accuracy": 0.9769327491521835, | |
| "num_tokens": 4608571.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 2.0205602943897247, | |
| "epoch": 1.890497737556561, | |
| "grad_norm": 0.5571608543395996, | |
| "learning_rate": 0.000420381980487085, | |
| "loss": 0.1517, | |
| "mean_token_accuracy": 0.9646699875593185, | |
| "num_tokens": 4617445.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 1.9571953415870667, | |
| "epoch": 1.8941176470588235, | |
| "grad_norm": 0.470630943775177, | |
| "learning_rate": 0.0004200252065554636, | |
| "loss": 0.1005, | |
| "mean_token_accuracy": 0.9750025719404221, | |
| "num_tokens": 4626756.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 2.063209116458893, | |
| "epoch": 1.897737556561086, | |
| "grad_norm": 0.6447069644927979, | |
| "learning_rate": 0.00041966790477829637, | |
| "loss": 0.113, | |
| "mean_token_accuracy": 0.9695079624652863, | |
| "num_tokens": 4635378.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.9232109785079956, | |
| "epoch": 1.9013574660633483, | |
| "grad_norm": 0.5114295482635498, | |
| "learning_rate": 0.000419310076512661, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9653338938951492, | |
| "num_tokens": 4644769.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 2.1691197752952576, | |
| "epoch": 1.9049773755656108, | |
| "grad_norm": 0.7630137205123901, | |
| "learning_rate": 0.00041895172311763476, | |
| "loss": 0.212, | |
| "mean_token_accuracy": 0.9533941894769669, | |
| "num_tokens": 4652857.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 2.04753240942955, | |
| "epoch": 1.9085972850678732, | |
| "grad_norm": 0.6423042416572571, | |
| "learning_rate": 0.00041859284595428955, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.956505224108696, | |
| "num_tokens": 4661591.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 1.9440338611602783, | |
| "epoch": 1.9122171945701356, | |
| "grad_norm": 0.5011327266693115, | |
| "learning_rate": 0.00041823344638568656, | |
| "loss": 0.1255, | |
| "mean_token_accuracy": 0.965131089091301, | |
| "num_tokens": 4670594.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 2.0554805397987366, | |
| "epoch": 1.915837104072398, | |
| "grad_norm": 0.5821590423583984, | |
| "learning_rate": 0.0004178735257768713, | |
| "loss": 0.0486, | |
| "mean_token_accuracy": 0.9875282496213913, | |
| "num_tokens": 4679344.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 2.130349576473236, | |
| "epoch": 1.9194570135746605, | |
| "grad_norm": 0.5332052111625671, | |
| "learning_rate": 0.0004175130854948679, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9737034440040588, | |
| "num_tokens": 4687922.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 2.146788775920868, | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.5016877055168152, | |
| "learning_rate": 0.00041715212690867455, | |
| "loss": 0.1281, | |
| "mean_token_accuracy": 0.9681432545185089, | |
| "num_tokens": 4696593.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 2.041268438100815, | |
| "epoch": 1.9266968325791856, | |
| "grad_norm": 0.5257729887962341, | |
| "learning_rate": 0.00041679065138925807, | |
| "loss": 0.1272, | |
| "mean_token_accuracy": 0.9649266451597214, | |
| "num_tokens": 4705792.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 2.114819645881653, | |
| "epoch": 1.930316742081448, | |
| "grad_norm": 0.7085135579109192, | |
| "learning_rate": 0.0004164286603095484, | |
| "loss": 0.1545, | |
| "mean_token_accuracy": 0.9581228941679001, | |
| "num_tokens": 4714599.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 2.022280514240265, | |
| "epoch": 1.9339366515837104, | |
| "grad_norm": 0.5309014320373535, | |
| "learning_rate": 0.00041606615504443387, | |
| "loss": 0.1933, | |
| "mean_token_accuracy": 0.9562340676784515, | |
| "num_tokens": 4724062.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 2.0959260165691376, | |
| "epoch": 1.9375565610859729, | |
| "grad_norm": 0.6528061628341675, | |
| "learning_rate": 0.0004157031369707557, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.9612343460321426, | |
| "num_tokens": 4733077.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 2.2772948145866394, | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 0.7351471185684204, | |
| "learning_rate": 0.0004153396074673028, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9608108699321747, | |
| "num_tokens": 4741201.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 2.0935052037239075, | |
| "epoch": 1.9447963800904977, | |
| "grad_norm": 0.5435840487480164, | |
| "learning_rate": 0.0004149755679148065, | |
| "loss": 0.0884, | |
| "mean_token_accuracy": 0.9745689779520035, | |
| "num_tokens": 4750306.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 2.2082818746566772, | |
| "epoch": 1.9484162895927601, | |
| "grad_norm": 0.3780331611633301, | |
| "learning_rate": 0.00041461101969593537, | |
| "loss": 0.0739, | |
| "mean_token_accuracy": 0.9777179658412933, | |
| "num_tokens": 4758954.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 2.1683040261268616, | |
| "epoch": 1.9520361990950226, | |
| "grad_norm": 0.4637961685657501, | |
| "learning_rate": 0.00041424596419529017, | |
| "loss": 0.0632, | |
| "mean_token_accuracy": 0.9834533184766769, | |
| "num_tokens": 4767615.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 2.075555235147476, | |
| "epoch": 1.9556561085972852, | |
| "grad_norm": 0.7603118419647217, | |
| "learning_rate": 0.00041388040279939804, | |
| "loss": 0.2835, | |
| "mean_token_accuracy": 0.9364205300807953, | |
| "num_tokens": 4776714.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 2.18926739692688, | |
| "epoch": 1.9592760180995477, | |
| "grad_norm": 0.8895708918571472, | |
| "learning_rate": 0.0004135143368967079, | |
| "loss": 0.2514, | |
| "mean_token_accuracy": 0.9361050724983215, | |
| "num_tokens": 4785402.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 2.2387169003486633, | |
| "epoch": 1.96289592760181, | |
| "grad_norm": 0.6013544797897339, | |
| "learning_rate": 0.00041314776787758454, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9594238847494125, | |
| "num_tokens": 4793928.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 2.208383619785309, | |
| "epoch": 1.9665158371040725, | |
| "grad_norm": 0.6934756636619568, | |
| "learning_rate": 0.00041278069713430386, | |
| "loss": 0.1777, | |
| "mean_token_accuracy": 0.9619583487510681, | |
| "num_tokens": 4802612.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 2.2621757984161377, | |
| "epoch": 1.970135746606335, | |
| "grad_norm": 0.6920077800750732, | |
| "learning_rate": 0.00041241312606104743, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9594835937023163, | |
| "num_tokens": 4811332.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 2.2654454112052917, | |
| "epoch": 1.9737556561085974, | |
| "grad_norm": 0.6259592771530151, | |
| "learning_rate": 0.000412045056053897, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.9648078680038452, | |
| "num_tokens": 4820441.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 2.218056857585907, | |
| "epoch": 1.9773755656108598, | |
| "grad_norm": 0.5390617847442627, | |
| "learning_rate": 0.0004116764885108292, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9595656991004944, | |
| "num_tokens": 4829437.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 2.2571592330932617, | |
| "epoch": 1.9809954751131222, | |
| "grad_norm": 0.3656528890132904, | |
| "learning_rate": 0.0004113074248317108, | |
| "loss": 0.0545, | |
| "mean_token_accuracy": 0.9825418293476105, | |
| "num_tokens": 4838118.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 2.1890549659729004, | |
| "epoch": 1.9846153846153847, | |
| "grad_norm": 0.5716155767440796, | |
| "learning_rate": 0.00041093786641829247, | |
| "loss": 0.0997, | |
| "mean_token_accuracy": 0.9715700745582581, | |
| "num_tokens": 4847073.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 2.2726192474365234, | |
| "epoch": 1.988235294117647, | |
| "grad_norm": 0.4709530770778656, | |
| "learning_rate": 0.0004105678146742042, | |
| "loss": 0.0746, | |
| "mean_token_accuracy": 0.9799739569425583, | |
| "num_tokens": 4855755.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 2.2328362464904785, | |
| "epoch": 1.9918552036199095, | |
| "grad_norm": 0.6773779392242432, | |
| "learning_rate": 0.0004101972710049498, | |
| "loss": 0.1418, | |
| "mean_token_accuracy": 0.9629421681165695, | |
| "num_tokens": 4864601.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 2.199812740087509, | |
| "epoch": 1.995475113122172, | |
| "grad_norm": 0.717012882232666, | |
| "learning_rate": 0.00040982623681790113, | |
| "loss": 0.2948, | |
| "mean_token_accuracy": 0.9432803690433502, | |
| "num_tokens": 4873630.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 2.2102787494659424, | |
| "epoch": 1.9990950226244344, | |
| "grad_norm": 0.6925314664840698, | |
| "learning_rate": 0.00040945471352229346, | |
| "loss": 0.2579, | |
| "mean_token_accuracy": 0.9435124397277832, | |
| "num_tokens": 4882714.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 2.3318979740142822, | |
| "epoch": 2.0, | |
| "grad_norm": 2.688188314437866, | |
| "learning_rate": 0.0004090827025292197, | |
| "loss": 0.0283, | |
| "mean_token_accuracy": 0.9918032884597778, | |
| "num_tokens": 4883450.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_entropy": 2.2165925522160723, | |
| "eval_loss": 0.16817161440849304, | |
| "eval_mean_token_accuracy": 0.9567220133494555, | |
| "eval_num_tokens": 4883450.0, | |
| "eval_runtime": 116.1556, | |
| "eval_samples_per_second": 3.177, | |
| "eval_steps_per_second": 1.059, | |
| "step": 554 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1662, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.634384518674615e+17, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |