{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 250, "global_step": 530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.825, "epoch": 0.01890359168241966, "grad_norm": 157.0, "learning_rate": 5.925925925925926e-06, "loss": 2.9876, "mean_token_accuracy": 0.6893173575401306, "num_input_tokens_seen": 115216, "num_tokens": 114489.0, "step": 5, "train_runtime": 4.4379, "train_tokens_per_second": 25961.602 }, { "entropy": 0.8890625, "epoch": 0.03780718336483932, "grad_norm": 24.5, "learning_rate": 1.3333333333333333e-05, "loss": 0.6662, "mean_token_accuracy": 0.8326915562152862, "num_input_tokens_seen": 230592, "num_tokens": 229109.0, "step": 10, "train_runtime": 7.7641, "train_tokens_per_second": 29699.859 }, { "entropy": 1.02890625, "epoch": 0.05671077504725898, "grad_norm": 18.625, "learning_rate": 2.074074074074074e-05, "loss": 0.6027, "mean_token_accuracy": 0.8529165983200073, "num_input_tokens_seen": 345600, "num_tokens": 343545.0, "step": 15, "train_runtime": 13.2817, "train_tokens_per_second": 26020.836 }, { "entropy": 1.1421875, "epoch": 0.07561436672967864, "grad_norm": 19.125, "learning_rate": 2.814814814814815e-05, "loss": 0.4447, "mean_token_accuracy": 0.881816154718399, "num_input_tokens_seen": 461282, "num_tokens": 458335.0, "step": 20, "train_runtime": 16.8766, "train_tokens_per_second": 27332.56 }, { "entropy": 1.21640625, "epoch": 0.0945179584120983, "grad_norm": 21.75, "learning_rate": 3.555555555555555e-05, "loss": 0.3989, "mean_token_accuracy": 0.8929793298244476, "num_input_tokens_seen": 576346, "num_tokens": 572824.0, "step": 25, "train_runtime": 22.5256, "train_tokens_per_second": 25586.277 }, { "entropy": 1.26484375, "epoch": 0.11342155009451796, "grad_norm": 13.625, "learning_rate": 3.999843966403289e-05, "loss": 0.4872, "mean_token_accuracy": 0.8781549751758575, "num_input_tokens_seen": 691188, "num_tokens": 687152.0, "step": 30, "train_runtime": 26.0077, "train_tokens_per_second": 26576.234 }, { "entropy": 1.28515625, "epoch": 0.1323251417769376, "grad_norm": 17.375, "learning_rate": 3.99808886803243e-05, "loss": 0.28, "mean_token_accuracy": 0.9074305832386017, "num_input_tokens_seen": 806740, "num_tokens": 801973.0, "step": 35, "train_runtime": 29.7225, "train_tokens_per_second": 27142.403 }, { "entropy": 1.2984375, "epoch": 0.15122873345935728, "grad_norm": 12.625, "learning_rate": 3.994385346473689e-05, "loss": 0.356, "mean_token_accuracy": 0.9146295249462127, "num_input_tokens_seen": 921796, "num_tokens": 916426.0, "step": 40, "train_runtime": 34.6767, "train_tokens_per_second": 26582.553 }, { "entropy": 1.56953125, "epoch": 0.17013232514177692, "grad_norm": 11.4375, "learning_rate": 3.9887370131917e-05, "loss": 0.3933, "mean_token_accuracy": 0.9064954161643982, "num_input_tokens_seen": 1036824, "num_tokens": 1030824.0, "step": 45, "train_runtime": 38.0075, "train_tokens_per_second": 27279.472 }, { "entropy": 1.7515625, "epoch": 0.1890359168241966, "grad_norm": 15.1875, "learning_rate": 3.981149376121427e-05, "loss": 0.2873, "mean_token_accuracy": 0.9260397672653198, "num_input_tokens_seen": 1152356, "num_tokens": 1145500.0, "step": 50, "train_runtime": 43.1792, "train_tokens_per_second": 26687.759 }, { "entropy": 1.75859375, "epoch": 0.20793950850661624, "grad_norm": 8.5625, "learning_rate": 3.97162983429714e-05, "loss": 0.3322, "mean_token_accuracy": 0.9256749033927918, "num_input_tokens_seen": 1267634, "num_tokens": 1260057.0, "step": 55, "train_runtime": 46.6166, "train_tokens_per_second": 27192.755 }, { "entropy": 1.734375, "epoch": 0.22684310018903592, "grad_norm": 11.6875, "learning_rate": 3.960187670637294e-05, "loss": 0.2865, "mean_token_accuracy": 0.9282522916793823, "num_input_tokens_seen": 1383494, "num_tokens": 1374973.0, "step": 60, "train_runtime": 50.5007, "train_tokens_per_second": 27395.522 }, { "entropy": 1.70859375, "epoch": 0.24574669187145556, "grad_norm": 9.75, "learning_rate": 3.946834042892355e-05, "loss": 0.2277, "mean_token_accuracy": 0.9320353448390961, "num_input_tokens_seen": 1499052, "num_tokens": 1489683.0, "step": 65, "train_runtime": 55.2907, "train_tokens_per_second": 27112.206 }, { "entropy": 1.6578125, "epoch": 0.2646502835538752, "grad_norm": 10.125, "learning_rate": 3.931581972764386e-05, "loss": 0.2733, "mean_token_accuracy": 0.9363594233989716, "num_input_tokens_seen": 1614146, "num_tokens": 1604106.0, "step": 70, "train_runtime": 58.5263, "train_tokens_per_second": 27579.832 }, { "entropy": 1.57109375, "epoch": 0.2835538752362949, "grad_norm": 6.34375, "learning_rate": 3.91444633320903e-05, "loss": 0.2164, "mean_token_accuracy": 0.9349239528179168, "num_input_tokens_seen": 1729362, "num_tokens": 1718632.0, "step": 75, "train_runtime": 63.5177, "train_tokens_per_second": 27226.454 }, { "entropy": 1.57734375, "epoch": 0.30245746691871456, "grad_norm": 10.875, "learning_rate": 3.8954438339322366e-05, "loss": 0.2173, "mean_token_accuracy": 0.9350460767745972, "num_input_tokens_seen": 1844444, "num_tokens": 1833068.0, "step": 80, "train_runtime": 66.8194, "train_tokens_per_second": 27603.402 }, { "entropy": 1.62421875, "epoch": 0.32136105860113423, "grad_norm": 11.0, "learning_rate": 3.874593005095909e-05, "loss": 0.2337, "mean_token_accuracy": 0.929820317029953, "num_input_tokens_seen": 1959682, "num_tokens": 1947640.0, "step": 85, "train_runtime": 70.4744, "train_tokens_per_second": 27806.99 }, { "entropy": 1.71171875, "epoch": 0.34026465028355385, "grad_norm": 8.625, "learning_rate": 3.851914179248333e-05, "loss": 0.2156, "mean_token_accuracy": 0.9308744966983795, "num_input_tokens_seen": 2075138, "num_tokens": 2062310.0, "step": 90, "train_runtime": 75.9345, "train_tokens_per_second": 27327.991 }, { "entropy": 1.86875, "epoch": 0.3591682419659735, "grad_norm": 13.5625, "learning_rate": 3.82742947149703e-05, "loss": 0.2718, "mean_token_accuracy": 0.9264281988143921, "num_input_tokens_seen": 2190160, "num_tokens": 2176716.0, "step": 95, "train_runtime": 79.4416, "train_tokens_per_second": 27569.42 }, { "entropy": 1.94765625, "epoch": 0.3780718336483932, "grad_norm": 6.125, "learning_rate": 3.801162757943359e-05, "loss": 0.3385, "mean_token_accuracy": 0.9164456725120544, "num_input_tokens_seen": 2305250, "num_tokens": 2291230.0, "step": 100, "train_runtime": 84.7105, "train_tokens_per_second": 27213.265 }, { "entropy": 1.846875, "epoch": 0.39697542533081287, "grad_norm": 15.1875, "learning_rate": 3.773139652399884e-05, "loss": 0.1811, "mean_token_accuracy": 0.944804173707962, "num_input_tokens_seen": 2420666, "num_tokens": 2405904.0, "step": 105, "train_runtime": 88.7231, "train_tokens_per_second": 27283.383 }, { "entropy": 1.84765625, "epoch": 0.4158790170132325, "grad_norm": 6.3125, "learning_rate": 3.743387481413243e-05, "loss": 0.1974, "mean_token_accuracy": 0.9379207909107208, "num_input_tokens_seen": 2535606, "num_tokens": 2520235.0, "step": 110, "train_runtime": 93.0343, "train_tokens_per_second": 27254.523 }, { "entropy": 1.83984375, "epoch": 0.43478260869565216, "grad_norm": 2.875, "learning_rate": 3.711935257616842e-05, "loss": 0.1266, "mean_token_accuracy": 0.9594786465167999, "num_input_tokens_seen": 2650514, "num_tokens": 2634592.0, "step": 115, "train_runtime": 98.3865, "train_tokens_per_second": 26939.815 }, { "entropy": 1.78984375, "epoch": 0.45368620037807184, "grad_norm": 3.6875, "learning_rate": 3.678813651439376e-05, "loss": 0.1993, "mean_token_accuracy": 0.9459972441196441, "num_input_tokens_seen": 2766004, "num_tokens": 2749299.0, "step": 120, "train_runtime": 102.0946, "train_tokens_per_second": 27092.567 }, { "entropy": 1.78828125, "epoch": 0.4725897920604915, "grad_norm": 9.1875, "learning_rate": 3.6440549611967656e-05, "loss": 0.2075, "mean_token_accuracy": 0.940614128112793, "num_input_tokens_seen": 2880990, "num_tokens": 2863713.0, "step": 125, "train_runtime": 107.8735, "train_tokens_per_second": 26707.121 }, { "entropy": 1.85234375, "epoch": 0.4914933837429111, "grad_norm": 7.15625, "learning_rate": 3.6076930815966654e-05, "loss": 0.236, "mean_token_accuracy": 0.9343804061412812, "num_input_tokens_seen": 2995844, "num_tokens": 2978032.0, "step": 130, "train_runtime": 111.3362, "train_tokens_per_second": 26908.095 }, { "entropy": 1.95859375, "epoch": 0.5103969754253308, "grad_norm": 7.375, "learning_rate": 3.569763470686262e-05, "loss": 0.162, "mean_token_accuracy": 0.9484993875026703, "num_input_tokens_seen": 3111092, "num_tokens": 3092605.0, "step": 135, "train_runtime": 115.8418, "train_tokens_per_second": 26856.393 }, { "entropy": 1.99921875, "epoch": 0.5293005671077504, "grad_norm": 7.125, "learning_rate": 3.530303115275597e-05, "loss": 0.1892, "mean_token_accuracy": 0.9394895970821381, "num_input_tokens_seen": 3226396, "num_tokens": 3207190.0, "step": 140, "train_runtime": 120.7172, "train_tokens_per_second": 26726.892 }, { "entropy": 1.96484375, "epoch": 0.5482041587901701, "grad_norm": 4.5625, "learning_rate": 3.4893504948701185e-05, "loss": 0.1614, "mean_token_accuracy": 0.9600624740123749, "num_input_tokens_seen": 3341802, "num_tokens": 3321840.0, "step": 145, "train_runtime": 124.4268, "train_tokens_per_second": 26857.576 }, { "entropy": 1.90859375, "epoch": 0.5671077504725898, "grad_norm": 6.96875, "learning_rate": 3.4469455441476475e-05, "loss": 0.1334, "mean_token_accuracy": 0.9625543296337128, "num_input_tokens_seen": 3456964, "num_tokens": 3436339.0, "step": 150, "train_runtime": 130.3081, "train_tokens_per_second": 26529.148 }, { "entropy": 1.92421875, "epoch": 0.5860113421550095, "grad_norm": 12.4375, "learning_rate": 3.403129614016339e-05, "loss": 0.1427, "mean_token_accuracy": 0.9588114261627197, "num_input_tokens_seen": 3572084, "num_tokens": 3550813.0, "step": 155, "train_runtime": 133.8989, "train_tokens_per_second": 26677.47 }, { "entropy": 1.98671875, "epoch": 0.6049149338374291, "grad_norm": 7.3125, "learning_rate": 3.357945431291618e-05, "loss": 0.2129, "mean_token_accuracy": 0.9367718935012818, "num_input_tokens_seen": 3687248, "num_tokens": 3665300.0, "step": 160, "train_runtime": 138.2948, "train_tokens_per_second": 26662.235 }, { "entropy": 2.1359375, "epoch": 0.6238185255198487, "grad_norm": 3.09375, "learning_rate": 3.311437057031406e-05, "loss": 0.2219, "mean_token_accuracy": 0.9387097895145416, "num_input_tokens_seen": 3802458, "num_tokens": 3779809.0, "step": 165, "train_runtime": 142.569, "train_tokens_per_second": 26671.004 }, { "entropy": 2.0859375, "epoch": 0.6427221172022685, "grad_norm": 4.53125, "learning_rate": 3.263649843570271e-05, "loss": 0.1355, "mean_token_accuracy": 0.9585716307163239, "num_input_tokens_seen": 3917580, "num_tokens": 3894322.0, "step": 170, "train_runtime": 145.9767, "train_tokens_per_second": 26837.021 }, { "entropy": 1.946875, "epoch": 0.6616257088846881, "grad_norm": 6.53125, "learning_rate": 3.214630390294396e-05, "loss": 0.2962, "mean_token_accuracy": 0.9372412860393524, "num_input_tokens_seen": 4032748, "num_tokens": 4008844.0, "step": 175, "train_runtime": 151.6027, "train_tokens_per_second": 26600.765 }, { "entropy": 1.98671875, "epoch": 0.6805293005671077, "grad_norm": 6.96875, "learning_rate": 3.1644264982005e-05, "loss": 0.1841, "mean_token_accuracy": 0.9490657150745392, "num_input_tokens_seen": 4148142, "num_tokens": 4123487.0, "step": 180, "train_runtime": 154.9764, "train_tokens_per_second": 26766.274 }, { "entropy": 2.021875, "epoch": 0.6994328922495274, "grad_norm": 2.953125, "learning_rate": 3.113087123283002e-05, "loss": 0.124, "mean_token_accuracy": 0.964401924610138, "num_input_tokens_seen": 4263312, "num_tokens": 4238014.0, "step": 185, "train_runtime": 159.4694, "train_tokens_per_second": 26734.354 }, { "entropy": 1.96171875, "epoch": 0.718336483931947, "grad_norm": 3.4375, "learning_rate": 3.060662328794916e-05, "loss": 0.1498, "mean_token_accuracy": 0.9481843888759613, "num_input_tokens_seen": 4378630, "num_tokens": 4352627.0, "step": 190, "train_runtime": 163.6223, "train_tokens_per_second": 26760.595 }, { "entropy": 1.9640625, "epoch": 0.7372400756143668, "grad_norm": 4.1875, "learning_rate": 3.0072032364289914e-05, "loss": 0.1076, "mean_token_accuracy": 0.9691859900951385, "num_input_tokens_seen": 4493600, "num_tokens": 4467053.0, "step": 195, "train_runtime": 166.9247, "train_tokens_per_second": 26919.915 }, { "entropy": 2.02734375, "epoch": 0.7561436672967864, "grad_norm": 3.875, "learning_rate": 2.9527619764667376e-05, "loss": 0.2501, "mean_token_accuracy": 0.9455641567707062, "num_input_tokens_seen": 4609216, "num_tokens": 4581812.0, "step": 200, "train_runtime": 172.0695, "train_tokens_per_second": 26786.938 }, { "entropy": 2.14375, "epoch": 0.775047258979206, "grad_norm": 5.4375, "learning_rate": 2.8973916369439194e-05, "loss": 0.2157, "mean_token_accuracy": 0.9492439985275268, "num_input_tokens_seen": 4724086, "num_tokens": 4696178.0, "step": 205, "train_runtime": 175.6473, "train_tokens_per_second": 26895.294 }, { "entropy": 2.2625, "epoch": 0.7939508506616257, "grad_norm": 3.75, "learning_rate": 2.84114621188211e-05, "loss": 0.1762, "mean_token_accuracy": 0.9574925601482391, "num_input_tokens_seen": 4839702, "num_tokens": 4810939.0, "step": 210, "train_runtime": 180.4712, "train_tokens_per_second": 26817.036 }, { "entropy": 2.2953125, "epoch": 0.8128544423440454, "grad_norm": 3.9375, "learning_rate": 2.7840805486367792e-05, "loss": 0.1703, "mean_token_accuracy": 0.9540181159973145, "num_input_tokens_seen": 4955098, "num_tokens": 4925591.0, "step": 215, "train_runtime": 184.4177, "train_tokens_per_second": 26868.891 }, { "entropy": 2.2828125, "epoch": 0.831758034026465, "grad_norm": 4.625, "learning_rate": 2.7262502944132526e-05, "loss": 0.0938, "mean_token_accuracy": 0.9725252389907837, "num_input_tokens_seen": 5070258, "num_tokens": 5040089.0, "step": 220, "train_runtime": 188.065, "train_tokens_per_second": 26960.132 }, { "entropy": 2.1265625, "epoch": 0.8506616257088847, "grad_norm": 5.71875, "learning_rate": 2.667711842002707e-05, "loss": 0.1704, "mean_token_accuracy": 0.9579161703586578, "num_input_tokens_seen": 5185478, "num_tokens": 5154604.0, "step": 225, "train_runtime": 192.8301, "train_tokens_per_second": 26891.43 }, { "entropy": 2.0484375, "epoch": 0.8695652173913043, "grad_norm": 4.3125, "learning_rate": 2.6085222747911155e-05, "loss": 0.4284, "mean_token_accuracy": 0.9190201222896576, "num_input_tokens_seen": 5301020, "num_tokens": 5269357.0, "step": 230, "train_runtime": 196.1744, "train_tokens_per_second": 27021.971 }, { "entropy": 1.98671875, "epoch": 0.888468809073724, "grad_norm": 13.3125, "learning_rate": 2.5487393110947557e-05, "loss": 0.1346, "mean_token_accuracy": 0.9579481542110443, "num_input_tokens_seen": 5416464, "num_tokens": 5384069.0, "step": 235, "train_runtime": 201.21, "train_tokens_per_second": 26919.463 }, { "entropy": 1.9875, "epoch": 0.9073724007561437, "grad_norm": 3.84375, "learning_rate": 2.4884212478765747e-05, "loss": 0.097, "mean_token_accuracy": 0.9672803819179535, "num_input_tokens_seen": 5531644, "num_tokens": 5498568.0, "step": 240, "train_runtime": 205.075, "train_tokens_per_second": 26973.766 }, { "entropy": 2.00234375, "epoch": 0.9262759924385633, "grad_norm": 4.9375, "learning_rate": 2.427626903898292e-05, "loss": 0.2298, "mean_token_accuracy": 0.9443018674850464, "num_input_tokens_seen": 5646952, "num_tokens": 5613157.0, "step": 245, "train_runtime": 208.4891, "train_tokens_per_second": 27085.115 }, { "entropy": 2.0140625, "epoch": 0.945179584120983, "grad_norm": 6.03125, "learning_rate": 2.3664155623636715e-05, "loss": 0.1732, "mean_token_accuracy": 0.9442705571651459, "num_input_tokens_seen": 5762366, "num_tokens": 5727795.0, "step": 250, "train_runtime": 214.059, "train_tokens_per_second": 26919.525 }, { "entropy": 2.0125, "epoch": 0.9640831758034026, "grad_norm": 3.546875, "learning_rate": 2.304846913108891e-05, "loss": 0.1083, "mean_token_accuracy": 0.9664817750453949, "num_input_tokens_seen": 5877646, "num_tokens": 5842437.0, "step": 255, "train_runtime": 275.7098, "train_tokens_per_second": 21318.232 }, { "entropy": 2.0, "epoch": 0.9829867674858223, "grad_norm": 2.671875, "learning_rate": 2.242980994396401e-05, "loss": 0.0875, "mean_token_accuracy": 0.9795427262783051, "num_input_tokens_seen": 5992710, "num_tokens": 5956870.0, "step": 260, "train_runtime": 280.9684, "train_tokens_per_second": 21328.766 }, { "entropy": 1.9513888888888888, "epoch": 1.0, "grad_norm": 7.4375, "learning_rate": 2.1808781343690027e-05, "loss": 0.1654, "mean_token_accuracy": 0.9603289763132731, "num_input_tokens_seen": 6096342, "num_tokens": 6059927.0, "step": 265, "train_runtime": 284.3725, "train_tokens_per_second": 21437.877 }, { "entropy": 1.903125, "epoch": 1.0189035916824196, "grad_norm": 3.453125, "learning_rate": 2.118598892221257e-05, "loss": 0.0783, "mean_token_accuracy": 0.9817151129245758, "num_input_tokens_seen": 6211574, "num_tokens": 6174483.0, "step": 270, "train_runtime": 288.2049, "train_tokens_per_second": 21552.63 }, { "entropy": 1.84375, "epoch": 1.0378071833648392, "grad_norm": 1.734375, "learning_rate": 2.0562039991455877e-05, "loss": 0.1214, "mean_token_accuracy": 0.9741188943386078, "num_input_tokens_seen": 6327000, "num_tokens": 6289163.0, "step": 275, "train_runtime": 293.7126, "train_tokens_per_second": 21541.469 }, { "entropy": 1.8421875, "epoch": 1.056710775047259, "grad_norm": 3.78125, "learning_rate": 1.99375429911066e-05, "loss": 0.1393, "mean_token_accuracy": 0.9579156279563904, "num_input_tokens_seen": 6442290, "num_tokens": 6403766.0, "step": 280, "train_runtime": 297.1668, "train_tokens_per_second": 21679.038 }, { "entropy": 1.85078125, "epoch": 1.0756143667296787, "grad_norm": 3.953125, "learning_rate": 1.931310689529781e-05, "loss": 0.0872, "mean_token_accuracy": 0.9788394093513488, "num_input_tokens_seen": 6557852, "num_tokens": 6518469.0, "step": 285, "train_runtime": 301.7702, "train_tokens_per_second": 21731.276 }, { "entropy": 1.8234375, "epoch": 1.0945179584120983, "grad_norm": 7.1875, "learning_rate": 1.8689340618771937e-05, "loss": 0.0637, "mean_token_accuracy": 0.972537738084793, "num_input_tokens_seen": 6673032, "num_tokens": 6632963.0, "step": 290, "train_runtime": 306.4769, "train_tokens_per_second": 21773.362 }, { "entropy": 1.78359375, "epoch": 1.113421550094518, "grad_norm": 5.78125, "learning_rate": 1.806685242310156e-05, "loss": 0.0565, "mean_token_accuracy": 0.9854797184467315, "num_input_tokens_seen": 6788174, "num_tokens": 6747403.0, "step": 295, "train_runtime": 310.3851, "train_tokens_per_second": 21870.17 }, { "entropy": 1.76015625, "epoch": 1.1323251417769375, "grad_norm": 8.8125, "learning_rate": 1.7446249323547117e-05, "loss": 0.0973, "mean_token_accuracy": 0.9734237968921662, "num_input_tokens_seen": 6903146, "num_tokens": 6861788.0, "step": 300, "train_runtime": 315.4655, "train_tokens_per_second": 21882.41 }, { "entropy": 1.75078125, "epoch": 1.1512287334593574, "grad_norm": 1.4453125, "learning_rate": 1.6828136497130014e-05, "loss": 0.0681, "mean_token_accuracy": 0.9820096373558045, "num_input_tokens_seen": 7018350, "num_tokens": 6976277.0, "step": 305, "train_runtime": 319.0527, "train_tokens_per_second": 21997.465 }, { "entropy": 1.740625, "epoch": 1.170132325141777, "grad_norm": 4.90625, "learning_rate": 1.6213116692498206e-05, "loss": 0.0625, "mean_token_accuracy": 0.9826828062534332, "num_input_tokens_seen": 7133636, "num_tokens": 7090874.0, "step": 310, "train_runtime": 323.8986, "train_tokens_per_second": 22024.29 }, { "entropy": 1.7328125, "epoch": 1.1890359168241966, "grad_norm": 0.66015625, "learning_rate": 1.560178964215987e-05, "loss": 0.077, "mean_token_accuracy": 0.978941410779953, "num_input_tokens_seen": 7248866, "num_tokens": 7205391.0, "step": 315, "train_runtime": 327.5895, "train_tokens_per_second": 22127.897 }, { "entropy": 1.73203125, "epoch": 1.2079395085066162, "grad_norm": 4.0625, "learning_rate": 1.4994751477658139e-05, "loss": 0.067, "mean_token_accuracy": 0.9818780541419982, "num_input_tokens_seen": 7363900, "num_tokens": 7319827.0, "step": 320, "train_runtime": 331.4598, "train_tokens_per_second": 22216.571 }, { "entropy": 1.73515625, "epoch": 1.2268431001890359, "grad_norm": 2.734375, "learning_rate": 1.4392594148257426e-05, "loss": 0.1153, "mean_token_accuracy": 0.9638942897319793, "num_input_tokens_seen": 7479394, "num_tokens": 7434543.0, "step": 325, "train_runtime": 336.2629, "train_tokens_per_second": 22242.696 }, { "entropy": 1.74609375, "epoch": 1.2457466918714555, "grad_norm": 2.046875, "learning_rate": 1.3795904843707959e-05, "loss": 0.0359, "mean_token_accuracy": 0.9886789560317993, "num_input_tokens_seen": 7594632, "num_tokens": 7549134.0, "step": 330, "train_runtime": 339.6052, "train_tokens_per_second": 22363.12 }, { "entropy": 1.740625, "epoch": 1.264650283553875, "grad_norm": 2.25, "learning_rate": 1.3205265421651588e-05, "loss": 0.0808, "mean_token_accuracy": 0.9852688193321228, "num_input_tokens_seen": 7709704, "num_tokens": 7663583.0, "step": 335, "train_runtime": 344.9458, "train_tokens_per_second": 22350.48 }, { "entropy": 1.75078125, "epoch": 1.283553875236295, "grad_norm": 1.8125, "learning_rate": 1.2621251840227112e-05, "loss": 0.0663, "mean_token_accuracy": 0.9817369997501373, "num_input_tokens_seen": 7824834, "num_tokens": 7778064.0, "step": 340, "train_runtime": 348.223, "train_tokens_per_second": 22470.756 }, { "entropy": 1.75234375, "epoch": 1.3024574669187146, "grad_norm": 4.28125, "learning_rate": 1.2044433596428537e-05, "loss": 0.0678, "mean_token_accuracy": 0.9812626421451569, "num_input_tokens_seen": 7939832, "num_tokens": 7892415.0, "step": 345, "train_runtime": 352.0847, "train_tokens_per_second": 22550.916 }, { "entropy": 1.746875, "epoch": 1.3213610586011342, "grad_norm": 2.703125, "learning_rate": 1.1475373170763819e-05, "loss": 0.0465, "mean_token_accuracy": 0.9823280215263367, "num_input_tokens_seen": 8054988, "num_tokens": 8006926.0, "step": 350, "train_runtime": 357.1271, "train_tokens_per_second": 22554.962 }, { "entropy": 1.74765625, "epoch": 1.3402646502835538, "grad_norm": 1.4921875, "learning_rate": 1.0914625478755672e-05, "loss": 0.1174, "mean_token_accuracy": 0.9695515096187591, "num_input_tokens_seen": 8170098, "num_tokens": 8121373.0, "step": 355, "train_runtime": 360.7524, "train_tokens_per_second": 22647.381 }, { "entropy": 1.74453125, "epoch": 1.3591682419659734, "grad_norm": 1.1015625, "learning_rate": 1.0362737329819413e-05, "loss": 0.045, "mean_token_accuracy": 0.9885900497436524, "num_input_tokens_seen": 8285346, "num_tokens": 8235981.0, "step": 360, "train_runtime": 366.0216, "train_tokens_per_second": 22636.221 }, { "entropy": 1.74296875, "epoch": 1.3780718336483933, "grad_norm": 5.15625, "learning_rate": 9.820246894045316e-06, "loss": 0.0428, "mean_token_accuracy": 0.9822307825088501, "num_input_tokens_seen": 8400240, "num_tokens": 8350356.0, "step": 365, "train_runtime": 369.6364, "train_tokens_per_second": 22725.685 }, { "entropy": 1.73515625, "epoch": 1.3969754253308129, "grad_norm": 4.09375, "learning_rate": 9.28768317740564e-06, "loss": 0.099, "mean_token_accuracy": 0.9710565328598022, "num_input_tokens_seen": 8515740, "num_tokens": 8465025.0, "step": 370, "train_runtime": 373.5701, "train_tokens_per_second": 22795.56 }, { "entropy": 1.7328125, "epoch": 1.4158790170132325, "grad_norm": 4.96875, "learning_rate": 8.765565505897902e-06, "loss": 0.0736, "mean_token_accuracy": 0.9741575241088867, "num_input_tokens_seen": 8631054, "num_tokens": 8579648.0, "step": 375, "train_runtime": 378.7394, "train_tokens_per_second": 22788.901 }, { "entropy": 1.73359375, "epoch": 1.434782608695652, "grad_norm": 3.265625, "learning_rate": 8.254403019127566e-06, "loss": 0.0806, "mean_token_accuracy": 0.9791056990623475, "num_input_tokens_seen": 8746364, "num_tokens": 8694249.0, "step": 380, "train_runtime": 382.0615, "train_tokens_per_second": 22892.552 }, { "entropy": 1.73515625, "epoch": 1.4536862003780717, "grad_norm": 3.75, "learning_rate": 7.754694173823947e-06, "loss": 0.0404, "mean_token_accuracy": 0.9839386224746705, "num_input_tokens_seen": 8861574, "num_tokens": 8808789.0, "step": 385, "train_runtime": 387.2205, "train_tokens_per_second": 22885.084 }, { "entropy": 1.73359375, "epoch": 1.4725897920604916, "grad_norm": 5.09375, "learning_rate": 7.266926257773346e-06, "loss": 0.0926, "mean_token_accuracy": 0.9714232623577118, "num_input_tokens_seen": 8976944, "num_tokens": 8923407.0, "step": 390, "train_runtime": 390.891, "train_tokens_per_second": 22965.336 }, { "entropy": 1.72265625, "epoch": 1.4914933837429112, "grad_norm": 5.0, "learning_rate": 6.7915749146436415e-06, "loss": 0.0519, "mean_token_accuracy": 0.9837916433811188, "num_input_tokens_seen": 9092050, "num_tokens": 9037924.0, "step": 395, "train_runtime": 395.3397, "train_tokens_per_second": 22998.071 }, { "entropy": 1.71796875, "epoch": 1.5103969754253308, "grad_norm": 3.875, "learning_rate": 6.329103680163495e-06, "loss": 0.2115, "mean_token_accuracy": 0.9516554296016693, "num_input_tokens_seen": 9207594, "num_tokens": 9152659.0, "step": 400, "train_runtime": 399.5499, "train_tokens_per_second": 23044.916 }, { "entropy": 1.71640625, "epoch": 1.5293005671077504, "grad_norm": 0.51953125, "learning_rate": 5.879963530108506e-06, "loss": 0.0348, "mean_token_accuracy": 0.9919346511363983, "num_input_tokens_seen": 9322572, "num_tokens": 9267059.0, "step": 405, "train_runtime": 403.4031, "train_tokens_per_second": 23109.815 }, { "entropy": 1.7125, "epoch": 1.54820415879017, "grad_norm": 2.234375, "learning_rate": 5.444592440535177e-06, "loss": 0.0374, "mean_token_accuracy": 0.9837370038032531, "num_input_tokens_seen": 9438004, "num_tokens": 9381725.0, "step": 410, "train_runtime": 407.9692, "train_tokens_per_second": 23134.111 }, { "entropy": 1.7078125, "epoch": 1.5671077504725899, "grad_norm": 3.09375, "learning_rate": 5.023414960691469e-06, "loss": 0.0325, "mean_token_accuracy": 0.9918534696102143, "num_input_tokens_seen": 9553156, "num_tokens": 9496255.0, "step": 415, "train_runtime": 412.1408, "train_tokens_per_second": 23179.35 }, { "entropy": 1.703125, "epoch": 1.5860113421550095, "grad_norm": 4.78125, "learning_rate": 4.616841799020364e-06, "loss": 0.0618, "mean_token_accuracy": 0.9808044970035553, "num_input_tokens_seen": 9668364, "num_tokens": 9610808.0, "step": 420, "train_runtime": 416.3235, "train_tokens_per_second": 23223.203 }, { "entropy": 1.703125, "epoch": 1.6049149338374291, "grad_norm": 6.1875, "learning_rate": 4.225269422660258e-06, "loss": 0.0493, "mean_token_accuracy": 0.9843941271305084, "num_input_tokens_seen": 9783552, "num_tokens": 9725283.0, "step": 425, "train_runtime": 421.1316, "train_tokens_per_second": 23231.576 }, { "entropy": 1.70390625, "epoch": 1.6238185255198487, "grad_norm": 1.5078125, "learning_rate": 3.8490796708326404e-06, "loss": 0.0595, "mean_token_accuracy": 0.9822299420833588, "num_input_tokens_seen": 9898934, "num_tokens": 9839878.0, "step": 430, "train_runtime": 424.7606, "train_tokens_per_second": 23304.735 }, { "entropy": 1.7015625, "epoch": 1.6427221172022684, "grad_norm": 1.328125, "learning_rate": 3.4886393824940924e-06, "loss": 0.059, "mean_token_accuracy": 0.9807979345321656, "num_input_tokens_seen": 10014142, "num_tokens": 9954403.0, "step": 435, "train_runtime": 429.8927, "train_tokens_per_second": 23294.514 }, { "entropy": 1.70390625, "epoch": 1.6616257088846882, "grad_norm": 2.09375, "learning_rate": 3.144300038615691e-06, "loss": 0.0574, "mean_token_accuracy": 0.9839386105537414, "num_input_tokens_seen": 10129264, "num_tokens": 10068933.0, "step": 440, "train_runtime": 433.4828, "train_tokens_per_second": 23367.164 }, { "entropy": 1.6984375, "epoch": 1.6805293005671076, "grad_norm": 4.25, "learning_rate": 2.8163974194386766e-06, "loss": 0.0669, "mean_token_accuracy": 0.9792383193969727, "num_input_tokens_seen": 10244732, "num_tokens": 10183591.0, "step": 445, "train_runtime": 437.9792, "train_tokens_per_second": 23390.909 }, { "entropy": 1.7015625, "epoch": 1.6994328922495274, "grad_norm": 3.46875, "learning_rate": 2.5052512770405434e-06, "loss": 0.0801, "mean_token_accuracy": 0.9761136710643769, "num_input_tokens_seen": 10360212, "num_tokens": 10298251.0, "step": 450, "train_runtime": 442.481, "train_tokens_per_second": 23413.915 }, { "entropy": 1.70234375, "epoch": 1.718336483931947, "grad_norm": 0.59765625, "learning_rate": 2.2111650235309147e-06, "loss": 0.0297, "mean_token_accuracy": 0.9904489517211914, "num_input_tokens_seen": 10475400, "num_tokens": 10412810.0, "step": 455, "train_runtime": 446.3738, "train_tokens_per_second": 23467.773 }, { "entropy": 1.69921875, "epoch": 1.7372400756143667, "grad_norm": 4.0625, "learning_rate": 1.9344254351812287e-06, "loss": 0.0989, "mean_token_accuracy": 0.9743396818637848, "num_input_tokens_seen": 10590710, "num_tokens": 10527389.0, "step": 460, "train_runtime": 451.1755, "train_tokens_per_second": 23473.591 }, { "entropy": 1.703125, "epoch": 1.7561436672967865, "grad_norm": 0.890625, "learning_rate": 1.6753023727767436e-06, "loss": 0.0476, "mean_token_accuracy": 0.9838890075683594, "num_input_tokens_seen": 10705900, "num_tokens": 10641918.0, "step": 465, "train_runtime": 454.754, "train_tokens_per_second": 23542.179 }, { "entropy": 1.7, "epoch": 1.775047258979206, "grad_norm": 1.359375, "learning_rate": 1.4340485184635712e-06, "loss": 0.0556, "mean_token_accuracy": 0.9777659058570862, "num_input_tokens_seen": 10821144, "num_tokens": 10756496.0, "step": 470, "train_runtime": 459.2027, "train_tokens_per_second": 23565.072 }, { "entropy": 1.69921875, "epoch": 1.7939508506616257, "grad_norm": 1.171875, "learning_rate": 1.2108991293473627e-06, "loss": 0.0595, "mean_token_accuracy": 0.9741835057735443, "num_input_tokens_seen": 10936460, "num_tokens": 10871124.0, "step": 475, "train_runtime": 463.6099, "train_tokens_per_second": 23589.79 }, { "entropy": 1.69765625, "epoch": 1.8128544423440454, "grad_norm": 3.265625, "learning_rate": 1.0060718080838683e-06, "loss": 0.0541, "mean_token_accuracy": 0.9831156551837921, "num_input_tokens_seen": 11051508, "num_tokens": 10985594.0, "step": 480, "train_runtime": 467.1593, "train_tokens_per_second": 23656.828 }, { "entropy": 1.70078125, "epoch": 1.831758034026465, "grad_norm": 2.4375, "learning_rate": 8.197662906851534e-07, "loss": 0.0835, "mean_token_accuracy": 0.9726030707359314, "num_input_tokens_seen": 11166904, "num_tokens": 11100230.0, "step": 485, "train_runtime": 472.195, "train_tokens_per_second": 23648.922 }, { "entropy": 1.69921875, "epoch": 1.8506616257088848, "grad_norm": 2.765625, "learning_rate": 6.521642517483573e-07, "loss": 0.0532, "mean_token_accuracy": 0.9853454470634461, "num_input_tokens_seen": 11281802, "num_tokens": 11214624.0, "step": 490, "train_runtime": 475.7718, "train_tokens_per_second": 23712.635 }, { "entropy": 1.70078125, "epoch": 1.8695652173913042, "grad_norm": 2.171875, "learning_rate": 5.034291272968772e-07, "loss": 0.027, "mean_token_accuracy": 0.9934648215770722, "num_input_tokens_seen": 11396946, "num_tokens": 11329098.0, "step": 495, "train_runtime": 480.2436, "train_tokens_per_second": 23731.596 }, { "entropy": 1.6984375, "epoch": 1.888468809073724, "grad_norm": 4.0625, "learning_rate": 3.737059554068334e-07, "loss": 0.0742, "mean_token_accuracy": 0.9744843065738678, "num_input_tokens_seen": 11512282, "num_tokens": 11443715.0, "step": 500, "train_runtime": 484.6792, "train_tokens_per_second": 23752.376 }, { "entropy": 1.69921875, "epoch": 1.9073724007561437, "grad_norm": 6.84375, "learning_rate": 2.631212347741352e-07, "loss": 0.1322, "mean_token_accuracy": 0.9680740118026734, "num_input_tokens_seen": 11627828, "num_tokens": 11558513.0, "step": 505, "train_runtime": 544.5283, "train_tokens_per_second": 21353.945 }, { "entropy": 1.69921875, "epoch": 1.9262759924385633, "grad_norm": 1.0078125, "learning_rate": 1.7178280136011417e-07, "loss": 0.0864, "mean_token_accuracy": 0.9749818980693817, "num_input_tokens_seen": 11743010, "num_tokens": 11673010.0, "step": 510, "train_runtime": 549.7569, "train_tokens_per_second": 21360.369 }, { "entropy": 1.69921875, "epoch": 1.9451795841209831, "grad_norm": 2.5625, "learning_rate": 9.977972323599095e-08, "loss": 0.1175, "mean_token_accuracy": 0.9680160820484162, "num_input_tokens_seen": 11858430, "num_tokens": 11787637.0, "step": 515, "train_runtime": 553.6509, "train_tokens_per_second": 21418.605 }, { "entropy": 1.69765625, "epoch": 1.9640831758034025, "grad_norm": 2.921875, "learning_rate": 4.718221372874254e-08, "loss": 0.0695, "mean_token_accuracy": 0.9804269134998321, "num_input_tokens_seen": 11973576, "num_tokens": 11902111.0, "step": 520, "train_runtime": 557.8609, "train_tokens_per_second": 21463.371 }, { "entropy": 1.69609375, "epoch": 1.9829867674858224, "grad_norm": 5.8125, "learning_rate": 1.4041562953031051e-08, "loss": 0.1152, "mean_token_accuracy": 0.9696780204772949, "num_input_tokens_seen": 12088990, "num_tokens": 12016759.0, "step": 525, "train_runtime": 561.9991, "train_tokens_per_second": 21510.694 }, { "entropy": 1.6961805555555556, "epoch": 2.0, "grad_norm": 3.75, "learning_rate": 3.900877959917004e-10, "loss": 0.0989, "mean_token_accuracy": 0.9715293182267083, "num_input_tokens_seen": 12192662, "num_tokens": 12119827.0, "step": 530, "train_runtime": 565.5622, "train_tokens_per_second": 21558.482 }, { "epoch": 2.0, "num_input_tokens_seen": 12192662, "step": 530, "total_flos": 3.3226637176733696e+16, "train_loss": 0.1822078584218925, "train_runtime": 612.9949, "train_samples_per_second": 27.592, "train_steps_per_second": 0.865, "train_tokens_per_second": 2486.879 } ], "logging_steps": 5, "max_steps": 530, "num_input_tokens_seen": 12192662, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3226637176733696e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }