{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.9247042655944826, "epoch": 0.001, "grad_norm": 30.328035354614258, "learning_rate": 5.333333333333335e-07, "loss": 2.1764, "mean_token_accuracy": 0.5825600981712341, "num_input_tokens_seen": 14021, "num_tokens": 14021.0, "step": 5, "train_runtime": 36.9291, "train_tokens_per_second": 379.673 }, { "entropy": 1.775024700164795, "epoch": 0.002, "grad_norm": 17.90558433532715, "learning_rate": 1.2000000000000002e-06, "loss": 1.9134, "mean_token_accuracy": 0.6386884331703186, "num_input_tokens_seen": 27956, "num_tokens": 27956.0, "step": 10, "train_runtime": 44.4762, "train_tokens_per_second": 628.561 }, { "entropy": 2.0568880081176757, "epoch": 0.003, "grad_norm": 93.14165496826172, "learning_rate": 1.8666666666666669e-06, "loss": 1.9122, "mean_token_accuracy": 0.6272765874862671, "num_input_tokens_seen": 40693, "num_tokens": 40693.0, "step": 15, "train_runtime": 52.5296, "train_tokens_per_second": 774.668 }, { "entropy": 2.5321552753448486, "epoch": 0.004, "grad_norm": 21.415904998779297, "learning_rate": 2.5333333333333338e-06, "loss": 1.7253, "mean_token_accuracy": 0.6245870590209961, "num_input_tokens_seen": 54004, "num_tokens": 54004.0, "step": 20, "train_runtime": 60.5748, "train_tokens_per_second": 891.525 }, { "entropy": 2.5103355646133423, "epoch": 0.005, "grad_norm": 16.27761459350586, "learning_rate": 3.2000000000000003e-06, "loss": 1.4988, "mean_token_accuracy": 0.6652264595031738, "num_input_tokens_seen": 64708, "num_tokens": 64708.0, "step": 25, "train_runtime": 68.6629, "train_tokens_per_second": 942.401 }, { "entropy": 2.3708704233169557, "epoch": 0.006, "grad_norm": 10.451281547546387, "learning_rate": 3.866666666666667e-06, "loss": 1.1486, "mean_token_accuracy": 0.722599184513092, "num_input_tokens_seen": 77700, "num_tokens": 77700.0, "step": 30, "train_runtime": 76.6651, "train_tokens_per_second": 1013.499 }, { "entropy": 1.513807725906372, "epoch": 0.007, "grad_norm": 14.1542329788208, "learning_rate": 4.533333333333334e-06, "loss": 1.0078, "mean_token_accuracy": 0.7568871378898621, "num_input_tokens_seen": 88573, "num_tokens": 88573.0, "step": 35, "train_runtime": 83.9087, "train_tokens_per_second": 1055.588 }, { "entropy": 1.894319772720337, "epoch": 0.008, "grad_norm": 8.10840129852295, "learning_rate": 5.2e-06, "loss": 0.9356, "mean_token_accuracy": 0.7691613674163819, "num_input_tokens_seen": 101340, "num_tokens": 101340.0, "step": 40, "train_runtime": 91.6879, "train_tokens_per_second": 1105.272 }, { "entropy": 2.3185460567474365, "epoch": 0.009, "grad_norm": 15.459212303161621, "learning_rate": 5.8666666666666675e-06, "loss": 1.0118, "mean_token_accuracy": 0.7507454872131347, "num_input_tokens_seen": 115212, "num_tokens": 115212.0, "step": 45, "train_runtime": 99.6647, "train_tokens_per_second": 1155.997 }, { "entropy": 2.9704634666442873, "epoch": 0.01, "grad_norm": 35.69361114501953, "learning_rate": 6.533333333333334e-06, "loss": 0.955, "mean_token_accuracy": 0.7496288180351257, "num_input_tokens_seen": 128798, "num_tokens": 128798.0, "step": 50, "train_runtime": 107.6423, "train_tokens_per_second": 1196.537 }, { "entropy": 3.7563072681427, "epoch": 0.011, "grad_norm": 10.142990112304688, "learning_rate": 7.2000000000000005e-06, "loss": 1.0592, "mean_token_accuracy": 0.741034471988678, "num_input_tokens_seen": 140261, "num_tokens": 140261.0, "step": 55, "train_runtime": 115.4286, "train_tokens_per_second": 1215.132 }, { "entropy": 3.849906349182129, "epoch": 0.012, "grad_norm": 9.785825729370117, "learning_rate": 7.866666666666667e-06, "loss": 1.0381, "mean_token_accuracy": 0.7284790635108948, "num_input_tokens_seen": 152078, "num_tokens": 152078.0, "step": 60, "train_runtime": 123.3308, "train_tokens_per_second": 1233.09 }, { "entropy": 4.036617374420166, "epoch": 0.013, "grad_norm": 11.580093383789062, "learning_rate": 8.533333333333335e-06, "loss": 1.009, "mean_token_accuracy": 0.7272097945213318, "num_input_tokens_seen": 163785, "num_tokens": 163785.0, "step": 65, "train_runtime": 130.7003, "train_tokens_per_second": 1253.134 }, { "entropy": 4.421450233459472, "epoch": 0.014, "grad_norm": 8.655736923217773, "learning_rate": 9.200000000000002e-06, "loss": 0.9767, "mean_token_accuracy": 0.7302183866500854, "num_input_tokens_seen": 175608, "num_tokens": 175608.0, "step": 70, "train_runtime": 138.5085, "train_tokens_per_second": 1267.85 }, { "entropy": 4.3665361404418945, "epoch": 0.015, "grad_norm": 8.766755104064941, "learning_rate": 9.866666666666668e-06, "loss": 1.1807, "mean_token_accuracy": 0.7079806089401245, "num_input_tokens_seen": 190213, "num_tokens": 190213.0, "step": 75, "train_runtime": 146.4245, "train_tokens_per_second": 1299.051 }, { "entropy": 3.4251870632171633, "epoch": 0.016, "grad_norm": 10.03065013885498, "learning_rate": 1.0533333333333333e-05, "loss": 1.0374, "mean_token_accuracy": 0.7371157526969909, "num_input_tokens_seen": 201934, "num_tokens": 201934.0, "step": 80, "train_runtime": 154.1361, "train_tokens_per_second": 1310.102 }, { "entropy": 3.863028383255005, "epoch": 0.017, "grad_norm": 14.456884384155273, "learning_rate": 1.1200000000000001e-05, "loss": 1.0277, "mean_token_accuracy": 0.7279868721961975, "num_input_tokens_seen": 214526, "num_tokens": 214526.0, "step": 85, "train_runtime": 162.1106, "train_tokens_per_second": 1323.331 }, { "entropy": 3.9400867938995363, "epoch": 0.018, "grad_norm": 8.171151161193848, "learning_rate": 1.186666666666667e-05, "loss": 1.0229, "mean_token_accuracy": 0.7250483751296997, "num_input_tokens_seen": 225920, "num_tokens": 225920.0, "step": 90, "train_runtime": 169.9822, "train_tokens_per_second": 1329.08 }, { "entropy": 3.3297733783721926, "epoch": 0.019, "grad_norm": 104.0350570678711, "learning_rate": 1.2533333333333336e-05, "loss": 0.8878, "mean_token_accuracy": 0.7721951246261597, "num_input_tokens_seen": 237137, "num_tokens": 237137.0, "step": 95, "train_runtime": 177.2693, "train_tokens_per_second": 1337.722 }, { "entropy": 3.6277270793914793, "epoch": 0.02, "grad_norm": 20.603931427001953, "learning_rate": 1.3200000000000002e-05, "loss": 1.0313, "mean_token_accuracy": 0.7335701942443847, "num_input_tokens_seen": 249502, "num_tokens": 249502.0, "step": 100, "train_runtime": 185.0853, "train_tokens_per_second": 1348.038 }, { "entropy": 3.1997247695922852, "epoch": 0.021, "grad_norm": 9.093222618103027, "learning_rate": 1.3866666666666669e-05, "loss": 1.1175, "mean_token_accuracy": 0.7215671420097352, "num_input_tokens_seen": 260704, "num_tokens": 260704.0, "step": 105, "train_runtime": 192.8283, "train_tokens_per_second": 1352.001 }, { "entropy": 4.093398523330689, "epoch": 0.022, "grad_norm": 79.49065399169922, "learning_rate": 1.4533333333333335e-05, "loss": 0.9573, "mean_token_accuracy": 0.7511666655540467, "num_input_tokens_seen": 275156, "num_tokens": 275156.0, "step": 110, "train_runtime": 201.0039, "train_tokens_per_second": 1368.909 }, { "entropy": 2.8930378437042235, "epoch": 0.023, "grad_norm": 7.635770797729492, "learning_rate": 1.5200000000000002e-05, "loss": 1.0662, "mean_token_accuracy": 0.7242736339569091, "num_input_tokens_seen": 285839, "num_tokens": 285839.0, "step": 115, "train_runtime": 208.8823, "train_tokens_per_second": 1368.421 }, { "entropy": 2.1532176733016968, "epoch": 0.024, "grad_norm": 7.965024948120117, "learning_rate": 1.586666666666667e-05, "loss": 0.7736, "mean_token_accuracy": 0.7942737340927124, "num_input_tokens_seen": 297292, "num_tokens": 297292.0, "step": 120, "train_runtime": 216.4839, "train_tokens_per_second": 1373.275 }, { "entropy": 2.0017834663391114, "epoch": 0.025, "grad_norm": 7.879920482635498, "learning_rate": 1.6533333333333333e-05, "loss": 0.8776, "mean_token_accuracy": 0.7670745015144348, "num_input_tokens_seen": 308319, "num_tokens": 308319.0, "step": 125, "train_runtime": 224.1633, "train_tokens_per_second": 1375.422 }, { "entropy": 2.594643235206604, "epoch": 0.026, "grad_norm": 5.547260761260986, "learning_rate": 1.72e-05, "loss": 0.9943, "mean_token_accuracy": 0.7505615472793579, "num_input_tokens_seen": 320020, "num_tokens": 320020.0, "step": 130, "train_runtime": 231.9851, "train_tokens_per_second": 1379.485 }, { "entropy": 2.6080145835876465, "epoch": 0.027, "grad_norm": 259.3119201660156, "learning_rate": 1.7866666666666666e-05, "loss": 0.9008, "mean_token_accuracy": 0.7765714049339294, "num_input_tokens_seen": 332126, "num_tokens": 332126.0, "step": 135, "train_runtime": 240.0464, "train_tokens_per_second": 1383.591 }, { "entropy": 2.48198561668396, "epoch": 0.028, "grad_norm": 6.056805610656738, "learning_rate": 1.8533333333333334e-05, "loss": 0.9217, "mean_token_accuracy": 0.7611514925956726, "num_input_tokens_seen": 344084, "num_tokens": 344084.0, "step": 140, "train_runtime": 247.9003, "train_tokens_per_second": 1387.994 }, { "entropy": 2.8523908138275145, "epoch": 0.029, "grad_norm": 11.759600639343262, "learning_rate": 1.9200000000000003e-05, "loss": 0.9664, "mean_token_accuracy": 0.7755662202835083, "num_input_tokens_seen": 357754, "num_tokens": 357754.0, "step": 145, "train_runtime": 255.7715, "train_tokens_per_second": 1398.725 }, { "entropy": 2.4413506984710693, "epoch": 0.03, "grad_norm": 5.7263264656066895, "learning_rate": 1.9866666666666667e-05, "loss": 0.7455, "mean_token_accuracy": 0.8257268667221069, "num_input_tokens_seen": 368160, "num_tokens": 368160.0, "step": 150, "train_runtime": 263.51, "train_tokens_per_second": 1397.139 }, { "entropy": 2.3468523979187013, "epoch": 0.031, "grad_norm": 13.177472114562988, "learning_rate": 1.999996643350365e-05, "loss": 0.9948, "mean_token_accuracy": 0.7692738771438599, "num_input_tokens_seen": 379011, "num_tokens": 379011.0, "step": 155, "train_runtime": 270.8566, "train_tokens_per_second": 1399.305 }, { "entropy": 2.702459287643433, "epoch": 0.032, "grad_norm": 19.234607696533203, "learning_rate": 1.999983006999844e-05, "loss": 0.9253, "mean_token_accuracy": 0.7862245321273804, "num_input_tokens_seen": 391688, "num_tokens": 391688.0, "step": 160, "train_runtime": 278.8953, "train_tokens_per_second": 1404.427 }, { "entropy": 3.7248690605163572, "epoch": 0.033, "grad_norm": 23.181379318237305, "learning_rate": 1.999958881300763e-05, "loss": 1.1625, "mean_token_accuracy": 0.7444316983222962, "num_input_tokens_seen": 402818, "num_tokens": 402818.0, "step": 165, "train_runtime": 286.8276, "train_tokens_per_second": 1404.391 }, { "entropy": 3.3490195274353027, "epoch": 0.034, "grad_norm": 17.01816749572754, "learning_rate": 1.99992426650619e-05, "loss": 1.0291, "mean_token_accuracy": 0.7642000675201416, "num_input_tokens_seen": 416420, "num_tokens": 416420.0, "step": 170, "train_runtime": 294.7972, "train_tokens_per_second": 1412.564 }, { "entropy": 2.343595194816589, "epoch": 0.035, "grad_norm": 31.172258377075195, "learning_rate": 1.9998791629792172e-05, "loss": 0.9916, "mean_token_accuracy": 0.771911871433258, "num_input_tokens_seen": 431488, "num_tokens": 431488.0, "step": 175, "train_runtime": 302.8358, "train_tokens_per_second": 1424.825 }, { "entropy": 2.448350429534912, "epoch": 0.036, "grad_norm": 15.46915340423584, "learning_rate": 1.9998235711929593e-05, "loss": 0.967, "mean_token_accuracy": 0.7873155832290649, "num_input_tokens_seen": 442006, "num_tokens": 442006.0, "step": 180, "train_runtime": 310.4683, "train_tokens_per_second": 1423.675 }, { "entropy": 2.1757837533950806, "epoch": 0.037, "grad_norm": 17.592159271240234, "learning_rate": 1.999757491730548e-05, "loss": 0.8657, "mean_token_accuracy": 0.7963372468948364, "num_input_tokens_seen": 452452, "num_tokens": 452452.0, "step": 185, "train_runtime": 318.0742, "train_tokens_per_second": 1422.473 }, { "entropy": 1.914800763130188, "epoch": 0.038, "grad_norm": 14.993433952331543, "learning_rate": 1.9996809252851254e-05, "loss": 0.9004, "mean_token_accuracy": 0.7912453413009644, "num_input_tokens_seen": 462952, "num_tokens": 462952.0, "step": 190, "train_runtime": 325.7905, "train_tokens_per_second": 1421.011 }, { "entropy": 1.2005248427391053, "epoch": 0.039, "grad_norm": 26.665267944335938, "learning_rate": 1.9995938726598374e-05, "loss": 1.115, "mean_token_accuracy": 0.7572134733200073, "num_input_tokens_seen": 475805, "num_tokens": 475805.0, "step": 195, "train_runtime": 333.9227, "train_tokens_per_second": 1424.896 }, { "entropy": 0.8696770548820496, "epoch": 0.04, "grad_norm": 23.537723541259766, "learning_rate": 1.999496334767825e-05, "loss": 0.9469, "mean_token_accuracy": 0.768835699558258, "num_input_tokens_seen": 488642, "num_tokens": 488642.0, "step": 200, "train_runtime": 341.913, "train_tokens_per_second": 1429.141 }, { "entropy": 0.905512011051178, "epoch": 0.041, "grad_norm": 20.80912208557129, "learning_rate": 1.9993883126322142e-05, "loss": 1.056, "mean_token_accuracy": 0.7642222166061401, "num_input_tokens_seen": 500554, "num_tokens": 500554.0, "step": 205, "train_runtime": 349.7982, "train_tokens_per_second": 1430.979 }, { "entropy": 1.0598583459854125, "epoch": 0.042, "grad_norm": 9.609282493591309, "learning_rate": 1.9992698073861067e-05, "loss": 0.8798, "mean_token_accuracy": 0.7919794678688049, "num_input_tokens_seen": 512096, "num_tokens": 512096.0, "step": 210, "train_runtime": 357.0149, "train_tokens_per_second": 1434.383 }, { "entropy": 1.0921847581863404, "epoch": 0.043, "grad_norm": 15.870024681091309, "learning_rate": 1.999140820272566e-05, "loss": 0.9589, "mean_token_accuracy": 0.7832521200180054, "num_input_tokens_seen": 522144, "num_tokens": 522144.0, "step": 215, "train_runtime": 364.7704, "train_tokens_per_second": 1431.432 }, { "entropy": 1.0526527166366577, "epoch": 0.044, "grad_norm": 10.599091529846191, "learning_rate": 1.9990013526446056e-05, "loss": 0.787, "mean_token_accuracy": 0.8196481823921203, "num_input_tokens_seen": 531128, "num_tokens": 531128.0, "step": 220, "train_runtime": 372.5032, "train_tokens_per_second": 1425.835 }, { "entropy": 1.6944920778274537, "epoch": 0.045, "grad_norm": 23.740386962890625, "learning_rate": 1.998851405965175e-05, "loss": 1.3617, "mean_token_accuracy": 0.7199732303619385, "num_input_tokens_seen": 540518, "num_tokens": 540518.0, "step": 225, "train_runtime": 380.2862, "train_tokens_per_second": 1421.345 }, { "entropy": 1.5939557075500488, "epoch": 0.046, "grad_norm": 29.556196212768555, "learning_rate": 1.998690981807145e-05, "loss": 1.4508, "mean_token_accuracy": 0.7068975567817688, "num_input_tokens_seen": 553898, "num_tokens": 553898.0, "step": 230, "train_runtime": 388.249, "train_tokens_per_second": 1426.656 }, { "entropy": 1.200244951248169, "epoch": 0.047, "grad_norm": 25.907962799072266, "learning_rate": 1.9985200818532873e-05, "loss": 0.9439, "mean_token_accuracy": 0.7855159521102906, "num_input_tokens_seen": 565473, "num_tokens": 565473.0, "step": 235, "train_runtime": 396.0659, "train_tokens_per_second": 1427.724 }, { "entropy": 1.6297954082489015, "epoch": 0.048, "grad_norm": 10.747814178466797, "learning_rate": 1.9983387078962634e-05, "loss": 0.852, "mean_token_accuracy": 0.8058385372161865, "num_input_tokens_seen": 576840, "num_tokens": 576840.0, "step": 240, "train_runtime": 403.378, "train_tokens_per_second": 1430.023 }, { "entropy": 1.2379817962646484, "epoch": 0.049, "grad_norm": 30.63800811767578, "learning_rate": 1.998146861838599e-05, "loss": 1.0588, "mean_token_accuracy": 0.770342481136322, "num_input_tokens_seen": 585400, "num_tokens": 585400.0, "step": 245, "train_runtime": 411.1, "train_tokens_per_second": 1423.984 }, { "entropy": 2.4146640062332154, "epoch": 0.05, "grad_norm": 33.35897445678711, "learning_rate": 1.997944545692669e-05, "loss": 0.8276, "mean_token_accuracy": 0.8050395131111145, "num_input_tokens_seen": 595468, "num_tokens": 595468.0, "step": 250, "train_runtime": 418.8504, "train_tokens_per_second": 1421.672 }, { "entropy": 1.2783653497695924, "epoch": 0.051, "grad_norm": 23.615873336791992, "learning_rate": 1.9977317615806738e-05, "loss": 1.0866, "mean_token_accuracy": 0.7583643674850464, "num_input_tokens_seen": 604900, "num_tokens": 604900.0, "step": 255, "train_runtime": 426.6759, "train_tokens_per_second": 1417.704 }, { "entropy": 1.0852158069610596, "epoch": 0.052, "grad_norm": 12.886068344116211, "learning_rate": 1.997508511734618e-05, "loss": 0.8958, "mean_token_accuracy": 0.7893513083457947, "num_input_tokens_seen": 616220, "num_tokens": 616220.0, "step": 260, "train_runtime": 434.6165, "train_tokens_per_second": 1417.848 }, { "entropy": 1.2177501797676087, "epoch": 0.053, "grad_norm": 26.582170486450195, "learning_rate": 1.997274798496287e-05, "loss": 1.2777, "mean_token_accuracy": 0.7230580449104309, "num_input_tokens_seen": 625321, "num_tokens": 625321.0, "step": 265, "train_runtime": 442.1695, "train_tokens_per_second": 1414.211 }, { "entropy": 1.315873622894287, "epoch": 0.054, "grad_norm": 24.019363403320312, "learning_rate": 1.9970306243172223e-05, "loss": 1.0741, "mean_token_accuracy": 0.7587724208831788, "num_input_tokens_seen": 635782, "num_tokens": 635782.0, "step": 270, "train_runtime": 449.7261, "train_tokens_per_second": 1413.709 }, { "entropy": 1.3849868535995484, "epoch": 0.055, "grad_norm": 33.714141845703125, "learning_rate": 1.9967759917586953e-05, "loss": 1.2643, "mean_token_accuracy": 0.7102671027183532, "num_input_tokens_seen": 647271, "num_tokens": 647271.0, "step": 275, "train_runtime": 457.6017, "train_tokens_per_second": 1414.486 }, { "entropy": 0.9884407758712769, "epoch": 0.056, "grad_norm": 17.75234603881836, "learning_rate": 1.9965109034916806e-05, "loss": 1.0065, "mean_token_accuracy": 0.7668455362319946, "num_input_tokens_seen": 656784, "num_tokens": 656784.0, "step": 280, "train_runtime": 465.2962, "train_tokens_per_second": 1411.539 }, { "entropy": 1.6859658479690551, "epoch": 0.057, "grad_norm": 56.73685836791992, "learning_rate": 1.9962353622968296e-05, "loss": 1.3324, "mean_token_accuracy": 0.722649359703064, "num_input_tokens_seen": 667204, "num_tokens": 667204.0, "step": 285, "train_runtime": 472.9789, "train_tokens_per_second": 1410.642 }, { "entropy": 2.68042311668396, "epoch": 0.058, "grad_norm": 64.5443115234375, "learning_rate": 1.9959493710644385e-05, "loss": 1.338, "mean_token_accuracy": 0.7050921201705933, "num_input_tokens_seen": 679972, "num_tokens": 679972.0, "step": 290, "train_runtime": 480.8122, "train_tokens_per_second": 1414.215 }, { "entropy": 2.2331590175628664, "epoch": 0.059, "grad_norm": 26.320358276367188, "learning_rate": 1.9956529327944198e-05, "loss": 1.4025, "mean_token_accuracy": 0.7041542172431946, "num_input_tokens_seen": 690805, "num_tokens": 690805.0, "step": 295, "train_runtime": 488.161, "train_tokens_per_second": 1415.117 }, { "entropy": 2.688382935523987, "epoch": 0.06, "grad_norm": 23.513538360595703, "learning_rate": 1.995346050596271e-05, "loss": 1.4327, "mean_token_accuracy": 0.6945638537406922, "num_input_tokens_seen": 702334, "num_tokens": 702334.0, "step": 300, "train_runtime": 495.9954, "train_tokens_per_second": 1416.009 }, { "entropy": 1.9270040273666382, "epoch": 0.061, "grad_norm": 23.034151077270508, "learning_rate": 1.995028727689041e-05, "loss": 1.2338, "mean_token_accuracy": 0.7275295972824096, "num_input_tokens_seen": 714296, "num_tokens": 714296.0, "step": 305, "train_runtime": 504.0084, "train_tokens_per_second": 1417.23 }, { "entropy": 1.7296420693397523, "epoch": 0.062, "grad_norm": 18.51228141784668, "learning_rate": 1.9947009674012975e-05, "loss": 1.0521, "mean_token_accuracy": 0.7563829541206359, "num_input_tokens_seen": 724446, "num_tokens": 724446.0, "step": 310, "train_runtime": 512.0196, "train_tokens_per_second": 1414.879 }, { "entropy": 1.2774593114852906, "epoch": 0.063, "grad_norm": 36.4163703918457, "learning_rate": 1.9943627731710896e-05, "loss": 1.1846, "mean_token_accuracy": 0.7388334155082703, "num_input_tokens_seen": 734651, "num_tokens": 734651.0, "step": 315, "train_runtime": 519.8764, "train_tokens_per_second": 1413.126 }, { "entropy": 1.1775312185287476, "epoch": 0.064, "grad_norm": 14.374308586120605, "learning_rate": 1.994014148545916e-05, "loss": 0.7869, "mean_token_accuracy": 0.813383936882019, "num_input_tokens_seen": 743858, "num_tokens": 743858.0, "step": 320, "train_runtime": 527.6782, "train_tokens_per_second": 1409.681 }, { "entropy": 1.7520169019699097, "epoch": 0.065, "grad_norm": 33.21381759643555, "learning_rate": 1.9936550971826835e-05, "loss": 1.3175, "mean_token_accuracy": 0.7157476782798767, "num_input_tokens_seen": 755522, "num_tokens": 755522.0, "step": 325, "train_runtime": 535.1093, "train_tokens_per_second": 1411.902 }, { "entropy": 1.3952185392379761, "epoch": 0.066, "grad_norm": 18.199487686157227, "learning_rate": 1.9932856228476705e-05, "loss": 1.0627, "mean_token_accuracy": 0.7550289034843445, "num_input_tokens_seen": 767616, "num_tokens": 767616.0, "step": 330, "train_runtime": 543.0492, "train_tokens_per_second": 1413.529 }, { "entropy": 1.8292996406555175, "epoch": 0.067, "grad_norm": 21.766708374023438, "learning_rate": 1.9929057294164894e-05, "loss": 1.3149, "mean_token_accuracy": 0.7141664981842041, "num_input_tokens_seen": 775446, "num_tokens": 775446.0, "step": 335, "train_runtime": 550.8186, "train_tokens_per_second": 1407.806 }, { "entropy": 1.860321593284607, "epoch": 0.068, "grad_norm": 21.08194351196289, "learning_rate": 1.9925154208740412e-05, "loss": 1.1614, "mean_token_accuracy": 0.7497189760208129, "num_input_tokens_seen": 786656, "num_tokens": 786656.0, "step": 340, "train_runtime": 558.745, "train_tokens_per_second": 1407.898 }, { "entropy": 2.0527014255523683, "epoch": 0.069, "grad_norm": 62.193748474121094, "learning_rate": 1.9921147013144782e-05, "loss": 1.2322, "mean_token_accuracy": 0.7277578115463257, "num_input_tokens_seen": 798133, "num_tokens": 798133.0, "step": 345, "train_runtime": 566.6511, "train_tokens_per_second": 1408.509 }, { "entropy": 2.551530694961548, "epoch": 0.07, "grad_norm": 43.676841735839844, "learning_rate": 1.9917035749411585e-05, "loss": 1.5311, "mean_token_accuracy": 0.674881386756897, "num_input_tokens_seen": 811290, "num_tokens": 811290.0, "step": 350, "train_runtime": 574.6427, "train_tokens_per_second": 1411.816 }, { "entropy": 1.588026523590088, "epoch": 0.071, "grad_norm": 29.081323623657227, "learning_rate": 1.9912820460666046e-05, "loss": 0.9962, "mean_token_accuracy": 0.7661972045898438, "num_input_tokens_seen": 823767, "num_tokens": 823767.0, "step": 355, "train_runtime": 582.1676, "train_tokens_per_second": 1415.0 }, { "entropy": 2.3431188583374025, "epoch": 0.072, "grad_norm": 24.090322494506836, "learning_rate": 1.9908501191124535e-05, "loss": 0.8849, "mean_token_accuracy": 0.7893974423408509, "num_input_tokens_seen": 837160, "num_tokens": 837160.0, "step": 360, "train_runtime": 590.1299, "train_tokens_per_second": 1418.603 }, { "entropy": 2.934668254852295, "epoch": 0.073, "grad_norm": 27.628726959228516, "learning_rate": 1.9904077986094153e-05, "loss": 1.1722, "mean_token_accuracy": 0.7333286762237549, "num_input_tokens_seen": 848624, "num_tokens": 848624.0, "step": 365, "train_runtime": 598.1243, "train_tokens_per_second": 1418.809 }, { "entropy": 2.608621430397034, "epoch": 0.074, "grad_norm": 19.03021812438965, "learning_rate": 1.9899550891972224e-05, "loss": 0.9972, "mean_token_accuracy": 0.7770003914833069, "num_input_tokens_seen": 858982, "num_tokens": 858982.0, "step": 370, "train_runtime": 605.9368, "train_tokens_per_second": 1417.61 }, { "entropy": 1.784191083908081, "epoch": 0.075, "grad_norm": 42.8726806640625, "learning_rate": 1.9894919956245825e-05, "loss": 1.1929, "mean_token_accuracy": 0.74155193567276, "num_input_tokens_seen": 871514, "num_tokens": 871514.0, "step": 375, "train_runtime": 614.0443, "train_tokens_per_second": 1419.301 }, { "entropy": 2.564887189865112, "epoch": 0.076, "grad_norm": 17.869461059570312, "learning_rate": 1.9890185227491285e-05, "loss": 1.1852, "mean_token_accuracy": 0.735111665725708, "num_input_tokens_seen": 883768, "num_tokens": 883768.0, "step": 380, "train_runtime": 621.8668, "train_tokens_per_second": 1421.153 }, { "entropy": 2.7047438621520996, "epoch": 0.077, "grad_norm": 28.475475311279297, "learning_rate": 1.988534675537366e-05, "loss": 1.2398, "mean_token_accuracy": 0.7126539707183838, "num_input_tokens_seen": 895922, "num_tokens": 895922.0, "step": 385, "train_runtime": 629.5746, "train_tokens_per_second": 1423.059 }, { "entropy": 2.1920059442520143, "epoch": 0.078, "grad_norm": 31.29215431213379, "learning_rate": 1.9880404590646233e-05, "loss": 0.8989, "mean_token_accuracy": 0.7906081318855286, "num_input_tokens_seen": 907720, "num_tokens": 907720.0, "step": 390, "train_runtime": 637.4501, "train_tokens_per_second": 1423.986 }, { "entropy": 1.5685983180999756, "epoch": 0.079, "grad_norm": 42.72916030883789, "learning_rate": 1.9875358785149982e-05, "loss": 1.6362, "mean_token_accuracy": 0.653790819644928, "num_input_tokens_seen": 920266, "num_tokens": 920266.0, "step": 395, "train_runtime": 645.4536, "train_tokens_per_second": 1425.766 }, { "entropy": 0.9352366924285889, "epoch": 0.08, "grad_norm": 8.06243896484375, "learning_rate": 1.9870209391813013e-05, "loss": 0.8484, "mean_token_accuracy": 0.8070485949516296, "num_input_tokens_seen": 929040, "num_tokens": 929040.0, "step": 400, "train_runtime": 653.2243, "train_tokens_per_second": 1422.237 }, { "entropy": 0.7112592399120331, "epoch": 0.081, "grad_norm": 20.502967834472656, "learning_rate": 1.9864956464650027e-05, "loss": 0.9947, "mean_token_accuracy": 0.7576935768127442, "num_input_tokens_seen": 940463, "num_tokens": 940463.0, "step": 405, "train_runtime": 661.1452, "train_tokens_per_second": 1422.476 }, { "entropy": 1.420214992761612, "epoch": 0.082, "grad_norm": 17.0722713470459, "learning_rate": 1.985960005876174e-05, "loss": 1.1339, "mean_token_accuracy": 0.7397612571716309, "num_input_tokens_seen": 952864, "num_tokens": 952864.0, "step": 410, "train_runtime": 668.6006, "train_tokens_per_second": 1425.162 }, { "entropy": 1.7739054679870605, "epoch": 0.083, "grad_norm": 23.0296630859375, "learning_rate": 1.9854140230334323e-05, "loss": 1.2931, "mean_token_accuracy": 0.7232706785202027, "num_input_tokens_seen": 960535, "num_tokens": 960535.0, "step": 415, "train_runtime": 676.2697, "train_tokens_per_second": 1420.343 }, { "entropy": 1.7044967412948608, "epoch": 0.084, "grad_norm": 11.81539535522461, "learning_rate": 1.984857703663879e-05, "loss": 0.8884, "mean_token_accuracy": 0.7905951619148255, "num_input_tokens_seen": 971740, "num_tokens": 971740.0, "step": 420, "train_runtime": 684.1972, "train_tokens_per_second": 1420.263 }, { "entropy": 1.0279615640640258, "epoch": 0.085, "grad_norm": 129.53575134277344, "learning_rate": 1.98429105360304e-05, "loss": 1.3375, "mean_token_accuracy": 0.7262090921401978, "num_input_tokens_seen": 983939, "num_tokens": 983939.0, "step": 425, "train_runtime": 692.1359, "train_tokens_per_second": 1421.598 }, { "entropy": 1.39291672706604, "epoch": 0.086, "grad_norm": 13.812393188476562, "learning_rate": 1.9837140787948082e-05, "loss": 1.2456, "mean_token_accuracy": 0.7199745416641236, "num_input_tokens_seen": 995886, "num_tokens": 995886.0, "step": 430, "train_runtime": 699.8771, "train_tokens_per_second": 1422.944 }, { "entropy": 2.2280022144317626, "epoch": 0.087, "grad_norm": 52.42177963256836, "learning_rate": 1.983126785291375e-05, "loss": 1.4044, "mean_token_accuracy": 0.7022199153900146, "num_input_tokens_seen": 1011156, "num_tokens": 1011156.0, "step": 435, "train_runtime": 708.046, "train_tokens_per_second": 1428.094 }, { "entropy": 1.569710612297058, "epoch": 0.088, "grad_norm": 31.967531204223633, "learning_rate": 1.9825291792531717e-05, "loss": 1.2246, "mean_token_accuracy": 0.7250410676002502, "num_input_tokens_seen": 1024348, "num_tokens": 1024348.0, "step": 440, "train_runtime": 715.4326, "train_tokens_per_second": 1431.788 }, { "entropy": 1.7904691219329834, "epoch": 0.089, "grad_norm": 24.883691787719727, "learning_rate": 1.9819212669488026e-05, "loss": 1.2685, "mean_token_accuracy": 0.716226315498352, "num_input_tokens_seen": 1037261, "num_tokens": 1037261.0, "step": 445, "train_runtime": 723.3354, "train_tokens_per_second": 1433.997 }, { "entropy": 1.6276556730270386, "epoch": 0.09, "grad_norm": 36.92219161987305, "learning_rate": 1.9813030547549806e-05, "loss": 1.0476, "mean_token_accuracy": 0.7567752957344055, "num_input_tokens_seen": 1049204, "num_tokens": 1049204.0, "step": 450, "train_runtime": 731.3069, "train_tokens_per_second": 1434.697 }, { "entropy": 1.7801040172576905, "epoch": 0.091, "grad_norm": 29.174489974975586, "learning_rate": 1.9806745491564588e-05, "loss": 1.1227, "mean_token_accuracy": 0.7405856728553772, "num_input_tokens_seen": 1061882, "num_tokens": 1061882.0, "step": 455, "train_runtime": 739.3607, "train_tokens_per_second": 1436.216 }, { "entropy": 1.3776607990264893, "epoch": 0.092, "grad_norm": 19.485397338867188, "learning_rate": 1.9800357567459633e-05, "loss": 1.0293, "mean_token_accuracy": 0.7614623665809631, "num_input_tokens_seen": 1072824, "num_tokens": 1072824.0, "step": 460, "train_runtime": 747.2339, "train_tokens_per_second": 1435.727 }, { "entropy": 2.4246857166290283, "epoch": 0.093, "grad_norm": 39.464900970458984, "learning_rate": 1.9793866842241245e-05, "loss": 1.2099, "mean_token_accuracy": 0.7389422535896302, "num_input_tokens_seen": 1084042, "num_tokens": 1084042.0, "step": 465, "train_runtime": 755.1012, "train_tokens_per_second": 1435.625 }, { "entropy": 2.286059927940369, "epoch": 0.094, "grad_norm": 22.538352966308594, "learning_rate": 1.978727338399406e-05, "loss": 0.94, "mean_token_accuracy": 0.7868804693222046, "num_input_tokens_seen": 1097380, "num_tokens": 1097380.0, "step": 470, "train_runtime": 762.6232, "train_tokens_per_second": 1438.954 }, { "entropy": 2.316425633430481, "epoch": 0.095, "grad_norm": 25.335355758666992, "learning_rate": 1.9780577261880336e-05, "loss": 1.303, "mean_token_accuracy": 0.7122011184692383, "num_input_tokens_seen": 1110895, "num_tokens": 1110895.0, "step": 475, "train_runtime": 770.74, "train_tokens_per_second": 1441.336 }, { "entropy": 1.6908939957618714, "epoch": 0.096, "grad_norm": 17.215105056762695, "learning_rate": 1.9773778546139228e-05, "loss": 0.8077, "mean_token_accuracy": 0.8058344483375549, "num_input_tokens_seen": 1122176, "num_tokens": 1122176.0, "step": 480, "train_runtime": 778.608, "train_tokens_per_second": 1441.259 }, { "entropy": 2.3195732831954956, "epoch": 0.097, "grad_norm": 58.26915740966797, "learning_rate": 1.9766877308086038e-05, "loss": 1.4723, "mean_token_accuracy": 0.6892394065856934, "num_input_tokens_seen": 1137459, "num_tokens": 1137459.0, "step": 485, "train_runtime": 786.7939, "train_tokens_per_second": 1445.689 }, { "entropy": 1.9484764814376831, "epoch": 0.098, "grad_norm": 11.622220039367676, "learning_rate": 1.9759873620111492e-05, "loss": 0.9697, "mean_token_accuracy": 0.7782936811447143, "num_input_tokens_seen": 1148728, "num_tokens": 1148728.0, "step": 490, "train_runtime": 794.511, "train_tokens_per_second": 1445.83 }, { "entropy": 3.1672781467437745, "epoch": 0.099, "grad_norm": 22.868005752563477, "learning_rate": 1.9752767555680967e-05, "loss": 1.2104, "mean_token_accuracy": 0.7402653098106384, "num_input_tokens_seen": 1159080, "num_tokens": 1159080.0, "step": 495, "train_runtime": 802.4911, "train_tokens_per_second": 1444.352 }, { "entropy": 2.7734265327453613, "epoch": 0.1, "grad_norm": 13.784249305725098, "learning_rate": 1.974555918933371e-05, "loss": 1.1445, "mean_token_accuracy": 0.7421590089797974, "num_input_tokens_seen": 1173684, "num_tokens": 1173684.0, "step": 500, "train_runtime": 810.1067, "train_tokens_per_second": 1448.802 }, { "entropy": 1.6398258328437805, "epoch": 0.101, "grad_norm": 19.270593643188477, "learning_rate": 1.9738248596682078e-05, "loss": 0.8517, "mean_token_accuracy": 0.801758861541748, "num_input_tokens_seen": 1186902, "num_tokens": 1186902.0, "step": 505, "train_runtime": 817.9777, "train_tokens_per_second": 1451.02 }, { "entropy": 2.3668081521987916, "epoch": 0.102, "grad_norm": 25.54165267944336, "learning_rate": 1.9730835854410726e-05, "loss": 1.1957, "mean_token_accuracy": 0.7375865459442139, "num_input_tokens_seen": 1199374, "num_tokens": 1199374.0, "step": 510, "train_runtime": 825.9125, "train_tokens_per_second": 1452.18 }, { "entropy": 2.5896084785461424, "epoch": 0.103, "grad_norm": 30.09250831604004, "learning_rate": 1.9723321040275816e-05, "loss": 1.2398, "mean_token_accuracy": 0.7158333778381347, "num_input_tokens_seen": 1211296, "num_tokens": 1211296.0, "step": 515, "train_runtime": 834.0089, "train_tokens_per_second": 1452.378 }, { "entropy": 2.3841901302337645, "epoch": 0.104, "grad_norm": 26.53577423095703, "learning_rate": 1.9715704233104188e-05, "loss": 1.3197, "mean_token_accuracy": 0.7169519782066345, "num_input_tokens_seen": 1224356, "num_tokens": 1224356.0, "step": 520, "train_runtime": 841.8763, "train_tokens_per_second": 1454.318 }, { "entropy": 2.632377290725708, "epoch": 0.105, "grad_norm": 30.180646896362305, "learning_rate": 1.9707985512792544e-05, "loss": 1.3782, "mean_token_accuracy": 0.6957810044288635, "num_input_tokens_seen": 1238288, "num_tokens": 1238288.0, "step": 525, "train_runtime": 849.7975, "train_tokens_per_second": 1457.157 }, { "entropy": 1.4708729982376099, "epoch": 0.106, "grad_norm": 18.378416061401367, "learning_rate": 1.9700164960306612e-05, "loss": 0.9271, "mean_token_accuracy": 0.7790943384170532, "num_input_tokens_seen": 1250772, "num_tokens": 1250772.0, "step": 530, "train_runtime": 857.3733, "train_tokens_per_second": 1458.842 }, { "entropy": 2.078226113319397, "epoch": 0.107, "grad_norm": 25.391448974609375, "learning_rate": 1.9692242657680286e-05, "loss": 1.3142, "mean_token_accuracy": 0.7021040797233582, "num_input_tokens_seen": 1263333, "num_tokens": 1263333.0, "step": 535, "train_runtime": 865.38, "train_tokens_per_second": 1459.859 }, { "entropy": 1.666552221775055, "epoch": 0.108, "grad_norm": 18.142709732055664, "learning_rate": 1.9684218688014773e-05, "loss": 1.0286, "mean_token_accuracy": 0.7565093755722045, "num_input_tokens_seen": 1274874, "num_tokens": 1274874.0, "step": 540, "train_runtime": 873.2857, "train_tokens_per_second": 1459.859 }, { "entropy": 1.8019450426101684, "epoch": 0.109, "grad_norm": 30.58747100830078, "learning_rate": 1.9676093135477713e-05, "loss": 1.2578, "mean_token_accuracy": 0.7087090969085693, "num_input_tokens_seen": 1286203, "num_tokens": 1286203.0, "step": 545, "train_runtime": 881.0843, "train_tokens_per_second": 1459.796 }, { "entropy": 2.149148464202881, "epoch": 0.11, "grad_norm": 18.876434326171875, "learning_rate": 1.9667866085302312e-05, "loss": 1.2696, "mean_token_accuracy": 0.7162733435630798, "num_input_tokens_seen": 1296684, "num_tokens": 1296684.0, "step": 550, "train_runtime": 889.0142, "train_tokens_per_second": 1458.564 }, { "entropy": 1.7550033330917358, "epoch": 0.111, "grad_norm": 23.008808135986328, "learning_rate": 1.9659537623786428e-05, "loss": 1.3528, "mean_token_accuracy": 0.7081112146377564, "num_input_tokens_seen": 1308016, "num_tokens": 1308016.0, "step": 555, "train_runtime": 896.6335, "train_tokens_per_second": 1458.808 }, { "entropy": 1.5823419332504272, "epoch": 0.112, "grad_norm": 22.918617248535156, "learning_rate": 1.965110783829169e-05, "loss": 1.2146, "mean_token_accuracy": 0.7482839345932006, "num_input_tokens_seen": 1318532, "num_tokens": 1318532.0, "step": 560, "train_runtime": 904.1717, "train_tokens_per_second": 1458.276 }, { "entropy": 1.5263604402542115, "epoch": 0.113, "grad_norm": 18.7270565032959, "learning_rate": 1.9642576817242553e-05, "loss": 1.2566, "mean_token_accuracy": 0.7282665967941284, "num_input_tokens_seen": 1329686, "num_tokens": 1329686.0, "step": 565, "train_runtime": 912.007, "train_tokens_per_second": 1457.978 }, { "entropy": 1.408542561531067, "epoch": 0.114, "grad_norm": 14.177605628967285, "learning_rate": 1.963394465012539e-05, "loss": 1.1868, "mean_token_accuracy": 0.7396206736564637, "num_input_tokens_seen": 1343140, "num_tokens": 1343140.0, "step": 570, "train_runtime": 919.9361, "train_tokens_per_second": 1460.036 }, { "entropy": 2.172963285446167, "epoch": 0.115, "grad_norm": 36.03068923950195, "learning_rate": 1.962521142748755e-05, "loss": 1.6285, "mean_token_accuracy": 0.6697513699531555, "num_input_tokens_seen": 1356257, "num_tokens": 1356257.0, "step": 575, "train_runtime": 927.8943, "train_tokens_per_second": 1461.65 }, { "entropy": 2.4995534896850584, "epoch": 0.116, "grad_norm": 80.18787384033203, "learning_rate": 1.961637724093641e-05, "loss": 1.2771, "mean_token_accuracy": 0.7064315795898437, "num_input_tokens_seen": 1369448, "num_tokens": 1369448.0, "step": 580, "train_runtime": 935.9039, "train_tokens_per_second": 1463.236 }, { "entropy": 1.3043323516845704, "epoch": 0.117, "grad_norm": 43.095703125, "learning_rate": 1.9607442183138403e-05, "loss": 1.1063, "mean_token_accuracy": 0.7448446154594421, "num_input_tokens_seen": 1379241, "num_tokens": 1379241.0, "step": 585, "train_runtime": 943.0478, "train_tokens_per_second": 1462.536 }, { "entropy": 2.348073649406433, "epoch": 0.118, "grad_norm": 22.049196243286133, "learning_rate": 1.9598406347818056e-05, "loss": 1.4247, "mean_token_accuracy": 0.6922738671302795, "num_input_tokens_seen": 1393020, "num_tokens": 1393020.0, "step": 590, "train_runtime": 951.0394, "train_tokens_per_second": 1464.734 }, { "entropy": 1.9763983249664308, "epoch": 0.119, "grad_norm": 22.45396614074707, "learning_rate": 1.958926982975701e-05, "loss": 1.3141, "mean_token_accuracy": 0.7057921767234803, "num_input_tokens_seen": 1405600, "num_tokens": 1405600.0, "step": 595, "train_runtime": 958.9358, "train_tokens_per_second": 1465.792 }, { "entropy": 2.0069111824035644, "epoch": 0.12, "grad_norm": 15.863134384155273, "learning_rate": 1.958003272479301e-05, "loss": 1.0387, "mean_token_accuracy": 0.7647307872772217, "num_input_tokens_seen": 1415836, "num_tokens": 1415836.0, "step": 600, "train_runtime": 966.7296, "train_tokens_per_second": 1464.563 }, { "entropy": 1.9786799907684327, "epoch": 0.121, "grad_norm": 31.84896469116211, "learning_rate": 1.9570695129818928e-05, "loss": 1.2099, "mean_token_accuracy": 0.7352941989898681, "num_input_tokens_seen": 1423041, "num_tokens": 1423041.0, "step": 605, "train_runtime": 974.3837, "train_tokens_per_second": 1460.452 }, { "entropy": 2.540336084365845, "epoch": 0.122, "grad_norm": 13.923699378967285, "learning_rate": 1.9561257142781706e-05, "loss": 1.0208, "mean_token_accuracy": 0.7641383409500122, "num_input_tokens_seen": 1436848, "num_tokens": 1436848.0, "step": 610, "train_runtime": 982.4969, "train_tokens_per_second": 1462.445 }, { "entropy": 2.0677133798599243, "epoch": 0.123, "grad_norm": 27.110973358154297, "learning_rate": 1.9551718862681363e-05, "loss": 1.3317, "mean_token_accuracy": 0.7105428576469421, "num_input_tokens_seen": 1449541, "num_tokens": 1449541.0, "step": 615, "train_runtime": 989.9548, "train_tokens_per_second": 1464.25 }, { "entropy": 2.9823999881744383, "epoch": 0.124, "grad_norm": 13.827988624572754, "learning_rate": 1.9542080389569947e-05, "loss": 1.3744, "mean_token_accuracy": 0.704598891735077, "num_input_tokens_seen": 1463314, "num_tokens": 1463314.0, "step": 620, "train_runtime": 997.9369, "train_tokens_per_second": 1466.339 }, { "entropy": 2.7947487354278566, "epoch": 0.125, "grad_norm": 17.706403732299805, "learning_rate": 1.953234182455048e-05, "loss": 1.3047, "mean_token_accuracy": 0.7222363352775574, "num_input_tokens_seen": 1475561, "num_tokens": 1475561.0, "step": 625, "train_runtime": 1005.9478, "train_tokens_per_second": 1466.837 }, { "entropy": 2.0744057178497313, "epoch": 0.126, "grad_norm": 11.465906143188477, "learning_rate": 1.9522503269775897e-05, "loss": 0.9425, "mean_token_accuracy": 0.7820311307907104, "num_input_tokens_seen": 1487576, "num_tokens": 1487576.0, "step": 630, "train_runtime": 1013.7121, "train_tokens_per_second": 1467.454 }, { "entropy": 2.5656540632247924, "epoch": 0.127, "grad_norm": 31.32952880859375, "learning_rate": 1.951256482844799e-05, "loss": 1.208, "mean_token_accuracy": 0.7277989983558655, "num_input_tokens_seen": 1499608, "num_tokens": 1499608.0, "step": 635, "train_runtime": 1021.5834, "train_tokens_per_second": 1467.925 }, { "entropy": 1.358598518371582, "epoch": 0.128, "grad_norm": 13.288721084594727, "learning_rate": 1.9502526604816293e-05, "loss": 1.0581, "mean_token_accuracy": 0.7627841234207153, "num_input_tokens_seen": 1509690, "num_tokens": 1509690.0, "step": 640, "train_runtime": 1029.2032, "train_tokens_per_second": 1466.853 }, { "entropy": 1.2806796073913573, "epoch": 0.129, "grad_norm": 24.01066017150879, "learning_rate": 1.9492388704177036e-05, "loss": 1.3479, "mean_token_accuracy": 0.7160539507865906, "num_input_tokens_seen": 1520509, "num_tokens": 1520509.0, "step": 645, "train_runtime": 1036.7592, "train_tokens_per_second": 1466.598 }, { "entropy": 0.9980000019073486, "epoch": 0.13, "grad_norm": 11.930582046508789, "learning_rate": 1.948215123287199e-05, "loss": 0.8631, "mean_token_accuracy": 0.810214900970459, "num_input_tokens_seen": 1529708, "num_tokens": 1529708.0, "step": 650, "train_runtime": 1044.4397, "train_tokens_per_second": 1464.621 }, { "entropy": 1.2121602296829224, "epoch": 0.131, "grad_norm": 24.203609466552734, "learning_rate": 1.947181429828739e-05, "loss": 1.1996, "mean_token_accuracy": 0.7406519412994385, "num_input_tokens_seen": 1540738, "num_tokens": 1540738.0, "step": 655, "train_runtime": 1052.3118, "train_tokens_per_second": 1464.146 }, { "entropy": 1.463084602355957, "epoch": 0.132, "grad_norm": 11.741750717163086, "learning_rate": 1.9461378008852785e-05, "loss": 1.0741, "mean_token_accuracy": 0.7718634724617004, "num_input_tokens_seen": 1554410, "num_tokens": 1554410.0, "step": 660, "train_runtime": 1060.4706, "train_tokens_per_second": 1465.774 }, { "entropy": 1.4478654623031617, "epoch": 0.133, "grad_norm": 29.194820404052734, "learning_rate": 1.9450842474039914e-05, "loss": 1.3358, "mean_token_accuracy": 0.7294575691223144, "num_input_tokens_seen": 1564870, "num_tokens": 1564870.0, "step": 665, "train_runtime": 1068.2817, "train_tokens_per_second": 1464.848 }, { "entropy": 1.6964669227600098, "epoch": 0.134, "grad_norm": 7.934049606323242, "learning_rate": 1.944020780436155e-05, "loss": 1.0332, "mean_token_accuracy": 0.7616851329803467, "num_input_tokens_seen": 1574450, "num_tokens": 1574450.0, "step": 670, "train_runtime": 1076.1474, "train_tokens_per_second": 1463.043 }, { "entropy": 1.9691448211669922, "epoch": 0.135, "grad_norm": 31.225624084472656, "learning_rate": 1.942947411137035e-05, "loss": 1.2224, "mean_token_accuracy": 0.7422297716140747, "num_input_tokens_seen": 1584478, "num_tokens": 1584478.0, "step": 675, "train_runtime": 1083.3168, "train_tokens_per_second": 1462.617 }, { "entropy": 1.6749921560287475, "epoch": 0.136, "grad_norm": 14.478198051452637, "learning_rate": 1.9418641507657673e-05, "loss": 1.3323, "mean_token_accuracy": 0.7093935966491699, "num_input_tokens_seen": 1592968, "num_tokens": 1592968.0, "step": 680, "train_runtime": 1090.9668, "train_tokens_per_second": 1460.143 }, { "entropy": 1.5938732862472533, "epoch": 0.137, "grad_norm": 21.1867618560791, "learning_rate": 1.9407710106852405e-05, "loss": 1.2822, "mean_token_accuracy": 0.715202271938324, "num_input_tokens_seen": 1604200, "num_tokens": 1604200.0, "step": 685, "train_runtime": 1098.8729, "train_tokens_per_second": 1459.859 }, { "entropy": 1.2818638324737548, "epoch": 0.138, "grad_norm": 12.877001762390137, "learning_rate": 1.9396680023619767e-05, "loss": 0.9033, "mean_token_accuracy": 0.7878343939781189, "num_input_tokens_seen": 1617766, "num_tokens": 1617766.0, "step": 690, "train_runtime": 1106.7679, "train_tokens_per_second": 1461.703 }, { "entropy": 2.487531232833862, "epoch": 0.139, "grad_norm": 34.02974319458008, "learning_rate": 1.9385551373660113e-05, "loss": 1.5965, "mean_token_accuracy": 0.6637152791023254, "num_input_tokens_seen": 1631992, "num_tokens": 1631992.0, "step": 695, "train_runtime": 1114.7822, "train_tokens_per_second": 1463.956 }, { "entropy": 1.7720878839492797, "epoch": 0.14, "grad_norm": 17.36054229736328, "learning_rate": 1.9374324273707717e-05, "loss": 1.0766, "mean_token_accuracy": 0.7659256815910339, "num_input_tokens_seen": 1641478, "num_tokens": 1641478.0, "step": 700, "train_runtime": 1122.2582, "train_tokens_per_second": 1462.656 }, { "entropy": 1.2474261045455932, "epoch": 0.141, "grad_norm": 18.11788558959961, "learning_rate": 1.9362998841529542e-05, "loss": 0.7715, "mean_token_accuracy": 0.8085653901100158, "num_input_tokens_seen": 1649081, "num_tokens": 1649081.0, "step": 705, "train_runtime": 1129.6855, "train_tokens_per_second": 1459.77 }, { "entropy": 1.8623355150222778, "epoch": 0.142, "grad_norm": 17.649934768676758, "learning_rate": 1.9351575195924014e-05, "loss": 0.9863, "mean_token_accuracy": 0.7656079769134522, "num_input_tokens_seen": 1659308, "num_tokens": 1659308.0, "step": 710, "train_runtime": 1137.4708, "train_tokens_per_second": 1458.77 }, { "entropy": 1.7257145881652831, "epoch": 0.143, "grad_norm": 20.64045524597168, "learning_rate": 1.9340053456719768e-05, "loss": 1.0612, "mean_token_accuracy": 0.7581552505493164, "num_input_tokens_seen": 1669627, "num_tokens": 1669627.0, "step": 715, "train_runtime": 1145.3001, "train_tokens_per_second": 1457.807 }, { "entropy": 1.8995280265808105, "epoch": 0.144, "grad_norm": 16.121498107910156, "learning_rate": 1.9328433744774403e-05, "loss": 1.1786, "mean_token_accuracy": 0.7428599238395691, "num_input_tokens_seen": 1683108, "num_tokens": 1683108.0, "step": 720, "train_runtime": 1153.2854, "train_tokens_per_second": 1459.403 }, { "entropy": 2.1401336431503295, "epoch": 0.145, "grad_norm": 27.458831787109375, "learning_rate": 1.931671618197319e-05, "loss": 1.169, "mean_token_accuracy": 0.7340465784072876, "num_input_tokens_seen": 1697138, "num_tokens": 1697138.0, "step": 725, "train_runtime": 1161.3625, "train_tokens_per_second": 1461.334 }, { "entropy": 1.987505316734314, "epoch": 0.146, "grad_norm": 18.56959342956543, "learning_rate": 1.9304900891227825e-05, "loss": 1.1025, "mean_token_accuracy": 0.7560898303985596, "num_input_tokens_seen": 1709202, "num_tokens": 1709202.0, "step": 730, "train_runtime": 1168.7973, "train_tokens_per_second": 1462.36 }, { "entropy": 1.7084295988082885, "epoch": 0.147, "grad_norm": 36.63855743408203, "learning_rate": 1.9292987996475113e-05, "loss": 1.1774, "mean_token_accuracy": 0.7277894496917725, "num_input_tokens_seen": 1722793, "num_tokens": 1722793.0, "step": 735, "train_runtime": 1176.8454, "train_tokens_per_second": 1463.908 }, { "entropy": 1.6064619064331054, "epoch": 0.148, "grad_norm": 10.864763259887695, "learning_rate": 1.928097762267568e-05, "loss": 1.0592, "mean_token_accuracy": 0.7674826502799987, "num_input_tokens_seen": 1734540, "num_tokens": 1734540.0, "step": 740, "train_runtime": 1184.7001, "train_tokens_per_second": 1464.117 }, { "entropy": 2.1586907386779783, "epoch": 0.149, "grad_norm": 35.27556610107422, "learning_rate": 1.9268869895812673e-05, "loss": 1.4747, "mean_token_accuracy": 0.6828147292137146, "num_input_tokens_seen": 1747378, "num_tokens": 1747378.0, "step": 745, "train_runtime": 1192.6153, "train_tokens_per_second": 1465.165 }, { "entropy": 2.162753367424011, "epoch": 0.15, "grad_norm": 11.816399574279785, "learning_rate": 1.9256664942890412e-05, "loss": 1.0043, "mean_token_accuracy": 0.7717919468879699, "num_input_tokens_seen": 1758416, "num_tokens": 1758416.0, "step": 750, "train_runtime": 1200.4176, "train_tokens_per_second": 1464.837 }, { "entropy": 1.9827997446060182, "epoch": 0.151, "grad_norm": 20.774890899658203, "learning_rate": 1.9244362891933077e-05, "loss": 0.975, "mean_token_accuracy": 0.7789272189140319, "num_input_tokens_seen": 1768541, "num_tokens": 1768541.0, "step": 755, "train_runtime": 1207.9749, "train_tokens_per_second": 1464.054 }, { "entropy": 2.149402713775635, "epoch": 0.152, "grad_norm": 11.887255668640137, "learning_rate": 1.9231963871983367e-05, "loss": 0.8696, "mean_token_accuracy": 0.8040732026100159, "num_input_tokens_seen": 1782060, "num_tokens": 1782060.0, "step": 760, "train_runtime": 1215.7864, "train_tokens_per_second": 1465.767 }, { "entropy": 2.252961802482605, "epoch": 0.153, "grad_norm": 34.2891960144043, "learning_rate": 1.9219468013101123e-05, "loss": 1.2386, "mean_token_accuracy": 0.7228067994117737, "num_input_tokens_seen": 1795323, "num_tokens": 1795323.0, "step": 765, "train_runtime": 1223.8891, "train_tokens_per_second": 1466.9 }, { "entropy": 2.2149811744689942, "epoch": 0.154, "grad_norm": 13.472471237182617, "learning_rate": 1.9206875446362005e-05, "loss": 1.0753, "mean_token_accuracy": 0.7691938400268554, "num_input_tokens_seen": 1807024, "num_tokens": 1807024.0, "step": 770, "train_runtime": 1231.7872, "train_tokens_per_second": 1466.994 }, { "entropy": 2.2554484367370606, "epoch": 0.155, "grad_norm": 28.220993041992188, "learning_rate": 1.919418630385607e-05, "loss": 1.1747, "mean_token_accuracy": 0.7412548780441284, "num_input_tokens_seen": 1818476, "num_tokens": 1818476.0, "step": 775, "train_runtime": 1239.8273, "train_tokens_per_second": 1466.717 }, { "entropy": 1.5733176231384278, "epoch": 0.156, "grad_norm": 19.47785186767578, "learning_rate": 1.918140071868642e-05, "loss": 1.0726, "mean_token_accuracy": 0.7566750168800354, "num_input_tokens_seen": 1830074, "num_tokens": 1830074.0, "step": 780, "train_runtime": 1247.7155, "train_tokens_per_second": 1466.74 }, { "entropy": 0.9829119324684144, "epoch": 0.157, "grad_norm": 22.995859146118164, "learning_rate": 1.9168518824967797e-05, "loss": 0.8623, "mean_token_accuracy": 0.8017370700836182, "num_input_tokens_seen": 1841113, "num_tokens": 1841113.0, "step": 785, "train_runtime": 1255.4287, "train_tokens_per_second": 1466.521 }, { "entropy": 0.9932267189025878, "epoch": 0.158, "grad_norm": 12.162199020385742, "learning_rate": 1.9155540757825168e-05, "loss": 1.1064, "mean_token_accuracy": 0.7627958655357361, "num_input_tokens_seen": 1851856, "num_tokens": 1851856.0, "step": 790, "train_runtime": 1263.1103, "train_tokens_per_second": 1466.108 }, { "entropy": 1.2403859138488769, "epoch": 0.159, "grad_norm": 79.85828399658203, "learning_rate": 1.9142466653392317e-05, "loss": 1.2737, "mean_token_accuracy": 0.70594722032547, "num_input_tokens_seen": 1866700, "num_tokens": 1866700.0, "step": 795, "train_runtime": 1271.2917, "train_tokens_per_second": 1468.349 }, { "entropy": 0.83047856092453, "epoch": 0.16, "grad_norm": 22.448453903198242, "learning_rate": 1.912929664881041e-05, "loss": 1.2172, "mean_token_accuracy": 0.726837432384491, "num_input_tokens_seen": 1878756, "num_tokens": 1878756.0, "step": 800, "train_runtime": 1279.1562, "train_tokens_per_second": 1468.746 }, { "entropy": 1.2668817639350891, "epoch": 0.161, "grad_norm": 31.816118240356445, "learning_rate": 1.911603088222657e-05, "loss": 1.3971, "mean_token_accuracy": 0.7061687588691712, "num_input_tokens_seen": 1891685, "num_tokens": 1891685.0, "step": 805, "train_runtime": 1287.0438, "train_tokens_per_second": 1469.791 }, { "entropy": 1.1983356952667237, "epoch": 0.162, "grad_norm": 14.948308944702148, "learning_rate": 1.9102669492792406e-05, "loss": 0.8987, "mean_token_accuracy": 0.7988252401351928, "num_input_tokens_seen": 1902614, "num_tokens": 1902614.0, "step": 810, "train_runtime": 1294.8428, "train_tokens_per_second": 1469.378 }, { "entropy": 1.2379872798919678, "epoch": 0.163, "grad_norm": 31.080856323242188, "learning_rate": 1.908921262066257e-05, "loss": 1.0645, "mean_token_accuracy": 0.7529470443725585, "num_input_tokens_seen": 1915279, "num_tokens": 1915279.0, "step": 815, "train_runtime": 1302.1364, "train_tokens_per_second": 1470.874 }, { "entropy": 1.5611100435256957, "epoch": 0.164, "grad_norm": 12.244266510009766, "learning_rate": 1.9075660406993285e-05, "loss": 0.996, "mean_token_accuracy": 0.7660620331764221, "num_input_tokens_seen": 1928236, "num_tokens": 1928236.0, "step": 820, "train_runtime": 1310.1477, "train_tokens_per_second": 1471.77 }, { "entropy": 1.7769227981567384, "epoch": 0.165, "grad_norm": 19.543272018432617, "learning_rate": 1.906201299394086e-05, "loss": 1.274, "mean_token_accuracy": 0.7181691765785218, "num_input_tokens_seen": 1941721, "num_tokens": 1941721.0, "step": 825, "train_runtime": 1318.1719, "train_tokens_per_second": 1473.041 }, { "entropy": 1.47511568069458, "epoch": 0.166, "grad_norm": 12.673044204711914, "learning_rate": 1.9048270524660197e-05, "loss": 0.8723, "mean_token_accuracy": 0.7944092273712158, "num_input_tokens_seen": 1953864, "num_tokens": 1953864.0, "step": 830, "train_runtime": 1326.0313, "train_tokens_per_second": 1473.468 }, { "entropy": 2.3661667823791506, "epoch": 0.167, "grad_norm": 48.600181579589844, "learning_rate": 1.90344331433033e-05, "loss": 1.3143, "mean_token_accuracy": 0.7126195549964904, "num_input_tokens_seen": 1967740, "num_tokens": 1967740.0, "step": 835, "train_runtime": 1334.0093, "train_tokens_per_second": 1475.057 }, { "entropy": 2.489607572555542, "epoch": 0.168, "grad_norm": 13.38939380645752, "learning_rate": 1.9020500995017747e-05, "loss": 1.1249, "mean_token_accuracy": 0.7440953731536866, "num_input_tokens_seen": 1980654, "num_tokens": 1980654.0, "step": 840, "train_runtime": 1341.7157, "train_tokens_per_second": 1476.21 }, { "entropy": 2.720272254943848, "epoch": 0.169, "grad_norm": 134.3373565673828, "learning_rate": 1.900647422594519e-05, "loss": 1.37, "mean_token_accuracy": 0.7167404890060425, "num_input_tokens_seen": 1995568, "num_tokens": 1995568.0, "step": 845, "train_runtime": 1349.6234, "train_tokens_per_second": 1478.611 }, { "entropy": 2.572855520248413, "epoch": 0.17, "grad_norm": 27.080028533935547, "learning_rate": 1.8992352983219785e-05, "loss": 1.15, "mean_token_accuracy": 0.735902214050293, "num_input_tokens_seen": 2007296, "num_tokens": 2007296.0, "step": 850, "train_runtime": 1357.5157, "train_tokens_per_second": 1478.654 }, { "entropy": 2.0758432865142824, "epoch": 0.171, "grad_norm": 24.015117645263672, "learning_rate": 1.89781374149667e-05, "loss": 0.9982, "mean_token_accuracy": 0.7674156546592712, "num_input_tokens_seen": 2018973, "num_tokens": 2018973.0, "step": 855, "train_runtime": 1365.3015, "train_tokens_per_second": 1478.774 }, { "entropy": 2.26265869140625, "epoch": 0.172, "grad_norm": 14.30013656616211, "learning_rate": 1.8963827670300512e-05, "loss": 0.93, "mean_token_accuracy": 0.7798004984855652, "num_input_tokens_seen": 2030366, "num_tokens": 2030366.0, "step": 860, "train_runtime": 1373.2023, "train_tokens_per_second": 1478.563 }, { "entropy": 1.6695388555526733, "epoch": 0.173, "grad_norm": 20.70516014099121, "learning_rate": 1.894942389932367e-05, "loss": 1.37, "mean_token_accuracy": 0.7112033128738403, "num_input_tokens_seen": 2044807, "num_tokens": 2044807.0, "step": 865, "train_runtime": 1381.2426, "train_tokens_per_second": 1480.411 }, { "entropy": 1.988920521736145, "epoch": 0.174, "grad_norm": 14.551627159118652, "learning_rate": 1.8934926253124922e-05, "loss": 1.22, "mean_token_accuracy": 0.7354559302330017, "num_input_tokens_seen": 2060202, "num_tokens": 2060202.0, "step": 870, "train_runtime": 1389.1169, "train_tokens_per_second": 1483.102 }, { "entropy": 1.0579666256904603, "epoch": 0.175, "grad_norm": 22.92796516418457, "learning_rate": 1.892033488377771e-05, "loss": 1.2121, "mean_token_accuracy": 0.7356925964355469, "num_input_tokens_seen": 2071361, "num_tokens": 2071361.0, "step": 875, "train_runtime": 1396.7358, "train_tokens_per_second": 1483.001 }, { "entropy": 1.512852382659912, "epoch": 0.176, "grad_norm": 24.014389038085938, "learning_rate": 1.8905649944338596e-05, "loss": 1.066, "mean_token_accuracy": 0.7594571232795715, "num_input_tokens_seen": 2085210, "num_tokens": 2085210.0, "step": 880, "train_runtime": 1404.8657, "train_tokens_per_second": 1484.277 }, { "entropy": 1.8931194305419923, "epoch": 0.177, "grad_norm": 50.34173583984375, "learning_rate": 1.8890871588845653e-05, "loss": 1.2254, "mean_token_accuracy": 0.7164549112319947, "num_input_tokens_seen": 2098534, "num_tokens": 2098534.0, "step": 885, "train_runtime": 1412.8104, "train_tokens_per_second": 1485.361 }, { "entropy": 1.8903178453445435, "epoch": 0.178, "grad_norm": 26.925838470458984, "learning_rate": 1.8875999972316826e-05, "loss": 1.2333, "mean_token_accuracy": 0.7311474204063415, "num_input_tokens_seen": 2112132, "num_tokens": 2112132.0, "step": 890, "train_runtime": 1420.8307, "train_tokens_per_second": 1486.547 }, { "entropy": 1.561018180847168, "epoch": 0.179, "grad_norm": 18.123018264770508, "learning_rate": 1.8861035250748343e-05, "loss": 1.1291, "mean_token_accuracy": 0.743868625164032, "num_input_tokens_seen": 2124215, "num_tokens": 2124215.0, "step": 895, "train_runtime": 1428.4649, "train_tokens_per_second": 1487.061 }, { "entropy": 1.604744064807892, "epoch": 0.18, "grad_norm": 11.469419479370117, "learning_rate": 1.8845977581113048e-05, "loss": 0.7682, "mean_token_accuracy": 0.815960431098938, "num_input_tokens_seen": 2132296, "num_tokens": 2132296.0, "step": 900, "train_runtime": 1436.1156, "train_tokens_per_second": 1484.766 }, { "entropy": 2.1941394329071047, "epoch": 0.181, "grad_norm": 20.266151428222656, "learning_rate": 1.883082712135877e-05, "loss": 1.3157, "mean_token_accuracy": 0.7129065752029419, "num_input_tokens_seen": 2146141, "num_tokens": 2146141.0, "step": 905, "train_runtime": 1443.9351, "train_tokens_per_second": 1486.314 }, { "entropy": 2.1940310478210447, "epoch": 0.182, "grad_norm": 16.473621368408203, "learning_rate": 1.8815584030406663e-05, "loss": 1.0611, "mean_token_accuracy": 0.7669301986694336, "num_input_tokens_seen": 2159340, "num_tokens": 2159340.0, "step": 910, "train_runtime": 1451.8626, "train_tokens_per_second": 1487.289 }, { "entropy": 1.77043616771698, "epoch": 0.183, "grad_norm": 20.959806442260742, "learning_rate": 1.8800248468149545e-05, "loss": 1.2164, "mean_token_accuracy": 0.7337101459503174, "num_input_tokens_seen": 2171465, "num_tokens": 2171465.0, "step": 915, "train_runtime": 1459.7056, "train_tokens_per_second": 1487.605 }, { "entropy": 1.7262412786483765, "epoch": 0.184, "grad_norm": 16.45913314819336, "learning_rate": 1.8784820595450198e-05, "loss": 1.01, "mean_token_accuracy": 0.7694900870323181, "num_input_tokens_seen": 2183874, "num_tokens": 2183874.0, "step": 920, "train_runtime": 1467.648, "train_tokens_per_second": 1488.009 }, { "entropy": 2.4993420839309692, "epoch": 0.185, "grad_norm": 38.57929992675781, "learning_rate": 1.876930057413971e-05, "loss": 1.3126, "mean_token_accuracy": 0.7130668520927429, "num_input_tokens_seen": 2196897, "num_tokens": 2196897.0, "step": 925, "train_runtime": 1475.5252, "train_tokens_per_second": 1488.892 }, { "entropy": 2.2688174486160277, "epoch": 0.186, "grad_norm": 22.353689193725586, "learning_rate": 1.875368856701576e-05, "loss": 0.9961, "mean_token_accuracy": 0.7753549098968506, "num_input_tokens_seen": 2209046, "num_tokens": 2209046.0, "step": 930, "train_runtime": 1482.9427, "train_tokens_per_second": 1489.637 }, { "entropy": 2.1050793647766115, "epoch": 0.187, "grad_norm": 31.13572883605957, "learning_rate": 1.873798473784092e-05, "loss": 1.2541, "mean_token_accuracy": 0.7385418534278869, "num_input_tokens_seen": 2222823, "num_tokens": 2222823.0, "step": 935, "train_runtime": 1491.1057, "train_tokens_per_second": 1490.721 }, { "entropy": 1.9931723833084107, "epoch": 0.188, "grad_norm": 12.5863676071167, "learning_rate": 1.872218925134092e-05, "loss": 1.0974, "mean_token_accuracy": 0.7569717764854431, "num_input_tokens_seen": 2234434, "num_tokens": 2234434.0, "step": 940, "train_runtime": 1499.0845, "train_tokens_per_second": 1490.532 }, { "entropy": 1.9971989631652831, "epoch": 0.189, "grad_norm": 20.01220703125, "learning_rate": 1.870630227320294e-05, "loss": 1.2304, "mean_token_accuracy": 0.7170242667198181, "num_input_tokens_seen": 2249603, "num_tokens": 2249603.0, "step": 945, "train_runtime": 1507.2944, "train_tokens_per_second": 1492.478 }, { "entropy": 1.2628510117530822, "epoch": 0.19, "grad_norm": 12.424386024475098, "learning_rate": 1.8690323970073874e-05, "loss": 0.876, "mean_token_accuracy": 0.8019235849380493, "num_input_tokens_seen": 2260474, "num_tokens": 2260474.0, "step": 950, "train_runtime": 1515.0636, "train_tokens_per_second": 1491.999 }, { "entropy": 2.2569441318511965, "epoch": 0.191, "grad_norm": 41.75444412231445, "learning_rate": 1.8674254509558544e-05, "loss": 1.3624, "mean_token_accuracy": 0.696606719493866, "num_input_tokens_seen": 2273218, "num_tokens": 2273218.0, "step": 955, "train_runtime": 1522.8659, "train_tokens_per_second": 1492.724 }, { "entropy": 2.827728843688965, "epoch": 0.192, "grad_norm": 25.55834197998047, "learning_rate": 1.8658094060218e-05, "loss": 1.1234, "mean_token_accuracy": 0.7542357683181763, "num_input_tokens_seen": 2287114, "num_tokens": 2287114.0, "step": 960, "train_runtime": 1530.6437, "train_tokens_per_second": 1494.217 }, { "entropy": 2.1221920251846313, "epoch": 0.193, "grad_norm": 25.550983428955078, "learning_rate": 1.86418427915677e-05, "loss": 1.2372, "mean_token_accuracy": 0.7325672626495361, "num_input_tokens_seen": 2297170, "num_tokens": 2297170.0, "step": 965, "train_runtime": 1538.4295, "train_tokens_per_second": 1493.192 }, { "entropy": 2.4799105405807493, "epoch": 0.194, "grad_norm": 16.807575225830078, "learning_rate": 1.862550087407577e-05, "loss": 1.0905, "mean_token_accuracy": 0.7593504548072815, "num_input_tokens_seen": 2309526, "num_tokens": 2309526.0, "step": 970, "train_runtime": 1546.2744, "train_tokens_per_second": 1493.607 }, { "entropy": 2.265731143951416, "epoch": 0.195, "grad_norm": 24.413179397583008, "learning_rate": 1.8609068479161182e-05, "loss": 1.2853, "mean_token_accuracy": 0.7342265009880066, "num_input_tokens_seen": 2324122, "num_tokens": 2324122.0, "step": 975, "train_runtime": 1554.285, "train_tokens_per_second": 1495.3 }, { "entropy": 3.4570779323577883, "epoch": 0.196, "grad_norm": 16.08462905883789, "learning_rate": 1.8592545779191993e-05, "loss": 1.4573, "mean_token_accuracy": 0.702107059955597, "num_input_tokens_seen": 2337048, "num_tokens": 2337048.0, "step": 980, "train_runtime": 1562.1975, "train_tokens_per_second": 1496.0 }, { "entropy": 1.6146063089370728, "epoch": 0.197, "grad_norm": 21.52993392944336, "learning_rate": 1.8575932947483503e-05, "loss": 1.4142, "mean_token_accuracy": 0.7019688010215759, "num_input_tokens_seen": 2351673, "num_tokens": 2351673.0, "step": 985, "train_runtime": 1570.1158, "train_tokens_per_second": 1497.77 }, { "entropy": 1.385775053501129, "epoch": 0.198, "grad_norm": 18.42176055908203, "learning_rate": 1.8559230158296454e-05, "loss": 1.1104, "mean_token_accuracy": 0.7604029297828674, "num_input_tokens_seen": 2364194, "num_tokens": 2364194.0, "step": 990, "train_runtime": 1577.8302, "train_tokens_per_second": 1498.383 }, { "entropy": 1.678727889060974, "epoch": 0.199, "grad_norm": 32.511749267578125, "learning_rate": 1.8542437586835202e-05, "loss": 1.1675, "mean_token_accuracy": 0.748304831981659, "num_input_tokens_seen": 2374903, "num_tokens": 2374903.0, "step": 995, "train_runtime": 1585.6395, "train_tokens_per_second": 1497.757 }, { "entropy": 1.6218863248825073, "epoch": 0.2, "grad_norm": 16.702844619750977, "learning_rate": 1.8525555409245877e-05, "loss": 1.0823, "mean_token_accuracy": 0.7636188387870788, "num_input_tokens_seen": 2385892, "num_tokens": 2385892.0, "step": 1000, "train_runtime": 1593.4216, "train_tokens_per_second": 1497.339 }, { "entropy": 1.5122142553329467, "epoch": 0.201, "grad_norm": 26.665863037109375, "learning_rate": 1.8508583802614534e-05, "loss": 1.0533, "mean_token_accuracy": 0.758086371421814, "num_input_tokens_seen": 2396754, "num_tokens": 2396754.0, "step": 1005, "train_runtime": 1601.3436, "train_tokens_per_second": 1496.714 }, { "entropy": 1.495746874809265, "epoch": 0.202, "grad_norm": 16.017236709594727, "learning_rate": 1.849152294496529e-05, "loss": 1.0954, "mean_token_accuracy": 0.7606940507888794, "num_input_tokens_seen": 2405464, "num_tokens": 2405464.0, "step": 1010, "train_runtime": 1608.9809, "train_tokens_per_second": 1495.023 }, { "entropy": 1.1275855541229247, "epoch": 0.203, "grad_norm": 42.96962356567383, "learning_rate": 1.8474373015258472e-05, "loss": 1.0589, "mean_token_accuracy": 0.7711854338645935, "num_input_tokens_seen": 2416499, "num_tokens": 2416499.0, "step": 1015, "train_runtime": 1616.6024, "train_tokens_per_second": 1494.801 }, { "entropy": 1.2307052135467529, "epoch": 0.204, "grad_norm": 15.4004545211792, "learning_rate": 1.845713419338873e-05, "loss": 1.007, "mean_token_accuracy": 0.7718303680419922, "num_input_tokens_seen": 2426362, "num_tokens": 2426362.0, "step": 1020, "train_runtime": 1624.0246, "train_tokens_per_second": 1494.043 }, { "entropy": 2.266387867927551, "epoch": 0.205, "grad_norm": 99.84893798828125, "learning_rate": 1.843980666018315e-05, "loss": 1.4528, "mean_token_accuracy": 0.6928727388381958, "num_input_tokens_seen": 2439075, "num_tokens": 2439075.0, "step": 1025, "train_runtime": 1631.9635, "train_tokens_per_second": 1494.565 }, { "entropy": 1.6988699316978455, "epoch": 0.206, "grad_norm": 15.833264350891113, "learning_rate": 1.842239059739935e-05, "loss": 1.2118, "mean_token_accuracy": 0.726143217086792, "num_input_tokens_seen": 2452054, "num_tokens": 2452054.0, "step": 1030, "train_runtime": 1640.0387, "train_tokens_per_second": 1495.12 }, { "entropy": 1.8930716037750244, "epoch": 0.207, "grad_norm": 25.152311325073242, "learning_rate": 1.840488618772359e-05, "loss": 1.152, "mean_token_accuracy": 0.7456088900566101, "num_input_tokens_seen": 2463024, "num_tokens": 2463024.0, "step": 1035, "train_runtime": 1647.7875, "train_tokens_per_second": 1494.746 }, { "entropy": 1.6631874561309814, "epoch": 0.208, "grad_norm": 13.29953384399414, "learning_rate": 1.8387293614768843e-05, "loss": 1.0359, "mean_token_accuracy": 0.7646137118339539, "num_input_tokens_seen": 2475926, "num_tokens": 2475926.0, "step": 1040, "train_runtime": 1655.4165, "train_tokens_per_second": 1495.651 }, { "entropy": 1.893170428276062, "epoch": 0.209, "grad_norm": 22.283761978149414, "learning_rate": 1.8369613063072875e-05, "loss": 1.1958, "mean_token_accuracy": 0.7319866299629212, "num_input_tokens_seen": 2487974, "num_tokens": 2487974.0, "step": 1045, "train_runtime": 1663.0955, "train_tokens_per_second": 1495.99 }, { "entropy": 1.6280813455581664, "epoch": 0.21, "grad_norm": 15.383233070373535, "learning_rate": 1.835184471809631e-05, "loss": 0.9214, "mean_token_accuracy": 0.791292917728424, "num_input_tokens_seen": 2497790, "num_tokens": 2497790.0, "step": 1050, "train_runtime": 1670.8854, "train_tokens_per_second": 1494.89 }, { "entropy": 1.6741443157196045, "epoch": 0.211, "grad_norm": 22.48347282409668, "learning_rate": 1.8333988766220676e-05, "loss": 1.2364, "mean_token_accuracy": 0.7287265062332153, "num_input_tokens_seen": 2508787, "num_tokens": 2508787.0, "step": 1055, "train_runtime": 1678.7636, "train_tokens_per_second": 1494.425 }, { "entropy": 1.7784600496292113, "epoch": 0.212, "grad_norm": 15.869063377380371, "learning_rate": 1.831604539474646e-05, "loss": 0.8924, "mean_token_accuracy": 0.7899987697601318, "num_input_tokens_seen": 2520030, "num_tokens": 2520030.0, "step": 1060, "train_runtime": 1686.5367, "train_tokens_per_second": 1494.204 }, { "entropy": 1.9719912886619568, "epoch": 0.213, "grad_norm": 28.53473472595215, "learning_rate": 1.8298014791891138e-05, "loss": 1.4759, "mean_token_accuracy": 0.6851282000541687, "num_input_tokens_seen": 2534593, "num_tokens": 2534593.0, "step": 1065, "train_runtime": 1694.744, "train_tokens_per_second": 1495.561 }, { "entropy": 2.180422139167786, "epoch": 0.214, "grad_norm": 18.278095245361328, "learning_rate": 1.8279897146787204e-05, "loss": 1.1392, "mean_token_accuracy": 0.7562355875968934, "num_input_tokens_seen": 2545966, "num_tokens": 2545966.0, "step": 1070, "train_runtime": 1702.4165, "train_tokens_per_second": 1495.501 }, { "entropy": 2.7334835290908814, "epoch": 0.215, "grad_norm": 24.28639793395996, "learning_rate": 1.8261692649480174e-05, "loss": 1.3129, "mean_token_accuracy": 0.7086966514587403, "num_input_tokens_seen": 2558923, "num_tokens": 2558923.0, "step": 1075, "train_runtime": 1710.225, "train_tokens_per_second": 1496.249 }, { "entropy": 1.0636597156524659, "epoch": 0.216, "grad_norm": 12.717808723449707, "learning_rate": 1.8243401490926623e-05, "loss": 0.8729, "mean_token_accuracy": 0.7919552087783813, "num_input_tokens_seen": 2570532, "num_tokens": 2570532.0, "step": 1080, "train_runtime": 1717.9683, "train_tokens_per_second": 1496.263 }, { "entropy": 1.8448323249816894, "epoch": 0.217, "grad_norm": 36.6221923828125, "learning_rate": 1.822502386299214e-05, "loss": 1.0965, "mean_token_accuracy": 0.7572914958000183, "num_input_tokens_seen": 2581284, "num_tokens": 2581284.0, "step": 1085, "train_runtime": 1725.7684, "train_tokens_per_second": 1495.73 }, { "entropy": 1.623563003540039, "epoch": 0.218, "grad_norm": 16.415569305419922, "learning_rate": 1.820655995844935e-05, "loss": 1.092, "mean_token_accuracy": 0.7598554968833924, "num_input_tokens_seen": 2594382, "num_tokens": 2594382.0, "step": 1090, "train_runtime": 1733.8828, "train_tokens_per_second": 1496.285 }, { "entropy": 1.391305184364319, "epoch": 0.219, "grad_norm": 27.79310417175293, "learning_rate": 1.818800997097587e-05, "loss": 1.1844, "mean_token_accuracy": 0.7429009556770325, "num_input_tokens_seen": 2606128, "num_tokens": 2606128.0, "step": 1095, "train_runtime": 1741.8139, "train_tokens_per_second": 1496.215 }, { "entropy": 1.305377757549286, "epoch": 0.22, "grad_norm": 11.160552978515625, "learning_rate": 1.8169374095152298e-05, "loss": 0.9833, "mean_token_accuracy": 0.7852571368217468, "num_input_tokens_seen": 2617842, "num_tokens": 2617842.0, "step": 1100, "train_runtime": 1749.3728, "train_tokens_per_second": 1496.446 }, { "entropy": 1.4599490761756897, "epoch": 0.221, "grad_norm": 51.59029006958008, "learning_rate": 1.8150652526460146e-05, "loss": 1.3637, "mean_token_accuracy": 0.7055651068687439, "num_input_tokens_seen": 2630749, "num_tokens": 2630749.0, "step": 1105, "train_runtime": 1757.1282, "train_tokens_per_second": 1497.187 }, { "entropy": 1.9002761363983154, "epoch": 0.222, "grad_norm": 15.919507026672363, "learning_rate": 1.8131845461279813e-05, "loss": 1.1948, "mean_token_accuracy": 0.7477057337760925, "num_input_tokens_seen": 2644422, "num_tokens": 2644422.0, "step": 1110, "train_runtime": 1765.3095, "train_tokens_per_second": 1497.993 }, { "entropy": 1.3458446145057679, "epoch": 0.223, "grad_norm": 27.990976333618164, "learning_rate": 1.8112953096888517e-05, "loss": 1.2934, "mean_token_accuracy": 0.724444580078125, "num_input_tokens_seen": 2655502, "num_tokens": 2655502.0, "step": 1115, "train_runtime": 1773.2223, "train_tokens_per_second": 1497.557 }, { "entropy": 2.051145005226135, "epoch": 0.224, "grad_norm": 15.003273963928223, "learning_rate": 1.8093975631458215e-05, "loss": 1.1581, "mean_token_accuracy": 0.7347219467163086, "num_input_tokens_seen": 2669660, "num_tokens": 2669660.0, "step": 1120, "train_runtime": 1781.4245, "train_tokens_per_second": 1498.61 }, { "entropy": 1.4939687848091125, "epoch": 0.225, "grad_norm": 25.186275482177734, "learning_rate": 1.8074913264053547e-05, "loss": 1.3281, "mean_token_accuracy": 0.714842164516449, "num_input_tokens_seen": 2681950, "num_tokens": 2681950.0, "step": 1125, "train_runtime": 1789.3551, "train_tokens_per_second": 1498.836 }, { "entropy": 1.5468995451927186, "epoch": 0.226, "grad_norm": 17.994199752807617, "learning_rate": 1.8055766194629717e-05, "loss": 1.0875, "mean_token_accuracy": 0.762558889389038, "num_input_tokens_seen": 2694590, "num_tokens": 2694590.0, "step": 1130, "train_runtime": 1796.7565, "train_tokens_per_second": 1499.697 }, { "entropy": 1.5358847856521607, "epoch": 0.227, "grad_norm": 23.238435745239258, "learning_rate": 1.8036534624030428e-05, "loss": 1.1843, "mean_token_accuracy": 0.7393738508224488, "num_input_tokens_seen": 2705045, "num_tokens": 2705045.0, "step": 1135, "train_runtime": 1804.7054, "train_tokens_per_second": 1498.885 }, { "entropy": 1.5250971078872682, "epoch": 0.228, "grad_norm": 23.188899993896484, "learning_rate": 1.8017218753985758e-05, "loss": 1.2583, "mean_token_accuracy": 0.7174087047576905, "num_input_tokens_seen": 2715970, "num_tokens": 2715970.0, "step": 1140, "train_runtime": 1812.7154, "train_tokens_per_second": 1498.288 }, { "entropy": 1.4680069208145141, "epoch": 0.229, "grad_norm": 28.266469955444336, "learning_rate": 1.7997818787110043e-05, "loss": 1.5286, "mean_token_accuracy": 0.6745350122451782, "num_input_tokens_seen": 2725245, "num_tokens": 2725245.0, "step": 1145, "train_runtime": 1820.6694, "train_tokens_per_second": 1496.837 }, { "entropy": 1.1229366540908814, "epoch": 0.23, "grad_norm": 17.652786254882812, "learning_rate": 1.7978334926899748e-05, "loss": 1.1712, "mean_token_accuracy": 0.7368714213371277, "num_input_tokens_seen": 2735210, "num_tokens": 2735210.0, "step": 1150, "train_runtime": 1828.6015, "train_tokens_per_second": 1495.793 }, { "entropy": 0.8773279905319213, "epoch": 0.231, "grad_norm": 37.759952545166016, "learning_rate": 1.795876737773136e-05, "loss": 1.4256, "mean_token_accuracy": 0.6964215397834778, "num_input_tokens_seen": 2744598, "num_tokens": 2744598.0, "step": 1155, "train_runtime": 1836.2648, "train_tokens_per_second": 1494.664 }, { "entropy": 1.166712725162506, "epoch": 0.232, "grad_norm": 18.826770782470703, "learning_rate": 1.79391163448592e-05, "loss": 1.4529, "mean_token_accuracy": 0.7026087880134583, "num_input_tokens_seen": 2755800, "num_tokens": 2755800.0, "step": 1160, "train_runtime": 1843.8804, "train_tokens_per_second": 1494.565 }, { "entropy": 1.311229658126831, "epoch": 0.233, "grad_norm": 46.353214263916016, "learning_rate": 1.7919382034413306e-05, "loss": 1.4694, "mean_token_accuracy": 0.684325659275055, "num_input_tokens_seen": 2765551, "num_tokens": 2765551.0, "step": 1165, "train_runtime": 1851.8903, "train_tokens_per_second": 1493.367 }, { "entropy": 1.2749906063079834, "epoch": 0.234, "grad_norm": 19.173524856567383, "learning_rate": 1.789956465339726e-05, "loss": 1.2149, "mean_token_accuracy": 0.7351233720779419, "num_input_tokens_seen": 2774738, "num_tokens": 2774738.0, "step": 1170, "train_runtime": 1859.6927, "train_tokens_per_second": 1492.041 }, { "entropy": 1.5731144428253174, "epoch": 0.235, "grad_norm": 38.264320373535156, "learning_rate": 1.7879664409686007e-05, "loss": 1.39, "mean_token_accuracy": 0.7010078430175781, "num_input_tokens_seen": 2784276, "num_tokens": 2784276.0, "step": 1175, "train_runtime": 1867.7285, "train_tokens_per_second": 1490.728 }, { "entropy": 1.2220728158950807, "epoch": 0.236, "grad_norm": 85.74222564697266, "learning_rate": 1.7859681512023694e-05, "loss": 1.5731, "mean_token_accuracy": 0.6796548366546631, "num_input_tokens_seen": 2793572, "num_tokens": 2793572.0, "step": 1180, "train_runtime": 1875.652, "train_tokens_per_second": 1489.387 }, { "entropy": 1.268454337120056, "epoch": 0.237, "grad_norm": 30.37742042541504, "learning_rate": 1.7839616170021452e-05, "loss": 1.4416, "mean_token_accuracy": 0.6907160401344299, "num_input_tokens_seen": 2803569, "num_tokens": 2803569.0, "step": 1185, "train_runtime": 1883.3759, "train_tokens_per_second": 1488.587 }, { "entropy": 1.415718913078308, "epoch": 0.238, "grad_norm": 15.435678482055664, "learning_rate": 1.7819468594155235e-05, "loss": 1.246, "mean_token_accuracy": 0.7309785127639771, "num_input_tokens_seen": 2813986, "num_tokens": 2813986.0, "step": 1190, "train_runtime": 1891.0274, "train_tokens_per_second": 1488.073 }, { "entropy": 1.8212880134582519, "epoch": 0.239, "grad_norm": 29.454036712646484, "learning_rate": 1.779923899576357e-05, "loss": 1.591, "mean_token_accuracy": 0.6770082592964173, "num_input_tokens_seen": 2824514, "num_tokens": 2824514.0, "step": 1195, "train_runtime": 1898.8571, "train_tokens_per_second": 1487.481 }, { "entropy": 2.078702116012573, "epoch": 0.24, "grad_norm": 20.800691604614258, "learning_rate": 1.7778927587045373e-05, "loss": 1.1278, "mean_token_accuracy": 0.7445695161819458, "num_input_tokens_seen": 2834422, "num_tokens": 2834422.0, "step": 1200, "train_runtime": 1906.6951, "train_tokens_per_second": 1486.563 }, { "entropy": 2.179371976852417, "epoch": 0.241, "grad_norm": 39.96086502075195, "learning_rate": 1.775853458105772e-05, "loss": 1.4827, "mean_token_accuracy": 0.6985586881637573, "num_input_tokens_seen": 2843088, "num_tokens": 2843088.0, "step": 1205, "train_runtime": 1914.6106, "train_tokens_per_second": 1484.943 }, { "entropy": 1.5217188835144042, "epoch": 0.242, "grad_norm": 22.260730743408203, "learning_rate": 1.773806019171358e-05, "loss": 1.225, "mean_token_accuracy": 0.7413068056106568, "num_input_tokens_seen": 2852880, "num_tokens": 2852880.0, "step": 1210, "train_runtime": 1922.6034, "train_tokens_per_second": 1483.863 }, { "entropy": 1.4584936141967773, "epoch": 0.243, "grad_norm": 50.44161605834961, "learning_rate": 1.7717504633779618e-05, "loss": 1.4813, "mean_token_accuracy": 0.6806095719337464, "num_input_tokens_seen": 2862105, "num_tokens": 2862105.0, "step": 1215, "train_runtime": 1930.0375, "train_tokens_per_second": 1482.927 }, { "entropy": 1.4142881631851196, "epoch": 0.244, "grad_norm": 21.736143112182617, "learning_rate": 1.769686812287391e-05, "loss": 1.1578, "mean_token_accuracy": 0.7592082142829895, "num_input_tokens_seen": 2874420, "num_tokens": 2874420.0, "step": 1220, "train_runtime": 1938.0767, "train_tokens_per_second": 1483.13 }, { "entropy": 1.4385481595993042, "epoch": 0.245, "grad_norm": 42.064579010009766, "learning_rate": 1.7676150875463688e-05, "loss": 1.3639, "mean_token_accuracy": 0.7058291912078858, "num_input_tokens_seen": 2883330, "num_tokens": 2883330.0, "step": 1225, "train_runtime": 1945.9098, "train_tokens_per_second": 1481.739 }, { "entropy": 1.3672122240066529, "epoch": 0.246, "grad_norm": 22.480314254760742, "learning_rate": 1.7655353108863068e-05, "loss": 1.2043, "mean_token_accuracy": 0.731696093082428, "num_input_tokens_seen": 2891854, "num_tokens": 2891854.0, "step": 1230, "train_runtime": 1953.6722, "train_tokens_per_second": 1480.215 }, { "entropy": 1.2041258811950684, "epoch": 0.247, "grad_norm": 81.84649658203125, "learning_rate": 1.7634475041230796e-05, "loss": 1.3604, "mean_token_accuracy": 0.7095182776451111, "num_input_tokens_seen": 2901195, "num_tokens": 2901195.0, "step": 1235, "train_runtime": 1961.6417, "train_tokens_per_second": 1478.963 }, { "entropy": 1.1324700951576232, "epoch": 0.248, "grad_norm": 19.865205764770508, "learning_rate": 1.7613516891567907e-05, "loss": 1.1577, "mean_token_accuracy": 0.7308401465415955, "num_input_tokens_seen": 2910132, "num_tokens": 2910132.0, "step": 1240, "train_runtime": 1969.2849, "train_tokens_per_second": 1477.761 }, { "entropy": 1.6531818628311157, "epoch": 0.249, "grad_norm": 37.53541946411133, "learning_rate": 1.759247887971548e-05, "loss": 1.4333, "mean_token_accuracy": 0.6970197558403015, "num_input_tokens_seen": 2921098, "num_tokens": 2921098.0, "step": 1245, "train_runtime": 1976.9246, "train_tokens_per_second": 1477.597 }, { "entropy": 1.6558474063873292, "epoch": 0.25, "grad_norm": 79.10953521728516, "learning_rate": 1.7571361226352305e-05, "loss": 1.1122, "mean_token_accuracy": 0.751661503314972, "num_input_tokens_seen": 2933196, "num_tokens": 2933196.0, "step": 1250, "train_runtime": 1984.7747, "train_tokens_per_second": 1477.848 }, { "entropy": 1.8808089017868042, "epoch": 0.251, "grad_norm": 30.842592239379883, "learning_rate": 1.7550164152992573e-05, "loss": 1.4671, "mean_token_accuracy": 0.6812240958213807, "num_input_tokens_seen": 2943533, "num_tokens": 2943533.0, "step": 1255, "train_runtime": 1992.7295, "train_tokens_per_second": 1477.136 }, { "entropy": 1.7506457805633544, "epoch": 0.252, "grad_norm": 19.3480281829834, "learning_rate": 1.752888788198355e-05, "loss": 1.097, "mean_token_accuracy": 0.7492644786834717, "num_input_tokens_seen": 2952870, "num_tokens": 2952870.0, "step": 1260, "train_runtime": 2000.647, "train_tokens_per_second": 1475.958 }, { "entropy": 2.3125572204589844, "epoch": 0.253, "grad_norm": 31.685880661010742, "learning_rate": 1.7507532636503256e-05, "loss": 1.3672, "mean_token_accuracy": 0.6862363934516906, "num_input_tokens_seen": 2961949, "num_tokens": 2961949.0, "step": 1265, "train_runtime": 2008.5833, "train_tokens_per_second": 1474.646 }, { "entropy": 1.891088342666626, "epoch": 0.254, "grad_norm": 18.1688175201416, "learning_rate": 1.7486098640558105e-05, "loss": 0.9911, "mean_token_accuracy": 0.7749869227409363, "num_input_tokens_seen": 2970904, "num_tokens": 2970904.0, "step": 1270, "train_runtime": 2016.1718, "train_tokens_per_second": 1473.537 }, { "entropy": 1.8362998247146607, "epoch": 0.255, "grad_norm": 28.076095581054688, "learning_rate": 1.746458611898058e-05, "loss": 1.2675, "mean_token_accuracy": 0.7187411069869996, "num_input_tokens_seen": 2979445, "num_tokens": 2979445.0, "step": 1275, "train_runtime": 2023.6229, "train_tokens_per_second": 1472.332 }, { "entropy": 2.284788131713867, "epoch": 0.256, "grad_norm": 20.291929244995117, "learning_rate": 1.7442995297426846e-05, "loss": 1.2821, "mean_token_accuracy": 0.7193153500556946, "num_input_tokens_seen": 2989592, "num_tokens": 2989592.0, "step": 1280, "train_runtime": 2031.5385, "train_tokens_per_second": 1471.59 }, { "entropy": 2.02677960395813, "epoch": 0.257, "grad_norm": 27.852441787719727, "learning_rate": 1.7421326402374406e-05, "loss": 1.2849, "mean_token_accuracy": 0.7152616262435914, "num_input_tokens_seen": 2999871, "num_tokens": 2999871.0, "step": 1285, "train_runtime": 2039.3645, "train_tokens_per_second": 1470.983 }, { "entropy": 1.2690160274505615, "epoch": 0.258, "grad_norm": 17.0200138092041, "learning_rate": 1.7399579661119713e-05, "loss": 1.0871, "mean_token_accuracy": 0.7653188586235047, "num_input_tokens_seen": 3009218, "num_tokens": 3009218.0, "step": 1290, "train_runtime": 2047.211, "train_tokens_per_second": 1469.911 }, { "entropy": 1.031061625480652, "epoch": 0.259, "grad_norm": 35.9340934753418, "learning_rate": 1.73777553017758e-05, "loss": 1.4868, "mean_token_accuracy": 0.7065757870674133, "num_input_tokens_seen": 3019972, "num_tokens": 3019972.0, "step": 1295, "train_runtime": 2055.0575, "train_tokens_per_second": 1469.532 }, { "entropy": 1.0237751007080078, "epoch": 0.26, "grad_norm": 20.21115493774414, "learning_rate": 1.7355853553269865e-05, "loss": 1.3648, "mean_token_accuracy": 0.7193031430244445, "num_input_tokens_seen": 3030240, "num_tokens": 3030240.0, "step": 1300, "train_runtime": 2062.741, "train_tokens_per_second": 1469.036 }, { "entropy": 1.0271146655082704, "epoch": 0.261, "grad_norm": 39.226402282714844, "learning_rate": 1.7333874645340886e-05, "loss": 1.3639, "mean_token_accuracy": 0.7147809863090515, "num_input_tokens_seen": 3038367, "num_tokens": 3038367.0, "step": 1305, "train_runtime": 2070.2041, "train_tokens_per_second": 1467.665 }, { "entropy": 0.9566171288490295, "epoch": 0.262, "grad_norm": 32.47471237182617, "learning_rate": 1.7311818808537206e-05, "loss": 1.244, "mean_token_accuracy": 0.7321358799934388, "num_input_tokens_seen": 3048334, "num_tokens": 3048334.0, "step": 1310, "train_runtime": 2077.9527, "train_tokens_per_second": 1466.989 }, { "entropy": 0.8938545227050781, "epoch": 0.263, "grad_norm": 32.363529205322266, "learning_rate": 1.7289686274214116e-05, "loss": 1.4038, "mean_token_accuracy": 0.711756455898285, "num_input_tokens_seen": 3058570, "num_tokens": 3058570.0, "step": 1315, "train_runtime": 2085.8202, "train_tokens_per_second": 1466.363 }, { "entropy": 0.9820765733718873, "epoch": 0.264, "grad_norm": 17.698198318481445, "learning_rate": 1.7267477274531432e-05, "loss": 1.1682, "mean_token_accuracy": 0.7518245935440063, "num_input_tokens_seen": 3068106, "num_tokens": 3068106.0, "step": 1320, "train_runtime": 2093.6601, "train_tokens_per_second": 1465.427 }, { "entropy": 0.8849873423576355, "epoch": 0.265, "grad_norm": 38.41575241088867, "learning_rate": 1.724519204245105e-05, "loss": 1.3946, "mean_token_accuracy": 0.7107982158660888, "num_input_tokens_seen": 3077868, "num_tokens": 3077868.0, "step": 1325, "train_runtime": 2101.5777, "train_tokens_per_second": 1464.551 }, { "entropy": 0.8861074566841125, "epoch": 0.266, "grad_norm": 22.461830139160156, "learning_rate": 1.7222830811734502e-05, "loss": 1.162, "mean_token_accuracy": 0.7565742015838623, "num_input_tokens_seen": 3086830, "num_tokens": 3086830.0, "step": 1330, "train_runtime": 2109.1942, "train_tokens_per_second": 1463.511 }, { "entropy": 1.1419548749923707, "epoch": 0.267, "grad_norm": 34.59196853637695, "learning_rate": 1.720039381694053e-05, "loss": 1.3086, "mean_token_accuracy": 0.7225972294807435, "num_input_tokens_seen": 3096153, "num_tokens": 3096153.0, "step": 1335, "train_runtime": 2116.749, "train_tokens_per_second": 1462.693 }, { "entropy": 0.973324978351593, "epoch": 0.268, "grad_norm": 17.652379989624023, "learning_rate": 1.7177881293422586e-05, "loss": 1.0557, "mean_token_accuracy": 0.7771013736724853, "num_input_tokens_seen": 3105070, "num_tokens": 3105070.0, "step": 1340, "train_runtime": 2124.5058, "train_tokens_per_second": 1461.549 }, { "entropy": 1.7407701015472412, "epoch": 0.269, "grad_norm": 38.165924072265625, "learning_rate": 1.7155293477326385e-05, "loss": 1.4097, "mean_token_accuracy": 0.7157220602035522, "num_input_tokens_seen": 3115392, "num_tokens": 3115392.0, "step": 1345, "train_runtime": 2132.367, "train_tokens_per_second": 1461.002 }, { "entropy": 2.0077241897583007, "epoch": 0.27, "grad_norm": 15.912755012512207, "learning_rate": 1.7132630605587433e-05, "loss": 1.027, "mean_token_accuracy": 0.7831048488616943, "num_input_tokens_seen": 3125532, "num_tokens": 3125532.0, "step": 1350, "train_runtime": 2140.2328, "train_tokens_per_second": 1460.37 }, { "entropy": 2.5219493389129637, "epoch": 0.271, "grad_norm": 41.65001678466797, "learning_rate": 1.7109892915928535e-05, "loss": 1.3322, "mean_token_accuracy": 0.7229766488075257, "num_input_tokens_seen": 3134239, "num_tokens": 3134239.0, "step": 1355, "train_runtime": 2148.1764, "train_tokens_per_second": 1459.023 }, { "entropy": 2.577584743499756, "epoch": 0.272, "grad_norm": 14.724543571472168, "learning_rate": 1.7087080646857293e-05, "loss": 1.1431, "mean_token_accuracy": 0.7652744054794312, "num_input_tokens_seen": 3145174, "num_tokens": 3145174.0, "step": 1360, "train_runtime": 2155.5116, "train_tokens_per_second": 1459.131 }, { "entropy": 2.2075974702835084, "epoch": 0.273, "grad_norm": 38.296775817871094, "learning_rate": 1.706419403766361e-05, "loss": 1.4833, "mean_token_accuracy": 0.7018082857131958, "num_input_tokens_seen": 3154111, "num_tokens": 3154111.0, "step": 1365, "train_runtime": 2163.3392, "train_tokens_per_second": 1457.983 }, { "entropy": 2.055910277366638, "epoch": 0.274, "grad_norm": 17.082412719726562, "learning_rate": 1.7041233328417194e-05, "loss": 1.109, "mean_token_accuracy": 0.7666389107704162, "num_input_tokens_seen": 3162896, "num_tokens": 3162896.0, "step": 1370, "train_runtime": 2171.13, "train_tokens_per_second": 1456.797 }, { "entropy": 2.1883644104003905, "epoch": 0.275, "grad_norm": 37.75225067138672, "learning_rate": 1.7018198759965018e-05, "loss": 1.3619, "mean_token_accuracy": 0.7215317964553833, "num_input_tokens_seen": 3172090, "num_tokens": 3172090.0, "step": 1375, "train_runtime": 2178.9155, "train_tokens_per_second": 1455.811 }, { "entropy": 2.2757044076919555, "epoch": 0.276, "grad_norm": 17.573753356933594, "learning_rate": 1.69950905739288e-05, "loss": 1.1286, "mean_token_accuracy": 0.7666920781135559, "num_input_tokens_seen": 3183078, "num_tokens": 3183078.0, "step": 1380, "train_runtime": 2186.8587, "train_tokens_per_second": 1455.548 }, { "entropy": 2.2024645090103148, "epoch": 0.277, "grad_norm": 37.02727508544922, "learning_rate": 1.6971909012702483e-05, "loss": 1.6183, "mean_token_accuracy": 0.6861090421676636, "num_input_tokens_seen": 3194154, "num_tokens": 3194154.0, "step": 1385, "train_runtime": 2194.7262, "train_tokens_per_second": 1455.377 }, { "entropy": 2.0373539447784426, "epoch": 0.278, "grad_norm": 18.59466552734375, "learning_rate": 1.6948654319449674e-05, "loss": 1.0776, "mean_token_accuracy": 0.7835559725761414, "num_input_tokens_seen": 3202962, "num_tokens": 3202962.0, "step": 1390, "train_runtime": 2202.003, "train_tokens_per_second": 1454.567 }, { "entropy": 1.8857535362243651, "epoch": 0.279, "grad_norm": 42.342281341552734, "learning_rate": 1.69253267381011e-05, "loss": 1.4789, "mean_token_accuracy": 0.7060995101928711, "num_input_tokens_seen": 3213302, "num_tokens": 3213302.0, "step": 1395, "train_runtime": 2209.9054, "train_tokens_per_second": 1454.045 }, { "entropy": 2.31169798374176, "epoch": 0.28, "grad_norm": 20.941030502319336, "learning_rate": 1.6901926513352052e-05, "loss": 1.1983, "mean_token_accuracy": 0.7506766438484191, "num_input_tokens_seen": 3223142, "num_tokens": 3223142.0, "step": 1400, "train_runtime": 2217.702, "train_tokens_per_second": 1453.37 }, { "entropy": 2.4272347927093505, "epoch": 0.281, "grad_norm": 46.16843032836914, "learning_rate": 1.6878453890659815e-05, "loss": 1.4267, "mean_token_accuracy": 0.7181783318519592, "num_input_tokens_seen": 3232443, "num_tokens": 3232443.0, "step": 1405, "train_runtime": 2225.6263, "train_tokens_per_second": 1452.375 }, { "entropy": 2.614600324630737, "epoch": 0.282, "grad_norm": 17.90781021118164, "learning_rate": 1.685490911624109e-05, "loss": 1.1771, "mean_token_accuracy": 0.7595712900161743, "num_input_tokens_seen": 3241960, "num_tokens": 3241960.0, "step": 1410, "train_runtime": 2233.5733, "train_tokens_per_second": 1451.468 }, { "entropy": 1.8452144384384155, "epoch": 0.283, "grad_norm": 40.912628173828125, "learning_rate": 1.6831292437069425e-05, "loss": 1.5066, "mean_token_accuracy": 0.7093664407730103, "num_input_tokens_seen": 3250904, "num_tokens": 3250904.0, "step": 1415, "train_runtime": 2241.2922, "train_tokens_per_second": 1450.46 }, { "entropy": 1.8838290452957154, "epoch": 0.284, "grad_norm": 21.02004623413086, "learning_rate": 1.6807604100872604e-05, "loss": 0.9967, "mean_token_accuracy": 0.7866453051567077, "num_input_tokens_seen": 3259140, "num_tokens": 3259140.0, "step": 1420, "train_runtime": 2249.1521, "train_tokens_per_second": 1449.053 }, { "entropy": 2.031890320777893, "epoch": 0.285, "grad_norm": 41.851478576660156, "learning_rate": 1.6783844356130073e-05, "loss": 1.7078, "mean_token_accuracy": 0.6776078999042511, "num_input_tokens_seen": 3271104, "num_tokens": 3271104.0, "step": 1425, "train_runtime": 2256.8715, "train_tokens_per_second": 1449.398 }, { "entropy": 1.5643399238586426, "epoch": 0.286, "grad_norm": 18.9265079498291, "learning_rate": 1.6760013452070304e-05, "loss": 1.215, "mean_token_accuracy": 0.7555792093276977, "num_input_tokens_seen": 3284614, "num_tokens": 3284614.0, "step": 1430, "train_runtime": 2264.9114, "train_tokens_per_second": 1450.217 }, { "entropy": 1.5935586214065551, "epoch": 0.287, "grad_norm": 42.32921600341797, "learning_rate": 1.6736111638668203e-05, "loss": 1.5395, "mean_token_accuracy": 0.7004055261611939, "num_input_tokens_seen": 3293894, "num_tokens": 3293894.0, "step": 1435, "train_runtime": 2272.7006, "train_tokens_per_second": 1449.33 }, { "entropy": 1.9701715230941772, "epoch": 0.288, "grad_norm": 16.297861099243164, "learning_rate": 1.671213916664249e-05, "loss": 1.2561, "mean_token_accuracy": 0.7520538806915283, "num_input_tokens_seen": 3303824, "num_tokens": 3303824.0, "step": 1440, "train_runtime": 2280.6716, "train_tokens_per_second": 1448.619 }, { "entropy": 2.2586899280548094, "epoch": 0.289, "grad_norm": 42.477420806884766, "learning_rate": 1.6688096287453048e-05, "loss": 1.4525, "mean_token_accuracy": 0.6988260865211486, "num_input_tokens_seen": 3314598, "num_tokens": 3314598.0, "step": 1445, "train_runtime": 2288.2761, "train_tokens_per_second": 1448.513 }, { "entropy": 1.6819924473762513, "epoch": 0.29, "grad_norm": 17.050662994384766, "learning_rate": 1.66639832532983e-05, "loss": 1.085, "mean_token_accuracy": 0.7694588541984558, "num_input_tokens_seen": 3325642, "num_tokens": 3325642.0, "step": 1450, "train_runtime": 2295.8046, "train_tokens_per_second": 1448.574 }, { "entropy": 2.1694716930389406, "epoch": 0.291, "grad_norm": 45.23213577270508, "learning_rate": 1.663980031711257e-05, "loss": 1.6041, "mean_token_accuracy": 0.6828583598136901, "num_input_tokens_seen": 3336545, "num_tokens": 3336545.0, "step": 1455, "train_runtime": 2303.7098, "train_tokens_per_second": 1448.336 }, { "entropy": 2.1461720705032348, "epoch": 0.292, "grad_norm": 20.83429718017578, "learning_rate": 1.661554773256341e-05, "loss": 1.2506, "mean_token_accuracy": 0.7554811954498291, "num_input_tokens_seen": 3345542, "num_tokens": 3345542.0, "step": 1460, "train_runtime": 2311.6667, "train_tokens_per_second": 1447.242 }, { "entropy": 2.2532513618469237, "epoch": 0.293, "grad_norm": 40.27167510986328, "learning_rate": 1.6591225754048963e-05, "loss": 1.4847, "mean_token_accuracy": 0.7033268332481384, "num_input_tokens_seen": 3355682, "num_tokens": 3355682.0, "step": 1465, "train_runtime": 2319.5723, "train_tokens_per_second": 1446.681 }, { "entropy": 1.7720814943313599, "epoch": 0.294, "grad_norm": 21.94749641418457, "learning_rate": 1.6566834636695264e-05, "loss": 1.1893, "mean_token_accuracy": 0.7538271427154541, "num_input_tokens_seen": 3364516, "num_tokens": 3364516.0, "step": 1470, "train_runtime": 2327.3633, "train_tokens_per_second": 1445.634 }, { "entropy": 1.8715754985809325, "epoch": 0.295, "grad_norm": 42.14169692993164, "learning_rate": 1.6542374636353605e-05, "loss": 1.4778, "mean_token_accuracy": 0.7104718208312988, "num_input_tokens_seen": 3373596, "num_tokens": 3373596.0, "step": 1475, "train_runtime": 2334.8764, "train_tokens_per_second": 1444.871 }, { "entropy": 1.6018312454223633, "epoch": 0.296, "grad_norm": 24.99211883544922, "learning_rate": 1.6517846009597804e-05, "loss": 1.1696, "mean_token_accuracy": 0.7623870134353637, "num_input_tokens_seen": 3386374, "num_tokens": 3386374.0, "step": 1480, "train_runtime": 2342.6845, "train_tokens_per_second": 1445.51 }, { "entropy": 1.9978862524032592, "epoch": 0.297, "grad_norm": 42.11578369140625, "learning_rate": 1.6493249013721558e-05, "loss": 1.6348, "mean_token_accuracy": 0.6847002983093262, "num_input_tokens_seen": 3398037, "num_tokens": 3398037.0, "step": 1485, "train_runtime": 2350.1888, "train_tokens_per_second": 1445.857 }, { "entropy": 2.033240818977356, "epoch": 0.298, "grad_norm": 18.06739044189453, "learning_rate": 1.646858390673571e-05, "loss": 1.16, "mean_token_accuracy": 0.758779788017273, "num_input_tokens_seen": 3408486, "num_tokens": 3408486.0, "step": 1490, "train_runtime": 2358.1091, "train_tokens_per_second": 1445.432 }, { "entropy": 2.079922676086426, "epoch": 0.299, "grad_norm": 47.72567367553711, "learning_rate": 1.644385094736556e-05, "loss": 1.6783, "mean_token_accuracy": 0.6876550078392029, "num_input_tokens_seen": 3420610, "num_tokens": 3420610.0, "step": 1495, "train_runtime": 2366.1129, "train_tokens_per_second": 1445.666 }, { "entropy": 2.0529671907424927, "epoch": 0.3, "grad_norm": 19.616802215576172, "learning_rate": 1.6419050395048147e-05, "loss": 1.2334, "mean_token_accuracy": 0.7522718787193299, "num_input_tokens_seen": 3431396, "num_tokens": 3431396.0, "step": 1500, "train_runtime": 2373.9948, "train_tokens_per_second": 1445.41 }, { "entropy": 2.1645299434661864, "epoch": 0.301, "grad_norm": 48.06602096557617, "learning_rate": 1.639418250992954e-05, "loss": 1.3564, "mean_token_accuracy": 0.7289418578147888, "num_input_tokens_seen": 3440773, "num_tokens": 3440773.0, "step": 1505, "train_runtime": 2381.5807, "train_tokens_per_second": 1444.743 }, { "entropy": 2.0120184898376463, "epoch": 0.302, "grad_norm": 19.63300323486328, "learning_rate": 1.636924755286207e-05, "loss": 1.0991, "mean_token_accuracy": 0.7802080273628235, "num_input_tokens_seen": 3449632, "num_tokens": 3449632.0, "step": 1510, "train_runtime": 2389.2015, "train_tokens_per_second": 1443.843 }, { "entropy": 1.6557461738586425, "epoch": 0.303, "grad_norm": 42.99159622192383, "learning_rate": 1.6344245785401653e-05, "loss": 1.5623, "mean_token_accuracy": 0.6970128178596496, "num_input_tokens_seen": 3460079, "num_tokens": 3460079.0, "step": 1515, "train_runtime": 2397.0104, "train_tokens_per_second": 1443.498 }, { "entropy": 1.950923538208008, "epoch": 0.304, "grad_norm": 16.479366302490234, "learning_rate": 1.631917746980499e-05, "loss": 1.0645, "mean_token_accuracy": 0.7878881812095642, "num_input_tokens_seen": 3468948, "num_tokens": 3468948.0, "step": 1520, "train_runtime": 2404.8465, "train_tokens_per_second": 1442.482 }, { "entropy": 1.7957743644714355, "epoch": 0.305, "grad_norm": 44.201019287109375, "learning_rate": 1.629404286902685e-05, "loss": 1.5476, "mean_token_accuracy": 0.7070588231086731, "num_input_tokens_seen": 3477616, "num_tokens": 3477616.0, "step": 1525, "train_runtime": 2412.6478, "train_tokens_per_second": 1441.411 }, { "entropy": 1.8094742059707642, "epoch": 0.306, "grad_norm": 22.249441146850586, "learning_rate": 1.6268842246717307e-05, "loss": 1.3298, "mean_token_accuracy": 0.749446427822113, "num_input_tokens_seen": 3487362, "num_tokens": 3487362.0, "step": 1530, "train_runtime": 2420.5292, "train_tokens_per_second": 1440.744 }, { "entropy": 2.1259793519973753, "epoch": 0.307, "grad_norm": 47.74738693237305, "learning_rate": 1.624357586721896e-05, "loss": 1.7322, "mean_token_accuracy": 0.6826498150825501, "num_input_tokens_seen": 3496633, "num_tokens": 3496633.0, "step": 1535, "train_runtime": 2428.2004, "train_tokens_per_second": 1440.01 }, { "entropy": 2.0165711402893067, "epoch": 0.308, "grad_norm": 18.036636352539062, "learning_rate": 1.6218243995564177e-05, "loss": 1.3342, "mean_token_accuracy": 0.7458345174789429, "num_input_tokens_seen": 3505578, "num_tokens": 3505578.0, "step": 1540, "train_runtime": 2435.6811, "train_tokens_per_second": 1439.26 }, { "entropy": 2.119771146774292, "epoch": 0.309, "grad_norm": 56.17195510864258, "learning_rate": 1.61928468974723e-05, "loss": 1.5459, "mean_token_accuracy": 0.7143497109413147, "num_input_tokens_seen": 3514499, "num_tokens": 3514499.0, "step": 1545, "train_runtime": 2443.608, "train_tokens_per_second": 1438.242 }, { "entropy": 2.577343463897705, "epoch": 0.31, "grad_norm": 18.34308624267578, "learning_rate": 1.6167384839346872e-05, "loss": 1.2329, "mean_token_accuracy": 0.7656438469886779, "num_input_tokens_seen": 3523200, "num_tokens": 3523200.0, "step": 1550, "train_runtime": 2451.5059, "train_tokens_per_second": 1437.157 }, { "entropy": 1.8914738893508911, "epoch": 0.311, "grad_norm": 48.73539352416992, "learning_rate": 1.6141858088272838e-05, "loss": 1.5684, "mean_token_accuracy": 0.706005585193634, "num_input_tokens_seen": 3531916, "num_tokens": 3531916.0, "step": 1555, "train_runtime": 2459.2023, "train_tokens_per_second": 1436.204 }, { "entropy": 2.4871872425079347, "epoch": 0.312, "grad_norm": 26.29279136657715, "learning_rate": 1.6116266912013734e-05, "loss": 1.5824, "mean_token_accuracy": 0.713192468881607, "num_input_tokens_seen": 3543566, "num_tokens": 3543566.0, "step": 1560, "train_runtime": 2467.101, "train_tokens_per_second": 1436.328 }, { "entropy": 2.5022111415863035, "epoch": 0.313, "grad_norm": 49.46891784667969, "learning_rate": 1.609061157900889e-05, "loss": 1.5513, "mean_token_accuracy": 0.7168126583099366, "num_input_tokens_seen": 3555869, "num_tokens": 3555869.0, "step": 1565, "train_runtime": 2474.8621, "train_tokens_per_second": 1436.795 }, { "entropy": 2.2136043310165405, "epoch": 0.314, "grad_norm": 18.485628128051758, "learning_rate": 1.6064892358370608e-05, "loss": 1.2679, "mean_token_accuracy": 0.7696561217308044, "num_input_tokens_seen": 3565814, "num_tokens": 3565814.0, "step": 1570, "train_runtime": 2482.5752, "train_tokens_per_second": 1436.337 }, { "entropy": 2.3896522521972656, "epoch": 0.315, "grad_norm": 53.4306640625, "learning_rate": 1.603910951988135e-05, "loss": 1.6821, "mean_token_accuracy": 0.6917877078056336, "num_input_tokens_seen": 3575145, "num_tokens": 3575145.0, "step": 1575, "train_runtime": 2490.4913, "train_tokens_per_second": 1435.518 }, { "entropy": 2.3319294452667236, "epoch": 0.316, "grad_norm": 17.13321876525879, "learning_rate": 1.601326333399088e-05, "loss": 1.2137, "mean_token_accuracy": 0.7693933248519897, "num_input_tokens_seen": 3585260, "num_tokens": 3585260.0, "step": 1580, "train_runtime": 2498.418, "train_tokens_per_second": 1435.012 }, { "entropy": 2.5293830394744874, "epoch": 0.317, "grad_norm": 51.75634002685547, "learning_rate": 1.598735407181347e-05, "loss": 1.3713, "mean_token_accuracy": 0.737622857093811, "num_input_tokens_seen": 3593933, "num_tokens": 3593933.0, "step": 1585, "train_runtime": 2506.2965, "train_tokens_per_second": 1433.962 }, { "entropy": 2.052696180343628, "epoch": 0.318, "grad_norm": 19.858715057373047, "learning_rate": 1.596138200512501e-05, "loss": 1.2006, "mean_token_accuracy": 0.7682390570640564, "num_input_tokens_seen": 3603708, "num_tokens": 3603708.0, "step": 1590, "train_runtime": 2514.1428, "train_tokens_per_second": 1433.374 }, { "entropy": 2.317439079284668, "epoch": 0.319, "grad_norm": 54.64228820800781, "learning_rate": 1.5935347406360192e-05, "loss": 1.7006, "mean_token_accuracy": 0.6940789580345154, "num_input_tokens_seen": 3611996, "num_tokens": 3611996.0, "step": 1595, "train_runtime": 2521.2732, "train_tokens_per_second": 1432.608 }, { "entropy": 1.9669895410537719, "epoch": 0.32, "grad_norm": 27.65109634399414, "learning_rate": 1.5909250548609644e-05, "loss": 1.275, "mean_token_accuracy": 0.7620911240577698, "num_input_tokens_seen": 3620844, "num_tokens": 3620844.0, "step": 1600, "train_runtime": 2529.2274, "train_tokens_per_second": 1431.601 }, { "entropy": 2.0669785022735594, "epoch": 0.321, "grad_norm": 49.51017379760742, "learning_rate": 1.5883091705617045e-05, "loss": 1.5506, "mean_token_accuracy": 0.6995637774467468, "num_input_tokens_seen": 3630246, "num_tokens": 3630246.0, "step": 1605, "train_runtime": 2537.2326, "train_tokens_per_second": 1430.79 }, { "entropy": 2.021135139465332, "epoch": 0.322, "grad_norm": 16.315515518188477, "learning_rate": 1.585687115177629e-05, "loss": 1.0879, "mean_token_accuracy": 0.7776655077934265, "num_input_tokens_seen": 3638818, "num_tokens": 3638818.0, "step": 1610, "train_runtime": 2544.9769, "train_tokens_per_second": 1429.804 }, { "entropy": 1.9953219413757324, "epoch": 0.323, "grad_norm": 38.39968490600586, "learning_rate": 1.5830589162128574e-05, "loss": 1.6446, "mean_token_accuracy": 0.6941417932510376, "num_input_tokens_seen": 3647405, "num_tokens": 3647405.0, "step": 1615, "train_runtime": 2552.7753, "train_tokens_per_second": 1428.8 }, { "entropy": 2.1994907379150392, "epoch": 0.324, "grad_norm": 23.191429138183594, "learning_rate": 1.5804246012359535e-05, "loss": 1.2025, "mean_token_accuracy": 0.7676089406013489, "num_input_tokens_seen": 3657710, "num_tokens": 3657710.0, "step": 1620, "train_runtime": 2560.5275, "train_tokens_per_second": 1428.499 }, { "entropy": 2.232217264175415, "epoch": 0.325, "grad_norm": 46.24846267700195, "learning_rate": 1.5777841978796348e-05, "loss": 1.7089, "mean_token_accuracy": 0.6922944545745849, "num_input_tokens_seen": 3671715, "num_tokens": 3671715.0, "step": 1625, "train_runtime": 2567.9876, "train_tokens_per_second": 1429.802 }, { "entropy": 2.807818365097046, "epoch": 0.326, "grad_norm": 38.45198059082031, "learning_rate": 1.575137733840483e-05, "loss": 1.3642, "mean_token_accuracy": 0.7345892429351807, "num_input_tokens_seen": 3683760, "num_tokens": 3683760.0, "step": 1630, "train_runtime": 2575.7556, "train_tokens_per_second": 1430.167 }, { "entropy": 2.3509646892547607, "epoch": 0.327, "grad_norm": 46.078269958496094, "learning_rate": 1.572485236878654e-05, "loss": 1.6443, "mean_token_accuracy": 0.6921593189239502, "num_input_tokens_seen": 3694481, "num_tokens": 3694481.0, "step": 1635, "train_runtime": 2583.6829, "train_tokens_per_second": 1429.928 }, { "entropy": 2.190648341178894, "epoch": 0.328, "grad_norm": 25.05006217956543, "learning_rate": 1.5698267348175852e-05, "loss": 1.1943, "mean_token_accuracy": 0.7671049475669861, "num_input_tokens_seen": 3706198, "num_tokens": 3706198.0, "step": 1640, "train_runtime": 2591.7358, "train_tokens_per_second": 1430.006 }, { "entropy": 2.8671186447143553, "epoch": 0.329, "grad_norm": 54.62031936645508, "learning_rate": 1.5671622555437055e-05, "loss": 1.6187, "mean_token_accuracy": 0.702330756187439, "num_input_tokens_seen": 3719382, "num_tokens": 3719382.0, "step": 1645, "train_runtime": 2599.7654, "train_tokens_per_second": 1430.661 }, { "entropy": 2.9085092067718508, "epoch": 0.33, "grad_norm": 22.552122116088867, "learning_rate": 1.5644918270061418e-05, "loss": 1.3027, "mean_token_accuracy": 0.7607730388641357, "num_input_tokens_seen": 3728396, "num_tokens": 3728396.0, "step": 1650, "train_runtime": 2607.3234, "train_tokens_per_second": 1429.971 }, { "entropy": 3.0661331176757813, "epoch": 0.331, "grad_norm": 52.22725296020508, "learning_rate": 1.5618154772164257e-05, "loss": 1.5807, "mean_token_accuracy": 0.7037828087806701, "num_input_tokens_seen": 3739171, "num_tokens": 3739171.0, "step": 1655, "train_runtime": 2614.9707, "train_tokens_per_second": 1429.909 }, { "entropy": 3.413400983810425, "epoch": 0.332, "grad_norm": 19.293006896972656, "learning_rate": 1.5591332342482002e-05, "loss": 1.2238, "mean_token_accuracy": 0.7612704753875732, "num_input_tokens_seen": 3751628, "num_tokens": 3751628.0, "step": 1660, "train_runtime": 2623.0354, "train_tokens_per_second": 1430.262 }, { "entropy": 3.452226161956787, "epoch": 0.333, "grad_norm": 55.92356491088867, "learning_rate": 1.5564451262369247e-05, "loss": 1.4981, "mean_token_accuracy": 0.7082372188568116, "num_input_tokens_seen": 3761302, "num_tokens": 3761302.0, "step": 1665, "train_runtime": 2630.9012, "train_tokens_per_second": 1429.663 }, { "entropy": 2.9226686477661135, "epoch": 0.334, "grad_norm": 17.912302017211914, "learning_rate": 1.55375118137958e-05, "loss": 1.1335, "mean_token_accuracy": 0.7800383687019348, "num_input_tokens_seen": 3771282, "num_tokens": 3771282.0, "step": 1670, "train_runtime": 2638.949, "train_tokens_per_second": 1429.085 }, { "entropy": 3.3792606353759767, "epoch": 0.335, "grad_norm": 63.6988639831543, "learning_rate": 1.5510514279343736e-05, "loss": 1.4894, "mean_token_accuracy": 0.7221513867378235, "num_input_tokens_seen": 3780312, "num_tokens": 3780312.0, "step": 1675, "train_runtime": 2646.8709, "train_tokens_per_second": 1428.219 }, { "entropy": 2.9335317611694336, "epoch": 0.336, "grad_norm": 21.815444946289062, "learning_rate": 1.5483458942204407e-05, "loss": 1.2228, "mean_token_accuracy": 0.7642370820045471, "num_input_tokens_seen": 3788602, "num_tokens": 3788602.0, "step": 1680, "train_runtime": 2654.3798, "train_tokens_per_second": 1427.302 }, { "entropy": 2.2110676765441895, "epoch": 0.337, "grad_norm": 57.52202224731445, "learning_rate": 1.5456346086175508e-05, "loss": 1.6617, "mean_token_accuracy": 0.6957574486732483, "num_input_tokens_seen": 3798632, "num_tokens": 3798632.0, "step": 1685, "train_runtime": 2662.0773, "train_tokens_per_second": 1426.943 }, { "entropy": 2.0072426557540894, "epoch": 0.338, "grad_norm": 32.402225494384766, "learning_rate": 1.542917599565806e-05, "loss": 1.257, "mean_token_accuracy": 0.7507204174995422, "num_input_tokens_seen": 3807456, "num_tokens": 3807456.0, "step": 1690, "train_runtime": 2670.0372, "train_tokens_per_second": 1425.994 }, { "entropy": 2.0644862174987795, "epoch": 0.339, "grad_norm": 57.463714599609375, "learning_rate": 1.540194895565346e-05, "loss": 1.6229, "mean_token_accuracy": 0.704038405418396, "num_input_tokens_seen": 3816125, "num_tokens": 3816125.0, "step": 1695, "train_runtime": 2678.0094, "train_tokens_per_second": 1424.986 }, { "entropy": 2.041370701789856, "epoch": 0.34, "grad_norm": 23.74098014831543, "learning_rate": 1.5374665251760474e-05, "loss": 1.1628, "mean_token_accuracy": 0.7822050333023072, "num_input_tokens_seen": 3825000, "num_tokens": 3825000.0, "step": 1700, "train_runtime": 2685.9025, "train_tokens_per_second": 1424.102 }, { "entropy": 1.9257484674453735, "epoch": 0.341, "grad_norm": 56.89560317993164, "learning_rate": 1.5347325170172246e-05, "loss": 1.6376, "mean_token_accuracy": 0.6935947060585022, "num_input_tokens_seen": 3833853, "num_tokens": 3833853.0, "step": 1705, "train_runtime": 2693.5732, "train_tokens_per_second": 1423.334 }, { "entropy": 1.5545307397842407, "epoch": 0.342, "grad_norm": 23.48671531677246, "learning_rate": 1.531992899767329e-05, "loss": 1.2352, "mean_token_accuracy": 0.773958706855774, "num_input_tokens_seen": 3842794, "num_tokens": 3842794.0, "step": 1710, "train_runtime": 2701.2526, "train_tokens_per_second": 1422.597 }, { "entropy": 1.8448223352432251, "epoch": 0.343, "grad_norm": 63.72190856933594, "learning_rate": 1.5292477021636498e-05, "loss": 1.9283, "mean_token_accuracy": 0.6641803085803986, "num_input_tokens_seen": 3853530, "num_tokens": 3853530.0, "step": 1715, "train_runtime": 2709.0388, "train_tokens_per_second": 1422.471 }, { "entropy": 1.991886806488037, "epoch": 0.344, "grad_norm": 26.530517578125, "learning_rate": 1.5264969530020105e-05, "loss": 1.512, "mean_token_accuracy": 0.735997223854065, "num_input_tokens_seen": 3864332, "num_tokens": 3864332.0, "step": 1720, "train_runtime": 2716.8638, "train_tokens_per_second": 1422.35 }, { "entropy": 2.1041543245315553, "epoch": 0.345, "grad_norm": 60.73847198486328, "learning_rate": 1.5237406811364682e-05, "loss": 1.7151, "mean_token_accuracy": 0.6906277060508728, "num_input_tokens_seen": 3875913, "num_tokens": 3875913.0, "step": 1725, "train_runtime": 2724.7627, "train_tokens_per_second": 1422.477 }, { "entropy": 1.8267709493637085, "epoch": 0.346, "grad_norm": 29.255992889404297, "learning_rate": 1.5209789154790107e-05, "loss": 1.3685, "mean_token_accuracy": 0.7402464032173157, "num_input_tokens_seen": 3886562, "num_tokens": 3886562.0, "step": 1730, "train_runtime": 2732.7325, "train_tokens_per_second": 1422.226 }, { "entropy": 2.3421202182769774, "epoch": 0.347, "grad_norm": 54.433807373046875, "learning_rate": 1.5182116849992528e-05, "loss": 1.6313, "mean_token_accuracy": 0.7029274344444275, "num_input_tokens_seen": 3899442, "num_tokens": 3899442.0, "step": 1735, "train_runtime": 2740.376, "train_tokens_per_second": 1422.959 }, { "entropy": 2.10402193069458, "epoch": 0.348, "grad_norm": 17.509033203125, "learning_rate": 1.5154390187241328e-05, "loss": 1.2385, "mean_token_accuracy": 0.7615158796310425, "num_input_tokens_seen": 3911406, "num_tokens": 3911406.0, "step": 1740, "train_runtime": 2748.0855, "train_tokens_per_second": 1423.32 }, { "entropy": 2.4292985439300536, "epoch": 0.349, "grad_norm": 54.79844665527344, "learning_rate": 1.512660945737608e-05, "loss": 2.1091, "mean_token_accuracy": 0.6180171608924866, "num_input_tokens_seen": 3921850, "num_tokens": 3921850.0, "step": 1745, "train_runtime": 2755.9657, "train_tokens_per_second": 1423.04 }, { "entropy": 2.0884851455688476, "epoch": 0.35, "grad_norm": 18.996780395507812, "learning_rate": 1.5098774951803492e-05, "loss": 1.2193, "mean_token_accuracy": 0.7786159515380859, "num_input_tokens_seen": 3932250, "num_tokens": 3932250.0, "step": 1750, "train_runtime": 2763.7988, "train_tokens_per_second": 1422.77 }, { "entropy": 2.562892484664917, "epoch": 0.351, "grad_norm": 107.32661437988281, "learning_rate": 1.507088696249436e-05, "loss": 1.7106, "mean_token_accuracy": 0.6888882875442505, "num_input_tokens_seen": 3941975, "num_tokens": 3941975.0, "step": 1755, "train_runtime": 2771.7721, "train_tokens_per_second": 1422.186 }, { "entropy": 2.8545804977416993, "epoch": 0.352, "grad_norm": 25.456335067749023, "learning_rate": 1.5042945781980494e-05, "loss": 1.2794, "mean_token_accuracy": 0.7553274035453796, "num_input_tokens_seen": 3952318, "num_tokens": 3952318.0, "step": 1760, "train_runtime": 2779.7564, "train_tokens_per_second": 1421.822 }, { "entropy": 2.5331657409667967, "epoch": 0.353, "grad_norm": 54.58677673339844, "learning_rate": 1.5014951703351655e-05, "loss": 1.6567, "mean_token_accuracy": 0.6946354031562805, "num_input_tokens_seen": 3962190, "num_tokens": 3962190.0, "step": 1765, "train_runtime": 2787.4488, "train_tokens_per_second": 1421.44 }, { "entropy": 2.644570302963257, "epoch": 0.354, "grad_norm": 28.3892765045166, "learning_rate": 1.4986905020252482e-05, "loss": 1.3383, "mean_token_accuracy": 0.7511728763580322, "num_input_tokens_seen": 3971328, "num_tokens": 3971328.0, "step": 1770, "train_runtime": 2795.0248, "train_tokens_per_second": 1420.856 }, { "entropy": 2.7318310737609863, "epoch": 0.355, "grad_norm": 56.480342864990234, "learning_rate": 1.4958806026879411e-05, "loss": 1.8927, "mean_token_accuracy": 0.6630759119987488, "num_input_tokens_seen": 3980680, "num_tokens": 3980680.0, "step": 1775, "train_runtime": 2802.9983, "train_tokens_per_second": 1420.151 }, { "entropy": 2.7034935474395754, "epoch": 0.356, "grad_norm": 27.14031982421875, "learning_rate": 1.4930655017977583e-05, "loss": 1.3822, "mean_token_accuracy": 0.7563728332519531, "num_input_tokens_seen": 3991338, "num_tokens": 3991338.0, "step": 1780, "train_runtime": 2811.0217, "train_tokens_per_second": 1419.889 }, { "entropy": 2.5262166023254395, "epoch": 0.357, "grad_norm": 57.87148666381836, "learning_rate": 1.4902452288837761e-05, "loss": 1.6472, "mean_token_accuracy": 0.7006426215171814, "num_input_tokens_seen": 4001458, "num_tokens": 4001458.0, "step": 1785, "train_runtime": 2818.9355, "train_tokens_per_second": 1419.493 }, { "entropy": 2.3132161140441894, "epoch": 0.358, "grad_norm": 22.39909553527832, "learning_rate": 1.4874198135293232e-05, "loss": 1.2579, "mean_token_accuracy": 0.7711090564727783, "num_input_tokens_seen": 4009776, "num_tokens": 4009776.0, "step": 1790, "train_runtime": 2826.8071, "train_tokens_per_second": 1418.482 }, { "entropy": 2.663615894317627, "epoch": 0.359, "grad_norm": 61.493927001953125, "learning_rate": 1.4845892853716692e-05, "loss": 1.8353, "mean_token_accuracy": 0.6733369708061219, "num_input_tokens_seen": 4018770, "num_tokens": 4018770.0, "step": 1795, "train_runtime": 2834.3734, "train_tokens_per_second": 1417.869 }, { "entropy": 2.468277025222778, "epoch": 0.36, "grad_norm": 34.188987731933594, "learning_rate": 1.4817536741017153e-05, "loss": 1.4463, "mean_token_accuracy": 0.7380968451499939, "num_input_tokens_seen": 4028086, "num_tokens": 4028086.0, "step": 1800, "train_runtime": 2841.9138, "train_tokens_per_second": 1417.385 }, { "entropy": 2.170373773574829, "epoch": 0.361, "grad_norm": 61.47440719604492, "learning_rate": 1.478913009463682e-05, "loss": 1.884, "mean_token_accuracy": 0.6617397546768189, "num_input_tokens_seen": 4036014, "num_tokens": 4036014.0, "step": 1805, "train_runtime": 2849.5864, "train_tokens_per_second": 1416.351 }, { "entropy": 2.1222312688827514, "epoch": 0.362, "grad_norm": 13.96916389465332, "learning_rate": 1.4760673212547975e-05, "loss": 1.1083, "mean_token_accuracy": 0.7896428585052491, "num_input_tokens_seen": 4045420, "num_tokens": 4045420.0, "step": 1810, "train_runtime": 2857.5602, "train_tokens_per_second": 1415.69 }, { "entropy": 2.5397322177886963, "epoch": 0.363, "grad_norm": 53.17131423950195, "learning_rate": 1.473216639324984e-05, "loss": 1.5421, "mean_token_accuracy": 0.7048299551010132, "num_input_tokens_seen": 4054401, "num_tokens": 4054401.0, "step": 1815, "train_runtime": 2865.5915, "train_tokens_per_second": 1414.857 }, { "entropy": 2.3925111293792725, "epoch": 0.364, "grad_norm": 17.78383445739746, "learning_rate": 1.4703609935765463e-05, "loss": 1.0795, "mean_token_accuracy": 0.7832731246948242, "num_input_tokens_seen": 4062106, "num_tokens": 4062106.0, "step": 1820, "train_runtime": 2873.1376, "train_tokens_per_second": 1413.822 }, { "entropy": 2.4115597724914553, "epoch": 0.365, "grad_norm": 47.685272216796875, "learning_rate": 1.467500413963857e-05, "loss": 1.7632, "mean_token_accuracy": 0.6809423327445984, "num_input_tokens_seen": 4072922, "num_tokens": 4072922.0, "step": 1825, "train_runtime": 2880.6098, "train_tokens_per_second": 1413.91 }, { "entropy": 2.8706647872924806, "epoch": 0.366, "grad_norm": 24.268712997436523, "learning_rate": 1.4646349304930426e-05, "loss": 1.5019, "mean_token_accuracy": 0.7367658495903016, "num_input_tokens_seen": 4084652, "num_tokens": 4084652.0, "step": 1830, "train_runtime": 2888.496, "train_tokens_per_second": 1414.11 }, { "entropy": 3.582042121887207, "epoch": 0.367, "grad_norm": 101.1836929321289, "learning_rate": 1.4617645732216686e-05, "loss": 1.9331, "mean_token_accuracy": 0.6643929481506348, "num_input_tokens_seen": 4097847, "num_tokens": 4097847.0, "step": 1835, "train_runtime": 2896.4052, "train_tokens_per_second": 1414.804 }, { "entropy": 3.2730942249298094, "epoch": 0.368, "grad_norm": 32.02375411987305, "learning_rate": 1.4588893722584247e-05, "loss": 1.4783, "mean_token_accuracy": 0.718678092956543, "num_input_tokens_seen": 4110356, "num_tokens": 4110356.0, "step": 1840, "train_runtime": 2904.3582, "train_tokens_per_second": 1415.237 }, { "entropy": 3.381536436080933, "epoch": 0.369, "grad_norm": 59.83891677856445, "learning_rate": 1.456009357762809e-05, "loss": 1.8468, "mean_token_accuracy": 0.6701762795448303, "num_input_tokens_seen": 4123925, "num_tokens": 4123925.0, "step": 1845, "train_runtime": 2912.3879, "train_tokens_per_second": 1415.994 }, { "entropy": 2.498771905899048, "epoch": 0.37, "grad_norm": 36.44648361206055, "learning_rate": 1.4531245599448099e-05, "loss": 1.5699, "mean_token_accuracy": 0.7067472457885742, "num_input_tokens_seen": 4135794, "num_tokens": 4135794.0, "step": 1850, "train_runtime": 2919.9978, "train_tokens_per_second": 1416.369 }, { "entropy": 2.7750861167907717, "epoch": 0.371, "grad_norm": 111.65736389160156, "learning_rate": 1.4502350090645919e-05, "loss": 1.6369, "mean_token_accuracy": 0.699991250038147, "num_input_tokens_seen": 4148170, "num_tokens": 4148170.0, "step": 1855, "train_runtime": 2927.5556, "train_tokens_per_second": 1416.94 }, { "entropy": 2.6607544898986815, "epoch": 0.372, "grad_norm": 39.36896514892578, "learning_rate": 1.4473407354321763e-05, "loss": 1.4801, "mean_token_accuracy": 0.7247755289077759, "num_input_tokens_seen": 4158186, "num_tokens": 4158186.0, "step": 1860, "train_runtime": 2935.4049, "train_tokens_per_second": 1416.563 }, { "entropy": 2.479531764984131, "epoch": 0.373, "grad_norm": 56.30593490600586, "learning_rate": 1.4444417694071242e-05, "loss": 1.6968, "mean_token_accuracy": 0.6847872138023376, "num_input_tokens_seen": 4167982, "num_tokens": 4167982.0, "step": 1865, "train_runtime": 2943.2747, "train_tokens_per_second": 1416.104 }, { "entropy": 1.984696626663208, "epoch": 0.374, "grad_norm": 25.814531326293945, "learning_rate": 1.4415381413982168e-05, "loss": 1.3647, "mean_token_accuracy": 0.745641577243805, "num_input_tokens_seen": 4177556, "num_tokens": 4177556.0, "step": 1870, "train_runtime": 2951.1121, "train_tokens_per_second": 1415.587 }, { "entropy": 1.9822622776031493, "epoch": 0.375, "grad_norm": 68.19324493408203, "learning_rate": 1.4386298818631388e-05, "loss": 1.5919, "mean_token_accuracy": 0.6973026156425476, "num_input_tokens_seen": 4187160, "num_tokens": 4187160.0, "step": 1875, "train_runtime": 2959.0456, "train_tokens_per_second": 1415.037 }, { "entropy": 2.136989641189575, "epoch": 0.376, "grad_norm": 19.91508674621582, "learning_rate": 1.4357170213081556e-05, "loss": 1.2142, "mean_token_accuracy": 0.7613247632980347, "num_input_tokens_seen": 4196366, "num_tokens": 4196366.0, "step": 1880, "train_runtime": 2966.7886, "train_tokens_per_second": 1414.447 }, { "entropy": 1.9142632007598877, "epoch": 0.377, "grad_norm": 54.018985748291016, "learning_rate": 1.4327995902877972e-05, "loss": 1.4641, "mean_token_accuracy": 0.727017080783844, "num_input_tokens_seen": 4205897, "num_tokens": 4205897.0, "step": 1885, "train_runtime": 2974.2305, "train_tokens_per_second": 1414.113 }, { "entropy": 2.098765015602112, "epoch": 0.378, "grad_norm": 21.28632354736328, "learning_rate": 1.4298776194045337e-05, "loss": 1.1848, "mean_token_accuracy": 0.77965008020401, "num_input_tokens_seen": 4214932, "num_tokens": 4214932.0, "step": 1890, "train_runtime": 2982.1651, "train_tokens_per_second": 1413.38 }, { "entropy": 2.1211936712265014, "epoch": 0.379, "grad_norm": 60.83597183227539, "learning_rate": 1.4269511393084572e-05, "loss": 1.811, "mean_token_accuracy": 0.6612493753433227, "num_input_tokens_seen": 4225797, "num_tokens": 4225797.0, "step": 1895, "train_runtime": 2990.1676, "train_tokens_per_second": 1413.231 }, { "entropy": 1.9584113597869872, "epoch": 0.38, "grad_norm": 22.94863510131836, "learning_rate": 1.4240201806969594e-05, "loss": 1.2157, "mean_token_accuracy": 0.7801413059234619, "num_input_tokens_seen": 4235244, "num_tokens": 4235244.0, "step": 1900, "train_runtime": 2998.0739, "train_tokens_per_second": 1412.655 }, { "entropy": 2.2246545791625976, "epoch": 0.381, "grad_norm": 64.25224304199219, "learning_rate": 1.4210847743144087e-05, "loss": 1.676, "mean_token_accuracy": 0.6933815121650696, "num_input_tokens_seen": 4244569, "num_tokens": 4244569.0, "step": 1905, "train_runtime": 3005.6963, "train_tokens_per_second": 1412.175 }, { "entropy": 2.3698882102966308, "epoch": 0.382, "grad_norm": 55.18050003051758, "learning_rate": 1.4181449509518292e-05, "loss": 1.4925, "mean_token_accuracy": 0.7220078825950622, "num_input_tokens_seen": 4253626, "num_tokens": 4253626.0, "step": 1910, "train_runtime": 3013.6912, "train_tokens_per_second": 1411.434 }, { "entropy": 2.005776572227478, "epoch": 0.383, "grad_norm": 62.665157318115234, "learning_rate": 1.4152007414465771e-05, "loss": 1.6778, "mean_token_accuracy": 0.6957811832427978, "num_input_tokens_seen": 4264308, "num_tokens": 4264308.0, "step": 1915, "train_runtime": 3021.2744, "train_tokens_per_second": 1411.427 }, { "entropy": 2.6654239177703856, "epoch": 0.384, "grad_norm": 18.40694808959961, "learning_rate": 1.4122521766820172e-05, "loss": 1.1271, "mean_token_accuracy": 0.7835416316986084, "num_input_tokens_seen": 4274888, "num_tokens": 4274888.0, "step": 1920, "train_runtime": 3029.1587, "train_tokens_per_second": 1411.246 }, { "entropy": 2.1815751791000366, "epoch": 0.385, "grad_norm": 61.71083068847656, "learning_rate": 1.409299287587198e-05, "loss": 1.5673, "mean_token_accuracy": 0.7152623534202576, "num_input_tokens_seen": 4284646, "num_tokens": 4284646.0, "step": 1925, "train_runtime": 3037.0718, "train_tokens_per_second": 1410.782 }, { "entropy": 2.027584671974182, "epoch": 0.386, "grad_norm": 25.472347259521484, "learning_rate": 1.406342105136529e-05, "loss": 1.1251, "mean_token_accuracy": 0.7959362387657165, "num_input_tokens_seen": 4293726, "num_tokens": 4293726.0, "step": 1930, "train_runtime": 3044.8864, "train_tokens_per_second": 1410.143 }, { "entropy": 2.375785541534424, "epoch": 0.387, "grad_norm": 59.81321716308594, "learning_rate": 1.403380660349455e-05, "loss": 1.6089, "mean_token_accuracy": 0.6991539478302002, "num_input_tokens_seen": 4303025, "num_tokens": 4303025.0, "step": 1935, "train_runtime": 3052.5912, "train_tokens_per_second": 1409.63 }, { "entropy": 2.2045113325119017, "epoch": 0.388, "grad_norm": 24.62364959716797, "learning_rate": 1.4004149842901305e-05, "loss": 1.2827, "mean_token_accuracy": 0.7522437453269959, "num_input_tokens_seen": 4312858, "num_tokens": 4312858.0, "step": 1940, "train_runtime": 3060.5657, "train_tokens_per_second": 1409.17 }, { "entropy": 2.59278564453125, "epoch": 0.389, "grad_norm": 62.588836669921875, "learning_rate": 1.3974451080670934e-05, "loss": 1.7455, "mean_token_accuracy": 0.6751283645629883, "num_input_tokens_seen": 4321484, "num_tokens": 4321484.0, "step": 1945, "train_runtime": 3068.1048, "train_tokens_per_second": 1408.519 }, { "entropy": 2.7738879203796385, "epoch": 0.39, "grad_norm": 22.07179069519043, "learning_rate": 1.3944710628329409e-05, "loss": 1.2592, "mean_token_accuracy": 0.7679759621620178, "num_input_tokens_seen": 4333492, "num_tokens": 4333492.0, "step": 1950, "train_runtime": 3075.9768, "train_tokens_per_second": 1408.818 }, { "entropy": 2.7445730686187746, "epoch": 0.391, "grad_norm": 60.60033416748047, "learning_rate": 1.3914928797839996e-05, "loss": 1.7016, "mean_token_accuracy": 0.6835243940353394, "num_input_tokens_seen": 4345899, "num_tokens": 4345899.0, "step": 1955, "train_runtime": 3083.8368, "train_tokens_per_second": 1409.251 }, { "entropy": 2.473898410797119, "epoch": 0.392, "grad_norm": 21.046600341796875, "learning_rate": 1.3885105901600006e-05, "loss": 1.306, "mean_token_accuracy": 0.7607254147529602, "num_input_tokens_seen": 4356552, "num_tokens": 4356552.0, "step": 1960, "train_runtime": 3091.8041, "train_tokens_per_second": 1409.065 }, { "entropy": 2.856091547012329, "epoch": 0.393, "grad_norm": 68.50381469726562, "learning_rate": 1.3855242252437511e-05, "loss": 1.9514, "mean_token_accuracy": 0.6723478078842163, "num_input_tokens_seen": 4369794, "num_tokens": 4369794.0, "step": 1965, "train_runtime": 3099.4006, "train_tokens_per_second": 1409.884 }, { "entropy": 2.9638694763183593, "epoch": 0.394, "grad_norm": 34.5787239074707, "learning_rate": 1.3825338163608055e-05, "loss": 1.5213, "mean_token_accuracy": 0.7341637134552002, "num_input_tokens_seen": 4382380, "num_tokens": 4382380.0, "step": 1970, "train_runtime": 3106.8396, "train_tokens_per_second": 1410.559 }, { "entropy": 2.752903604507446, "epoch": 0.395, "grad_norm": 206.2166290283203, "learning_rate": 1.3795393948791382e-05, "loss": 1.7218, "mean_token_accuracy": 0.6902112841606141, "num_input_tokens_seen": 4394738, "num_tokens": 4394738.0, "step": 1975, "train_runtime": 3114.7166, "train_tokens_per_second": 1410.959 }, { "entropy": 2.607147789001465, "epoch": 0.396, "grad_norm": 20.80355453491211, "learning_rate": 1.3765409922088137e-05, "loss": 1.4608, "mean_token_accuracy": 0.7311769366264343, "num_input_tokens_seen": 4405266, "num_tokens": 4405266.0, "step": 1980, "train_runtime": 3122.4806, "train_tokens_per_second": 1410.823 }, { "entropy": 2.021634650230408, "epoch": 0.397, "grad_norm": 55.49333572387695, "learning_rate": 1.373538639801657e-05, "loss": 1.6455, "mean_token_accuracy": 0.6859090209007264, "num_input_tokens_seen": 4416409, "num_tokens": 4416409.0, "step": 1985, "train_runtime": 3130.3149, "train_tokens_per_second": 1410.851 }, { "entropy": 2.9291773796081544, "epoch": 0.398, "grad_norm": 25.359281539916992, "learning_rate": 1.370532369150924e-05, "loss": 1.3724, "mean_token_accuracy": 0.7535317420959473, "num_input_tokens_seen": 4427264, "num_tokens": 4427264.0, "step": 1990, "train_runtime": 3138.1573, "train_tokens_per_second": 1410.785 }, { "entropy": 2.150673246383667, "epoch": 0.399, "grad_norm": 57.36241912841797, "learning_rate": 1.3675222117909716e-05, "loss": 1.5802, "mean_token_accuracy": 0.7011521220207214, "num_input_tokens_seen": 4438166, "num_tokens": 4438166.0, "step": 1995, "train_runtime": 3145.7294, "train_tokens_per_second": 1410.854 }, { "entropy": 2.2710445404052733, "epoch": 0.4, "grad_norm": 26.528202056884766, "learning_rate": 1.3645081992969262e-05, "loss": 1.2274, "mean_token_accuracy": 0.7760915637016297, "num_input_tokens_seen": 4446534, "num_tokens": 4446534.0, "step": 2000, "train_runtime": 3153.2185, "train_tokens_per_second": 1410.157 }, { "entropy": 2.8309820652008058, "epoch": 0.401, "grad_norm": 59.90489196777344, "learning_rate": 1.3614903632843523e-05, "loss": 1.6837, "mean_token_accuracy": 0.6929812669754029, "num_input_tokens_seen": 4456967, "num_tokens": 4456967.0, "step": 2005, "train_runtime": 3161.0451, "train_tokens_per_second": 1409.966 }, { "entropy": 2.1849318742752075, "epoch": 0.402, "grad_norm": 31.994922637939453, "learning_rate": 1.3584687354089222e-05, "loss": 1.1703, "mean_token_accuracy": 0.7776391267776489, "num_input_tokens_seen": 4467216, "num_tokens": 4467216.0, "step": 2010, "train_runtime": 3168.9319, "train_tokens_per_second": 1409.691 }, { "entropy": 2.428547477722168, "epoch": 0.403, "grad_norm": 65.34469604492188, "learning_rate": 1.3554433473660818e-05, "loss": 1.5612, "mean_token_accuracy": 0.7086606740951538, "num_input_tokens_seen": 4476649, "num_tokens": 4476649.0, "step": 2015, "train_runtime": 3176.7976, "train_tokens_per_second": 1409.17 }, { "entropy": 2.1949065208435057, "epoch": 0.404, "grad_norm": 23.770050048828125, "learning_rate": 1.3524142308907205e-05, "loss": 1.0784, "mean_token_accuracy": 0.7896103978157043, "num_input_tokens_seen": 4485848, "num_tokens": 4485848.0, "step": 2020, "train_runtime": 3184.7482, "train_tokens_per_second": 1408.541 }, { "entropy": 2.484366226196289, "epoch": 0.405, "grad_norm": 57.11235809326172, "learning_rate": 1.3493814177568365e-05, "loss": 1.3034, "mean_token_accuracy": 0.7368498206138611, "num_input_tokens_seen": 4495372, "num_tokens": 4495372.0, "step": 2025, "train_runtime": 3192.4649, "train_tokens_per_second": 1408.119 }, { "entropy": 2.3667689323425294, "epoch": 0.406, "grad_norm": 25.410526275634766, "learning_rate": 1.3463449397772045e-05, "loss": 1.3419, "mean_token_accuracy": 0.7518476366996765, "num_input_tokens_seen": 4505500, "num_tokens": 4505500.0, "step": 2030, "train_runtime": 3200.1312, "train_tokens_per_second": 1407.911 }, { "entropy": 2.2306925773620607, "epoch": 0.407, "grad_norm": 64.74151611328125, "learning_rate": 1.3433048288030424e-05, "loss": 1.5059, "mean_token_accuracy": 0.7199578166007996, "num_input_tokens_seen": 4514049, "num_tokens": 4514049.0, "step": 2035, "train_runtime": 3207.8433, "train_tokens_per_second": 1407.191 }, { "entropy": 2.6097206115722655, "epoch": 0.408, "grad_norm": 43.114845275878906, "learning_rate": 1.3402611167236748e-05, "loss": 1.215, "mean_token_accuracy": 0.7716806769371033, "num_input_tokens_seen": 4523420, "num_tokens": 4523420.0, "step": 2040, "train_runtime": 3215.8583, "train_tokens_per_second": 1406.598 }, { "entropy": 2.7678685665130613, "epoch": 0.409, "grad_norm": 95.0975341796875, "learning_rate": 1.3372138354662018e-05, "loss": 1.6781, "mean_token_accuracy": 0.6925572752952576, "num_input_tokens_seen": 4533235, "num_tokens": 4533235.0, "step": 2045, "train_runtime": 3223.8912, "train_tokens_per_second": 1406.138 }, { "entropy": 2.5352970123291017, "epoch": 0.41, "grad_norm": 25.034561157226562, "learning_rate": 1.3341630169951616e-05, "loss": 1.2398, "mean_token_accuracy": 0.7646899223327637, "num_input_tokens_seen": 4542374, "num_tokens": 4542374.0, "step": 2050, "train_runtime": 3231.4439, "train_tokens_per_second": 1405.679 }, { "entropy": 2.3589534759521484, "epoch": 0.411, "grad_norm": 64.90263366699219, "learning_rate": 1.3311086933121961e-05, "loss": 1.7274, "mean_token_accuracy": 0.6926771521568298, "num_input_tokens_seen": 4552400, "num_tokens": 4552400.0, "step": 2055, "train_runtime": 3239.4237, "train_tokens_per_second": 1405.312 }, { "entropy": 2.361823797225952, "epoch": 0.412, "grad_norm": 38.299442291259766, "learning_rate": 1.3280508964557162e-05, "loss": 1.2408, "mean_token_accuracy": 0.7737499237060547, "num_input_tokens_seen": 4561330, "num_tokens": 4561330.0, "step": 2060, "train_runtime": 3247.0793, "train_tokens_per_second": 1404.749 }, { "entropy": 2.375571918487549, "epoch": 0.413, "grad_norm": 65.03788757324219, "learning_rate": 1.3249896585005628e-05, "loss": 1.64, "mean_token_accuracy": 0.6999081254005433, "num_input_tokens_seen": 4572010, "num_tokens": 4572010.0, "step": 2065, "train_runtime": 3255.1582, "train_tokens_per_second": 1404.543 }, { "entropy": 2.330407238006592, "epoch": 0.414, "grad_norm": 27.661897659301758, "learning_rate": 1.3219250115576745e-05, "loss": 1.0953, "mean_token_accuracy": 0.7858672976493836, "num_input_tokens_seen": 4580580, "num_tokens": 4580580.0, "step": 2070, "train_runtime": 3263.174, "train_tokens_per_second": 1403.719 }, { "entropy": 2.5630285263061525, "epoch": 0.415, "grad_norm": 61.740177154541016, "learning_rate": 1.3188569877737474e-05, "loss": 1.4936, "mean_token_accuracy": 0.7149037003517151, "num_input_tokens_seen": 4588901, "num_tokens": 4588901.0, "step": 2075, "train_runtime": 3271.1388, "train_tokens_per_second": 1402.845 }, { "entropy": 2.705780839920044, "epoch": 0.416, "grad_norm": 30.25331687927246, "learning_rate": 1.3157856193308988e-05, "loss": 1.386, "mean_token_accuracy": 0.7529369831085205, "num_input_tokens_seen": 4597258, "num_tokens": 4597258.0, "step": 2080, "train_runtime": 3278.6804, "train_tokens_per_second": 1402.167 }, { "entropy": 2.638701343536377, "epoch": 0.417, "grad_norm": 61.31464385986328, "learning_rate": 1.312710938446331e-05, "loss": 1.4616, "mean_token_accuracy": 0.7137163519859314, "num_input_tokens_seen": 4605923, "num_tokens": 4605923.0, "step": 2085, "train_runtime": 3286.3762, "train_tokens_per_second": 1401.52 }, { "entropy": 2.5018579959869385, "epoch": 0.418, "grad_norm": 27.023984909057617, "learning_rate": 1.309632977371991e-05, "loss": 1.1993, "mean_token_accuracy": 0.7722452878952026, "num_input_tokens_seen": 4615944, "num_tokens": 4615944.0, "step": 2090, "train_runtime": 3294.3382, "train_tokens_per_second": 1401.175 }, { "entropy": 2.081637239456177, "epoch": 0.419, "grad_norm": 65.72859191894531, "learning_rate": 1.3065517683942339e-05, "loss": 1.7775, "mean_token_accuracy": 0.6911065459251404, "num_input_tokens_seen": 4624738, "num_tokens": 4624738.0, "step": 2095, "train_runtime": 3302.1493, "train_tokens_per_second": 1400.524 }, { "entropy": 1.965171217918396, "epoch": 0.42, "grad_norm": 24.876333236694336, "learning_rate": 1.3034673438334841e-05, "loss": 1.2575, "mean_token_accuracy": 0.7700017213821411, "num_input_tokens_seen": 4634078, "num_tokens": 4634078.0, "step": 2100, "train_runtime": 3310.0184, "train_tokens_per_second": 1400.016 }, { "entropy": 2.3820566177368163, "epoch": 0.421, "grad_norm": 54.88595962524414, "learning_rate": 1.3003797360438961e-05, "loss": 1.7664, "mean_token_accuracy": 0.6889140725135803, "num_input_tokens_seen": 4646363, "num_tokens": 4646363.0, "step": 2105, "train_runtime": 3317.8769, "train_tokens_per_second": 1400.402 }, { "entropy": 2.259655714035034, "epoch": 0.422, "grad_norm": 26.683488845825195, "learning_rate": 1.297288977413014e-05, "loss": 1.4308, "mean_token_accuracy": 0.7392370939254761, "num_input_tokens_seen": 4657688, "num_tokens": 4657688.0, "step": 2110, "train_runtime": 3325.6304, "train_tokens_per_second": 1400.543 }, { "entropy": 2.6297634124755858, "epoch": 0.423, "grad_norm": 56.07148361206055, "learning_rate": 1.2941951003614337e-05, "loss": 1.8045, "mean_token_accuracy": 0.6821731626987457, "num_input_tokens_seen": 4670980, "num_tokens": 4670980.0, "step": 2115, "train_runtime": 3333.4476, "train_tokens_per_second": 1401.246 }, { "entropy": 2.123594784736633, "epoch": 0.424, "grad_norm": 21.180343627929688, "learning_rate": 1.2910981373424614e-05, "loss": 1.1755, "mean_token_accuracy": 0.7733092665672302, "num_input_tokens_seen": 4682632, "num_tokens": 4682632.0, "step": 2120, "train_runtime": 3341.4828, "train_tokens_per_second": 1401.363 }, { "entropy": 2.932101821899414, "epoch": 0.425, "grad_norm": 73.67310333251953, "learning_rate": 1.2879981208417735e-05, "loss": 1.9308, "mean_token_accuracy": 0.6424931526184082, "num_input_tokens_seen": 4692309, "num_tokens": 4692309.0, "step": 2125, "train_runtime": 3349.3203, "train_tokens_per_second": 1400.973 }, { "entropy": 2.272143316268921, "epoch": 0.426, "grad_norm": 21.83971405029297, "learning_rate": 1.2848950833770764e-05, "loss": 1.5694, "mean_token_accuracy": 0.7182354509830475, "num_input_tokens_seen": 4703196, "num_tokens": 4703196.0, "step": 2130, "train_runtime": 3357.2586, "train_tokens_per_second": 1400.904 }, { "entropy": 2.5904906749725343, "epoch": 0.427, "grad_norm": 95.9944839477539, "learning_rate": 1.2817890574977648e-05, "loss": 1.7986, "mean_token_accuracy": 0.6786379218101501, "num_input_tokens_seen": 4713207, "num_tokens": 4713207.0, "step": 2135, "train_runtime": 3365.0003, "train_tokens_per_second": 1400.656 }, { "entropy": 2.4461758613586424, "epoch": 0.428, "grad_norm": 39.06465148925781, "learning_rate": 1.2786800757845802e-05, "loss": 1.4954, "mean_token_accuracy": 0.741902756690979, "num_input_tokens_seen": 4723156, "num_tokens": 4723156.0, "step": 2140, "train_runtime": 3372.9033, "train_tokens_per_second": 1400.324 }, { "entropy": 2.6978917121887207, "epoch": 0.429, "grad_norm": 64.48131561279297, "learning_rate": 1.2755681708492696e-05, "loss": 1.6479, "mean_token_accuracy": 0.7079470157623291, "num_input_tokens_seen": 4732696, "num_tokens": 4732696.0, "step": 2145, "train_runtime": 3380.4964, "train_tokens_per_second": 1400.0 }, { "entropy": 2.295404243469238, "epoch": 0.43, "grad_norm": 28.21259307861328, "learning_rate": 1.2724533753342433e-05, "loss": 1.4037, "mean_token_accuracy": 0.7403062343597412, "num_input_tokens_seen": 4743952, "num_tokens": 4743952.0, "step": 2150, "train_runtime": 3388.605, "train_tokens_per_second": 1399.972 }, { "entropy": 2.5978235244750976, "epoch": 0.431, "grad_norm": 59.41149139404297, "learning_rate": 1.2693357219122331e-05, "loss": 1.7032, "mean_token_accuracy": 0.6875249981880188, "num_input_tokens_seen": 4755076, "num_tokens": 4755076.0, "step": 2155, "train_runtime": 3396.5812, "train_tokens_per_second": 1399.959 }, { "entropy": 2.402075910568237, "epoch": 0.432, "grad_norm": 26.78700828552246, "learning_rate": 1.266215243285947e-05, "loss": 1.2555, "mean_token_accuracy": 0.7547957181930542, "num_input_tokens_seen": 4766822, "num_tokens": 4766822.0, "step": 2160, "train_runtime": 3404.5212, "train_tokens_per_second": 1400.145 }, { "entropy": 2.479512929916382, "epoch": 0.433, "grad_norm": 64.87886810302734, "learning_rate": 1.2630919721877299e-05, "loss": 1.6785, "mean_token_accuracy": 0.690955913066864, "num_input_tokens_seen": 4777812, "num_tokens": 4777812.0, "step": 2165, "train_runtime": 3412.2257, "train_tokens_per_second": 1400.204 }, { "entropy": 2.3782662868499758, "epoch": 0.434, "grad_norm": 33.69001007080078, "learning_rate": 1.2599659413792176e-05, "loss": 1.4438, "mean_token_accuracy": 0.7445960640907288, "num_input_tokens_seen": 4787046, "num_tokens": 4787046.0, "step": 2170, "train_runtime": 3420.0363, "train_tokens_per_second": 1399.706 }, { "entropy": 2.563517618179321, "epoch": 0.435, "grad_norm": 76.82723999023438, "learning_rate": 1.2568371836509936e-05, "loss": 1.6726, "mean_token_accuracy": 0.6899335861206055, "num_input_tokens_seen": 4797136, "num_tokens": 4797136.0, "step": 2175, "train_runtime": 3427.6812, "train_tokens_per_second": 1399.528 }, { "entropy": 2.482169437408447, "epoch": 0.436, "grad_norm": 25.930770874023438, "learning_rate": 1.2537057318222468e-05, "loss": 1.1162, "mean_token_accuracy": 0.7744334816932679, "num_input_tokens_seen": 4806072, "num_tokens": 4806072.0, "step": 2180, "train_runtime": 3435.4667, "train_tokens_per_second": 1398.958 }, { "entropy": 2.9235743999481203, "epoch": 0.437, "grad_norm": 62.73664093017578, "learning_rate": 1.2505716187404242e-05, "loss": 1.7399, "mean_token_accuracy": 0.6750505089759826, "num_input_tokens_seen": 4815747, "num_tokens": 4815747.0, "step": 2185, "train_runtime": 3443.4844, "train_tokens_per_second": 1398.51 }, { "entropy": 2.4902596950531004, "epoch": 0.438, "grad_norm": 24.94045066833496, "learning_rate": 1.2474348772808897e-05, "loss": 1.2361, "mean_token_accuracy": 0.76468266248703, "num_input_tokens_seen": 4824446, "num_tokens": 4824446.0, "step": 2190, "train_runtime": 3451.2613, "train_tokens_per_second": 1397.879 }, { "entropy": 2.337691831588745, "epoch": 0.439, "grad_norm": 60.62818908691406, "learning_rate": 1.2442955403465768e-05, "loss": 1.5405, "mean_token_accuracy": 0.7064475774765014, "num_input_tokens_seen": 4834853, "num_tokens": 4834853.0, "step": 2195, "train_runtime": 3458.8855, "train_tokens_per_second": 1397.807 }, { "entropy": 2.2915517330169677, "epoch": 0.44, "grad_norm": 27.688276290893555, "learning_rate": 1.2411536408676443e-05, "loss": 1.2537, "mean_token_accuracy": 0.7575583338737488, "num_input_tokens_seen": 4844394, "num_tokens": 4844394.0, "step": 2200, "train_runtime": 3466.8937, "train_tokens_per_second": 1397.33 }, { "entropy": 2.7645617961883544, "epoch": 0.441, "grad_norm": 63.701656341552734, "learning_rate": 1.238009211801131e-05, "loss": 1.5341, "mean_token_accuracy": 0.7056471943855286, "num_input_tokens_seen": 4854105, "num_tokens": 4854105.0, "step": 2205, "train_runtime": 3474.5705, "train_tokens_per_second": 1397.037 }, { "entropy": 2.8495275020599364, "epoch": 0.442, "grad_norm": 28.545921325683594, "learning_rate": 1.23486228613061e-05, "loss": 1.161, "mean_token_accuracy": 0.7700004339218139, "num_input_tokens_seen": 4862960, "num_tokens": 4862960.0, "step": 2210, "train_runtime": 3482.5751, "train_tokens_per_second": 1396.369 }, { "entropy": 3.010589599609375, "epoch": 0.443, "grad_norm": 66.53335571289062, "learning_rate": 1.2317128968658424e-05, "loss": 1.4513, "mean_token_accuracy": 0.7317088127136231, "num_input_tokens_seen": 4872397, "num_tokens": 4872397.0, "step": 2215, "train_runtime": 3490.4423, "train_tokens_per_second": 1395.925 }, { "entropy": 2.869585466384888, "epoch": 0.444, "grad_norm": 39.84257507324219, "learning_rate": 1.2285610770424311e-05, "loss": 1.3837, "mean_token_accuracy": 0.7556891441345215, "num_input_tokens_seen": 4881104, "num_tokens": 4881104.0, "step": 2220, "train_runtime": 3497.8673, "train_tokens_per_second": 1395.451 }, { "entropy": 2.7492702484130858, "epoch": 0.445, "grad_norm": 67.31108856201172, "learning_rate": 1.225406859721475e-05, "loss": 1.6301, "mean_token_accuracy": 0.6953318834304809, "num_input_tokens_seen": 4890187, "num_tokens": 4890187.0, "step": 2225, "train_runtime": 3505.744, "train_tokens_per_second": 1394.907 }, { "entropy": 2.4935908555984496, "epoch": 0.446, "grad_norm": 26.493114471435547, "learning_rate": 1.222250277989221e-05, "loss": 1.1222, "mean_token_accuracy": 0.7825849652290344, "num_input_tokens_seen": 4898914, "num_tokens": 4898914.0, "step": 2230, "train_runtime": 3513.6414, "train_tokens_per_second": 1394.256 }, { "entropy": 2.4573601722717284, "epoch": 0.447, "grad_norm": 69.47283172607422, "learning_rate": 1.2190913649567185e-05, "loss": 1.5488, "mean_token_accuracy": 0.720537793636322, "num_input_tokens_seen": 4906945, "num_tokens": 4906945.0, "step": 2235, "train_runtime": 3521.1749, "train_tokens_per_second": 1393.553 }, { "entropy": 2.5373757600784304, "epoch": 0.448, "grad_norm": 28.997455596923828, "learning_rate": 1.2159301537594691e-05, "loss": 1.1585, "mean_token_accuracy": 0.7745639562606812, "num_input_tokens_seen": 4916710, "num_tokens": 4916710.0, "step": 2240, "train_runtime": 3529.0999, "train_tokens_per_second": 1393.191 }, { "entropy": 2.82493896484375, "epoch": 0.449, "grad_norm": 78.38630676269531, "learning_rate": 1.2127666775570837e-05, "loss": 1.6532, "mean_token_accuracy": 0.7043864607810975, "num_input_tokens_seen": 4925767, "num_tokens": 4925767.0, "step": 2245, "train_runtime": 3536.979, "train_tokens_per_second": 1392.648 }, { "entropy": 2.5786003351211546, "epoch": 0.45, "grad_norm": 25.663379669189453, "learning_rate": 1.2096009695329298e-05, "loss": 1.2167, "mean_token_accuracy": 0.7712558031082153, "num_input_tokens_seen": 4934962, "num_tokens": 4934962.0, "step": 2250, "train_runtime": 3544.6356, "train_tokens_per_second": 1392.234 }, { "entropy": 2.7805480003356933, "epoch": 0.451, "grad_norm": 62.76482391357422, "learning_rate": 1.206433062893787e-05, "loss": 1.4777, "mean_token_accuracy": 0.7103058695793152, "num_input_tokens_seen": 4943049, "num_tokens": 4943049.0, "step": 2255, "train_runtime": 3552.1177, "train_tokens_per_second": 1391.578 }, { "entropy": 2.498188877105713, "epoch": 0.452, "grad_norm": 55.635433197021484, "learning_rate": 1.2032629908694969e-05, "loss": 1.2021, "mean_token_accuracy": 0.7681819438934326, "num_input_tokens_seen": 4954602, "num_tokens": 4954602.0, "step": 2260, "train_runtime": 3560.006, "train_tokens_per_second": 1391.74 }, { "entropy": 2.428882026672363, "epoch": 0.453, "grad_norm": 71.52632141113281, "learning_rate": 1.200090786712615e-05, "loss": 1.8435, "mean_token_accuracy": 0.6581845641136169, "num_input_tokens_seen": 4964403, "num_tokens": 4964403.0, "step": 2265, "train_runtime": 3567.5542, "train_tokens_per_second": 1391.542 }, { "entropy": 2.32411642074585, "epoch": 0.454, "grad_norm": 27.487852096557617, "learning_rate": 1.1969164836980618e-05, "loss": 1.3404, "mean_token_accuracy": 0.749431049823761, "num_input_tokens_seen": 4975196, "num_tokens": 4975196.0, "step": 2270, "train_runtime": 3575.3646, "train_tokens_per_second": 1391.521 }, { "entropy": 2.979253816604614, "epoch": 0.455, "grad_norm": 68.48418426513672, "learning_rate": 1.193740115122774e-05, "loss": 1.7784, "mean_token_accuracy": 0.6793139576911926, "num_input_tokens_seen": 4988675, "num_tokens": 4988675.0, "step": 2275, "train_runtime": 3583.4146, "train_tokens_per_second": 1392.157 }, { "entropy": 2.514218473434448, "epoch": 0.456, "grad_norm": 22.849912643432617, "learning_rate": 1.190561714305355e-05, "loss": 1.1799, "mean_token_accuracy": 0.7718681454658508, "num_input_tokens_seen": 5000038, "num_tokens": 5000038.0, "step": 2280, "train_runtime": 3590.8873, "train_tokens_per_second": 1392.424 }, { "entropy": 2.6696307182312013, "epoch": 0.457, "grad_norm": 76.13416290283203, "learning_rate": 1.187381314585725e-05, "loss": 1.8112, "mean_token_accuracy": 0.6729209780693054, "num_input_tokens_seen": 5011585, "num_tokens": 5011585.0, "step": 2285, "train_runtime": 3598.7857, "train_tokens_per_second": 1392.577 }, { "entropy": 2.1659085273742678, "epoch": 0.458, "grad_norm": 24.351327896118164, "learning_rate": 1.184198949324772e-05, "loss": 1.2017, "mean_token_accuracy": 0.7709569334983826, "num_input_tokens_seen": 5021078, "num_tokens": 5021078.0, "step": 2290, "train_runtime": 3606.5352, "train_tokens_per_second": 1392.217 }, { "entropy": 2.2089401960372923, "epoch": 0.459, "grad_norm": 66.46270751953125, "learning_rate": 1.1810146519040023e-05, "loss": 1.9148, "mean_token_accuracy": 0.668359911441803, "num_input_tokens_seen": 5032017, "num_tokens": 5032017.0, "step": 2295, "train_runtime": 3614.1085, "train_tokens_per_second": 1392.326 }, { "entropy": 2.102683186531067, "epoch": 0.46, "grad_norm": 28.139602661132812, "learning_rate": 1.1778284557251887e-05, "loss": 1.4473, "mean_token_accuracy": 0.7397825002670289, "num_input_tokens_seen": 5043396, "num_tokens": 5043396.0, "step": 2300, "train_runtime": 3621.809, "train_tokens_per_second": 1392.507 }, { "entropy": 2.2975786685943604, "epoch": 0.461, "grad_norm": 64.07698822021484, "learning_rate": 1.1746403942100215e-05, "loss": 1.713, "mean_token_accuracy": 0.6901416420936585, "num_input_tokens_seen": 5054717, "num_tokens": 5054717.0, "step": 2305, "train_runtime": 3629.6204, "train_tokens_per_second": 1392.63 }, { "entropy": 1.8617132663726808, "epoch": 0.462, "grad_norm": 23.054637908935547, "learning_rate": 1.1714505007997576e-05, "loss": 0.9983, "mean_token_accuracy": 0.7961058735847473, "num_input_tokens_seen": 5064058, "num_tokens": 5064058.0, "step": 2310, "train_runtime": 3637.2725, "train_tokens_per_second": 1392.268 }, { "entropy": 1.8679781436920166, "epoch": 0.463, "grad_norm": 68.77053833007812, "learning_rate": 1.1682588089548692e-05, "loss": 1.6293, "mean_token_accuracy": 0.6980883717536926, "num_input_tokens_seen": 5075420, "num_tokens": 5075420.0, "step": 2315, "train_runtime": 3645.2003, "train_tokens_per_second": 1392.357 }, { "entropy": 1.9279284477233887, "epoch": 0.464, "grad_norm": 22.324859619140625, "learning_rate": 1.1650653521546937e-05, "loss": 1.3264, "mean_token_accuracy": 0.7469206213951111, "num_input_tokens_seen": 5086040, "num_tokens": 5086040.0, "step": 2320, "train_runtime": 3652.8546, "train_tokens_per_second": 1392.347 }, { "entropy": 2.027264356613159, "epoch": 0.465, "grad_norm": 74.19609832763672, "learning_rate": 1.1618701638970815e-05, "loss": 1.7442, "mean_token_accuracy": 0.6801894903182983, "num_input_tokens_seen": 5095384, "num_tokens": 5095384.0, "step": 2325, "train_runtime": 3660.8571, "train_tokens_per_second": 1391.855 }, { "entropy": 2.0260194301605225, "epoch": 0.466, "grad_norm": 32.923797607421875, "learning_rate": 1.1586732776980456e-05, "loss": 1.3114, "mean_token_accuracy": 0.7560211539268493, "num_input_tokens_seen": 5105304, "num_tokens": 5105304.0, "step": 2330, "train_runtime": 3668.742, "train_tokens_per_second": 1391.568 }, { "entropy": 2.1154204607009888, "epoch": 0.467, "grad_norm": 76.26025390625, "learning_rate": 1.1554747270914098e-05, "loss": 1.5519, "mean_token_accuracy": 0.7048524379730224, "num_input_tokens_seen": 5113894, "num_tokens": 5113894.0, "step": 2335, "train_runtime": 3676.664, "train_tokens_per_second": 1390.906 }, { "entropy": 2.2630292892456056, "epoch": 0.468, "grad_norm": 31.031639099121094, "learning_rate": 1.1522745456284557e-05, "loss": 1.2818, "mean_token_accuracy": 0.7738163709640503, "num_input_tokens_seen": 5122798, "num_tokens": 5122798.0, "step": 2340, "train_runtime": 3684.3793, "train_tokens_per_second": 1390.41 }, { "entropy": 2.3498332500457764, "epoch": 0.469, "grad_norm": 75.29490661621094, "learning_rate": 1.1490727668775735e-05, "loss": 1.6929, "mean_token_accuracy": 0.6886336028575897, "num_input_tokens_seen": 5132088, "num_tokens": 5132088.0, "step": 2345, "train_runtime": 3692.4321, "train_tokens_per_second": 1389.894 }, { "entropy": 2.4634775638580324, "epoch": 0.47, "grad_norm": 24.490215301513672, "learning_rate": 1.1458694244239067e-05, "loss": 1.2958, "mean_token_accuracy": 0.7619746923446655, "num_input_tokens_seen": 5141270, "num_tokens": 5141270.0, "step": 2350, "train_runtime": 3700.4203, "train_tokens_per_second": 1389.375 }, { "entropy": 2.319100761413574, "epoch": 0.471, "grad_norm": 80.76954650878906, "learning_rate": 1.1426645518690015e-05, "loss": 1.734, "mean_token_accuracy": 0.6861746549606323, "num_input_tokens_seen": 5150455, "num_tokens": 5150455.0, "step": 2355, "train_runtime": 3707.9168, "train_tokens_per_second": 1389.043 }, { "entropy": 2.3384434938430787, "epoch": 0.472, "grad_norm": 32.591854095458984, "learning_rate": 1.1394581828304555e-05, "loss": 1.2432, "mean_token_accuracy": 0.7643887042999268, "num_input_tokens_seen": 5158022, "num_tokens": 5158022.0, "step": 2360, "train_runtime": 3715.7369, "train_tokens_per_second": 1388.156 }, { "entropy": 2.628079652786255, "epoch": 0.473, "grad_norm": 85.22615051269531, "learning_rate": 1.136250350941562e-05, "loss": 1.7953, "mean_token_accuracy": 0.6756073594093323, "num_input_tokens_seen": 5167011, "num_tokens": 5167011.0, "step": 2365, "train_runtime": 3723.7958, "train_tokens_per_second": 1387.566 }, { "entropy": 2.5341138362884523, "epoch": 0.474, "grad_norm": 29.02531623840332, "learning_rate": 1.1330410898509594e-05, "loss": 1.2685, "mean_token_accuracy": 0.7671826124191284, "num_input_tokens_seen": 5175280, "num_tokens": 5175280.0, "step": 2370, "train_runtime": 3731.4012, "train_tokens_per_second": 1386.954 }, { "entropy": 2.271046447753906, "epoch": 0.475, "grad_norm": 75.07585906982422, "learning_rate": 1.129830433222278e-05, "loss": 1.861, "mean_token_accuracy": 0.6692618727684021, "num_input_tokens_seen": 5184410, "num_tokens": 5184410.0, "step": 2375, "train_runtime": 3739.3303, "train_tokens_per_second": 1386.454 }, { "entropy": 2.1791377305984496, "epoch": 0.476, "grad_norm": 31.2888126373291, "learning_rate": 1.1266184147337873e-05, "loss": 1.2234, "mean_token_accuracy": 0.7568565011024475, "num_input_tokens_seen": 5194040, "num_tokens": 5194040.0, "step": 2380, "train_runtime": 3747.03, "train_tokens_per_second": 1386.175 }, { "entropy": 2.4959903717041017, "epoch": 0.477, "grad_norm": 66.00300598144531, "learning_rate": 1.1234050680780407e-05, "loss": 1.6588, "mean_token_accuracy": 0.696483850479126, "num_input_tokens_seen": 5203532, "num_tokens": 5203532.0, "step": 2385, "train_runtime": 3754.9904, "train_tokens_per_second": 1385.764 }, { "entropy": 2.3486272573471068, "epoch": 0.478, "grad_norm": 29.015193939208984, "learning_rate": 1.1201904269615242e-05, "loss": 1.3185, "mean_token_accuracy": 0.7605429053306579, "num_input_tokens_seen": 5212816, "num_tokens": 5212816.0, "step": 2390, "train_runtime": 3763.0306, "train_tokens_per_second": 1385.271 }, { "entropy": 2.2502779960632324, "epoch": 0.479, "grad_norm": 85.54402923583984, "learning_rate": 1.116974525104302e-05, "loss": 1.5306, "mean_token_accuracy": 0.718666410446167, "num_input_tokens_seen": 5221320, "num_tokens": 5221320.0, "step": 2395, "train_runtime": 3770.7591, "train_tokens_per_second": 1384.687 }, { "entropy": 2.1846177101135256, "epoch": 0.48, "grad_norm": 29.63794708251953, "learning_rate": 1.113757396239663e-05, "loss": 1.2445, "mean_token_accuracy": 0.7686284780502319, "num_input_tokens_seen": 5230022, "num_tokens": 5230022.0, "step": 2400, "train_runtime": 3778.8094, "train_tokens_per_second": 1384.04 }, { "entropy": 2.394475483894348, "epoch": 0.481, "grad_norm": 70.5522232055664, "learning_rate": 1.110539074113766e-05, "loss": 1.6196, "mean_token_accuracy": 0.706342589855194, "num_input_tokens_seen": 5239086, "num_tokens": 5239086.0, "step": 2405, "train_runtime": 3786.7398, "train_tokens_per_second": 1383.535 }, { "entropy": 2.313472366333008, "epoch": 0.482, "grad_norm": 27.1429386138916, "learning_rate": 1.1073195924852882e-05, "loss": 1.1491, "mean_token_accuracy": 0.7716314315795898, "num_input_tokens_seen": 5248070, "num_tokens": 5248070.0, "step": 2410, "train_runtime": 3794.2955, "train_tokens_per_second": 1383.147 }, { "entropy": 2.039521503448486, "epoch": 0.483, "grad_norm": 74.07726287841797, "learning_rate": 1.1040989851250678e-05, "loss": 1.5477, "mean_token_accuracy": 0.7035903334617615, "num_input_tokens_seen": 5257381, "num_tokens": 5257381.0, "step": 2415, "train_runtime": 3802.1527, "train_tokens_per_second": 1382.738 }, { "entropy": 1.8080446004867554, "epoch": 0.484, "grad_norm": 30.70682144165039, "learning_rate": 1.1008772858157524e-05, "loss": 1.3664, "mean_token_accuracy": 0.7516297578811646, "num_input_tokens_seen": 5265490, "num_tokens": 5265490.0, "step": 2420, "train_runtime": 3809.9037, "train_tokens_per_second": 1382.053 }, { "entropy": 2.603155279159546, "epoch": 0.485, "grad_norm": 73.88636779785156, "learning_rate": 1.097654528351443e-05, "loss": 2.0935, "mean_token_accuracy": 0.6405226349830627, "num_input_tokens_seen": 5278476, "num_tokens": 5278476.0, "step": 2425, "train_runtime": 3817.7616, "train_tokens_per_second": 1382.61 }, { "entropy": 2.1003849029541017, "epoch": 0.486, "grad_norm": 38.27060317993164, "learning_rate": 1.0944307465373405e-05, "loss": 1.55, "mean_token_accuracy": 0.7145233988761902, "num_input_tokens_seen": 5292594, "num_tokens": 5292594.0, "step": 2430, "train_runtime": 3825.8637, "train_tokens_per_second": 1383.372 }, { "entropy": 2.1917747974395754, "epoch": 0.487, "grad_norm": 63.65741729736328, "learning_rate": 1.0912059741893908e-05, "loss": 1.7866, "mean_token_accuracy": 0.6764948725700378, "num_input_tokens_seen": 5304842, "num_tokens": 5304842.0, "step": 2435, "train_runtime": 3833.4189, "train_tokens_per_second": 1383.841 }, { "entropy": 2.2929168224334715, "epoch": 0.488, "grad_norm": 32.77652359008789, "learning_rate": 1.0879802451339298e-05, "loss": 1.3753, "mean_token_accuracy": 0.7458914279937744, "num_input_tokens_seen": 5317460, "num_tokens": 5317460.0, "step": 2440, "train_runtime": 3841.0846, "train_tokens_per_second": 1384.364 }, { "entropy": 2.3096675872802734, "epoch": 0.489, "grad_norm": 67.15498352050781, "learning_rate": 1.0847535932073288e-05, "loss": 1.8752, "mean_token_accuracy": 0.6722553133964538, "num_input_tokens_seen": 5329013, "num_tokens": 5329013.0, "step": 2445, "train_runtime": 3848.9474, "train_tokens_per_second": 1384.538 }, { "entropy": 2.256050872802734, "epoch": 0.49, "grad_norm": 36.92278289794922, "learning_rate": 1.0815260522556394e-05, "loss": 1.4008, "mean_token_accuracy": 0.7438294291496277, "num_input_tokens_seen": 5340320, "num_tokens": 5340320.0, "step": 2450, "train_runtime": 3856.6524, "train_tokens_per_second": 1384.703 }, { "entropy": 2.524534559249878, "epoch": 0.491, "grad_norm": 67.93724060058594, "learning_rate": 1.0782976561342398e-05, "loss": 1.4704, "mean_token_accuracy": 0.7066685438156128, "num_input_tokens_seen": 5352624, "num_tokens": 5352624.0, "step": 2455, "train_runtime": 3864.3586, "train_tokens_per_second": 1385.126 }, { "entropy": 2.5060646533966064, "epoch": 0.492, "grad_norm": 26.436161041259766, "learning_rate": 1.075068438707477e-05, "loss": 1.2244, "mean_token_accuracy": 0.7683283925056458, "num_input_tokens_seen": 5362142, "num_tokens": 5362142.0, "step": 2460, "train_runtime": 3872.2699, "train_tokens_per_second": 1384.754 }, { "entropy": 1.5255397081375122, "epoch": 0.493, "grad_norm": 61.322486877441406, "learning_rate": 1.0718384338483141e-05, "loss": 1.4482, "mean_token_accuracy": 0.7277140736579895, "num_input_tokens_seen": 5371346, "num_tokens": 5371346.0, "step": 2465, "train_runtime": 3879.8039, "train_tokens_per_second": 1384.437 }, { "entropy": 1.8891954660415649, "epoch": 0.494, "grad_norm": 39.74687957763672, "learning_rate": 1.0686076754379734e-05, "loss": 1.5489, "mean_token_accuracy": 0.7149614453315735, "num_input_tokens_seen": 5382042, "num_tokens": 5382042.0, "step": 2470, "train_runtime": 3887.5091, "train_tokens_per_second": 1384.445 }, { "entropy": 1.9988011360168456, "epoch": 0.495, "grad_norm": 74.68470764160156, "learning_rate": 1.0653761973655819e-05, "loss": 1.7294, "mean_token_accuracy": 0.6867223381996155, "num_input_tokens_seen": 5393500, "num_tokens": 5393500.0, "step": 2475, "train_runtime": 3895.1992, "train_tokens_per_second": 1384.653 }, { "entropy": 1.7847257614135743, "epoch": 0.496, "grad_norm": 34.90618133544922, "learning_rate": 1.0621440335278152e-05, "loss": 1.1939, "mean_token_accuracy": 0.7667779922485352, "num_input_tokens_seen": 5400962, "num_tokens": 5400962.0, "step": 2480, "train_runtime": 3902.7445, "train_tokens_per_second": 1383.888 }, { "entropy": 1.8722482681274415, "epoch": 0.497, "grad_norm": 78.09188842773438, "learning_rate": 1.0589112178285432e-05, "loss": 1.5192, "mean_token_accuracy": 0.7083927989006042, "num_input_tokens_seen": 5409833, "num_tokens": 5409833.0, "step": 2485, "train_runtime": 3910.171, "train_tokens_per_second": 1383.528 }, { "entropy": 1.7733655452728272, "epoch": 0.498, "grad_norm": 35.28709411621094, "learning_rate": 1.0556777841784725e-05, "loss": 1.1774, "mean_token_accuracy": 0.7677849292755127, "num_input_tokens_seen": 5420356, "num_tokens": 5420356.0, "step": 2490, "train_runtime": 3917.9915, "train_tokens_per_second": 1383.453 }, { "entropy": 2.413319444656372, "epoch": 0.499, "grad_norm": 74.17864990234375, "learning_rate": 1.0524437664947918e-05, "loss": 1.7066, "mean_token_accuracy": 0.683707857131958, "num_input_tokens_seen": 5431329, "num_tokens": 5431329.0, "step": 2495, "train_runtime": 3925.4987, "train_tokens_per_second": 1383.602 }, { "entropy": 2.02371826171875, "epoch": 0.5, "grad_norm": 35.09919357299805, "learning_rate": 1.0492091987008167e-05, "loss": 1.369, "mean_token_accuracy": 0.7511448740959168, "num_input_tokens_seen": 5443422, "num_tokens": 5443422.0, "step": 2500, "train_runtime": 3933.3239, "train_tokens_per_second": 1383.924 }, { "entropy": 2.104692506790161, "epoch": 0.501, "grad_norm": 68.4740982055664, "learning_rate": 1.0459741147256325e-05, "loss": 1.5156, "mean_token_accuracy": 0.7185858845710754, "num_input_tokens_seen": 5453670, "num_tokens": 5453670.0, "step": 2505, "train_runtime": 3941.0404, "train_tokens_per_second": 1383.815 }, { "entropy": 2.5016586780548096, "epoch": 0.502, "grad_norm": 48.510379791259766, "learning_rate": 1.0427385485037398e-05, "loss": 1.4529, "mean_token_accuracy": 0.7408476829528808, "num_input_tokens_seen": 5464564, "num_tokens": 5464564.0, "step": 2510, "train_runtime": 3948.6536, "train_tokens_per_second": 1383.906 }, { "entropy": 2.6814270496368406, "epoch": 0.503, "grad_norm": 73.85176849365234, "learning_rate": 1.0395025339746965e-05, "loss": 1.8179, "mean_token_accuracy": 0.6835158348083497, "num_input_tokens_seen": 5476777, "num_tokens": 5476777.0, "step": 2515, "train_runtime": 3956.2966, "train_tokens_per_second": 1384.319 }, { "entropy": 1.75837185382843, "epoch": 0.504, "grad_norm": 36.144683837890625, "learning_rate": 1.0362661050827643e-05, "loss": 1.2502, "mean_token_accuracy": 0.7692520976066589, "num_input_tokens_seen": 5489848, "num_tokens": 5489848.0, "step": 2520, "train_runtime": 3964.2761, "train_tokens_per_second": 1384.83 }, { "entropy": 1.8274819612503053, "epoch": 0.505, "grad_norm": 86.38107299804688, "learning_rate": 1.0330292957765502e-05, "loss": 1.6829, "mean_token_accuracy": 0.7023744821548462, "num_input_tokens_seen": 5501783, "num_tokens": 5501783.0, "step": 2525, "train_runtime": 3971.8884, "train_tokens_per_second": 1385.181 }, { "entropy": 1.8927310466766358, "epoch": 0.506, "grad_norm": 30.056936264038086, "learning_rate": 1.0297921400086528e-05, "loss": 1.2869, "mean_token_accuracy": 0.7498111128807068, "num_input_tokens_seen": 5511894, "num_tokens": 5511894.0, "step": 2530, "train_runtime": 3979.6922, "train_tokens_per_second": 1385.005 }, { "entropy": 1.9326712846755982, "epoch": 0.507, "grad_norm": 69.70152282714844, "learning_rate": 1.0265546717353041e-05, "loss": 1.7408, "mean_token_accuracy": 0.6896507740020752, "num_input_tokens_seen": 5524722, "num_tokens": 5524722.0, "step": 2535, "train_runtime": 3987.5988, "train_tokens_per_second": 1385.476 }, { "entropy": 1.635087537765503, "epoch": 0.508, "grad_norm": 55.6661376953125, "learning_rate": 1.0233169249160145e-05, "loss": 1.2363, "mean_token_accuracy": 0.7682471752166748, "num_input_tokens_seen": 5536170, "num_tokens": 5536170.0, "step": 2540, "train_runtime": 3995.2156, "train_tokens_per_second": 1385.7 }, { "entropy": 1.6090543508529662, "epoch": 0.509, "grad_norm": 65.3450698852539, "learning_rate": 1.0200789335132157e-05, "loss": 1.4682, "mean_token_accuracy": 0.7170939803123474, "num_input_tokens_seen": 5543591, "num_tokens": 5543591.0, "step": 2545, "train_runtime": 4002.8278, "train_tokens_per_second": 1384.919 }, { "entropy": 1.296734356880188, "epoch": 0.51, "grad_norm": 24.84371566772461, "learning_rate": 1.0168407314919057e-05, "loss": 0.986, "mean_token_accuracy": 0.792439603805542, "num_input_tokens_seen": 5551322, "num_tokens": 5551322.0, "step": 2550, "train_runtime": 4010.4425, "train_tokens_per_second": 1384.217 }, { "entropy": 1.7663912296295166, "epoch": 0.511, "grad_norm": 64.37688446044922, "learning_rate": 1.013602352819291e-05, "loss": 1.5933, "mean_token_accuracy": 0.7116834282875061, "num_input_tokens_seen": 5561829, "num_tokens": 5561829.0, "step": 2555, "train_runtime": 4018.1372, "train_tokens_per_second": 1384.181 }, { "entropy": 1.306897807121277, "epoch": 0.512, "grad_norm": 25.88380241394043, "learning_rate": 1.0103638314644322e-05, "loss": 1.0418, "mean_token_accuracy": 0.7948366641998291, "num_input_tokens_seen": 5570138, "num_tokens": 5570138.0, "step": 2560, "train_runtime": 4025.7195, "train_tokens_per_second": 1383.638 }, { "entropy": 1.5962664604187011, "epoch": 0.513, "grad_norm": 126.559814453125, "learning_rate": 1.0071252013978852e-05, "loss": 1.4954, "mean_token_accuracy": 0.7237534284591675, "num_input_tokens_seen": 5579271, "num_tokens": 5579271.0, "step": 2565, "train_runtime": 4033.3324, "train_tokens_per_second": 1383.291 }, { "entropy": 1.5552825450897216, "epoch": 0.514, "grad_norm": 33.184959411621094, "learning_rate": 1.0038864965913469e-05, "loss": 1.5116, "mean_token_accuracy": 0.730481481552124, "num_input_tokens_seen": 5592780, "num_tokens": 5592780.0, "step": 2570, "train_runtime": 4040.9267, "train_tokens_per_second": 1384.034 }, { "entropy": 1.6527009725570678, "epoch": 0.515, "grad_norm": 77.59932708740234, "learning_rate": 1.0006477510172984e-05, "loss": 2.2188, "mean_token_accuracy": 0.6179170370101928, "num_input_tokens_seen": 5605735, "num_tokens": 5605735.0, "step": 2575, "train_runtime": 4049.0013, "train_tokens_per_second": 1384.473 }, { "entropy": 1.1595689535140992, "epoch": 0.516, "grad_norm": 28.392168045043945, "learning_rate": 9.974089986486488e-06, "loss": 1.0121, "mean_token_accuracy": 0.7915455341339112, "num_input_tokens_seen": 5615068, "num_tokens": 5615068.0, "step": 2580, "train_runtime": 4056.6635, "train_tokens_per_second": 1384.159 }, { "entropy": 1.362221074104309, "epoch": 0.517, "grad_norm": 75.6092758178711, "learning_rate": 9.941702734583771e-06, "loss": 1.614, "mean_token_accuracy": 0.6984987616539001, "num_input_tokens_seen": 5628158, "num_tokens": 5628158.0, "step": 2585, "train_runtime": 4064.4433, "train_tokens_per_second": 1384.73 }, { "entropy": 1.8262669563293457, "epoch": 0.518, "grad_norm": 41.54428482055664, "learning_rate": 9.90931609419178e-06, "loss": 1.7223, "mean_token_accuracy": 0.6897306799888611, "num_input_tokens_seen": 5641046, "num_tokens": 5641046.0, "step": 2590, "train_runtime": 4072.2719, "train_tokens_per_second": 1385.233 }, { "entropy": 1.7917248725891113, "epoch": 0.519, "grad_norm": 66.194091796875, "learning_rate": 9.876930405031047e-06, "loss": 1.5113, "mean_token_accuracy": 0.7243991613388061, "num_input_tokens_seen": 5650414, "num_tokens": 5650414.0, "step": 2595, "train_runtime": 4080.0088, "train_tokens_per_second": 1384.902 }, { "entropy": 1.6726043939590454, "epoch": 0.52, "grad_norm": 27.1508846282959, "learning_rate": 9.844546006812135e-06, "loss": 1.2843, "mean_token_accuracy": 0.7559401750564575, "num_input_tokens_seen": 5661766, "num_tokens": 5661766.0, "step": 2600, "train_runtime": 4087.5912, "train_tokens_per_second": 1385.111 }, { "entropy": 1.5938451766967774, "epoch": 0.521, "grad_norm": 81.3996810913086, "learning_rate": 9.812163239232051e-06, "loss": 1.6855, "mean_token_accuracy": 0.692830216884613, "num_input_tokens_seen": 5671394, "num_tokens": 5671394.0, "step": 2605, "train_runtime": 4095.3357, "train_tokens_per_second": 1384.842 }, { "entropy": 1.924572229385376, "epoch": 0.522, "grad_norm": 41.10609817504883, "learning_rate": 9.779782441970702e-06, "loss": 1.5987, "mean_token_accuracy": 0.7124988794326782, "num_input_tokens_seen": 5685248, "num_tokens": 5685248.0, "step": 2610, "train_runtime": 4103.4494, "train_tokens_per_second": 1385.48 }, { "entropy": 1.7251343965530395, "epoch": 0.523, "grad_norm": 69.05562591552734, "learning_rate": 9.747403954687334e-06, "loss": 1.5352, "mean_token_accuracy": 0.713352620601654, "num_input_tokens_seen": 5695779, "num_tokens": 5695779.0, "step": 2615, "train_runtime": 4110.9304, "train_tokens_per_second": 1385.521 }, { "entropy": 1.3788309574127198, "epoch": 0.524, "grad_norm": 34.040401458740234, "learning_rate": 9.715028117016955e-06, "loss": 1.368, "mean_token_accuracy": 0.7488721370697021, "num_input_tokens_seen": 5708906, "num_tokens": 5708906.0, "step": 2620, "train_runtime": 4118.8666, "train_tokens_per_second": 1386.038 }, { "entropy": 1.6445790529251099, "epoch": 0.525, "grad_norm": 69.4870376586914, "learning_rate": 9.682655268566783e-06, "loss": 1.6114, "mean_token_accuracy": 0.6951205968856812, "num_input_tokens_seen": 5719142, "num_tokens": 5719142.0, "step": 2625, "train_runtime": 4126.7833, "train_tokens_per_second": 1385.86 }, { "entropy": 1.4327710151672364, "epoch": 0.526, "grad_norm": 28.79164695739746, "learning_rate": 9.650285748912678e-06, "loss": 1.1569, "mean_token_accuracy": 0.7832456588745117, "num_input_tokens_seen": 5728336, "num_tokens": 5728336.0, "step": 2630, "train_runtime": 4134.1435, "train_tokens_per_second": 1385.616 }, { "entropy": 1.5064574241638184, "epoch": 0.527, "grad_norm": 72.98043823242188, "learning_rate": 9.617919897595586e-06, "loss": 1.4193, "mean_token_accuracy": 0.71767098903656, "num_input_tokens_seen": 5737199, "num_tokens": 5737199.0, "step": 2635, "train_runtime": 4141.777, "train_tokens_per_second": 1385.202 }, { "entropy": 1.2398569107055664, "epoch": 0.528, "grad_norm": 25.86004638671875, "learning_rate": 9.58555805411797e-06, "loss": 1.1384, "mean_token_accuracy": 0.7713195443153381, "num_input_tokens_seen": 5749632, "num_tokens": 5749632.0, "step": 2640, "train_runtime": 4149.5014, "train_tokens_per_second": 1385.62 }, { "entropy": 1.581679916381836, "epoch": 0.529, "grad_norm": 73.84771728515625, "learning_rate": 9.553200557940254e-06, "loss": 1.6106, "mean_token_accuracy": 0.6985647320747376, "num_input_tokens_seen": 5757819, "num_tokens": 5757819.0, "step": 2645, "train_runtime": 4157.2669, "train_tokens_per_second": 1385.001 }, { "entropy": 1.778089952468872, "epoch": 0.53, "grad_norm": 41.22148132324219, "learning_rate": 9.520847748477266e-06, "loss": 1.3493, "mean_token_accuracy": 0.752408790588379, "num_input_tokens_seen": 5769558, "num_tokens": 5769558.0, "step": 2650, "train_runtime": 4165.2315, "train_tokens_per_second": 1385.171 }, { "entropy": 1.8056242704391479, "epoch": 0.531, "grad_norm": 71.35599517822266, "learning_rate": 9.488499965094664e-06, "loss": 1.522, "mean_token_accuracy": 0.7116188764572143, "num_input_tokens_seen": 5779462, "num_tokens": 5779462.0, "step": 2655, "train_runtime": 4172.7311, "train_tokens_per_second": 1385.055 }, { "entropy": 1.6065031290054321, "epoch": 0.532, "grad_norm": 31.514812469482422, "learning_rate": 9.45615754710539e-06, "loss": 1.1964, "mean_token_accuracy": 0.7695648074150085, "num_input_tokens_seen": 5790166, "num_tokens": 5790166.0, "step": 2660, "train_runtime": 4180.7181, "train_tokens_per_second": 1384.969 }, { "entropy": 1.810375714302063, "epoch": 0.533, "grad_norm": 74.2752685546875, "learning_rate": 9.423820833766108e-06, "loss": 1.7321, "mean_token_accuracy": 0.6889849662780761, "num_input_tokens_seen": 5798705, "num_tokens": 5798705.0, "step": 2665, "train_runtime": 4188.4723, "train_tokens_per_second": 1384.444 }, { "entropy": 1.616666579246521, "epoch": 0.534, "grad_norm": 30.81418228149414, "learning_rate": 9.391490164273635e-06, "loss": 1.1184, "mean_token_accuracy": 0.7803215146064758, "num_input_tokens_seen": 5807296, "num_tokens": 5807296.0, "step": 2670, "train_runtime": 4195.8773, "train_tokens_per_second": 1384.048 }, { "entropy": 2.1334364891052244, "epoch": 0.535, "grad_norm": 68.137939453125, "learning_rate": 9.359165877761396e-06, "loss": 1.8558, "mean_token_accuracy": 0.6736525416374206, "num_input_tokens_seen": 5818329, "num_tokens": 5818329.0, "step": 2675, "train_runtime": 4203.8445, "train_tokens_per_second": 1384.05 }, { "entropy": 2.012607550621033, "epoch": 0.536, "grad_norm": 33.1670036315918, "learning_rate": 9.32684831329586e-06, "loss": 1.3302, "mean_token_accuracy": 0.7508126258850097, "num_input_tokens_seen": 5827194, "num_tokens": 5827194.0, "step": 2680, "train_runtime": 4211.552, "train_tokens_per_second": 1383.622 }, { "entropy": 1.9244415283203125, "epoch": 0.537, "grad_norm": 74.0466537475586, "learning_rate": 9.29453780987299e-06, "loss": 1.6447, "mean_token_accuracy": 0.7021538615226746, "num_input_tokens_seen": 5837708, "num_tokens": 5837708.0, "step": 2685, "train_runtime": 4219.4287, "train_tokens_per_second": 1383.53 }, { "entropy": 1.7842383623123168, "epoch": 0.538, "grad_norm": 32.70279312133789, "learning_rate": 9.262234706414677e-06, "loss": 1.2491, "mean_token_accuracy": 0.770900285243988, "num_input_tokens_seen": 5846508, "num_tokens": 5846508.0, "step": 2690, "train_runtime": 4227.0022, "train_tokens_per_second": 1383.133 }, { "entropy": 1.9106183052062988, "epoch": 0.539, "grad_norm": 69.90939331054688, "learning_rate": 9.229939341765188e-06, "loss": 1.4267, "mean_token_accuracy": 0.7259824872016907, "num_input_tokens_seen": 5858420, "num_tokens": 5858420.0, "step": 2695, "train_runtime": 4234.9259, "train_tokens_per_second": 1383.358 }, { "entropy": 1.716473603248596, "epoch": 0.54, "grad_norm": 31.017974853515625, "learning_rate": 9.197652054687619e-06, "loss": 1.1801, "mean_token_accuracy": 0.765427029132843, "num_input_tokens_seen": 5866178, "num_tokens": 5866178.0, "step": 2700, "train_runtime": 4242.2508, "train_tokens_per_second": 1382.798 }, { "entropy": 2.2560569763183596, "epoch": 0.541, "grad_norm": 82.98948669433594, "learning_rate": 9.165373183860329e-06, "loss": 1.8941, "mean_token_accuracy": 0.6357517123222352, "num_input_tokens_seen": 5875556, "num_tokens": 5875556.0, "step": 2705, "train_runtime": 4250.0148, "train_tokens_per_second": 1382.479 }, { "entropy": 2.1599568367004394, "epoch": 0.542, "grad_norm": 30.47930145263672, "learning_rate": 9.133103067873403e-06, "loss": 1.3273, "mean_token_accuracy": 0.7458666324615478, "num_input_tokens_seen": 5884356, "num_tokens": 5884356.0, "step": 2710, "train_runtime": 4257.7037, "train_tokens_per_second": 1382.049 }, { "entropy": 1.7243542671203613, "epoch": 0.543, "grad_norm": 65.77870178222656, "learning_rate": 9.100842045225084e-06, "loss": 1.3464, "mean_token_accuracy": 0.7439818024635315, "num_input_tokens_seen": 5893965, "num_tokens": 5893965.0, "step": 2715, "train_runtime": 4265.2001, "train_tokens_per_second": 1381.873 }, { "entropy": 1.46552255153656, "epoch": 0.544, "grad_norm": 50.201194763183594, "learning_rate": 9.06859045431824e-06, "loss": 0.9974, "mean_token_accuracy": 0.7926037192344666, "num_input_tokens_seen": 5905494, "num_tokens": 5905494.0, "step": 2720, "train_runtime": 4273.0316, "train_tokens_per_second": 1382.038 }, { "entropy": 2.087226963043213, "epoch": 0.545, "grad_norm": 61.063533782958984, "learning_rate": 9.036348633456791e-06, "loss": 1.5877, "mean_token_accuracy": 0.7019842743873597, "num_input_tokens_seen": 5916273, "num_tokens": 5916273.0, "step": 2725, "train_runtime": 4280.4631, "train_tokens_per_second": 1382.157 }, { "entropy": 2.266949200630188, "epoch": 0.546, "grad_norm": 50.100914001464844, "learning_rate": 9.004116920842188e-06, "loss": 1.3577, "mean_token_accuracy": 0.7436231136322021, "num_input_tokens_seen": 5926558, "num_tokens": 5926558.0, "step": 2730, "train_runtime": 4288.2963, "train_tokens_per_second": 1382.031 }, { "entropy": 1.764879012107849, "epoch": 0.547, "grad_norm": 93.69525146484375, "learning_rate": 8.971895654569842e-06, "loss": 1.5734, "mean_token_accuracy": 0.7068006873130799, "num_input_tokens_seen": 5935177, "num_tokens": 5935177.0, "step": 2735, "train_runtime": 4295.8883, "train_tokens_per_second": 1381.595 }, { "entropy": 2.600821685791016, "epoch": 0.548, "grad_norm": 43.75839614868164, "learning_rate": 8.939685172625588e-06, "loss": 1.6465, "mean_token_accuracy": 0.7162549793720245, "num_input_tokens_seen": 5943586, "num_tokens": 5943586.0, "step": 2740, "train_runtime": 4303.5828, "train_tokens_per_second": 1381.079 }, { "entropy": 2.386103057861328, "epoch": 0.549, "grad_norm": 77.82188415527344, "learning_rate": 8.907485812882137e-06, "loss": 1.5718, "mean_token_accuracy": 0.7063305377960205, "num_input_tokens_seen": 5954589, "num_tokens": 5954589.0, "step": 2745, "train_runtime": 4311.1286, "train_tokens_per_second": 1381.213 }, { "entropy": 1.9989741325378418, "epoch": 0.55, "grad_norm": 31.21550750732422, "learning_rate": 8.875297913095544e-06, "loss": 1.1993, "mean_token_accuracy": 0.7652674317359924, "num_input_tokens_seen": 5965690, "num_tokens": 5965690.0, "step": 2750, "train_runtime": 4318.886, "train_tokens_per_second": 1381.303 }, { "entropy": 1.9003063917160035, "epoch": 0.551, "grad_norm": 86.46408081054688, "learning_rate": 8.843121810901643e-06, "loss": 1.4187, "mean_token_accuracy": 0.725262188911438, "num_input_tokens_seen": 5975468, "num_tokens": 5975468.0, "step": 2755, "train_runtime": 4326.3772, "train_tokens_per_second": 1381.171 }, { "entropy": 1.8405513286590576, "epoch": 0.552, "grad_norm": 30.006698608398438, "learning_rate": 8.81095784381252e-06, "loss": 1.112, "mean_token_accuracy": 0.7773529648780823, "num_input_tokens_seen": 5983250, "num_tokens": 5983250.0, "step": 2760, "train_runtime": 4334.0118, "train_tokens_per_second": 1380.534 }, { "entropy": 2.0038307666778565, "epoch": 0.553, "grad_norm": 76.08162689208984, "learning_rate": 8.778806349212968e-06, "loss": 1.5726, "mean_token_accuracy": 0.692966103553772, "num_input_tokens_seen": 5993412, "num_tokens": 5993412.0, "step": 2765, "train_runtime": 4341.9027, "train_tokens_per_second": 1380.365 }, { "entropy": 2.259874939918518, "epoch": 0.554, "grad_norm": 35.251853942871094, "learning_rate": 8.746667664356957e-06, "loss": 1.1227, "mean_token_accuracy": 0.7770853400230407, "num_input_tokens_seen": 6002824, "num_tokens": 6002824.0, "step": 2770, "train_runtime": 4349.7881, "train_tokens_per_second": 1380.027 }, { "entropy": 1.9361029148101807, "epoch": 0.555, "grad_norm": 62.7562255859375, "learning_rate": 8.71454212636408e-06, "loss": 1.392, "mean_token_accuracy": 0.7192505836486817, "num_input_tokens_seen": 6012310, "num_tokens": 6012310.0, "step": 2775, "train_runtime": 4357.3737, "train_tokens_per_second": 1379.801 }, { "entropy": 1.6335277080535888, "epoch": 0.556, "grad_norm": 29.018085479736328, "learning_rate": 8.682430072216029e-06, "loss": 1.1443, "mean_token_accuracy": 0.7799576759338379, "num_input_tokens_seen": 6023556, "num_tokens": 6023556.0, "step": 2780, "train_runtime": 4365.2555, "train_tokens_per_second": 1379.886 }, { "entropy": 2.246228647232056, "epoch": 0.557, "grad_norm": 119.56858825683594, "learning_rate": 8.650331838753057e-06, "loss": 1.8123, "mean_token_accuracy": 0.6696184277534485, "num_input_tokens_seen": 6034619, "num_tokens": 6034619.0, "step": 2785, "train_runtime": 4372.8415, "train_tokens_per_second": 1380.022 }, { "entropy": 1.7769938468933106, "epoch": 0.558, "grad_norm": 31.697690963745117, "learning_rate": 8.618247762670445e-06, "loss": 1.2618, "mean_token_accuracy": 0.7423007607460022, "num_input_tokens_seen": 6044358, "num_tokens": 6044358.0, "step": 2790, "train_runtime": 4380.4773, "train_tokens_per_second": 1379.84 }, { "entropy": 2.058828830718994, "epoch": 0.559, "grad_norm": 73.97871398925781, "learning_rate": 8.586178180514968e-06, "loss": 1.5535, "mean_token_accuracy": 0.7015879869461059, "num_input_tokens_seen": 6054829, "num_tokens": 6054829.0, "step": 2795, "train_runtime": 4388.2932, "train_tokens_per_second": 1379.769 }, { "entropy": 1.7645136356353759, "epoch": 0.56, "grad_norm": 36.44921112060547, "learning_rate": 8.554123428681367e-06, "loss": 1.1753, "mean_token_accuracy": 0.7863941192626953, "num_input_tokens_seen": 6065790, "num_tokens": 6065790.0, "step": 2800, "train_runtime": 4396.211, "train_tokens_per_second": 1379.777 }, { "entropy": 1.85941801071167, "epoch": 0.561, "grad_norm": 81.10871887207031, "learning_rate": 8.522083843408823e-06, "loss": 1.3504, "mean_token_accuracy": 0.7299327731132508, "num_input_tokens_seen": 6074558, "num_tokens": 6074558.0, "step": 2805, "train_runtime": 4403.6728, "train_tokens_per_second": 1379.43 }, { "entropy": 1.7038959503173827, "epoch": 0.562, "grad_norm": 37.14789581298828, "learning_rate": 8.490059760777425e-06, "loss": 1.1063, "mean_token_accuracy": 0.7901615619659423, "num_input_tokens_seen": 6083838, "num_tokens": 6083838.0, "step": 2810, "train_runtime": 4411.4517, "train_tokens_per_second": 1379.101 }, { "entropy": 2.4742823123931883, "epoch": 0.563, "grad_norm": 73.43671417236328, "learning_rate": 8.458051516704644e-06, "loss": 1.6155, "mean_token_accuracy": 0.7062512159347534, "num_input_tokens_seen": 6094602, "num_tokens": 6094602.0, "step": 2815, "train_runtime": 4419.1438, "train_tokens_per_second": 1379.136 }, { "entropy": 1.5709030628204346, "epoch": 0.564, "grad_norm": 26.93817138671875, "learning_rate": 8.426059446941817e-06, "loss": 1.0982, "mean_token_accuracy": 0.7795431733131408, "num_input_tokens_seen": 6103088, "num_tokens": 6103088.0, "step": 2820, "train_runtime": 4426.7624, "train_tokens_per_second": 1378.68 }, { "entropy": 1.8601479053497314, "epoch": 0.565, "grad_norm": 65.44580841064453, "learning_rate": 8.394083887070614e-06, "loss": 1.6425, "mean_token_accuracy": 0.6932317852973938, "num_input_tokens_seen": 6115635, "num_tokens": 6115635.0, "step": 2825, "train_runtime": 4434.7311, "train_tokens_per_second": 1379.032 }, { "entropy": 1.8174354076385497, "epoch": 0.566, "grad_norm": 44.99392318725586, "learning_rate": 8.36212517249953e-06, "loss": 1.3217, "mean_token_accuracy": 0.7385932087898255, "num_input_tokens_seen": 6127872, "num_tokens": 6127872.0, "step": 2830, "train_runtime": 4442.5777, "train_tokens_per_second": 1379.351 }, { "entropy": 1.9558209657669068, "epoch": 0.567, "grad_norm": 70.02542114257812, "learning_rate": 8.330183638460356e-06, "loss": 1.5303, "mean_token_accuracy": 0.7230139136314392, "num_input_tokens_seen": 6138135, "num_tokens": 6138135.0, "step": 2835, "train_runtime": 4450.1172, "train_tokens_per_second": 1379.32 }, { "entropy": 2.096907138824463, "epoch": 0.568, "grad_norm": 33.959896087646484, "learning_rate": 8.29825962000467e-06, "loss": 1.3695, "mean_token_accuracy": 0.7333014845848084, "num_input_tokens_seen": 6146880, "num_tokens": 6146880.0, "step": 2840, "train_runtime": 4457.948, "train_tokens_per_second": 1378.859 }, { "entropy": 1.8049304485321045, "epoch": 0.569, "grad_norm": 91.68631744384766, "learning_rate": 8.266353452000326e-06, "loss": 1.664, "mean_token_accuracy": 0.6913930177688599, "num_input_tokens_seen": 6158215, "num_tokens": 6158215.0, "step": 2845, "train_runtime": 4465.4016, "train_tokens_per_second": 1379.095 }, { "entropy": 1.8402242183685302, "epoch": 0.57, "grad_norm": 34.46211242675781, "learning_rate": 8.234465469127919e-06, "loss": 1.0414, "mean_token_accuracy": 0.7853426814079285, "num_input_tokens_seen": 6166704, "num_tokens": 6166704.0, "step": 2850, "train_runtime": 4473.1107, "train_tokens_per_second": 1378.616 }, { "entropy": 2.029818558692932, "epoch": 0.571, "grad_norm": 65.92750549316406, "learning_rate": 8.202596005877307e-06, "loss": 1.4505, "mean_token_accuracy": 0.7184691429138184, "num_input_tokens_seen": 6175558, "num_tokens": 6175558.0, "step": 2855, "train_runtime": 4480.9048, "train_tokens_per_second": 1378.194 }, { "entropy": 1.8893269300460815, "epoch": 0.572, "grad_norm": 30.616565704345703, "learning_rate": 8.170745396544072e-06, "loss": 1.199, "mean_token_accuracy": 0.7608461022377014, "num_input_tokens_seen": 6185280, "num_tokens": 6185280.0, "step": 2860, "train_runtime": 4488.3656, "train_tokens_per_second": 1378.07 }, { "entropy": 2.468494200706482, "epoch": 0.573, "grad_norm": 84.10701751708984, "learning_rate": 8.138913975226044e-06, "loss": 1.6157, "mean_token_accuracy": 0.7092116236686706, "num_input_tokens_seen": 6195491, "num_tokens": 6195491.0, "step": 2865, "train_runtime": 4496.297, "train_tokens_per_second": 1377.91 }, { "entropy": 2.2942047357559203, "epoch": 0.574, "grad_norm": 51.563377380371094, "learning_rate": 8.10710207581976e-06, "loss": 1.2005, "mean_token_accuracy": 0.7683440327644349, "num_input_tokens_seen": 6205270, "num_tokens": 6205270.0, "step": 2870, "train_runtime": 4503.8085, "train_tokens_per_second": 1377.783 }, { "entropy": 2.2453629970550537, "epoch": 0.575, "grad_norm": 72.859130859375, "learning_rate": 8.075310032017e-06, "loss": 1.5614, "mean_token_accuracy": 0.6988577008247375, "num_input_tokens_seen": 6217327, "num_tokens": 6217327.0, "step": 2875, "train_runtime": 4511.7621, "train_tokens_per_second": 1378.026 }, { "entropy": 1.6037590503692627, "epoch": 0.576, "grad_norm": 40.621498107910156, "learning_rate": 8.043538177301256e-06, "loss": 1.121, "mean_token_accuracy": 0.7745033502578735, "num_input_tokens_seen": 6225126, "num_tokens": 6225126.0, "step": 2880, "train_runtime": 4519.3158, "train_tokens_per_second": 1377.449 }, { "entropy": 2.5154204845428465, "epoch": 0.577, "grad_norm": 74.6661148071289, "learning_rate": 8.01178684494425e-06, "loss": 1.6358, "mean_token_accuracy": 0.6975546002388, "num_input_tokens_seen": 6234486, "num_tokens": 6234486.0, "step": 2885, "train_runtime": 4527.0307, "train_tokens_per_second": 1377.169 }, { "entropy": 1.9623025417327882, "epoch": 0.578, "grad_norm": 35.30690002441406, "learning_rate": 7.980056368002435e-06, "loss": 1.1077, "mean_token_accuracy": 0.7761102795600892, "num_input_tokens_seen": 6243548, "num_tokens": 6243548.0, "step": 2890, "train_runtime": 4534.4505, "train_tokens_per_second": 1376.914 }, { "entropy": 2.3874255657196044, "epoch": 0.579, "grad_norm": 92.51774597167969, "learning_rate": 7.948347079313494e-06, "loss": 1.5247, "mean_token_accuracy": 0.715183675289154, "num_input_tokens_seen": 6253248, "num_tokens": 6253248.0, "step": 2895, "train_runtime": 4542.1831, "train_tokens_per_second": 1376.705 }, { "entropy": 2.063613224029541, "epoch": 0.58, "grad_norm": 34.93312072753906, "learning_rate": 7.916659311492871e-06, "loss": 1.1213, "mean_token_accuracy": 0.7736389756202697, "num_input_tokens_seen": 6262170, "num_tokens": 6262170.0, "step": 2900, "train_runtime": 4549.6054, "train_tokens_per_second": 1376.42 }, { "entropy": 2.016595196723938, "epoch": 0.581, "grad_norm": 79.38453674316406, "learning_rate": 7.88499339693025e-06, "loss": 1.563, "mean_token_accuracy": 0.7132612586021423, "num_input_tokens_seen": 6273045, "num_tokens": 6273045.0, "step": 2905, "train_runtime": 4557.4789, "train_tokens_per_second": 1376.429 }, { "entropy": 2.387331461906433, "epoch": 0.582, "grad_norm": 35.66706466674805, "learning_rate": 7.85334966778609e-06, "loss": 1.4036, "mean_token_accuracy": 0.7252639532089233, "num_input_tokens_seen": 6283968, "num_tokens": 6283968.0, "step": 2910, "train_runtime": 4565.3022, "train_tokens_per_second": 1376.463 }, { "entropy": 2.1085668087005613, "epoch": 0.583, "grad_norm": 79.5289535522461, "learning_rate": 7.82172845598814e-06, "loss": 1.6256, "mean_token_accuracy": 0.702094566822052, "num_input_tokens_seen": 6296282, "num_tokens": 6296282.0, "step": 2915, "train_runtime": 4573.1473, "train_tokens_per_second": 1376.794 }, { "entropy": 1.8812416553497315, "epoch": 0.584, "grad_norm": 28.13924789428711, "learning_rate": 7.790130093227943e-06, "loss": 1.1225, "mean_token_accuracy": 0.7655258417129517, "num_input_tokens_seen": 6304416, "num_tokens": 6304416.0, "step": 2920, "train_runtime": 4580.4796, "train_tokens_per_second": 1376.366 }, { "entropy": 1.8508034229278565, "epoch": 0.585, "grad_norm": 86.5930404663086, "learning_rate": 7.758554910957378e-06, "loss": 1.4178, "mean_token_accuracy": 0.7168241143226624, "num_input_tokens_seen": 6315155, "num_tokens": 6315155.0, "step": 2925, "train_runtime": 4588.3703, "train_tokens_per_second": 1376.339 }, { "entropy": 2.0213495254516602, "epoch": 0.586, "grad_norm": 33.586177825927734, "learning_rate": 7.727003240385163e-06, "loss": 1.1274, "mean_token_accuracy": 0.7738096475601196, "num_input_tokens_seen": 6323716, "num_tokens": 6323716.0, "step": 2930, "train_runtime": 4595.9129, "train_tokens_per_second": 1375.943 }, { "entropy": 2.2140179872512817, "epoch": 0.587, "grad_norm": 84.8775405883789, "learning_rate": 7.695475412473393e-06, "loss": 1.4759, "mean_token_accuracy": 0.7161606907844543, "num_input_tokens_seen": 6333050, "num_tokens": 6333050.0, "step": 2935, "train_runtime": 4603.5938, "train_tokens_per_second": 1375.675 }, { "entropy": 2.017851972579956, "epoch": 0.588, "grad_norm": 58.215579986572266, "learning_rate": 7.663971757934064e-06, "loss": 1.1693, "mean_token_accuracy": 0.7630817890167236, "num_input_tokens_seen": 6343722, "num_tokens": 6343722.0, "step": 2940, "train_runtime": 4611.4061, "train_tokens_per_second": 1375.659 }, { "entropy": 1.9276499032974244, "epoch": 0.589, "grad_norm": 64.24576568603516, "learning_rate": 7.632492607225604e-06, "loss": 1.3314, "mean_token_accuracy": 0.7452657461166382, "num_input_tokens_seen": 6353857, "num_tokens": 6353857.0, "step": 2945, "train_runtime": 4619.0029, "train_tokens_per_second": 1375.591 }, { "entropy": 1.7176877021789552, "epoch": 0.59, "grad_norm": 26.575664520263672, "learning_rate": 7.60103829054941e-06, "loss": 0.9894, "mean_token_accuracy": 0.7886354565620423, "num_input_tokens_seen": 6364032, "num_tokens": 6364032.0, "step": 2950, "train_runtime": 4626.7686, "train_tokens_per_second": 1375.481 }, { "entropy": 1.637313437461853, "epoch": 0.591, "grad_norm": 57.495399475097656, "learning_rate": 7.569609137846376e-06, "loss": 1.2626, "mean_token_accuracy": 0.753804087638855, "num_input_tokens_seen": 6375143, "num_tokens": 6375143.0, "step": 2955, "train_runtime": 4634.2386, "train_tokens_per_second": 1375.661 }, { "entropy": 1.5547414541244506, "epoch": 0.592, "grad_norm": 27.789993286132812, "learning_rate": 7.538205478793448e-06, "loss": 0.8742, "mean_token_accuracy": 0.8150353789329529, "num_input_tokens_seen": 6385914, "num_tokens": 6385914.0, "step": 2960, "train_runtime": 4642.1101, "train_tokens_per_second": 1375.649 }, { "entropy": 1.9389745235443114, "epoch": 0.593, "grad_norm": 73.13197326660156, "learning_rate": 7.506827642800146e-06, "loss": 1.3613, "mean_token_accuracy": 0.7316680073738098, "num_input_tokens_seen": 6396419, "num_tokens": 6396419.0, "step": 2965, "train_runtime": 4649.9029, "train_tokens_per_second": 1375.603 }, { "entropy": 1.5035089254379272, "epoch": 0.594, "grad_norm": 29.98301124572754, "learning_rate": 7.475475959005123e-06, "loss": 1.0425, "mean_token_accuracy": 0.7806936025619506, "num_input_tokens_seen": 6405276, "num_tokens": 6405276.0, "step": 2970, "train_runtime": 4657.6328, "train_tokens_per_second": 1375.221 }, { "entropy": 2.181168484687805, "epoch": 0.595, "grad_norm": 87.54061889648438, "learning_rate": 7.444150756272704e-06, "loss": 1.6772, "mean_token_accuracy": 0.681380033493042, "num_input_tokens_seen": 6416098, "num_tokens": 6416098.0, "step": 2975, "train_runtime": 4665.2339, "train_tokens_per_second": 1375.3 }, { "entropy": 2.243989896774292, "epoch": 0.596, "grad_norm": 46.63868713378906, "learning_rate": 7.4128523631894464e-06, "loss": 1.4274, "mean_token_accuracy": 0.7314647316932679, "num_input_tokens_seen": 6427780, "num_tokens": 6427780.0, "step": 2980, "train_runtime": 4673.0717, "train_tokens_per_second": 1375.494 }, { "entropy": 1.9558125734329224, "epoch": 0.597, "grad_norm": 85.36492919921875, "learning_rate": 7.38158110806068e-06, "loss": 1.6963, "mean_token_accuracy": 0.6951684474945068, "num_input_tokens_seen": 6438226, "num_tokens": 6438226.0, "step": 2985, "train_runtime": 4680.5518, "train_tokens_per_second": 1375.527 }, { "entropy": 1.9123180866241456, "epoch": 0.598, "grad_norm": 29.412214279174805, "learning_rate": 7.350337318907075e-06, "loss": 1.2004, "mean_token_accuracy": 0.7604243874549865, "num_input_tokens_seen": 6447506, "num_tokens": 6447506.0, "step": 2990, "train_runtime": 4688.2943, "train_tokens_per_second": 1375.235 }, { "entropy": 1.8343429565429688, "epoch": 0.599, "grad_norm": 76.14191436767578, "learning_rate": 7.319121323461198e-06, "loss": 1.5907, "mean_token_accuracy": 0.691347074508667, "num_input_tokens_seen": 6457489, "num_tokens": 6457489.0, "step": 2995, "train_runtime": 4696.1437, "train_tokens_per_second": 1375.062 }, { "entropy": 1.910910964012146, "epoch": 0.6, "grad_norm": 41.34055709838867, "learning_rate": 7.287933449164068e-06, "loss": 1.3444, "mean_token_accuracy": 0.7558755874633789, "num_input_tokens_seen": 6467976, "num_tokens": 6467976.0, "step": 3000, "train_runtime": 4703.8679, "train_tokens_per_second": 1375.034 }, { "entropy": 2.0232861757278444, "epoch": 0.601, "grad_norm": 75.27515411376953, "learning_rate": 7.256774023161728e-06, "loss": 1.6115, "mean_token_accuracy": 0.6844237089157105, "num_input_tokens_seen": 6476513, "num_tokens": 6476513.0, "step": 3005, "train_runtime": 4711.3227, "train_tokens_per_second": 1374.67 }, { "entropy": 1.7025319576263427, "epoch": 0.602, "grad_norm": 47.8427848815918, "learning_rate": 7.225643372301812e-06, "loss": 1.4691, "mean_token_accuracy": 0.7146709322929382, "num_input_tokens_seen": 6486914, "num_tokens": 6486914.0, "step": 3010, "train_runtime": 4719.0853, "train_tokens_per_second": 1374.613 }, { "entropy": 1.7035044431686401, "epoch": 0.603, "grad_norm": 73.79476165771484, "learning_rate": 7.194541823130119e-06, "loss": 1.3625, "mean_token_accuracy": 0.7199865102767944, "num_input_tokens_seen": 6494653, "num_tokens": 6494653.0, "step": 3015, "train_runtime": 4726.4593, "train_tokens_per_second": 1374.105 }, { "entropy": 1.6917545080184937, "epoch": 0.604, "grad_norm": 41.40031814575195, "learning_rate": 7.163469701887182e-06, "loss": 1.1607, "mean_token_accuracy": 0.7640915036201477, "num_input_tokens_seen": 6503048, "num_tokens": 6503048.0, "step": 3020, "train_runtime": 4734.1336, "train_tokens_per_second": 1373.651 }, { "entropy": 2.3271164655685426, "epoch": 0.605, "grad_norm": 72.9666976928711, "learning_rate": 7.132427334504846e-06, "loss": 1.7467, "mean_token_accuracy": 0.6847102999687195, "num_input_tokens_seen": 6514903, "num_tokens": 6514903.0, "step": 3025, "train_runtime": 4742.0429, "train_tokens_per_second": 1373.86 }, { "entropy": 1.9827677011489868, "epoch": 0.606, "grad_norm": 50.74014663696289, "learning_rate": 7.1014150466028605e-06, "loss": 1.4593, "mean_token_accuracy": 0.727696418762207, "num_input_tokens_seen": 6524700, "num_tokens": 6524700.0, "step": 3030, "train_runtime": 4749.5713, "train_tokens_per_second": 1373.745 }, { "entropy": 1.8603835105895996, "epoch": 0.607, "grad_norm": 84.66570281982422, "learning_rate": 7.070433163485451e-06, "loss": 1.2455, "mean_token_accuracy": 0.7381930947303772, "num_input_tokens_seen": 6535026, "num_tokens": 6535026.0, "step": 3035, "train_runtime": 4757.3663, "train_tokens_per_second": 1373.665 }, { "entropy": 1.9502806425094605, "epoch": 0.608, "grad_norm": 28.435522079467773, "learning_rate": 7.039482010137908e-06, "loss": 1.2243, "mean_token_accuracy": 0.7631527662277222, "num_input_tokens_seen": 6543874, "num_tokens": 6543874.0, "step": 3040, "train_runtime": 4764.8319, "train_tokens_per_second": 1373.369 }, { "entropy": 1.7615466833114624, "epoch": 0.609, "grad_norm": 93.51825714111328, "learning_rate": 7.008561911223186e-06, "loss": 1.5535, "mean_token_accuracy": 0.6985381722450257, "num_input_tokens_seen": 6551590, "num_tokens": 6551590.0, "step": 3045, "train_runtime": 4772.4328, "train_tokens_per_second": 1372.799 }, { "entropy": 1.7815019369125367, "epoch": 0.61, "grad_norm": 47.27823257446289, "learning_rate": 6.977673191078487e-06, "loss": 1.1378, "mean_token_accuracy": 0.7866188645362854, "num_input_tokens_seen": 6561214, "num_tokens": 6561214.0, "step": 3050, "train_runtime": 4780.1739, "train_tokens_per_second": 1372.589 }, { "entropy": 1.7666277170181275, "epoch": 0.611, "grad_norm": 87.87570190429688, "learning_rate": 6.946816173711878e-06, "loss": 1.4909, "mean_token_accuracy": 0.7057694077491761, "num_input_tokens_seen": 6571668, "num_tokens": 6571668.0, "step": 3055, "train_runtime": 4788.0953, "train_tokens_per_second": 1372.502 }, { "entropy": 1.9839014768600465, "epoch": 0.612, "grad_norm": 76.87232208251953, "learning_rate": 6.915991182798865e-06, "loss": 1.5435, "mean_token_accuracy": 0.7039350748062134, "num_input_tokens_seen": 6580824, "num_tokens": 6580824.0, "step": 3060, "train_runtime": 4795.68, "train_tokens_per_second": 1372.24 }, { "entropy": 1.81967933177948, "epoch": 0.613, "grad_norm": 75.82462310791016, "learning_rate": 6.885198541679016e-06, "loss": 1.3786, "mean_token_accuracy": 0.729226815700531, "num_input_tokens_seen": 6588072, "num_tokens": 6588072.0, "step": 3065, "train_runtime": 4803.3248, "train_tokens_per_second": 1371.565 }, { "entropy": 1.9619008779525757, "epoch": 0.614, "grad_norm": 39.52919006347656, "learning_rate": 6.8544385733525665e-06, "loss": 1.261, "mean_token_accuracy": 0.7550681471824646, "num_input_tokens_seen": 6597602, "num_tokens": 6597602.0, "step": 3070, "train_runtime": 4810.8559, "train_tokens_per_second": 1371.399 }, { "entropy": 1.8245057344436646, "epoch": 0.615, "grad_norm": 79.33995056152344, "learning_rate": 6.823711600477025e-06, "loss": 1.4973, "mean_token_accuracy": 0.6888669848442077, "num_input_tokens_seen": 6606503, "num_tokens": 6606503.0, "step": 3075, "train_runtime": 4818.6638, "train_tokens_per_second": 1371.024 }, { "entropy": 1.6480743169784546, "epoch": 0.616, "grad_norm": 54.36875915527344, "learning_rate": 6.793017945363804e-06, "loss": 1.2456, "mean_token_accuracy": 0.7698419451713562, "num_input_tokens_seen": 6613626, "num_tokens": 6613626.0, "step": 3080, "train_runtime": 4826.2716, "train_tokens_per_second": 1370.339 }, { "entropy": 1.5632912397384644, "epoch": 0.617, "grad_norm": 89.47774505615234, "learning_rate": 6.76235792997482e-06, "loss": 1.3031, "mean_token_accuracy": 0.7296841621398926, "num_input_tokens_seen": 6622384, "num_tokens": 6622384.0, "step": 3085, "train_runtime": 4833.993, "train_tokens_per_second": 1369.961 }, { "entropy": 1.7595004796981812, "epoch": 0.618, "grad_norm": 49.35243606567383, "learning_rate": 6.731731875919123e-06, "loss": 1.3023, "mean_token_accuracy": 0.7453101515769959, "num_input_tokens_seen": 6630412, "num_tokens": 6630412.0, "step": 3090, "train_runtime": 4841.4548, "train_tokens_per_second": 1369.508 }, { "entropy": 1.607977819442749, "epoch": 0.619, "grad_norm": 70.41512298583984, "learning_rate": 6.7011401044495304e-06, "loss": 1.3086, "mean_token_accuracy": 0.7488374233245849, "num_input_tokens_seen": 6639187, "num_tokens": 6639187.0, "step": 3095, "train_runtime": 4849.2937, "train_tokens_per_second": 1369.104 }, { "entropy": 1.7185869693756104, "epoch": 0.62, "grad_norm": 32.218013763427734, "learning_rate": 6.670582936459249e-06, "loss": 1.0745, "mean_token_accuracy": 0.7838991165161133, "num_input_tokens_seen": 6649798, "num_tokens": 6649798.0, "step": 3100, "train_runtime": 4856.9724, "train_tokens_per_second": 1369.124 }, { "entropy": 2.1471627950668335, "epoch": 0.621, "grad_norm": 130.4442138671875, "learning_rate": 6.6400606924785095e-06, "loss": 1.6119, "mean_token_accuracy": 0.6989300608634949, "num_input_tokens_seen": 6660374, "num_tokens": 6660374.0, "step": 3105, "train_runtime": 4864.9383, "train_tokens_per_second": 1369.056 }, { "entropy": 1.6597116947174073, "epoch": 0.622, "grad_norm": 23.28336524963379, "learning_rate": 6.609573692671209e-06, "loss": 1.1585, "mean_token_accuracy": 0.767267382144928, "num_input_tokens_seen": 6668634, "num_tokens": 6668634.0, "step": 3110, "train_runtime": 4872.5347, "train_tokens_per_second": 1368.617 }, { "entropy": 2.0447744607925413, "epoch": 0.623, "grad_norm": 88.88422393798828, "learning_rate": 6.579122256831551e-06, "loss": 1.6156, "mean_token_accuracy": 0.6907529354095459, "num_input_tokens_seen": 6679105, "num_tokens": 6679105.0, "step": 3115, "train_runtime": 4880.3661, "train_tokens_per_second": 1368.566 }, { "entropy": 1.5721227169036864, "epoch": 0.624, "grad_norm": 39.92247009277344, "learning_rate": 6.54870670438069e-06, "loss": 1.2505, "mean_token_accuracy": 0.7450488924980163, "num_input_tokens_seen": 6689260, "num_tokens": 6689260.0, "step": 3120, "train_runtime": 4888.0443, "train_tokens_per_second": 1368.494 }, { "entropy": 2.0488489151000975, "epoch": 0.625, "grad_norm": 90.2983627319336, "learning_rate": 6.518327354363374e-06, "loss": 1.5246, "mean_token_accuracy": 0.7033559679985046, "num_input_tokens_seen": 6698663, "num_tokens": 6698663.0, "step": 3125, "train_runtime": 4895.7226, "train_tokens_per_second": 1368.269 }, { "entropy": 1.6353962659835815, "epoch": 0.626, "grad_norm": 35.76404571533203, "learning_rate": 6.487984525444613e-06, "loss": 1.1853, "mean_token_accuracy": 0.7543941974639893, "num_input_tokens_seen": 6708928, "num_tokens": 6708928.0, "step": 3130, "train_runtime": 4903.5984, "train_tokens_per_second": 1368.164 }, { "entropy": 1.3809735774993896, "epoch": 0.627, "grad_norm": 72.60491180419922, "learning_rate": 6.4576785359063225e-06, "loss": 1.2112, "mean_token_accuracy": 0.7561848282814025, "num_input_tokens_seen": 6723182, "num_tokens": 6723182.0, "step": 3135, "train_runtime": 4911.5247, "train_tokens_per_second": 1368.858 }, { "entropy": 1.4279350757598877, "epoch": 0.628, "grad_norm": 28.420202255249023, "learning_rate": 6.42740970364399e-06, "loss": 0.9725, "mean_token_accuracy": 0.7984194397926331, "num_input_tokens_seen": 6736588, "num_tokens": 6736588.0, "step": 3140, "train_runtime": 4919.3014, "train_tokens_per_second": 1369.42 }, { "entropy": 2.5526672124862673, "epoch": 0.629, "grad_norm": 68.55531311035156, "learning_rate": 6.397178346163348e-06, "loss": 2.2873, "mean_token_accuracy": 0.6230687201023102, "num_input_tokens_seen": 6750925, "num_tokens": 6750925.0, "step": 3145, "train_runtime": 4927.403, "train_tokens_per_second": 1370.078 }, { "entropy": 1.4622450590133667, "epoch": 0.63, "grad_norm": 35.11809539794922, "learning_rate": 6.36698478057703e-06, "loss": 1.0287, "mean_token_accuracy": 0.7839748978614807, "num_input_tokens_seen": 6763160, "num_tokens": 6763160.0, "step": 3150, "train_runtime": 4935.1019, "train_tokens_per_second": 1370.42 }, { "entropy": 1.5190858602523805, "epoch": 0.631, "grad_norm": 66.58702087402344, "learning_rate": 6.33682932360125e-06, "loss": 1.291, "mean_token_accuracy": 0.742296326160431, "num_input_tokens_seen": 6776201, "num_tokens": 6776201.0, "step": 3155, "train_runtime": 4943.0744, "train_tokens_per_second": 1370.847 }, { "entropy": 1.339308500289917, "epoch": 0.632, "grad_norm": 44.97834777832031, "learning_rate": 6.306712291552484e-06, "loss": 1.1489, "mean_token_accuracy": 0.7667497634887696, "num_input_tokens_seen": 6789294, "num_tokens": 6789294.0, "step": 3160, "train_runtime": 4950.805, "train_tokens_per_second": 1371.352 }, { "entropy": 1.4301114559173584, "epoch": 0.633, "grad_norm": 66.54664611816406, "learning_rate": 6.276634000344144e-06, "loss": 1.2751, "mean_token_accuracy": 0.7389472603797913, "num_input_tokens_seen": 6801347, "num_tokens": 6801347.0, "step": 3165, "train_runtime": 4958.6675, "train_tokens_per_second": 1371.608 }, { "entropy": 1.0977412819862367, "epoch": 0.634, "grad_norm": 41.012264251708984, "learning_rate": 6.246594765483274e-06, "loss": 1.2931, "mean_token_accuracy": 0.7387320280075074, "num_input_tokens_seen": 6815264, "num_tokens": 6815264.0, "step": 3170, "train_runtime": 4966.7048, "train_tokens_per_second": 1372.19 }, { "entropy": 1.049205720424652, "epoch": 0.635, "grad_norm": 80.68806457519531, "learning_rate": 6.216594902067233e-06, "loss": 1.2644, "mean_token_accuracy": 0.7445415735244751, "num_input_tokens_seen": 6828095, "num_tokens": 6828095.0, "step": 3175, "train_runtime": 4974.7497, "train_tokens_per_second": 1372.55 }, { "entropy": 0.8870461344718933, "epoch": 0.636, "grad_norm": 26.305316925048828, "learning_rate": 6.186634724780394e-06, "loss": 0.8648, "mean_token_accuracy": 0.8064908623695374, "num_input_tokens_seen": 6841602, "num_tokens": 6841602.0, "step": 3180, "train_runtime": 4982.4941, "train_tokens_per_second": 1373.128 }, { "entropy": 0.9507928371429444, "epoch": 0.637, "grad_norm": 97.02124786376953, "learning_rate": 6.156714547890838e-06, "loss": 1.5528, "mean_token_accuracy": 0.7029990792274475, "num_input_tokens_seen": 6853668, "num_tokens": 6853668.0, "step": 3185, "train_runtime": 4990.4879, "train_tokens_per_second": 1373.346 }, { "entropy": 2.3370249271392822, "epoch": 0.638, "grad_norm": 55.54469680786133, "learning_rate": 6.126834685247065e-06, "loss": 1.7633, "mean_token_accuracy": 0.6800727844238281, "num_input_tokens_seen": 6864100, "num_tokens": 6864100.0, "step": 3190, "train_runtime": 4998.0891, "train_tokens_per_second": 1373.345 }, { "entropy": 1.3342639207839966, "epoch": 0.639, "grad_norm": 66.57951354980469, "learning_rate": 6.0969954502746916e-06, "loss": 1.1474, "mean_token_accuracy": 0.7643045902252197, "num_input_tokens_seen": 6875804, "num_tokens": 6875804.0, "step": 3195, "train_runtime": 5006.0428, "train_tokens_per_second": 1373.501 }, { "entropy": 0.9342282295227051, "epoch": 0.64, "grad_norm": 53.53071212768555, "learning_rate": 6.067197155973172e-06, "loss": 1.1898, "mean_token_accuracy": 0.7680463314056396, "num_input_tokens_seen": 6888610, "num_tokens": 6888610.0, "step": 3200, "train_runtime": 5013.9735, "train_tokens_per_second": 1373.882 }, { "entropy": 1.0996434092521667, "epoch": 0.641, "grad_norm": 78.77591705322266, "learning_rate": 6.037440114912521e-06, "loss": 1.6037, "mean_token_accuracy": 0.7061638474464417, "num_input_tokens_seen": 6899769, "num_tokens": 6899769.0, "step": 3205, "train_runtime": 5021.9715, "train_tokens_per_second": 1373.916 }, { "entropy": 1.3301355123519898, "epoch": 0.642, "grad_norm": 53.36579132080078, "learning_rate": 6.00772463923001e-06, "loss": 1.3678, "mean_token_accuracy": 0.7206925034523011, "num_input_tokens_seen": 6909282, "num_tokens": 6909282.0, "step": 3210, "train_runtime": 5029.5947, "train_tokens_per_second": 1373.725 }, { "entropy": 1.004092037677765, "epoch": 0.643, "grad_norm": 99.81475067138672, "learning_rate": 5.9780510406269245e-06, "loss": 1.5373, "mean_token_accuracy": 0.7163637161254883, "num_input_tokens_seen": 6922076, "num_tokens": 6922076.0, "step": 3215, "train_runtime": 5037.6854, "train_tokens_per_second": 1374.059 }, { "entropy": 1.172284734249115, "epoch": 0.644, "grad_norm": 57.861846923828125, "learning_rate": 5.948419630365269e-06, "loss": 1.391, "mean_token_accuracy": 0.7324574947357178, "num_input_tokens_seen": 6934352, "num_tokens": 6934352.0, "step": 3220, "train_runtime": 5045.2362, "train_tokens_per_second": 1374.436 }, { "entropy": 1.0466743230819702, "epoch": 0.645, "grad_norm": 81.56342315673828, "learning_rate": 5.918830719264514e-06, "loss": 1.3136, "mean_token_accuracy": 0.7329088568687439, "num_input_tokens_seen": 6947512, "num_tokens": 6947512.0, "step": 3225, "train_runtime": 5053.2875, "train_tokens_per_second": 1374.85 }, { "entropy": 1.0476303219795227, "epoch": 0.646, "grad_norm": 36.323978424072266, "learning_rate": 5.889284617698339e-06, "loss": 1.0248, "mean_token_accuracy": 0.7943867802619934, "num_input_tokens_seen": 6961646, "num_tokens": 6961646.0, "step": 3230, "train_runtime": 5061.3335, "train_tokens_per_second": 1375.457 }, { "entropy": 1.2763081073760987, "epoch": 0.647, "grad_norm": 67.03099060058594, "learning_rate": 5.8597816355913685e-06, "loss": 1.3998, "mean_token_accuracy": 0.7143377780914306, "num_input_tokens_seen": 6972950, "num_tokens": 6972950.0, "step": 3235, "train_runtime": 5068.9484, "train_tokens_per_second": 1375.621 }, { "entropy": 1.1098780035972595, "epoch": 0.648, "grad_norm": 40.54975128173828, "learning_rate": 5.830322082415922e-06, "loss": 1.358, "mean_token_accuracy": 0.7146369457244873, "num_input_tokens_seen": 6985286, "num_tokens": 6985286.0, "step": 3240, "train_runtime": 5076.9142, "train_tokens_per_second": 1375.892 }, { "entropy": 1.0884857654571534, "epoch": 0.649, "grad_norm": 82.66061401367188, "learning_rate": 5.800906267188773e-06, "loss": 1.9733, "mean_token_accuracy": 0.6199260115623474, "num_input_tokens_seen": 6998859, "num_tokens": 6998859.0, "step": 3245, "train_runtime": 5085.1298, "train_tokens_per_second": 1376.338 }, { "entropy": 1.3101659297943116, "epoch": 0.65, "grad_norm": 29.21197509765625, "learning_rate": 5.771534498467908e-06, "loss": 1.3326, "mean_token_accuracy": 0.7344311118125916, "num_input_tokens_seen": 7009854, "num_tokens": 7009854.0, "step": 3250, "train_runtime": 5092.8432, "train_tokens_per_second": 1376.413 }, { "entropy": 1.5312119841575622, "epoch": 0.651, "grad_norm": 109.76095581054688, "learning_rate": 5.742207084349274e-06, "loss": 1.3821, "mean_token_accuracy": 0.7366913914680481, "num_input_tokens_seen": 7020992, "num_tokens": 7020992.0, "step": 3255, "train_runtime": 5100.8326, "train_tokens_per_second": 1376.44 }, { "entropy": 1.491343879699707, "epoch": 0.652, "grad_norm": 47.0439567565918, "learning_rate": 5.712924332463575e-06, "loss": 1.3979, "mean_token_accuracy": 0.7185340642929077, "num_input_tokens_seen": 7033140, "num_tokens": 7033140.0, "step": 3260, "train_runtime": 5108.926, "train_tokens_per_second": 1376.638 }, { "entropy": 1.1049851179122925, "epoch": 0.653, "grad_norm": 93.34063720703125, "learning_rate": 5.683686549973018e-06, "loss": 1.4034, "mean_token_accuracy": 0.7072288274765015, "num_input_tokens_seen": 7043801, "num_tokens": 7043801.0, "step": 3265, "train_runtime": 5116.6557, "train_tokens_per_second": 1376.642 }, { "entropy": 0.9945809125900269, "epoch": 0.654, "grad_norm": 34.19245147705078, "learning_rate": 5.654494043568109e-06, "loss": 1.1846, "mean_token_accuracy": 0.7553728222846985, "num_input_tokens_seen": 7055668, "num_tokens": 7055668.0, "step": 3270, "train_runtime": 5124.5857, "train_tokens_per_second": 1376.827 }, { "entropy": 1.0684616923332215, "epoch": 0.655, "grad_norm": 80.13645935058594, "learning_rate": 5.625347119464422e-06, "loss": 1.4199, "mean_token_accuracy": 0.7120877385139466, "num_input_tokens_seen": 7070000, "num_tokens": 7070000.0, "step": 3275, "train_runtime": 5132.4815, "train_tokens_per_second": 1377.501 }, { "entropy": 1.0304800510406493, "epoch": 0.656, "grad_norm": 51.343345642089844, "learning_rate": 5.596246083399402e-06, "loss": 1.4432, "mean_token_accuracy": 0.719438111782074, "num_input_tokens_seen": 7082962, "num_tokens": 7082962.0, "step": 3280, "train_runtime": 5140.5795, "train_tokens_per_second": 1377.853 }, { "entropy": 1.3814075708389282, "epoch": 0.657, "grad_norm": 66.12107849121094, "learning_rate": 5.567191240629151e-06, "loss": 1.4182, "mean_token_accuracy": 0.712137508392334, "num_input_tokens_seen": 7094547, "num_tokens": 7094547.0, "step": 3285, "train_runtime": 5148.5643, "train_tokens_per_second": 1377.966 }, { "entropy": 1.4290355920791626, "epoch": 0.658, "grad_norm": 43.175453186035156, "learning_rate": 5.538182895925212e-06, "loss": 1.149, "mean_token_accuracy": 0.7581753849983215, "num_input_tokens_seen": 7104058, "num_tokens": 7104058.0, "step": 3290, "train_runtime": 5156.3069, "train_tokens_per_second": 1377.742 }, { "entropy": 1.028139889240265, "epoch": 0.659, "grad_norm": 68.8287124633789, "learning_rate": 5.509221353571404e-06, "loss": 1.2311, "mean_token_accuracy": 0.7485540986061097, "num_input_tokens_seen": 7117221, "num_tokens": 7117221.0, "step": 3295, "train_runtime": 5164.0308, "train_tokens_per_second": 1378.23 }, { "entropy": 1.481036901473999, "epoch": 0.66, "grad_norm": 48.31966018676758, "learning_rate": 5.4803069173605915e-06, "loss": 1.1383, "mean_token_accuracy": 0.7699462294578552, "num_input_tokens_seen": 7126290, "num_tokens": 7126290.0, "step": 3300, "train_runtime": 5171.796, "train_tokens_per_second": 1377.914 }, { "entropy": 1.360877013206482, "epoch": 0.661, "grad_norm": 70.4743881225586, "learning_rate": 5.451439890591539e-06, "loss": 1.468, "mean_token_accuracy": 0.709950840473175, "num_input_tokens_seen": 7139034, "num_tokens": 7139034.0, "step": 3305, "train_runtime": 5179.6949, "train_tokens_per_second": 1378.273 }, { "entropy": 1.1276076555252075, "epoch": 0.662, "grad_norm": 47.95039367675781, "learning_rate": 5.422620576065689e-06, "loss": 1.1417, "mean_token_accuracy": 0.7634807586669922, "num_input_tokens_seen": 7152512, "num_tokens": 7152512.0, "step": 3310, "train_runtime": 5187.4348, "train_tokens_per_second": 1378.815 }, { "entropy": 1.2883234739303588, "epoch": 0.663, "grad_norm": 71.47711944580078, "learning_rate": 5.3938492760840176e-06, "loss": 1.5005, "mean_token_accuracy": 0.7095641493797302, "num_input_tokens_seen": 7164519, "num_tokens": 7164519.0, "step": 3315, "train_runtime": 5195.385, "train_tokens_per_second": 1379.016 }, { "entropy": 1.580101990699768, "epoch": 0.664, "grad_norm": 30.757640838623047, "learning_rate": 5.365126292443852e-06, "loss": 1.0996, "mean_token_accuracy": 0.7840258717536926, "num_input_tokens_seen": 7176182, "num_tokens": 7176182.0, "step": 3320, "train_runtime": 5203.0377, "train_tokens_per_second": 1379.229 }, { "entropy": 1.3178885221481322, "epoch": 0.665, "grad_norm": 72.54142761230469, "learning_rate": 5.336451926435688e-06, "loss": 1.113, "mean_token_accuracy": 0.7665977120399475, "num_input_tokens_seen": 7185290, "num_tokens": 7185290.0, "step": 3325, "train_runtime": 5210.78, "train_tokens_per_second": 1378.928 }, { "entropy": 1.3558390378952025, "epoch": 0.666, "grad_norm": 40.15840530395508, "learning_rate": 5.307826478840068e-06, "loss": 1.0077, "mean_token_accuracy": 0.7872235774993896, "num_input_tokens_seen": 7194998, "num_tokens": 7194998.0, "step": 3330, "train_runtime": 5218.6663, "train_tokens_per_second": 1378.704 }, { "entropy": 1.7559852600097656, "epoch": 0.667, "grad_norm": 69.71064758300781, "learning_rate": 5.279250249924384e-06, "loss": 1.3817, "mean_token_accuracy": 0.7237523674964905, "num_input_tokens_seen": 7209172, "num_tokens": 7209172.0, "step": 3335, "train_runtime": 5226.4248, "train_tokens_per_second": 1379.37 }, { "entropy": 1.160668921470642, "epoch": 0.668, "grad_norm": 50.834442138671875, "learning_rate": 5.2507235394397595e-06, "loss": 1.1571, "mean_token_accuracy": 0.7607069492340088, "num_input_tokens_seen": 7221632, "num_tokens": 7221632.0, "step": 3340, "train_runtime": 5234.4348, "train_tokens_per_second": 1379.639 }, { "entropy": 1.5356831312179566, "epoch": 0.669, "grad_norm": 93.92755889892578, "learning_rate": 5.222246646617886e-06, "loss": 1.5365, "mean_token_accuracy": 0.7059543132781982, "num_input_tokens_seen": 7231411, "num_tokens": 7231411.0, "step": 3345, "train_runtime": 5242.288, "train_tokens_per_second": 1379.438 }, { "entropy": 1.406112504005432, "epoch": 0.67, "grad_norm": 38.75007629394531, "learning_rate": 5.193819870167893e-06, "loss": 1.1527, "mean_token_accuracy": 0.7605517387390137, "num_input_tokens_seen": 7241916, "num_tokens": 7241916.0, "step": 3350, "train_runtime": 5249.7857, "train_tokens_per_second": 1379.469 }, { "entropy": 1.2977517604827882, "epoch": 0.671, "grad_norm": 83.71302795410156, "learning_rate": 5.165443508273218e-06, "loss": 1.3082, "mean_token_accuracy": 0.7317711710929871, "num_input_tokens_seen": 7254575, "num_tokens": 7254575.0, "step": 3355, "train_runtime": 5257.8636, "train_tokens_per_second": 1379.757 }, { "entropy": 1.3731822490692138, "epoch": 0.672, "grad_norm": 45.7840461730957, "learning_rate": 5.137117858588472e-06, "loss": 1.1387, "mean_token_accuracy": 0.75747652053833, "num_input_tokens_seen": 7266090, "num_tokens": 7266090.0, "step": 3360, "train_runtime": 5265.7203, "train_tokens_per_second": 1379.885 }, { "entropy": 1.6424699544906616, "epoch": 0.673, "grad_norm": 99.49372100830078, "learning_rate": 5.10884321823631e-06, "loss": 1.5842, "mean_token_accuracy": 0.6953648805618287, "num_input_tokens_seen": 7275735, "num_tokens": 7275735.0, "step": 3365, "train_runtime": 5273.3612, "train_tokens_per_second": 1379.715 }, { "entropy": 1.6496708154678346, "epoch": 0.674, "grad_norm": 58.99229431152344, "learning_rate": 5.080619883804333e-06, "loss": 1.2866, "mean_token_accuracy": 0.7398347973823547, "num_input_tokens_seen": 7287078, "num_tokens": 7287078.0, "step": 3370, "train_runtime": 5281.3124, "train_tokens_per_second": 1379.785 }, { "entropy": 1.222231125831604, "epoch": 0.675, "grad_norm": 64.47329711914062, "learning_rate": 5.0524481513419675e-06, "loss": 1.108, "mean_token_accuracy": 0.7603111863136292, "num_input_tokens_seen": 7300061, "num_tokens": 7300061.0, "step": 3375, "train_runtime": 5289.2031, "train_tokens_per_second": 1380.182 }, { "entropy": 1.5332202911376953, "epoch": 0.676, "grad_norm": 38.009098052978516, "learning_rate": 5.02432831635735e-06, "loss": 1.2901, "mean_token_accuracy": 0.7395783424377441, "num_input_tokens_seen": 7309058, "num_tokens": 7309058.0, "step": 3380, "train_runtime": 5296.7803, "train_tokens_per_second": 1379.906 }, { "entropy": 1.6721830129623414, "epoch": 0.677, "grad_norm": 114.00556945800781, "learning_rate": 4.99626067381425e-06, "loss": 1.4868, "mean_token_accuracy": 0.7013270258903503, "num_input_tokens_seen": 7319627, "num_tokens": 7319627.0, "step": 3385, "train_runtime": 5304.6784, "train_tokens_per_second": 1379.844 }, { "entropy": 1.4652080297470094, "epoch": 0.678, "grad_norm": 46.91156768798828, "learning_rate": 4.96824551812895e-06, "loss": 1.1966, "mean_token_accuracy": 0.7518588781356812, "num_input_tokens_seen": 7331542, "num_tokens": 7331542.0, "step": 3390, "train_runtime": 5312.3693, "train_tokens_per_second": 1380.089 }, { "entropy": 1.356518268585205, "epoch": 0.679, "grad_norm": 93.47769927978516, "learning_rate": 4.9402831431671834e-06, "loss": 1.1144, "mean_token_accuracy": 0.7566239953041076, "num_input_tokens_seen": 7341666, "num_tokens": 7341666.0, "step": 3395, "train_runtime": 5320.3662, "train_tokens_per_second": 1379.917 }, { "entropy": 1.5254196405410767, "epoch": 0.68, "grad_norm": 54.97881317138672, "learning_rate": 4.912373842241025e-06, "loss": 1.1071, "mean_token_accuracy": 0.7623493432998657, "num_input_tokens_seen": 7352066, "num_tokens": 7352066.0, "step": 3400, "train_runtime": 5328.3716, "train_tokens_per_second": 1379.796 }, { "entropy": 1.2925235033035278, "epoch": 0.681, "grad_norm": 88.60552978515625, "learning_rate": 4.884517908105837e-06, "loss": 1.0951, "mean_token_accuracy": 0.7789056301116943, "num_input_tokens_seen": 7363693, "num_tokens": 7363693.0, "step": 3405, "train_runtime": 5336.0358, "train_tokens_per_second": 1379.993 }, { "entropy": 1.1981097221374513, "epoch": 0.682, "grad_norm": 25.201887130737305, "learning_rate": 4.856715632957193e-06, "loss": 1.1559, "mean_token_accuracy": 0.7624804019927979, "num_input_tokens_seen": 7378626, "num_tokens": 7378626.0, "step": 3410, "train_runtime": 5344.2447, "train_tokens_per_second": 1380.668 }, { "entropy": 1.3818325042724608, "epoch": 0.683, "grad_norm": 114.46710205078125, "learning_rate": 4.828967308427795e-06, "loss": 1.5341, "mean_token_accuracy": 0.6990022301673889, "num_input_tokens_seen": 7389684, "num_tokens": 7389684.0, "step": 3415, "train_runtime": 5352.334, "train_tokens_per_second": 1380.647 }, { "entropy": 1.3245970010757446, "epoch": 0.684, "grad_norm": 44.36334991455078, "learning_rate": 4.801273225584445e-06, "loss": 1.2813, "mean_token_accuracy": 0.7324728965759277, "num_input_tokens_seen": 7400774, "num_tokens": 7400774.0, "step": 3420, "train_runtime": 5359.9889, "train_tokens_per_second": 1380.744 }, { "entropy": 1.2406444549560547, "epoch": 0.685, "grad_norm": 78.6093521118164, "learning_rate": 4.77363367492496e-06, "loss": 1.0818, "mean_token_accuracy": 0.7650019764900208, "num_input_tokens_seen": 7410897, "num_tokens": 7410897.0, "step": 3425, "train_runtime": 5367.741, "train_tokens_per_second": 1380.636 }, { "entropy": 1.0906942486763, "epoch": 0.686, "grad_norm": 21.425600051879883, "learning_rate": 4.74604894637515e-06, "loss": 0.776, "mean_token_accuracy": 0.8266262531280517, "num_input_tokens_seen": 7421160, "num_tokens": 7421160.0, "step": 3430, "train_runtime": 5375.5291, "train_tokens_per_second": 1380.545 }, { "entropy": 1.2021840810775757, "epoch": 0.687, "grad_norm": 63.47721862792969, "learning_rate": 4.718519329285771e-06, "loss": 0.8577, "mean_token_accuracy": 0.8074276804924011, "num_input_tokens_seen": 7431335, "num_tokens": 7431335.0, "step": 3435, "train_runtime": 5383.1923, "train_tokens_per_second": 1380.47 }, { "entropy": 1.1907160758972168, "epoch": 0.688, "grad_norm": 34.15057373046875, "learning_rate": 4.69104511242947e-06, "loss": 0.8002, "mean_token_accuracy": 0.8193765878677368, "num_input_tokens_seen": 7440070, "num_tokens": 7440070.0, "step": 3440, "train_runtime": 5390.9821, "train_tokens_per_second": 1380.095 }, { "entropy": 1.4512014627456664, "epoch": 0.689, "grad_norm": 66.39862060546875, "learning_rate": 4.663626583997789e-06, "loss": 1.3509, "mean_token_accuracy": 0.725606369972229, "num_input_tokens_seen": 7448491, "num_tokens": 7448491.0, "step": 3445, "train_runtime": 5398.7426, "train_tokens_per_second": 1379.671 }, { "entropy": 1.1273661375045776, "epoch": 0.69, "grad_norm": 64.41907501220703, "learning_rate": 4.63626403159811e-06, "loss": 1.0607, "mean_token_accuracy": 0.7864980459213257, "num_input_tokens_seen": 7458880, "num_tokens": 7458880.0, "step": 3450, "train_runtime": 5406.3258, "train_tokens_per_second": 1379.658 }, { "entropy": 1.2544914484024048, "epoch": 0.691, "grad_norm": 79.77140045166016, "learning_rate": 4.608957742250667e-06, "loss": 1.0653, "mean_token_accuracy": 0.7699116230010986, "num_input_tokens_seen": 7468772, "num_tokens": 7468772.0, "step": 3455, "train_runtime": 5414.1365, "train_tokens_per_second": 1379.495 }, { "entropy": 1.1118125438690185, "epoch": 0.692, "grad_norm": 43.428001403808594, "learning_rate": 4.581708002385506e-06, "loss": 1.1568, "mean_token_accuracy": 0.7658621311187744, "num_input_tokens_seen": 7481270, "num_tokens": 7481270.0, "step": 3460, "train_runtime": 5422.1459, "train_tokens_per_second": 1379.762 }, { "entropy": 1.2565526485443115, "epoch": 0.693, "grad_norm": 69.76953887939453, "learning_rate": 4.554515097839511e-06, "loss": 1.2443, "mean_token_accuracy": 0.736496901512146, "num_input_tokens_seen": 7492065, "num_tokens": 7492065.0, "step": 3465, "train_runtime": 5429.7431, "train_tokens_per_second": 1379.819 }, { "entropy": 1.1914144515991212, "epoch": 0.694, "grad_norm": 56.5172004699707, "learning_rate": 4.527379313853381e-06, "loss": 1.2906, "mean_token_accuracy": 0.7354740619659423, "num_input_tokens_seen": 7503178, "num_tokens": 7503178.0, "step": 3470, "train_runtime": 5437.7304, "train_tokens_per_second": 1379.836 }, { "entropy": 1.3521007776260376, "epoch": 0.695, "grad_norm": 78.12995910644531, "learning_rate": 4.500300935068647e-06, "loss": 1.4214, "mean_token_accuracy": 0.7007991075515747, "num_input_tokens_seen": 7513537, "num_tokens": 7513537.0, "step": 3475, "train_runtime": 5445.4642, "train_tokens_per_second": 1379.779 }, { "entropy": 1.3210823297500611, "epoch": 0.696, "grad_norm": 65.40374755859375, "learning_rate": 4.473280245524696e-06, "loss": 1.0324, "mean_token_accuracy": 0.7783894419670105, "num_input_tokens_seen": 7521506, "num_tokens": 7521506.0, "step": 3480, "train_runtime": 5453.2367, "train_tokens_per_second": 1379.274 }, { "entropy": 1.2770194053649901, "epoch": 0.697, "grad_norm": 87.22956848144531, "learning_rate": 4.4463175286557654e-06, "loss": 0.9104, "mean_token_accuracy": 0.7937685489654541, "num_input_tokens_seen": 7530615, "num_tokens": 7530615.0, "step": 3485, "train_runtime": 5461.1172, "train_tokens_per_second": 1378.951 }, { "entropy": 1.3537542581558228, "epoch": 0.698, "grad_norm": 59.853214263916016, "learning_rate": 4.419413067288006e-06, "loss": 1.1246, "mean_token_accuracy": 0.7583097100257874, "num_input_tokens_seen": 7542760, "num_tokens": 7542760.0, "step": 3490, "train_runtime": 5469.0475, "train_tokens_per_second": 1379.173 }, { "entropy": 1.1941829681396485, "epoch": 0.699, "grad_norm": 67.97786712646484, "learning_rate": 4.39256714363648e-06, "loss": 1.1393, "mean_token_accuracy": 0.761511754989624, "num_input_tokens_seen": 7555072, "num_tokens": 7555072.0, "step": 3495, "train_runtime": 5476.9386, "train_tokens_per_second": 1379.433 }, { "entropy": 1.0276753664016725, "epoch": 0.7, "grad_norm": 61.332237243652344, "learning_rate": 4.3657800393022255e-06, "loss": 1.0819, "mean_token_accuracy": 0.7709161043167114, "num_input_tokens_seen": 7568036, "num_tokens": 7568036.0, "step": 3500, "train_runtime": 5485.0239, "train_tokens_per_second": 1379.764 }, { "entropy": 2.025991916656494, "epoch": 0.701, "grad_norm": 70.11701965332031, "learning_rate": 4.339052035269291e-06, "loss": 1.697, "mean_token_accuracy": 0.6730327725410461, "num_input_tokens_seen": 7579687, "num_tokens": 7579687.0, "step": 3505, "train_runtime": 5492.8852, "train_tokens_per_second": 1379.91 }, { "entropy": 1.5761598587036132, "epoch": 0.702, "grad_norm": 37.813316345214844, "learning_rate": 4.312383411901796e-06, "loss": 1.2894, "mean_token_accuracy": 0.7472969651222229, "num_input_tokens_seen": 7592544, "num_tokens": 7592544.0, "step": 3510, "train_runtime": 5500.8614, "train_tokens_per_second": 1380.246 }, { "entropy": 1.3647594213485719, "epoch": 0.703, "grad_norm": 64.08737182617188, "learning_rate": 4.2857744489409725e-06, "loss": 0.9106, "mean_token_accuracy": 0.8002523303031921, "num_input_tokens_seen": 7603254, "num_tokens": 7603254.0, "step": 3515, "train_runtime": 5508.7207, "train_tokens_per_second": 1380.221 }, { "entropy": 1.3827792167663575, "epoch": 0.704, "grad_norm": 36.621925354003906, "learning_rate": 4.259225425502256e-06, "loss": 1.0013, "mean_token_accuracy": 0.7848365902900696, "num_input_tokens_seen": 7613936, "num_tokens": 7613936.0, "step": 3520, "train_runtime": 5516.5405, "train_tokens_per_second": 1380.201 }, { "entropy": 1.1546970367431642, "epoch": 0.705, "grad_norm": 98.19839477539062, "learning_rate": 4.2327366200723404e-06, "loss": 1.0867, "mean_token_accuracy": 0.7713291764259338, "num_input_tokens_seen": 7626282, "num_tokens": 7626282.0, "step": 3525, "train_runtime": 5524.1125, "train_tokens_per_second": 1380.544 }, { "entropy": 1.3656534671783447, "epoch": 0.706, "grad_norm": 35.8089485168457, "learning_rate": 4.206308310506255e-06, "loss": 1.2152, "mean_token_accuracy": 0.7342224478721618, "num_input_tokens_seen": 7639688, "num_tokens": 7639688.0, "step": 3530, "train_runtime": 5532.2717, "train_tokens_per_second": 1380.931 }, { "entropy": 1.325789499282837, "epoch": 0.707, "grad_norm": 96.21459197998047, "learning_rate": 4.179940774024469e-06, "loss": 1.4048, "mean_token_accuracy": 0.7267882227897644, "num_input_tokens_seen": 7652200, "num_tokens": 7652200.0, "step": 3535, "train_runtime": 5539.8956, "train_tokens_per_second": 1381.29 }, { "entropy": 1.4081687450408935, "epoch": 0.708, "grad_norm": 46.83771896362305, "learning_rate": 4.153634287209955e-06, "loss": 1.0285, "mean_token_accuracy": 0.7776418089866638, "num_input_tokens_seen": 7665218, "num_tokens": 7665218.0, "step": 3540, "train_runtime": 5547.8757, "train_tokens_per_second": 1381.649 }, { "entropy": 1.2783577203750611, "epoch": 0.709, "grad_norm": 69.18657684326172, "learning_rate": 4.127389126005319e-06, "loss": 1.0885, "mean_token_accuracy": 0.7689595103263855, "num_input_tokens_seen": 7678032, "num_tokens": 7678032.0, "step": 3545, "train_runtime": 5555.8978, "train_tokens_per_second": 1381.961 }, { "entropy": 1.8628759384155273, "epoch": 0.71, "grad_norm": 39.14692306518555, "learning_rate": 4.101205565709876e-06, "loss": 1.228, "mean_token_accuracy": 0.7432847023010254, "num_input_tokens_seen": 7688826, "num_tokens": 7688826.0, "step": 3550, "train_runtime": 5563.8139, "train_tokens_per_second": 1381.934 }, { "entropy": 1.3626526117324829, "epoch": 0.711, "grad_norm": 63.692752838134766, "learning_rate": 4.0750838809767875e-06, "loss": 0.9141, "mean_token_accuracy": 0.7917525887489318, "num_input_tokens_seen": 7699381, "num_tokens": 7699381.0, "step": 3555, "train_runtime": 5571.4891, "train_tokens_per_second": 1381.925 }, { "entropy": 1.298656415939331, "epoch": 0.712, "grad_norm": 54.293174743652344, "learning_rate": 4.049024345810169e-06, "loss": 1.2901, "mean_token_accuracy": 0.7231242895126343, "num_input_tokens_seen": 7710538, "num_tokens": 7710538.0, "step": 3560, "train_runtime": 5579.1873, "train_tokens_per_second": 1382.018 }, { "entropy": 1.4240386724472045, "epoch": 0.713, "grad_norm": 93.25569915771484, "learning_rate": 4.0230272335622065e-06, "loss": 1.1743, "mean_token_accuracy": 0.7511500358581543, "num_input_tokens_seen": 7722556, "num_tokens": 7722556.0, "step": 3565, "train_runtime": 5587.1578, "train_tokens_per_second": 1382.198 }, { "entropy": 1.360674262046814, "epoch": 0.714, "grad_norm": 31.99083137512207, "learning_rate": 3.997092816930313e-06, "loss": 1.0609, "mean_token_accuracy": 0.7683918833732605, "num_input_tokens_seen": 7734854, "num_tokens": 7734854.0, "step": 3570, "train_runtime": 5595.1576, "train_tokens_per_second": 1382.419 }, { "entropy": 1.270754337310791, "epoch": 0.715, "grad_norm": 111.44794464111328, "learning_rate": 3.971221367954239e-06, "loss": 1.4725, "mean_token_accuracy": 0.7106862783432006, "num_input_tokens_seen": 7748383, "num_tokens": 7748383.0, "step": 3575, "train_runtime": 5603.2241, "train_tokens_per_second": 1382.844 }, { "entropy": 1.4692581176757813, "epoch": 0.716, "grad_norm": 29.97042465209961, "learning_rate": 3.945413158013249e-06, "loss": 0.7738, "mean_token_accuracy": 0.816168236732483, "num_input_tokens_seen": 7757056, "num_tokens": 7757056.0, "step": 3580, "train_runtime": 5610.7416, "train_tokens_per_second": 1382.537 }, { "entropy": 1.471003270149231, "epoch": 0.717, "grad_norm": 61.03013610839844, "learning_rate": 3.919668457823248e-06, "loss": 1.2461, "mean_token_accuracy": 0.7387072563171386, "num_input_tokens_seen": 7766439, "num_tokens": 7766439.0, "step": 3585, "train_runtime": 5618.5232, "train_tokens_per_second": 1382.292 }, { "entropy": 1.311043381690979, "epoch": 0.718, "grad_norm": 27.74488067626953, "learning_rate": 3.893987537433961e-06, "loss": 0.9035, "mean_token_accuracy": 0.7923862338066101, "num_input_tokens_seen": 7776354, "num_tokens": 7776354.0, "step": 3590, "train_runtime": 5626.0246, "train_tokens_per_second": 1382.211 }, { "entropy": 1.228741216659546, "epoch": 0.719, "grad_norm": 61.496116638183594, "learning_rate": 3.8683706662260945e-06, "loss": 1.1574, "mean_token_accuracy": 0.7505334496498108, "num_input_tokens_seen": 7787384, "num_tokens": 7787384.0, "step": 3595, "train_runtime": 5634.1128, "train_tokens_per_second": 1382.185 }, { "entropy": 1.310219168663025, "epoch": 0.72, "grad_norm": 24.281906127929688, "learning_rate": 3.842818112908498e-06, "loss": 0.8724, "mean_token_accuracy": 0.8005131959915162, "num_input_tokens_seen": 7796174, "num_tokens": 7796174.0, "step": 3600, "train_runtime": 5641.8652, "train_tokens_per_second": 1381.843 }, { "entropy": 1.383880877494812, "epoch": 0.721, "grad_norm": 66.84306335449219, "learning_rate": 3.817330145515374e-06, "loss": 0.9604, "mean_token_accuracy": 0.7808654308319092, "num_input_tokens_seen": 7805054, "num_tokens": 7805054.0, "step": 3605, "train_runtime": 5649.6305, "train_tokens_per_second": 1381.516 }, { "entropy": 1.338327980041504, "epoch": 0.722, "grad_norm": 26.28288459777832, "learning_rate": 3.79190703140343e-06, "loss": 0.8202, "mean_token_accuracy": 0.8055516839027405, "num_input_tokens_seen": 7815046, "num_tokens": 7815046.0, "step": 3610, "train_runtime": 5657.6376, "train_tokens_per_second": 1381.327 }, { "entropy": 1.6157650232315064, "epoch": 0.723, "grad_norm": 75.97126007080078, "learning_rate": 3.766549037249112e-06, "loss": 1.1379, "mean_token_accuracy": 0.7504758834838867, "num_input_tokens_seen": 7824584, "num_tokens": 7824584.0, "step": 3615, "train_runtime": 5665.1635, "train_tokens_per_second": 1381.175 }, { "entropy": 1.2897813320159912, "epoch": 0.724, "grad_norm": 39.537925720214844, "learning_rate": 3.741256429045771e-06, "loss": 1.2922, "mean_token_accuracy": 0.7358455061912537, "num_input_tokens_seen": 7835424, "num_tokens": 7835424.0, "step": 3620, "train_runtime": 5672.9102, "train_tokens_per_second": 1381.2 }, { "entropy": 1.3352119684219361, "epoch": 0.725, "grad_norm": 113.45731353759766, "learning_rate": 3.7160294721009026e-06, "loss": 1.482, "mean_token_accuracy": 0.6937533259391785, "num_input_tokens_seen": 7845869, "num_tokens": 7845869.0, "step": 3625, "train_runtime": 5680.8281, "train_tokens_per_second": 1381.114 }, { "entropy": 1.3501306295394897, "epoch": 0.726, "grad_norm": 25.94645881652832, "learning_rate": 3.690868431033352e-06, "loss": 1.0593, "mean_token_accuracy": 0.7666467428207397, "num_input_tokens_seen": 7855314, "num_tokens": 7855314.0, "step": 3630, "train_runtime": 5688.6461, "train_tokens_per_second": 1380.876 }, { "entropy": 1.2169443607330321, "epoch": 0.727, "grad_norm": 108.32181549072266, "learning_rate": 3.6657735697705267e-06, "loss": 1.0543, "mean_token_accuracy": 0.7693784713745118, "num_input_tokens_seen": 7868535, "num_tokens": 7868535.0, "step": 3635, "train_runtime": 5696.7057, "train_tokens_per_second": 1381.243 }, { "entropy": 1.6997152805328368, "epoch": 0.728, "grad_norm": 68.39515686035156, "learning_rate": 3.6407451515456537e-06, "loss": 1.2771, "mean_token_accuracy": 0.7291630864143371, "num_input_tokens_seen": 7880378, "num_tokens": 7880378.0, "step": 3640, "train_runtime": 5704.2277, "train_tokens_per_second": 1381.498 }, { "entropy": 1.3734771490097046, "epoch": 0.729, "grad_norm": 102.05339050292969, "learning_rate": 3.6157834388949907e-06, "loss": 1.2234, "mean_token_accuracy": 0.734777820110321, "num_input_tokens_seen": 7892012, "num_tokens": 7892012.0, "step": 3645, "train_runtime": 5712.1786, "train_tokens_per_second": 1381.612 }, { "entropy": 1.465916895866394, "epoch": 0.73, "grad_norm": 41.61610412597656, "learning_rate": 3.5908886936550967e-06, "loss": 1.0414, "mean_token_accuracy": 0.7643039345741272, "num_input_tokens_seen": 7900746, "num_tokens": 7900746.0, "step": 3650, "train_runtime": 5719.5327, "train_tokens_per_second": 1381.362 }, { "entropy": 1.5392575979232788, "epoch": 0.731, "grad_norm": 121.2569580078125, "learning_rate": 3.5660611769600604e-06, "loss": 1.3567, "mean_token_accuracy": 0.7084240794181824, "num_input_tokens_seen": 7911043, "num_tokens": 7911043.0, "step": 3655, "train_runtime": 5727.5983, "train_tokens_per_second": 1381.215 }, { "entropy": 2.0418188095092775, "epoch": 0.732, "grad_norm": 30.220109939575195, "learning_rate": 3.541301149238798e-06, "loss": 1.2486, "mean_token_accuracy": 0.7370859503746032, "num_input_tokens_seen": 7923418, "num_tokens": 7923418.0, "step": 3660, "train_runtime": 5735.6269, "train_tokens_per_second": 1381.439 }, { "entropy": 1.1118021965026856, "epoch": 0.733, "grad_norm": 83.36518096923828, "learning_rate": 3.5166088702122738e-06, "loss": 1.1055, "mean_token_accuracy": 0.7630825877189636, "num_input_tokens_seen": 7936936, "num_tokens": 7936936.0, "step": 3665, "train_runtime": 5743.6571, "train_tokens_per_second": 1381.861 }, { "entropy": 1.230707597732544, "epoch": 0.734, "grad_norm": 39.548377990722656, "learning_rate": 3.491984598890812e-06, "loss": 1.1888, "mean_token_accuracy": 0.7464107990264892, "num_input_tokens_seen": 7948660, "num_tokens": 7948660.0, "step": 3670, "train_runtime": 5751.3058, "train_tokens_per_second": 1382.062 }, { "entropy": 1.2973393201828003, "epoch": 0.735, "grad_norm": 74.15829467773438, "learning_rate": 3.4674285935713715e-06, "loss": 1.0265, "mean_token_accuracy": 0.767569637298584, "num_input_tokens_seen": 7958934, "num_tokens": 7958934.0, "step": 3675, "train_runtime": 5759.0931, "train_tokens_per_second": 1381.977 }, { "entropy": 1.3392265796661378, "epoch": 0.736, "grad_norm": 30.114980697631836, "learning_rate": 3.442941111834822e-06, "loss": 1.0624, "mean_token_accuracy": 0.7677551507949829, "num_input_tokens_seen": 7972624, "num_tokens": 7972624.0, "step": 3680, "train_runtime": 5766.9242, "train_tokens_per_second": 1382.474 }, { "entropy": 1.2502947807312013, "epoch": 0.737, "grad_norm": 66.16523742675781, "learning_rate": 3.418522410543266e-06, "loss": 1.1296, "mean_token_accuracy": 0.7529385447502136, "num_input_tokens_seen": 7984610, "num_tokens": 7984610.0, "step": 3685, "train_runtime": 5775.0309, "train_tokens_per_second": 1382.609 }, { "entropy": 1.2900609970092773, "epoch": 0.738, "grad_norm": 34.53410339355469, "learning_rate": 3.3941727458373177e-06, "loss": 1.1091, "mean_token_accuracy": 0.7618632912635803, "num_input_tokens_seen": 7995562, "num_tokens": 7995562.0, "step": 3690, "train_runtime": 5782.8775, "train_tokens_per_second": 1382.627 }, { "entropy": 1.8589048385620117, "epoch": 0.739, "grad_norm": 63.71791458129883, "learning_rate": 3.3698923731334453e-06, "loss": 1.2904, "mean_token_accuracy": 0.7397416234016418, "num_input_tokens_seen": 8007010, "num_tokens": 8007010.0, "step": 3695, "train_runtime": 5790.8039, "train_tokens_per_second": 1382.711 }, { "entropy": 1.2792827606201171, "epoch": 0.74, "grad_norm": 37.82085037231445, "learning_rate": 3.3456815471212634e-06, "loss": 0.9094, "mean_token_accuracy": 0.7880057692527771, "num_input_tokens_seen": 8018576, "num_tokens": 8018576.0, "step": 3700, "train_runtime": 5798.4433, "train_tokens_per_second": 1382.884 }, { "entropy": 1.5248919486999513, "epoch": 0.741, "grad_norm": 58.85905838012695, "learning_rate": 3.321540521760883e-06, "loss": 1.4223, "mean_token_accuracy": 0.7132097601890564, "num_input_tokens_seen": 8032018, "num_tokens": 8032018.0, "step": 3705, "train_runtime": 5806.2949, "train_tokens_per_second": 1383.329 }, { "entropy": 1.1351463675498963, "epoch": 0.742, "grad_norm": 43.24095916748047, "learning_rate": 3.297469550280239e-06, "loss": 0.8731, "mean_token_accuracy": 0.8049924373626709, "num_input_tokens_seen": 8044422, "num_tokens": 8044422.0, "step": 3710, "train_runtime": 5814.4148, "train_tokens_per_second": 1383.531 }, { "entropy": 1.2750665187835692, "epoch": 0.743, "grad_norm": 86.18648529052734, "learning_rate": 3.2734688851724273e-06, "loss": 1.1543, "mean_token_accuracy": 0.7470584511756897, "num_input_tokens_seen": 8057490, "num_tokens": 8057490.0, "step": 3715, "train_runtime": 5822.4754, "train_tokens_per_second": 1383.86 }, { "entropy": 1.684879994392395, "epoch": 0.744, "grad_norm": 52.51597213745117, "learning_rate": 3.249538778193074e-06, "loss": 1.2934, "mean_token_accuracy": 0.7216780781745911, "num_input_tokens_seen": 8068534, "num_tokens": 8068534.0, "step": 3720, "train_runtime": 5830.4665, "train_tokens_per_second": 1383.857 }, { "entropy": 1.2267857551574708, "epoch": 0.745, "grad_norm": 89.6382064819336, "learning_rate": 3.2256794803576707e-06, "loss": 0.9691, "mean_token_accuracy": 0.7910555720329284, "num_input_tokens_seen": 8076482, "num_tokens": 8076482.0, "step": 3725, "train_runtime": 5838.1393, "train_tokens_per_second": 1383.4 }, { "entropy": 1.2867751598358155, "epoch": 0.746, "grad_norm": 29.939369201660156, "learning_rate": 3.201891241938969e-06, "loss": 0.8934, "mean_token_accuracy": 0.7952397346496582, "num_input_tokens_seen": 8086942, "num_tokens": 8086942.0, "step": 3730, "train_runtime": 5845.6627, "train_tokens_per_second": 1383.409 }, { "entropy": 1.369941806793213, "epoch": 0.747, "grad_norm": 58.967952728271484, "learning_rate": 3.178174312464326e-06, "loss": 1.28, "mean_token_accuracy": 0.7203678250312805, "num_input_tokens_seen": 8099103, "num_tokens": 8099103.0, "step": 3735, "train_runtime": 5853.4576, "train_tokens_per_second": 1383.644 }, { "entropy": 1.2704147338867187, "epoch": 0.748, "grad_norm": 31.792207717895508, "learning_rate": 3.1545289407131128e-06, "loss": 0.9142, "mean_token_accuracy": 0.7880112767219544, "num_input_tokens_seen": 8110906, "num_tokens": 8110906.0, "step": 3740, "train_runtime": 5861.4526, "train_tokens_per_second": 1383.771 }, { "entropy": 1.6253525972366334, "epoch": 0.749, "grad_norm": 103.6296157836914, "learning_rate": 3.130955374714094e-06, "loss": 1.6635, "mean_token_accuracy": 0.6671479344367981, "num_input_tokens_seen": 8120765, "num_tokens": 8120765.0, "step": 3745, "train_runtime": 5869.2034, "train_tokens_per_second": 1383.623 }, { "entropy": 1.4856916427612306, "epoch": 0.75, "grad_norm": 38.285648345947266, "learning_rate": 3.107453861742815e-06, "loss": 1.2364, "mean_token_accuracy": 0.7383233428001403, "num_input_tokens_seen": 8128944, "num_tokens": 8128944.0, "step": 3750, "train_runtime": 5876.937, "train_tokens_per_second": 1383.194 }, { "entropy": 1.2395502090454102, "epoch": 0.751, "grad_norm": 55.0019645690918, "learning_rate": 3.0840246483190338e-06, "loss": 1.002, "mean_token_accuracy": 0.7732610821723938, "num_input_tokens_seen": 8139569, "num_tokens": 8139569.0, "step": 3755, "train_runtime": 5884.5032, "train_tokens_per_second": 1383.221 }, { "entropy": 1.1674710035324096, "epoch": 0.752, "grad_norm": 28.131237030029297, "learning_rate": 3.060667980204104e-06, "loss": 0.8729, "mean_token_accuracy": 0.7994015455245972, "num_input_tokens_seen": 8151152, "num_tokens": 8151152.0, "step": 3760, "train_runtime": 5892.3816, "train_tokens_per_second": 1383.337 }, { "entropy": 1.4738351106643677, "epoch": 0.753, "grad_norm": 69.5341796875, "learning_rate": 3.037384102398431e-06, "loss": 1.0914, "mean_token_accuracy": 0.7601243495941162, "num_input_tokens_seen": 8162111, "num_tokens": 8162111.0, "step": 3765, "train_runtime": 5899.9236, "train_tokens_per_second": 1383.427 }, { "entropy": 0.8155726194381714, "epoch": 0.754, "grad_norm": 42.369720458984375, "learning_rate": 3.014173259138867e-06, "loss": 1.101, "mean_token_accuracy": 0.7413458943367004, "num_input_tokens_seen": 8173740, "num_tokens": 8173740.0, "step": 3770, "train_runtime": 5907.8993, "train_tokens_per_second": 1383.527 }, { "entropy": 1.5831396579742432, "epoch": 0.755, "grad_norm": 77.57059478759766, "learning_rate": 2.9910356938961782e-06, "loss": 1.4341, "mean_token_accuracy": 0.7117488861083985, "num_input_tokens_seen": 8186604, "num_tokens": 8186604.0, "step": 3775, "train_runtime": 5915.8604, "train_tokens_per_second": 1383.84 }, { "entropy": 1.4315198898315429, "epoch": 0.756, "grad_norm": 32.01146697998047, "learning_rate": 2.9679716493724795e-06, "loss": 1.2518, "mean_token_accuracy": 0.736680555343628, "num_input_tokens_seen": 8198738, "num_tokens": 8198738.0, "step": 3780, "train_runtime": 5923.8656, "train_tokens_per_second": 1384.018 }, { "entropy": 1.3986699819564818, "epoch": 0.757, "grad_norm": 75.7470932006836, "learning_rate": 2.944981367498677e-06, "loss": 1.1652, "mean_token_accuracy": 0.7525392174720764, "num_input_tokens_seen": 8210536, "num_tokens": 8210536.0, "step": 3785, "train_runtime": 5931.6476, "train_tokens_per_second": 1384.191 }, { "entropy": 1.2854469060897826, "epoch": 0.758, "grad_norm": 35.5533561706543, "learning_rate": 2.9220650894319557e-06, "loss": 1.1521, "mean_token_accuracy": 0.746622908115387, "num_input_tokens_seen": 8224298, "num_tokens": 8224298.0, "step": 3790, "train_runtime": 5939.4116, "train_tokens_per_second": 1384.699 }, { "entropy": 1.3270967960357667, "epoch": 0.759, "grad_norm": 67.43975830078125, "learning_rate": 2.899223055553221e-06, "loss": 1.1547, "mean_token_accuracy": 0.7511350512504578, "num_input_tokens_seen": 8233429, "num_tokens": 8233429.0, "step": 3795, "train_runtime": 5947.2051, "train_tokens_per_second": 1384.42 }, { "entropy": 1.268804931640625, "epoch": 0.76, "grad_norm": 30.384315490722656, "learning_rate": 2.8764555054646083e-06, "loss": 0.9725, "mean_token_accuracy": 0.7705807209014892, "num_input_tokens_seen": 8243088, "num_tokens": 8243088.0, "step": 3800, "train_runtime": 5955.0364, "train_tokens_per_second": 1384.221 }, { "entropy": 1.5043867588043214, "epoch": 0.761, "grad_norm": 92.78527069091797, "learning_rate": 2.853762677986932e-06, "loss": 1.3491, "mean_token_accuracy": 0.7136110663414001, "num_input_tokens_seen": 8254863, "num_tokens": 8254863.0, "step": 3805, "train_runtime": 5963.0848, "train_tokens_per_second": 1384.328 }, { "entropy": 1.4746314525604247, "epoch": 0.762, "grad_norm": 53.46134948730469, "learning_rate": 2.8311448111572304e-06, "loss": 1.2884, "mean_token_accuracy": 0.7179763436317443, "num_input_tokens_seen": 8265916, "num_tokens": 8265916.0, "step": 3810, "train_runtime": 5970.9, "train_tokens_per_second": 1384.367 }, { "entropy": 1.3656929016113282, "epoch": 0.763, "grad_norm": 59.72466278076172, "learning_rate": 2.808602142226212e-06, "loss": 1.1113, "mean_token_accuracy": 0.7416937470436096, "num_input_tokens_seen": 8279548, "num_tokens": 8279548.0, "step": 3815, "train_runtime": 5979.0338, "train_tokens_per_second": 1384.764 }, { "entropy": 1.4616585493087768, "epoch": 0.764, "grad_norm": 32.81327819824219, "learning_rate": 2.786134907655814e-06, "loss": 1.1189, "mean_token_accuracy": 0.7607102632522583, "num_input_tokens_seen": 8290708, "num_tokens": 8290708.0, "step": 3820, "train_runtime": 5986.5324, "train_tokens_per_second": 1384.893 }, { "entropy": 1.0893842458724976, "epoch": 0.765, "grad_norm": 75.71564483642578, "learning_rate": 2.7637433431166903e-06, "loss": 1.1981, "mean_token_accuracy": 0.7351195216178894, "num_input_tokens_seen": 8303436, "num_tokens": 8303436.0, "step": 3825, "train_runtime": 5994.6823, "train_tokens_per_second": 1385.134 }, { "entropy": 1.922438383102417, "epoch": 0.766, "grad_norm": 51.680110931396484, "learning_rate": 2.741427683485759e-06, "loss": 1.4348, "mean_token_accuracy": 0.7030194520950317, "num_input_tokens_seen": 8316720, "num_tokens": 8316720.0, "step": 3830, "train_runtime": 6002.8227, "train_tokens_per_second": 1385.468 }, { "entropy": 1.4860228776931763, "epoch": 0.767, "grad_norm": 80.91349792480469, "learning_rate": 2.7191881628437335e-06, "loss": 0.971, "mean_token_accuracy": 0.7847875833511353, "num_input_tokens_seen": 8328361, "num_tokens": 8328361.0, "step": 3835, "train_runtime": 6010.7915, "train_tokens_per_second": 1385.568 }, { "entropy": 1.6241419792175293, "epoch": 0.768, "grad_norm": 49.88835525512695, "learning_rate": 2.6970250144726563e-06, "loss": 0.9152, "mean_token_accuracy": 0.7817273736000061, "num_input_tokens_seen": 8339202, "num_tokens": 8339202.0, "step": 3840, "train_runtime": 6018.7785, "train_tokens_per_second": 1385.531 }, { "entropy": 1.3531593561172486, "epoch": 0.769, "grad_norm": 77.78016662597656, "learning_rate": 2.674938470853472e-06, "loss": 1.3924, "mean_token_accuracy": 0.6933253407478333, "num_input_tokens_seen": 8351667, "num_tokens": 8351667.0, "step": 3845, "train_runtime": 6026.3944, "train_tokens_per_second": 1385.848 }, { "entropy": 1.5034178018569946, "epoch": 0.77, "grad_norm": 50.22134017944336, "learning_rate": 2.652928763663567e-06, "loss": 1.0682, "mean_token_accuracy": 0.7562308311462402, "num_input_tokens_seen": 8360698, "num_tokens": 8360698.0, "step": 3850, "train_runtime": 6034.2165, "train_tokens_per_second": 1385.548 }, { "entropy": 1.3400921821594238, "epoch": 0.771, "grad_norm": 81.12057495117188, "learning_rate": 2.6309961237743587e-06, "loss": 0.9269, "mean_token_accuracy": 0.7889165639877319, "num_input_tokens_seen": 8370530, "num_tokens": 8370530.0, "step": 3855, "train_runtime": 6042.1609, "train_tokens_per_second": 1385.354 }, { "entropy": 1.5574659824371337, "epoch": 0.772, "grad_norm": 48.00089645385742, "learning_rate": 2.6091407812488567e-06, "loss": 1.3249, "mean_token_accuracy": 0.7249131202697754, "num_input_tokens_seen": 8379790, "num_tokens": 8379790.0, "step": 3860, "train_runtime": 6049.9955, "train_tokens_per_second": 1385.09 }, { "entropy": 1.6976630926132201, "epoch": 0.773, "grad_norm": 59.483970642089844, "learning_rate": 2.5873629653392653e-06, "loss": 1.4304, "mean_token_accuracy": 0.6994726181030273, "num_input_tokens_seen": 8392547, "num_tokens": 8392547.0, "step": 3865, "train_runtime": 6058.0092, "train_tokens_per_second": 1385.364 }, { "entropy": 1.117939329147339, "epoch": 0.774, "grad_norm": 27.776966094970703, "learning_rate": 2.5656629044845714e-06, "loss": 0.8697, "mean_token_accuracy": 0.7957409739494323, "num_input_tokens_seen": 8402402, "num_tokens": 8402402.0, "step": 3870, "train_runtime": 6065.4649, "train_tokens_per_second": 1385.286 }, { "entropy": 1.469796895980835, "epoch": 0.775, "grad_norm": 86.05614471435547, "learning_rate": 2.5440408263081385e-06, "loss": 1.6124, "mean_token_accuracy": 0.6581724524497986, "num_input_tokens_seen": 8413762, "num_tokens": 8413762.0, "step": 3875, "train_runtime": 6073.3658, "train_tokens_per_second": 1385.354 }, { "entropy": 1.4165674686431884, "epoch": 0.776, "grad_norm": 29.25621795654297, "learning_rate": 2.5224969576153413e-06, "loss": 1.088, "mean_token_accuracy": 0.753670048713684, "num_input_tokens_seen": 8424226, "num_tokens": 8424226.0, "step": 3880, "train_runtime": 6080.9381, "train_tokens_per_second": 1385.35 }, { "entropy": 1.6084112167358398, "epoch": 0.777, "grad_norm": 83.57496643066406, "learning_rate": 2.501031524391163e-06, "loss": 1.1041, "mean_token_accuracy": 0.752663516998291, "num_input_tokens_seen": 8433013, "num_tokens": 8433013.0, "step": 3885, "train_runtime": 6088.7862, "train_tokens_per_second": 1385.007 }, { "entropy": 1.297988510131836, "epoch": 0.778, "grad_norm": 42.676002502441406, "learning_rate": 2.479644751797845e-06, "loss": 0.7829, "mean_token_accuracy": 0.8128929853439331, "num_input_tokens_seen": 8443894, "num_tokens": 8443894.0, "step": 3890, "train_runtime": 6096.8415, "train_tokens_per_second": 1384.962 }, { "entropy": 1.4910779237747191, "epoch": 0.779, "grad_norm": 84.19873046875, "learning_rate": 2.458336864172508e-06, "loss": 1.4095, "mean_token_accuracy": 0.6869819402694702, "num_input_tokens_seen": 8456430, "num_tokens": 8456430.0, "step": 3895, "train_runtime": 6104.9148, "train_tokens_per_second": 1385.184 }, { "entropy": 1.4598451137542725, "epoch": 0.78, "grad_norm": 40.738975524902344, "learning_rate": 2.437108085024812e-06, "loss": 1.2405, "mean_token_accuracy": 0.7203732371330261, "num_input_tokens_seen": 8469086, "num_tokens": 8469086.0, "step": 3900, "train_runtime": 6112.5524, "train_tokens_per_second": 1385.524 }, { "entropy": 2.0186686754226684, "epoch": 0.781, "grad_norm": 42.932945251464844, "learning_rate": 2.415958637034609e-06, "loss": 1.4898, "mean_token_accuracy": 0.6851933121681213, "num_input_tokens_seen": 8482037, "num_tokens": 8482037.0, "step": 3905, "train_runtime": 6120.4227, "train_tokens_per_second": 1385.858 }, { "entropy": 1.2855441331863404, "epoch": 0.782, "grad_norm": 40.300716400146484, "learning_rate": 2.3948887420495927e-06, "loss": 0.9142, "mean_token_accuracy": 0.7816412091255188, "num_input_tokens_seen": 8490510, "num_tokens": 8490510.0, "step": 3910, "train_runtime": 6128.0801, "train_tokens_per_second": 1385.509 }, { "entropy": 1.4818305969238281, "epoch": 0.783, "grad_norm": 59.31056213378906, "learning_rate": 2.3738986210829997e-06, "loss": 0.9964, "mean_token_accuracy": 0.76490159034729, "num_input_tokens_seen": 8503124, "num_tokens": 8503124.0, "step": 3915, "train_runtime": 6136.068, "train_tokens_per_second": 1385.761 }, { "entropy": 1.4659812927246094, "epoch": 0.784, "grad_norm": 105.48592376708984, "learning_rate": 2.352988494311259e-06, "loss": 1.1407, "mean_token_accuracy": 0.7515397906303406, "num_input_tokens_seen": 8513886, "num_tokens": 8513886.0, "step": 3920, "train_runtime": 6143.9444, "train_tokens_per_second": 1385.736 }, { "entropy": 1.7747930526733398, "epoch": 0.785, "grad_norm": 78.4128189086914, "learning_rate": 2.332158581071712e-06, "loss": 1.3849, "mean_token_accuracy": 0.7053063154220581, "num_input_tokens_seen": 8525197, "num_tokens": 8525197.0, "step": 3925, "train_runtime": 6151.8237, "train_tokens_per_second": 1385.8 }, { "entropy": 1.4004515647888183, "epoch": 0.786, "grad_norm": 45.03752136230469, "learning_rate": 2.311409099860288e-06, "loss": 1.1357, "mean_token_accuracy": 0.7386579155921936, "num_input_tokens_seen": 8538668, "num_tokens": 8538668.0, "step": 3930, "train_runtime": 6159.5448, "train_tokens_per_second": 1386.25 }, { "entropy": 1.7005187034606934, "epoch": 0.787, "grad_norm": 70.38102722167969, "learning_rate": 2.2907402683292268e-06, "loss": 1.308, "mean_token_accuracy": 0.7250552892684936, "num_input_tokens_seen": 8547144, "num_tokens": 8547144.0, "step": 3935, "train_runtime": 6166.923, "train_tokens_per_second": 1385.966 }, { "entropy": 1.5525089263916017, "epoch": 0.788, "grad_norm": 25.583961486816406, "learning_rate": 2.270152303284795e-06, "loss": 1.1114, "mean_token_accuracy": 0.7659584999084472, "num_input_tokens_seen": 8556734, "num_tokens": 8556734.0, "step": 3940, "train_runtime": 6174.7229, "train_tokens_per_second": 1385.768 }, { "entropy": 1.6095909595489502, "epoch": 0.789, "grad_norm": 91.29248809814453, "learning_rate": 2.249645420684998e-06, "loss": 1.4568, "mean_token_accuracy": 0.6840232491493226, "num_input_tokens_seen": 8568045, "num_tokens": 8568045.0, "step": 3945, "train_runtime": 6182.6149, "train_tokens_per_second": 1385.829 }, { "entropy": 2.106953763961792, "epoch": 0.79, "grad_norm": 49.168731689453125, "learning_rate": 2.2292198356373362e-06, "loss": 1.1941, "mean_token_accuracy": 0.7210730195045472, "num_input_tokens_seen": 8581884, "num_tokens": 8581884.0, "step": 3950, "train_runtime": 6190.721, "train_tokens_per_second": 1386.25 }, { "entropy": 2.8291799783706666, "epoch": 0.791, "grad_norm": 66.48655700683594, "learning_rate": 2.2088757623965263e-06, "loss": 1.6041, "mean_token_accuracy": 0.6770190596580505, "num_input_tokens_seen": 8595240, "num_tokens": 8595240.0, "step": 3955, "train_runtime": 6198.5382, "train_tokens_per_second": 1386.656 }, { "entropy": 1.948809313774109, "epoch": 0.792, "grad_norm": 46.97433090209961, "learning_rate": 2.188613414362273e-06, "loss": 1.055, "mean_token_accuracy": 0.7594554662704468, "num_input_tokens_seen": 8606718, "num_tokens": 8606718.0, "step": 3960, "train_runtime": 6206.5371, "train_tokens_per_second": 1386.718 }, { "entropy": 1.671703028678894, "epoch": 0.793, "grad_norm": 62.104644775390625, "learning_rate": 2.1684330040770183e-06, "loss": 1.0574, "mean_token_accuracy": 0.7520993232727051, "num_input_tokens_seen": 8616648, "num_tokens": 8616648.0, "step": 3965, "train_runtime": 6214.2109, "train_tokens_per_second": 1386.604 }, { "entropy": 1.4121116638183593, "epoch": 0.794, "grad_norm": 40.99410629272461, "learning_rate": 2.148334743223719e-06, "loss": 1.2198, "mean_token_accuracy": 0.7071260929107666, "num_input_tokens_seen": 8630776, "num_tokens": 8630776.0, "step": 3970, "train_runtime": 6222.3426, "train_tokens_per_second": 1387.062 }, { "entropy": 2.0288737058639525, "epoch": 0.795, "grad_norm": 75.88356018066406, "learning_rate": 2.128318842623618e-06, "loss": 1.4625, "mean_token_accuracy": 0.689672839641571, "num_input_tokens_seen": 8643074, "num_tokens": 8643074.0, "step": 3975, "train_runtime": 6230.4229, "train_tokens_per_second": 1387.237 }, { "entropy": 1.7867184281349182, "epoch": 0.796, "grad_norm": 21.77215576171875, "learning_rate": 2.108385512234041e-06, "loss": 1.175, "mean_token_accuracy": 0.7321109414100647, "num_input_tokens_seen": 8656904, "num_tokens": 8656904.0, "step": 3980, "train_runtime": 6238.484, "train_tokens_per_second": 1387.661 }, { "entropy": 1.585437035560608, "epoch": 0.797, "grad_norm": 76.3211441040039, "learning_rate": 2.088534961146197e-06, "loss": 1.0084, "mean_token_accuracy": 0.7689394474029541, "num_input_tokens_seen": 8670955, "num_tokens": 8670955.0, "step": 3985, "train_runtime": 6246.2844, "train_tokens_per_second": 1388.178 }, { "entropy": 1.6252539157867432, "epoch": 0.798, "grad_norm": 39.40887451171875, "learning_rate": 2.068767397582967e-06, "loss": 0.8978, "mean_token_accuracy": 0.7874024868011474, "num_input_tokens_seen": 8679998, "num_tokens": 8679998.0, "step": 3990, "train_runtime": 6253.7242, "train_tokens_per_second": 1387.973 }, { "entropy": 2.0088531732559205, "epoch": 0.799, "grad_norm": 95.7879867553711, "learning_rate": 2.0490830288967443e-06, "loss": 1.2492, "mean_token_accuracy": 0.7146164178848267, "num_input_tokens_seen": 8693591, "num_tokens": 8693591.0, "step": 3995, "train_runtime": 6261.8771, "train_tokens_per_second": 1388.336 }, { "entropy": 1.7150477647781373, "epoch": 0.8, "grad_norm": 41.801395416259766, "learning_rate": 2.029482061567237e-06, "loss": 1.1309, "mean_token_accuracy": 0.7288961052894593, "num_input_tokens_seen": 8705026, "num_tokens": 8705026.0, "step": 4000, "train_runtime": 6269.822, "train_tokens_per_second": 1388.401 }, { "entropy": 1.1376944780349731, "epoch": 0.801, "grad_norm": 55.260807037353516, "learning_rate": 2.0099647011993217e-06, "loss": 1.1216, "mean_token_accuracy": 0.737943971157074, "num_input_tokens_seen": 8717460, "num_tokens": 8717460.0, "step": 4005, "train_runtime": 6277.957, "train_tokens_per_second": 1388.582 }, { "entropy": 1.3161041617393494, "epoch": 0.802, "grad_norm": 38.12187957763672, "learning_rate": 1.990531152520869e-06, "loss": 0.9293, "mean_token_accuracy": 0.7724202394485473, "num_input_tokens_seen": 8728604, "num_tokens": 8728604.0, "step": 4010, "train_runtime": 6285.838, "train_tokens_per_second": 1388.614 }, { "entropy": 1.446150779724121, "epoch": 0.803, "grad_norm": 35.263641357421875, "learning_rate": 1.971181619380611e-06, "loss": 1.0642, "mean_token_accuracy": 0.7536746025085449, "num_input_tokens_seen": 8739238, "num_tokens": 8739238.0, "step": 4015, "train_runtime": 6293.4914, "train_tokens_per_second": 1388.615 }, { "entropy": 1.1960746049880981, "epoch": 0.804, "grad_norm": 26.941038131713867, "learning_rate": 1.9519163047459978e-06, "loss": 0.8346, "mean_token_accuracy": 0.8039800524711609, "num_input_tokens_seen": 8750686, "num_tokens": 8750686.0, "step": 4020, "train_runtime": 6301.2095, "train_tokens_per_second": 1388.731 }, { "entropy": 2.00214524269104, "epoch": 0.805, "grad_norm": 53.24460220336914, "learning_rate": 1.9327354107010566e-06, "loss": 1.2216, "mean_token_accuracy": 0.7318102240562439, "num_input_tokens_seen": 8762262, "num_tokens": 8762262.0, "step": 4025, "train_runtime": 6309.2135, "train_tokens_per_second": 1388.804 }, { "entropy": 1.8178802251815795, "epoch": 0.806, "grad_norm": 33.80361557006836, "learning_rate": 1.9136391384442964e-06, "loss": 1.2618, "mean_token_accuracy": 0.7049457669258118, "num_input_tokens_seen": 8772626, "num_tokens": 8772626.0, "step": 4030, "train_runtime": 6317.2064, "train_tokens_per_second": 1388.688 }, { "entropy": 1.316303515434265, "epoch": 0.807, "grad_norm": 52.86846923828125, "learning_rate": 1.894627688286571e-06, "loss": 1.1504, "mean_token_accuracy": 0.7189980983734131, "num_input_tokens_seen": 8782467, "num_tokens": 8782467.0, "step": 4035, "train_runtime": 6325.1399, "train_tokens_per_second": 1388.502 }, { "entropy": 1.7600452780723572, "epoch": 0.808, "grad_norm": 22.21462059020996, "learning_rate": 1.875701259649002e-06, "loss": 1.0368, "mean_token_accuracy": 0.7424626827239991, "num_input_tokens_seen": 8793852, "num_tokens": 8793852.0, "step": 4040, "train_runtime": 6332.8509, "train_tokens_per_second": 1388.609 }, { "entropy": 2.090176057815552, "epoch": 0.809, "grad_norm": 52.672855377197266, "learning_rate": 1.8568600510608659e-06, "loss": 1.1831, "mean_token_accuracy": 0.7131210446357727, "num_input_tokens_seen": 8804833, "num_tokens": 8804833.0, "step": 4045, "train_runtime": 6340.7224, "train_tokens_per_second": 1388.617 }, { "entropy": 2.9579575061798096, "epoch": 0.81, "grad_norm": 84.00788879394531, "learning_rate": 1.838104260157525e-06, "loss": 1.3071, "mean_token_accuracy": 0.7054033279418945, "num_input_tokens_seen": 8817782, "num_tokens": 8817782.0, "step": 4050, "train_runtime": 6348.5974, "train_tokens_per_second": 1388.934 }, { "entropy": 1.8831562042236327, "epoch": 0.811, "grad_norm": 48.69538116455078, "learning_rate": 1.8194340836783565e-06, "loss": 1.209, "mean_token_accuracy": 0.7215249896049499, "num_input_tokens_seen": 8829116, "num_tokens": 8829116.0, "step": 4055, "train_runtime": 6356.604, "train_tokens_per_second": 1388.967 }, { "entropy": 2.4178826332092287, "epoch": 0.812, "grad_norm": 59.92999267578125, "learning_rate": 1.80084971746467e-06, "loss": 1.1622, "mean_token_accuracy": 0.7277819275856018, "num_input_tokens_seen": 8842240, "num_tokens": 8842240.0, "step": 4060, "train_runtime": 6364.7181, "train_tokens_per_second": 1389.259 }, { "entropy": 1.616713571548462, "epoch": 0.813, "grad_norm": 45.00993347167969, "learning_rate": 1.7823513564576788e-06, "loss": 1.1406, "mean_token_accuracy": 0.7165958523750305, "num_input_tokens_seen": 8851859, "num_tokens": 8851859.0, "step": 4065, "train_runtime": 6372.7885, "train_tokens_per_second": 1389.009 }, { "entropy": 1.8950795650482177, "epoch": 0.814, "grad_norm": 93.71290588378906, "learning_rate": 1.7639391946964312e-06, "loss": 1.0873, "mean_token_accuracy": 0.7411530613899231, "num_input_tokens_seen": 8863578, "num_tokens": 8863578.0, "step": 4070, "train_runtime": 6380.6218, "train_tokens_per_second": 1389.14 }, { "entropy": 1.374240756034851, "epoch": 0.815, "grad_norm": 48.50847625732422, "learning_rate": 1.7456134253157976e-06, "loss": 1.226, "mean_token_accuracy": 0.7121050119400024, "num_input_tokens_seen": 8877646, "num_tokens": 8877646.0, "step": 4075, "train_runtime": 6388.6852, "train_tokens_per_second": 1389.589 }, { "entropy": 1.7577392578125, "epoch": 0.816, "grad_norm": 24.03495979309082, "learning_rate": 1.7273742405444217e-06, "loss": 1.2496, "mean_token_accuracy": 0.7106709361076355, "num_input_tokens_seen": 8889216, "num_tokens": 8889216.0, "step": 4080, "train_runtime": 6396.3843, "train_tokens_per_second": 1389.725 }, { "entropy": 2.3441586971282957, "epoch": 0.817, "grad_norm": 53.1981201171875, "learning_rate": 1.709221831702723e-06, "loss": 1.292, "mean_token_accuracy": 0.6984099745750427, "num_input_tokens_seen": 8902173, "num_tokens": 8902173.0, "step": 4085, "train_runtime": 6404.3884, "train_tokens_per_second": 1390.011 }, { "entropy": 1.7050036787986755, "epoch": 0.818, "grad_norm": 63.9437141418457, "learning_rate": 1.691156389200883e-06, "loss": 1.3228, "mean_token_accuracy": 0.6945767045021057, "num_input_tokens_seen": 8915864, "num_tokens": 8915864.0, "step": 4090, "train_runtime": 6412.6129, "train_tokens_per_second": 1390.364 }, { "entropy": 1.9178019523620606, "epoch": 0.819, "grad_norm": 85.22266387939453, "learning_rate": 1.6731781025368422e-06, "loss": 1.472, "mean_token_accuracy": 0.6666085362434387, "num_input_tokens_seen": 8926159, "num_tokens": 8926159.0, "step": 4095, "train_runtime": 6420.5982, "train_tokens_per_second": 1390.238 }, { "entropy": 2.0152694940567017, "epoch": 0.82, "grad_norm": 38.64442443847656, "learning_rate": 1.6552871602943233e-06, "loss": 1.3457, "mean_token_accuracy": 0.6998928785324097, "num_input_tokens_seen": 8938194, "num_tokens": 8938194.0, "step": 4100, "train_runtime": 6428.2899, "train_tokens_per_second": 1390.447 }, { "entropy": 1.79164776802063, "epoch": 0.821, "grad_norm": 48.320980072021484, "learning_rate": 1.6374837501408403e-06, "loss": 1.0276, "mean_token_accuracy": 0.7520916342735291, "num_input_tokens_seen": 8948924, "num_tokens": 8948924.0, "step": 4105, "train_runtime": 6436.149, "train_tokens_per_second": 1390.416 }, { "entropy": 2.141014003753662, "epoch": 0.822, "grad_norm": 28.1344051361084, "learning_rate": 1.6197680588257435e-06, "loss": 1.075, "mean_token_accuracy": 0.7412958145141602, "num_input_tokens_seen": 8960282, "num_tokens": 8960282.0, "step": 4110, "train_runtime": 6443.8353, "train_tokens_per_second": 1390.52 }, { "entropy": 1.5904229402542114, "epoch": 0.823, "grad_norm": 36.33155059814453, "learning_rate": 1.602140272178253e-06, "loss": 0.862, "mean_token_accuracy": 0.776524031162262, "num_input_tokens_seen": 8970263, "num_tokens": 8970263.0, "step": 4115, "train_runtime": 6451.7245, "train_tokens_per_second": 1390.367 }, { "entropy": 1.2879398107528686, "epoch": 0.824, "grad_norm": 29.97951316833496, "learning_rate": 1.5846005751055116e-06, "loss": 1.1686, "mean_token_accuracy": 0.7246122121810913, "num_input_tokens_seen": 8983202, "num_tokens": 8983202.0, "step": 4120, "train_runtime": 6459.7323, "train_tokens_per_second": 1390.646 }, { "entropy": 2.312497091293335, "epoch": 0.825, "grad_norm": 29.855554580688477, "learning_rate": 1.5671491515906355e-06, "loss": 1.2315, "mean_token_accuracy": 0.7053094387054444, "num_input_tokens_seen": 8997235, "num_tokens": 8997235.0, "step": 4125, "train_runtime": 6467.6165, "train_tokens_per_second": 1391.121 }, { "entropy": 2.197419595718384, "epoch": 0.826, "grad_norm": 33.8692626953125, "learning_rate": 1.5497861846908024e-06, "loss": 1.1629, "mean_token_accuracy": 0.7233014822006225, "num_input_tokens_seen": 9011322, "num_tokens": 9011322.0, "step": 4130, "train_runtime": 6475.7541, "train_tokens_per_second": 1391.548 }, { "entropy": 1.5996154308319093, "epoch": 0.827, "grad_norm": 35.22285079956055, "learning_rate": 1.5325118565353237e-06, "loss": 1.1753, "mean_token_accuracy": 0.7149853944778443, "num_input_tokens_seen": 9022594, "num_tokens": 9022594.0, "step": 4135, "train_runtime": 6483.4853, "train_tokens_per_second": 1391.627 }, { "entropy": 2.0306775093078615, "epoch": 0.828, "grad_norm": 30.22802734375, "learning_rate": 1.51532634832372e-06, "loss": 1.2734, "mean_token_accuracy": 0.705897080898285, "num_input_tokens_seen": 9032384, "num_tokens": 9032384.0, "step": 4140, "train_runtime": 6491.3099, "train_tokens_per_second": 1391.458 }, { "entropy": 1.652352213859558, "epoch": 0.829, "grad_norm": 27.395790100097656, "learning_rate": 1.498229840323847e-06, "loss": 1.2286, "mean_token_accuracy": 0.715501868724823, "num_input_tokens_seen": 9045707, "num_tokens": 9045707.0, "step": 4145, "train_runtime": 6499.4163, "train_tokens_per_second": 1391.772 }, { "entropy": 2.2633752107620237, "epoch": 0.83, "grad_norm": 36.269554138183594, "learning_rate": 1.4812225118699775e-06, "loss": 1.4668, "mean_token_accuracy": 0.6703356981277466, "num_input_tokens_seen": 9059660, "num_tokens": 9059660.0, "step": 4150, "train_runtime": 6507.4324, "train_tokens_per_second": 1392.202 }, { "entropy": 1.715526819229126, "epoch": 0.831, "grad_norm": 39.1240348815918, "learning_rate": 1.464304541360946e-06, "loss": 1.2053, "mean_token_accuracy": 0.7233669281005859, "num_input_tokens_seen": 9068751, "num_tokens": 9068751.0, "step": 4155, "train_runtime": 6515.3264, "train_tokens_per_second": 1391.91 }, { "entropy": 1.7912874221801758, "epoch": 0.832, "grad_norm": 23.351165771484375, "learning_rate": 1.4474761062582509e-06, "loss": 1.2469, "mean_token_accuracy": 0.7053301334381104, "num_input_tokens_seen": 9080544, "num_tokens": 9080544.0, "step": 4160, "train_runtime": 6523.0012, "train_tokens_per_second": 1392.081 }, { "entropy": 2.0490939140319826, "epoch": 0.833, "grad_norm": 38.66179656982422, "learning_rate": 1.4307373830842174e-06, "loss": 1.2817, "mean_token_accuracy": 0.7002087354660034, "num_input_tokens_seen": 9092007, "num_tokens": 9092007.0, "step": 4165, "train_runtime": 6530.7747, "train_tokens_per_second": 1392.179 }, { "entropy": 1.7948603749275207, "epoch": 0.834, "grad_norm": 20.985761642456055, "learning_rate": 1.4140885474201315e-06, "loss": 1.0955, "mean_token_accuracy": 0.727351987361908, "num_input_tokens_seen": 9106060, "num_tokens": 9106060.0, "step": 4170, "train_runtime": 6538.8736, "train_tokens_per_second": 1392.604 }, { "entropy": 2.0243131637573244, "epoch": 0.835, "grad_norm": 35.531856536865234, "learning_rate": 1.3975297739043992e-06, "loss": 1.3455, "mean_token_accuracy": 0.6733543276786804, "num_input_tokens_seen": 9118178, "num_tokens": 9118178.0, "step": 4175, "train_runtime": 6546.995, "train_tokens_per_second": 1392.727 }, { "entropy": 1.9413438796997071, "epoch": 0.836, "grad_norm": 13.00714111328125, "learning_rate": 1.3810612362307208e-06, "loss": 1.2735, "mean_token_accuracy": 0.6986963748931885, "num_input_tokens_seen": 9131858, "num_tokens": 9131858.0, "step": 4180, "train_runtime": 6555.25, "train_tokens_per_second": 1393.06 }, { "entropy": 2.2657758235931396, "epoch": 0.837, "grad_norm": 39.98580551147461, "learning_rate": 1.3646831071462606e-06, "loss": 1.253, "mean_token_accuracy": 0.7096449971199036, "num_input_tokens_seen": 9144953, "num_tokens": 9144953.0, "step": 4185, "train_runtime": 6562.9845, "train_tokens_per_second": 1393.414 }, { "entropy": 2.56455454826355, "epoch": 0.838, "grad_norm": 142.16355895996094, "learning_rate": 1.3483955584498476e-06, "loss": 1.3836, "mean_token_accuracy": 0.6883266925811767, "num_input_tokens_seen": 9156846, "num_tokens": 9156846.0, "step": 4190, "train_runtime": 6571.0968, "train_tokens_per_second": 1393.503 }, { "entropy": 2.4812206745147707, "epoch": 0.839, "grad_norm": 25.875394821166992, "learning_rate": 1.3321987609901553e-06, "loss": 1.1499, "mean_token_accuracy": 0.7294828295707703, "num_input_tokens_seen": 9169675, "num_tokens": 9169675.0, "step": 4195, "train_runtime": 6578.9139, "train_tokens_per_second": 1393.798 }, { "entropy": 2.0839139461517333, "epoch": 0.84, "grad_norm": 26.65656089782715, "learning_rate": 1.3160928846639275e-06, "loss": 1.3017, "mean_token_accuracy": 0.690561842918396, "num_input_tokens_seen": 9183872, "num_tokens": 9183872.0, "step": 4200, "train_runtime": 6587.2027, "train_tokens_per_second": 1394.199 }, { "entropy": 1.8809577703475953, "epoch": 0.841, "grad_norm": 30.178382873535156, "learning_rate": 1.3000780984141881e-06, "loss": 1.1149, "mean_token_accuracy": 0.7315121412277221, "num_input_tokens_seen": 9196267, "num_tokens": 9196267.0, "step": 4205, "train_runtime": 6595.3111, "train_tokens_per_second": 1394.364 }, { "entropy": 1.3200510501861573, "epoch": 0.842, "grad_norm": 11.290911674499512, "learning_rate": 1.2841545702284618e-06, "loss": 0.8431, "mean_token_accuracy": 0.7939366579055787, "num_input_tokens_seen": 9207524, "num_tokens": 9207524.0, "step": 4210, "train_runtime": 6603.2155, "train_tokens_per_second": 1394.4 }, { "entropy": 2.232276678085327, "epoch": 0.843, "grad_norm": 36.492942810058594, "learning_rate": 1.2683224671370286e-06, "loss": 1.2306, "mean_token_accuracy": 0.6959267854690552, "num_input_tokens_seen": 9220828, "num_tokens": 9220828.0, "step": 4215, "train_runtime": 6611.2182, "train_tokens_per_second": 1394.724 }, { "entropy": 1.8813665866851808, "epoch": 0.844, "grad_norm": 15.735624313354492, "learning_rate": 1.252581955211155e-06, "loss": 1.1339, "mean_token_accuracy": 0.7186631441116333, "num_input_tokens_seen": 9234102, "num_tokens": 9234102.0, "step": 4220, "train_runtime": 6619.2044, "train_tokens_per_second": 1395.047 }, { "entropy": 2.1061328172683718, "epoch": 0.845, "grad_norm": 177.8684844970703, "learning_rate": 1.2369331995613664e-06, "loss": 1.1255, "mean_token_accuracy": 0.73298020362854, "num_input_tokens_seen": 9246292, "num_tokens": 9246292.0, "step": 4225, "train_runtime": 6626.9628, "train_tokens_per_second": 1395.253 }, { "entropy": 2.0237887382507322, "epoch": 0.846, "grad_norm": 25.120643615722656, "learning_rate": 1.2213763643357002e-06, "loss": 0.9697, "mean_token_accuracy": 0.7623323559761047, "num_input_tokens_seen": 9257464, "num_tokens": 9257464.0, "step": 4230, "train_runtime": 6634.9942, "train_tokens_per_second": 1395.248 }, { "entropy": 2.4560704708099363, "epoch": 0.847, "grad_norm": 33.17424392700195, "learning_rate": 1.2059116127179993e-06, "loss": 1.1804, "mean_token_accuracy": 0.7174956440925598, "num_input_tokens_seen": 9270239, "num_tokens": 9270239.0, "step": 4235, "train_runtime": 6643.2121, "train_tokens_per_second": 1395.445 }, { "entropy": 1.4510441780090333, "epoch": 0.848, "grad_norm": 15.807106018066406, "learning_rate": 1.1905391069261918e-06, "loss": 1.1528, "mean_token_accuracy": 0.7280502080917358, "num_input_tokens_seen": 9283884, "num_tokens": 9283884.0, "step": 4240, "train_runtime": 6651.2598, "train_tokens_per_second": 1395.808 }, { "entropy": 1.623996877670288, "epoch": 0.849, "grad_norm": 27.195398330688477, "learning_rate": 1.1752590082105863e-06, "loss": 0.9616, "mean_token_accuracy": 0.7494900941848754, "num_input_tokens_seen": 9296867, "num_tokens": 9296867.0, "step": 4245, "train_runtime": 6659.2859, "train_tokens_per_second": 1396.076 }, { "entropy": 1.9291970491409303, "epoch": 0.85, "grad_norm": 19.73321533203125, "learning_rate": 1.1600714768521903e-06, "loss": 1.0578, "mean_token_accuracy": 0.7479368805885315, "num_input_tokens_seen": 9307880, "num_tokens": 9307880.0, "step": 4250, "train_runtime": 6666.9149, "train_tokens_per_second": 1396.13 }, { "entropy": 1.8610557675361634, "epoch": 0.851, "grad_norm": 31.37166404724121, "learning_rate": 1.144976672161019e-06, "loss": 1.0554, "mean_token_accuracy": 0.741044282913208, "num_input_tokens_seen": 9319717, "num_tokens": 9319717.0, "step": 4255, "train_runtime": 6675.0653, "train_tokens_per_second": 1396.199 }, { "entropy": 2.0262247920036316, "epoch": 0.852, "grad_norm": 23.545368194580078, "learning_rate": 1.1299747524744309e-06, "loss": 1.083, "mean_token_accuracy": 0.7292125940322876, "num_input_tokens_seen": 9332524, "num_tokens": 9332524.0, "step": 4260, "train_runtime": 6683.178, "train_tokens_per_second": 1396.42 }, { "entropy": 1.8631191372871398, "epoch": 0.853, "grad_norm": 27.078514099121094, "learning_rate": 1.1150658751554667e-06, "loss": 1.0551, "mean_token_accuracy": 0.7525937557220459, "num_input_tokens_seen": 9344327, "num_tokens": 9344327.0, "step": 4265, "train_runtime": 6691.2042, "train_tokens_per_second": 1396.509 }, { "entropy": 1.7008347988128663, "epoch": 0.854, "grad_norm": 25.129323959350586, "learning_rate": 1.100250196591195e-06, "loss": 1.1582, "mean_token_accuracy": 0.7241525173187255, "num_input_tokens_seen": 9355038, "num_tokens": 9355038.0, "step": 4270, "train_runtime": 6699.1353, "train_tokens_per_second": 1396.455 }, { "entropy": 1.270380187034607, "epoch": 0.855, "grad_norm": 38.039432525634766, "learning_rate": 1.08552787219107e-06, "loss": 1.1899, "mean_token_accuracy": 0.7014187455177308, "num_input_tokens_seen": 9366294, "num_tokens": 9366294.0, "step": 4275, "train_runtime": 6706.8565, "train_tokens_per_second": 1396.525 }, { "entropy": 1.8343629837036133, "epoch": 0.856, "grad_norm": 16.263355255126953, "learning_rate": 1.0708990563853127e-06, "loss": 1.011, "mean_token_accuracy": 0.7384909272193909, "num_input_tokens_seen": 9377790, "num_tokens": 9377790.0, "step": 4280, "train_runtime": 6714.679, "train_tokens_per_second": 1396.61 }, { "entropy": 2.5283599376678465, "epoch": 0.857, "grad_norm": 32.25537872314453, "learning_rate": 1.0563639026232742e-06, "loss": 1.2201, "mean_token_accuracy": 0.7126690745353699, "num_input_tokens_seen": 9389511, "num_tokens": 9389511.0, "step": 4285, "train_runtime": 6722.698, "train_tokens_per_second": 1396.688 }, { "entropy": 1.9582865238189697, "epoch": 0.858, "grad_norm": 28.47056770324707, "learning_rate": 1.041922563371842e-06, "loss": 1.2164, "mean_token_accuracy": 0.6976001858711243, "num_input_tokens_seen": 9403660, "num_tokens": 9403660.0, "step": 4290, "train_runtime": 6731.0604, "train_tokens_per_second": 1397.055 }, { "entropy": 2.0376640796661376, "epoch": 0.859, "grad_norm": 42.3572998046875, "learning_rate": 1.027575190113832e-06, "loss": 1.1808, "mean_token_accuracy": 0.7124897360801696, "num_input_tokens_seen": 9414349, "num_tokens": 9414349.0, "step": 4295, "train_runtime": 6739.105, "train_tokens_per_second": 1396.973 }, { "entropy": 1.3070422530174255, "epoch": 0.86, "grad_norm": 38.204036712646484, "learning_rate": 1.0133219333463983e-06, "loss": 1.0504, "mean_token_accuracy": 0.7352698922157288, "num_input_tokens_seen": 9425962, "num_tokens": 9425962.0, "step": 4300, "train_runtime": 6747.1295, "train_tokens_per_second": 1397.033 }, { "entropy": 2.1670926094055174, "epoch": 0.861, "grad_norm": 37.05472183227539, "learning_rate": 9.991629425794624e-07, "loss": 1.0634, "mean_token_accuracy": 0.7280888080596923, "num_input_tokens_seen": 9436840, "num_tokens": 9436840.0, "step": 4305, "train_runtime": 6754.7729, "train_tokens_per_second": 1397.063 }, { "entropy": 1.524068546295166, "epoch": 0.862, "grad_norm": 14.920330047607422, "learning_rate": 9.85098366334134e-07, "loss": 1.181, "mean_token_accuracy": 0.7058197498321533, "num_input_tokens_seen": 9447586, "num_tokens": 9447586.0, "step": 4310, "train_runtime": 6762.3861, "train_tokens_per_second": 1397.079 }, { "entropy": 1.3642647981643676, "epoch": 0.863, "grad_norm": 28.05571937561035, "learning_rate": 9.711283521411674e-07, "loss": 0.9585, "mean_token_accuracy": 0.7696447610855103, "num_input_tokens_seen": 9457299, "num_tokens": 9457299.0, "step": 4315, "train_runtime": 6770.0857, "train_tokens_per_second": 1396.925 }, { "entropy": 1.4710171699523926, "epoch": 0.864, "grad_norm": 22.026742935180664, "learning_rate": 9.57253046539396e-07, "loss": 1.2371, "mean_token_accuracy": 0.7175422072410583, "num_input_tokens_seen": 9470648, "num_tokens": 9470648.0, "step": 4320, "train_runtime": 6778.2029, "train_tokens_per_second": 1397.221 }, { "entropy": 1.8204709529876708, "epoch": 0.865, "grad_norm": 38.03694152832031, "learning_rate": 9.434725950742119e-07, "loss": 0.9822, "mean_token_accuracy": 0.7467481255531311, "num_input_tokens_seen": 9481254, "num_tokens": 9481254.0, "step": 4325, "train_runtime": 6786.196, "train_tokens_per_second": 1397.138 }, { "entropy": 2.1349268436431883, "epoch": 0.866, "grad_norm": 15.21910285949707, "learning_rate": 9.297871422960336e-07, "loss": 1.0478, "mean_token_accuracy": 0.7343067288398742, "num_input_tokens_seen": 9492448, "num_tokens": 9492448.0, "step": 4330, "train_runtime": 6793.84, "train_tokens_per_second": 1397.214 }, { "entropy": 2.037457299232483, "epoch": 0.867, "grad_norm": 27.028491973876953, "learning_rate": 9.161968317587788e-07, "loss": 1.0423, "mean_token_accuracy": 0.737897264957428, "num_input_tokens_seen": 9505387, "num_tokens": 9505387.0, "step": 4335, "train_runtime": 6801.904, "train_tokens_per_second": 1397.46 }, { "entropy": 2.211966037750244, "epoch": 0.868, "grad_norm": 19.146862030029297, "learning_rate": 9.027018060183801e-07, "loss": 1.247, "mean_token_accuracy": 0.7048168540000915, "num_input_tokens_seen": 9517058, "num_tokens": 9517058.0, "step": 4340, "train_runtime": 6809.4983, "train_tokens_per_second": 1397.615 }, { "entropy": 2.1408352613449098, "epoch": 0.869, "grad_norm": 16.568950653076172, "learning_rate": 8.893022066312674e-07, "loss": 1.0959, "mean_token_accuracy": 0.7281917095184326, "num_input_tokens_seen": 9530839, "num_tokens": 9530839.0, "step": 4345, "train_runtime": 6817.5823, "train_tokens_per_second": 1397.979 }, { "entropy": 1.8684603929519654, "epoch": 0.87, "grad_norm": 11.357898712158203, "learning_rate": 8.759981741529e-07, "loss": 1.1259, "mean_token_accuracy": 0.73239164352417, "num_input_tokens_seen": 9544028, "num_tokens": 9544028.0, "step": 4350, "train_runtime": 6825.5611, "train_tokens_per_second": 1398.277 }, { "entropy": 1.4783331871032714, "epoch": 0.871, "grad_norm": 19.22555160522461, "learning_rate": 8.627898481362817e-07, "loss": 1.0363, "mean_token_accuracy": 0.7363511085510254, "num_input_tokens_seen": 9554864, "num_tokens": 9554864.0, "step": 4355, "train_runtime": 6833.5225, "train_tokens_per_second": 1398.234 }, { "entropy": 1.8633141040802002, "epoch": 0.872, "grad_norm": 167.1220703125, "learning_rate": 8.496773671305025e-07, "loss": 1.014, "mean_token_accuracy": 0.7410434365272522, "num_input_tokens_seen": 9565696, "num_tokens": 9565696.0, "step": 4360, "train_runtime": 6841.2007, "train_tokens_per_second": 1398.248 }, { "entropy": 2.357930135726929, "epoch": 0.873, "grad_norm": 23.767343521118164, "learning_rate": 8.366608686792854e-07, "loss": 1.0848, "mean_token_accuracy": 0.7313786149024963, "num_input_tokens_seen": 9579362, "num_tokens": 9579362.0, "step": 4365, "train_runtime": 6849.3421, "train_tokens_per_second": 1398.581 }, { "entropy": 1.8828378438949585, "epoch": 0.874, "grad_norm": 23.555400848388672, "learning_rate": 8.237404893195377e-07, "loss": 1.0981, "mean_token_accuracy": 0.7282456755638123, "num_input_tokens_seen": 9592024, "num_tokens": 9592024.0, "step": 4370, "train_runtime": 6857.1197, "train_tokens_per_second": 1398.842 }, { "entropy": 0.9149072289466857, "epoch": 0.875, "grad_norm": 17.85443115234375, "learning_rate": 8.109163645799267e-07, "loss": 0.8963, "mean_token_accuracy": 0.7655051708221435, "num_input_tokens_seen": 9602545, "num_tokens": 9602545.0, "step": 4375, "train_runtime": 6864.9923, "train_tokens_per_second": 1398.77 }, { "entropy": 1.569828248023987, "epoch": 0.876, "grad_norm": 17.38178062438965, "learning_rate": 7.981886289794516e-07, "loss": 0.8882, "mean_token_accuracy": 0.7659066200256348, "num_input_tokens_seen": 9615518, "num_tokens": 9615518.0, "step": 4380, "train_runtime": 6873.0118, "train_tokens_per_second": 1399.025 }, { "entropy": 1.6595673322677613, "epoch": 0.877, "grad_norm": 17.1677303314209, "learning_rate": 7.855574160260371e-07, "loss": 1.0996, "mean_token_accuracy": 0.7271485567092896, "num_input_tokens_seen": 9624298, "num_tokens": 9624298.0, "step": 4385, "train_runtime": 6880.7155, "train_tokens_per_second": 1398.735 }, { "entropy": 1.9928987979888917, "epoch": 0.878, "grad_norm": 27.979177474975586, "learning_rate": 7.730228582151278e-07, "loss": 1.154, "mean_token_accuracy": 0.7334787011146545, "num_input_tokens_seen": 9637102, "num_tokens": 9637102.0, "step": 4390, "train_runtime": 6888.4862, "train_tokens_per_second": 1399.016 }, { "entropy": 1.5558886051177978, "epoch": 0.879, "grad_norm": 17.255855560302734, "learning_rate": 7.60585087028305e-07, "loss": 1.0378, "mean_token_accuracy": 0.7313881993293763, "num_input_tokens_seen": 9647736, "num_tokens": 9647736.0, "step": 4395, "train_runtime": 6896.2258, "train_tokens_per_second": 1398.988 }, { "entropy": 1.888039493560791, "epoch": 0.88, "grad_norm": 26.43712043762207, "learning_rate": 7.482442329319062e-07, "loss": 1.0301, "mean_token_accuracy": 0.7435564875602723, "num_input_tokens_seen": 9661340, "num_tokens": 9661340.0, "step": 4400, "train_runtime": 6904.226, "train_tokens_per_second": 1399.337 }, { "entropy": 1.4927092790603638, "epoch": 0.881, "grad_norm": 19.110734939575195, "learning_rate": 7.360004253756459e-07, "loss": 1.0898, "mean_token_accuracy": 0.7219521880149842, "num_input_tokens_seen": 9671403, "num_tokens": 9671403.0, "step": 4405, "train_runtime": 6912.0623, "train_tokens_per_second": 1399.207 }, { "entropy": 2.1694430828094484, "epoch": 0.882, "grad_norm": 14.292014122009277, "learning_rate": 7.238537927912747e-07, "loss": 1.1322, "mean_token_accuracy": 0.7063448548316955, "num_input_tokens_seen": 9682662, "num_tokens": 9682662.0, "step": 4410, "train_runtime": 6920.0836, "train_tokens_per_second": 1399.212 }, { "entropy": 2.132378673553467, "epoch": 0.883, "grad_norm": 8.485075950622559, "learning_rate": 7.118044625912213e-07, "loss": 0.9792, "mean_token_accuracy": 0.7484685897827148, "num_input_tokens_seen": 9694055, "num_tokens": 9694055.0, "step": 4415, "train_runtime": 6927.9028, "train_tokens_per_second": 1399.277 }, { "entropy": 1.6945796489715577, "epoch": 0.884, "grad_norm": 11.843306541442871, "learning_rate": 6.99852561167258e-07, "loss": 0.9306, "mean_token_accuracy": 0.7579019904136658, "num_input_tokens_seen": 9707438, "num_tokens": 9707438.0, "step": 4420, "train_runtime": 6936.1853, "train_tokens_per_second": 1399.536 }, { "entropy": 1.8733111381530763, "epoch": 0.885, "grad_norm": 11.642597198486328, "learning_rate": 6.879982138891717e-07, "loss": 1.1303, "mean_token_accuracy": 0.7266334176063538, "num_input_tokens_seen": 9719025, "num_tokens": 9719025.0, "step": 4425, "train_runtime": 6944.0761, "train_tokens_per_second": 1399.614 }, { "entropy": 2.041522765159607, "epoch": 0.886, "grad_norm": 24.557912826538086, "learning_rate": 6.76241545103461e-07, "loss": 1.1254, "mean_token_accuracy": 0.7399904251098632, "num_input_tokens_seen": 9732636, "num_tokens": 9732636.0, "step": 4430, "train_runtime": 6952.1919, "train_tokens_per_second": 1399.938 }, { "entropy": 1.6106015682220458, "epoch": 0.887, "grad_norm": 16.95770263671875, "learning_rate": 6.645826781320141e-07, "loss": 0.9594, "mean_token_accuracy": 0.7614152789115906, "num_input_tokens_seen": 9742600, "num_tokens": 9742600.0, "step": 4435, "train_runtime": 6959.9659, "train_tokens_per_second": 1399.806 }, { "entropy": 1.921290969848633, "epoch": 0.888, "grad_norm": 9.643671989440918, "learning_rate": 6.530217352708301e-07, "loss": 0.9541, "mean_token_accuracy": 0.7498831629753113, "num_input_tokens_seen": 9754038, "num_tokens": 9754038.0, "step": 4440, "train_runtime": 6967.926, "train_tokens_per_second": 1399.848 }, { "entropy": 1.5248892784118653, "epoch": 0.889, "grad_norm": 26.351572036743164, "learning_rate": 6.415588377887305e-07, "loss": 1.1115, "mean_token_accuracy": 0.7154369354248047, "num_input_tokens_seen": 9767192, "num_tokens": 9767192.0, "step": 4445, "train_runtime": 6975.7466, "train_tokens_per_second": 1400.164 }, { "entropy": 2.283488082885742, "epoch": 0.89, "grad_norm": 12.980268478393555, "learning_rate": 6.30194105926083e-07, "loss": 1.2257, "mean_token_accuracy": 0.6955106735229493, "num_input_tokens_seen": 9780928, "num_tokens": 9780928.0, "step": 4450, "train_runtime": 6983.6626, "train_tokens_per_second": 1400.544 }, { "entropy": 2.071175956726074, "epoch": 0.891, "grad_norm": 48.86467361450195, "learning_rate": 6.1892765889355e-07, "loss": 1.1117, "mean_token_accuracy": 0.7277013182640075, "num_input_tokens_seen": 9790948, "num_tokens": 9790948.0, "step": 4455, "train_runtime": 6991.2539, "train_tokens_per_second": 1400.457 }, { "entropy": 1.7884487152099608, "epoch": 0.892, "grad_norm": 10.123736381530762, "learning_rate": 6.077596148708275e-07, "loss": 1.2003, "mean_token_accuracy": 0.704963767528534, "num_input_tokens_seen": 9800990, "num_tokens": 9800990.0, "step": 4460, "train_runtime": 6999.2121, "train_tokens_per_second": 1400.299 }, { "entropy": 1.3440423011779785, "epoch": 0.893, "grad_norm": 12.741273880004883, "learning_rate": 5.966900910054141e-07, "loss": 0.949, "mean_token_accuracy": 0.7485662341117859, "num_input_tokens_seen": 9813676, "num_tokens": 9813676.0, "step": 4465, "train_runtime": 7007.1092, "train_tokens_per_second": 1400.531 }, { "entropy": 1.4932385206222534, "epoch": 0.894, "grad_norm": 10.435506820678711, "learning_rate": 5.857192034113757e-07, "loss": 1.0837, "mean_token_accuracy": 0.7148552536964417, "num_input_tokens_seen": 9827986, "num_tokens": 9827986.0, "step": 4470, "train_runtime": 7015.4571, "train_tokens_per_second": 1400.905 }, { "entropy": 2.5356574058532715, "epoch": 0.895, "grad_norm": 66.74246215820312, "learning_rate": 5.748470671681328e-07, "loss": 1.0372, "mean_token_accuracy": 0.7371655106544495, "num_input_tokens_seen": 9842005, "num_tokens": 9842005.0, "step": 4475, "train_runtime": 7023.4435, "train_tokens_per_second": 1401.308 }, { "entropy": 2.1233946084976196, "epoch": 0.896, "grad_norm": 12.831204414367676, "learning_rate": 5.640737963192511e-07, "loss": 1.1414, "mean_token_accuracy": 0.7187443614006043, "num_input_tokens_seen": 9853910, "num_tokens": 9853910.0, "step": 4480, "train_runtime": 7031.1191, "train_tokens_per_second": 1401.471 }, { "entropy": 1.3726256608963012, "epoch": 0.897, "grad_norm": 12.873527526855469, "learning_rate": 5.533995038712403e-07, "loss": 0.9564, "mean_token_accuracy": 0.7501531481742859, "num_input_tokens_seen": 9865908, "num_tokens": 9865908.0, "step": 4485, "train_runtime": 7039.1647, "train_tokens_per_second": 1401.574 }, { "entropy": 1.4813295602798462, "epoch": 0.898, "grad_norm": 7.144620895385742, "learning_rate": 5.428243017923817e-07, "loss": 1.0278, "mean_token_accuracy": 0.734214437007904, "num_input_tokens_seen": 9878760, "num_tokens": 9878760.0, "step": 4490, "train_runtime": 7047.2798, "train_tokens_per_second": 1401.783 }, { "entropy": 1.5501669645309448, "epoch": 0.899, "grad_norm": 10.664590835571289, "learning_rate": 5.323483010115382e-07, "loss": 1.1356, "mean_token_accuracy": 0.7062461733818054, "num_input_tokens_seen": 9891729, "num_tokens": 9891729.0, "step": 4495, "train_runtime": 7055.3207, "train_tokens_per_second": 1402.024 }, { "entropy": 2.005420613288879, "epoch": 0.9, "grad_norm": 7.034104347229004, "learning_rate": 5.219716114170026e-07, "loss": 0.9761, "mean_token_accuracy": 0.7466647505760193, "num_input_tokens_seen": 9903066, "num_tokens": 9903066.0, "step": 4500, "train_runtime": 7063.3251, "train_tokens_per_second": 1402.04 }, { "entropy": 1.4854703903198243, "epoch": 0.901, "grad_norm": 7.215117454528809, "learning_rate": 5.116943418553355e-07, "loss": 1.0362, "mean_token_accuracy": 0.7365163087844848, "num_input_tokens_seen": 9912294, "num_tokens": 9912294.0, "step": 4505, "train_runtime": 7070.8391, "train_tokens_per_second": 1401.855 }, { "entropy": 1.7960006475448609, "epoch": 0.902, "grad_norm": 12.38187313079834, "learning_rate": 5.01516600130233e-07, "loss": 1.18, "mean_token_accuracy": 0.7058088183403015, "num_input_tokens_seen": 9924654, "num_tokens": 9924654.0, "step": 4510, "train_runtime": 7078.5385, "train_tokens_per_second": 1402.077 }, { "entropy": 2.235483264923096, "epoch": 0.903, "grad_norm": 11.460939407348633, "learning_rate": 4.914384930013927e-07, "loss": 1.0803, "mean_token_accuracy": 0.7402546763420105, "num_input_tokens_seen": 9936749, "num_tokens": 9936749.0, "step": 4515, "train_runtime": 7086.5062, "train_tokens_per_second": 1402.207 }, { "entropy": 1.8061770915985107, "epoch": 0.904, "grad_norm": 11.38514232635498, "learning_rate": 4.814601261833851e-07, "loss": 1.0616, "mean_token_accuracy": 0.7249554634094239, "num_input_tokens_seen": 9949520, "num_tokens": 9949520.0, "step": 4520, "train_runtime": 7094.4487, "train_tokens_per_second": 1402.437 }, { "entropy": 1.8826250791549684, "epoch": 0.905, "grad_norm": 15.045506477355957, "learning_rate": 4.715816043445609e-07, "loss": 0.9696, "mean_token_accuracy": 0.7387994289398193, "num_input_tokens_seen": 9959483, "num_tokens": 9959483.0, "step": 4525, "train_runtime": 7102.2528, "train_tokens_per_second": 1402.299 }, { "entropy": 1.707918119430542, "epoch": 0.906, "grad_norm": 8.700443267822266, "learning_rate": 4.618030311059352e-07, "loss": 1.0826, "mean_token_accuracy": 0.7196035623550415, "num_input_tokens_seen": 9968050, "num_tokens": 9968050.0, "step": 4530, "train_runtime": 7109.913, "train_tokens_per_second": 1401.993 }, { "entropy": 1.5039629220962525, "epoch": 0.907, "grad_norm": 18.27132225036621, "learning_rate": 4.521245090401172e-07, "loss": 0.8729, "mean_token_accuracy": 0.7737865328788758, "num_input_tokens_seen": 9981414, "num_tokens": 9981414.0, "step": 4535, "train_runtime": 7117.9583, "train_tokens_per_second": 1402.286 }, { "entropy": 2.1300718784332275, "epoch": 0.908, "grad_norm": 14.693971633911133, "learning_rate": 4.425461396702213e-07, "loss": 1.0823, "mean_token_accuracy": 0.7407090187072753, "num_input_tokens_seen": 9992836, "num_tokens": 9992836.0, "step": 4540, "train_runtime": 7125.5709, "train_tokens_per_second": 1402.391 }, { "entropy": 1.6231330394744874, "epoch": 0.909, "grad_norm": 7.898678302764893, "learning_rate": 4.3306802346881116e-07, "loss": 1.1031, "mean_token_accuracy": 0.7187168836593628, "num_input_tokens_seen": 10003721, "num_tokens": 10003721.0, "step": 4545, "train_runtime": 7133.426, "train_tokens_per_second": 1402.373 }, { "entropy": 1.357316267490387, "epoch": 0.91, "grad_norm": 16.154481887817383, "learning_rate": 4.2369025985684264e-07, "loss": 0.8898, "mean_token_accuracy": 0.7598424553871155, "num_input_tokens_seen": 10018480, "num_tokens": 10018480.0, "step": 4550, "train_runtime": 7141.5171, "train_tokens_per_second": 1402.85 }, { "entropy": 2.120417606830597, "epoch": 0.911, "grad_norm": 5.7568535804748535, "learning_rate": 4.1441294720261373e-07, "loss": 1.1725, "mean_token_accuracy": 0.702811884880066, "num_input_tokens_seen": 10031120, "num_tokens": 10031120.0, "step": 4555, "train_runtime": 7149.4858, "train_tokens_per_second": 1403.055 }, { "entropy": 1.3933011651039124, "epoch": 0.912, "grad_norm": 8.397788047790527, "learning_rate": 4.0523618282074964e-07, "loss": 1.1138, "mean_token_accuracy": 0.7203466296195984, "num_input_tokens_seen": 10044212, "num_tokens": 10044212.0, "step": 4560, "train_runtime": 7157.3569, "train_tokens_per_second": 1403.341 }, { "entropy": 1.7097836017608643, "epoch": 0.913, "grad_norm": 8.537699699401855, "learning_rate": 3.961600629711615e-07, "loss": 0.9851, "mean_token_accuracy": 0.7496803522109985, "num_input_tokens_seen": 10055508, "num_tokens": 10055508.0, "step": 4565, "train_runtime": 7165.4316, "train_tokens_per_second": 1403.336 }, { "entropy": 2.0122339487075807, "epoch": 0.914, "grad_norm": 6.044821739196777, "learning_rate": 3.87184682858055e-07, "loss": 0.9539, "mean_token_accuracy": 0.7540565609931946, "num_input_tokens_seen": 10068938, "num_tokens": 10068938.0, "step": 4570, "train_runtime": 7173.2003, "train_tokens_per_second": 1403.688 }, { "entropy": 1.195107090473175, "epoch": 0.915, "grad_norm": 9.444576263427734, "learning_rate": 3.783101366289199e-07, "loss": 1.1576, "mean_token_accuracy": 0.7122646450996399, "num_input_tokens_seen": 10080731, "num_tokens": 10080731.0, "step": 4575, "train_runtime": 7181.1627, "train_tokens_per_second": 1403.774 }, { "entropy": 1.8492287874221802, "epoch": 0.916, "grad_norm": 12.034231185913086, "learning_rate": 3.695365173735466e-07, "loss": 1.0141, "mean_token_accuracy": 0.7444185614585876, "num_input_tokens_seen": 10093752, "num_tokens": 10093752.0, "step": 4580, "train_runtime": 7189.3247, "train_tokens_per_second": 1403.992 }, { "entropy": 1.5432074546813965, "epoch": 0.917, "grad_norm": 5.998719215393066, "learning_rate": 3.608639171230488e-07, "loss": 0.8495, "mean_token_accuracy": 0.7698346138000488, "num_input_tokens_seen": 10103778, "num_tokens": 10103778.0, "step": 4585, "train_runtime": 7197.2982, "train_tokens_per_second": 1403.829 }, { "entropy": 2.14777729511261, "epoch": 0.918, "grad_norm": 15.779912948608398, "learning_rate": 3.522924268489003e-07, "loss": 1.3484, "mean_token_accuracy": 0.6770269274711609, "num_input_tokens_seen": 10114686, "num_tokens": 10114686.0, "step": 4590, "train_runtime": 7204.9463, "train_tokens_per_second": 1403.853 }, { "entropy": 1.6165046215057373, "epoch": 0.919, "grad_norm": 6.6977057456970215, "learning_rate": 3.438221364619776e-07, "loss": 0.9043, "mean_token_accuracy": 0.7653825640678406, "num_input_tokens_seen": 10125783, "num_tokens": 10125783.0, "step": 4595, "train_runtime": 7212.5568, "train_tokens_per_second": 1403.91 }, { "entropy": 1.6717356443405151, "epoch": 0.92, "grad_norm": 8.92138385772705, "learning_rate": 3.3545313481161743e-07, "loss": 1.068, "mean_token_accuracy": 0.7325842380523682, "num_input_tokens_seen": 10136530, "num_tokens": 10136530.0, "step": 4600, "train_runtime": 7220.4485, "train_tokens_per_second": 1403.864 }, { "entropy": 1.930350089073181, "epoch": 0.921, "grad_norm": 8.80907917022705, "learning_rate": 3.271855096846899e-07, "loss": 1.1257, "mean_token_accuracy": 0.7260264992713928, "num_input_tokens_seen": 10149307, "num_tokens": 10149307.0, "step": 4605, "train_runtime": 7228.3794, "train_tokens_per_second": 1404.092 }, { "entropy": 1.4779713869094848, "epoch": 0.922, "grad_norm": 10.08457088470459, "learning_rate": 3.190193478046677e-07, "loss": 0.9078, "mean_token_accuracy": 0.7654569864273071, "num_input_tokens_seen": 10161136, "num_tokens": 10161136.0, "step": 4610, "train_runtime": 7236.3195, "train_tokens_per_second": 1404.186 }, { "entropy": 1.4505781531333923, "epoch": 0.923, "grad_norm": 14.213709831237793, "learning_rate": 3.1095473483072733e-07, "loss": 1.2072, "mean_token_accuracy": 0.707127320766449, "num_input_tokens_seen": 10174491, "num_tokens": 10174491.0, "step": 4615, "train_runtime": 7244.1276, "train_tokens_per_second": 1404.516 }, { "entropy": 1.7937794208526612, "epoch": 0.924, "grad_norm": 8.706068992614746, "learning_rate": 3.029917553568407e-07, "loss": 1.2421, "mean_token_accuracy": 0.6931609988212586, "num_input_tokens_seen": 10187778, "num_tokens": 10187778.0, "step": 4620, "train_runtime": 7252.1283, "train_tokens_per_second": 1404.798 }, { "entropy": 2.4729114055633543, "epoch": 0.925, "grad_norm": 33.4887809753418, "learning_rate": 2.951304929108956e-07, "loss": 1.2208, "mean_token_accuracy": 0.7066600322723389, "num_input_tokens_seen": 10201216, "num_tokens": 10201216.0, "step": 4625, "train_runtime": 7259.7177, "train_tokens_per_second": 1405.181 }, { "entropy": 2.1598052740097047, "epoch": 0.926, "grad_norm": 5.8318634033203125, "learning_rate": 2.873710299538146e-07, "loss": 1.1152, "mean_token_accuracy": 0.7258899092674256, "num_input_tokens_seen": 10212272, "num_tokens": 10212272.0, "step": 4630, "train_runtime": 7267.6613, "train_tokens_per_second": 1405.166 }, { "entropy": 1.2053056478500366, "epoch": 0.927, "grad_norm": 15.132913589477539, "learning_rate": 2.7971344787869114e-07, "loss": 0.8745, "mean_token_accuracy": 0.7816118717193603, "num_input_tokens_seen": 10222547, "num_tokens": 10222547.0, "step": 4635, "train_runtime": 7275.4641, "train_tokens_per_second": 1405.071 }, { "entropy": 1.417881965637207, "epoch": 0.928, "grad_norm": 7.951303005218506, "learning_rate": 2.721578270099412e-07, "loss": 1.2872, "mean_token_accuracy": 0.6851288676261902, "num_input_tokens_seen": 10234770, "num_tokens": 10234770.0, "step": 4640, "train_runtime": 7283.5373, "train_tokens_per_second": 1405.192 }, { "entropy": 2.010025215148926, "epoch": 0.929, "grad_norm": 13.547046661376953, "learning_rate": 2.647042466024485e-07, "loss": 1.1106, "mean_token_accuracy": 0.7200621604919434, "num_input_tokens_seen": 10246587, "num_tokens": 10246587.0, "step": 4645, "train_runtime": 7291.3791, "train_tokens_per_second": 1405.302 }, { "entropy": 2.272212862968445, "epoch": 0.93, "grad_norm": 57.44300079345703, "learning_rate": 2.5735278484074865e-07, "loss": 1.185, "mean_token_accuracy": 0.6991275191307068, "num_input_tokens_seen": 10258980, "num_tokens": 10258980.0, "step": 4650, "train_runtime": 7299.3725, "train_tokens_per_second": 1405.461 }, { "entropy": 1.3293099403381348, "epoch": 0.931, "grad_norm": 7.847452163696289, "learning_rate": 2.5010351883819283e-07, "loss": 1.0101, "mean_token_accuracy": 0.7313501596450805, "num_input_tokens_seen": 10268780, "num_tokens": 10268780.0, "step": 4655, "train_runtime": 7306.9054, "train_tokens_per_second": 1405.353 }, { "entropy": 1.0743606448173524, "epoch": 0.932, "grad_norm": 6.7764105796813965, "learning_rate": 2.429565246361532e-07, "loss": 0.9207, "mean_token_accuracy": 0.7556821584701539, "num_input_tokens_seen": 10279204, "num_tokens": 10279204.0, "step": 4660, "train_runtime": 7314.9254, "train_tokens_per_second": 1405.237 }, { "entropy": 1.2841739416122437, "epoch": 0.933, "grad_norm": 22.646886825561523, "learning_rate": 2.359118772032176e-07, "loss": 0.9944, "mean_token_accuracy": 0.7400426864624023, "num_input_tokens_seen": 10292291, "num_tokens": 10292291.0, "step": 4665, "train_runtime": 7322.8648, "train_tokens_per_second": 1405.501 }, { "entropy": 1.943468451499939, "epoch": 0.934, "grad_norm": 9.264581680297852, "learning_rate": 2.289696504344019e-07, "loss": 1.0731, "mean_token_accuracy": 0.715036141872406, "num_input_tokens_seen": 10304976, "num_tokens": 10304976.0, "step": 4670, "train_runtime": 7330.9053, "train_tokens_per_second": 1405.689 }, { "entropy": 1.4508253574371337, "epoch": 0.935, "grad_norm": 33.67607116699219, "learning_rate": 2.2212991715038324e-07, "loss": 1.1683, "mean_token_accuracy": 0.7240985870361328, "num_input_tokens_seen": 10319021, "num_tokens": 10319021.0, "step": 4675, "train_runtime": 7338.8356, "train_tokens_per_second": 1406.084 }, { "entropy": 1.5609180927276611, "epoch": 0.936, "grad_norm": 11.602736473083496, "learning_rate": 2.1539274909672337e-07, "loss": 0.9051, "mean_token_accuracy": 0.7605258464813233, "num_input_tokens_seen": 10328094, "num_tokens": 10328094.0, "step": 4680, "train_runtime": 7346.6299, "train_tokens_per_second": 1405.827 }, { "entropy": 1.7601311683654786, "epoch": 0.937, "grad_norm": 4.9318718910217285, "learning_rate": 2.0875821694313014e-07, "loss": 0.866, "mean_token_accuracy": 0.7825977206230164, "num_input_tokens_seen": 10341566, "num_tokens": 10341566.0, "step": 4685, "train_runtime": 7354.3756, "train_tokens_per_second": 1406.179 }, { "entropy": 1.5141392707824708, "epoch": 0.938, "grad_norm": 15.47298526763916, "learning_rate": 2.0222639028270486e-07, "loss": 0.9023, "mean_token_accuracy": 0.7577151656150818, "num_input_tokens_seen": 10351902, "num_tokens": 10351902.0, "step": 4690, "train_runtime": 7362.2747, "train_tokens_per_second": 1406.074 }, { "entropy": 1.243553137779236, "epoch": 0.939, "grad_norm": 6.127535343170166, "learning_rate": 1.9579733763121943e-07, "loss": 0.9515, "mean_token_accuracy": 0.7612414836883545, "num_input_tokens_seen": 10359674, "num_tokens": 10359674.0, "step": 4695, "train_runtime": 7369.947, "train_tokens_per_second": 1405.665 }, { "entropy": 1.4229054570198059, "epoch": 0.94, "grad_norm": 5.495715618133545, "learning_rate": 1.8947112642639376e-07, "loss": 0.8238, "mean_token_accuracy": 0.7889649748802186, "num_input_tokens_seen": 10369480, "num_tokens": 10369480.0, "step": 4700, "train_runtime": 7377.4914, "train_tokens_per_second": 1405.556 }, { "entropy": 1.730784034729004, "epoch": 0.941, "grad_norm": 12.507272720336914, "learning_rate": 1.8324782302718835e-07, "loss": 1.0808, "mean_token_accuracy": 0.7321398496627808, "num_input_tokens_seen": 10382014, "num_tokens": 10382014.0, "step": 4705, "train_runtime": 7385.5691, "train_tokens_per_second": 1405.716 }, { "entropy": 1.5762312889099122, "epoch": 0.942, "grad_norm": 5.205252647399902, "learning_rate": 1.7712749271311392e-07, "loss": 1.0161, "mean_token_accuracy": 0.7497888922691345, "num_input_tokens_seen": 10393862, "num_tokens": 10393862.0, "step": 4710, "train_runtime": 7393.519, "train_tokens_per_second": 1405.807 }, { "entropy": 1.6902926921844483, "epoch": 0.943, "grad_norm": 7.1734418869018555, "learning_rate": 1.7111019968353625e-07, "loss": 1.1387, "mean_token_accuracy": 0.7142775058746338, "num_input_tokens_seen": 10405665, "num_tokens": 10405665.0, "step": 4715, "train_runtime": 7401.2421, "train_tokens_per_second": 1405.935 }, { "entropy": 1.5544729709625245, "epoch": 0.944, "grad_norm": 9.978860855102539, "learning_rate": 1.6519600705701465e-07, "loss": 1.0953, "mean_token_accuracy": 0.7222066283226013, "num_input_tokens_seen": 10417086, "num_tokens": 10417086.0, "step": 4720, "train_runtime": 7409.2806, "train_tokens_per_second": 1405.951 }, { "entropy": 1.3775517225265503, "epoch": 0.945, "grad_norm": 6.700134754180908, "learning_rate": 1.5938497687062905e-07, "loss": 0.7445, "mean_token_accuracy": 0.7971397876739502, "num_input_tokens_seen": 10426571, "num_tokens": 10426571.0, "step": 4725, "train_runtime": 7417.1181, "train_tokens_per_second": 1405.744 }, { "entropy": 1.786057186126709, "epoch": 0.946, "grad_norm": 15.491426467895508, "learning_rate": 1.5367717007933826e-07, "loss": 1.1601, "mean_token_accuracy": 0.7168210029602051, "num_input_tokens_seen": 10437224, "num_tokens": 10437224.0, "step": 4730, "train_runtime": 7424.7732, "train_tokens_per_second": 1405.73 }, { "entropy": 1.4827812671661378, "epoch": 0.947, "grad_norm": 9.146665573120117, "learning_rate": 1.4807264655533282e-07, "loss": 0.9349, "mean_token_accuracy": 0.7576950907707214, "num_input_tokens_seen": 10446085, "num_tokens": 10446085.0, "step": 4735, "train_runtime": 7432.5318, "train_tokens_per_second": 1405.454 }, { "entropy": 1.8378747940063476, "epoch": 0.948, "grad_norm": 11.476910591125488, "learning_rate": 1.4257146508741436e-07, "loss": 1.0734, "mean_token_accuracy": 0.7258716940879821, "num_input_tokens_seen": 10458022, "num_tokens": 10458022.0, "step": 4740, "train_runtime": 7440.3052, "train_tokens_per_second": 1405.59 }, { "entropy": 1.608994197845459, "epoch": 0.949, "grad_norm": 8.907936096191406, "learning_rate": 1.3717368338037163e-07, "loss": 0.9454, "mean_token_accuracy": 0.7541883707046508, "num_input_tokens_seen": 10471631, "num_tokens": 10471631.0, "step": 4745, "train_runtime": 7448.4026, "train_tokens_per_second": 1405.89 }, { "entropy": 1.53304181098938, "epoch": 0.95, "grad_norm": 8.358020782470703, "learning_rate": 1.318793580543809e-07, "loss": 0.9896, "mean_token_accuracy": 0.7534858822822571, "num_input_tokens_seen": 10483528, "num_tokens": 10483528.0, "step": 4750, "train_runtime": 7456.3164, "train_tokens_per_second": 1405.993 }, { "entropy": 1.9662979602813722, "epoch": 0.951, "grad_norm": 7.691495895385742, "learning_rate": 1.2668854464441104e-07, "loss": 1.0605, "mean_token_accuracy": 0.726511538028717, "num_input_tokens_seen": 10496804, "num_tokens": 10496804.0, "step": 4755, "train_runtime": 7464.5487, "train_tokens_per_second": 1406.221 }, { "entropy": 2.0233491897583007, "epoch": 0.952, "grad_norm": 9.727952003479004, "learning_rate": 1.2160129759963723e-07, "loss": 1.2931, "mean_token_accuracy": 0.6846873044967652, "num_input_tokens_seen": 10510146, "num_tokens": 10510146.0, "step": 4760, "train_runtime": 7472.3163, "train_tokens_per_second": 1406.545 }, { "entropy": 1.669122838973999, "epoch": 0.953, "grad_norm": 5.147618293762207, "learning_rate": 1.1661767028287363e-07, "loss": 0.9196, "mean_token_accuracy": 0.759040915966034, "num_input_tokens_seen": 10523545, "num_tokens": 10523545.0, "step": 4765, "train_runtime": 7480.5118, "train_tokens_per_second": 1406.795 }, { "entropy": 1.5854533433914184, "epoch": 0.954, "grad_norm": 16.651477813720703, "learning_rate": 1.1173771497001273e-07, "loss": 1.332, "mean_token_accuracy": 0.7037554979324341, "num_input_tokens_seen": 10537536, "num_tokens": 10537536.0, "step": 4770, "train_runtime": 7488.3568, "train_tokens_per_second": 1407.189 }, { "entropy": 1.8342015504837037, "epoch": 0.955, "grad_norm": 18.505809783935547, "learning_rate": 1.0696148284947694e-07, "loss": 1.0151, "mean_token_accuracy": 0.7473283171653747, "num_input_tokens_seen": 10551554, "num_tokens": 10551554.0, "step": 4775, "train_runtime": 7496.4002, "train_tokens_per_second": 1407.549 }, { "entropy": 2.0686303615570067, "epoch": 0.956, "grad_norm": 8.64905834197998, "learning_rate": 1.0228902402168118e-07, "loss": 1.0566, "mean_token_accuracy": 0.736926007270813, "num_input_tokens_seen": 10565962, "num_tokens": 10565962.0, "step": 4780, "train_runtime": 7504.515, "train_tokens_per_second": 1407.947 }, { "entropy": 1.6846899747848512, "epoch": 0.957, "grad_norm": 7.293910026550293, "learning_rate": 9.772038749850665e-08, "loss": 1.1336, "mean_token_accuracy": 0.7104639530181884, "num_input_tokens_seen": 10578076, "num_tokens": 10578076.0, "step": 4785, "train_runtime": 7512.1774, "train_tokens_per_second": 1408.124 }, { "entropy": 1.4189405679702758, "epoch": 0.958, "grad_norm": 15.14293098449707, "learning_rate": 9.32556212027902e-08, "loss": 1.0243, "mean_token_accuracy": 0.7429918885231018, "num_input_tokens_seen": 10590166, "num_tokens": 10590166.0, "step": 4790, "train_runtime": 7520.125, "train_tokens_per_second": 1408.243 }, { "entropy": 2.2229612350463865, "epoch": 0.959, "grad_norm": 24.87360382080078, "learning_rate": 8.889477196781571e-08, "loss": 1.0969, "mean_token_accuracy": 0.7167801022529602, "num_input_tokens_seen": 10603050, "num_tokens": 10603050.0, "step": 4795, "train_runtime": 7528.0245, "train_tokens_per_second": 1408.477 }, { "entropy": 1.9885172367095947, "epoch": 0.96, "grad_norm": 57.06581115722656, "learning_rate": 8.463788553683017e-08, "loss": 1.1855, "mean_token_accuracy": 0.7048902750015259, "num_input_tokens_seen": 10614792, "num_tokens": 10614792.0, "step": 4800, "train_runtime": 7535.6978, "train_tokens_per_second": 1408.601 }, { "entropy": 2.099240279197693, "epoch": 0.961, "grad_norm": 10.795445442199707, "learning_rate": 8.04850065625551e-08, "loss": 1.125, "mean_token_accuracy": 0.7275569677352905, "num_input_tokens_seen": 10626986, "num_tokens": 10626986.0, "step": 4805, "train_runtime": 7543.6666, "train_tokens_per_second": 1408.73 }, { "entropy": 1.6192439317703247, "epoch": 0.962, "grad_norm": 24.295989990234375, "learning_rate": 7.643617860672914e-08, "loss": 0.9459, "mean_token_accuracy": 0.7598836421966553, "num_input_tokens_seen": 10638032, "num_tokens": 10638032.0, "step": 4810, "train_runtime": 7551.5133, "train_tokens_per_second": 1408.728 }, { "entropy": 1.1080342173576354, "epoch": 0.963, "grad_norm": 7.032379627227783, "learning_rate": 7.24914441396396e-08, "loss": 0.9307, "mean_token_accuracy": 0.7420334696769715, "num_input_tokens_seen": 10650274, "num_tokens": 10650274.0, "step": 4815, "train_runtime": 7559.4617, "train_tokens_per_second": 1408.867 }, { "entropy": 1.5703843593597413, "epoch": 0.964, "grad_norm": 5.671489715576172, "learning_rate": 6.865084453968495e-08, "loss": 0.9057, "mean_token_accuracy": 0.7595803499221802, "num_input_tokens_seen": 10661862, "num_tokens": 10661862.0, "step": 4820, "train_runtime": 7567.4252, "train_tokens_per_second": 1408.915 }, { "entropy": 1.6959205389022827, "epoch": 0.965, "grad_norm": 9.617012977600098, "learning_rate": 6.491442009293858e-08, "loss": 1.0924, "mean_token_accuracy": 0.7288106322288513, "num_input_tokens_seen": 10676906, "num_tokens": 10676906.0, "step": 4825, "train_runtime": 7575.5688, "train_tokens_per_second": 1409.387 }, { "entropy": 1.8105109214782715, "epoch": 0.966, "grad_norm": 12.973583221435547, "learning_rate": 6.12822099927235e-08, "loss": 1.0375, "mean_token_accuracy": 0.7407364010810852, "num_input_tokens_seen": 10689580, "num_tokens": 10689580.0, "step": 4830, "train_runtime": 7583.3323, "train_tokens_per_second": 1409.615 }, { "entropy": 2.1567612171173094, "epoch": 0.967, "grad_norm": 42.382545471191406, "learning_rate": 5.7754252339204955e-08, "loss": 1.0357, "mean_token_accuracy": 0.7431897163391114, "num_input_tokens_seen": 10703005, "num_tokens": 10703005.0, "step": 4835, "train_runtime": 7591.4232, "train_tokens_per_second": 1409.881 }, { "entropy": 1.4394302129745484, "epoch": 0.968, "grad_norm": 7.607638835906982, "learning_rate": 5.4330584138989615e-08, "loss": 0.9239, "mean_token_accuracy": 0.7622191667556762, "num_input_tokens_seen": 10714700, "num_tokens": 10714700.0, "step": 4840, "train_runtime": 7599.3904, "train_tokens_per_second": 1409.942 }, { "entropy": 1.3555230379104615, "epoch": 0.969, "grad_norm": 6.09700870513916, "learning_rate": 5.1011241304738115e-08, "loss": 1.0503, "mean_token_accuracy": 0.722472357749939, "num_input_tokens_seen": 10728460, "num_tokens": 10728460.0, "step": 4845, "train_runtime": 7607.2911, "train_tokens_per_second": 1410.287 }, { "entropy": 1.5757088661193848, "epoch": 0.97, "grad_norm": 8.40723705291748, "learning_rate": 4.779625865478421e-08, "loss": 0.9956, "mean_token_accuracy": 0.7373381853103638, "num_input_tokens_seen": 10741124, "num_tokens": 10741124.0, "step": 4850, "train_runtime": 7615.3331, "train_tokens_per_second": 1410.46 }, { "entropy": 1.6040757179260254, "epoch": 0.971, "grad_norm": 10.422640800476074, "learning_rate": 4.468566991277512e-08, "loss": 1.0678, "mean_token_accuracy": 0.7337944626808166, "num_input_tokens_seen": 10752344, "num_tokens": 10752344.0, "step": 4855, "train_runtime": 7623.0195, "train_tokens_per_second": 1410.51 }, { "entropy": 1.6141828775405884, "epoch": 0.972, "grad_norm": 5.870348930358887, "learning_rate": 4.1679507707315106e-08, "loss": 1.0699, "mean_token_accuracy": 0.7307448863983155, "num_input_tokens_seen": 10762822, "num_tokens": 10762822.0, "step": 4860, "train_runtime": 7630.9512, "train_tokens_per_second": 1410.417 }, { "entropy": 0.8655026197433472, "epoch": 0.973, "grad_norm": 11.990184783935547, "learning_rate": 3.877780357162353e-08, "loss": 0.9495, "mean_token_accuracy": 0.7445281028747559, "num_input_tokens_seen": 10773389, "num_tokens": 10773389.0, "step": 4865, "train_runtime": 7638.792, "train_tokens_per_second": 1410.352 }, { "entropy": 1.1548554182052613, "epoch": 0.974, "grad_norm": 5.626952648162842, "learning_rate": 3.598058794320402e-08, "loss": 0.9719, "mean_token_accuracy": 0.7400847196578979, "num_input_tokens_seen": 10781218, "num_tokens": 10781218.0, "step": 4870, "train_runtime": 7646.2906, "train_tokens_per_second": 1409.993 }, { "entropy": 2.0187264919281005, "epoch": 0.975, "grad_norm": 13.410210609436035, "learning_rate": 3.3287890163523626e-08, "loss": 1.0108, "mean_token_accuracy": 0.746239984035492, "num_input_tokens_seen": 10793008, "num_tokens": 10793008.0, "step": 4875, "train_runtime": 7654.1924, "train_tokens_per_second": 1410.078 }, { "entropy": 1.9458348274230957, "epoch": 0.976, "grad_norm": 9.771912574768066, "learning_rate": 3.0699738477708576e-08, "loss": 1.0139, "mean_token_accuracy": 0.7470673680305481, "num_input_tokens_seen": 10805812, "num_tokens": 10805812.0, "step": 4880, "train_runtime": 7662.2433, "train_tokens_per_second": 1410.267 }, { "entropy": 1.4348028421401977, "epoch": 0.977, "grad_norm": 7.12460994720459, "learning_rate": 2.8216160034244544e-08, "loss": 1.0912, "mean_token_accuracy": 0.7254775404930115, "num_input_tokens_seen": 10818640, "num_tokens": 10818640.0, "step": 4885, "train_runtime": 7669.9641, "train_tokens_per_second": 1410.52 }, { "entropy": 1.5954820156097411, "epoch": 0.978, "grad_norm": 11.460736274719238, "learning_rate": 2.583718088469689e-08, "loss": 1.0907, "mean_token_accuracy": 0.7227721214294434, "num_input_tokens_seen": 10830678, "num_tokens": 10830678.0, "step": 4890, "train_runtime": 7678.0411, "train_tokens_per_second": 1410.604 }, { "entropy": 1.7250642776489258, "epoch": 0.979, "grad_norm": 10.71335506439209, "learning_rate": 2.3562825983427517e-08, "loss": 1.0554, "mean_token_accuracy": 0.7291754722595215, "num_input_tokens_seen": 10843771, "num_tokens": 10843771.0, "step": 4895, "train_runtime": 7686.0263, "train_tokens_per_second": 1410.842 }, { "entropy": 1.78792142868042, "epoch": 0.98, "grad_norm": 9.017132759094238, "learning_rate": 2.1393119187345103e-08, "loss": 1.1313, "mean_token_accuracy": 0.7231282949447632, "num_input_tokens_seen": 10857828, "num_tokens": 10857828.0, "step": 4900, "train_runtime": 7694.2124, "train_tokens_per_second": 1411.168 }, { "entropy": 1.8033751010894776, "epoch": 0.981, "grad_norm": 13.800712585449219, "learning_rate": 1.93280832556475e-08, "loss": 1.1291, "mean_token_accuracy": 0.7148388981819153, "num_input_tokens_seen": 10871933, "num_tokens": 10871933.0, "step": 4905, "train_runtime": 7702.2279, "train_tokens_per_second": 1411.531 }, { "entropy": 1.5847553968429566, "epoch": 0.982, "grad_norm": 6.25689172744751, "learning_rate": 1.7367739849584174e-08, "loss": 0.9607, "mean_token_accuracy": 0.751439380645752, "num_input_tokens_seen": 10882578, "num_tokens": 10882578.0, "step": 4910, "train_runtime": 7709.7171, "train_tokens_per_second": 1411.541 }, { "entropy": 1.543188452720642, "epoch": 0.983, "grad_norm": 19.561914443969727, "learning_rate": 1.5512109532229703e-08, "loss": 0.9552, "mean_token_accuracy": 0.7475033164024353, "num_input_tokens_seen": 10894907, "num_tokens": 10894907.0, "step": 4915, "train_runtime": 7717.7144, "train_tokens_per_second": 1411.675 }, { "entropy": 1.582392644882202, "epoch": 0.984, "grad_norm": 18.354089736938477, "learning_rate": 1.376121176826728e-08, "loss": 1.1902, "mean_token_accuracy": 0.712268054485321, "num_input_tokens_seen": 10905186, "num_tokens": 10905186.0, "step": 4920, "train_runtime": 7725.4893, "train_tokens_per_second": 1411.585 }, { "entropy": 1.8910357713699342, "epoch": 0.985, "grad_norm": 6.1599626541137695, "learning_rate": 1.2115064923787778e-08, "loss": 0.9499, "mean_token_accuracy": 0.7612193584442138, "num_input_tokens_seen": 10916458, "num_tokens": 10916458.0, "step": 4925, "train_runtime": 7733.34, "train_tokens_per_second": 1411.61 }, { "entropy": 2.0721890687942506, "epoch": 0.986, "grad_norm": 60.224365234375, "learning_rate": 1.057368626609101e-08, "loss": 1.0979, "mean_token_accuracy": 0.728142511844635, "num_input_tokens_seen": 10928424, "num_tokens": 10928424.0, "step": 4930, "train_runtime": 7741.1854, "train_tokens_per_second": 1411.725 }, { "entropy": 1.3407955169677734, "epoch": 0.987, "grad_norm": 13.35656452178955, "learning_rate": 9.137091963510314e-09, "loss": 0.8477, "mean_token_accuracy": 0.7785102248191833, "num_input_tokens_seen": 10940524, "num_tokens": 10940524.0, "step": 4935, "train_runtime": 7748.8234, "train_tokens_per_second": 1411.895 }, { "entropy": 1.7365794420242309, "epoch": 0.988, "grad_norm": 7.942739486694336, "learning_rate": 7.80529708523936e-09, "loss": 1.0342, "mean_token_accuracy": 0.7385509252548218, "num_input_tokens_seen": 10953712, "num_tokens": 10953712.0, "step": 4940, "train_runtime": 7756.6322, "train_tokens_per_second": 1412.174 }, { "entropy": 1.6682606935501099, "epoch": 0.989, "grad_norm": 12.909952163696289, "learning_rate": 6.5783156011778315e-09, "loss": 1.0518, "mean_token_accuracy": 0.7278354167938232, "num_input_tokens_seen": 10966318, "num_tokens": 10966318.0, "step": 4945, "train_runtime": 7764.73, "train_tokens_per_second": 1412.324 }, { "entropy": 1.3110265731811523, "epoch": 0.99, "grad_norm": 5.09813117980957, "learning_rate": 5.456160381779319e-09, "loss": 0.8264, "mean_token_accuracy": 0.7860806584358215, "num_input_tokens_seen": 10976160, "num_tokens": 10976160.0, "step": 4950, "train_runtime": 7772.5088, "train_tokens_per_second": 1412.177 }, { "entropy": 1.7595218658447265, "epoch": 0.991, "grad_norm": 9.121787071228027, "learning_rate": 4.438843197922538e-09, "loss": 0.9817, "mean_token_accuracy": 0.7427166938781739, "num_input_tokens_seen": 10989299, "num_tokens": 10989299.0, "step": 4955, "train_runtime": 7780.5673, "train_tokens_per_second": 1412.403 }, { "entropy": 2.1359416961669924, "epoch": 0.992, "grad_norm": 9.002752304077148, "learning_rate": 3.526374720782544e-09, "loss": 1.3379, "mean_token_accuracy": 0.6794438481330871, "num_input_tokens_seen": 11001036, "num_tokens": 11001036.0, "step": 4960, "train_runtime": 7788.1756, "train_tokens_per_second": 1412.531 }, { "entropy": 1.278689169883728, "epoch": 0.993, "grad_norm": 8.387939453125, "learning_rate": 2.7187645217219283e-09, "loss": 0.9401, "mean_token_accuracy": 0.756875741481781, "num_input_tokens_seen": 11013766, "num_tokens": 11013766.0, "step": 4965, "train_runtime": 7796.1071, "train_tokens_per_second": 1412.726 }, { "entropy": 1.3905704855918883, "epoch": 0.994, "grad_norm": 20.114803314208984, "learning_rate": 2.0160210721886788e-09, "loss": 0.9876, "mean_token_accuracy": 0.7458947777748108, "num_input_tokens_seen": 11024608, "num_tokens": 11024608.0, "step": 4970, "train_runtime": 7803.5985, "train_tokens_per_second": 1412.76 }, { "entropy": 1.5877843260765077, "epoch": 0.995, "grad_norm": 18.071781158447266, "learning_rate": 1.4181517436306913e-09, "loss": 1.103, "mean_token_accuracy": 0.7296060442924499, "num_input_tokens_seen": 11037759, "num_tokens": 11037759.0, "step": 4975, "train_runtime": 7811.5694, "train_tokens_per_second": 1413.001 }, { "entropy": 1.5151433944702148, "epoch": 0.996, "grad_norm": 9.041559219360352, "learning_rate": 9.251628074136154e-10, "loss": 1.0944, "mean_token_accuracy": 0.733426547050476, "num_input_tokens_seen": 11052188, "num_tokens": 11052188.0, "step": 4980, "train_runtime": 7819.6271, "train_tokens_per_second": 1413.391 }, { "entropy": 1.3551754713058473, "epoch": 0.997, "grad_norm": 6.245151519775391, "learning_rate": 5.370594347575697e-10, "loss": 1.0299, "mean_token_accuracy": 0.7346912264823914, "num_input_tokens_seen": 11064880, "num_tokens": 11064880.0, "step": 4985, "train_runtime": 7827.6178, "train_tokens_per_second": 1413.569 }, { "entropy": 1.3726842880249024, "epoch": 0.998, "grad_norm": 7.23486328125, "learning_rate": 2.538456966838521e-10, "loss": 0.8654, "mean_token_accuracy": 0.7638851881027222, "num_input_tokens_seen": 11075496, "num_tokens": 11075496.0, "step": 4990, "train_runtime": 7835.1669, "train_tokens_per_second": 1413.562 }, { "entropy": 1.7637147426605224, "epoch": 0.999, "grad_norm": 13.324174880981445, "learning_rate": 7.552456397053042e-11, "loss": 1.0701, "mean_token_accuracy": 0.7285847425460815, "num_input_tokens_seen": 11085841, "num_tokens": 11085841.0, "step": 4995, "train_runtime": 7843.2103, "train_tokens_per_second": 1413.432 }, { "entropy": 2.12231388092041, "epoch": 1.0, "grad_norm": 14.363571166992188, "learning_rate": 2.0979071224669357e-12, "loss": 1.158, "mean_token_accuracy": 0.706820809841156, "num_input_tokens_seen": 11100156, "num_tokens": 11100156.0, "step": 5000, "train_runtime": 7851.1941, "train_tokens_per_second": 1413.818 }, { "epoch": 1.0, "num_input_tokens_seen": 11100156, "step": 5000, "total_flos": 2.57984362543317e+17, "train_loss": 1.2531329622745513, "train_runtime": 7851.2112, "train_samples_per_second": 0.637, "train_steps_per_second": 0.637, "train_tokens_per_second": 1363.491 } ], "logging_steps": 5, "max_steps": 5000, "num_input_tokens_seen": 11100156, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.57984362543317e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }