{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2819, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000354735721887194, "grad_norm": 1.5881168842315674, "learning_rate": 0.0, "loss": 0.7665, "num_input_tokens_seen": 3998524, "step": 1, "train_runtime": 27.9781, "train_tokens_per_second": 142916.367 }, { "epoch": 0.000709471443774388, "grad_norm": 1.544163465499878, "learning_rate": 6.666666666666667e-06, "loss": 0.7335, "num_input_tokens_seen": 7886722, "step": 2, "train_runtime": 63.6929, "train_tokens_per_second": 123824.273 }, { "epoch": 0.001064207165661582, "grad_norm": 1.142810344696045, "learning_rate": 1.3333333333333333e-05, "loss": 0.7523, "num_input_tokens_seen": 11716217, "step": 3, "train_runtime": 79.6606, "train_tokens_per_second": 147076.751 }, { "epoch": 0.001418942887548776, "grad_norm": 1.6597518920898438, "learning_rate": 2e-05, "loss": 0.7095, "num_input_tokens_seen": 15717553, "step": 4, "train_runtime": 99.5251, "train_tokens_per_second": 157925.469 }, { "epoch": 0.0017736786094359701, "grad_norm": 4.615099906921387, "learning_rate": 2.6666666666666667e-05, "loss": 0.8572, "num_input_tokens_seen": 19491895, "step": 5, "train_runtime": 122.7493, "train_tokens_per_second": 158794.405 }, { "epoch": 0.002128414331323164, "grad_norm": 4.796090602874756, "learning_rate": 3.3333333333333335e-05, "loss": 0.8403, "num_input_tokens_seen": 23475149, "step": 6, "train_runtime": 141.8901, "train_tokens_per_second": 165446.013 }, { "epoch": 0.0024831500532103584, "grad_norm": 3.4782140254974365, "learning_rate": 4e-05, "loss": 0.8419, "num_input_tokens_seen": 27297828, "step": 7, "train_runtime": 161.8908, "train_tokens_per_second": 168618.762 }, { "epoch": 0.002837885775097552, "grad_norm": 2.7081494331359863, "learning_rate": 3.999998752730692e-05, "loss": 0.8275, "num_input_tokens_seen": 31164479, "step": 8, "train_runtime": 181.4074, "train_tokens_per_second": 171792.729 }, { "epoch": 0.0031926214969847464, "grad_norm": 2.78900146484375, "learning_rate": 3.999995010924321e-05, "loss": 0.7812, "num_input_tokens_seen": 35046573, "step": 9, "train_runtime": 199.3685, "train_tokens_per_second": 175787.948 }, { "epoch": 0.0035473572188719402, "grad_norm": 2.086149215698242, "learning_rate": 3.9999887745855554e-05, "loss": 0.7738, "num_input_tokens_seen": 38886472, "step": 10, "train_runtime": 215.565, "train_tokens_per_second": 180393.296 }, { "epoch": 0.0039020929407591345, "grad_norm": 1.7231736183166504, "learning_rate": 3.999980043722173e-05, "loss": 0.7443, "num_input_tokens_seen": 42847227, "step": 11, "train_runtime": 237.7753, "train_tokens_per_second": 180200.5 }, { "epoch": 0.004256828662646328, "grad_norm": 1.4852551221847534, "learning_rate": 3.999968818345064e-05, "loss": 0.7191, "num_input_tokens_seen": 46746647, "step": 12, "train_runtime": 265.6946, "train_tokens_per_second": 175941.25 }, { "epoch": 0.0046115643845335225, "grad_norm": 1.2579160928726196, "learning_rate": 3.999955098468229e-05, "loss": 0.6978, "num_input_tokens_seen": 50571017, "step": 13, "train_runtime": 299.001, "train_tokens_per_second": 169133.245 }, { "epoch": 0.004966300106420717, "grad_norm": 1.1194126605987549, "learning_rate": 3.9999388841087815e-05, "loss": 0.6753, "num_input_tokens_seen": 54380917, "step": 14, "train_runtime": 321.7643, "train_tokens_per_second": 169008.526 }, { "epoch": 0.005321035828307911, "grad_norm": 0.8913609385490417, "learning_rate": 3.999920175286944e-05, "loss": 0.6884, "num_input_tokens_seen": 58288455, "step": 15, "train_runtime": 346.5986, "train_tokens_per_second": 168172.802 }, { "epoch": 0.005675771550195104, "grad_norm": 1.0659162998199463, "learning_rate": 3.999898972026052e-05, "loss": 0.6685, "num_input_tokens_seen": 62167647, "step": 16, "train_runtime": 366.1319, "train_tokens_per_second": 169795.75 }, { "epoch": 0.006030507272082299, "grad_norm": 1.313161849975586, "learning_rate": 3.999875274352551e-05, "loss": 0.6622, "num_input_tokens_seen": 66047446, "step": 17, "train_runtime": 386.9712, "train_tokens_per_second": 170677.919 }, { "epoch": 0.006385242993969493, "grad_norm": 1.1570100784301758, "learning_rate": 3.999849082295999e-05, "loss": 0.645, "num_input_tokens_seen": 69933117, "step": 18, "train_runtime": 406.1387, "train_tokens_per_second": 172190.227 }, { "epoch": 0.006739978715856687, "grad_norm": 0.9069727063179016, "learning_rate": 3.999820395889065e-05, "loss": 0.6381, "num_input_tokens_seen": 73723271, "step": 19, "train_runtime": 427.4668, "train_tokens_per_second": 172465.492 }, { "epoch": 0.0070947144377438804, "grad_norm": 0.8015176057815552, "learning_rate": 3.999789215167527e-05, "loss": 0.6276, "num_input_tokens_seen": 77631549, "step": 20, "train_runtime": 447.2391, "train_tokens_per_second": 173579.507 }, { "epoch": 0.007449450159631075, "grad_norm": 0.8420839309692383, "learning_rate": 3.9997555401702775e-05, "loss": 0.6287, "num_input_tokens_seen": 81500021, "step": 21, "train_runtime": 469.3231, "train_tokens_per_second": 173654.392 }, { "epoch": 0.007804185881518269, "grad_norm": 0.9281284213066101, "learning_rate": 3.9997193709393175e-05, "loss": 0.6219, "num_input_tokens_seen": 85464244, "step": 22, "train_runtime": 492.7728, "train_tokens_per_second": 173435.397 }, { "epoch": 0.008158921603405462, "grad_norm": 0.97067791223526, "learning_rate": 3.99968070751976e-05, "loss": 0.6022, "num_input_tokens_seen": 89289309, "step": 23, "train_runtime": 514.4518, "train_tokens_per_second": 173562.054 }, { "epoch": 0.008513657325292657, "grad_norm": 0.9147783517837524, "learning_rate": 3.999639549959828e-05, "loss": 0.6131, "num_input_tokens_seen": 93227233, "step": 24, "train_runtime": 540.089, "train_tokens_per_second": 172614.561 }, { "epoch": 0.00886839304717985, "grad_norm": 0.7977441549301147, "learning_rate": 3.999595898310857e-05, "loss": 0.5963, "num_input_tokens_seen": 97050627, "step": 25, "train_runtime": 564.9358, "train_tokens_per_second": 171790.533 }, { "epoch": 0.009223128769067045, "grad_norm": 0.5944493412971497, "learning_rate": 3.9995497526272926e-05, "loss": 0.5986, "num_input_tokens_seen": 100975843, "step": 26, "train_runtime": 598.2484, "train_tokens_per_second": 168785.822 }, { "epoch": 0.00957786449095424, "grad_norm": 0.7259588837623596, "learning_rate": 3.99950111296669e-05, "loss": 0.5851, "num_input_tokens_seen": 104817347, "step": 27, "train_runtime": 625.8043, "train_tokens_per_second": 167492.211 }, { "epoch": 0.009932600212841433, "grad_norm": 0.6456223726272583, "learning_rate": 3.999449979389716e-05, "loss": 0.5858, "num_input_tokens_seen": 108785109, "step": 28, "train_runtime": 655.4202, "train_tokens_per_second": 165977.653 }, { "epoch": 0.010287335934728628, "grad_norm": 0.5353304743766785, "learning_rate": 3.999396351960148e-05, "loss": 0.5867, "num_input_tokens_seen": 112676072, "step": 29, "train_runtime": 673.1623, "train_tokens_per_second": 167383.217 }, { "epoch": 0.010642071656615822, "grad_norm": 0.5961151123046875, "learning_rate": 3.999340230744875e-05, "loss": 0.5768, "num_input_tokens_seen": 116622672, "step": 30, "train_runtime": 693.5642, "train_tokens_per_second": 168149.789 }, { "epoch": 0.010996807378503014, "grad_norm": 0.6669013500213623, "learning_rate": 3.9992816158138935e-05, "loss": 0.5755, "num_input_tokens_seen": 120492238, "step": 31, "train_runtime": 711.2411, "train_tokens_per_second": 169411.249 }, { "epoch": 0.011351543100390209, "grad_norm": 0.4966793656349182, "learning_rate": 3.9992205072403136e-05, "loss": 0.5817, "num_input_tokens_seen": 124310378, "step": 32, "train_runtime": 730.2924, "train_tokens_per_second": 170220.003 }, { "epoch": 0.011706278822277403, "grad_norm": 0.46290192008018494, "learning_rate": 3.999156905100353e-05, "loss": 0.5692, "num_input_tokens_seen": 128233600, "step": 33, "train_runtime": 754.4852, "train_tokens_per_second": 169961.711 }, { "epoch": 0.012061014544164597, "grad_norm": 0.4880334436893463, "learning_rate": 3.999090809473341e-05, "loss": 0.5703, "num_input_tokens_seen": 132100282, "step": 34, "train_runtime": 774.8074, "train_tokens_per_second": 170494.342 }, { "epoch": 0.012415750266051791, "grad_norm": 0.4206273555755615, "learning_rate": 3.9990222204417174e-05, "loss": 0.5729, "num_input_tokens_seen": 135945855, "step": 35, "train_runtime": 809.5063, "train_tokens_per_second": 167936.754 }, { "epoch": 0.012770485987938986, "grad_norm": 0.41888782382011414, "learning_rate": 3.998951138091031e-05, "loss": 0.5772, "num_input_tokens_seen": 139830096, "step": 36, "train_runtime": 828.4036, "train_tokens_per_second": 168794.647 }, { "epoch": 0.01312522170982618, "grad_norm": 0.5227063298225403, "learning_rate": 3.998877562509939e-05, "loss": 0.5625, "num_input_tokens_seen": 143728247, "step": 37, "train_runtime": 852.5844, "train_tokens_per_second": 168579.498 }, { "epoch": 0.013479957431713374, "grad_norm": 0.4039026200771332, "learning_rate": 3.998801493790211e-05, "loss": 0.5589, "num_input_tokens_seen": 147621628, "step": 38, "train_runtime": 876.0881, "train_tokens_per_second": 168500.889 }, { "epoch": 0.013834693153600568, "grad_norm": 0.5043486952781677, "learning_rate": 3.9987229320267265e-05, "loss": 0.553, "num_input_tokens_seen": 151532182, "step": 39, "train_runtime": 898.6599, "train_tokens_per_second": 168620.164 }, { "epoch": 0.014189428875487761, "grad_norm": 0.40140554308891296, "learning_rate": 3.998641877317471e-05, "loss": 0.5645, "num_input_tokens_seen": 155415796, "step": 40, "train_runtime": 925.8843, "train_tokens_per_second": 167856.599 }, { "epoch": 0.014544164597374955, "grad_norm": 0.3369249403476715, "learning_rate": 3.998558329763544e-05, "loss": 0.5551, "num_input_tokens_seen": 159273419, "step": 41, "train_runtime": 954.8376, "train_tokens_per_second": 166806.818 }, { "epoch": 0.01489890031926215, "grad_norm": 0.443901389837265, "learning_rate": 3.99847228946915e-05, "loss": 0.5631, "num_input_tokens_seen": 163153054, "step": 42, "train_runtime": 981.9417, "train_tokens_per_second": 166153.5 }, { "epoch": 0.015253636041149344, "grad_norm": 0.37935540080070496, "learning_rate": 3.9983837565416044e-05, "loss": 0.5641, "num_input_tokens_seen": 167029591, "step": 43, "train_runtime": 1007.7339, "train_tokens_per_second": 165747.718 }, { "epoch": 0.015608371763036538, "grad_norm": 0.38591811060905457, "learning_rate": 3.998292731091332e-05, "loss": 0.5625, "num_input_tokens_seen": 170837133, "step": 44, "train_runtime": 1040.6682, "train_tokens_per_second": 164161.002 }, { "epoch": 0.01596310748492373, "grad_norm": 0.3942011892795563, "learning_rate": 3.9981992132318665e-05, "loss": 0.5398, "num_input_tokens_seen": 174805294, "step": 45, "train_runtime": 1068.1623, "train_tokens_per_second": 163650.497 }, { "epoch": 0.016317843206810925, "grad_norm": 0.4780094623565674, "learning_rate": 3.9981032030798494e-05, "loss": 0.5614, "num_input_tokens_seen": 178616018, "step": 46, "train_runtime": 1095.4541, "train_tokens_per_second": 163052.037 }, { "epoch": 0.01667257892869812, "grad_norm": 0.6414566040039062, "learning_rate": 3.998004700755031e-05, "loss": 0.5465, "num_input_tokens_seen": 182527106, "step": 47, "train_runtime": 1116.7201, "train_tokens_per_second": 163449.295 }, { "epoch": 0.017027314650585313, "grad_norm": 0.9614126086235046, "learning_rate": 3.997903706380271e-05, "loss": 0.5498, "num_input_tokens_seen": 186370701, "step": 48, "train_runtime": 1140.7159, "train_tokens_per_second": 163380.468 }, { "epoch": 0.017382050372472507, "grad_norm": 1.2105962038040161, "learning_rate": 3.997800220081535e-05, "loss": 0.5451, "num_input_tokens_seen": 190209225, "step": 49, "train_runtime": 1172.6643, "train_tokens_per_second": 162202.619 }, { "epoch": 0.0177367860943597, "grad_norm": 0.481635183095932, "learning_rate": 3.997694241987901e-05, "loss": 0.5282, "num_input_tokens_seen": 194178799, "step": 50, "train_runtime": 1191.9482, "train_tokens_per_second": 162908.761 }, { "epoch": 0.018091521816246896, "grad_norm": 0.9815385341644287, "learning_rate": 3.997585772231549e-05, "loss": 0.5574, "num_input_tokens_seen": 198066457, "step": 51, "train_runtime": 1221.3691, "train_tokens_per_second": 162167.573 }, { "epoch": 0.01844625753813409, "grad_norm": 0.9300844073295593, "learning_rate": 3.997474810947773e-05, "loss": 0.548, "num_input_tokens_seen": 201977762, "step": 52, "train_runtime": 1245.0637, "train_tokens_per_second": 162222.829 }, { "epoch": 0.018800993260021284, "grad_norm": 0.5128892064094543, "learning_rate": 3.997361358274969e-05, "loss": 0.5371, "num_input_tokens_seen": 205861790, "step": 53, "train_runtime": 1272.534, "train_tokens_per_second": 161773.118 }, { "epoch": 0.01915572898190848, "grad_norm": 0.7819227576255798, "learning_rate": 3.997245414354645e-05, "loss": 0.5309, "num_input_tokens_seen": 209837357, "step": 54, "train_runtime": 1297.7465, "train_tokens_per_second": 161693.644 }, { "epoch": 0.019510464703795673, "grad_norm": 0.6350584626197815, "learning_rate": 3.997126979331413e-05, "loss": 0.545, "num_input_tokens_seen": 213663401, "step": 55, "train_runtime": 1325.8294, "train_tokens_per_second": 161154.52 }, { "epoch": 0.019865200425682867, "grad_norm": 0.5151866674423218, "learning_rate": 3.997006053352994e-05, "loss": 0.5383, "num_input_tokens_seen": 217569773, "step": 56, "train_runtime": 1351.9366, "train_tokens_per_second": 160931.941 }, { "epoch": 0.02021993614757006, "grad_norm": 0.6919141411781311, "learning_rate": 3.996882636570215e-05, "loss": 0.5479, "num_input_tokens_seen": 221440016, "step": 57, "train_runtime": 1377.4958, "train_tokens_per_second": 160755.487 }, { "epoch": 0.020574671869457255, "grad_norm": 0.5004721879959106, "learning_rate": 3.996756729137011e-05, "loss": 0.5366, "num_input_tokens_seen": 225266556, "step": 58, "train_runtime": 1398.3686, "train_tokens_per_second": 161092.405 }, { "epoch": 0.02092940759134445, "grad_norm": 0.7219316363334656, "learning_rate": 3.996628331210421e-05, "loss": 0.5346, "num_input_tokens_seen": 229234530, "step": 59, "train_runtime": 1427.5033, "train_tokens_per_second": 160584.241 }, { "epoch": 0.021284143313231644, "grad_norm": 0.7143905758857727, "learning_rate": 3.996497442950592e-05, "loss": 0.5282, "num_input_tokens_seen": 233092182, "step": 60, "train_runtime": 1450.9046, "train_tokens_per_second": 160653.001 }, { "epoch": 0.021638879035118838, "grad_norm": 0.888725757598877, "learning_rate": 3.996364064520777e-05, "loss": 0.5325, "num_input_tokens_seen": 237063114, "step": 61, "train_runtime": 1474.659, "train_tokens_per_second": 160757.919 }, { "epoch": 0.02199361475700603, "grad_norm": 1.008042573928833, "learning_rate": 3.9962281960873366e-05, "loss": 0.5577, "num_input_tokens_seen": 240916079, "step": 62, "train_runtime": 1491.1654, "train_tokens_per_second": 161562.277 }, { "epoch": 0.022348350478893223, "grad_norm": 0.9338603615760803, "learning_rate": 3.9960898378197324e-05, "loss": 0.5418, "num_input_tokens_seen": 244868171, "step": 63, "train_runtime": 1508.0798, "train_tokens_per_second": 162370.826 }, { "epoch": 0.022703086200780417, "grad_norm": 0.5956737995147705, "learning_rate": 3.9959489898905366e-05, "loss": 0.5259, "num_input_tokens_seen": 248793597, "step": 64, "train_runtime": 1533.488, "train_tokens_per_second": 162240.329 }, { "epoch": 0.02305782192266761, "grad_norm": 0.6041797995567322, "learning_rate": 3.995805652475424e-05, "loss": 0.5331, "num_input_tokens_seen": 252673897, "step": 65, "train_runtime": 1553.9805, "train_tokens_per_second": 162597.853 }, { "epoch": 0.023412557644554806, "grad_norm": 0.6845144629478455, "learning_rate": 3.995659825753174e-05, "loss": 0.5273, "num_input_tokens_seen": 256601842, "step": 66, "train_runtime": 1587.5351, "train_tokens_per_second": 161635.384 }, { "epoch": 0.023767293366442, "grad_norm": 0.5089904069900513, "learning_rate": 3.995511509905673e-05, "loss": 0.5152, "num_input_tokens_seen": 260535858, "step": 67, "train_runtime": 1607.3791, "train_tokens_per_second": 162087.376 }, { "epoch": 0.024122029088329194, "grad_norm": 0.6634523272514343, "learning_rate": 3.99536070511791e-05, "loss": 0.5402, "num_input_tokens_seen": 264411942, "step": 68, "train_runtime": 1632.2688, "train_tokens_per_second": 161990.437 }, { "epoch": 0.02447676481021639, "grad_norm": 0.5291318893432617, "learning_rate": 3.99520741157798e-05, "loss": 0.5199, "num_input_tokens_seen": 268219696, "step": 69, "train_runtime": 1656.6058, "train_tokens_per_second": 161909.187 }, { "epoch": 0.024831500532103583, "grad_norm": 0.5262149572372437, "learning_rate": 3.99505162947708e-05, "loss": 0.5229, "num_input_tokens_seen": 272089567, "step": 70, "train_runtime": 1681.3586, "train_tokens_per_second": 161827.204 }, { "epoch": 0.025186236253990777, "grad_norm": 0.48122671246528625, "learning_rate": 3.9948933590095135e-05, "loss": 0.5263, "num_input_tokens_seen": 276000207, "step": 71, "train_runtime": 1703.8785, "train_tokens_per_second": 161983.506 }, { "epoch": 0.02554097197587797, "grad_norm": 0.4533829987049103, "learning_rate": 3.994732600372686e-05, "loss": 0.516, "num_input_tokens_seen": 279899066, "step": 72, "train_runtime": 1722.4474, "train_tokens_per_second": 162500.787 }, { "epoch": 0.025895707697765166, "grad_norm": 0.5013334155082703, "learning_rate": 3.994569353767107e-05, "loss": 0.5256, "num_input_tokens_seen": 283793284, "step": 73, "train_runtime": 1751.4831, "train_tokens_per_second": 162030.271 }, { "epoch": 0.02625044341965236, "grad_norm": 0.49290770292282104, "learning_rate": 3.9944036193963885e-05, "loss": 0.5262, "num_input_tokens_seen": 287670621, "step": 74, "train_runtime": 1775.5073, "train_tokens_per_second": 162021.651 }, { "epoch": 0.026605179141539554, "grad_norm": 0.43674829602241516, "learning_rate": 3.994235397467246e-05, "loss": 0.5179, "num_input_tokens_seen": 291536034, "step": 75, "train_runtime": 1801.6589, "train_tokens_per_second": 161815.33 }, { "epoch": 0.026959914863426748, "grad_norm": 0.4117055833339691, "learning_rate": 3.994064688189498e-05, "loss": 0.5214, "num_input_tokens_seen": 295412367, "step": 76, "train_runtime": 1827.7745, "train_tokens_per_second": 161624.077 }, { "epoch": 0.027314650585313942, "grad_norm": 0.42348745465278625, "learning_rate": 3.993891491776065e-05, "loss": 0.5326, "num_input_tokens_seen": 299278459, "step": 77, "train_runtime": 1858.0767, "train_tokens_per_second": 161068.951 }, { "epoch": 0.027669386307201137, "grad_norm": 0.3981715440750122, "learning_rate": 3.993715808442968e-05, "loss": 0.5167, "num_input_tokens_seen": 303178917, "step": 78, "train_runtime": 1885.4503, "train_tokens_per_second": 160799.205 }, { "epoch": 0.028024122029088328, "grad_norm": 0.43623870611190796, "learning_rate": 3.9935376384093336e-05, "loss": 0.5102, "num_input_tokens_seen": 307052734, "step": 79, "train_runtime": 1913.1353, "train_tokens_per_second": 160497.135 }, { "epoch": 0.028378857750975522, "grad_norm": 0.4211648106575012, "learning_rate": 3.993356981897387e-05, "loss": 0.5231, "num_input_tokens_seen": 310913145, "step": 80, "train_runtime": 1946.668, "train_tokens_per_second": 159715.55 }, { "epoch": 0.028733593472862716, "grad_norm": 0.4384077489376068, "learning_rate": 3.993173839132455e-05, "loss": 0.5122, "num_input_tokens_seen": 314800133, "step": 81, "train_runtime": 1976.7857, "train_tokens_per_second": 159248.486 }, { "epoch": 0.02908832919474991, "grad_norm": 0.45680293440818787, "learning_rate": 3.992988210342966e-05, "loss": 0.5237, "num_input_tokens_seen": 318704329, "step": 82, "train_runtime": 2010.2241, "train_tokens_per_second": 158541.694 }, { "epoch": 0.029443064916637104, "grad_norm": 0.5009905099868774, "learning_rate": 3.99280009576045e-05, "loss": 0.5093, "num_input_tokens_seen": 322612454, "step": 83, "train_runtime": 2033.0105, "train_tokens_per_second": 158687.058 }, { "epoch": 0.0297978006385243, "grad_norm": 0.49878960847854614, "learning_rate": 3.9926094956195356e-05, "loss": 0.5084, "num_input_tokens_seen": 326540091, "step": 84, "train_runtime": 2061.9012, "train_tokens_per_second": 158368.448 }, { "epoch": 0.030152536360411493, "grad_norm": 0.4919455349445343, "learning_rate": 3.992416410157953e-05, "loss": 0.5268, "num_input_tokens_seen": 330378582, "step": 85, "train_runtime": 2080.8077, "train_tokens_per_second": 158774.199 }, { "epoch": 0.030507272082298687, "grad_norm": 0.3973747491836548, "learning_rate": 3.9922208396165316e-05, "loss": 0.5035, "num_input_tokens_seen": 334265537, "step": 86, "train_runtime": 2107.8257, "train_tokens_per_second": 158583.1 }, { "epoch": 0.03086200780418588, "grad_norm": 0.3015368580818176, "learning_rate": 3.992022784239201e-05, "loss": 0.5131, "num_input_tokens_seen": 338176911, "step": 87, "train_runtime": 2130.7254, "train_tokens_per_second": 158714.448 }, { "epoch": 0.031216743526073076, "grad_norm": 2.8588054180145264, "learning_rate": 3.9918222442729885e-05, "loss": 0.5051, "num_input_tokens_seen": 342096392, "step": 88, "train_runtime": 2159.6682, "train_tokens_per_second": 158402.291 }, { "epoch": 0.03157147924796027, "grad_norm": 0.43037283420562744, "learning_rate": 3.9916192199680225e-05, "loss": 0.5136, "num_input_tokens_seen": 345987855, "step": 89, "train_runtime": 2194.3072, "train_tokens_per_second": 157675.211 }, { "epoch": 0.03192621496984746, "grad_norm": 0.5554024577140808, "learning_rate": 3.991413711577529e-05, "loss": 0.512, "num_input_tokens_seen": 349834863, "step": 90, "train_runtime": 2223.652, "train_tokens_per_second": 157324.463 }, { "epoch": 0.03228095069173466, "grad_norm": 0.5606537461280823, "learning_rate": 3.991205719357831e-05, "loss": 0.518, "num_input_tokens_seen": 353694697, "step": 91, "train_runtime": 2245.5207, "train_tokens_per_second": 157511.214 }, { "epoch": 0.03263568641362185, "grad_norm": 0.5129927396774292, "learning_rate": 3.9909952435683524e-05, "loss": 0.5083, "num_input_tokens_seen": 357667916, "step": 92, "train_runtime": 2269.1952, "train_tokens_per_second": 157618.844 }, { "epoch": 0.03299042213550905, "grad_norm": 0.44299882650375366, "learning_rate": 3.990782284471612e-05, "loss": 0.5146, "num_input_tokens_seen": 361496906, "step": 93, "train_runtime": 2305.6299, "train_tokens_per_second": 156788.783 }, { "epoch": 0.03334515785739624, "grad_norm": 0.47308510541915894, "learning_rate": 3.990566842333228e-05, "loss": 0.5193, "num_input_tokens_seen": 365422013, "step": 94, "train_runtime": 2330.7094, "train_tokens_per_second": 156785.744 }, { "epoch": 0.033699893579283435, "grad_norm": 0.5903277397155762, "learning_rate": 3.9903489174219144e-05, "loss": 0.5036, "num_input_tokens_seen": 369317490, "step": 95, "train_runtime": 2354.4242, "train_tokens_per_second": 156861.065 }, { "epoch": 0.034054629301170626, "grad_norm": 0.6449141502380371, "learning_rate": 3.990128510009482e-05, "loss": 0.5152, "num_input_tokens_seen": 373117790, "step": 96, "train_runtime": 2387.7938, "train_tokens_per_second": 156260.473 }, { "epoch": 0.034409365023057824, "grad_norm": 0.6022350788116455, "learning_rate": 3.989905620370839e-05, "loss": 0.5178, "num_input_tokens_seen": 377012142, "step": 97, "train_runtime": 2414.739, "train_tokens_per_second": 156129.563 }, { "epoch": 0.034764100744945015, "grad_norm": 0.5150610208511353, "learning_rate": 3.9896802487839884e-05, "loss": 0.5174, "num_input_tokens_seen": 380909410, "step": 98, "train_runtime": 2437.9858, "train_tokens_per_second": 156239.39 }, { "epoch": 0.03511883646683221, "grad_norm": 0.5354108810424805, "learning_rate": 3.9894523955300284e-05, "loss": 0.5204, "num_input_tokens_seen": 384800913, "step": 99, "train_runtime": 2461.5463, "train_tokens_per_second": 156324.875 }, { "epoch": 0.0354735721887194, "grad_norm": 0.6937741637229919, "learning_rate": 3.9892220608931544e-05, "loss": 0.5139, "num_input_tokens_seen": 388644274, "step": 100, "train_runtime": 2485.1009, "train_tokens_per_second": 156389.739 }, { "epoch": 0.0358283079106066, "grad_norm": 0.848224937915802, "learning_rate": 3.9889892451606557e-05, "loss": 0.506, "num_input_tokens_seen": 392569459, "step": 101, "train_runtime": 2505.5818, "train_tokens_per_second": 156677.968 }, { "epoch": 0.03618304363249379, "grad_norm": 0.6934736371040344, "learning_rate": 3.988753948622916e-05, "loss": 0.5166, "num_input_tokens_seen": 396474540, "step": 102, "train_runtime": 2523.2848, "train_tokens_per_second": 157126.352 }, { "epoch": 0.03653777935438099, "grad_norm": 0.4044578969478607, "learning_rate": 3.9885161715734135e-05, "loss": 0.4934, "num_input_tokens_seen": 400367007, "step": 103, "train_runtime": 2546.9417, "train_tokens_per_second": 157195.198 }, { "epoch": 0.03689251507626818, "grad_norm": 0.5699591636657715, "learning_rate": 3.9882759143087194e-05, "loss": 0.5203, "num_input_tokens_seen": 404275230, "step": 104, "train_runtime": 2570.1442, "train_tokens_per_second": 157296.71 }, { "epoch": 0.03724725079815538, "grad_norm": 0.5560294985771179, "learning_rate": 3.9880331771285e-05, "loss": 0.4983, "num_input_tokens_seen": 408137406, "step": 105, "train_runtime": 2589.2891, "train_tokens_per_second": 157625.273 }, { "epoch": 0.03760198652004257, "grad_norm": 0.388006329536438, "learning_rate": 3.9877879603355144e-05, "loss": 0.5062, "num_input_tokens_seen": 412026776, "step": 106, "train_runtime": 2612.0925, "train_tokens_per_second": 157738.202 }, { "epoch": 0.03795672224192976, "grad_norm": 0.380976140499115, "learning_rate": 3.9875402642356136e-05, "loss": 0.4952, "num_input_tokens_seen": 415925376, "step": 107, "train_runtime": 2639.6838, "train_tokens_per_second": 157566.361 }, { "epoch": 0.03831145796381696, "grad_norm": 0.4127376973628998, "learning_rate": 3.987290089137741e-05, "loss": 0.4927, "num_input_tokens_seen": 419868602, "step": 108, "train_runtime": 2662.4599, "train_tokens_per_second": 157699.503 }, { "epoch": 0.03866619368570415, "grad_norm": 0.39321979880332947, "learning_rate": 3.987037435353933e-05, "loss": 0.5022, "num_input_tokens_seen": 423776296, "step": 109, "train_runtime": 2695.6927, "train_tokens_per_second": 157204.974 }, { "epoch": 0.039020929407591345, "grad_norm": 0.4572235643863678, "learning_rate": 3.9867823031993154e-05, "loss": 0.4981, "num_input_tokens_seen": 427747889, "step": 110, "train_runtime": 2715.4272, "train_tokens_per_second": 157525.082 }, { "epoch": 0.039375665129478536, "grad_norm": 0.4462689459323883, "learning_rate": 3.9865246929921076e-05, "loss": 0.5156, "num_input_tokens_seen": 431547749, "step": 111, "train_runtime": 2740.5526, "train_tokens_per_second": 157467.418 }, { "epoch": 0.039730400851365734, "grad_norm": 0.5381746888160706, "learning_rate": 3.98626460505362e-05, "loss": 0.4926, "num_input_tokens_seen": 435447880, "step": 112, "train_runtime": 2763.6829, "train_tokens_per_second": 157560.724 }, { "epoch": 0.040085136573252925, "grad_norm": 0.5972180366516113, "learning_rate": 3.9860020397082516e-05, "loss": 0.4875, "num_input_tokens_seen": 439314236, "step": 113, "train_runtime": 2789.5136, "train_tokens_per_second": 157487.754 }, { "epoch": 0.04043987229514012, "grad_norm": 0.6147491931915283, "learning_rate": 3.985736997283491e-05, "loss": 0.4954, "num_input_tokens_seen": 443226572, "step": 114, "train_runtime": 2823.5308, "train_tokens_per_second": 156976.001 }, { "epoch": 0.04079460801702731, "grad_norm": 0.5261881351470947, "learning_rate": 3.9854694781099184e-05, "loss": 0.5083, "num_input_tokens_seen": 447045176, "step": 115, "train_runtime": 2849.4573, "train_tokens_per_second": 156887.834 }, { "epoch": 0.04114934373891451, "grad_norm": 0.36811837553977966, "learning_rate": 3.9851994825212024e-05, "loss": 0.5119, "num_input_tokens_seen": 450993789, "step": 116, "train_runtime": 2867.2085, "train_tokens_per_second": 157293.685 }, { "epoch": 0.0415040794608017, "grad_norm": 0.3951771557331085, "learning_rate": 3.984927010854099e-05, "loss": 0.4895, "num_input_tokens_seen": 454857393, "step": 117, "train_runtime": 2893.9135, "train_tokens_per_second": 157177.261 }, { "epoch": 0.0418588151826889, "grad_norm": 0.6010317802429199, "learning_rate": 3.9846520634484564e-05, "loss": 0.4986, "num_input_tokens_seen": 458797430, "step": 118, "train_runtime": 2914.8754, "train_tokens_per_second": 157398.641 }, { "epoch": 0.04221355090457609, "grad_norm": 0.5483181476593018, "learning_rate": 3.9843746406472055e-05, "loss": 0.5, "num_input_tokens_seen": 462718069, "step": 119, "train_runtime": 2940.8665, "train_tokens_per_second": 157340.727 }, { "epoch": 0.04256828662646329, "grad_norm": 0.3607918322086334, "learning_rate": 3.984094742796368e-05, "loss": 0.4934, "num_input_tokens_seen": 466565847, "step": 120, "train_runtime": 2967.387, "train_tokens_per_second": 157231.21 }, { "epoch": 0.04292302234835048, "grad_norm": 0.4084886908531189, "learning_rate": 3.9838123702450525e-05, "loss": 0.4846, "num_input_tokens_seen": 470499963, "step": 121, "train_runtime": 2993.6298, "train_tokens_per_second": 157167.052 }, { "epoch": 0.043277758070237676, "grad_norm": 0.4939357042312622, "learning_rate": 3.983527523345453e-05, "loss": 0.4924, "num_input_tokens_seen": 474305032, "step": 122, "train_runtime": 3034.9959, "train_tokens_per_second": 156278.639 }, { "epoch": 0.04363249379212487, "grad_norm": 0.4633379876613617, "learning_rate": 3.983240202452851e-05, "loss": 0.5103, "num_input_tokens_seen": 478284377, "step": 123, "train_runtime": 3064.0271, "train_tokens_per_second": 156096.656 }, { "epoch": 0.04398722951401206, "grad_norm": 0.36714237928390503, "learning_rate": 3.9829504079256114e-05, "loss": 0.5006, "num_input_tokens_seen": 482122586, "step": 124, "train_runtime": 3085.4878, "train_tokens_per_second": 156254.9 }, { "epoch": 0.044341965235899256, "grad_norm": 0.3552481532096863, "learning_rate": 3.982658140125188e-05, "loss": 0.4939, "num_input_tokens_seen": 485981707, "step": 125, "train_runtime": 3112.9212, "train_tokens_per_second": 156117.575 }, { "epoch": 0.044696700957786446, "grad_norm": 0.4539552927017212, "learning_rate": 3.982363399416116e-05, "loss": 0.4921, "num_input_tokens_seen": 489871783, "step": 126, "train_runtime": 3140.1554, "train_tokens_per_second": 156002.402 }, { "epoch": 0.045051436679673644, "grad_norm": 0.5406495332717896, "learning_rate": 3.9820661861660166e-05, "loss": 0.4947, "num_input_tokens_seen": 493683494, "step": 127, "train_runtime": 3166.0247, "train_tokens_per_second": 155931.664 }, { "epoch": 0.045406172401560835, "grad_norm": 0.41733577847480774, "learning_rate": 3.9817665007455964e-05, "loss": 0.4955, "num_input_tokens_seen": 497569861, "step": 128, "train_runtime": 3190.4461, "train_tokens_per_second": 155956.203 }, { "epoch": 0.04576090812344803, "grad_norm": 0.37230923771858215, "learning_rate": 3.981464343528642e-05, "loss": 0.4997, "num_input_tokens_seen": 501505742, "step": 129, "train_runtime": 3220.0095, "train_tokens_per_second": 155746.664 }, { "epoch": 0.04611564384533522, "grad_norm": 0.5505240559577942, "learning_rate": 3.9811597148920246e-05, "loss": 0.5017, "num_input_tokens_seen": 505349523, "step": 130, "train_runtime": 3240.2165, "train_tokens_per_second": 155961.655 }, { "epoch": 0.04647037956722242, "grad_norm": 0.6588237881660461, "learning_rate": 3.980852615215701e-05, "loss": 0.5021, "num_input_tokens_seen": 509258477, "step": 131, "train_runtime": 3267.144, "train_tokens_per_second": 155872.678 }, { "epoch": 0.04682511528910961, "grad_norm": 0.7694488763809204, "learning_rate": 3.980543044882703e-05, "loss": 0.5059, "num_input_tokens_seen": 513158580, "step": 132, "train_runtime": 3297.057, "train_tokens_per_second": 155641.404 }, { "epoch": 0.04717985101099681, "grad_norm": 0.8401789665222168, "learning_rate": 3.980231004279151e-05, "loss": 0.4961, "num_input_tokens_seen": 516992419, "step": 133, "train_runtime": 3317.3398, "train_tokens_per_second": 155845.482 }, { "epoch": 0.047534586732884, "grad_norm": 0.7816863656044006, "learning_rate": 3.979916493794244e-05, "loss": 0.5057, "num_input_tokens_seen": 520830080, "step": 134, "train_runtime": 3342.7631, "train_tokens_per_second": 155808.255 }, { "epoch": 0.0478893224547712, "grad_norm": 0.4462583065032959, "learning_rate": 3.9795995138202596e-05, "loss": 0.4958, "num_input_tokens_seen": 524676917, "step": 135, "train_runtime": 3369.8314, "train_tokens_per_second": 155698.269 }, { "epoch": 0.04824405817665839, "grad_norm": 0.5488406419754028, "learning_rate": 3.9792800647525575e-05, "loss": 0.4925, "num_input_tokens_seen": 528581821, "step": 136, "train_runtime": 3402.083, "train_tokens_per_second": 155370.056 }, { "epoch": 0.048598793898545586, "grad_norm": 1.056166172027588, "learning_rate": 3.978958146989578e-05, "loss": 0.5076, "num_input_tokens_seen": 532455246, "step": 137, "train_runtime": 3434.5388, "train_tokens_per_second": 155029.618 }, { "epoch": 0.04895352962043278, "grad_norm": 0.43731772899627686, "learning_rate": 3.9786337609328374e-05, "loss": 0.4791, "num_input_tokens_seen": 536283214, "step": 138, "train_runtime": 3460.8248, "train_tokens_per_second": 154958.208 }, { "epoch": 0.049308265342319975, "grad_norm": 0.5201188325881958, "learning_rate": 3.978306906986934e-05, "loss": 0.4911, "num_input_tokens_seen": 540181214, "step": 139, "train_runtime": 3481.8429, "train_tokens_per_second": 155142.329 }, { "epoch": 0.049663001064207166, "grad_norm": 0.46922430396080017, "learning_rate": 3.977977585559542e-05, "loss": 0.4882, "num_input_tokens_seen": 544023549, "step": 140, "train_runtime": 3504.7981, "train_tokens_per_second": 155222.508 }, { "epoch": 0.050017736786094356, "grad_norm": 0.662368655204773, "learning_rate": 3.9776457970614136e-05, "loss": 0.4987, "num_input_tokens_seen": 547921919, "step": 141, "train_runtime": 3523.0366, "train_tokens_per_second": 155525.468 }, { "epoch": 0.050372472507981554, "grad_norm": 0.5297757387161255, "learning_rate": 3.977311541906379e-05, "loss": 0.4931, "num_input_tokens_seen": 551792093, "step": 142, "train_runtime": 3549.6114, "train_tokens_per_second": 155451.411 }, { "epoch": 0.050727208229868745, "grad_norm": 0.458456426858902, "learning_rate": 3.976974820511345e-05, "loss": 0.4872, "num_input_tokens_seen": 555745453, "step": 143, "train_runtime": 3574.6504, "train_tokens_per_second": 155468.476 }, { "epoch": 0.05108194395175594, "grad_norm": 0.5834052562713623, "learning_rate": 3.976635633296292e-05, "loss": 0.4878, "num_input_tokens_seen": 559589950, "step": 144, "train_runtime": 3598.8373, "train_tokens_per_second": 155491.872 }, { "epoch": 0.05143667967364313, "grad_norm": 0.7014194130897522, "learning_rate": 3.97629398068428e-05, "loss": 0.4773, "num_input_tokens_seen": 563431850, "step": 145, "train_runtime": 3626.9961, "train_tokens_per_second": 155343.935 }, { "epoch": 0.05179141539553033, "grad_norm": 0.43420204520225525, "learning_rate": 3.97594986310144e-05, "loss": 0.4888, "num_input_tokens_seen": 567434634, "step": 146, "train_runtime": 3653.1697, "train_tokens_per_second": 155326.655 }, { "epoch": 0.05214615111741752, "grad_norm": 0.44394513964653015, "learning_rate": 3.975603280976979e-05, "loss": 0.4814, "num_input_tokens_seen": 571269781, "step": 147, "train_runtime": 3675.7846, "train_tokens_per_second": 155414.381 }, { "epoch": 0.05250088683930472, "grad_norm": 0.3589637279510498, "learning_rate": 3.975254234743181e-05, "loss": 0.4967, "num_input_tokens_seen": 575174298, "step": 148, "train_runtime": 3702.7035, "train_tokens_per_second": 155339.009 }, { "epoch": 0.05285562256119191, "grad_norm": 0.48913124203681946, "learning_rate": 3.9749027248353986e-05, "loss": 0.5014, "num_input_tokens_seen": 578971903, "step": 149, "train_runtime": 3731.241, "train_tokens_per_second": 155168.725 }, { "epoch": 0.05321035828307911, "grad_norm": 0.4869951605796814, "learning_rate": 3.9745487516920584e-05, "loss": 0.4833, "num_input_tokens_seen": 582843232, "step": 150, "train_runtime": 3750.7296, "train_tokens_per_second": 155394.628 }, { "epoch": 0.0535650940049663, "grad_norm": 0.3559737205505371, "learning_rate": 3.974192315754663e-05, "loss": 0.496, "num_input_tokens_seen": 586736667, "step": 151, "train_runtime": 3782.6352, "train_tokens_per_second": 155113.205 }, { "epoch": 0.053919829726853497, "grad_norm": 0.4112870991230011, "learning_rate": 3.9738334174677816e-05, "loss": 0.4866, "num_input_tokens_seen": 590561421, "step": 152, "train_runtime": 3820.9086, "train_tokens_per_second": 154560.47 }, { "epoch": 0.05427456544874069, "grad_norm": 0.4540790021419525, "learning_rate": 3.9734720572790586e-05, "loss": 0.4861, "num_input_tokens_seen": 594517701, "step": 153, "train_runtime": 3849.2216, "train_tokens_per_second": 154451.408 }, { "epoch": 0.054629301170627885, "grad_norm": 0.4978800415992737, "learning_rate": 3.973108235639206e-05, "loss": 0.4984, "num_input_tokens_seen": 598444194, "step": 154, "train_runtime": 3867.0052, "train_tokens_per_second": 154756.5 }, { "epoch": 0.054984036892515076, "grad_norm": 0.4487268924713135, "learning_rate": 3.9727419530020086e-05, "loss": 0.4858, "num_input_tokens_seen": 602327521, "step": 155, "train_runtime": 3893.623, "train_tokens_per_second": 154695.902 }, { "epoch": 0.05533877261440227, "grad_norm": 0.32056644558906555, "learning_rate": 3.9723732098243186e-05, "loss": 0.4815, "num_input_tokens_seen": 606183111, "step": 156, "train_runtime": 3922.8716, "train_tokens_per_second": 154525.352 }, { "epoch": 0.055693508336289464, "grad_norm": 0.37670671939849854, "learning_rate": 3.972002006566059e-05, "loss": 0.486, "num_input_tokens_seen": 610035582, "step": 157, "train_runtime": 3950.475, "train_tokens_per_second": 154420.819 }, { "epoch": 0.056048244058176655, "grad_norm": 0.4474126398563385, "learning_rate": 3.9716283436902194e-05, "loss": 0.4794, "num_input_tokens_seen": 613956177, "step": 158, "train_runtime": 3972.6882, "train_tokens_per_second": 154544.264 }, { "epoch": 0.05640297978006385, "grad_norm": 0.5084024667739868, "learning_rate": 3.971252221662858e-05, "loss": 0.4922, "num_input_tokens_seen": 617833224, "step": 159, "train_runtime": 4003.613, "train_tokens_per_second": 154318.917 }, { "epoch": 0.056757715501951043, "grad_norm": 0.4801258444786072, "learning_rate": 3.970873640953101e-05, "loss": 0.4925, "num_input_tokens_seen": 621737465, "step": 160, "train_runtime": 4026.0467, "train_tokens_per_second": 154428.775 }, { "epoch": 0.05711245122383824, "grad_norm": 0.4814443588256836, "learning_rate": 3.9704926020331404e-05, "loss": 0.4749, "num_input_tokens_seen": 625594895, "step": 161, "train_runtime": 4051.3103, "train_tokens_per_second": 154417.915 }, { "epoch": 0.05746718694572543, "grad_norm": 0.43517211079597473, "learning_rate": 3.970109105378234e-05, "loss": 0.4817, "num_input_tokens_seen": 629486471, "step": 162, "train_runtime": 4075.6527, "train_tokens_per_second": 154450.469 }, { "epoch": 0.05782192266761263, "grad_norm": 0.4131617248058319, "learning_rate": 3.9697231514667046e-05, "loss": 0.4727, "num_input_tokens_seen": 633433234, "step": 163, "train_runtime": 4102.4623, "train_tokens_per_second": 154403.181 }, { "epoch": 0.05817665838949982, "grad_norm": 0.4073924720287323, "learning_rate": 3.969334740779942e-05, "loss": 0.4838, "num_input_tokens_seen": 637285291, "step": 164, "train_runtime": 4129.0819, "train_tokens_per_second": 154340.676 }, { "epoch": 0.05853139411138702, "grad_norm": 0.4632602334022522, "learning_rate": 3.9689438738023985e-05, "loss": 0.4913, "num_input_tokens_seen": 641259905, "step": 165, "train_runtime": 4152.273, "train_tokens_per_second": 154435.871 }, { "epoch": 0.05888612983327421, "grad_norm": 0.47688356041908264, "learning_rate": 3.9685505510215905e-05, "loss": 0.4813, "num_input_tokens_seen": 645089955, "step": 166, "train_runtime": 4185.4618, "train_tokens_per_second": 154126.353 }, { "epoch": 0.05924086555516141, "grad_norm": 0.4781550168991089, "learning_rate": 3.968154772928097e-05, "loss": 0.4788, "num_input_tokens_seen": 649033653, "step": 167, "train_runtime": 4217.8037, "train_tokens_per_second": 153879.53 }, { "epoch": 0.0595956012770486, "grad_norm": 0.41193506121635437, "learning_rate": 3.967756540015561e-05, "loss": 0.4863, "num_input_tokens_seen": 652907351, "step": 168, "train_runtime": 4243.4606, "train_tokens_per_second": 153862.004 }, { "epoch": 0.059950336998935795, "grad_norm": 0.42754465341567993, "learning_rate": 3.967355852780685e-05, "loss": 0.4821, "num_input_tokens_seen": 656828325, "step": 169, "train_runtime": 4267.7099, "train_tokens_per_second": 153906.508 }, { "epoch": 0.060305072720822986, "grad_norm": 0.42541855573654175, "learning_rate": 3.9669527117232346e-05, "loss": 0.4808, "num_input_tokens_seen": 660701557, "step": 170, "train_runtime": 4301.3755, "train_tokens_per_second": 153602.392 }, { "epoch": 0.060659808442710184, "grad_norm": 0.4048808813095093, "learning_rate": 3.966547117346035e-05, "loss": 0.4879, "num_input_tokens_seen": 664588428, "step": 171, "train_runtime": 4324.6164, "train_tokens_per_second": 153675.696 }, { "epoch": 0.061014544164597374, "grad_norm": 0.3953753709793091, "learning_rate": 3.966139070154971e-05, "loss": 0.4755, "num_input_tokens_seen": 668477792, "step": 172, "train_runtime": 4358.0265, "train_tokens_per_second": 153390.024 }, { "epoch": 0.06136927988648457, "grad_norm": 0.3948153555393219, "learning_rate": 3.965728570658988e-05, "loss": 0.4683, "num_input_tokens_seen": 672327611, "step": 173, "train_runtime": 4391.819, "train_tokens_per_second": 153086.365 }, { "epoch": 0.06172401560837176, "grad_norm": 0.43367359042167664, "learning_rate": 3.96531561937009e-05, "loss": 0.4794, "num_input_tokens_seen": 676252821, "step": 174, "train_runtime": 4419.0796, "train_tokens_per_second": 153030.243 }, { "epoch": 0.06207875133025896, "grad_norm": 0.47016212344169617, "learning_rate": 3.964900216803338e-05, "loss": 0.4855, "num_input_tokens_seen": 680125952, "step": 175, "train_runtime": 4464.0403, "train_tokens_per_second": 152356.589 }, { "epoch": 0.06243348705214615, "grad_norm": 0.4836790859699249, "learning_rate": 3.9644823634768496e-05, "loss": 0.4777, "num_input_tokens_seen": 684019564, "step": 176, "train_runtime": 4491.8695, "train_tokens_per_second": 152279.484 }, { "epoch": 0.06278822277403334, "grad_norm": 0.4689669609069824, "learning_rate": 3.964062059911802e-05, "loss": 0.4853, "num_input_tokens_seen": 687905798, "step": 177, "train_runtime": 4511.203, "train_tokens_per_second": 152488.328 }, { "epoch": 0.06314295849592054, "grad_norm": 0.503004789352417, "learning_rate": 3.963639306632427e-05, "loss": 0.4997, "num_input_tokens_seen": 691776972, "step": 178, "train_runtime": 4532.1749, "train_tokens_per_second": 152636.866 }, { "epoch": 0.06349769421780774, "grad_norm": 0.4454919099807739, "learning_rate": 3.963214104166011e-05, "loss": 0.4923, "num_input_tokens_seen": 695665023, "step": 179, "train_runtime": 4552.2225, "train_tokens_per_second": 152818.765 }, { "epoch": 0.06385242993969492, "grad_norm": 0.4555080235004425, "learning_rate": 3.962786453042896e-05, "loss": 0.4735, "num_input_tokens_seen": 699492457, "step": 180, "train_runtime": 4574.5434, "train_tokens_per_second": 152909.786 }, { "epoch": 0.06420716566158212, "grad_norm": 0.531252920627594, "learning_rate": 3.9623563537964784e-05, "loss": 0.4855, "num_input_tokens_seen": 703398790, "step": 181, "train_runtime": 4591.2107, "train_tokens_per_second": 153205.514 }, { "epoch": 0.06456190138346932, "grad_norm": 0.49857935309410095, "learning_rate": 3.9619238069632084e-05, "loss": 0.4796, "num_input_tokens_seen": 707233795, "step": 182, "train_runtime": 4620.7686, "train_tokens_per_second": 153055.447 }, { "epoch": 0.06491663710535651, "grad_norm": 0.3705490827560425, "learning_rate": 3.9614888130825865e-05, "loss": 0.4896, "num_input_tokens_seen": 711223349, "step": 183, "train_runtime": 4640.5587, "train_tokens_per_second": 153262.439 }, { "epoch": 0.0652713728272437, "grad_norm": 0.5289691686630249, "learning_rate": 3.96105137269717e-05, "loss": 0.4734, "num_input_tokens_seen": 715160761, "step": 184, "train_runtime": 4675.2395, "train_tokens_per_second": 152967.726 }, { "epoch": 0.0656261085491309, "grad_norm": 0.6161568760871887, "learning_rate": 3.960611486352562e-05, "loss": 0.4817, "num_input_tokens_seen": 719031169, "step": 185, "train_runtime": 4698.5366, "train_tokens_per_second": 153033.003 }, { "epoch": 0.0659808442710181, "grad_norm": 0.6905276775360107, "learning_rate": 3.960169154597421e-05, "loss": 0.4809, "num_input_tokens_seen": 722864256, "step": 186, "train_runtime": 4722.9788, "train_tokens_per_second": 153052.616 }, { "epoch": 0.06633557999290529, "grad_norm": 0.6194483041763306, "learning_rate": 3.9597243779834536e-05, "loss": 0.475, "num_input_tokens_seen": 726760197, "step": 187, "train_runtime": 4754.5754, "train_tokens_per_second": 152854.911 }, { "epoch": 0.06669031571479248, "grad_norm": 0.5449129343032837, "learning_rate": 3.959277157065416e-05, "loss": 0.4799, "num_input_tokens_seen": 730595912, "step": 188, "train_runtime": 4774.532, "train_tokens_per_second": 153019.375 }, { "epoch": 0.06704505143667967, "grad_norm": 0.43782398104667664, "learning_rate": 3.958827492401112e-05, "loss": 0.4784, "num_input_tokens_seen": 734517651, "step": 189, "train_runtime": 4799.5921, "train_tokens_per_second": 153037.515 }, { "epoch": 0.06739978715856687, "grad_norm": 0.4514002501964569, "learning_rate": 3.958375384551396e-05, "loss": 0.4793, "num_input_tokens_seen": 738392521, "step": 190, "train_runtime": 4826.8465, "train_tokens_per_second": 152976.177 }, { "epoch": 0.06775452288045407, "grad_norm": 0.5548785924911499, "learning_rate": 3.9579208340801684e-05, "loss": 0.4739, "num_input_tokens_seen": 742278109, "step": 191, "train_runtime": 4853.7172, "train_tokens_per_second": 152929.823 }, { "epoch": 0.06810925860234125, "grad_norm": 0.5180841684341431, "learning_rate": 3.957463841554375e-05, "loss": 0.4771, "num_input_tokens_seen": 746213442, "step": 192, "train_runtime": 4883.3039, "train_tokens_per_second": 152809.136 }, { "epoch": 0.06846399432422845, "grad_norm": 0.36239928007125854, "learning_rate": 3.957004407544009e-05, "loss": 0.4774, "num_input_tokens_seen": 750059461, "step": 193, "train_runtime": 4904.2826, "train_tokens_per_second": 152939.691 }, { "epoch": 0.06881873004611565, "grad_norm": 0.44470763206481934, "learning_rate": 3.9565425326221086e-05, "loss": 0.4709, "num_input_tokens_seen": 754019362, "step": 194, "train_runtime": 4922.9385, "train_tokens_per_second": 153164.49 }, { "epoch": 0.06917346576800283, "grad_norm": 0.4628245532512665, "learning_rate": 3.956078217364755e-05, "loss": 0.4913, "num_input_tokens_seen": 757867064, "step": 195, "train_runtime": 4944.2491, "train_tokens_per_second": 153282.539 }, { "epoch": 0.06952820148989003, "grad_norm": 0.47045958042144775, "learning_rate": 3.9556114623510755e-05, "loss": 0.4764, "num_input_tokens_seen": 761788383, "step": 196, "train_runtime": 4975.7016, "train_tokens_per_second": 153101.701 }, { "epoch": 0.06988293721177723, "grad_norm": 0.4941798150539398, "learning_rate": 3.955142268163239e-05, "loss": 0.4693, "num_input_tokens_seen": 765620017, "step": 197, "train_runtime": 4997.4805, "train_tokens_per_second": 153201.201 }, { "epoch": 0.07023767293366442, "grad_norm": 0.6446677446365356, "learning_rate": 3.954670635386457e-05, "loss": 0.4715, "num_input_tokens_seen": 769568715, "step": 198, "train_runtime": 5024.9798, "train_tokens_per_second": 153148.618 }, { "epoch": 0.07059240865555161, "grad_norm": 0.4515077471733093, "learning_rate": 3.954196564608983e-05, "loss": 0.4662, "num_input_tokens_seen": 773386114, "step": 199, "train_runtime": 5051.281, "train_tokens_per_second": 153106.927 }, { "epoch": 0.0709471443774388, "grad_norm": 0.5575351715087891, "learning_rate": 3.9537200564221106e-05, "loss": 0.4847, "num_input_tokens_seen": 777328710, "step": 200, "train_runtime": 5087.8492, "train_tokens_per_second": 152781.397 }, { "epoch": 0.071301880099326, "grad_norm": 0.5603287220001221, "learning_rate": 3.953241111420174e-05, "loss": 0.4727, "num_input_tokens_seen": 781200921, "step": 201, "train_runtime": 5221.1454, "train_tokens_per_second": 149622.518 }, { "epoch": 0.0716566158212132, "grad_norm": 0.5389100909233093, "learning_rate": 3.952759730200546e-05, "loss": 0.4709, "num_input_tokens_seen": 785091710, "step": 202, "train_runtime": 5250.8798, "train_tokens_per_second": 149516.222 }, { "epoch": 0.07201135154310039, "grad_norm": 0.5642770528793335, "learning_rate": 3.952275913363639e-05, "loss": 0.4656, "num_input_tokens_seen": 788922397, "step": 203, "train_runtime": 5273.0468, "train_tokens_per_second": 149614.146 }, { "epoch": 0.07236608726498758, "grad_norm": 0.6803531646728516, "learning_rate": 3.9517896615129034e-05, "loss": 0.4771, "num_input_tokens_seen": 792864906, "step": 204, "train_runtime": 5301.1808, "train_tokens_per_second": 149563.831 }, { "epoch": 0.07272082298687478, "grad_norm": 0.5791733860969543, "learning_rate": 3.951300975254825e-05, "loss": 0.4844, "num_input_tokens_seen": 796715877, "step": 205, "train_runtime": 5332.4118, "train_tokens_per_second": 149410.044 }, { "epoch": 0.07307555870876198, "grad_norm": 0.5049728155136108, "learning_rate": 3.9508098551989284e-05, "loss": 0.4798, "num_input_tokens_seen": 800616822, "step": 206, "train_runtime": 5357.1643, "train_tokens_per_second": 149447.874 }, { "epoch": 0.07343029443064916, "grad_norm": 0.42561304569244385, "learning_rate": 3.950316301957772e-05, "loss": 0.4718, "num_input_tokens_seen": 804512188, "step": 207, "train_runtime": 5379.4056, "train_tokens_per_second": 149554.103 }, { "epoch": 0.07378503015253636, "grad_norm": 0.42885103821754456, "learning_rate": 3.94982031614695e-05, "loss": 0.47, "num_input_tokens_seen": 808421598, "step": 208, "train_runtime": 5403.0578, "train_tokens_per_second": 149622.979 }, { "epoch": 0.07413976587442356, "grad_norm": 0.4377151429653168, "learning_rate": 3.94932189838509e-05, "loss": 0.4762, "num_input_tokens_seen": 812412549, "step": 209, "train_runtime": 5431.4159, "train_tokens_per_second": 149576.568 }, { "epoch": 0.07449450159631076, "grad_norm": 0.44841229915618896, "learning_rate": 3.948821049293853e-05, "loss": 0.4781, "num_input_tokens_seen": 816226839, "step": 210, "train_runtime": 5458.3202, "train_tokens_per_second": 149538.102 }, { "epoch": 0.07484923731819794, "grad_norm": 0.4328293204307556, "learning_rate": 3.9483177694979324e-05, "loss": 0.4642, "num_input_tokens_seen": 820182959, "step": 211, "train_runtime": 5478.6975, "train_tokens_per_second": 149704.005 }, { "epoch": 0.07520397304008514, "grad_norm": 0.36082589626312256, "learning_rate": 3.9478120596250546e-05, "loss": 0.4712, "num_input_tokens_seen": 824028855, "step": 212, "train_runtime": 5502.2252, "train_tokens_per_second": 149762.836 }, { "epoch": 0.07555870876197233, "grad_norm": 0.41742637753486633, "learning_rate": 3.9473039203059743e-05, "loss": 0.4818, "num_input_tokens_seen": 827917566, "step": 213, "train_runtime": 5535.0398, "train_tokens_per_second": 149577.527 }, { "epoch": 0.07591344448385952, "grad_norm": 0.4003700315952301, "learning_rate": 3.946793352174481e-05, "loss": 0.4763, "num_input_tokens_seen": 831886655, "step": 214, "train_runtime": 5563.16, "train_tokens_per_second": 149534.915 }, { "epoch": 0.07626818020574672, "grad_norm": 0.39387205243110657, "learning_rate": 3.946280355867388e-05, "loss": 0.4733, "num_input_tokens_seen": 835744540, "step": 215, "train_runtime": 5589.9838, "train_tokens_per_second": 149507.508 }, { "epoch": 0.07662291592763391, "grad_norm": 0.32485446333885193, "learning_rate": 3.945764932024541e-05, "loss": 0.4686, "num_input_tokens_seen": 839625550, "step": 216, "train_runtime": 5621.9457, "train_tokens_per_second": 149347.859 }, { "epoch": 0.07697765164952111, "grad_norm": 0.37994804978370667, "learning_rate": 3.945247081288813e-05, "loss": 0.4768, "num_input_tokens_seen": 843547973, "step": 217, "train_runtime": 5641.6789, "train_tokens_per_second": 149520.734 }, { "epoch": 0.0773323873714083, "grad_norm": 0.36248156428337097, "learning_rate": 3.944726804306101e-05, "loss": 0.4741, "num_input_tokens_seen": 847423112, "step": 218, "train_runtime": 5663.3234, "train_tokens_per_second": 149633.536 }, { "epoch": 0.0776871230932955, "grad_norm": 0.34399229288101196, "learning_rate": 3.944204101725333e-05, "loss": 0.4697, "num_input_tokens_seen": 851388722, "step": 219, "train_runtime": 5685.6552, "train_tokens_per_second": 149743.292 }, { "epoch": 0.07804185881518269, "grad_norm": 0.327374130487442, "learning_rate": 3.943678974198458e-05, "loss": 0.4566, "num_input_tokens_seen": 855276868, "step": 220, "train_runtime": 5716.0564, "train_tokens_per_second": 149627.088 }, { "epoch": 0.07839659453706989, "grad_norm": 0.3246159255504608, "learning_rate": 3.943151422380453e-05, "loss": 0.4766, "num_input_tokens_seen": 859175058, "step": 221, "train_runtime": 5737.1307, "train_tokens_per_second": 149756.925 }, { "epoch": 0.07875133025895707, "grad_norm": 0.36769402027130127, "learning_rate": 3.942621446929316e-05, "loss": 0.4693, "num_input_tokens_seen": 863078427, "step": 222, "train_runtime": 5755.6111, "train_tokens_per_second": 149954.265 }, { "epoch": 0.07910606598084427, "grad_norm": 0.30971771478652954, "learning_rate": 3.94208904850607e-05, "loss": 0.4708, "num_input_tokens_seen": 866893973, "step": 223, "train_runtime": 5781.0771, "train_tokens_per_second": 149953.713 }, { "epoch": 0.07946080170273147, "grad_norm": 0.3847598433494568, "learning_rate": 3.941554227774758e-05, "loss": 0.4748, "num_input_tokens_seen": 870831017, "step": 224, "train_runtime": 5819.2821, "train_tokens_per_second": 149645.781 }, { "epoch": 0.07981553742461867, "grad_norm": 0.4233938455581665, "learning_rate": 3.941016985402448e-05, "loss": 0.4715, "num_input_tokens_seen": 874678934, "step": 225, "train_runtime": 5841.6691, "train_tokens_per_second": 149730.995 }, { "epoch": 0.08017027314650585, "grad_norm": 0.43376457691192627, "learning_rate": 3.940477322059223e-05, "loss": 0.485, "num_input_tokens_seen": 878541167, "step": 226, "train_runtime": 5864.7913, "train_tokens_per_second": 149799.221 }, { "epoch": 0.08052500886839305, "grad_norm": 0.44589635729789734, "learning_rate": 3.93993523841819e-05, "loss": 0.4687, "num_input_tokens_seen": 882405916, "step": 227, "train_runtime": 5886.3683, "train_tokens_per_second": 149906.677 }, { "epoch": 0.08087974459028024, "grad_norm": 0.48144909739494324, "learning_rate": 3.939390735155473e-05, "loss": 0.4751, "num_input_tokens_seen": 886306301, "step": 228, "train_runtime": 5916.676, "train_tokens_per_second": 149798.011 }, { "epoch": 0.08123448031216743, "grad_norm": 0.3917838931083679, "learning_rate": 3.9388438129502143e-05, "loss": 0.4769, "num_input_tokens_seen": 890207113, "step": 229, "train_runtime": 5942.8574, "train_tokens_per_second": 149794.461 }, { "epoch": 0.08158921603405463, "grad_norm": 0.43941351771354675, "learning_rate": 3.938294472484573e-05, "loss": 0.4623, "num_input_tokens_seen": 894063986, "step": 230, "train_runtime": 5972.324, "train_tokens_per_second": 149701.187 }, { "epoch": 0.08194395175594182, "grad_norm": 0.5082888603210449, "learning_rate": 3.937742714443725e-05, "loss": 0.4782, "num_input_tokens_seen": 898022449, "step": 231, "train_runtime": 5998.8243, "train_tokens_per_second": 149699.742 }, { "epoch": 0.08229868747782902, "grad_norm": 0.39915531873703003, "learning_rate": 3.9371885395158604e-05, "loss": 0.4622, "num_input_tokens_seen": 901847585, "step": 232, "train_runtime": 6021.0262, "train_tokens_per_second": 149783.036 }, { "epoch": 0.0826534231997162, "grad_norm": 0.34025025367736816, "learning_rate": 3.936631948392186e-05, "loss": 0.4683, "num_input_tokens_seen": 905785039, "step": 233, "train_runtime": 6060.186, "train_tokens_per_second": 149464.892 }, { "epoch": 0.0830081589216034, "grad_norm": 0.3581564724445343, "learning_rate": 3.936072941766919e-05, "loss": 0.4699, "num_input_tokens_seen": 909666379, "step": 234, "train_runtime": 6083.095, "train_tokens_per_second": 149540.058 }, { "epoch": 0.0833628946434906, "grad_norm": 0.42775794863700867, "learning_rate": 3.935511520337293e-05, "loss": 0.481, "num_input_tokens_seen": 913520073, "step": 235, "train_runtime": 6111.5822, "train_tokens_per_second": 149473.58 }, { "epoch": 0.0837176303653778, "grad_norm": 0.4778541028499603, "learning_rate": 3.93494768480355e-05, "loss": 0.4777, "num_input_tokens_seen": 917344914, "step": 236, "train_runtime": 6137.379, "train_tokens_per_second": 149468.513 }, { "epoch": 0.08407236608726498, "grad_norm": 0.4914158582687378, "learning_rate": 3.934381435868946e-05, "loss": 0.4702, "num_input_tokens_seen": 921205661, "step": 237, "train_runtime": 6164.5105, "train_tokens_per_second": 149436.951 }, { "epoch": 0.08442710180915218, "grad_norm": 0.4360828399658203, "learning_rate": 3.933812774239746e-05, "loss": 0.4772, "num_input_tokens_seen": 925060472, "step": 238, "train_runtime": 6184.5771, "train_tokens_per_second": 149575.38 }, { "epoch": 0.08478183753103938, "grad_norm": 0.36260002851486206, "learning_rate": 3.933241700625223e-05, "loss": 0.4665, "num_input_tokens_seen": 928920767, "step": 239, "train_runtime": 6208.9228, "train_tokens_per_second": 149610.617 }, { "epoch": 0.08513657325292658, "grad_norm": 0.3739655911922455, "learning_rate": 3.93266821573766e-05, "loss": 0.4752, "num_input_tokens_seen": 932777406, "step": 240, "train_runtime": 6235.8682, "train_tokens_per_second": 149582.604 }, { "epoch": 0.08549130897481376, "grad_norm": 0.3974170386791229, "learning_rate": 3.932092320292348e-05, "loss": 0.4801, "num_input_tokens_seen": 936707364, "step": 241, "train_runtime": 6262.235, "train_tokens_per_second": 149580.359 }, { "epoch": 0.08584604469670096, "grad_norm": 0.4152378737926483, "learning_rate": 3.931514015007583e-05, "loss": 0.4682, "num_input_tokens_seen": 940564062, "step": 242, "train_runtime": 6288.7752, "train_tokens_per_second": 149562.361 }, { "epoch": 0.08620078041858815, "grad_norm": 0.389809787273407, "learning_rate": 3.9309333006046674e-05, "loss": 0.4747, "num_input_tokens_seen": 944522198, "step": 243, "train_runtime": 6309.7169, "train_tokens_per_second": 149693.277 }, { "epoch": 0.08655551614047535, "grad_norm": 0.3831012547016144, "learning_rate": 3.930350177807909e-05, "loss": 0.4548, "num_input_tokens_seen": 948434514, "step": 244, "train_runtime": 6334.8195, "train_tokens_per_second": 149717.685 }, { "epoch": 0.08691025186236254, "grad_norm": 0.482959508895874, "learning_rate": 3.929764647344618e-05, "loss": 0.4726, "num_input_tokens_seen": 952312930, "step": 245, "train_runtime": 6355.4751, "train_tokens_per_second": 149841.344 }, { "epoch": 0.08726498758424973, "grad_norm": 0.49053046107292175, "learning_rate": 3.92917670994511e-05, "loss": 0.485, "num_input_tokens_seen": 956179381, "step": 246, "train_runtime": 6379.4214, "train_tokens_per_second": 149884.97 }, { "epoch": 0.08761972330613693, "grad_norm": 0.46980175375938416, "learning_rate": 3.9285863663427e-05, "loss": 0.4715, "num_input_tokens_seen": 960005120, "step": 247, "train_runtime": 6403.3953, "train_tokens_per_second": 149921.265 }, { "epoch": 0.08797445902802412, "grad_norm": 0.43777140974998474, "learning_rate": 3.927993617273705e-05, "loss": 0.4748, "num_input_tokens_seen": 963898352, "step": 248, "train_runtime": 6431.9714, "train_tokens_per_second": 149860.486 }, { "epoch": 0.08832919474991131, "grad_norm": 0.4011574387550354, "learning_rate": 3.9273984634774446e-05, "loss": 0.4671, "num_input_tokens_seen": 967766133, "step": 249, "train_runtime": 6463.5401, "train_tokens_per_second": 149726.947 }, { "epoch": 0.08868393047179851, "grad_norm": 0.3703927993774414, "learning_rate": 3.926800905696235e-05, "loss": 0.4744, "num_input_tokens_seen": 971686885, "step": 250, "train_runtime": 6487.5867, "train_tokens_per_second": 149776.324 }, { "epoch": 0.08903866619368571, "grad_norm": 0.350877970457077, "learning_rate": 3.9262009446753903e-05, "loss": 0.4669, "num_input_tokens_seen": 975557591, "step": 251, "train_runtime": 6508.8745, "train_tokens_per_second": 149881.15 }, { "epoch": 0.08939340191557289, "grad_norm": 0.3934401571750641, "learning_rate": 3.925598581163226e-05, "loss": 0.4598, "num_input_tokens_seen": 979501687, "step": 252, "train_runtime": 6525.2343, "train_tokens_per_second": 150109.811 }, { "epoch": 0.08974813763746009, "grad_norm": 0.430716335773468, "learning_rate": 3.92499381591105e-05, "loss": 0.4631, "num_input_tokens_seen": 983258197, "step": 253, "train_runtime": 6559.6682, "train_tokens_per_second": 149894.501 }, { "epoch": 0.09010287335934729, "grad_norm": 0.3944050371646881, "learning_rate": 3.924386649673167e-05, "loss": 0.4533, "num_input_tokens_seen": 987231646, "step": 254, "train_runtime": 6588.1704, "train_tokens_per_second": 149849.137 }, { "epoch": 0.09045760908123449, "grad_norm": 0.35090169310569763, "learning_rate": 3.9237770832068786e-05, "loss": 0.4599, "num_input_tokens_seen": 991056938, "step": 255, "train_runtime": 6620.346, "train_tokens_per_second": 149698.662 }, { "epoch": 0.09081234480312167, "grad_norm": 0.4511895179748535, "learning_rate": 3.923165117272477e-05, "loss": 0.4663, "num_input_tokens_seen": 995002875, "step": 256, "train_runtime": 6645.892, "train_tokens_per_second": 149716.98 }, { "epoch": 0.09116708052500887, "grad_norm": 0.4223383665084839, "learning_rate": 3.922550752633249e-05, "loss": 0.4582, "num_input_tokens_seen": 998905379, "step": 257, "train_runtime": 6672.88, "train_tokens_per_second": 149696.29 }, { "epoch": 0.09152181624689606, "grad_norm": 0.40742969512939453, "learning_rate": 3.921933990055472e-05, "loss": 0.4696, "num_input_tokens_seen": 1002811461, "step": 258, "train_runtime": 6697.3838, "train_tokens_per_second": 149731.819 }, { "epoch": 0.09187655196878326, "grad_norm": 0.4335317015647888, "learning_rate": 3.921314830308416e-05, "loss": 0.4812, "num_input_tokens_seen": 1006705715, "step": 259, "train_runtime": 6728.1339, "train_tokens_per_second": 149626.291 }, { "epoch": 0.09223128769067045, "grad_norm": 0.38751599192619324, "learning_rate": 3.92069327416434e-05, "loss": 0.4675, "num_input_tokens_seen": 1010562879, "step": 260, "train_runtime": 6748.8849, "train_tokens_per_second": 149737.755 }, { "epoch": 0.09258602341255764, "grad_norm": 0.3572450280189514, "learning_rate": 3.920069322398491e-05, "loss": 0.4681, "num_input_tokens_seen": 1014481223, "step": 261, "train_runtime": 6765.8321, "train_tokens_per_second": 149941.826 }, { "epoch": 0.09294075913444484, "grad_norm": 0.43057382106781006, "learning_rate": 3.919442975789106e-05, "loss": 0.4604, "num_input_tokens_seen": 1018393049, "step": 262, "train_runtime": 6798.7362, "train_tokens_per_second": 149791.523 }, { "epoch": 0.09329549485633203, "grad_norm": 0.4607337713241577, "learning_rate": 3.918814235117406e-05, "loss": 0.4683, "num_input_tokens_seen": 1022311283, "step": 263, "train_runtime": 6827.0701, "train_tokens_per_second": 149743.781 }, { "epoch": 0.09365023057821922, "grad_norm": 0.453078031539917, "learning_rate": 3.918183101167603e-05, "loss": 0.4753, "num_input_tokens_seen": 1026194121, "step": 264, "train_runtime": 6849.484, "train_tokens_per_second": 149820.647 }, { "epoch": 0.09400496630010642, "grad_norm": 0.45536941289901733, "learning_rate": 3.9175495747268876e-05, "loss": 0.4594, "num_input_tokens_seen": 1030072351, "step": 265, "train_runtime": 6871.1526, "train_tokens_per_second": 149912.601 }, { "epoch": 0.09435970202199362, "grad_norm": 0.40657851099967957, "learning_rate": 3.916913656585441e-05, "loss": 0.4644, "num_input_tokens_seen": 1033962976, "step": 266, "train_runtime": 6894.8588, "train_tokens_per_second": 149961.443 }, { "epoch": 0.0947144377438808, "grad_norm": 0.3889278471469879, "learning_rate": 3.9162753475364216e-05, "loss": 0.4672, "num_input_tokens_seen": 1037854826, "step": 267, "train_runtime": 6917.0329, "train_tokens_per_second": 150043.356 }, { "epoch": 0.095069173465768, "grad_norm": 0.5013661980628967, "learning_rate": 3.915634648375974e-05, "loss": 0.4645, "num_input_tokens_seen": 1041707183, "step": 268, "train_runtime": 6943.0516, "train_tokens_per_second": 150035.927 }, { "epoch": 0.0954239091876552, "grad_norm": 0.44063103199005127, "learning_rate": 3.9149915599032234e-05, "loss": 0.469, "num_input_tokens_seen": 1045712296, "step": 269, "train_runtime": 6969.8206, "train_tokens_per_second": 150034.32 }, { "epoch": 0.0957786449095424, "grad_norm": 0.43154481053352356, "learning_rate": 3.914346082920274e-05, "loss": 0.4692, "num_input_tokens_seen": 1049600077, "step": 270, "train_runtime": 6993.7124, "train_tokens_per_second": 150077.673 }, { "epoch": 0.09613338063142958, "grad_norm": 0.43110308051109314, "learning_rate": 3.913698218232208e-05, "loss": 0.4857, "num_input_tokens_seen": 1053440388, "step": 271, "train_runtime": 7019.3746, "train_tokens_per_second": 150076.102 }, { "epoch": 0.09648811635331678, "grad_norm": 0.36761319637298584, "learning_rate": 3.9130479666470876e-05, "loss": 0.4631, "num_input_tokens_seen": 1057351087, "step": 272, "train_runtime": 7040.8576, "train_tokens_per_second": 150173.622 }, { "epoch": 0.09684285207520398, "grad_norm": 0.3839249312877655, "learning_rate": 3.9123953289759534e-05, "loss": 0.4777, "num_input_tokens_seen": 1061194757, "step": 273, "train_runtime": 7068.8465, "train_tokens_per_second": 150122.761 }, { "epoch": 0.09719758779709117, "grad_norm": 0.4533555209636688, "learning_rate": 3.911740306032818e-05, "loss": 0.4691, "num_input_tokens_seen": 1065091365, "step": 274, "train_runtime": 7089.7998, "train_tokens_per_second": 150228.694 }, { "epoch": 0.09755232351897836, "grad_norm": 0.5043864846229553, "learning_rate": 3.9110828986346735e-05, "loss": 0.4582, "num_input_tokens_seen": 1068978224, "step": 275, "train_runtime": 7120.8933, "train_tokens_per_second": 150118.556 }, { "epoch": 0.09790705924086555, "grad_norm": 0.6310486793518066, "learning_rate": 3.910423107601481e-05, "loss": 0.4774, "num_input_tokens_seen": 1072856773, "step": 276, "train_runtime": 7150.8183, "train_tokens_per_second": 150032.727 }, { "epoch": 0.09826179496275275, "grad_norm": 0.6532495021820068, "learning_rate": 3.9097609337561814e-05, "loss": 0.4676, "num_input_tokens_seen": 1076710681, "step": 277, "train_runtime": 7174.9621, "train_tokens_per_second": 150064.998 }, { "epoch": 0.09861653068463995, "grad_norm": 0.7031738758087158, "learning_rate": 3.909096377924682e-05, "loss": 0.4523, "num_input_tokens_seen": 1080571771, "step": 278, "train_runtime": 7207.8325, "train_tokens_per_second": 149916.326 }, { "epoch": 0.09897126640652713, "grad_norm": 0.6146794557571411, "learning_rate": 3.908429440935862e-05, "loss": 0.4658, "num_input_tokens_seen": 1084498523, "step": 279, "train_runtime": 7240.3976, "train_tokens_per_second": 149784.387 }, { "epoch": 0.09932600212841433, "grad_norm": 0.5154471397399902, "learning_rate": 3.9077601236215726e-05, "loss": 0.4716, "num_input_tokens_seen": 1088363570, "step": 280, "train_runtime": 7266.5037, "train_tokens_per_second": 149778.161 }, { "epoch": 0.09968073785030153, "grad_norm": 0.6295011639595032, "learning_rate": 3.907088426816632e-05, "loss": 0.4755, "num_input_tokens_seen": 1092252283, "step": 281, "train_runtime": 7291.0168, "train_tokens_per_second": 149807.95 }, { "epoch": 0.10003547357218871, "grad_norm": 0.4971694350242615, "learning_rate": 3.9064143513588285e-05, "loss": 0.4805, "num_input_tokens_seen": 1096128205, "step": 282, "train_runtime": 7318.0155, "train_tokens_per_second": 149784.899 }, { "epoch": 0.10039020929407591, "grad_norm": 0.42510926723480225, "learning_rate": 3.905737898088914e-05, "loss": 0.4687, "num_input_tokens_seen": 1100015545, "step": 283, "train_runtime": 7344.2411, "train_tokens_per_second": 149779.334 }, { "epoch": 0.10074494501596311, "grad_norm": 0.4745088815689087, "learning_rate": 3.905059067850609e-05, "loss": 0.4621, "num_input_tokens_seen": 1103887685, "step": 284, "train_runtime": 7384.7622, "train_tokens_per_second": 149481.819 }, { "epoch": 0.1010996807378503, "grad_norm": 0.369765967130661, "learning_rate": 3.904377861490597e-05, "loss": 0.4642, "num_input_tokens_seen": 1107760839, "step": 285, "train_runtime": 7411.3649, "train_tokens_per_second": 149467.858 }, { "epoch": 0.10145441645973749, "grad_norm": 0.3189277946949005, "learning_rate": 3.903694279858525e-05, "loss": 0.4685, "num_input_tokens_seen": 1111607810, "step": 286, "train_runtime": 7443.8481, "train_tokens_per_second": 149332.414 }, { "epoch": 0.10180915218162469, "grad_norm": 0.33537256717681885, "learning_rate": 3.903008323807006e-05, "loss": 0.4537, "num_input_tokens_seen": 1115521739, "step": 287, "train_runtime": 7467.3741, "train_tokens_per_second": 149386.08 }, { "epoch": 0.10216388790351189, "grad_norm": 0.38531938195228577, "learning_rate": 3.9023199941916094e-05, "loss": 0.4686, "num_input_tokens_seen": 1119401376, "step": 288, "train_runtime": 7500.8801, "train_tokens_per_second": 149236.004 }, { "epoch": 0.10251862362539908, "grad_norm": 0.3277662694454193, "learning_rate": 3.9016292918708685e-05, "loss": 0.4575, "num_input_tokens_seen": 1123256354, "step": 289, "train_runtime": 7530.9334, "train_tokens_per_second": 149152.343 }, { "epoch": 0.10287335934728627, "grad_norm": 0.32823508977890015, "learning_rate": 3.900936217706275e-05, "loss": 0.4683, "num_input_tokens_seen": 1127182743, "step": 290, "train_runtime": 7561.5839, "train_tokens_per_second": 149067.014 }, { "epoch": 0.10322809506917346, "grad_norm": 0.33498483896255493, "learning_rate": 3.900240772562279e-05, "loss": 0.457, "num_input_tokens_seen": 1131054168, "step": 291, "train_runtime": 7584.4039, "train_tokens_per_second": 149128.948 }, { "epoch": 0.10358283079106066, "grad_norm": 0.37516701221466064, "learning_rate": 3.8995429573062894e-05, "loss": 0.4679, "num_input_tokens_seen": 1134990632, "step": 292, "train_runtime": 7612.8525, "train_tokens_per_second": 149088.746 }, { "epoch": 0.10393756651294786, "grad_norm": 0.360461562871933, "learning_rate": 3.8988427728086673e-05, "loss": 0.4696, "num_input_tokens_seen": 1138806194, "step": 293, "train_runtime": 7647.7801, "train_tokens_per_second": 148906.764 }, { "epoch": 0.10429230223483504, "grad_norm": 0.38688668608665466, "learning_rate": 3.8981402199427326e-05, "loss": 0.4559, "num_input_tokens_seen": 1142744966, "step": 294, "train_runtime": 7670.4051, "train_tokens_per_second": 148981.045 }, { "epoch": 0.10464703795672224, "grad_norm": 0.37085723876953125, "learning_rate": 3.8974352995847576e-05, "loss": 0.4597, "num_input_tokens_seen": 1146523548, "step": 295, "train_runtime": 7698.0614, "train_tokens_per_second": 148936.66 }, { "epoch": 0.10500177367860944, "grad_norm": 0.3474552631378174, "learning_rate": 3.8967280126139686e-05, "loss": 0.4599, "num_input_tokens_seen": 1150443950, "step": 296, "train_runtime": 7731.4309, "train_tokens_per_second": 148800.909 }, { "epoch": 0.10535650940049664, "grad_norm": 0.41424769163131714, "learning_rate": 3.896018359912541e-05, "loss": 0.459, "num_input_tokens_seen": 1154274539, "step": 297, "train_runtime": 7764.3807, "train_tokens_per_second": 148662.796 }, { "epoch": 0.10571124512238382, "grad_norm": 0.5696576833724976, "learning_rate": 3.8953063423656055e-05, "loss": 0.4611, "num_input_tokens_seen": 1158148163, "step": 298, "train_runtime": 7792.6784, "train_tokens_per_second": 148620.038 }, { "epoch": 0.10606598084427102, "grad_norm": 0.5040177702903748, "learning_rate": 3.894591960861237e-05, "loss": 0.4517, "num_input_tokens_seen": 1162028627, "step": 299, "train_runtime": 7820.7258, "train_tokens_per_second": 148583.221 }, { "epoch": 0.10642071656615822, "grad_norm": 0.5318338871002197, "learning_rate": 3.8938752162904645e-05, "loss": 0.4773, "num_input_tokens_seen": 1165956190, "step": 300, "train_runtime": 7847.3692, "train_tokens_per_second": 148579.245 }, { "epoch": 0.1067754522880454, "grad_norm": 0.4548926055431366, "learning_rate": 3.893156109547259e-05, "loss": 0.4615, "num_input_tokens_seen": 1169838770, "step": 301, "train_runtime": 7880.279, "train_tokens_per_second": 148451.441 }, { "epoch": 0.1071301880099326, "grad_norm": 0.3378381133079529, "learning_rate": 3.8924346415285416e-05, "loss": 0.4608, "num_input_tokens_seen": 1173726987, "step": 302, "train_runtime": 7915.0046, "train_tokens_per_second": 148291.384 }, { "epoch": 0.1074849237318198, "grad_norm": 0.4122856557369232, "learning_rate": 3.891710813134177e-05, "loss": 0.4612, "num_input_tokens_seen": 1177646457, "step": 303, "train_runtime": 7938.234, "train_tokens_per_second": 148351.189 }, { "epoch": 0.10783965945370699, "grad_norm": 0.47603219747543335, "learning_rate": 3.890984625266974e-05, "loss": 0.458, "num_input_tokens_seen": 1181471217, "step": 304, "train_runtime": 7959.0584, "train_tokens_per_second": 148443.593 }, { "epoch": 0.10819439517559418, "grad_norm": 0.5493431091308594, "learning_rate": 3.890256078832685e-05, "loss": 0.4663, "num_input_tokens_seen": 1185378978, "step": 305, "train_runtime": 7989.0811, "train_tokens_per_second": 148374.884 }, { "epoch": 0.10854913089748137, "grad_norm": 0.47409284114837646, "learning_rate": 3.8895251747400025e-05, "loss": 0.4524, "num_input_tokens_seen": 1189262999, "step": 306, "train_runtime": 8022.6607, "train_tokens_per_second": 148237.978 }, { "epoch": 0.10890386661936857, "grad_norm": 0.3292784094810486, "learning_rate": 3.888791913900561e-05, "loss": 0.4493, "num_input_tokens_seen": 1193151827, "step": 307, "train_runtime": 8053.6134, "train_tokens_per_second": 148151.117 }, { "epoch": 0.10925860234125577, "grad_norm": 0.3551090955734253, "learning_rate": 3.888056297228935e-05, "loss": 0.4614, "num_input_tokens_seen": 1197055807, "step": 308, "train_runtime": 8080.3915, "train_tokens_per_second": 148143.293 }, { "epoch": 0.10961333806314295, "grad_norm": 0.4404109716415405, "learning_rate": 3.8873183256426356e-05, "loss": 0.4548, "num_input_tokens_seen": 1200925982, "step": 309, "train_runtime": 8121.2078, "train_tokens_per_second": 147875.292 }, { "epoch": 0.10996807378503015, "grad_norm": 0.39072999358177185, "learning_rate": 3.8865780000621134e-05, "loss": 0.4687, "num_input_tokens_seen": 1204805585, "step": 310, "train_runtime": 8142.8681, "train_tokens_per_second": 147958.381 }, { "epoch": 0.11032280950691735, "grad_norm": 0.31280359625816345, "learning_rate": 3.8858353214107525e-05, "loss": 0.4626, "num_input_tokens_seen": 1208631925, "step": 311, "train_runtime": 8164.7287, "train_tokens_per_second": 148030.875 }, { "epoch": 0.11067754522880455, "grad_norm": 0.28192588686943054, "learning_rate": 3.885090290614875e-05, "loss": 0.4616, "num_input_tokens_seen": 1212561294, "step": 312, "train_runtime": 8204.0313, "train_tokens_per_second": 147800.666 }, { "epoch": 0.11103228095069173, "grad_norm": 0.425094872713089, "learning_rate": 3.8843429086037325e-05, "loss": 0.4565, "num_input_tokens_seen": 1216361307, "step": 313, "train_runtime": 8231.9742, "train_tokens_per_second": 147760.584 }, { "epoch": 0.11138701667257893, "grad_norm": 0.4595361649990082, "learning_rate": 3.883593176309512e-05, "loss": 0.4642, "num_input_tokens_seen": 1220235883, "step": 314, "train_runtime": 8254.5336, "train_tokens_per_second": 147826.144 }, { "epoch": 0.11174175239446613, "grad_norm": 0.4154299795627594, "learning_rate": 3.882841094667334e-05, "loss": 0.4606, "num_input_tokens_seen": 1224134232, "step": 315, "train_runtime": 8288.9255, "train_tokens_per_second": 147683.102 }, { "epoch": 0.11209648811635331, "grad_norm": 0.39004507660865784, "learning_rate": 3.882086664615245e-05, "loss": 0.4584, "num_input_tokens_seen": 1228054108, "step": 316, "train_runtime": 8317.4982, "train_tokens_per_second": 147647.054 }, { "epoch": 0.11245122383824051, "grad_norm": 0.4492207467556, "learning_rate": 3.8813298870942225e-05, "loss": 0.4593, "num_input_tokens_seen": 1231985030, "step": 317, "train_runtime": 8348.1721, "train_tokens_per_second": 147575.423 }, { "epoch": 0.1128059595601277, "grad_norm": 0.4645117223262787, "learning_rate": 3.8805707630481716e-05, "loss": 0.4629, "num_input_tokens_seen": 1235848696, "step": 318, "train_runtime": 8380.5099, "train_tokens_per_second": 147467.005 }, { "epoch": 0.1131606952820149, "grad_norm": 0.5223351716995239, "learning_rate": 3.879809293423925e-05, "loss": 0.4593, "num_input_tokens_seen": 1239734877, "step": 319, "train_runtime": 8403.8146, "train_tokens_per_second": 147520.494 }, { "epoch": 0.11351543100390209, "grad_norm": 0.36142227053642273, "learning_rate": 3.8790454791712414e-05, "loss": 0.4681, "num_input_tokens_seen": 1243647817, "step": 320, "train_runtime": 8431.675, "train_tokens_per_second": 147497.125 }, { "epoch": 0.11387016672578928, "grad_norm": 0.4002094566822052, "learning_rate": 3.878279321242801e-05, "loss": 0.4679, "num_input_tokens_seen": 1247482954, "step": 321, "train_runtime": 8453.9011, "train_tokens_per_second": 147562.994 }, { "epoch": 0.11422490244767648, "grad_norm": 0.4444311559200287, "learning_rate": 3.87751082059421e-05, "loss": 0.4597, "num_input_tokens_seen": 1251347248, "step": 322, "train_runtime": 8486.157, "train_tokens_per_second": 147457.47 }, { "epoch": 0.11457963816956368, "grad_norm": 0.40154212713241577, "learning_rate": 3.876739978183995e-05, "loss": 0.4639, "num_input_tokens_seen": 1255202785, "step": 323, "train_runtime": 8517.7838, "train_tokens_per_second": 147362.602 }, { "epoch": 0.11493437389145086, "grad_norm": 0.3640346825122833, "learning_rate": 3.875966794973605e-05, "loss": 0.4682, "num_input_tokens_seen": 1259078023, "step": 324, "train_runtime": 8549.8193, "train_tokens_per_second": 147263.7 }, { "epoch": 0.11528910961333806, "grad_norm": 0.3710702359676361, "learning_rate": 3.875191271927407e-05, "loss": 0.4651, "num_input_tokens_seen": 1262960533, "step": 325, "train_runtime": 8575.5577, "train_tokens_per_second": 147274.45 }, { "epoch": 0.11564384533522526, "grad_norm": 0.4058944582939148, "learning_rate": 3.874413410012688e-05, "loss": 0.4592, "num_input_tokens_seen": 1266863581, "step": 326, "train_runtime": 8598.4487, "train_tokens_per_second": 147336.296 }, { "epoch": 0.11599858105711246, "grad_norm": 0.4061448574066162, "learning_rate": 3.87363321019965e-05, "loss": 0.4754, "num_input_tokens_seen": 1270744293, "step": 327, "train_runtime": 8626.1016, "train_tokens_per_second": 147313.856 }, { "epoch": 0.11635331677899964, "grad_norm": 0.37233835458755493, "learning_rate": 3.872850673461413e-05, "loss": 0.4433, "num_input_tokens_seen": 1274588429, "step": 328, "train_runtime": 8652.1429, "train_tokens_per_second": 147314.769 }, { "epoch": 0.11670805250088684, "grad_norm": 0.440794974565506, "learning_rate": 3.872065800774011e-05, "loss": 0.4492, "num_input_tokens_seen": 1278532780, "step": 329, "train_runtime": 8673.0623, "train_tokens_per_second": 147414.228 }, { "epoch": 0.11706278822277404, "grad_norm": 0.4977404773235321, "learning_rate": 3.8712785931163924e-05, "loss": 0.4446, "num_input_tokens_seen": 1282420879, "step": 330, "train_runtime": 8692.347, "train_tokens_per_second": 147534.478 }, { "epoch": 0.11741752394466123, "grad_norm": 0.34330108761787415, "learning_rate": 3.870489051470416e-05, "loss": 0.4581, "num_input_tokens_seen": 1286346530, "step": 331, "train_runtime": 8715.0422, "train_tokens_per_second": 147600.722 }, { "epoch": 0.11777225966654842, "grad_norm": 0.38466134667396545, "learning_rate": 3.869697176820853e-05, "loss": 0.4432, "num_input_tokens_seen": 1290268414, "step": 332, "train_runtime": 8745.1228, "train_tokens_per_second": 147541.486 }, { "epoch": 0.11812699538843562, "grad_norm": 0.39412081241607666, "learning_rate": 3.868902970155384e-05, "loss": 0.4559, "num_input_tokens_seen": 1294178890, "step": 333, "train_runtime": 8761.8708, "train_tokens_per_second": 147705.771 }, { "epoch": 0.11848173111032281, "grad_norm": 0.38777267932891846, "learning_rate": 3.8681064324646004e-05, "loss": 0.4568, "num_input_tokens_seen": 1297967324, "step": 334, "train_runtime": 8794.5837, "train_tokens_per_second": 147587.126 }, { "epoch": 0.11883646683221, "grad_norm": 0.43612733483314514, "learning_rate": 3.8673075647419976e-05, "loss": 0.4583, "num_input_tokens_seen": 1301827078, "step": 335, "train_runtime": 8815.6437, "train_tokens_per_second": 147672.378 }, { "epoch": 0.1191912025540972, "grad_norm": 0.43412190675735474, "learning_rate": 3.866506367983979e-05, "loss": 0.4474, "num_input_tokens_seen": 1305700850, "step": 336, "train_runtime": 8849.2624, "train_tokens_per_second": 147549.116 }, { "epoch": 0.11954593827598439, "grad_norm": 0.3276077210903168, "learning_rate": 3.865702843189853e-05, "loss": 0.4572, "num_input_tokens_seen": 1309618383, "step": 337, "train_runtime": 8877.1035, "train_tokens_per_second": 147527.668 }, { "epoch": 0.11990067399787159, "grad_norm": 0.36729946732521057, "learning_rate": 3.864896991361831e-05, "loss": 0.4618, "num_input_tokens_seen": 1313560098, "step": 338, "train_runtime": 8906.9813, "train_tokens_per_second": 147475.34 }, { "epoch": 0.12025540971975877, "grad_norm": 0.4698219895362854, "learning_rate": 3.864088813505028e-05, "loss": 0.4518, "num_input_tokens_seen": 1317384596, "step": 339, "train_runtime": 8932.7411, "train_tokens_per_second": 147478.202 }, { "epoch": 0.12061014544164597, "grad_norm": 0.39570629596710205, "learning_rate": 3.863278310627459e-05, "loss": 0.4513, "num_input_tokens_seen": 1321348402, "step": 340, "train_runtime": 8961.2702, "train_tokens_per_second": 147451.017 }, { "epoch": 0.12096488116353317, "grad_norm": 0.46769773960113525, "learning_rate": 3.862465483740039e-05, "loss": 0.4538, "num_input_tokens_seen": 1325186940, "step": 341, "train_runtime": 8993.1691, "train_tokens_per_second": 147354.835 }, { "epoch": 0.12131961688542037, "grad_norm": 0.515110433101654, "learning_rate": 3.8616503338565825e-05, "loss": 0.4664, "num_input_tokens_seen": 1329079763, "step": 342, "train_runtime": 9020.967, "train_tokens_per_second": 147332.294 }, { "epoch": 0.12167435260730755, "grad_norm": 0.4738190472126007, "learning_rate": 3.860832861993801e-05, "loss": 0.4525, "num_input_tokens_seen": 1332985681, "step": 343, "train_runtime": 9044.6879, "train_tokens_per_second": 147377.741 }, { "epoch": 0.12202908832919475, "grad_norm": 0.43057140707969666, "learning_rate": 3.860013069171302e-05, "loss": 0.4672, "num_input_tokens_seen": 1336840110, "step": 344, "train_runtime": 9064.1477, "train_tokens_per_second": 147486.576 }, { "epoch": 0.12238382405108195, "grad_norm": 0.4195503890514374, "learning_rate": 3.859190956411588e-05, "loss": 0.4437, "num_input_tokens_seen": 1340726920, "step": 345, "train_runtime": 9098.0594, "train_tokens_per_second": 147364.054 }, { "epoch": 0.12273855977296914, "grad_norm": 0.394125759601593, "learning_rate": 3.858366524740055e-05, "loss": 0.4632, "num_input_tokens_seen": 1344572477, "step": 346, "train_runtime": 9134.633, "train_tokens_per_second": 147195.019 }, { "epoch": 0.12309329549485633, "grad_norm": 0.30572205781936646, "learning_rate": 3.8575397751849905e-05, "loss": 0.4627, "num_input_tokens_seen": 1348530679, "step": 347, "train_runtime": 9163.3865, "train_tokens_per_second": 147165.099 }, { "epoch": 0.12344803121674353, "grad_norm": 0.3732057511806488, "learning_rate": 3.856710708777575e-05, "loss": 0.4576, "num_input_tokens_seen": 1352316315, "step": 348, "train_runtime": 9201.4341, "train_tokens_per_second": 146967.995 }, { "epoch": 0.12380276693863072, "grad_norm": 0.34277018904685974, "learning_rate": 3.855879326551877e-05, "loss": 0.4538, "num_input_tokens_seen": 1356229496, "step": 349, "train_runtime": 9226.6988, "train_tokens_per_second": 146989.679 }, { "epoch": 0.12415750266051792, "grad_norm": 0.39131778478622437, "learning_rate": 3.8550456295448544e-05, "loss": 0.4474, "num_input_tokens_seen": 1360124599, "step": 350, "train_runtime": 9257.1032, "train_tokens_per_second": 146927.67 }, { "epoch": 0.1245122383824051, "grad_norm": 0.42182812094688416, "learning_rate": 3.8542096187963517e-05, "loss": 0.4534, "num_input_tokens_seen": 1364061421, "step": 351, "train_runtime": 9284.2898, "train_tokens_per_second": 146921.46 }, { "epoch": 0.1248669741042923, "grad_norm": 0.3409145474433899, "learning_rate": 3.853371295349098e-05, "loss": 0.4417, "num_input_tokens_seen": 1367919557, "step": 352, "train_runtime": 9304.1639, "train_tokens_per_second": 147022.298 }, { "epoch": 0.1252217098261795, "grad_norm": 0.38491109013557434, "learning_rate": 3.8525306602487114e-05, "loss": 0.4484, "num_input_tokens_seen": 1371820623, "step": 353, "train_runtime": 9349.4992, "train_tokens_per_second": 146726.643 }, { "epoch": 0.12557644554806668, "grad_norm": 0.6764065623283386, "learning_rate": 3.851687714543688e-05, "loss": 0.4549, "num_input_tokens_seen": 1375749226, "step": 354, "train_runtime": 9371.1466, "train_tokens_per_second": 146806.926 }, { "epoch": 0.1259311812699539, "grad_norm": 0.4574657380580902, "learning_rate": 3.8508424592854085e-05, "loss": 0.4603, "num_input_tokens_seen": 1379595125, "step": 355, "train_runtime": 9404.6767, "train_tokens_per_second": 146692.457 }, { "epoch": 0.12628591699184108, "grad_norm": 0.40197065472602844, "learning_rate": 3.8499948955281344e-05, "loss": 0.4522, "num_input_tokens_seen": 1383484875, "step": 356, "train_runtime": 9428.3255, "train_tokens_per_second": 146737.072 }, { "epoch": 0.12664065271372826, "grad_norm": 0.3553198277950287, "learning_rate": 3.849145024329006e-05, "loss": 0.4557, "num_input_tokens_seen": 1387331877, "step": 357, "train_runtime": 9451.3291, "train_tokens_per_second": 146786.961 }, { "epoch": 0.12699538843561547, "grad_norm": 0.43874508142471313, "learning_rate": 3.8482928467480405e-05, "loss": 0.4566, "num_input_tokens_seen": 1391210644, "step": 358, "train_runtime": 9475.9933, "train_tokens_per_second": 146814.229 }, { "epoch": 0.12735012415750266, "grad_norm": 0.461038202047348, "learning_rate": 3.847438363848134e-05, "loss": 0.4591, "num_input_tokens_seen": 1395149824, "step": 359, "train_runtime": 9499.8501, "train_tokens_per_second": 146860.193 }, { "epoch": 0.12770485987938984, "grad_norm": 0.3504069447517395, "learning_rate": 3.8465815766950564e-05, "loss": 0.4611, "num_input_tokens_seen": 1399001762, "step": 360, "train_runtime": 9527.4955, "train_tokens_per_second": 146838.354 }, { "epoch": 0.12805959560127705, "grad_norm": 0.3256968557834625, "learning_rate": 3.845722486357452e-05, "loss": 0.4537, "num_input_tokens_seen": 1402964330, "step": 361, "train_runtime": 9553.5871, "train_tokens_per_second": 146852.1 }, { "epoch": 0.12841433132316424, "grad_norm": 0.3986396789550781, "learning_rate": 3.844861093906837e-05, "loss": 0.4442, "num_input_tokens_seen": 1406844155, "step": 362, "train_runtime": 9580.5772, "train_tokens_per_second": 146843.362 }, { "epoch": 0.12876906704505145, "grad_norm": 0.37686970829963684, "learning_rate": 3.8439974004176015e-05, "loss": 0.4441, "num_input_tokens_seen": 1410710747, "step": 363, "train_runtime": 9605.8882, "train_tokens_per_second": 146858.96 }, { "epoch": 0.12912380276693863, "grad_norm": 0.3876573443412781, "learning_rate": 3.843131406967003e-05, "loss": 0.4655, "num_input_tokens_seen": 1414599732, "step": 364, "train_runtime": 9626.4445, "train_tokens_per_second": 146949.347 }, { "epoch": 0.12947853848882582, "grad_norm": 0.3373655676841736, "learning_rate": 3.8422631146351686e-05, "loss": 0.4459, "num_input_tokens_seen": 1418438807, "step": 365, "train_runtime": 9652.7876, "train_tokens_per_second": 146946.029 }, { "epoch": 0.12983327421071303, "grad_norm": 0.40747854113578796, "learning_rate": 3.841392524505092e-05, "loss": 0.4535, "num_input_tokens_seen": 1422301425, "step": 366, "train_runtime": 9675.269, "train_tokens_per_second": 147003.812 }, { "epoch": 0.1301880099326002, "grad_norm": 0.4003390669822693, "learning_rate": 3.8405196376626334e-05, "loss": 0.4466, "num_input_tokens_seen": 1426212666, "step": 367, "train_runtime": 9696.5195, "train_tokens_per_second": 147085.01 }, { "epoch": 0.1305427456544874, "grad_norm": 1.0277981758117676, "learning_rate": 3.8396444551965196e-05, "loss": 0.4642, "num_input_tokens_seen": 1430112644, "step": 368, "train_runtime": 9720.0433, "train_tokens_per_second": 147130.275 }, { "epoch": 0.1308974813763746, "grad_norm": 0.40213146805763245, "learning_rate": 3.838766978198337e-05, "loss": 0.4369, "num_input_tokens_seen": 1433967419, "step": 369, "train_runtime": 9741.8746, "train_tokens_per_second": 147196.251 }, { "epoch": 0.1312522170982618, "grad_norm": 0.41060495376586914, "learning_rate": 3.8378872077625375e-05, "loss": 0.447, "num_input_tokens_seen": 1437820320, "step": 370, "train_runtime": 9765.119, "train_tokens_per_second": 147240.431 }, { "epoch": 0.13160695282014898, "grad_norm": 0.3911396563053131, "learning_rate": 3.83700514498643e-05, "loss": 0.4511, "num_input_tokens_seen": 1441703745, "step": 371, "train_runtime": 9791.8709, "train_tokens_per_second": 147234.758 }, { "epoch": 0.1319616885420362, "grad_norm": 0.4008573591709137, "learning_rate": 3.836120790970185e-05, "loss": 0.4547, "num_input_tokens_seen": 1445575316, "step": 372, "train_runtime": 9818.6437, "train_tokens_per_second": 147227.597 }, { "epoch": 0.13231642426392337, "grad_norm": 0.3271460235118866, "learning_rate": 3.83523414681683e-05, "loss": 0.4369, "num_input_tokens_seen": 1449415262, "step": 373, "train_runtime": 9842.3353, "train_tokens_per_second": 147263.349 }, { "epoch": 0.13267115998581058, "grad_norm": 0.3129989802837372, "learning_rate": 3.83434521363225e-05, "loss": 0.4565, "num_input_tokens_seen": 1453314582, "step": 374, "train_runtime": 9862.6537, "train_tokens_per_second": 147355.329 }, { "epoch": 0.13302589570769777, "grad_norm": 0.3312121629714966, "learning_rate": 3.833453992525182e-05, "loss": 0.4412, "num_input_tokens_seen": 1457311276, "step": 375, "train_runtime": 9895.2978, "train_tokens_per_second": 147273.109 }, { "epoch": 0.13338063142958495, "grad_norm": 0.38074392080307007, "learning_rate": 3.832560484607221e-05, "loss": 0.467, "num_input_tokens_seen": 1461181501, "step": 376, "train_runtime": 9924.297, "train_tokens_per_second": 147232.746 }, { "epoch": 0.13373536715147216, "grad_norm": 0.33574220538139343, "learning_rate": 3.831664690992811e-05, "loss": 0.4599, "num_input_tokens_seen": 1465068876, "step": 377, "train_runtime": 9944.3568, "train_tokens_per_second": 147326.66 }, { "epoch": 0.13409010287335935, "grad_norm": 0.3080052435398102, "learning_rate": 3.830766612799248e-05, "loss": 0.4491, "num_input_tokens_seen": 1468974383, "step": 378, "train_runtime": 9974.3793, "train_tokens_per_second": 147274.767 }, { "epoch": 0.13444483859524653, "grad_norm": 0.31924402713775635, "learning_rate": 3.829866251146677e-05, "loss": 0.4464, "num_input_tokens_seen": 1472805162, "step": 379, "train_runtime": 9997.5929, "train_tokens_per_second": 147315.976 }, { "epoch": 0.13479957431713374, "grad_norm": 0.3651372194290161, "learning_rate": 3.828963607158091e-05, "loss": 0.4441, "num_input_tokens_seen": 1476781768, "step": 380, "train_runtime": 10031.8101, "train_tokens_per_second": 147209.9 }, { "epoch": 0.13515431003902093, "grad_norm": 0.3380815386772156, "learning_rate": 3.828058681959332e-05, "loss": 0.4506, "num_input_tokens_seen": 1480615380, "step": 381, "train_runtime": 10053.3498, "train_tokens_per_second": 147275.825 }, { "epoch": 0.13550904576090814, "grad_norm": 0.35499194264411926, "learning_rate": 3.827151476679084e-05, "loss": 0.4525, "num_input_tokens_seen": 1484438558, "step": 382, "train_runtime": 10090.314, "train_tokens_per_second": 147115.2 }, { "epoch": 0.13586378148279532, "grad_norm": 0.34679675102233887, "learning_rate": 3.826241992448876e-05, "loss": 0.4561, "num_input_tokens_seen": 1488417184, "step": 383, "train_runtime": 10118.5834, "train_tokens_per_second": 147097.388 }, { "epoch": 0.1362185172046825, "grad_norm": 0.35001853108406067, "learning_rate": 3.825330230403081e-05, "loss": 0.4566, "num_input_tokens_seen": 1492277649, "step": 384, "train_runtime": 10138.6176, "train_tokens_per_second": 147187.488 }, { "epoch": 0.13657325292656972, "grad_norm": 0.29190677404403687, "learning_rate": 3.824416191678911e-05, "loss": 0.4481, "num_input_tokens_seen": 1496199427, "step": 385, "train_runtime": 10169.8443, "train_tokens_per_second": 147121.174 }, { "epoch": 0.1369279886484569, "grad_norm": 0.32444220781326294, "learning_rate": 3.8234998774164184e-05, "loss": 0.4538, "num_input_tokens_seen": 1500123550, "step": 386, "train_runtime": 10195.2335, "train_tokens_per_second": 147139.695 }, { "epoch": 0.13728272437034408, "grad_norm": 0.37043145298957825, "learning_rate": 3.8225812887584936e-05, "loss": 0.4503, "num_input_tokens_seen": 1504032105, "step": 387, "train_runtime": 10220.4781, "train_tokens_per_second": 147158.683 }, { "epoch": 0.1376374600922313, "grad_norm": 0.31767600774765015, "learning_rate": 3.821660426850866e-05, "loss": 0.4467, "num_input_tokens_seen": 1507861118, "step": 388, "train_runtime": 10242.4564, "train_tokens_per_second": 147216.748 }, { "epoch": 0.13799219581411848, "grad_norm": 0.31968122720718384, "learning_rate": 3.8207372928420955e-05, "loss": 0.4529, "num_input_tokens_seen": 1511667581, "step": 389, "train_runtime": 10281.5679, "train_tokens_per_second": 147026.952 }, { "epoch": 0.13834693153600566, "grad_norm": 0.548297643661499, "learning_rate": 3.819811887883581e-05, "loss": 0.4417, "num_input_tokens_seen": 1515627060, "step": 390, "train_runtime": 10306.6614, "train_tokens_per_second": 147053.153 }, { "epoch": 0.13870166725789287, "grad_norm": 0.31557756662368774, "learning_rate": 3.8188842131295505e-05, "loss": 0.4577, "num_input_tokens_seen": 1519491527, "step": 391, "train_runtime": 10333.1309, "train_tokens_per_second": 147050.448 }, { "epoch": 0.13905640297978006, "grad_norm": 0.3714887499809265, "learning_rate": 3.817954269737065e-05, "loss": 0.4487, "num_input_tokens_seen": 1523409309, "step": 392, "train_runtime": 10355.9011, "train_tokens_per_second": 147105.432 }, { "epoch": 0.13941113870166727, "grad_norm": 0.32044118642807007, "learning_rate": 3.817022058866014e-05, "loss": 0.4485, "num_input_tokens_seen": 1527266582, "step": 393, "train_runtime": 10376.878, "train_tokens_per_second": 147179.776 }, { "epoch": 0.13976587442355445, "grad_norm": 0.3070315420627594, "learning_rate": 3.8160875816791155e-05, "loss": 0.4443, "num_input_tokens_seen": 1531180236, "step": 394, "train_runtime": 10396.5697, "train_tokens_per_second": 147277.447 }, { "epoch": 0.14012061014544164, "grad_norm": 0.36768239736557007, "learning_rate": 3.815150839341915e-05, "loss": 0.4461, "num_input_tokens_seen": 1534973209, "step": 395, "train_runtime": 10423.1661, "train_tokens_per_second": 147265.542 }, { "epoch": 0.14047534586732885, "grad_norm": 0.3916236460208893, "learning_rate": 3.81421183302278e-05, "loss": 0.4468, "num_input_tokens_seen": 1538924607, "step": 396, "train_runtime": 10451.0899, "train_tokens_per_second": 147250.154 }, { "epoch": 0.14083008158921603, "grad_norm": 0.3720170259475708, "learning_rate": 3.813270563892908e-05, "loss": 0.4489, "num_input_tokens_seen": 1542820874, "step": 397, "train_runtime": 10478.2462, "train_tokens_per_second": 147240.372 }, { "epoch": 0.14118481731110322, "grad_norm": 0.40765777230262756, "learning_rate": 3.812327033126311e-05, "loss": 0.4475, "num_input_tokens_seen": 1546715335, "step": 398, "train_runtime": 10502.3072, "train_tokens_per_second": 147273.861 }, { "epoch": 0.14153955303299043, "grad_norm": 0.5503748655319214, "learning_rate": 3.81138124189983e-05, "loss": 0.4549, "num_input_tokens_seen": 1550549037, "step": 399, "train_runtime": 10531.1783, "train_tokens_per_second": 147234.145 }, { "epoch": 0.1418942887548776, "grad_norm": 0.5545937418937683, "learning_rate": 3.810433191393118e-05, "loss": 0.4439, "num_input_tokens_seen": 1554376236, "step": 400, "train_runtime": 10550.7949, "train_tokens_per_second": 147323.14 }, { "epoch": 0.14224902447676482, "grad_norm": 0.6318577527999878, "learning_rate": 3.8094828827886516e-05, "loss": 0.4487, "num_input_tokens_seen": 1558348110, "step": 401, "train_runtime": 10685.0073, "train_tokens_per_second": 145844.365 }, { "epoch": 0.142603760198652, "grad_norm": 0.5840385556221008, "learning_rate": 3.80853031727172e-05, "loss": 0.4597, "num_input_tokens_seen": 1562196391, "step": 402, "train_runtime": 10711.1385, "train_tokens_per_second": 145847.838 }, { "epoch": 0.1429584959205392, "grad_norm": 0.3575369715690613, "learning_rate": 3.8075754960304305e-05, "loss": 0.4462, "num_input_tokens_seen": 1566077134, "step": 403, "train_runtime": 10731.8042, "train_tokens_per_second": 145928.598 }, { "epoch": 0.1433132316424264, "grad_norm": 0.43962499499320984, "learning_rate": 3.8066184202557014e-05, "loss": 0.447, "num_input_tokens_seen": 1569989240, "step": 404, "train_runtime": 10764.6395, "train_tokens_per_second": 145846.894 }, { "epoch": 0.1436679673643136, "grad_norm": 0.5118480920791626, "learning_rate": 3.805659091141263e-05, "loss": 0.4475, "num_input_tokens_seen": 1573866858, "step": 405, "train_runtime": 10788.3907, "train_tokens_per_second": 145885.229 }, { "epoch": 0.14402270308620077, "grad_norm": 0.42756739258766174, "learning_rate": 3.804697509883659e-05, "loss": 0.4382, "num_input_tokens_seen": 1577798884, "step": 406, "train_runtime": 10823.2495, "train_tokens_per_second": 145778.667 }, { "epoch": 0.14437743880808798, "grad_norm": 0.48387327790260315, "learning_rate": 3.803733677682239e-05, "loss": 0.456, "num_input_tokens_seen": 1581620769, "step": 407, "train_runtime": 10850.4709, "train_tokens_per_second": 145765.173 }, { "epoch": 0.14473217452997517, "grad_norm": 0.4202973246574402, "learning_rate": 3.802767595739161e-05, "loss": 0.4443, "num_input_tokens_seen": 1585588159, "step": 408, "train_runtime": 10873.514, "train_tokens_per_second": 145821.135 }, { "epoch": 0.14508691025186235, "grad_norm": 0.39786994457244873, "learning_rate": 3.8017992652593904e-05, "loss": 0.4445, "num_input_tokens_seen": 1589414586, "step": 409, "train_runtime": 10903.367, "train_tokens_per_second": 145772.823 }, { "epoch": 0.14544164597374956, "grad_norm": 0.3251352310180664, "learning_rate": 3.800828687450696e-05, "loss": 0.443, "num_input_tokens_seen": 1593342820, "step": 410, "train_runtime": 10929.6867, "train_tokens_per_second": 145781.198 }, { "epoch": 0.14579638169563675, "grad_norm": 0.4169404208660126, "learning_rate": 3.799855863523648e-05, "loss": 0.4548, "num_input_tokens_seen": 1597219569, "step": 411, "train_runtime": 10964.2413, "train_tokens_per_second": 145675.338 }, { "epoch": 0.14615111741752396, "grad_norm": 0.41821253299713135, "learning_rate": 3.798880794691623e-05, "loss": 0.4486, "num_input_tokens_seen": 1601090266, "step": 412, "train_runtime": 10986.6388, "train_tokens_per_second": 145730.674 }, { "epoch": 0.14650585313941114, "grad_norm": 0.5936509370803833, "learning_rate": 3.797903482170791e-05, "loss": 0.4512, "num_input_tokens_seen": 1604948810, "step": 413, "train_runtime": 11017.3693, "train_tokens_per_second": 145674.414 }, { "epoch": 0.14686058886129832, "grad_norm": 0.33095014095306396, "learning_rate": 3.796923927180126e-05, "loss": 0.4527, "num_input_tokens_seen": 1608854472, "step": 414, "train_runtime": 11051.0321, "train_tokens_per_second": 145584.092 }, { "epoch": 0.14721532458318554, "grad_norm": 0.5824556946754456, "learning_rate": 3.7959421309413965e-05, "loss": 0.4424, "num_input_tokens_seen": 1612745299, "step": 415, "train_runtime": 11077.7922, "train_tokens_per_second": 145583.639 }, { "epoch": 0.14757006030507272, "grad_norm": 0.4411673843860626, "learning_rate": 3.794958094679166e-05, "loss": 0.4604, "num_input_tokens_seen": 1616643557, "step": 416, "train_runtime": 11098.1216, "train_tokens_per_second": 145668.215 }, { "epoch": 0.1479247960269599, "grad_norm": 0.39862850308418274, "learning_rate": 3.7939718196207946e-05, "loss": 0.4627, "num_input_tokens_seen": 1620498973, "step": 417, "train_runtime": 11124.2992, "train_tokens_per_second": 145672.006 }, { "epoch": 0.14827953174884712, "grad_norm": 0.45346176624298096, "learning_rate": 3.792983306996431e-05, "loss": 0.4616, "num_input_tokens_seen": 1624334240, "step": 418, "train_runtime": 11159.9669, "train_tokens_per_second": 145550.095 }, { "epoch": 0.1486342674707343, "grad_norm": 0.451163649559021, "learning_rate": 3.791992558039018e-05, "loss": 0.4469, "num_input_tokens_seen": 1628228518, "step": 419, "train_runtime": 11191.1239, "train_tokens_per_second": 145492.851 }, { "epoch": 0.1489890031926215, "grad_norm": 0.4565865695476532, "learning_rate": 3.790999573984285e-05, "loss": 0.4548, "num_input_tokens_seen": 1632150870, "step": 420, "train_runtime": 11217.4922, "train_tokens_per_second": 145500.512 }, { "epoch": 0.1493437389145087, "grad_norm": 0.8012377619743347, "learning_rate": 3.790004356070752e-05, "loss": 0.4483, "num_input_tokens_seen": 1636004972, "step": 421, "train_runtime": 11238.2179, "train_tokens_per_second": 145575.125 }, { "epoch": 0.14969847463639588, "grad_norm": 0.34871503710746765, "learning_rate": 3.789006905539722e-05, "loss": 0.4528, "num_input_tokens_seen": 1639897635, "step": 422, "train_runtime": 11260.6816, "train_tokens_per_second": 145630.407 }, { "epoch": 0.1500532103582831, "grad_norm": 0.36943575739860535, "learning_rate": 3.788007223635286e-05, "loss": 0.4504, "num_input_tokens_seen": 1643753084, "step": 423, "train_runtime": 11293.1721, "train_tokens_per_second": 145552.824 }, { "epoch": 0.15040794608017027, "grad_norm": 0.4908830225467682, "learning_rate": 3.787005311604317e-05, "loss": 0.4477, "num_input_tokens_seen": 1647623667, "step": 424, "train_runtime": 11315.7429, "train_tokens_per_second": 145604.551 }, { "epoch": 0.15076268180205746, "grad_norm": 0.3872721493244171, "learning_rate": 3.786001170696467e-05, "loss": 0.4681, "num_input_tokens_seen": 1651484826, "step": 425, "train_runtime": 11347.8221, "train_tokens_per_second": 145533.197 }, { "epoch": 0.15111741752394467, "grad_norm": 0.39296841621398926, "learning_rate": 3.784994802164171e-05, "loss": 0.4449, "num_input_tokens_seen": 1655433446, "step": 426, "train_runtime": 11369.6939, "train_tokens_per_second": 145600.529 }, { "epoch": 0.15147215324583185, "grad_norm": 0.3682827651500702, "learning_rate": 3.783986207262643e-05, "loss": 0.456, "num_input_tokens_seen": 1659324132, "step": 427, "train_runtime": 11399.3324, "train_tokens_per_second": 145563.273 }, { "epoch": 0.15182688896771904, "grad_norm": 0.32845428586006165, "learning_rate": 3.78297538724987e-05, "loss": 0.4465, "num_input_tokens_seen": 1663198892, "step": 428, "train_runtime": 11431.2063, "train_tokens_per_second": 145496.359 }, { "epoch": 0.15218162468960625, "grad_norm": 0.37449803948402405, "learning_rate": 3.781962343386619e-05, "loss": 0.4469, "num_input_tokens_seen": 1667049592, "step": 429, "train_runtime": 11456.8793, "train_tokens_per_second": 145506.428 }, { "epoch": 0.15253636041149343, "grad_norm": 0.44134941697120667, "learning_rate": 3.780947076936428e-05, "loss": 0.4502, "num_input_tokens_seen": 1670925465, "step": 430, "train_runtime": 11474.4533, "train_tokens_per_second": 145621.358 }, { "epoch": 0.15289109613338064, "grad_norm": 0.4290532171726227, "learning_rate": 3.779929589165607e-05, "loss": 0.4607, "num_input_tokens_seen": 1674854181, "step": 431, "train_runtime": 11500.9785, "train_tokens_per_second": 145627.104 }, { "epoch": 0.15324583185526783, "grad_norm": 0.4009090065956116, "learning_rate": 3.778909881343237e-05, "loss": 0.4514, "num_input_tokens_seen": 1678737292, "step": 432, "train_runtime": 11530.98, "train_tokens_per_second": 145584.963 }, { "epoch": 0.153600567577155, "grad_norm": 0.4232281446456909, "learning_rate": 3.777887954741169e-05, "loss": 0.4573, "num_input_tokens_seen": 1682698315, "step": 433, "train_runtime": 11553.9736, "train_tokens_per_second": 145638.061 }, { "epoch": 0.15395530329904222, "grad_norm": 0.42833954095840454, "learning_rate": 3.776863810634021e-05, "loss": 0.4522, "num_input_tokens_seen": 1686555825, "step": 434, "train_runtime": 11579.1875, "train_tokens_per_second": 145654.074 }, { "epoch": 0.1543100390209294, "grad_norm": 0.39111125469207764, "learning_rate": 3.775837450299176e-05, "loss": 0.4467, "num_input_tokens_seen": 1690456689, "step": 435, "train_runtime": 11598.9884, "train_tokens_per_second": 145741.735 }, { "epoch": 0.1546647747428166, "grad_norm": 0.33718404173851013, "learning_rate": 3.7748088750167824e-05, "loss": 0.4456, "num_input_tokens_seen": 1694321977, "step": 436, "train_runtime": 11627.1545, "train_tokens_per_second": 145721.12 }, { "epoch": 0.1550195104647038, "grad_norm": 0.3980780243873596, "learning_rate": 3.773778086069749e-05, "loss": 0.449, "num_input_tokens_seen": 1698184840, "step": 437, "train_runtime": 11648.0411, "train_tokens_per_second": 145791.454 }, { "epoch": 0.155374246186591, "grad_norm": 0.45191627740859985, "learning_rate": 3.772745084743749e-05, "loss": 0.4589, "num_input_tokens_seen": 1702087496, "step": 438, "train_runtime": 11679.6557, "train_tokens_per_second": 145730.965 }, { "epoch": 0.15572898190847817, "grad_norm": 0.4346320331096649, "learning_rate": 3.7717098723272116e-05, "loss": 0.4559, "num_input_tokens_seen": 1706030475, "step": 439, "train_runtime": 11705.706, "train_tokens_per_second": 145743.492 }, { "epoch": 0.15608371763036538, "grad_norm": 0.419439435005188, "learning_rate": 3.7706724501113265e-05, "loss": 0.4488, "num_input_tokens_seen": 1709804820, "step": 440, "train_runtime": 11732.8217, "train_tokens_per_second": 145728.356 }, { "epoch": 0.15643845335225257, "grad_norm": 0.3767973482608795, "learning_rate": 3.769632819390039e-05, "loss": 0.4494, "num_input_tokens_seen": 1713700923, "step": 441, "train_runtime": 11760.3811, "train_tokens_per_second": 145718.145 }, { "epoch": 0.15679318907413978, "grad_norm": 0.40142303705215454, "learning_rate": 3.768590981460047e-05, "loss": 0.4387, "num_input_tokens_seen": 1717609422, "step": 442, "train_runtime": 11790.4208, "train_tokens_per_second": 145678.382 }, { "epoch": 0.15714792479602696, "grad_norm": 0.3799995183944702, "learning_rate": 3.7675469376208054e-05, "loss": 0.4521, "num_input_tokens_seen": 1721452592, "step": 443, "train_runtime": 11816.3396, "train_tokens_per_second": 145684.082 }, { "epoch": 0.15750266051791414, "grad_norm": 0.2658306062221527, "learning_rate": 3.7665006891745156e-05, "loss": 0.4439, "num_input_tokens_seen": 1725343822, "step": 444, "train_runtime": 11837.5127, "train_tokens_per_second": 145752.226 }, { "epoch": 0.15785739623980136, "grad_norm": 0.32704290747642517, "learning_rate": 3.765452237426133e-05, "loss": 0.444, "num_input_tokens_seen": 1729246496, "step": 445, "train_runtime": 11864.7056, "train_tokens_per_second": 145747.105 }, { "epoch": 0.15821213196168854, "grad_norm": 0.38583970069885254, "learning_rate": 3.7644015836833575e-05, "loss": 0.4664, "num_input_tokens_seen": 1733090036, "step": 446, "train_runtime": 11885.5825, "train_tokens_per_second": 145814.48 }, { "epoch": 0.15856686768357572, "grad_norm": 0.4610465466976166, "learning_rate": 3.763348729256639e-05, "loss": 0.4529, "num_input_tokens_seen": 1736899860, "step": 447, "train_runtime": 11912.7404, "train_tokens_per_second": 145801.873 }, { "epoch": 0.15892160340546294, "grad_norm": 0.49127617478370667, "learning_rate": 3.7622936754591695e-05, "loss": 0.4453, "num_input_tokens_seen": 1740772229, "step": 448, "train_runtime": 11942.2446, "train_tokens_per_second": 145765.916 }, { "epoch": 0.15927633912735012, "grad_norm": 0.36213958263397217, "learning_rate": 3.7612364236068856e-05, "loss": 0.4548, "num_input_tokens_seen": 1744684553, "step": 449, "train_runtime": 11967.4667, "train_tokens_per_second": 145785.619 }, { "epoch": 0.15963107484923733, "grad_norm": 0.30055952072143555, "learning_rate": 3.760176975018465e-05, "loss": 0.4499, "num_input_tokens_seen": 1748616032, "step": 450, "train_runtime": 12006.2505, "train_tokens_per_second": 145642.141 }, { "epoch": 0.15998581057112451, "grad_norm": 0.37520644068717957, "learning_rate": 3.7591153310153255e-05, "loss": 0.4499, "num_input_tokens_seen": 1752559998, "step": 451, "train_runtime": 12031.5379, "train_tokens_per_second": 145663.839 }, { "epoch": 0.1603405462930117, "grad_norm": 0.45451244711875916, "learning_rate": 3.758051492921622e-05, "loss": 0.4426, "num_input_tokens_seen": 1756436388, "step": 452, "train_runtime": 12059.2751, "train_tokens_per_second": 145650.246 }, { "epoch": 0.1606952820148989, "grad_norm": 0.43047982454299927, "learning_rate": 3.756985462064249e-05, "loss": 0.4469, "num_input_tokens_seen": 1760295188, "step": 453, "train_runtime": 12085.0626, "train_tokens_per_second": 145658.757 }, { "epoch": 0.1610500177367861, "grad_norm": 0.3625195324420929, "learning_rate": 3.755917239772833e-05, "loss": 0.4471, "num_input_tokens_seen": 1764201429, "step": 454, "train_runtime": 12115.5391, "train_tokens_per_second": 145614.77 }, { "epoch": 0.16140475345867328, "grad_norm": 0.38840344548225403, "learning_rate": 3.7548468273797356e-05, "loss": 0.4478, "num_input_tokens_seen": 1768143901, "step": 455, "train_runtime": 12143.6493, "train_tokens_per_second": 145602.352 }, { "epoch": 0.1617594891805605, "grad_norm": 0.37198111414909363, "learning_rate": 3.753774226220047e-05, "loss": 0.4536, "num_input_tokens_seen": 1772029913, "step": 456, "train_runtime": 12165.2087, "train_tokens_per_second": 145663.75 }, { "epoch": 0.16211422490244767, "grad_norm": 0.34891775250434875, "learning_rate": 3.7526994376315934e-05, "loss": 0.4415, "num_input_tokens_seen": 1775925557, "step": 457, "train_runtime": 12194.2364, "train_tokens_per_second": 145636.472 }, { "epoch": 0.16246896062433486, "grad_norm": 0.36661437153816223, "learning_rate": 3.751622462954923e-05, "loss": 0.4315, "num_input_tokens_seen": 1779829512, "step": 458, "train_runtime": 12219.2852, "train_tokens_per_second": 145657.417 }, { "epoch": 0.16282369634622207, "grad_norm": 0.44350698590278625, "learning_rate": 3.750543303533313e-05, "loss": 0.4414, "num_input_tokens_seen": 1783724542, "step": 459, "train_runtime": 12250.4003, "train_tokens_per_second": 145605.408 }, { "epoch": 0.16317843206810925, "grad_norm": 0.5031194686889648, "learning_rate": 3.749461960712768e-05, "loss": 0.4574, "num_input_tokens_seen": 1787563774, "step": 460, "train_runtime": 12278.1337, "train_tokens_per_second": 145589.209 }, { "epoch": 0.16353316778999646, "grad_norm": 0.43463778495788574, "learning_rate": 3.7483784358420126e-05, "loss": 0.4409, "num_input_tokens_seen": 1791476657, "step": 461, "train_runtime": 12309.676, "train_tokens_per_second": 145534.022 }, { "epoch": 0.16388790351188365, "grad_norm": 0.37832731008529663, "learning_rate": 3.7472927302724926e-05, "loss": 0.4497, "num_input_tokens_seen": 1795297541, "step": 462, "train_runtime": 12335.8424, "train_tokens_per_second": 145535.059 }, { "epoch": 0.16424263923377083, "grad_norm": 0.5823975205421448, "learning_rate": 3.746204845358378e-05, "loss": 0.4455, "num_input_tokens_seen": 1799176699, "step": 463, "train_runtime": 12364.6578, "train_tokens_per_second": 145509.623 }, { "epoch": 0.16459737495565804, "grad_norm": 0.4312954246997833, "learning_rate": 3.745114782456553e-05, "loss": 0.4511, "num_input_tokens_seen": 1803069156, "step": 464, "train_runtime": 12396.1118, "train_tokens_per_second": 145454.412 }, { "epoch": 0.16495211067754523, "grad_norm": 0.31632131338119507, "learning_rate": 3.744022542926618e-05, "loss": 0.427, "num_input_tokens_seen": 1806893997, "step": 465, "train_runtime": 12419.1488, "train_tokens_per_second": 145492.579 }, { "epoch": 0.1653068463994324, "grad_norm": 0.31348514556884766, "learning_rate": 3.742928128130892e-05, "loss": 0.444, "num_input_tokens_seen": 1810744483, "step": 466, "train_runtime": 12444.5975, "train_tokens_per_second": 145504.464 }, { "epoch": 0.16566158212131962, "grad_norm": 0.41682156920433044, "learning_rate": 3.7418315394344044e-05, "loss": 0.4387, "num_input_tokens_seen": 1814689121, "step": 467, "train_runtime": 12473.7436, "train_tokens_per_second": 145480.714 }, { "epoch": 0.1660163178432068, "grad_norm": 0.4270724058151245, "learning_rate": 3.740732778204897e-05, "loss": 0.4378, "num_input_tokens_seen": 1818533940, "step": 468, "train_runtime": 12503.0984, "train_tokens_per_second": 145446.663 }, { "epoch": 0.16637105356509402, "grad_norm": 0.3687804639339447, "learning_rate": 3.73963184581282e-05, "loss": 0.4459, "num_input_tokens_seen": 1822458617, "step": 469, "train_runtime": 12541.6798, "train_tokens_per_second": 145312.163 }, { "epoch": 0.1667257892869812, "grad_norm": 0.48158982396125793, "learning_rate": 3.738528743631333e-05, "loss": 0.449, "num_input_tokens_seen": 1826326750, "step": 470, "train_runtime": 12567.6351, "train_tokens_per_second": 145319.842 }, { "epoch": 0.16708052500886839, "grad_norm": 0.3738994300365448, "learning_rate": 3.737423473036303e-05, "loss": 0.4435, "num_input_tokens_seen": 1830248472, "step": 471, "train_runtime": 12599.8894, "train_tokens_per_second": 145259.091 }, { "epoch": 0.1674352607307556, "grad_norm": 0.32823920249938965, "learning_rate": 3.7363160354062976e-05, "loss": 0.4454, "num_input_tokens_seen": 1834084939, "step": 472, "train_runtime": 12622.5711, "train_tokens_per_second": 145302.008 }, { "epoch": 0.16778999645264278, "grad_norm": 0.5778218507766724, "learning_rate": 3.73520643212259e-05, "loss": 0.456, "num_input_tokens_seen": 1837950919, "step": 473, "train_runtime": 12640.5723, "train_tokens_per_second": 145400.926 }, { "epoch": 0.16814473217452997, "grad_norm": 0.31276804208755493, "learning_rate": 3.734094664569156e-05, "loss": 0.4463, "num_input_tokens_seen": 1841831150, "step": 474, "train_runtime": 12661.3854, "train_tokens_per_second": 145468.375 }, { "epoch": 0.16849946789641718, "grad_norm": 0.3621354401111603, "learning_rate": 3.732980734132668e-05, "loss": 0.4467, "num_input_tokens_seen": 1845744233, "step": 475, "train_runtime": 12685.3669, "train_tokens_per_second": 145501.841 }, { "epoch": 0.16885420361830436, "grad_norm": 0.3967226445674896, "learning_rate": 3.731864642202498e-05, "loss": 0.447, "num_input_tokens_seen": 1849645100, "step": 476, "train_runtime": 12705.2808, "train_tokens_per_second": 145580.813 }, { "epoch": 0.16920893934019154, "grad_norm": 0.40200141072273254, "learning_rate": 3.730746390170713e-05, "loss": 0.447, "num_input_tokens_seen": 1853485430, "step": 477, "train_runtime": 12728.4273, "train_tokens_per_second": 145617.789 }, { "epoch": 0.16956367506207876, "grad_norm": 0.38998711109161377, "learning_rate": 3.729625979432074e-05, "loss": 0.4435, "num_input_tokens_seen": 1857348490, "step": 478, "train_runtime": 12754.5744, "train_tokens_per_second": 145622.145 }, { "epoch": 0.16991841078396594, "grad_norm": 0.38086843490600586, "learning_rate": 3.7285034113840346e-05, "loss": 0.4544, "num_input_tokens_seen": 1861200831, "step": 479, "train_runtime": 12778.4365, "train_tokens_per_second": 145651.686 }, { "epoch": 0.17027314650585315, "grad_norm": 0.47302645444869995, "learning_rate": 3.7273786874267405e-05, "loss": 0.4331, "num_input_tokens_seen": 1865138392, "step": 480, "train_runtime": 12805.8462, "train_tokens_per_second": 145647.415 }, { "epoch": 0.17062788222774034, "grad_norm": 0.3040490746498108, "learning_rate": 3.726251808963024e-05, "loss": 0.4444, "num_input_tokens_seen": 1869044972, "step": 481, "train_runtime": 12825.9331, "train_tokens_per_second": 145723.899 }, { "epoch": 0.17098261794962752, "grad_norm": 0.2945583760738373, "learning_rate": 3.725122777398408e-05, "loss": 0.4385, "num_input_tokens_seen": 1872951372, "step": 482, "train_runtime": 12846.1903, "train_tokens_per_second": 145798.196 }, { "epoch": 0.17133735367151473, "grad_norm": 0.3358820080757141, "learning_rate": 3.723991594141098e-05, "loss": 0.4523, "num_input_tokens_seen": 1876875379, "step": 483, "train_runtime": 12866.9461, "train_tokens_per_second": 145867.975 }, { "epoch": 0.17169208939340191, "grad_norm": 0.35176628828048706, "learning_rate": 3.7228582606019834e-05, "loss": 0.4405, "num_input_tokens_seen": 1880701876, "step": 484, "train_runtime": 12892.242, "train_tokens_per_second": 145878.573 }, { "epoch": 0.1720468251152891, "grad_norm": 0.3170473873615265, "learning_rate": 3.721722778194637e-05, "loss": 0.444, "num_input_tokens_seen": 1884666129, "step": 485, "train_runtime": 12917.3747, "train_tokens_per_second": 145901.639 }, { "epoch": 0.1724015608371763, "grad_norm": 0.3054451048374176, "learning_rate": 3.7205851483353105e-05, "loss": 0.4512, "num_input_tokens_seen": 1888488748, "step": 486, "train_runtime": 12936.493, "train_tokens_per_second": 145981.507 }, { "epoch": 0.1727562965590635, "grad_norm": 0.3031492233276367, "learning_rate": 3.7194453724429356e-05, "loss": 0.4438, "num_input_tokens_seen": 1892435070, "step": 487, "train_runtime": 12961.6838, "train_tokens_per_second": 146002.255 }, { "epoch": 0.1731110322809507, "grad_norm": 0.30045071244239807, "learning_rate": 3.7183034519391204e-05, "loss": 0.4363, "num_input_tokens_seen": 1896317888, "step": 488, "train_runtime": 12982.9121, "train_tokens_per_second": 146062.6 }, { "epoch": 0.1734657680028379, "grad_norm": 0.3359653353691101, "learning_rate": 3.7171593882481455e-05, "loss": 0.4267, "num_input_tokens_seen": 1900255764, "step": 489, "train_runtime": 13010.9205, "train_tokens_per_second": 146050.833 }, { "epoch": 0.17382050372472507, "grad_norm": 0.3404531478881836, "learning_rate": 3.716013182796967e-05, "loss": 0.4455, "num_input_tokens_seen": 1904118468, "step": 490, "train_runtime": 13040.2403, "train_tokens_per_second": 146018.664 }, { "epoch": 0.17417523944661228, "grad_norm": 2.184518575668335, "learning_rate": 3.7148648370152134e-05, "loss": 0.4476, "num_input_tokens_seen": 1908017552, "step": 491, "train_runtime": 13064.1779, "train_tokens_per_second": 146049.569 }, { "epoch": 0.17452997516849947, "grad_norm": 0.37678197026252747, "learning_rate": 3.7137143523351787e-05, "loss": 0.4321, "num_input_tokens_seen": 1911952013, "step": 492, "train_runtime": 13097.6834, "train_tokens_per_second": 145976.35 }, { "epoch": 0.17488471089038665, "grad_norm": 0.5353854894638062, "learning_rate": 3.712561730191829e-05, "loss": 0.4501, "num_input_tokens_seen": 1915806564, "step": 493, "train_runtime": 13123.2979, "train_tokens_per_second": 145985.146 }, { "epoch": 0.17523944661227386, "grad_norm": 0.5258532166481018, "learning_rate": 3.7114069720227934e-05, "loss": 0.4505, "num_input_tokens_seen": 1919708862, "step": 494, "train_runtime": 13147.4772, "train_tokens_per_second": 146013.477 }, { "epoch": 0.17559418233416105, "grad_norm": 0.46366867423057556, "learning_rate": 3.710250079268367e-05, "loss": 0.4396, "num_input_tokens_seen": 1923649394, "step": 495, "train_runtime": 13166.0402, "train_tokens_per_second": 146106.905 }, { "epoch": 0.17594891805604823, "grad_norm": 0.35830312967300415, "learning_rate": 3.7090910533715055e-05, "loss": 0.4484, "num_input_tokens_seen": 1927501397, "step": 496, "train_runtime": 13187.6735, "train_tokens_per_second": 146159.32 }, { "epoch": 0.17630365377793544, "grad_norm": 0.3890186846256256, "learning_rate": 3.7079298957778274e-05, "loss": 0.4507, "num_input_tokens_seen": 1931326737, "step": 497, "train_runtime": 13215.1164, "train_tokens_per_second": 146145.269 }, { "epoch": 0.17665838949982263, "grad_norm": 0.44150644540786743, "learning_rate": 3.7067666079356096e-05, "loss": 0.445, "num_input_tokens_seen": 1935182211, "step": 498, "train_runtime": 13240.7681, "train_tokens_per_second": 146153.319 }, { "epoch": 0.17701312522170984, "grad_norm": 0.3966057002544403, "learning_rate": 3.7056011912957836e-05, "loss": 0.4513, "num_input_tokens_seen": 1939126881, "step": 499, "train_runtime": 13267.5126, "train_tokens_per_second": 146156.024 }, { "epoch": 0.17736786094359702, "grad_norm": 0.34988221526145935, "learning_rate": 3.7044336473119386e-05, "loss": 0.4407, "num_input_tokens_seen": 1942987504, "step": 500, "train_runtime": 13288.6534, "train_tokens_per_second": 146214.025 }, { "epoch": 0.1777225966654842, "grad_norm": 0.38208162784576416, "learning_rate": 3.7032639774403174e-05, "loss": 0.4427, "num_input_tokens_seen": 1946844519, "step": 501, "train_runtime": 13321.1167, "train_tokens_per_second": 146147.246 }, { "epoch": 0.17807733238737142, "grad_norm": 0.41595372557640076, "learning_rate": 3.702092183139811e-05, "loss": 0.4536, "num_input_tokens_seen": 1950793502, "step": 502, "train_runtime": 13343.3276, "train_tokens_per_second": 146199.926 }, { "epoch": 0.1784320681092586, "grad_norm": 0.4240153729915619, "learning_rate": 3.700918265871964e-05, "loss": 0.4346, "num_input_tokens_seen": 1954693664, "step": 503, "train_runtime": 13370.6452, "train_tokens_per_second": 146192.92 }, { "epoch": 0.17878680383114579, "grad_norm": 0.3719361126422882, "learning_rate": 3.699742227100968e-05, "loss": 0.4463, "num_input_tokens_seen": 1958529629, "step": 504, "train_runtime": 13389.85, "train_tokens_per_second": 146269.721 }, { "epoch": 0.179141539553033, "grad_norm": 0.3351946771144867, "learning_rate": 3.6985640682936594e-05, "loss": 0.4475, "num_input_tokens_seen": 1962511791, "step": 505, "train_runtime": 13416.8764, "train_tokens_per_second": 146271.885 }, { "epoch": 0.17949627527492018, "grad_norm": 0.3888389468193054, "learning_rate": 3.697383790919519e-05, "loss": 0.4431, "num_input_tokens_seen": 1966351497, "step": 506, "train_runtime": 13439.3498, "train_tokens_per_second": 146312.993 }, { "epoch": 0.1798510109968074, "grad_norm": 0.4288236200809479, "learning_rate": 3.6962013964506705e-05, "loss": 0.4436, "num_input_tokens_seen": 1970307969, "step": 507, "train_runtime": 13466.3543, "train_tokens_per_second": 146313.392 }, { "epoch": 0.18020574671869458, "grad_norm": 0.6455765962600708, "learning_rate": 3.6950168863618784e-05, "loss": 0.4457, "num_input_tokens_seen": 1974121293, "step": 508, "train_runtime": 13493.863, "train_tokens_per_second": 146297.712 }, { "epoch": 0.18056048244058176, "grad_norm": 0.34024563431739807, "learning_rate": 3.6938302621305474e-05, "loss": 0.4435, "num_input_tokens_seen": 1978025404, "step": 509, "train_runtime": 13526.773, "train_tokens_per_second": 146230.398 }, { "epoch": 0.18091521816246897, "grad_norm": 0.44953852891921997, "learning_rate": 3.692641525236715e-05, "loss": 0.4428, "num_input_tokens_seen": 1981905162, "step": 510, "train_runtime": 13554.724, "train_tokens_per_second": 146215.088 }, { "epoch": 0.18126995388435616, "grad_norm": 0.3346947133541107, "learning_rate": 3.691450677163057e-05, "loss": 0.4492, "num_input_tokens_seen": 1985771146, "step": 511, "train_runtime": 13575.4833, "train_tokens_per_second": 146276.276 }, { "epoch": 0.18162468960624334, "grad_norm": 0.34466466307640076, "learning_rate": 3.690257719394883e-05, "loss": 0.4545, "num_input_tokens_seen": 1989721776, "step": 512, "train_runtime": 13601.5359, "train_tokens_per_second": 146286.551 }, { "epoch": 0.18197942532813055, "grad_norm": 0.4090231955051422, "learning_rate": 3.689062653420132e-05, "loss": 0.4411, "num_input_tokens_seen": 1993605505, "step": 513, "train_runtime": 13621.2552, "train_tokens_per_second": 146359.897 }, { "epoch": 0.18233416105001773, "grad_norm": 0.7770067453384399, "learning_rate": 3.687865480729371e-05, "loss": 0.4432, "num_input_tokens_seen": 1997430764, "step": 514, "train_runtime": 13654.1159, "train_tokens_per_second": 146287.814 }, { "epoch": 0.18268889677190492, "grad_norm": 0.42496374249458313, "learning_rate": 3.6866662028158e-05, "loss": 0.441, "num_input_tokens_seen": 2001351737, "step": 515, "train_runtime": 13679.9916, "train_tokens_per_second": 146297.731 }, { "epoch": 0.18304363249379213, "grad_norm": 0.4014492332935333, "learning_rate": 3.685464821175239e-05, "loss": 0.4299, "num_input_tokens_seen": 2005235752, "step": 516, "train_runtime": 13706.9638, "train_tokens_per_second": 146293.211 }, { "epoch": 0.1833983682156793, "grad_norm": 0.4270946979522705, "learning_rate": 3.6842613373061365e-05, "loss": 0.4376, "num_input_tokens_seen": 2009155858, "step": 517, "train_runtime": 13727.5768, "train_tokens_per_second": 146359.105 }, { "epoch": 0.18375310393756653, "grad_norm": 0.5136831998825073, "learning_rate": 3.683055752709559e-05, "loss": 0.4453, "num_input_tokens_seen": 2013084710, "step": 518, "train_runtime": 13755.6388, "train_tokens_per_second": 146346.145 }, { "epoch": 0.1841078396594537, "grad_norm": 0.41497254371643066, "learning_rate": 3.681848068889196e-05, "loss": 0.4417, "num_input_tokens_seen": 2016964767, "step": 519, "train_runtime": 13778.8482, "train_tokens_per_second": 146381.232 }, { "epoch": 0.1844625753813409, "grad_norm": 0.32113608717918396, "learning_rate": 3.680638287351355e-05, "loss": 0.4424, "num_input_tokens_seen": 2020902407, "step": 520, "train_runtime": 13812.3475, "train_tokens_per_second": 146311.292 }, { "epoch": 0.1848173111032281, "grad_norm": 0.3658073842525482, "learning_rate": 3.67942640960496e-05, "loss": 0.4476, "num_input_tokens_seen": 2024717598, "step": 521, "train_runtime": 13841.1765, "train_tokens_per_second": 146282.189 }, { "epoch": 0.1851720468251153, "grad_norm": 0.3903835415840149, "learning_rate": 3.6782124371615465e-05, "loss": 0.437, "num_input_tokens_seen": 2028648122, "step": 522, "train_runtime": 13871.2306, "train_tokens_per_second": 146248.605 }, { "epoch": 0.18552678254700247, "grad_norm": 0.4141230881214142, "learning_rate": 3.676996371535268e-05, "loss": 0.4404, "num_input_tokens_seen": 2032476940, "step": 523, "train_runtime": 13899.0189, "train_tokens_per_second": 146231.684 }, { "epoch": 0.18588151826888968, "grad_norm": 0.3542817234992981, "learning_rate": 3.675778214242883e-05, "loss": 0.4269, "num_input_tokens_seen": 2036397384, "step": 524, "train_runtime": 13932.5533, "train_tokens_per_second": 146161.105 }, { "epoch": 0.18623625399077687, "grad_norm": 0.369373619556427, "learning_rate": 3.6745579668037625e-05, "loss": 0.446, "num_input_tokens_seen": 2040219095, "step": 525, "train_runtime": 13965.2762, "train_tokens_per_second": 146092.284 }, { "epoch": 0.18659098971266405, "grad_norm": 0.45926445722579956, "learning_rate": 3.673335630739885e-05, "loss": 0.4359, "num_input_tokens_seen": 2044046297, "step": 526, "train_runtime": 13995.8522, "train_tokens_per_second": 146046.577 }, { "epoch": 0.18694572543455126, "grad_norm": 0.35391315817832947, "learning_rate": 3.6721112075758325e-05, "loss": 0.4434, "num_input_tokens_seen": 2047881306, "step": 527, "train_runtime": 14029.4563, "train_tokens_per_second": 145970.112 }, { "epoch": 0.18730046115643845, "grad_norm": 0.41480588912963867, "learning_rate": 3.670884698838789e-05, "loss": 0.4382, "num_input_tokens_seen": 2051740648, "step": 528, "train_runtime": 14068.244, "train_tokens_per_second": 145841.987 }, { "epoch": 0.18765519687832566, "grad_norm": 0.3678593635559082, "learning_rate": 3.6696561060585424e-05, "loss": 0.4459, "num_input_tokens_seen": 2055630105, "step": 529, "train_runtime": 14091.342, "train_tokens_per_second": 145878.945 }, { "epoch": 0.18800993260021284, "grad_norm": 0.3168484568595886, "learning_rate": 3.668425430767479e-05, "loss": 0.4541, "num_input_tokens_seen": 2059505197, "step": 530, "train_runtime": 14118.407, "train_tokens_per_second": 145873.766 }, { "epoch": 0.18836466832210003, "grad_norm": 0.3465515971183777, "learning_rate": 3.667192674500581e-05, "loss": 0.4425, "num_input_tokens_seen": 2063418951, "step": 531, "train_runtime": 14141.3993, "train_tokens_per_second": 145913.35 }, { "epoch": 0.18871940404398724, "grad_norm": 0.38155362010002136, "learning_rate": 3.665957838795429e-05, "loss": 0.4307, "num_input_tokens_seen": 2067319040, "step": 532, "train_runtime": 14163.7449, "train_tokens_per_second": 145958.506 }, { "epoch": 0.18907413976587442, "grad_norm": 0.3624736964702606, "learning_rate": 3.664720925192193e-05, "loss": 0.4441, "num_input_tokens_seen": 2071128768, "step": 533, "train_runtime": 14194.4823, "train_tokens_per_second": 145910.835 }, { "epoch": 0.1894288754877616, "grad_norm": 0.3383430242538452, "learning_rate": 3.663481935233641e-05, "loss": 0.4361, "num_input_tokens_seen": 2074994819, "step": 534, "train_runtime": 14217.5623, "train_tokens_per_second": 145945.892 }, { "epoch": 0.18978361120964882, "grad_norm": 0.36401423811912537, "learning_rate": 3.6622408704651254e-05, "loss": 0.446, "num_input_tokens_seen": 2078908874, "step": 535, "train_runtime": 14244.9916, "train_tokens_per_second": 145939.635 }, { "epoch": 0.190138346931536, "grad_norm": 0.3691839873790741, "learning_rate": 3.660997732434588e-05, "loss": 0.4661, "num_input_tokens_seen": 2082826663, "step": 536, "train_runtime": 14278.9166, "train_tokens_per_second": 145867.275 }, { "epoch": 0.1904930826534232, "grad_norm": 0.3884320855140686, "learning_rate": 3.6597525226925566e-05, "loss": 0.4342, "num_input_tokens_seen": 2086649186, "step": 537, "train_runtime": 14304.4925, "train_tokens_per_second": 145873.695 }, { "epoch": 0.1908478183753104, "grad_norm": 0.40589696168899536, "learning_rate": 3.6585052427921436e-05, "loss": 0.4555, "num_input_tokens_seen": 2090565474, "step": 538, "train_runtime": 14333.9094, "train_tokens_per_second": 145847.543 }, { "epoch": 0.19120255409719758, "grad_norm": 0.2990073561668396, "learning_rate": 3.657255894289043e-05, "loss": 0.4444, "num_input_tokens_seen": 2094394287, "step": 539, "train_runtime": 14370.2693, "train_tokens_per_second": 145744.957 }, { "epoch": 0.1915572898190848, "grad_norm": 0.3332250714302063, "learning_rate": 3.656004478741528e-05, "loss": 0.4321, "num_input_tokens_seen": 2098256308, "step": 540, "train_runtime": 14402.9601, "train_tokens_per_second": 145682.297 }, { "epoch": 0.19191202554097198, "grad_norm": 0.34724074602127075, "learning_rate": 3.6547509977104526e-05, "loss": 0.4301, "num_input_tokens_seen": 2102218719, "step": 541, "train_runtime": 14435.5085, "train_tokens_per_second": 145628.31 }, { "epoch": 0.19226676126285916, "grad_norm": 0.41705313324928284, "learning_rate": 3.653495452759243e-05, "loss": 0.4408, "num_input_tokens_seen": 2106069175, "step": 542, "train_runtime": 14462.874, "train_tokens_per_second": 145618.995 }, { "epoch": 0.19262149698474637, "grad_norm": 0.3085917532444, "learning_rate": 3.652237845453903e-05, "loss": 0.4386, "num_input_tokens_seen": 2109942587, "step": 543, "train_runtime": 14495.9299, "train_tokens_per_second": 145554.138 }, { "epoch": 0.19297623270663355, "grad_norm": 0.33952146768569946, "learning_rate": 3.650978177363008e-05, "loss": 0.4435, "num_input_tokens_seen": 2113847439, "step": 544, "train_runtime": 14523.5158, "train_tokens_per_second": 145546.538 }, { "epoch": 0.19333096842852074, "grad_norm": 1.7973573207855225, "learning_rate": 3.6497164500577026e-05, "loss": 0.433, "num_input_tokens_seen": 2117710207, "step": 545, "train_runtime": 14560.7349, "train_tokens_per_second": 145439.789 }, { "epoch": 0.19368570415040795, "grad_norm": 0.3573378920555115, "learning_rate": 3.6484526651117e-05, "loss": 0.4354, "num_input_tokens_seen": 2121591315, "step": 546, "train_runtime": 14577.5706, "train_tokens_per_second": 145538.058 }, { "epoch": 0.19404043987229513, "grad_norm": 0.47980356216430664, "learning_rate": 3.647186824101282e-05, "loss": 0.4531, "num_input_tokens_seen": 2125527831, "step": 547, "train_runtime": 14598.699, "train_tokens_per_second": 145597.072 }, { "epoch": 0.19439517559418235, "grad_norm": 0.4583019018173218, "learning_rate": 3.645918928605293e-05, "loss": 0.4393, "num_input_tokens_seen": 2129407326, "step": 548, "train_runtime": 14629.1341, "train_tokens_per_second": 145559.355 }, { "epoch": 0.19474991131606953, "grad_norm": 0.40817567706108093, "learning_rate": 3.6446489802051385e-05, "loss": 0.4394, "num_input_tokens_seen": 2133312970, "step": 549, "train_runtime": 14662.7819, "train_tokens_per_second": 145491.693 }, { "epoch": 0.1951046470379567, "grad_norm": 0.31258949637413025, "learning_rate": 3.643376980484788e-05, "loss": 0.4279, "num_input_tokens_seen": 2137221220, "step": 550, "train_runtime": 14690.1657, "train_tokens_per_second": 145486.529 }, { "epoch": 0.19545938275984392, "grad_norm": 0.47751858830451965, "learning_rate": 3.642102931030766e-05, "loss": 0.44, "num_input_tokens_seen": 2141092551, "step": 551, "train_runtime": 14716.1399, "train_tokens_per_second": 145492.81 }, { "epoch": 0.1958141184817311, "grad_norm": 0.4963304102420807, "learning_rate": 3.640826833432157e-05, "loss": 0.4518, "num_input_tokens_seen": 2144967165, "step": 552, "train_runtime": 14743.9916, "train_tokens_per_second": 145480.765 }, { "epoch": 0.1961688542036183, "grad_norm": 0.4628588557243347, "learning_rate": 3.639548689280598e-05, "loss": 0.4375, "num_input_tokens_seen": 2148822040, "step": 553, "train_runtime": 14764.9391, "train_tokens_per_second": 145535.449 }, { "epoch": 0.1965235899255055, "grad_norm": 0.42071136832237244, "learning_rate": 3.638268500170277e-05, "loss": 0.4403, "num_input_tokens_seen": 2152697725, "step": 554, "train_runtime": 14788.6984, "train_tokens_per_second": 145563.705 }, { "epoch": 0.1968783256473927, "grad_norm": 0.3706282675266266, "learning_rate": 3.6369862676979375e-05, "loss": 0.4468, "num_input_tokens_seen": 2156554163, "step": 555, "train_runtime": 14811.2393, "train_tokens_per_second": 145602.547 }, { "epoch": 0.1972330613692799, "grad_norm": 0.3890003263950348, "learning_rate": 3.635701993462867e-05, "loss": 0.4249, "num_input_tokens_seen": 2160516880, "step": 556, "train_runtime": 14835.3912, "train_tokens_per_second": 145632.619 }, { "epoch": 0.19758779709116708, "grad_norm": 0.5130414366722107, "learning_rate": 3.634415679066902e-05, "loss": 0.4449, "num_input_tokens_seen": 2164376951, "step": 557, "train_runtime": 14857.5408, "train_tokens_per_second": 145675.316 }, { "epoch": 0.19794253281305427, "grad_norm": 0.44610396027565, "learning_rate": 3.633127326114422e-05, "loss": 0.4422, "num_input_tokens_seen": 2168239746, "step": 558, "train_runtime": 14881.1074, "train_tokens_per_second": 145704.194 }, { "epoch": 0.19829726853494148, "grad_norm": 0.3673466742038727, "learning_rate": 3.6318369362123515e-05, "loss": 0.4418, "num_input_tokens_seen": 2172168743, "step": 559, "train_runtime": 14902.515, "train_tokens_per_second": 145758.534 }, { "epoch": 0.19865200425682866, "grad_norm": 0.4019934833049774, "learning_rate": 3.630544510970153e-05, "loss": 0.4353, "num_input_tokens_seen": 2176014549, "step": 560, "train_runtime": 14930.2568, "train_tokens_per_second": 145745.286 }, { "epoch": 0.19900673997871585, "grad_norm": 0.36611777544021606, "learning_rate": 3.6292500519998295e-05, "loss": 0.4319, "num_input_tokens_seen": 2179888925, "step": 561, "train_runtime": 14953.2015, "train_tokens_per_second": 145780.749 }, { "epoch": 0.19936147570060306, "grad_norm": 0.30342233180999756, "learning_rate": 3.6279535609159193e-05, "loss": 0.427, "num_input_tokens_seen": 2183756147, "step": 562, "train_runtime": 14973.9987, "train_tokens_per_second": 145836.54 }, { "epoch": 0.19971621142249024, "grad_norm": 0.37788841128349304, "learning_rate": 3.626655039335497e-05, "loss": 0.4423, "num_input_tokens_seen": 2187678699, "step": 563, "train_runtime": 15002.1498, "train_tokens_per_second": 145824.347 }, { "epoch": 0.20007094714437743, "grad_norm": 0.4012748897075653, "learning_rate": 3.625354488878168e-05, "loss": 0.4475, "num_input_tokens_seen": 2191549311, "step": 564, "train_runtime": 15029.1967, "train_tokens_per_second": 145819.458 }, { "epoch": 0.20042568286626464, "grad_norm": 0.42396849393844604, "learning_rate": 3.6240519111660686e-05, "loss": 0.4421, "num_input_tokens_seen": 2195496032, "step": 565, "train_runtime": 15061.658, "train_tokens_per_second": 145767.221 }, { "epoch": 0.20078041858815182, "grad_norm": 0.30875110626220703, "learning_rate": 3.622747307823865e-05, "loss": 0.4469, "num_input_tokens_seen": 2199290908, "step": 566, "train_runtime": 15081.5588, "train_tokens_per_second": 145826.499 }, { "epoch": 0.20113515431003903, "grad_norm": 0.3458799123764038, "learning_rate": 3.6214406804787484e-05, "loss": 0.4466, "num_input_tokens_seen": 2203146108, "step": 567, "train_runtime": 15101.5581, "train_tokens_per_second": 145888.663 }, { "epoch": 0.20148989003192622, "grad_norm": 0.32859525084495544, "learning_rate": 3.620132030760435e-05, "loss": 0.4358, "num_input_tokens_seen": 2207022945, "step": 568, "train_runtime": 15128.469, "train_tokens_per_second": 145885.413 }, { "epoch": 0.2018446257538134, "grad_norm": 0.36493226885795593, "learning_rate": 3.618821360301163e-05, "loss": 0.4328, "num_input_tokens_seen": 2210908618, "step": 569, "train_runtime": 15154.8441, "train_tokens_per_second": 145887.916 }, { "epoch": 0.2021993614757006, "grad_norm": 0.3770865499973297, "learning_rate": 3.617508670735692e-05, "loss": 0.4357, "num_input_tokens_seen": 2214749537, "step": 570, "train_runtime": 15176.9369, "train_tokens_per_second": 145928.625 }, { "epoch": 0.2025540971975878, "grad_norm": 0.38569116592407227, "learning_rate": 3.6161939637012995e-05, "loss": 0.4382, "num_input_tokens_seen": 2218765240, "step": 571, "train_runtime": 15198.92, "train_tokens_per_second": 145981.769 }, { "epoch": 0.20290883291947498, "grad_norm": 0.46096622943878174, "learning_rate": 3.614877240837779e-05, "loss": 0.4345, "num_input_tokens_seen": 2222605564, "step": 572, "train_runtime": 15223.3014, "train_tokens_per_second": 146000.234 }, { "epoch": 0.2032635686413622, "grad_norm": 0.3546644449234009, "learning_rate": 3.6135585037874386e-05, "loss": 0.4368, "num_input_tokens_seen": 2226517525, "step": 573, "train_runtime": 15255.0391, "train_tokens_per_second": 145952.921 }, { "epoch": 0.20361830436324937, "grad_norm": 0.42111048102378845, "learning_rate": 3.612237754195098e-05, "loss": 0.4449, "num_input_tokens_seen": 2230398056, "step": 574, "train_runtime": 15280.6023, "train_tokens_per_second": 145962.705 }, { "epoch": 0.2039730400851366, "grad_norm": 0.5152305364608765, "learning_rate": 3.610914993708089e-05, "loss": 0.4357, "num_input_tokens_seen": 2234227987, "step": 575, "train_runtime": 15302.6737, "train_tokens_per_second": 146002.459 }, { "epoch": 0.20432777580702377, "grad_norm": 0.38833945989608765, "learning_rate": 3.609590223976248e-05, "loss": 0.4428, "num_input_tokens_seen": 2238145825, "step": 576, "train_runtime": 15328.6523, "train_tokens_per_second": 146010.607 }, { "epoch": 0.20468251152891095, "grad_norm": 0.3119691014289856, "learning_rate": 3.608263446651922e-05, "loss": 0.4407, "num_input_tokens_seen": 2241966526, "step": 577, "train_runtime": 15362.062, "train_tokens_per_second": 145941.77 }, { "epoch": 0.20503724725079817, "grad_norm": 0.3151058852672577, "learning_rate": 3.606934663389957e-05, "loss": 0.4296, "num_input_tokens_seen": 2245860578, "step": 578, "train_runtime": 15396.9743, "train_tokens_per_second": 145863.761 }, { "epoch": 0.20539198297268535, "grad_norm": 0.38438719511032104, "learning_rate": 3.605603875847707e-05, "loss": 0.4425, "num_input_tokens_seen": 2249753091, "step": 579, "train_runtime": 15422.7025, "train_tokens_per_second": 145872.819 }, { "epoch": 0.20574671869457253, "grad_norm": 0.4980340898036957, "learning_rate": 3.604271085685019e-05, "loss": 0.4504, "num_input_tokens_seen": 2253594846, "step": 580, "train_runtime": 15444.9036, "train_tokens_per_second": 145911.875 }, { "epoch": 0.20610145441645975, "grad_norm": 0.3519046902656555, "learning_rate": 3.6029362945642436e-05, "loss": 0.4338, "num_input_tokens_seen": 2257459863, "step": 581, "train_runtime": 15483.4228, "train_tokens_per_second": 145798.503 }, { "epoch": 0.20645619013834693, "grad_norm": 0.3642440438270569, "learning_rate": 3.601599504150224e-05, "loss": 0.4344, "num_input_tokens_seen": 2261312487, "step": 582, "train_runtime": 15516.0169, "train_tokens_per_second": 145740.528 }, { "epoch": 0.2068109258602341, "grad_norm": 0.3887627124786377, "learning_rate": 3.600260716110298e-05, "loss": 0.4426, "num_input_tokens_seen": 2265190415, "step": 583, "train_runtime": 15547.4449, "train_tokens_per_second": 145695.35 }, { "epoch": 0.20716566158212132, "grad_norm": 0.35812249779701233, "learning_rate": 3.598919932114294e-05, "loss": 0.4494, "num_input_tokens_seen": 2269103037, "step": 584, "train_runtime": 15571.8732, "train_tokens_per_second": 145718.052 }, { "epoch": 0.2075203973040085, "grad_norm": 0.3030877709388733, "learning_rate": 3.5975771538345325e-05, "loss": 0.4285, "num_input_tokens_seen": 2272939749, "step": 585, "train_runtime": 15601.6232, "train_tokens_per_second": 145686.108 }, { "epoch": 0.20787513302589572, "grad_norm": 0.3823722004890442, "learning_rate": 3.5962323829458175e-05, "loss": 0.4377, "num_input_tokens_seen": 2276886234, "step": 586, "train_runtime": 15633.9909, "train_tokens_per_second": 145636.917 }, { "epoch": 0.2082298687477829, "grad_norm": 0.2990856468677521, "learning_rate": 3.594885621125442e-05, "loss": 0.4401, "num_input_tokens_seen": 2280723067, "step": 587, "train_runtime": 15655.2017, "train_tokens_per_second": 145684.681 }, { "epoch": 0.2085846044696701, "grad_norm": 0.3197517693042755, "learning_rate": 3.59353687005318e-05, "loss": 0.4353, "num_input_tokens_seen": 2284601486, "step": 588, "train_runtime": 15688.3639, "train_tokens_per_second": 145623.948 }, { "epoch": 0.2089393401915573, "grad_norm": 0.38753095269203186, "learning_rate": 3.592186131411288e-05, "loss": 0.4442, "num_input_tokens_seen": 2288508212, "step": 589, "train_runtime": 15715.11, "train_tokens_per_second": 145624.703 }, { "epoch": 0.20929407591344448, "grad_norm": 0.3741745054721832, "learning_rate": 3.5908334068845e-05, "loss": 0.4279, "num_input_tokens_seen": 2292378557, "step": 590, "train_runtime": 15743.1764, "train_tokens_per_second": 145610.93 }, { "epoch": 0.20964881163533167, "grad_norm": 0.40955662727355957, "learning_rate": 3.589478698160028e-05, "loss": 0.4554, "num_input_tokens_seen": 2296221300, "step": 591, "train_runtime": 15778.6972, "train_tokens_per_second": 145526.672 }, { "epoch": 0.21000354735721888, "grad_norm": 0.3269948959350586, "learning_rate": 3.58812200692756e-05, "loss": 0.4304, "num_input_tokens_seen": 2300160836, "step": 592, "train_runtime": 15805.6993, "train_tokens_per_second": 145527.306 }, { "epoch": 0.21035828307910606, "grad_norm": 0.4231071174144745, "learning_rate": 3.586763334879252e-05, "loss": 0.4327, "num_input_tokens_seen": 2304095882, "step": 593, "train_runtime": 15829.0896, "train_tokens_per_second": 145560.859 }, { "epoch": 0.21071301880099327, "grad_norm": 0.3181189298629761, "learning_rate": 3.5854026837097377e-05, "loss": 0.4496, "num_input_tokens_seen": 2308014153, "step": 594, "train_runtime": 15857.5029, "train_tokens_per_second": 145547.138 }, { "epoch": 0.21106775452288046, "grad_norm": 0.37563279271125793, "learning_rate": 3.584040055116113e-05, "loss": 0.4441, "num_input_tokens_seen": 2311947668, "step": 595, "train_runtime": 15882.5836, "train_tokens_per_second": 145564.961 }, { "epoch": 0.21142249024476764, "grad_norm": 0.36154329776763916, "learning_rate": 3.582675450797944e-05, "loss": 0.4274, "num_input_tokens_seen": 2315794867, "step": 596, "train_runtime": 15912.3657, "train_tokens_per_second": 145534.292 }, { "epoch": 0.21177722596665485, "grad_norm": 0.4388446509838104, "learning_rate": 3.5813088724572595e-05, "loss": 0.4483, "num_input_tokens_seen": 2319731232, "step": 597, "train_runtime": 15933.6184, "train_tokens_per_second": 145587.221 }, { "epoch": 0.21213196168854204, "grad_norm": 0.31233885884284973, "learning_rate": 3.579940321798551e-05, "loss": 0.4351, "num_input_tokens_seen": 2323619239, "step": 598, "train_runtime": 15955.6538, "train_tokens_per_second": 145629.835 }, { "epoch": 0.21248669741042922, "grad_norm": 0.3549239933490753, "learning_rate": 3.578569800528769e-05, "loss": 0.4377, "num_input_tokens_seen": 2327579693, "step": 599, "train_runtime": 15982.2581, "train_tokens_per_second": 145635.221 }, { "epoch": 0.21284143313231643, "grad_norm": 0.3239596486091614, "learning_rate": 3.5771973103573226e-05, "loss": 0.4406, "num_input_tokens_seen": 2331454195, "step": 600, "train_runtime": 16008.0828, "train_tokens_per_second": 145642.312 }, { "epoch": 0.21319616885420362, "grad_norm": 0.3380473554134369, "learning_rate": 3.5758228529960776e-05, "loss": 0.4363, "num_input_tokens_seen": 2335373963, "step": 601, "train_runtime": 16134.9224, "train_tokens_per_second": 144740.328 }, { "epoch": 0.2135509045760908, "grad_norm": 0.32756727933883667, "learning_rate": 3.574446430159352e-05, "loss": 0.4414, "num_input_tokens_seen": 2339252256, "step": 602, "train_runtime": 16156.1743, "train_tokens_per_second": 144789.986 }, { "epoch": 0.213905640297978, "grad_norm": 0.27406609058380127, "learning_rate": 3.5730680435639154e-05, "loss": 0.4316, "num_input_tokens_seen": 2343138750, "step": 603, "train_runtime": 16185.1215, "train_tokens_per_second": 144771.156 }, { "epoch": 0.2142603760198652, "grad_norm": 0.331087201833725, "learning_rate": 3.571687694928987e-05, "loss": 0.439, "num_input_tokens_seen": 2347047794, "step": 604, "train_runtime": 16218.1697, "train_tokens_per_second": 144717.181 }, { "epoch": 0.2146151117417524, "grad_norm": 0.32688748836517334, "learning_rate": 3.5703053859762347e-05, "loss": 0.4434, "num_input_tokens_seen": 2350953489, "step": 605, "train_runtime": 16255.6859, "train_tokens_per_second": 144623.457 }, { "epoch": 0.2149698474636396, "grad_norm": 0.29274553060531616, "learning_rate": 3.568921118429768e-05, "loss": 0.4279, "num_input_tokens_seen": 2354920516, "step": 606, "train_runtime": 16281.4716, "train_tokens_per_second": 144638.063 }, { "epoch": 0.21532458318552677, "grad_norm": 0.29829666018486023, "learning_rate": 3.5675348940161426e-05, "loss": 0.4313, "num_input_tokens_seen": 2358792879, "step": 607, "train_runtime": 16312.3662, "train_tokens_per_second": 144601.516 }, { "epoch": 0.21567931890741399, "grad_norm": 0.28845328092575073, "learning_rate": 3.566146714464354e-05, "loss": 0.4322, "num_input_tokens_seen": 2362760801, "step": 608, "train_runtime": 16331.7131, "train_tokens_per_second": 144673.176 }, { "epoch": 0.21603405462930117, "grad_norm": 0.2557801604270935, "learning_rate": 3.5647565815058346e-05, "loss": 0.4405, "num_input_tokens_seen": 2366603599, "step": 609, "train_runtime": 16358.6632, "train_tokens_per_second": 144669.743 }, { "epoch": 0.21638879035118835, "grad_norm": 0.2735631763935089, "learning_rate": 3.563364496874456e-05, "loss": 0.4352, "num_input_tokens_seen": 2370455063, "step": 610, "train_runtime": 16381.4887, "train_tokens_per_second": 144703.275 }, { "epoch": 0.21674352607307557, "grad_norm": 0.3030248284339905, "learning_rate": 3.5619704623065216e-05, "loss": 0.4362, "num_input_tokens_seen": 2374347690, "step": 611, "train_runtime": 16404.6532, "train_tokens_per_second": 144736.232 }, { "epoch": 0.21709826179496275, "grad_norm": 0.3182970881462097, "learning_rate": 3.560574479540768e-05, "loss": 0.4377, "num_input_tokens_seen": 2378246363, "step": 612, "train_runtime": 16442.7459, "train_tokens_per_second": 144638.029 }, { "epoch": 0.21745299751684993, "grad_norm": 0.4088495373725891, "learning_rate": 3.559176550318363e-05, "loss": 0.4228, "num_input_tokens_seen": 2382094091, "step": 613, "train_runtime": 16479.5681, "train_tokens_per_second": 144548.332 }, { "epoch": 0.21780773323873714, "grad_norm": 0.33024629950523376, "learning_rate": 3.5577766763828986e-05, "loss": 0.4392, "num_input_tokens_seen": 2386062462, "step": 614, "train_runtime": 16511.7573, "train_tokens_per_second": 144506.876 }, { "epoch": 0.21816246896062433, "grad_norm": 0.33025574684143066, "learning_rate": 3.556374859480396e-05, "loss": 0.4257, "num_input_tokens_seen": 2389947266, "step": 615, "train_runtime": 16531.3893, "train_tokens_per_second": 144570.261 }, { "epoch": 0.21851720468251154, "grad_norm": 0.28675857186317444, "learning_rate": 3.5549711013592995e-05, "loss": 0.4494, "num_input_tokens_seen": 2393811622, "step": 616, "train_runtime": 16553.8303, "train_tokens_per_second": 144607.718 }, { "epoch": 0.21887194040439872, "grad_norm": 0.264691025018692, "learning_rate": 3.55356540377047e-05, "loss": 0.4328, "num_input_tokens_seen": 2397661003, "step": 617, "train_runtime": 16588.7966, "train_tokens_per_second": 144534.957 }, { "epoch": 0.2192266761262859, "grad_norm": 0.3757021725177765, "learning_rate": 3.552157768467195e-05, "loss": 0.4364, "num_input_tokens_seen": 2401602712, "step": 618, "train_runtime": 16616.5087, "train_tokens_per_second": 144531.126 }, { "epoch": 0.21958141184817312, "grad_norm": 0.31388115882873535, "learning_rate": 3.5507481972051724e-05, "loss": 0.4469, "num_input_tokens_seen": 2405471127, "step": 619, "train_runtime": 16642.197, "train_tokens_per_second": 144540.479 }, { "epoch": 0.2199361475700603, "grad_norm": 0.3356575667858124, "learning_rate": 3.5493366917425175e-05, "loss": 0.4406, "num_input_tokens_seen": 2409323537, "step": 620, "train_runtime": 16668.9689, "train_tokens_per_second": 144539.446 }, { "epoch": 0.2202908832919475, "grad_norm": 0.39978477358818054, "learning_rate": 3.547923253839758e-05, "loss": 0.441, "num_input_tokens_seen": 2413197977, "step": 621, "train_runtime": 16688.6282, "train_tokens_per_second": 144601.339 }, { "epoch": 0.2206456190138347, "grad_norm": 0.3526746332645416, "learning_rate": 3.546507885259831e-05, "loss": 0.4312, "num_input_tokens_seen": 2417022397, "step": 622, "train_runtime": 16714.6697, "train_tokens_per_second": 144604.856 }, { "epoch": 0.22100035473572188, "grad_norm": 0.34158727526664734, "learning_rate": 3.5450905877680846e-05, "loss": 0.4268, "num_input_tokens_seen": 2420900003, "step": 623, "train_runtime": 16735.114, "train_tokens_per_second": 144659.905 }, { "epoch": 0.2213550904576091, "grad_norm": 0.345994770526886, "learning_rate": 3.543671363132267e-05, "loss": 0.4367, "num_input_tokens_seen": 2424838565, "step": 624, "train_runtime": 16766.9767, "train_tokens_per_second": 144619.904 }, { "epoch": 0.22170982617949628, "grad_norm": 0.3709126114845276, "learning_rate": 3.542250213122536e-05, "loss": 0.4326, "num_input_tokens_seen": 2428698162, "step": 625, "train_runtime": 16787.0404, "train_tokens_per_second": 144676.971 }, { "epoch": 0.22206456190138346, "grad_norm": 0.3383060395717621, "learning_rate": 3.5408271395114475e-05, "loss": 0.4395, "num_input_tokens_seen": 2432610986, "step": 626, "train_runtime": 16815.8067, "train_tokens_per_second": 144662.164 }, { "epoch": 0.22241929762327067, "grad_norm": 0.2709101140499115, "learning_rate": 3.539402144073958e-05, "loss": 0.4319, "num_input_tokens_seen": 2436467281, "step": 627, "train_runtime": 16834.4258, "train_tokens_per_second": 144731.237 }, { "epoch": 0.22277403334515786, "grad_norm": 1.007468342781067, "learning_rate": 3.53797522858742e-05, "loss": 0.4356, "num_input_tokens_seen": 2440298309, "step": 628, "train_runtime": 16853.7125, "train_tokens_per_second": 144792.924 }, { "epoch": 0.22312876906704504, "grad_norm": 0.32186731696128845, "learning_rate": 3.536546394831582e-05, "loss": 0.4409, "num_input_tokens_seen": 2444202674, "step": 629, "train_runtime": 16874.1247, "train_tokens_per_second": 144849.153 }, { "epoch": 0.22348350478893225, "grad_norm": 0.45035314559936523, "learning_rate": 3.535115644588584e-05, "loss": 0.4443, "num_input_tokens_seen": 2448043229, "step": 630, "train_runtime": 16908.5019, "train_tokens_per_second": 144781.794 }, { "epoch": 0.22383824051081944, "grad_norm": 0.32039934396743774, "learning_rate": 3.533682979642957e-05, "loss": 0.4323, "num_input_tokens_seen": 2451936168, "step": 631, "train_runtime": 16934.5428, "train_tokens_per_second": 144789.039 }, { "epoch": 0.22419297623270662, "grad_norm": 0.361707478761673, "learning_rate": 3.53224840178162e-05, "loss": 0.4313, "num_input_tokens_seen": 2455822221, "step": 632, "train_runtime": 16964.0475, "train_tokens_per_second": 144766.29 }, { "epoch": 0.22454771195459383, "grad_norm": 0.2907269299030304, "learning_rate": 3.530811912793878e-05, "loss": 0.4305, "num_input_tokens_seen": 2459701810, "step": 633, "train_runtime": 16986.2218, "train_tokens_per_second": 144805.704 }, { "epoch": 0.22490244767648102, "grad_norm": 0.30233198404312134, "learning_rate": 3.5293735144714196e-05, "loss": 0.4288, "num_input_tokens_seen": 2463665979, "step": 634, "train_runtime": 17005.3715, "train_tokens_per_second": 144875.752 }, { "epoch": 0.22525718339836823, "grad_norm": 0.433780312538147, "learning_rate": 3.5279332086083146e-05, "loss": 0.4498, "num_input_tokens_seen": 2467523279, "step": 635, "train_runtime": 17034.5516, "train_tokens_per_second": 144854.02 }, { "epoch": 0.2256119191202554, "grad_norm": 0.4226613938808441, "learning_rate": 3.526490997001014e-05, "loss": 0.4277, "num_input_tokens_seen": 2471500301, "step": 636, "train_runtime": 17061.0987, "train_tokens_per_second": 144861.731 }, { "epoch": 0.2259666548421426, "grad_norm": 0.47030097246170044, "learning_rate": 3.525046881448341e-05, "loss": 0.4324, "num_input_tokens_seen": 2475286242, "step": 637, "train_runtime": 17088.4414, "train_tokens_per_second": 144851.493 }, { "epoch": 0.2263213905640298, "grad_norm": 0.4131212830543518, "learning_rate": 3.5236008637514985e-05, "loss": 0.4391, "num_input_tokens_seen": 2479194346, "step": 638, "train_runtime": 17116.5647, "train_tokens_per_second": 144841.818 }, { "epoch": 0.226676126285917, "grad_norm": 0.35719868540763855, "learning_rate": 3.5221529457140606e-05, "loss": 0.429, "num_input_tokens_seen": 2483092420, "step": 639, "train_runtime": 17150.9919, "train_tokens_per_second": 144778.357 }, { "epoch": 0.22703086200780417, "grad_norm": 0.2966323494911194, "learning_rate": 3.5207031291419695e-05, "loss": 0.4305, "num_input_tokens_seen": 2486933421, "step": 640, "train_runtime": 17171.2961, "train_tokens_per_second": 144830.85 }, { "epoch": 0.22738559772969139, "grad_norm": 0.3294377326965332, "learning_rate": 3.5192514158435375e-05, "loss": 0.4273, "num_input_tokens_seen": 2490909721, "step": 641, "train_runtime": 17195.5443, "train_tokens_per_second": 144857.858 }, { "epoch": 0.22774033345157857, "grad_norm": 0.37328293919563293, "learning_rate": 3.517797807629443e-05, "loss": 0.444, "num_input_tokens_seen": 2494765625, "step": 642, "train_runtime": 17224.4274, "train_tokens_per_second": 144838.813 }, { "epoch": 0.22809506917346578, "grad_norm": 0.3903438448905945, "learning_rate": 3.516342306312726e-05, "loss": 0.4339, "num_input_tokens_seen": 2498681756, "step": 643, "train_runtime": 17251.2515, "train_tokens_per_second": 144840.608 }, { "epoch": 0.22844980489535296, "grad_norm": 0.3415432572364807, "learning_rate": 3.5148849137087877e-05, "loss": 0.4397, "num_input_tokens_seen": 2502533880, "step": 644, "train_runtime": 17276.6024, "train_tokens_per_second": 144851.043 }, { "epoch": 0.22880454061724015, "grad_norm": 0.3202110826969147, "learning_rate": 3.513425631635391e-05, "loss": 0.4321, "num_input_tokens_seen": 2506440511, "step": 645, "train_runtime": 17316.7652, "train_tokens_per_second": 144740.688 }, { "epoch": 0.22915927633912736, "grad_norm": 0.3906806707382202, "learning_rate": 3.5119644619126524e-05, "loss": 0.4336, "num_input_tokens_seen": 2510326336, "step": 646, "train_runtime": 17338.6582, "train_tokens_per_second": 144782.042 }, { "epoch": 0.22951401206101454, "grad_norm": 0.4462188482284546, "learning_rate": 3.5105014063630445e-05, "loss": 0.4425, "num_input_tokens_seen": 2514175360, "step": 647, "train_runtime": 17361.4125, "train_tokens_per_second": 144813.987 }, { "epoch": 0.22986874778290173, "grad_norm": 0.35985511541366577, "learning_rate": 3.5090364668113914e-05, "loss": 0.4298, "num_input_tokens_seen": 2518073568, "step": 648, "train_runtime": 17384.0249, "train_tokens_per_second": 144849.859 }, { "epoch": 0.23022348350478894, "grad_norm": 0.318536639213562, "learning_rate": 3.507569645084868e-05, "loss": 0.4291, "num_input_tokens_seen": 2522013374, "step": 649, "train_runtime": 17410.393, "train_tokens_per_second": 144856.775 }, { "epoch": 0.23057821922667612, "grad_norm": 0.4086730480194092, "learning_rate": 3.5061009430129944e-05, "loss": 0.4268, "num_input_tokens_seen": 2525937867, "step": 650, "train_runtime": 17430.9725, "train_tokens_per_second": 144910.897 }, { "epoch": 0.2309329549485633, "grad_norm": 0.4640146493911743, "learning_rate": 3.504630362427639e-05, "loss": 0.4335, "num_input_tokens_seen": 2529803827, "step": 651, "train_runtime": 17457.9158, "train_tokens_per_second": 144908.697 }, { "epoch": 0.23128769067045052, "grad_norm": 0.4105338454246521, "learning_rate": 3.503157905163012e-05, "loss": 0.4295, "num_input_tokens_seen": 2533713242, "step": 652, "train_runtime": 17482.7372, "train_tokens_per_second": 144926.576 }, { "epoch": 0.2316424263923377, "grad_norm": 0.3166554570198059, "learning_rate": 3.5016835730556636e-05, "loss": 0.4356, "num_input_tokens_seen": 2537612693, "step": 653, "train_runtime": 17510.4362, "train_tokens_per_second": 144920.016 }, { "epoch": 0.23199716211422491, "grad_norm": 0.3236859440803528, "learning_rate": 3.500207367944482e-05, "loss": 0.436, "num_input_tokens_seen": 2541472362, "step": 654, "train_runtime": 17536.2372, "train_tokens_per_second": 144926.892 }, { "epoch": 0.2323518978361121, "grad_norm": 0.6789435744285583, "learning_rate": 3.4987292916706944e-05, "loss": 0.4349, "num_input_tokens_seen": 2545382911, "step": 655, "train_runtime": 17572.9595, "train_tokens_per_second": 144846.57 }, { "epoch": 0.23270663355799928, "grad_norm": 0.4421486556529999, "learning_rate": 3.497249346077859e-05, "loss": 0.4226, "num_input_tokens_seen": 2549168061, "step": 656, "train_runtime": 17593.61, "train_tokens_per_second": 144891.7 }, { "epoch": 0.2330613692798865, "grad_norm": 0.39481446146965027, "learning_rate": 3.495767533011866e-05, "loss": 0.4348, "num_input_tokens_seen": 2553059844, "step": 657, "train_runtime": 17623.1114, "train_tokens_per_second": 144869.983 }, { "epoch": 0.23341610500177368, "grad_norm": 0.34375619888305664, "learning_rate": 3.494283854320937e-05, "loss": 0.4275, "num_input_tokens_seen": 2556895217, "step": 658, "train_runtime": 17653.1341, "train_tokens_per_second": 144840.865 }, { "epoch": 0.23377084072366086, "grad_norm": 0.4107329249382019, "learning_rate": 3.492798311855617e-05, "loss": 0.4367, "num_input_tokens_seen": 2560833297, "step": 659, "train_runtime": 17685.4717, "train_tokens_per_second": 144798.699 }, { "epoch": 0.23412557644554807, "grad_norm": 0.4343647360801697, "learning_rate": 3.491310907468779e-05, "loss": 0.4396, "num_input_tokens_seen": 2564740407, "step": 660, "train_runtime": 17707.4739, "train_tokens_per_second": 144839.429 }, { "epoch": 0.23448031216743526, "grad_norm": 0.4015113413333893, "learning_rate": 3.4898216430156156e-05, "loss": 0.4415, "num_input_tokens_seen": 2568640044, "step": 661, "train_runtime": 17736.824, "train_tokens_per_second": 144819.616 }, { "epoch": 0.23483504788932247, "grad_norm": 0.36426711082458496, "learning_rate": 3.488330520353641e-05, "loss": 0.431, "num_input_tokens_seen": 2572452854, "step": 662, "train_runtime": 17758.9583, "train_tokens_per_second": 144853.814 }, { "epoch": 0.23518978361120965, "grad_norm": 0.359160840511322, "learning_rate": 3.486837541342688e-05, "loss": 0.4369, "num_input_tokens_seen": 2576312535, "step": 663, "train_runtime": 17791.2613, "train_tokens_per_second": 144807.751 }, { "epoch": 0.23554451933309684, "grad_norm": 0.271325021982193, "learning_rate": 3.4853427078449015e-05, "loss": 0.4265, "num_input_tokens_seen": 2580254534, "step": 664, "train_runtime": 17816.8198, "train_tokens_per_second": 144821.274 }, { "epoch": 0.23589925505498405, "grad_norm": 0.3469088077545166, "learning_rate": 3.483846021724743e-05, "loss": 0.4429, "num_input_tokens_seen": 2584148777, "step": 665, "train_runtime": 17839.2951, "train_tokens_per_second": 144857.113 }, { "epoch": 0.23625399077687123, "grad_norm": 0.35063183307647705, "learning_rate": 3.482347484848982e-05, "loss": 0.4309, "num_input_tokens_seen": 2587996767, "step": 666, "train_runtime": 17863.6717, "train_tokens_per_second": 144874.85 }, { "epoch": 0.23660872649875841, "grad_norm": 0.43926548957824707, "learning_rate": 3.4808470990867e-05, "loss": 0.4395, "num_input_tokens_seen": 2591888612, "step": 667, "train_runtime": 17892.3043, "train_tokens_per_second": 144860.526 }, { "epoch": 0.23696346222064563, "grad_norm": 0.34878402948379517, "learning_rate": 3.4793448663092786e-05, "loss": 0.4379, "num_input_tokens_seen": 2595795738, "step": 668, "train_runtime": 17919.6087, "train_tokens_per_second": 144857.836 }, { "epoch": 0.2373181979425328, "grad_norm": 0.29766952991485596, "learning_rate": 3.4778407883904086e-05, "loss": 0.4377, "num_input_tokens_seen": 2599701476, "step": 669, "train_runtime": 17939.8052, "train_tokens_per_second": 144912.47 }, { "epoch": 0.23767293366442, "grad_norm": 0.391713410615921, "learning_rate": 3.47633486720608e-05, "loss": 0.4365, "num_input_tokens_seen": 2603625791, "step": 670, "train_runtime": 17972.6893, "train_tokens_per_second": 144865.677 }, { "epoch": 0.2380276693863072, "grad_norm": 0.3732571005821228, "learning_rate": 3.474827104634582e-05, "loss": 0.4313, "num_input_tokens_seen": 2607510710, "step": 671, "train_runtime": 18004.5404, "train_tokens_per_second": 144825.175 }, { "epoch": 0.2383824051081944, "grad_norm": 0.3485734164714813, "learning_rate": 3.4733175025565006e-05, "loss": 0.4265, "num_input_tokens_seen": 2611433835, "step": 672, "train_runtime": 18032.6268, "train_tokens_per_second": 144817.162 }, { "epoch": 0.2387371408300816, "grad_norm": 0.2736017405986786, "learning_rate": 3.471806062854716e-05, "loss": 0.4272, "num_input_tokens_seen": 2615252685, "step": 673, "train_runtime": 18064.0061, "train_tokens_per_second": 144777.004 }, { "epoch": 0.23909187655196878, "grad_norm": 0.35721302032470703, "learning_rate": 3.4702927874144015e-05, "loss": 0.4308, "num_input_tokens_seen": 2619097095, "step": 674, "train_runtime": 18085.091, "train_tokens_per_second": 144820.786 }, { "epoch": 0.23944661227385597, "grad_norm": 0.42588353157043457, "learning_rate": 3.468777678123017e-05, "loss": 0.4341, "num_input_tokens_seen": 2622975634, "step": 675, "train_runtime": 18115.1229, "train_tokens_per_second": 144794.802 }, { "epoch": 0.23980134799574318, "grad_norm": 0.49168187379837036, "learning_rate": 3.467260736870314e-05, "loss": 0.4304, "num_input_tokens_seen": 2626851905, "step": 676, "train_runtime": 18145.2272, "train_tokens_per_second": 144768.202 }, { "epoch": 0.24015608371763036, "grad_norm": 0.34103822708129883, "learning_rate": 3.465741965548325e-05, "loss": 0.4422, "num_input_tokens_seen": 2630714549, "step": 677, "train_runtime": 18172.1866, "train_tokens_per_second": 144765.988 }, { "epoch": 0.24051081943951755, "grad_norm": 0.328505277633667, "learning_rate": 3.464221366051369e-05, "loss": 0.4428, "num_input_tokens_seen": 2634585895, "step": 678, "train_runtime": 18193.0267, "train_tokens_per_second": 144812.951 }, { "epoch": 0.24086555516140476, "grad_norm": 0.35076984763145447, "learning_rate": 3.462698940276041e-05, "loss": 0.4276, "num_input_tokens_seen": 2638454162, "step": 679, "train_runtime": 18214.8388, "train_tokens_per_second": 144851.909 }, { "epoch": 0.24122029088329194, "grad_norm": 0.3189830183982849, "learning_rate": 3.461174690121217e-05, "loss": 0.4348, "num_input_tokens_seen": 2642349653, "step": 680, "train_runtime": 18243.5273, "train_tokens_per_second": 144837.652 }, { "epoch": 0.24157502660517916, "grad_norm": 0.2929215133190155, "learning_rate": 3.459648617488047e-05, "loss": 0.4193, "num_input_tokens_seen": 2646228341, "step": 681, "train_runtime": 18263.5803, "train_tokens_per_second": 144890.996 }, { "epoch": 0.24192976232706634, "grad_norm": 0.32214075326919556, "learning_rate": 3.4581207242799554e-05, "loss": 0.4378, "num_input_tokens_seen": 2650141279, "step": 682, "train_runtime": 18287.3024, "train_tokens_per_second": 144917.015 }, { "epoch": 0.24228449804895352, "grad_norm": 0.33646491169929504, "learning_rate": 3.456591012402635e-05, "loss": 0.4514, "num_input_tokens_seen": 2654007382, "step": 683, "train_runtime": 18314.5225, "train_tokens_per_second": 144912.726 }, { "epoch": 0.24263923377084073, "grad_norm": 0.3260505497455597, "learning_rate": 3.45505948376405e-05, "loss": 0.4306, "num_input_tokens_seen": 2657840860, "step": 684, "train_runtime": 18347.2256, "train_tokens_per_second": 144863.366 }, { "epoch": 0.24299396949272792, "grad_norm": 0.3106876015663147, "learning_rate": 3.453526140274428e-05, "loss": 0.4474, "num_input_tokens_seen": 2661804367, "step": 685, "train_runtime": 18366.7527, "train_tokens_per_second": 144925.149 }, { "epoch": 0.2433487052146151, "grad_norm": 0.35184788703918457, "learning_rate": 3.451990983846262e-05, "loss": 0.4316, "num_input_tokens_seen": 2665717505, "step": 686, "train_runtime": 18389.4927, "train_tokens_per_second": 144958.73 }, { "epoch": 0.2437034409365023, "grad_norm": 0.30808883905410767, "learning_rate": 3.450454016394305e-05, "loss": 0.4316, "num_input_tokens_seen": 2669576624, "step": 687, "train_runtime": 18413.1508, "train_tokens_per_second": 144982.065 }, { "epoch": 0.2440581766583895, "grad_norm": 0.43722862005233765, "learning_rate": 3.4489152398355696e-05, "loss": 0.4269, "num_input_tokens_seen": 2673481080, "step": 688, "train_runtime": 18441.3363, "train_tokens_per_second": 144972.199 }, { "epoch": 0.24441291238027668, "grad_norm": 0.30032846331596375, "learning_rate": 3.4473746560893245e-05, "loss": 0.4456, "num_input_tokens_seen": 2677332772, "step": 689, "train_runtime": 18473.7118, "train_tokens_per_second": 144926.629 }, { "epoch": 0.2447676481021639, "grad_norm": 0.2920217514038086, "learning_rate": 3.445832267077092e-05, "loss": 0.4303, "num_input_tokens_seen": 2681258132, "step": 690, "train_runtime": 18496.108, "train_tokens_per_second": 144963.369 }, { "epoch": 0.24512238382405108, "grad_norm": 0.28699666261672974, "learning_rate": 3.444288074722648e-05, "loss": 0.4236, "num_input_tokens_seen": 2685156686, "step": 691, "train_runtime": 18529.004, "train_tokens_per_second": 144916.407 }, { "epoch": 0.2454771195459383, "grad_norm": 0.3152562975883484, "learning_rate": 3.4427420809520145e-05, "loss": 0.4377, "num_input_tokens_seen": 2689084431, "step": 692, "train_runtime": 18549.463, "train_tokens_per_second": 144968.317 }, { "epoch": 0.24583185526782547, "grad_norm": 0.3627362549304962, "learning_rate": 3.4411942876934637e-05, "loss": 0.4445, "num_input_tokens_seen": 2692957080, "step": 693, "train_runtime": 18581.2656, "train_tokens_per_second": 144928.614 }, { "epoch": 0.24618659098971266, "grad_norm": 0.31976139545440674, "learning_rate": 3.439644696877509e-05, "loss": 0.442, "num_input_tokens_seen": 2696889377, "step": 694, "train_runtime": 18601.5923, "train_tokens_per_second": 144981.641 }, { "epoch": 0.24654132671159987, "grad_norm": 0.30854523181915283, "learning_rate": 3.438093310436909e-05, "loss": 0.4413, "num_input_tokens_seen": 2700733612, "step": 695, "train_runtime": 18630.2769, "train_tokens_per_second": 144964.759 }, { "epoch": 0.24689606243348705, "grad_norm": 0.42881548404693604, "learning_rate": 3.436540130306659e-05, "loss": 0.4295, "num_input_tokens_seen": 2704675753, "step": 696, "train_runtime": 18651.399, "train_tokens_per_second": 145011.951 }, { "epoch": 0.24725079815537424, "grad_norm": 0.3389131724834442, "learning_rate": 3.4349851584239946e-05, "loss": 0.431, "num_input_tokens_seen": 2708536645, "step": 697, "train_runtime": 18679.7484, "train_tokens_per_second": 144998.561 }, { "epoch": 0.24760553387726145, "grad_norm": 0.35836321115493774, "learning_rate": 3.4334283967283824e-05, "loss": 0.4204, "num_input_tokens_seen": 2712469059, "step": 698, "train_runtime": 18720.9241, "train_tokens_per_second": 144889.699 }, { "epoch": 0.24796026959914863, "grad_norm": 0.414092093706131, "learning_rate": 3.431869847161525e-05, "loss": 0.4275, "num_input_tokens_seen": 2716392346, "step": 699, "train_runtime": 18743.4808, "train_tokens_per_second": 144924.648 }, { "epoch": 0.24831500532103584, "grad_norm": 0.3597494959831238, "learning_rate": 3.430309511667353e-05, "loss": 0.4144, "num_input_tokens_seen": 2720281117, "step": 700, "train_runtime": 18771.3265, "train_tokens_per_second": 144916.829 }, { "epoch": 0.24866974104292303, "grad_norm": 0.3231336772441864, "learning_rate": 3.4287473921920254e-05, "loss": 0.4338, "num_input_tokens_seen": 2724193562, "step": 701, "train_runtime": 18791.1995, "train_tokens_per_second": 144971.776 }, { "epoch": 0.2490244767648102, "grad_norm": 5.555556297302246, "learning_rate": 3.427183490683925e-05, "loss": 0.4349, "num_input_tokens_seen": 2727972169, "step": 702, "train_runtime": 18812.0997, "train_tokens_per_second": 145011.573 }, { "epoch": 0.24937921248669742, "grad_norm": 0.4749988615512848, "learning_rate": 3.425617809093659e-05, "loss": 0.4361, "num_input_tokens_seen": 2731809793, "step": 703, "train_runtime": 18844.3155, "train_tokens_per_second": 144967.313 }, { "epoch": 0.2497339482085846, "grad_norm": 0.7476677894592285, "learning_rate": 3.4240503493740526e-05, "loss": 0.4306, "num_input_tokens_seen": 2735793871, "step": 704, "train_runtime": 18877.7151, "train_tokens_per_second": 144921.875 }, { "epoch": 0.2500886839304718, "grad_norm": 0.749248743057251, "learning_rate": 3.422481113480153e-05, "loss": 0.4487, "num_input_tokens_seen": 2739620990, "step": 705, "train_runtime": 18899.7253, "train_tokens_per_second": 144955.598 }, { "epoch": 0.250443419652359, "grad_norm": 0.5285269021987915, "learning_rate": 3.4209101033692165e-05, "loss": 0.4267, "num_input_tokens_seen": 2743449171, "step": 706, "train_runtime": 18926.0176, "train_tokens_per_second": 144956.495 }, { "epoch": 0.2507981553742462, "grad_norm": 0.5255205631256104, "learning_rate": 3.4193373210007186e-05, "loss": 0.4336, "num_input_tokens_seen": 2747327146, "step": 707, "train_runtime": 18947.2593, "train_tokens_per_second": 144998.656 }, { "epoch": 0.25115289109613337, "grad_norm": 0.4879884123802185, "learning_rate": 3.417762768336341e-05, "loss": 0.4281, "num_input_tokens_seen": 2751157227, "step": 708, "train_runtime": 18970.6804, "train_tokens_per_second": 145021.537 }, { "epoch": 0.2515076268180206, "grad_norm": 0.7113572359085083, "learning_rate": 3.416186447339975e-05, "loss": 0.4419, "num_input_tokens_seen": 2755030028, "step": 709, "train_runtime": 18997.7216, "train_tokens_per_second": 145018.971 }, { "epoch": 0.2518623625399078, "grad_norm": 0.45200061798095703, "learning_rate": 3.414608359977719e-05, "loss": 0.4419, "num_input_tokens_seen": 2758933478, "step": 710, "train_runtime": 19020.6987, "train_tokens_per_second": 145049.008 }, { "epoch": 0.25221709826179495, "grad_norm": 0.4216223359107971, "learning_rate": 3.41302850821787e-05, "loss": 0.4424, "num_input_tokens_seen": 2762821218, "step": 711, "train_runtime": 19046.1596, "train_tokens_per_second": 145059.228 }, { "epoch": 0.25257183398368216, "grad_norm": 0.4681555926799774, "learning_rate": 3.411446894030931e-05, "loss": 0.4323, "num_input_tokens_seen": 2766725370, "step": 712, "train_runtime": 19073.2477, "train_tokens_per_second": 145057.906 }, { "epoch": 0.25292656970556937, "grad_norm": 0.4181062579154968, "learning_rate": 3.4098635193895994e-05, "loss": 0.4254, "num_input_tokens_seen": 2770630280, "step": 713, "train_runtime": 19112.0726, "train_tokens_per_second": 144967.547 }, { "epoch": 0.2532813054274565, "grad_norm": 0.43935954570770264, "learning_rate": 3.4082783862687714e-05, "loss": 0.4304, "num_input_tokens_seen": 2774483549, "step": 714, "train_runtime": 19144.9796, "train_tokens_per_second": 144919.64 }, { "epoch": 0.25363604114934374, "grad_norm": 0.32361456751823425, "learning_rate": 3.406691496645533e-05, "loss": 0.4354, "num_input_tokens_seen": 2778322706, "step": 715, "train_runtime": 19164.3683, "train_tokens_per_second": 144973.352 }, { "epoch": 0.25399077687123095, "grad_norm": 0.3993259370326996, "learning_rate": 3.4051028524991644e-05, "loss": 0.4363, "num_input_tokens_seen": 2782152781, "step": 716, "train_runtime": 19198.2447, "train_tokens_per_second": 144917.039 }, { "epoch": 0.2543455125931181, "grad_norm": 0.38063907623291016, "learning_rate": 3.4035124558111325e-05, "loss": 0.4384, "num_input_tokens_seen": 2786102177, "step": 717, "train_runtime": 19236.3253, "train_tokens_per_second": 144835.468 }, { "epoch": 0.2547002483150053, "grad_norm": 0.3694206774234772, "learning_rate": 3.40192030856509e-05, "loss": 0.4372, "num_input_tokens_seen": 2789961664, "step": 718, "train_runtime": 19262.8841, "train_tokens_per_second": 144836.134 }, { "epoch": 0.25505498403689253, "grad_norm": 0.344200998544693, "learning_rate": 3.400326412746872e-05, "loss": 0.4342, "num_input_tokens_seen": 2793868865, "step": 719, "train_runtime": 19296.6406, "train_tokens_per_second": 144785.246 }, { "epoch": 0.2554097197587797, "grad_norm": 0.4210270941257477, "learning_rate": 3.3987307703444984e-05, "loss": 0.434, "num_input_tokens_seen": 2797715514, "step": 720, "train_runtime": 19318.6473, "train_tokens_per_second": 144819.431 }, { "epoch": 0.2557644554806669, "grad_norm": 0.36089566349983215, "learning_rate": 3.397133383348163e-05, "loss": 0.432, "num_input_tokens_seen": 2801637906, "step": 721, "train_runtime": 19348.4, "train_tokens_per_second": 144799.462 }, { "epoch": 0.2561191912025541, "grad_norm": 0.3441641330718994, "learning_rate": 3.395534253750238e-05, "loss": 0.4316, "num_input_tokens_seen": 2805523134, "step": 722, "train_runtime": 19382.5586, "train_tokens_per_second": 144744.726 }, { "epoch": 0.25647392692444126, "grad_norm": 0.3640066087245941, "learning_rate": 3.393933383545269e-05, "loss": 0.4198, "num_input_tokens_seen": 2809453769, "step": 723, "train_runtime": 19413.2834, "train_tokens_per_second": 144718.114 }, { "epoch": 0.2568286626463285, "grad_norm": 0.36628884077072144, "learning_rate": 3.392330774729973e-05, "loss": 0.4205, "num_input_tokens_seen": 2813322020, "step": 724, "train_runtime": 19447.703, "train_tokens_per_second": 144660.889 }, { "epoch": 0.2571833983682157, "grad_norm": 0.33735132217407227, "learning_rate": 3.390726429303233e-05, "loss": 0.4167, "num_input_tokens_seen": 2817193785, "step": 725, "train_runtime": 19468.762, "train_tokens_per_second": 144703.284 }, { "epoch": 0.2575381340901029, "grad_norm": 0.3476066589355469, "learning_rate": 3.389120349266102e-05, "loss": 0.4352, "num_input_tokens_seen": 2821076660, "step": 726, "train_runtime": 19493.8871, "train_tokens_per_second": 144715.964 }, { "epoch": 0.25789286981199006, "grad_norm": 0.5600104331970215, "learning_rate": 3.387512536621792e-05, "loss": 0.433, "num_input_tokens_seen": 2824939359, "step": 727, "train_runtime": 19516.5826, "train_tokens_per_second": 144745.595 }, { "epoch": 0.25824760553387727, "grad_norm": 0.33711037039756775, "learning_rate": 3.38590299337568e-05, "loss": 0.4329, "num_input_tokens_seen": 2828731447, "step": 728, "train_runtime": 19538.6386, "train_tokens_per_second": 144776.282 }, { "epoch": 0.2586023412557645, "grad_norm": 0.3425234854221344, "learning_rate": 3.3842917215352984e-05, "loss": 0.4331, "num_input_tokens_seen": 2832685275, "step": 729, "train_runtime": 19573.7355, "train_tokens_per_second": 144718.686 }, { "epoch": 0.25895707697765163, "grad_norm": 0.38372644782066345, "learning_rate": 3.3826787231103396e-05, "loss": 0.4313, "num_input_tokens_seen": 2836545625, "step": 730, "train_runtime": 19600.3581, "train_tokens_per_second": 144719.072 }, { "epoch": 0.25931181269953885, "grad_norm": 0.33523571491241455, "learning_rate": 3.381064000112644e-05, "loss": 0.4242, "num_input_tokens_seen": 2840499748, "step": 731, "train_runtime": 19632.4775, "train_tokens_per_second": 144683.713 }, { "epoch": 0.25966654842142606, "grad_norm": 0.4347497522830963, "learning_rate": 3.379447554556209e-05, "loss": 0.4316, "num_input_tokens_seen": 2844427563, "step": 732, "train_runtime": 19658.9836, "train_tokens_per_second": 144688.435 }, { "epoch": 0.2600212841433132, "grad_norm": 0.9292910695075989, "learning_rate": 3.3778293884571756e-05, "loss": 0.4352, "num_input_tokens_seen": 2848307681, "step": 733, "train_runtime": 19686.3681, "train_tokens_per_second": 144684.264 }, { "epoch": 0.2603760198652004, "grad_norm": 0.3784007132053375, "learning_rate": 3.376209503833833e-05, "loss": 0.429, "num_input_tokens_seen": 2852211943, "step": 734, "train_runtime": 19708.2934, "train_tokens_per_second": 144721.407 }, { "epoch": 0.26073075558708764, "grad_norm": 0.49971529841423035, "learning_rate": 3.374587902706613e-05, "loss": 0.4408, "num_input_tokens_seen": 2856071264, "step": 735, "train_runtime": 19736.6083, "train_tokens_per_second": 144709.325 }, { "epoch": 0.2610854913089748, "grad_norm": 0.3703380525112152, "learning_rate": 3.3729645870980906e-05, "loss": 0.4338, "num_input_tokens_seen": 2859952868, "step": 736, "train_runtime": 19758.8798, "train_tokens_per_second": 144742.662 }, { "epoch": 0.261440227030862, "grad_norm": 0.3115621507167816, "learning_rate": 3.371339559032977e-05, "loss": 0.4344, "num_input_tokens_seen": 2863800996, "step": 737, "train_runtime": 19797.2952, "train_tokens_per_second": 144656.175 }, { "epoch": 0.2617949627527492, "grad_norm": 0.2952059209346771, "learning_rate": 3.3697128205381186e-05, "loss": 0.4377, "num_input_tokens_seen": 2867676826, "step": 738, "train_runtime": 19824.6782, "train_tokens_per_second": 144651.873 }, { "epoch": 0.2621496984746364, "grad_norm": 0.3109380304813385, "learning_rate": 3.368084373642498e-05, "loss": 0.4226, "num_input_tokens_seen": 2871592420, "step": 739, "train_runtime": 19855.105, "train_tokens_per_second": 144627.41 }, { "epoch": 0.2625044341965236, "grad_norm": 0.31396153569221497, "learning_rate": 3.366454220377226e-05, "loss": 0.4356, "num_input_tokens_seen": 2875497315, "step": 740, "train_runtime": 19879.4299, "train_tokens_per_second": 144646.87 }, { "epoch": 0.2628591699184108, "grad_norm": 0.28385084867477417, "learning_rate": 3.3648223627755427e-05, "loss": 0.4265, "num_input_tokens_seen": 2879382066, "step": 741, "train_runtime": 19903.1659, "train_tokens_per_second": 144669.55 }, { "epoch": 0.26321390564029795, "grad_norm": 0.29586324095726013, "learning_rate": 3.3631888028728145e-05, "loss": 0.4292, "num_input_tokens_seen": 2883237989, "step": 742, "train_runtime": 19930.2236, "train_tokens_per_second": 144666.615 }, { "epoch": 0.26356864136218516, "grad_norm": 0.3007299602031708, "learning_rate": 3.361553542706531e-05, "loss": 0.4308, "num_input_tokens_seen": 2887160506, "step": 743, "train_runtime": 19961.7266, "train_tokens_per_second": 144634.809 }, { "epoch": 0.2639233770840724, "grad_norm": 0.3198500871658325, "learning_rate": 3.359916584316301e-05, "loss": 0.4287, "num_input_tokens_seen": 2890985927, "step": 744, "train_runtime": 19989.3503, "train_tokens_per_second": 144626.308 }, { "epoch": 0.2642781128059596, "grad_norm": 0.2854132354259491, "learning_rate": 3.358277929743853e-05, "loss": 0.4252, "num_input_tokens_seen": 2894858653, "step": 745, "train_runtime": 20012.6514, "train_tokens_per_second": 144651.43 }, { "epoch": 0.26463284852784674, "grad_norm": 0.28493934869766235, "learning_rate": 3.3566375810330294e-05, "loss": 0.4389, "num_input_tokens_seen": 2898758148, "step": 746, "train_runtime": 20034.8935, "train_tokens_per_second": 144685.478 }, { "epoch": 0.26498758424973395, "grad_norm": 0.29270586371421814, "learning_rate": 3.354995540229789e-05, "loss": 0.4228, "num_input_tokens_seen": 2902590377, "step": 747, "train_runtime": 20065.1202, "train_tokens_per_second": 144658.51 }, { "epoch": 0.26534231997162117, "grad_norm": 0.3120173513889313, "learning_rate": 3.353351809382197e-05, "loss": 0.4263, "num_input_tokens_seen": 2906488902, "step": 748, "train_runtime": 20096.9324, "train_tokens_per_second": 144623.51 }, { "epoch": 0.2656970556935083, "grad_norm": 0.37213245034217834, "learning_rate": 3.3517063905404295e-05, "loss": 0.4368, "num_input_tokens_seen": 2910306109, "step": 749, "train_runtime": 20117.9464, "train_tokens_per_second": 144662.186 }, { "epoch": 0.26605179141539553, "grad_norm": 0.30378639698028564, "learning_rate": 3.350059285756766e-05, "loss": 0.4231, "num_input_tokens_seen": 2914199735, "step": 750, "train_runtime": 20138.1237, "train_tokens_per_second": 144710.588 }, { "epoch": 0.26640652713728274, "grad_norm": 0.29938435554504395, "learning_rate": 3.348410497085591e-05, "loss": 0.442, "num_input_tokens_seen": 2918073637, "step": 751, "train_runtime": 20165.9216, "train_tokens_per_second": 144703.212 }, { "epoch": 0.2667612628591699, "grad_norm": 0.34353309869766235, "learning_rate": 3.346760026583387e-05, "loss": 0.4448, "num_input_tokens_seen": 2921990641, "step": 752, "train_runtime": 20197.3789, "train_tokens_per_second": 144671.774 }, { "epoch": 0.2671159985810571, "grad_norm": 0.2545029819011688, "learning_rate": 3.3451078763087356e-05, "loss": 0.4234, "num_input_tokens_seen": 2925871809, "step": 753, "train_runtime": 20218.94, "train_tokens_per_second": 144709.456 }, { "epoch": 0.2674707343029443, "grad_norm": 0.2935727834701538, "learning_rate": 3.343454048322313e-05, "loss": 0.4365, "num_input_tokens_seen": 2929755143, "step": 754, "train_runtime": 20248.0232, "train_tokens_per_second": 144693.391 }, { "epoch": 0.2678254700248315, "grad_norm": 0.3091091513633728, "learning_rate": 3.3417985446868884e-05, "loss": 0.4208, "num_input_tokens_seen": 2933647790, "step": 755, "train_runtime": 20273.6423, "train_tokens_per_second": 144702.552 }, { "epoch": 0.2681802057467187, "grad_norm": 0.27228492498397827, "learning_rate": 3.34014136746732e-05, "loss": 0.4369, "num_input_tokens_seen": 2937522877, "step": 756, "train_runtime": 20303.3368, "train_tokens_per_second": 144681.778 }, { "epoch": 0.2685349414686059, "grad_norm": 0.28577375411987305, "learning_rate": 3.338482518730555e-05, "loss": 0.4304, "num_input_tokens_seen": 2941437641, "step": 757, "train_runtime": 20336.462, "train_tokens_per_second": 144638.612 }, { "epoch": 0.26888967719049306, "grad_norm": 0.2961982488632202, "learning_rate": 3.336822000545623e-05, "loss": 0.4252, "num_input_tokens_seen": 2945340066, "step": 758, "train_runtime": 20367.1912, "train_tokens_per_second": 144611.991 }, { "epoch": 0.26924441291238027, "grad_norm": 0.32090136408805847, "learning_rate": 3.335159814983639e-05, "loss": 0.4382, "num_input_tokens_seen": 2949243126, "step": 759, "train_runtime": 20387.9161, "train_tokens_per_second": 144656.429 }, { "epoch": 0.2695991486342675, "grad_norm": 0.372272253036499, "learning_rate": 3.333495964117796e-05, "loss": 0.4318, "num_input_tokens_seen": 2953103625, "step": 760, "train_runtime": 20420.6106, "train_tokens_per_second": 144613.875 }, { "epoch": 0.26995388435615464, "grad_norm": 0.36175337433815, "learning_rate": 3.331830450023362e-05, "loss": 0.4298, "num_input_tokens_seen": 2956930861, "step": 761, "train_runtime": 20445.8865, "train_tokens_per_second": 144622.287 }, { "epoch": 0.27030862007804185, "grad_norm": 0.30598026514053345, "learning_rate": 3.330163274777685e-05, "loss": 0.4261, "num_input_tokens_seen": 2960840999, "step": 762, "train_runtime": 20474.3601, "train_tokens_per_second": 144612.138 }, { "epoch": 0.27066335579992906, "grad_norm": 0.3306232988834381, "learning_rate": 3.328494440460178e-05, "loss": 0.4295, "num_input_tokens_seen": 2964726454, "step": 763, "train_runtime": 20506.2452, "train_tokens_per_second": 144576.758 }, { "epoch": 0.2710180915218163, "grad_norm": 0.36704978346824646, "learning_rate": 3.326823949152329e-05, "loss": 0.4298, "num_input_tokens_seen": 2968610026, "step": 764, "train_runtime": 20532.9, "train_tokens_per_second": 144578.214 }, { "epoch": 0.27137282724370343, "grad_norm": 0.33671852946281433, "learning_rate": 3.3251518029376906e-05, "loss": 0.4355, "num_input_tokens_seen": 2972526953, "step": 765, "train_runtime": 20562.4474, "train_tokens_per_second": 144560.951 }, { "epoch": 0.27172756296559064, "grad_norm": 0.2806100845336914, "learning_rate": 3.323478003901879e-05, "loss": 0.4251, "num_input_tokens_seen": 2976469366, "step": 766, "train_runtime": 20580.7238, "train_tokens_per_second": 144624.135 }, { "epoch": 0.27208229868747785, "grad_norm": 0.307730108499527, "learning_rate": 3.321802554132572e-05, "loss": 0.432, "num_input_tokens_seen": 2980335475, "step": 767, "train_runtime": 20612.3672, "train_tokens_per_second": 144589.675 }, { "epoch": 0.272437034409365, "grad_norm": 0.30750295519828796, "learning_rate": 3.320125455719507e-05, "loss": 0.4191, "num_input_tokens_seen": 2984247050, "step": 768, "train_runtime": 20638.9397, "train_tokens_per_second": 144593.041 }, { "epoch": 0.2727917701312522, "grad_norm": 0.3284481167793274, "learning_rate": 3.318446710754477e-05, "loss": 0.4254, "num_input_tokens_seen": 2988114689, "step": 769, "train_runtime": 20665.6741, "train_tokens_per_second": 144593.139 }, { "epoch": 0.27314650585313943, "grad_norm": 0.33315837383270264, "learning_rate": 3.316766321331329e-05, "loss": 0.4248, "num_input_tokens_seen": 2991995549, "step": 770, "train_runtime": 20687.899, "train_tokens_per_second": 144625.394 }, { "epoch": 0.2735012415750266, "grad_norm": 0.27497756481170654, "learning_rate": 3.3150842895459626e-05, "loss": 0.4342, "num_input_tokens_seen": 2995817438, "step": 771, "train_runtime": 20711.4855, "train_tokens_per_second": 144645.223 }, { "epoch": 0.2738559772969138, "grad_norm": 0.23477092385292053, "learning_rate": 3.313400617496322e-05, "loss": 0.4327, "num_input_tokens_seen": 2999668168, "step": 772, "train_runtime": 20733.0468, "train_tokens_per_second": 144680.528 }, { "epoch": 0.274210713018801, "grad_norm": 0.28046298027038574, "learning_rate": 3.311715307282402e-05, "loss": 0.4369, "num_input_tokens_seen": 3003574321, "step": 773, "train_runtime": 20753.5982, "train_tokens_per_second": 144725.473 }, { "epoch": 0.27456544874068817, "grad_norm": 0.3080970346927643, "learning_rate": 3.3100283610062374e-05, "loss": 0.4256, "num_input_tokens_seen": 3007467142, "step": 774, "train_runtime": 20780.4179, "train_tokens_per_second": 144726.018 }, { "epoch": 0.2749201844625754, "grad_norm": 0.3535284698009491, "learning_rate": 3.308339780771904e-05, "loss": 0.4438, "num_input_tokens_seen": 3011385444, "step": 775, "train_runtime": 20806.5118, "train_tokens_per_second": 144732.835 }, { "epoch": 0.2752749201844626, "grad_norm": 0.3581681251525879, "learning_rate": 3.306649568685517e-05, "loss": 0.416, "num_input_tokens_seen": 3015316569, "step": 776, "train_runtime": 20844.6576, "train_tokens_per_second": 144656.565 }, { "epoch": 0.27562965590634975, "grad_norm": 0.38618436455726624, "learning_rate": 3.304957726855225e-05, "loss": 0.4196, "num_input_tokens_seen": 3019150641, "step": 777, "train_runtime": 20867.0141, "train_tokens_per_second": 144685.321 }, { "epoch": 0.27598439162823696, "grad_norm": 0.2966167628765106, "learning_rate": 3.3032642573912114e-05, "loss": 0.4341, "num_input_tokens_seen": 3023061276, "step": 778, "train_runtime": 20895.1632, "train_tokens_per_second": 144677.562 }, { "epoch": 0.27633912735012417, "grad_norm": 0.3393070697784424, "learning_rate": 3.301569162405688e-05, "loss": 0.4136, "num_input_tokens_seen": 3027007142, "step": 779, "train_runtime": 20927.8594, "train_tokens_per_second": 144640.075 }, { "epoch": 0.2766938630720113, "grad_norm": 0.2598637640476227, "learning_rate": 3.299872444012895e-05, "loss": 0.4226, "num_input_tokens_seen": 3030860333, "step": 780, "train_runtime": 20948.6135, "train_tokens_per_second": 144680.713 }, { "epoch": 0.27704859879389854, "grad_norm": 0.33122000098228455, "learning_rate": 3.2981741043290975e-05, "loss": 0.4245, "num_input_tokens_seen": 3034825276, "step": 781, "train_runtime": 20977.7256, "train_tokens_per_second": 144668.937 }, { "epoch": 0.27740333451578575, "grad_norm": 0.27475839853286743, "learning_rate": 3.296474145472583e-05, "loss": 0.4319, "num_input_tokens_seen": 3038602509, "step": 782, "train_runtime": 21006.3014, "train_tokens_per_second": 144651.952 }, { "epoch": 0.27775807023767296, "grad_norm": 0.2930700480937958, "learning_rate": 3.294772569563656e-05, "loss": 0.4285, "num_input_tokens_seen": 3042491962, "step": 783, "train_runtime": 21025.5494, "train_tokens_per_second": 144704.517 }, { "epoch": 0.2781128059595601, "grad_norm": 0.3133538067340851, "learning_rate": 3.293069378724641e-05, "loss": 0.4246, "num_input_tokens_seen": 3046280854, "step": 784, "train_runtime": 21054.4736, "train_tokens_per_second": 144685.681 }, { "epoch": 0.27846754168144733, "grad_norm": 0.30708372592926025, "learning_rate": 3.291364575079876e-05, "loss": 0.4152, "num_input_tokens_seen": 3050162570, "step": 785, "train_runtime": 21082.8771, "train_tokens_per_second": 144674.873 }, { "epoch": 0.27882227740333454, "grad_norm": 0.2622973620891571, "learning_rate": 3.28965816075571e-05, "loss": 0.4329, "num_input_tokens_seen": 3054066741, "step": 786, "train_runtime": 21122.6108, "train_tokens_per_second": 144587.559 }, { "epoch": 0.2791770131252217, "grad_norm": 0.2962149977684021, "learning_rate": 3.287950137880502e-05, "loss": 0.4227, "num_input_tokens_seen": 3057959874, "step": 787, "train_runtime": 21147.429, "train_tokens_per_second": 144601.969 }, { "epoch": 0.2795317488471089, "grad_norm": 0.2769160568714142, "learning_rate": 3.286240508584615e-05, "loss": 0.404, "num_input_tokens_seen": 3061838009, "step": 788, "train_runtime": 21176.3114, "train_tokens_per_second": 144587.882 }, { "epoch": 0.2798864845689961, "grad_norm": 0.29237183928489685, "learning_rate": 3.2845292750004185e-05, "loss": 0.4401, "num_input_tokens_seen": 3065721158, "step": 789, "train_runtime": 21197.2648, "train_tokens_per_second": 144628.148 }, { "epoch": 0.2802412202908833, "grad_norm": 0.35701775550842285, "learning_rate": 3.2828164392622804e-05, "loss": 0.4214, "num_input_tokens_seen": 3069657992, "step": 790, "train_runtime": 21228.8153, "train_tokens_per_second": 144598.648 }, { "epoch": 0.2805959560127705, "grad_norm": 0.2598082423210144, "learning_rate": 3.281102003506569e-05, "loss": 0.4362, "num_input_tokens_seen": 3073507565, "step": 791, "train_runtime": 21261.1761, "train_tokens_per_second": 144559.621 }, { "epoch": 0.2809506917346577, "grad_norm": 0.30767160654067993, "learning_rate": 3.279385969871647e-05, "loss": 0.4294, "num_input_tokens_seen": 3077371520, "step": 792, "train_runtime": 21290.873, "train_tokens_per_second": 144539.471 }, { "epoch": 0.28130542745654485, "grad_norm": 0.3389342129230499, "learning_rate": 3.2776683404978705e-05, "loss": 0.4219, "num_input_tokens_seen": 3081269994, "step": 793, "train_runtime": 21317.6967, "train_tokens_per_second": 144540.475 }, { "epoch": 0.28166016317843207, "grad_norm": 0.28167858719825745, "learning_rate": 3.275949117527586e-05, "loss": 0.4212, "num_input_tokens_seen": 3085188988, "step": 794, "train_runtime": 21344.3067, "train_tokens_per_second": 144543.884 }, { "epoch": 0.2820148989003193, "grad_norm": 0.3200244903564453, "learning_rate": 3.2742283031051286e-05, "loss": 0.4386, "num_input_tokens_seen": 3089027037, "step": 795, "train_runtime": 21369.2523, "train_tokens_per_second": 144554.755 }, { "epoch": 0.28236963462220643, "grad_norm": 0.26694780588150024, "learning_rate": 3.272505899376816e-05, "loss": 0.4203, "num_input_tokens_seen": 3092907150, "step": 796, "train_runtime": 21393.1356, "train_tokens_per_second": 144574.746 }, { "epoch": 0.28272437034409365, "grad_norm": 0.2627147138118744, "learning_rate": 3.270781908490949e-05, "loss": 0.4216, "num_input_tokens_seen": 3096783493, "step": 797, "train_runtime": 21420.2826, "train_tokens_per_second": 144572.485 }, { "epoch": 0.28307910606598086, "grad_norm": 0.2615334689617157, "learning_rate": 3.26905633259781e-05, "loss": 0.4289, "num_input_tokens_seen": 3100739990, "step": 798, "train_runtime": 21445.4715, "train_tokens_per_second": 144587.168 }, { "epoch": 0.283433841787868, "grad_norm": 0.28952619433403015, "learning_rate": 3.267329173849656e-05, "loss": 0.4398, "num_input_tokens_seen": 3104557182, "step": 799, "train_runtime": 21467.0369, "train_tokens_per_second": 144619.735 }, { "epoch": 0.2837885775097552, "grad_norm": 0.27217721939086914, "learning_rate": 3.265600434400719e-05, "loss": 0.4341, "num_input_tokens_seen": 3108471957, "step": 800, "train_runtime": 21490.3753, "train_tokens_per_second": 144644.843 }, { "epoch": 0.28414331323164244, "grad_norm": 0.2631455361843109, "learning_rate": 3.263870116407204e-05, "loss": 0.4193, "num_input_tokens_seen": 3112421892, "step": 801, "train_runtime": 21629.567, "train_tokens_per_second": 143896.635 }, { "epoch": 0.28449804895352965, "grad_norm": 0.24109405279159546, "learning_rate": 3.262138222027281e-05, "loss": 0.4291, "num_input_tokens_seen": 3116278511, "step": 802, "train_runtime": 21662.5661, "train_tokens_per_second": 143855.464 }, { "epoch": 0.2848527846754168, "grad_norm": 0.2979956567287445, "learning_rate": 3.260404753421092e-05, "loss": 0.4277, "num_input_tokens_seen": 3120129816, "step": 803, "train_runtime": 21693.7076, "train_tokens_per_second": 143826.49 }, { "epoch": 0.285207520397304, "grad_norm": 0.5256218910217285, "learning_rate": 3.258669712750736e-05, "loss": 0.4382, "num_input_tokens_seen": 3123975306, "step": 804, "train_runtime": 21719.7275, "train_tokens_per_second": 143831.238 }, { "epoch": 0.2855622561191912, "grad_norm": 0.298115611076355, "learning_rate": 3.256933102180278e-05, "loss": 0.4406, "num_input_tokens_seen": 3127875096, "step": 805, "train_runtime": 21746.7123, "train_tokens_per_second": 143832.091 }, { "epoch": 0.2859169918410784, "grad_norm": 0.2937434911727905, "learning_rate": 3.255194923875738e-05, "loss": 0.4256, "num_input_tokens_seen": 3131677969, "step": 806, "train_runtime": 21769.9091, "train_tokens_per_second": 143853.516 }, { "epoch": 0.2862717275629656, "grad_norm": 0.30076491832733154, "learning_rate": 3.253455180005093e-05, "loss": 0.424, "num_input_tokens_seen": 3135705940, "step": 807, "train_runtime": 21792.7056, "train_tokens_per_second": 143887.868 }, { "epoch": 0.2866264632848528, "grad_norm": 0.2963750660419464, "learning_rate": 3.2517138727382725e-05, "loss": 0.4278, "num_input_tokens_seen": 3139571028, "step": 808, "train_runtime": 21819.8919, "train_tokens_per_second": 143885.728 }, { "epoch": 0.28698119900673996, "grad_norm": 0.33046266436576843, "learning_rate": 3.2499710042471544e-05, "loss": 0.4289, "num_input_tokens_seen": 3143378780, "step": 809, "train_runtime": 21848.6035, "train_tokens_per_second": 143870.925 }, { "epoch": 0.2873359347286272, "grad_norm": 0.44056999683380127, "learning_rate": 3.248226576705566e-05, "loss": 0.4296, "num_input_tokens_seen": 3147295385, "step": 810, "train_runtime": 21874.5866, "train_tokens_per_second": 143879.08 }, { "epoch": 0.2876906704505144, "grad_norm": 0.47247377038002014, "learning_rate": 3.2464805922892786e-05, "loss": 0.4266, "num_input_tokens_seen": 3151188597, "step": 811, "train_runtime": 21897.7172, "train_tokens_per_second": 143904.891 }, { "epoch": 0.28804540617240154, "grad_norm": 0.4537815451622009, "learning_rate": 3.244733053176003e-05, "loss": 0.4251, "num_input_tokens_seen": 3155102698, "step": 812, "train_runtime": 21919.5044, "train_tokens_per_second": 143940.421 }, { "epoch": 0.28840014189428875, "grad_norm": 0.40554171800613403, "learning_rate": 3.242983961545394e-05, "loss": 0.4307, "num_input_tokens_seen": 3158996886, "step": 813, "train_runtime": 21952.1547, "train_tokens_per_second": 143903.727 }, { "epoch": 0.28875487761617596, "grad_norm": 0.39647138118743896, "learning_rate": 3.2412333195790377e-05, "loss": 0.4187, "num_input_tokens_seen": 3162872428, "step": 814, "train_runtime": 21977.3124, "train_tokens_per_second": 143915.342 }, { "epoch": 0.2891096133380631, "grad_norm": 0.30424001812934875, "learning_rate": 3.239481129460457e-05, "loss": 0.4226, "num_input_tokens_seen": 3166720415, "step": 815, "train_runtime": 21997.0124, "train_tokens_per_second": 143961.387 }, { "epoch": 0.28946434905995033, "grad_norm": 0.3154950737953186, "learning_rate": 3.237727393375105e-05, "loss": 0.4254, "num_input_tokens_seen": 3170584561, "step": 816, "train_runtime": 22020.8219, "train_tokens_per_second": 143981.209 }, { "epoch": 0.28981908478183754, "grad_norm": 0.38170167803764343, "learning_rate": 3.2359721135103624e-05, "loss": 0.4244, "num_input_tokens_seen": 3174523523, "step": 817, "train_runtime": 22057.4932, "train_tokens_per_second": 143920.413 }, { "epoch": 0.2901738205037247, "grad_norm": 0.33963891863822937, "learning_rate": 3.234215292055535e-05, "loss": 0.4126, "num_input_tokens_seen": 3178463204, "step": 818, "train_runtime": 22085.0268, "train_tokens_per_second": 143919.373 }, { "epoch": 0.2905285562256119, "grad_norm": 0.40921714901924133, "learning_rate": 3.232456931201855e-05, "loss": 0.4078, "num_input_tokens_seen": 3182299673, "step": 819, "train_runtime": 22108.135, "train_tokens_per_second": 143942.475 }, { "epoch": 0.2908832919474991, "grad_norm": 0.34743034839630127, "learning_rate": 3.23069703314247e-05, "loss": 0.4383, "num_input_tokens_seen": 3186206997, "step": 820, "train_runtime": 22131.6103, "train_tokens_per_second": 143966.343 }, { "epoch": 0.29123802766938633, "grad_norm": 0.2844639718532562, "learning_rate": 3.228935600072446e-05, "loss": 0.4286, "num_input_tokens_seen": 3190057260, "step": 821, "train_runtime": 22158.7043, "train_tokens_per_second": 143964.07 }, { "epoch": 0.2915927633912735, "grad_norm": 0.28426653146743774, "learning_rate": 3.227172634188766e-05, "loss": 0.4211, "num_input_tokens_seen": 3193972235, "step": 822, "train_runtime": 22194.8604, "train_tokens_per_second": 143905.939 }, { "epoch": 0.2919474991131607, "grad_norm": 0.2815766930580139, "learning_rate": 3.2254081376903236e-05, "loss": 0.4268, "num_input_tokens_seen": 3197869601, "step": 823, "train_runtime": 22219.9227, "train_tokens_per_second": 143919.024 }, { "epoch": 0.2923022348350479, "grad_norm": 0.34467995166778564, "learning_rate": 3.2236421127779195e-05, "loss": 0.4237, "num_input_tokens_seen": 3201722060, "step": 824, "train_runtime": 22250.1372, "train_tokens_per_second": 143896.733 }, { "epoch": 0.29265697055693507, "grad_norm": 0.2921852171421051, "learning_rate": 3.221874561654263e-05, "loss": 0.4246, "num_input_tokens_seen": 3205656073, "step": 825, "train_runtime": 22278.6202, "train_tokens_per_second": 143889.345 }, { "epoch": 0.2930117062788223, "grad_norm": 0.25858768820762634, "learning_rate": 3.2201054865239676e-05, "loss": 0.4325, "num_input_tokens_seen": 3209562802, "step": 826, "train_runtime": 22301.7387, "train_tokens_per_second": 143915.362 }, { "epoch": 0.2933664420007095, "grad_norm": 0.2993992269039154, "learning_rate": 3.218334889593544e-05, "loss": 0.4345, "num_input_tokens_seen": 3213483042, "step": 827, "train_runtime": 22321.9236, "train_tokens_per_second": 143960.847 }, { "epoch": 0.29372117772259665, "grad_norm": 0.3201778531074524, "learning_rate": 3.216562773071405e-05, "loss": 0.4339, "num_input_tokens_seen": 3217282690, "step": 828, "train_runtime": 22345.1578, "train_tokens_per_second": 143981.202 }, { "epoch": 0.29407591344448386, "grad_norm": 0.3041447103023529, "learning_rate": 3.214789139167858e-05, "loss": 0.4233, "num_input_tokens_seen": 3221235457, "step": 829, "train_runtime": 22366.6482, "train_tokens_per_second": 144019.588 }, { "epoch": 0.2944306491663711, "grad_norm": 0.3313015401363373, "learning_rate": 3.2130139900951e-05, "loss": 0.4177, "num_input_tokens_seen": 3225092022, "step": 830, "train_runtime": 22394.9599, "train_tokens_per_second": 144009.725 }, { "epoch": 0.29478538488825823, "grad_norm": 0.2946214973926544, "learning_rate": 3.2112373280672215e-05, "loss": 0.4101, "num_input_tokens_seen": 3229000058, "step": 831, "train_runtime": 22425.7116, "train_tokens_per_second": 143986.515 }, { "epoch": 0.29514012061014544, "grad_norm": 0.29532864689826965, "learning_rate": 3.209459155300198e-05, "loss": 0.4321, "num_input_tokens_seen": 3232863574, "step": 832, "train_runtime": 22449.1509, "train_tokens_per_second": 144008.278 }, { "epoch": 0.29549485633203265, "grad_norm": 0.31067830324172974, "learning_rate": 3.207679474011889e-05, "loss": 0.4263, "num_input_tokens_seen": 3236777909, "step": 833, "train_runtime": 22470.398, "train_tokens_per_second": 144046.31 }, { "epoch": 0.2958495920539198, "grad_norm": 0.306651771068573, "learning_rate": 3.205898286422038e-05, "loss": 0.4105, "num_input_tokens_seen": 3240677315, "step": 834, "train_runtime": 22499.0977, "train_tokens_per_second": 144035.879 }, { "epoch": 0.296204327775807, "grad_norm": 0.48882198333740234, "learning_rate": 3.204115594752265e-05, "loss": 0.4276, "num_input_tokens_seen": 3244535401, "step": 835, "train_runtime": 22519.7067, "train_tokens_per_second": 144075.384 }, { "epoch": 0.29655906349769423, "grad_norm": 0.4011690020561218, "learning_rate": 3.202331401226066e-05, "loss": 0.4361, "num_input_tokens_seen": 3248395160, "step": 836, "train_runtime": 22549.3961, "train_tokens_per_second": 144056.858 }, { "epoch": 0.2969137992195814, "grad_norm": 0.31670987606048584, "learning_rate": 3.2005457080688114e-05, "loss": 0.4277, "num_input_tokens_seen": 3252314517, "step": 837, "train_runtime": 22575.9929, "train_tokens_per_second": 144060.752 }, { "epoch": 0.2972685349414686, "grad_norm": 0.3661700487136841, "learning_rate": 3.198758517507742e-05, "loss": 0.4276, "num_input_tokens_seen": 3256163200, "step": 838, "train_runtime": 22606.8195, "train_tokens_per_second": 144034.556 }, { "epoch": 0.2976232706633558, "grad_norm": 0.3766869008541107, "learning_rate": 3.196969831771964e-05, "loss": 0.4231, "num_input_tokens_seen": 3260079727, "step": 839, "train_runtime": 22628.2023, "train_tokens_per_second": 144071.53 }, { "epoch": 0.297978006385243, "grad_norm": 0.39170366525650024, "learning_rate": 3.195179653092451e-05, "loss": 0.4331, "num_input_tokens_seen": 3264015175, "step": 840, "train_runtime": 22659.0879, "train_tokens_per_second": 144048.833 }, { "epoch": 0.2983327421071302, "grad_norm": 0.28673863410949707, "learning_rate": 3.1933879837020386e-05, "loss": 0.4232, "num_input_tokens_seen": 3267943021, "step": 841, "train_runtime": 22685.7571, "train_tokens_per_second": 144052.632 }, { "epoch": 0.2986874778290174, "grad_norm": 0.3016822934150696, "learning_rate": 3.191594825835421e-05, "loss": 0.4147, "num_input_tokens_seen": 3271807589, "step": 842, "train_runtime": 22713.1085, "train_tokens_per_second": 144049.309 }, { "epoch": 0.2990422135509046, "grad_norm": 0.3086939752101898, "learning_rate": 3.189800181729149e-05, "loss": 0.4196, "num_input_tokens_seen": 3275689155, "step": 843, "train_runtime": 22743.1802, "train_tokens_per_second": 144029.512 }, { "epoch": 0.29939694927279176, "grad_norm": 0.44738370180130005, "learning_rate": 3.1880040536216256e-05, "loss": 0.4202, "num_input_tokens_seen": 3279629715, "step": 844, "train_runtime": 22762.6504, "train_tokens_per_second": 144079.431 }, { "epoch": 0.29975168499467897, "grad_norm": 0.340026319026947, "learning_rate": 3.186206443753108e-05, "loss": 0.4237, "num_input_tokens_seen": 3283403939, "step": 845, "train_runtime": 22794.5521, "train_tokens_per_second": 144043.363 }, { "epoch": 0.3001064207165662, "grad_norm": 0.38083982467651367, "learning_rate": 3.1844073543656986e-05, "loss": 0.438, "num_input_tokens_seen": 3287347716, "step": 846, "train_runtime": 22824.5236, "train_tokens_per_second": 144027.002 }, { "epoch": 0.30046115643845334, "grad_norm": 0.3081817924976349, "learning_rate": 3.182606787703348e-05, "loss": 0.419, "num_input_tokens_seen": 3291254621, "step": 847, "train_runtime": 22847.7629, "train_tokens_per_second": 144051.505 }, { "epoch": 0.30081589216034055, "grad_norm": 0.3139006793498993, "learning_rate": 3.1808047460118454e-05, "loss": 0.4343, "num_input_tokens_seen": 3295193390, "step": 848, "train_runtime": 22878.9028, "train_tokens_per_second": 144027.597 }, { "epoch": 0.30117062788222776, "grad_norm": 0.2836022675037384, "learning_rate": 3.1790012315388244e-05, "loss": 0.424, "num_input_tokens_seen": 3299078357, "step": 849, "train_runtime": 22906.2619, "train_tokens_per_second": 144025.174 }, { "epoch": 0.3015253636041149, "grad_norm": 0.3160383403301239, "learning_rate": 3.177196246533752e-05, "loss": 0.4218, "num_input_tokens_seen": 3302953189, "step": 850, "train_runtime": 22923.1991, "train_tokens_per_second": 144087.794 }, { "epoch": 0.3018800993260021, "grad_norm": 0.30597734451293945, "learning_rate": 3.1753897932479306e-05, "loss": 0.4327, "num_input_tokens_seen": 3306858765, "step": 851, "train_runtime": 22943.0714, "train_tokens_per_second": 144133.221 }, { "epoch": 0.30223483504788934, "grad_norm": 0.2817297577857971, "learning_rate": 3.1735818739344944e-05, "loss": 0.4241, "num_input_tokens_seen": 3310706620, "step": 852, "train_runtime": 22972.1541, "train_tokens_per_second": 144118.249 }, { "epoch": 0.3025895707697765, "grad_norm": 0.396006315946579, "learning_rate": 3.171772490848406e-05, "loss": 0.4183, "num_input_tokens_seen": 3314585272, "step": 853, "train_runtime": 22995.5251, "train_tokens_per_second": 144140.447 }, { "epoch": 0.3029443064916637, "grad_norm": 0.30512532591819763, "learning_rate": 3.169961646246452e-05, "loss": 0.4263, "num_input_tokens_seen": 3318451374, "step": 854, "train_runtime": 23022.8995, "train_tokens_per_second": 144136.987 }, { "epoch": 0.3032990422135509, "grad_norm": 0.3180830180644989, "learning_rate": 3.168149342387245e-05, "loss": 0.4271, "num_input_tokens_seen": 3322322188, "step": 855, "train_runtime": 23045.1809, "train_tokens_per_second": 144165.594 }, { "epoch": 0.3036537779354381, "grad_norm": 0.2843480408191681, "learning_rate": 3.1663355815312135e-05, "loss": 0.4222, "num_input_tokens_seen": 3326211532, "step": 856, "train_runtime": 23075.5845, "train_tokens_per_second": 144144.194 }, { "epoch": 0.3040085136573253, "grad_norm": 0.3353406488895416, "learning_rate": 3.164520365940609e-05, "loss": 0.4079, "num_input_tokens_seen": 3330032804, "step": 857, "train_runtime": 23097.0136, "train_tokens_per_second": 144175.903 }, { "epoch": 0.3043632493792125, "grad_norm": 0.3359241783618927, "learning_rate": 3.1627036978794925e-05, "loss": 0.4244, "num_input_tokens_seen": 3333892948, "step": 858, "train_runtime": 23123.8944, "train_tokens_per_second": 144175.236 }, { "epoch": 0.30471798510109965, "grad_norm": 0.376526802778244, "learning_rate": 3.160885579613738e-05, "loss": 0.4277, "num_input_tokens_seen": 3337779147, "step": 859, "train_runtime": 23150.6462, "train_tokens_per_second": 144176.5 }, { "epoch": 0.30507272082298686, "grad_norm": 0.3517661988735199, "learning_rate": 3.159066013411029e-05, "loss": 0.4184, "num_input_tokens_seen": 3341676937, "step": 860, "train_runtime": 23177.0594, "train_tokens_per_second": 144180.367 }, { "epoch": 0.3054274565448741, "grad_norm": 0.44237974286079407, "learning_rate": 3.1572450015408545e-05, "loss": 0.4298, "num_input_tokens_seen": 3345542185, "step": 861, "train_runtime": 23202.3183, "train_tokens_per_second": 144189.996 }, { "epoch": 0.3057821922667613, "grad_norm": 0.3818778395652771, "learning_rate": 3.155422546274506e-05, "loss": 0.4197, "num_input_tokens_seen": 3349391839, "step": 862, "train_runtime": 23228.6273, "train_tokens_per_second": 144192.414 }, { "epoch": 0.30613692798864844, "grad_norm": 0.3634932041168213, "learning_rate": 3.1535986498850784e-05, "loss": 0.4299, "num_input_tokens_seen": 3353274003, "step": 863, "train_runtime": 23256.9216, "train_tokens_per_second": 144183.915 }, { "epoch": 0.30649166371053566, "grad_norm": 0.7755188941955566, "learning_rate": 3.15177331464746e-05, "loss": 0.4311, "num_input_tokens_seen": 3357156993, "step": 864, "train_runtime": 23287.604, "train_tokens_per_second": 144160.687 }, { "epoch": 0.30684639943242287, "grad_norm": 0.3350399434566498, "learning_rate": 3.1499465428383345e-05, "loss": 0.4221, "num_input_tokens_seen": 3361030991, "step": 865, "train_runtime": 23323.9363, "train_tokens_per_second": 144102.22 }, { "epoch": 0.30720113515431, "grad_norm": 0.3014869689941406, "learning_rate": 3.14811833673618e-05, "loss": 0.4256, "num_input_tokens_seen": 3364942207, "step": 866, "train_runtime": 23345.436, "train_tokens_per_second": 144137.047 }, { "epoch": 0.30755587087619723, "grad_norm": 0.2887047529220581, "learning_rate": 3.1462886986212605e-05, "loss": 0.4129, "num_input_tokens_seen": 3368824821, "step": 867, "train_runtime": 23367.8973, "train_tokens_per_second": 144164.654 }, { "epoch": 0.30791060659808445, "grad_norm": 0.2710968255996704, "learning_rate": 3.144457630775629e-05, "loss": 0.4286, "num_input_tokens_seen": 3372722861, "step": 868, "train_runtime": 23403.9471, "train_tokens_per_second": 144109.148 }, { "epoch": 0.3082653423199716, "grad_norm": 0.6538788080215454, "learning_rate": 3.1426251354831196e-05, "loss": 0.4194, "num_input_tokens_seen": 3376568587, "step": 869, "train_runtime": 23421.5652, "train_tokens_per_second": 144164.942 }, { "epoch": 0.3086200780418588, "grad_norm": 0.7473627328872681, "learning_rate": 3.140791215029347e-05, "loss": 0.433, "num_input_tokens_seen": 3380510014, "step": 870, "train_runtime": 23445.007, "train_tokens_per_second": 144188.91 }, { "epoch": 0.308974813763746, "grad_norm": 0.3218751549720764, "learning_rate": 3.1389558717017036e-05, "loss": 0.4239, "num_input_tokens_seen": 3384375949, "step": 871, "train_runtime": 23474.2156, "train_tokens_per_second": 144174.187 }, { "epoch": 0.3093295494856332, "grad_norm": 0.2858458161354065, "learning_rate": 3.1371191077893574e-05, "loss": 0.4243, "num_input_tokens_seen": 3388254136, "step": 872, "train_runtime": 23507.6564, "train_tokens_per_second": 144134.068 }, { "epoch": 0.3096842852075204, "grad_norm": 0.31663233041763306, "learning_rate": 3.135280925583248e-05, "loss": 0.4272, "num_input_tokens_seen": 3392123027, "step": 873, "train_runtime": 23535.4303, "train_tokens_per_second": 144128.362 }, { "epoch": 0.3100390209294076, "grad_norm": 0.7640390396118164, "learning_rate": 3.133441327376083e-05, "loss": 0.4171, "num_input_tokens_seen": 3395937715, "step": 874, "train_runtime": 23564.3033, "train_tokens_per_second": 144113.648 }, { "epoch": 0.31039375665129476, "grad_norm": 0.2909909188747406, "learning_rate": 3.131600315462337e-05, "loss": 0.426, "num_input_tokens_seen": 3399886739, "step": 875, "train_runtime": 23599.1641, "train_tokens_per_second": 144068.1 }, { "epoch": 0.310748492373182, "grad_norm": 0.3190307021141052, "learning_rate": 3.1297578921382474e-05, "loss": 0.4242, "num_input_tokens_seen": 3403730745, "step": 876, "train_runtime": 23623.6962, "train_tokens_per_second": 144081.211 }, { "epoch": 0.3111032280950692, "grad_norm": 0.3339364528656006, "learning_rate": 3.1279140597018135e-05, "loss": 0.416, "num_input_tokens_seen": 3407580678, "step": 877, "train_runtime": 23650.7673, "train_tokens_per_second": 144079.075 }, { "epoch": 0.31145796381695634, "grad_norm": 0.3285560607910156, "learning_rate": 3.126068820452789e-05, "loss": 0.4167, "num_input_tokens_seen": 3411517518, "step": 878, "train_runtime": 23682.9914, "train_tokens_per_second": 144049.265 }, { "epoch": 0.31181269953884355, "grad_norm": 0.3208244740962982, "learning_rate": 3.124222176692686e-05, "loss": 0.4315, "num_input_tokens_seen": 3415392181, "step": 879, "train_runtime": 23712.8356, "train_tokens_per_second": 144031.369 }, { "epoch": 0.31216743526073076, "grad_norm": 0.4962501525878906, "learning_rate": 3.122374130724765e-05, "loss": 0.4354, "num_input_tokens_seen": 3419229082, "step": 880, "train_runtime": 23746.8754, "train_tokens_per_second": 143986.484 }, { "epoch": 0.312522170982618, "grad_norm": 0.3161901533603668, "learning_rate": 3.120524684854038e-05, "loss": 0.4167, "num_input_tokens_seen": 3423089737, "step": 881, "train_runtime": 23773.8404, "train_tokens_per_second": 143985.561 }, { "epoch": 0.31287690670450513, "grad_norm": 0.4886123239994049, "learning_rate": 3.118673841387262e-05, "loss": 0.4219, "num_input_tokens_seen": 3426960730, "step": 882, "train_runtime": 23793.636, "train_tokens_per_second": 144028.459 }, { "epoch": 0.31323164242639234, "grad_norm": 0.2929796576499939, "learning_rate": 3.116821602632936e-05, "loss": 0.41, "num_input_tokens_seen": 3430840323, "step": 883, "train_runtime": 23811.0846, "train_tokens_per_second": 144085.848 }, { "epoch": 0.31358637814827955, "grad_norm": 0.309627890586853, "learning_rate": 3.1149679709013035e-05, "loss": 0.4369, "num_input_tokens_seen": 3434711469, "step": 884, "train_runtime": 23845.0645, "train_tokens_per_second": 144042.868 }, { "epoch": 0.3139411138701667, "grad_norm": 0.3303746283054352, "learning_rate": 3.113112948504339e-05, "loss": 0.4215, "num_input_tokens_seen": 3438643666, "step": 885, "train_runtime": 23865.954, "train_tokens_per_second": 144081.551 }, { "epoch": 0.3142958495920539, "grad_norm": 0.3964732885360718, "learning_rate": 3.111256537755757e-05, "loss": 0.4454, "num_input_tokens_seen": 3442516556, "step": 886, "train_runtime": 23896.4847, "train_tokens_per_second": 144059.538 }, { "epoch": 0.31465058531394113, "grad_norm": 0.4032782018184662, "learning_rate": 3.1093987409710015e-05, "loss": 0.4213, "num_input_tokens_seen": 3446389453, "step": 887, "train_runtime": 23933.3511, "train_tokens_per_second": 143999.452 }, { "epoch": 0.3150053210358283, "grad_norm": 0.29762929677963257, "learning_rate": 3.107539560467246e-05, "loss": 0.4256, "num_input_tokens_seen": 3450259297, "step": 888, "train_runtime": 23967.4735, "train_tokens_per_second": 143955.903 }, { "epoch": 0.3153600567577155, "grad_norm": 0.27979356050491333, "learning_rate": 3.105678998563387e-05, "loss": 0.4422, "num_input_tokens_seen": 3454169735, "step": 889, "train_runtime": 23999.9217, "train_tokens_per_second": 143924.209 }, { "epoch": 0.3157147924796027, "grad_norm": 0.37913379073143005, "learning_rate": 3.1038170575800483e-05, "loss": 0.4128, "num_input_tokens_seen": 3458031207, "step": 890, "train_runtime": 24027.8699, "train_tokens_per_second": 143917.51 }, { "epoch": 0.31606952820148987, "grad_norm": 0.35888147354125977, "learning_rate": 3.101953739839572e-05, "loss": 0.4196, "num_input_tokens_seen": 3461897924, "step": 891, "train_runtime": 24049.0618, "train_tokens_per_second": 143951.475 }, { "epoch": 0.3164242639233771, "grad_norm": 0.31002962589263916, "learning_rate": 3.100089047666015e-05, "loss": 0.4299, "num_input_tokens_seen": 3465826640, "step": 892, "train_runtime": 24070.1501, "train_tokens_per_second": 143988.576 }, { "epoch": 0.3167789996452643, "grad_norm": 0.33103877305984497, "learning_rate": 3.098222983385152e-05, "loss": 0.4205, "num_input_tokens_seen": 3469753400, "step": 893, "train_runtime": 24093.6175, "train_tokens_per_second": 144011.309 }, { "epoch": 0.31713373536715145, "grad_norm": 0.41219577193260193, "learning_rate": 3.096355549324468e-05, "loss": 0.4213, "num_input_tokens_seen": 3473615176, "step": 894, "train_runtime": 24122.759, "train_tokens_per_second": 143997.425 }, { "epoch": 0.31748847108903866, "grad_norm": 0.3257758319377899, "learning_rate": 3.094486747813156e-05, "loss": 0.4219, "num_input_tokens_seen": 3477513294, "step": 895, "train_runtime": 24148.4437, "train_tokens_per_second": 144005.69 }, { "epoch": 0.31784320681092587, "grad_norm": 0.3361544907093048, "learning_rate": 3.092616581182114e-05, "loss": 0.4161, "num_input_tokens_seen": 3481352630, "step": 896, "train_runtime": 24174.2183, "train_tokens_per_second": 144010.97 }, { "epoch": 0.318197942532813, "grad_norm": 0.33864957094192505, "learning_rate": 3.090745051763944e-05, "loss": 0.423, "num_input_tokens_seen": 3485238452, "step": 897, "train_runtime": 24209.8601, "train_tokens_per_second": 143959.463 }, { "epoch": 0.31855267825470024, "grad_norm": 0.32999980449676514, "learning_rate": 3.0888721618929474e-05, "loss": 0.4096, "num_input_tokens_seen": 3489136752, "step": 898, "train_runtime": 24234.457, "train_tokens_per_second": 143974.208 }, { "epoch": 0.31890741397658745, "grad_norm": 0.31488683819770813, "learning_rate": 3.0869979139051216e-05, "loss": 0.4162, "num_input_tokens_seen": 3493013399, "step": 899, "train_runtime": 24261.0132, "train_tokens_per_second": 143976.403 }, { "epoch": 0.31926214969847466, "grad_norm": 0.3950006067752838, "learning_rate": 3.08512231013816e-05, "loss": 0.4193, "num_input_tokens_seen": 3496930108, "step": 900, "train_runtime": 24287.761, "train_tokens_per_second": 143979.106 }, { "epoch": 0.3196168854203618, "grad_norm": 0.41678711771965027, "learning_rate": 3.0832453529314444e-05, "loss": 0.4193, "num_input_tokens_seen": 3500760390, "step": 901, "train_runtime": 24315.0154, "train_tokens_per_second": 143975.249 }, { "epoch": 0.31997162114224903, "grad_norm": 0.30840614438056946, "learning_rate": 3.081367044626046e-05, "loss": 0.422, "num_input_tokens_seen": 3504702348, "step": 902, "train_runtime": 24341.4541, "train_tokens_per_second": 143980.813 }, { "epoch": 0.32032635686413624, "grad_norm": 0.28759294748306274, "learning_rate": 3.079487387564721e-05, "loss": 0.4247, "num_input_tokens_seen": 3508619097, "step": 903, "train_runtime": 24373.5352, "train_tokens_per_second": 143951.998 }, { "epoch": 0.3206810925860234, "grad_norm": 0.33111506700515747, "learning_rate": 3.077606384091908e-05, "loss": 0.416, "num_input_tokens_seen": 3512499771, "step": 904, "train_runtime": 24396.551, "train_tokens_per_second": 143975.26 }, { "epoch": 0.3210358283079106, "grad_norm": 0.31747981905937195, "learning_rate": 3.075724036553726e-05, "loss": 0.4221, "num_input_tokens_seen": 3516507781, "step": 905, "train_runtime": 24421.9018, "train_tokens_per_second": 143989.924 }, { "epoch": 0.3213905640297978, "grad_norm": 0.2788420617580414, "learning_rate": 3.073840347297968e-05, "loss": 0.4176, "num_input_tokens_seen": 3520370165, "step": 906, "train_runtime": 24453.001, "train_tokens_per_second": 143964.75 }, { "epoch": 0.321745299751685, "grad_norm": 0.24935147166252136, "learning_rate": 3.0719553186741025e-05, "loss": 0.422, "num_input_tokens_seen": 3524248114, "step": 907, "train_runtime": 24480.0985, "train_tokens_per_second": 143963.805 }, { "epoch": 0.3221000354735722, "grad_norm": 0.29618147015571594, "learning_rate": 3.0700689530332674e-05, "loss": 0.4175, "num_input_tokens_seen": 3528078245, "step": 908, "train_runtime": 24507.2776, "train_tokens_per_second": 143960.431 }, { "epoch": 0.3224547711954594, "grad_norm": 0.27627885341644287, "learning_rate": 3.0681812527282686e-05, "loss": 0.4196, "num_input_tokens_seen": 3532042205, "step": 909, "train_runtime": 24540.1507, "train_tokens_per_second": 143929.117 }, { "epoch": 0.32280950691734656, "grad_norm": 0.4229852259159088, "learning_rate": 3.0662922201135774e-05, "loss": 0.4117, "num_input_tokens_seen": 3535896632, "step": 910, "train_runtime": 24559.9949, "train_tokens_per_second": 143969.762 }, { "epoch": 0.32316424263923377, "grad_norm": 0.28385022282600403, "learning_rate": 3.0644018575453255e-05, "loss": 0.4099, "num_input_tokens_seen": 3539788053, "step": 911, "train_runtime": 24587.019, "train_tokens_per_second": 143969.794 }, { "epoch": 0.323518978361121, "grad_norm": 0.276902437210083, "learning_rate": 3.0625101673813045e-05, "loss": 0.4308, "num_input_tokens_seen": 3543703644, "step": 912, "train_runtime": 24617.0698, "train_tokens_per_second": 143953.106 }, { "epoch": 0.32387371408300814, "grad_norm": 0.3041742742061615, "learning_rate": 3.060617151980962e-05, "loss": 0.4194, "num_input_tokens_seen": 3547565282, "step": 913, "train_runtime": 24636.6499, "train_tokens_per_second": 143995.441 }, { "epoch": 0.32422844980489535, "grad_norm": 0.2900278866291046, "learning_rate": 3.058722813705397e-05, "loss": 0.4243, "num_input_tokens_seen": 3551429252, "step": 914, "train_runtime": 24663.2204, "train_tokens_per_second": 143996.98 }, { "epoch": 0.32458318552678256, "grad_norm": 0.2748449146747589, "learning_rate": 3.0568271549173605e-05, "loss": 0.4296, "num_input_tokens_seen": 3555354224, "step": 915, "train_runtime": 24690.3651, "train_tokens_per_second": 143997.637 }, { "epoch": 0.3249379212486697, "grad_norm": 0.2980968952178955, "learning_rate": 3.0549301779812486e-05, "loss": 0.421, "num_input_tokens_seen": 3559209553, "step": 916, "train_runtime": 24718.1798, "train_tokens_per_second": 143991.571 }, { "epoch": 0.3252926569705569, "grad_norm": 0.32440489530563354, "learning_rate": 3.053031885263102e-05, "loss": 0.4127, "num_input_tokens_seen": 3563087992, "step": 917, "train_runtime": 24739.6615, "train_tokens_per_second": 144023.312 }, { "epoch": 0.32564739269244414, "grad_norm": 0.712638795375824, "learning_rate": 3.051132279130604e-05, "loss": 0.4034, "num_input_tokens_seen": 3567029353, "step": 918, "train_runtime": 24775.3761, "train_tokens_per_second": 143974.781 }, { "epoch": 0.32600212841433135, "grad_norm": 0.4422747790813446, "learning_rate": 3.0492313619530755e-05, "loss": 0.4199, "num_input_tokens_seen": 3570904813, "step": 919, "train_runtime": 24796.211, "train_tokens_per_second": 144010.1 }, { "epoch": 0.3263568641362185, "grad_norm": 0.3181817829608917, "learning_rate": 3.0473291361014713e-05, "loss": 0.4165, "num_input_tokens_seen": 3574871602, "step": 920, "train_runtime": 24816.6966, "train_tokens_per_second": 144051.066 }, { "epoch": 0.3267115998581057, "grad_norm": 0.3687998652458191, "learning_rate": 3.0454256039483784e-05, "loss": 0.4197, "num_input_tokens_seen": 3578677524, "step": 921, "train_runtime": 24844.1043, "train_tokens_per_second": 144045.343 }, { "epoch": 0.32706633557999293, "grad_norm": 0.31957611441612244, "learning_rate": 3.0435207678680164e-05, "loss": 0.422, "num_input_tokens_seen": 3582634058, "step": 922, "train_runtime": 24870.546, "train_tokens_per_second": 144051.283 }, { "epoch": 0.3274210713018801, "grad_norm": 0.28640422224998474, "learning_rate": 3.041614630236227e-05, "loss": 0.4195, "num_input_tokens_seen": 3586501105, "step": 923, "train_runtime": 24900.1478, "train_tokens_per_second": 144035.334 }, { "epoch": 0.3277758070237673, "grad_norm": 0.30606159567832947, "learning_rate": 3.0397071934304773e-05, "loss": 0.426, "num_input_tokens_seen": 3590356872, "step": 924, "train_runtime": 24926.6204, "train_tokens_per_second": 144037.05 }, { "epoch": 0.3281305427456545, "grad_norm": 0.31563109159469604, "learning_rate": 3.0377984598298553e-05, "loss": 0.4155, "num_input_tokens_seen": 3594249873, "step": 925, "train_runtime": 24953.9871, "train_tokens_per_second": 144035.094 }, { "epoch": 0.32848527846754166, "grad_norm": 0.36180245876312256, "learning_rate": 3.0358884318150655e-05, "loss": 0.4214, "num_input_tokens_seen": 3598087239, "step": 926, "train_runtime": 24976.3936, "train_tokens_per_second": 144059.519 }, { "epoch": 0.3288400141894289, "grad_norm": 0.3502935469150543, "learning_rate": 3.033977111768428e-05, "loss": 0.4255, "num_input_tokens_seen": 3601961021, "step": 927, "train_runtime": 25006.6557, "train_tokens_per_second": 144040.093 }, { "epoch": 0.3291947499113161, "grad_norm": 0.2857171893119812, "learning_rate": 3.0320645020738726e-05, "loss": 0.4155, "num_input_tokens_seen": 3605862928, "step": 928, "train_runtime": 25027.2726, "train_tokens_per_second": 144077.343 }, { "epoch": 0.32954948563320324, "grad_norm": 0.2906404137611389, "learning_rate": 3.030150605116939e-05, "loss": 0.4179, "num_input_tokens_seen": 3609780672, "step": 929, "train_runtime": 25055.0957, "train_tokens_per_second": 144073.713 }, { "epoch": 0.32990422135509045, "grad_norm": 0.3446275591850281, "learning_rate": 3.0282354232847715e-05, "loss": 0.4235, "num_input_tokens_seen": 3613616908, "step": 930, "train_runtime": 25081.3816, "train_tokens_per_second": 144075.672 }, { "epoch": 0.33025895707697767, "grad_norm": 0.2910221815109253, "learning_rate": 3.0263189589661195e-05, "loss": 0.4185, "num_input_tokens_seen": 3617608557, "step": 931, "train_runtime": 25116.7146, "train_tokens_per_second": 144031.917 }, { "epoch": 0.3306136927988648, "grad_norm": 0.25484707951545715, "learning_rate": 3.024401214551328e-05, "loss": 0.4222, "num_input_tokens_seen": 3621450194, "step": 932, "train_runtime": 25149.6261, "train_tokens_per_second": 143996.184 }, { "epoch": 0.33096842852075203, "grad_norm": 0.28969043493270874, "learning_rate": 3.0224821924323423e-05, "loss": 0.4311, "num_input_tokens_seen": 3625338564, "step": 933, "train_runtime": 25180.9437, "train_tokens_per_second": 143971.513 }, { "epoch": 0.33132316424263925, "grad_norm": 0.3808700144290924, "learning_rate": 3.0205618950026987e-05, "loss": 0.4194, "num_input_tokens_seen": 3629239972, "step": 934, "train_runtime": 25207.0032, "train_tokens_per_second": 143977.447 }, { "epoch": 0.3316778999645264, "grad_norm": 0.33397942781448364, "learning_rate": 3.0186403246575263e-05, "loss": 0.4301, "num_input_tokens_seen": 3633058108, "step": 935, "train_runtime": 25234.3745, "train_tokens_per_second": 143972.584 }, { "epoch": 0.3320326356864136, "grad_norm": 0.30257412791252136, "learning_rate": 3.0167174837935397e-05, "loss": 0.4177, "num_input_tokens_seen": 3637051098, "step": 936, "train_runtime": 25254.4786, "train_tokens_per_second": 144016.083 }, { "epoch": 0.3323873714083008, "grad_norm": 0.2951897978782654, "learning_rate": 3.0147933748090402e-05, "loss": 0.4241, "num_input_tokens_seen": 3640858286, "step": 937, "train_runtime": 25277.718, "train_tokens_per_second": 144034.295 }, { "epoch": 0.33274210713018804, "grad_norm": 0.28511711955070496, "learning_rate": 3.0128680001039093e-05, "loss": 0.4259, "num_input_tokens_seen": 3644720756, "step": 938, "train_runtime": 25304.8266, "train_tokens_per_second": 144032.631 }, { "epoch": 0.3330968428520752, "grad_norm": 0.320285826921463, "learning_rate": 3.010941362079608e-05, "loss": 0.4153, "num_input_tokens_seen": 3648574694, "step": 939, "train_runtime": 25330.1288, "train_tokens_per_second": 144040.906 }, { "epoch": 0.3334515785739624, "grad_norm": 0.33538973331451416, "learning_rate": 3.009013463139173e-05, "loss": 0.4359, "num_input_tokens_seen": 3652516721, "step": 940, "train_runtime": 25349.4547, "train_tokens_per_second": 144086.6 }, { "epoch": 0.3338063142958496, "grad_norm": 0.31675776839256287, "learning_rate": 3.0070843056872134e-05, "loss": 0.4153, "num_input_tokens_seen": 3656418223, "step": 941, "train_runtime": 25378.9045, "train_tokens_per_second": 144073.13 }, { "epoch": 0.33416105001773677, "grad_norm": 0.260688453912735, "learning_rate": 3.0051538921299074e-05, "loss": 0.4203, "num_input_tokens_seen": 3660293404, "step": 942, "train_runtime": 25405.707, "train_tokens_per_second": 144073.668 }, { "epoch": 0.334515785739624, "grad_norm": 0.2899554371833801, "learning_rate": 3.0032222248750016e-05, "loss": 0.4205, "num_input_tokens_seen": 3664227392, "step": 943, "train_runtime": 25432.7542, "train_tokens_per_second": 144075.131 }, { "epoch": 0.3348705214615112, "grad_norm": 0.3045352101325989, "learning_rate": 3.0012893063318043e-05, "loss": 0.4205, "num_input_tokens_seen": 3668096829, "step": 944, "train_runtime": 25454.1914, "train_tokens_per_second": 144105.808 }, { "epoch": 0.33522525718339835, "grad_norm": 0.3386250436306, "learning_rate": 2.9993551389111865e-05, "loss": 0.422, "num_input_tokens_seen": 3672011194, "step": 945, "train_runtime": 25480.4527, "train_tokens_per_second": 144110.909 }, { "epoch": 0.33557999290528556, "grad_norm": 0.2812177836894989, "learning_rate": 2.997419725025575e-05, "loss": 0.4169, "num_input_tokens_seen": 3675936712, "step": 946, "train_runtime": 25506.632, "train_tokens_per_second": 144116.899 }, { "epoch": 0.3359347286271728, "grad_norm": 0.2958664894104004, "learning_rate": 2.9954830670889524e-05, "loss": 0.4306, "num_input_tokens_seen": 3679794518, "step": 947, "train_runtime": 25540.3095, "train_tokens_per_second": 144077.914 }, { "epoch": 0.33628946434905993, "grad_norm": 0.4579515755176544, "learning_rate": 2.9935451675168533e-05, "loss": 0.4103, "num_input_tokens_seen": 3683667058, "step": 948, "train_runtime": 25569.0927, "train_tokens_per_second": 144067.179 }, { "epoch": 0.33664420007094714, "grad_norm": 0.3171616494655609, "learning_rate": 2.9916060287263594e-05, "loss": 0.4249, "num_input_tokens_seen": 3687559145, "step": 949, "train_runtime": 25602.2614, "train_tokens_per_second": 144032.556 }, { "epoch": 0.33699893579283435, "grad_norm": 0.35659292340278625, "learning_rate": 2.9896656531360997e-05, "loss": 0.4194, "num_input_tokens_seen": 3691457616, "step": 950, "train_runtime": 25634.1979, "train_tokens_per_second": 144005.193 }, { "epoch": 0.3373536715147215, "grad_norm": 0.3650408685207367, "learning_rate": 2.9877240431662442e-05, "loss": 0.4262, "num_input_tokens_seen": 3695367925, "step": 951, "train_runtime": 25659.5328, "train_tokens_per_second": 144015.402 }, { "epoch": 0.3377084072366087, "grad_norm": 0.26571065187454224, "learning_rate": 2.9857812012385045e-05, "loss": 0.4162, "num_input_tokens_seen": 3699235920, "step": 952, "train_runtime": 25683.1597, "train_tokens_per_second": 144033.521 }, { "epoch": 0.33806314295849593, "grad_norm": 0.25287312269210815, "learning_rate": 2.9838371297761273e-05, "loss": 0.4099, "num_input_tokens_seen": 3703151657, "step": 953, "train_runtime": 25716.2755, "train_tokens_per_second": 144000.311 }, { "epoch": 0.3384178786803831, "grad_norm": 0.26724275946617126, "learning_rate": 2.9818918312038934e-05, "loss": 0.4143, "num_input_tokens_seen": 3706997734, "step": 954, "train_runtime": 25754.737, "train_tokens_per_second": 143934.598 }, { "epoch": 0.3387726144022703, "grad_norm": 0.3108718991279602, "learning_rate": 2.9799453079481136e-05, "loss": 0.4169, "num_input_tokens_seen": 3710858968, "step": 955, "train_runtime": 25783.664, "train_tokens_per_second": 143922.872 }, { "epoch": 0.3391273501241575, "grad_norm": 0.27770107984542847, "learning_rate": 2.9779975624366276e-05, "loss": 0.4157, "num_input_tokens_seen": 3714780349, "step": 956, "train_runtime": 25809.7494, "train_tokens_per_second": 143929.346 }, { "epoch": 0.3394820858460447, "grad_norm": 0.32204490900039673, "learning_rate": 2.976048597098797e-05, "loss": 0.4153, "num_input_tokens_seen": 3718640842, "step": 957, "train_runtime": 25832.093, "train_tokens_per_second": 143954.299 }, { "epoch": 0.3398368215679319, "grad_norm": 0.273165225982666, "learning_rate": 2.9740984143655075e-05, "loss": 0.4202, "num_input_tokens_seen": 3722549524, "step": 958, "train_runtime": 25855.0714, "train_tokens_per_second": 143977.538 }, { "epoch": 0.3401915572898191, "grad_norm": 0.30673152208328247, "learning_rate": 2.972147016669163e-05, "loss": 0.4183, "num_input_tokens_seen": 3726449849, "step": 959, "train_runtime": 25883.2901, "train_tokens_per_second": 143971.258 }, { "epoch": 0.3405462930117063, "grad_norm": 0.374790221452713, "learning_rate": 2.97019440644368e-05, "loss": 0.413, "num_input_tokens_seen": 3730327317, "step": 960, "train_runtime": 25900.4401, "train_tokens_per_second": 144025.635 }, { "epoch": 0.34090102873359346, "grad_norm": 0.26896795630455017, "learning_rate": 2.9682405861244902e-05, "loss": 0.41, "num_input_tokens_seen": 3734188757, "step": 961, "train_runtime": 25926.7507, "train_tokens_per_second": 144028.413 }, { "epoch": 0.34125576445548067, "grad_norm": 0.3007280230522156, "learning_rate": 2.966285558148534e-05, "loss": 0.4183, "num_input_tokens_seen": 3738080450, "step": 962, "train_runtime": 25959.8905, "train_tokens_per_second": 143994.461 }, { "epoch": 0.3416105001773679, "grad_norm": 0.3270433843135834, "learning_rate": 2.9643293249542573e-05, "loss": 0.4126, "num_input_tokens_seen": 3741993630, "step": 963, "train_runtime": 25986.0973, "train_tokens_per_second": 143999.831 }, { "epoch": 0.34196523589925504, "grad_norm": 0.37757495045661926, "learning_rate": 2.9623718889816105e-05, "loss": 0.4208, "num_input_tokens_seen": 3745861351, "step": 964, "train_runtime": 26007.7278, "train_tokens_per_second": 144028.781 }, { "epoch": 0.34231997162114225, "grad_norm": 0.25081804394721985, "learning_rate": 2.9604132526720426e-05, "loss": 0.4208, "num_input_tokens_seen": 3749757561, "step": 965, "train_runtime": 26030.6329, "train_tokens_per_second": 144051.724 }, { "epoch": 0.34267470734302946, "grad_norm": 0.2989189922809601, "learning_rate": 2.958453418468501e-05, "loss": 0.4272, "num_input_tokens_seen": 3753571170, "step": 966, "train_runtime": 26062.7449, "train_tokens_per_second": 144020.562 }, { "epoch": 0.3430294430649166, "grad_norm": 0.3071904182434082, "learning_rate": 2.9564923888154267e-05, "loss": 0.4133, "num_input_tokens_seen": 3757482199, "step": 967, "train_runtime": 26088.9229, "train_tokens_per_second": 144025.961 }, { "epoch": 0.34338417878680383, "grad_norm": 0.26230040192604065, "learning_rate": 2.954530166158752e-05, "loss": 0.4251, "num_input_tokens_seen": 3761324746, "step": 968, "train_runtime": 26110.1424, "train_tokens_per_second": 144056.079 }, { "epoch": 0.34373891450869104, "grad_norm": 0.2847675681114197, "learning_rate": 2.952566752945896e-05, "loss": 0.42, "num_input_tokens_seen": 3765193658, "step": 969, "train_runtime": 26135.6404, "train_tokens_per_second": 144063.57 }, { "epoch": 0.3440936502305782, "grad_norm": 0.277387797832489, "learning_rate": 2.9506021516257646e-05, "loss": 0.4143, "num_input_tokens_seen": 3769041363, "step": 970, "train_runtime": 26161.9899, "train_tokens_per_second": 144065.546 }, { "epoch": 0.3444483859524654, "grad_norm": 0.317099928855896, "learning_rate": 2.9486363646487443e-05, "loss": 0.4228, "num_input_tokens_seen": 3772955243, "step": 971, "train_runtime": 26195.7425, "train_tokens_per_second": 144029.33 }, { "epoch": 0.3448031216743526, "grad_norm": 0.30853453278541565, "learning_rate": 2.946669394466702e-05, "loss": 0.417, "num_input_tokens_seen": 3776821587, "step": 972, "train_runtime": 26224.0242, "train_tokens_per_second": 144021.435 }, { "epoch": 0.3451578573962398, "grad_norm": 0.33585870265960693, "learning_rate": 2.944701243532978e-05, "loss": 0.4224, "num_input_tokens_seen": 3780634477, "step": 973, "train_runtime": 26254.3657, "train_tokens_per_second": 144000.222 }, { "epoch": 0.345512593118127, "grad_norm": 0.29483336210250854, "learning_rate": 2.942731914302387e-05, "loss": 0.4094, "num_input_tokens_seen": 3784525347, "step": 974, "train_runtime": 26280.4326, "train_tokens_per_second": 144005.443 }, { "epoch": 0.3458673288400142, "grad_norm": 0.2839784622192383, "learning_rate": 2.940761409231213e-05, "loss": 0.4176, "num_input_tokens_seen": 3788407729, "step": 975, "train_runtime": 26309.0405, "train_tokens_per_second": 143996.423 }, { "epoch": 0.3462220645619014, "grad_norm": 0.4073295593261719, "learning_rate": 2.938789730777206e-05, "loss": 0.4141, "num_input_tokens_seen": 3792304277, "step": 976, "train_runtime": 26335.527, "train_tokens_per_second": 143999.559 }, { "epoch": 0.34657680028378857, "grad_norm": 0.3177393972873688, "learning_rate": 2.9368168813995806e-05, "loss": 0.4196, "num_input_tokens_seen": 3796152833, "step": 977, "train_runtime": 26358.4351, "train_tokens_per_second": 144020.417 }, { "epoch": 0.3469315360056758, "grad_norm": 0.5254577994346619, "learning_rate": 2.934842863559011e-05, "loss": 0.4282, "num_input_tokens_seen": 3800049028, "step": 978, "train_runtime": 26385.5485, "train_tokens_per_second": 144020.088 }, { "epoch": 0.347286271727563, "grad_norm": 0.3055552542209625, "learning_rate": 2.932867679717629e-05, "loss": 0.4179, "num_input_tokens_seen": 3803888849, "step": 979, "train_runtime": 26413.5716, "train_tokens_per_second": 144012.665 }, { "epoch": 0.34764100744945015, "grad_norm": 0.3013061285018921, "learning_rate": 2.930891332339021e-05, "loss": 0.4264, "num_input_tokens_seen": 3807799779, "step": 980, "train_runtime": 26452.6417, "train_tokens_per_second": 143947.808 }, { "epoch": 0.34799574317133736, "grad_norm": 0.40968310832977295, "learning_rate": 2.9289138238882245e-05, "loss": 0.4148, "num_input_tokens_seen": 3811697610, "step": 981, "train_runtime": 26485.7214, "train_tokens_per_second": 143915.19 }, { "epoch": 0.34835047889322457, "grad_norm": 0.3367553651332855, "learning_rate": 2.926935156831725e-05, "loss": 0.4006, "num_input_tokens_seen": 3815555001, "step": 982, "train_runtime": 26505.8405, "train_tokens_per_second": 143951.481 }, { "epoch": 0.3487052146151117, "grad_norm": 0.34006190299987793, "learning_rate": 2.9249553336374527e-05, "loss": 0.4036, "num_input_tokens_seen": 3819433520, "step": 983, "train_runtime": 26543.4029, "train_tokens_per_second": 143893.891 }, { "epoch": 0.34905995033699894, "grad_norm": 0.2868550419807434, "learning_rate": 2.9229743567747814e-05, "loss": 0.4002, "num_input_tokens_seen": 3823265618, "step": 984, "train_runtime": 26567.0746, "train_tokens_per_second": 143909.922 }, { "epoch": 0.34941468605888615, "grad_norm": 0.33082929253578186, "learning_rate": 2.9209922287145212e-05, "loss": 0.413, "num_input_tokens_seen": 3827210484, "step": 985, "train_runtime": 26592.5825, "train_tokens_per_second": 143920.226 }, { "epoch": 0.3497694217807733, "grad_norm": 0.2733980417251587, "learning_rate": 2.9190089519289212e-05, "loss": 0.4218, "num_input_tokens_seen": 3831047426, "step": 986, "train_runtime": 26618.0516, "train_tokens_per_second": 143926.666 }, { "epoch": 0.3501241575026605, "grad_norm": 0.2574746608734131, "learning_rate": 2.9170245288916606e-05, "loss": 0.4047, "num_input_tokens_seen": 3834967410, "step": 987, "train_runtime": 26643.3367, "train_tokens_per_second": 143937.205 }, { "epoch": 0.3504788932245477, "grad_norm": 0.2640879452228546, "learning_rate": 2.9150389620778495e-05, "loss": 0.4174, "num_input_tokens_seen": 3838836050, "step": 988, "train_runtime": 26663.7038, "train_tokens_per_second": 143972.348 }, { "epoch": 0.3508336289464349, "grad_norm": 0.32342126965522766, "learning_rate": 2.9130522539640247e-05, "loss": 0.4128, "num_input_tokens_seen": 3842731459, "step": 989, "train_runtime": 26696.8916, "train_tokens_per_second": 143939.284 }, { "epoch": 0.3511883646683221, "grad_norm": 0.33166882395744324, "learning_rate": 2.911064407028147e-05, "loss": 0.4367, "num_input_tokens_seen": 3846625316, "step": 990, "train_runtime": 26723.8467, "train_tokens_per_second": 143939.806 }, { "epoch": 0.3515431003902093, "grad_norm": 0.2955820560455322, "learning_rate": 2.9090754237495953e-05, "loss": 0.4181, "num_input_tokens_seen": 3850522752, "step": 991, "train_runtime": 26749.2322, "train_tokens_per_second": 143948.907 }, { "epoch": 0.35189783611209646, "grad_norm": 0.3324822783470154, "learning_rate": 2.9070853066091685e-05, "loss": 0.4199, "num_input_tokens_seen": 3854343541, "step": 992, "train_runtime": 26784.0848, "train_tokens_per_second": 143904.247 }, { "epoch": 0.3522525718339837, "grad_norm": 0.3291053771972656, "learning_rate": 2.9050940580890783e-05, "loss": 0.4144, "num_input_tokens_seen": 3858300077, "step": 993, "train_runtime": 26806.4533, "train_tokens_per_second": 143931.763 }, { "epoch": 0.3526073075558709, "grad_norm": 0.3676835894584656, "learning_rate": 2.9031016806729474e-05, "loss": 0.433, "num_input_tokens_seen": 3862198400, "step": 994, "train_runtime": 26824.5566, "train_tokens_per_second": 143979.953 }, { "epoch": 0.3529620432777581, "grad_norm": 0.2937004566192627, "learning_rate": 2.9011081768458077e-05, "loss": 0.4165, "num_input_tokens_seen": 3866014199, "step": 995, "train_runtime": 26852.1007, "train_tokens_per_second": 143974.367 }, { "epoch": 0.35331677899964525, "grad_norm": 0.33474525809288025, "learning_rate": 2.899113549094095e-05, "loss": 0.4227, "num_input_tokens_seen": 3869884794, "step": 996, "train_runtime": 26876.0498, "train_tokens_per_second": 143990.089 }, { "epoch": 0.35367151472153247, "grad_norm": 0.4190887212753296, "learning_rate": 2.8971177999056474e-05, "loss": 0.42, "num_input_tokens_seen": 3873755854, "step": 997, "train_runtime": 26902.9991, "train_tokens_per_second": 143989.74 }, { "epoch": 0.3540262504434197, "grad_norm": 0.2918322682380676, "learning_rate": 2.895120931769702e-05, "loss": 0.4215, "num_input_tokens_seen": 3877655725, "step": 998, "train_runtime": 26919.874, "train_tokens_per_second": 144044.349 }, { "epoch": 0.35438098616530683, "grad_norm": 0.2927844226360321, "learning_rate": 2.89312294717689e-05, "loss": 0.4185, "num_input_tokens_seen": 3881573541, "step": 999, "train_runtime": 26942.0526, "train_tokens_per_second": 144071.188 }, { "epoch": 0.35473572188719404, "grad_norm": 0.3124673366546631, "learning_rate": 2.891123848619238e-05, "loss": 0.4249, "num_input_tokens_seen": 3885463432, "step": 1000, "train_runtime": 26968.1605, "train_tokens_per_second": 144075.953 }, { "epoch": 0.35509045760908126, "grad_norm": 0.31998175382614136, "learning_rate": 2.8891236385901584e-05, "loss": 0.4291, "num_input_tokens_seen": 3889380393, "step": 1001, "train_runtime": 27071.3471, "train_tokens_per_second": 143671.476 }, { "epoch": 0.3554451933309684, "grad_norm": 0.5325219631195068, "learning_rate": 2.8871223195844533e-05, "loss": 0.4124, "num_input_tokens_seen": 3893262433, "step": 1002, "train_runtime": 27107.6091, "train_tokens_per_second": 143622.494 }, { "epoch": 0.3557999290528556, "grad_norm": 0.3206736743450165, "learning_rate": 2.8851198940983054e-05, "loss": 0.4126, "num_input_tokens_seen": 3897116741, "step": 1003, "train_runtime": 27134.3652, "train_tokens_per_second": 143622.919 }, { "epoch": 0.35615466477474284, "grad_norm": 0.39520376920700073, "learning_rate": 2.883116364629279e-05, "loss": 0.4229, "num_input_tokens_seen": 3901043603, "step": 1004, "train_runtime": 27163.4943, "train_tokens_per_second": 143613.468 }, { "epoch": 0.35650940049663, "grad_norm": 0.3232939839363098, "learning_rate": 2.881111733676315e-05, "loss": 0.4116, "num_input_tokens_seen": 3904896770, "step": 1005, "train_runtime": 27193.1124, "train_tokens_per_second": 143598.744 }, { "epoch": 0.3568641362185172, "grad_norm": 0.29255425930023193, "learning_rate": 2.879106003739728e-05, "loss": 0.4289, "num_input_tokens_seen": 3908694141, "step": 1006, "train_runtime": 27224.4752, "train_tokens_per_second": 143572.801 }, { "epoch": 0.3572188719404044, "grad_norm": 0.6172589063644409, "learning_rate": 2.8770991773212032e-05, "loss": 0.4035, "num_input_tokens_seen": 3912585781, "step": 1007, "train_runtime": 27241.8038, "train_tokens_per_second": 143624.328 }, { "epoch": 0.35757360766229157, "grad_norm": 0.30115655064582825, "learning_rate": 2.8750912569237937e-05, "loss": 0.4301, "num_input_tokens_seen": 3916472753, "step": 1008, "train_runtime": 27267.4459, "train_tokens_per_second": 143631.815 }, { "epoch": 0.3579283433841788, "grad_norm": 0.2608516216278076, "learning_rate": 2.8730822450519172e-05, "loss": 0.414, "num_input_tokens_seen": 3920378307, "step": 1009, "train_runtime": 27296.5321, "train_tokens_per_second": 143621.845 }, { "epoch": 0.358283079106066, "grad_norm": 0.30225321650505066, "learning_rate": 2.8710721442113523e-05, "loss": 0.414, "num_input_tokens_seen": 3924232237, "step": 1010, "train_runtime": 27327.4384, "train_tokens_per_second": 143600.442 }, { "epoch": 0.35863781482795315, "grad_norm": 0.5918945670127869, "learning_rate": 2.869060956909236e-05, "loss": 0.421, "num_input_tokens_seen": 3928046535, "step": 1011, "train_runtime": 27352.8084, "train_tokens_per_second": 143606.699 }, { "epoch": 0.35899255054984036, "grad_norm": 0.30162158608436584, "learning_rate": 2.8670486856540612e-05, "loss": 0.4112, "num_input_tokens_seen": 3931919135, "step": 1012, "train_runtime": 27375.416, "train_tokens_per_second": 143629.566 }, { "epoch": 0.3593472862717276, "grad_norm": 0.5989415645599365, "learning_rate": 2.865035332955671e-05, "loss": 0.4131, "num_input_tokens_seen": 3935878556, "step": 1013, "train_runtime": 27403.0501, "train_tokens_per_second": 143629.214 }, { "epoch": 0.3597020219936148, "grad_norm": 0.2793273329734802, "learning_rate": 2.8630209013252593e-05, "loss": 0.4203, "num_input_tokens_seen": 3939759697, "step": 1014, "train_runtime": 27433.7391, "train_tokens_per_second": 143610.015 }, { "epoch": 0.36005675771550194, "grad_norm": 0.25752344727516174, "learning_rate": 2.861005393275364e-05, "loss": 0.4229, "num_input_tokens_seen": 3943620823, "step": 1015, "train_runtime": 27454.8628, "train_tokens_per_second": 143640.158 }, { "epoch": 0.36041149343738915, "grad_norm": 0.32048457860946655, "learning_rate": 2.8589888113198675e-05, "loss": 0.4272, "num_input_tokens_seen": 3947497611, "step": 1016, "train_runtime": 27474.5995, "train_tokens_per_second": 143678.077 }, { "epoch": 0.36076622915927636, "grad_norm": 0.357028067111969, "learning_rate": 2.85697115797399e-05, "loss": 0.4196, "num_input_tokens_seen": 3951416215, "step": 1017, "train_runtime": 27506.6121, "train_tokens_per_second": 143653.323 }, { "epoch": 0.3611209648811635, "grad_norm": 0.32205328345298767, "learning_rate": 2.8549524357542887e-05, "loss": 0.4248, "num_input_tokens_seen": 3955274504, "step": 1018, "train_runtime": 27536.828, "train_tokens_per_second": 143635.807 }, { "epoch": 0.36147570060305073, "grad_norm": 0.5319463014602661, "learning_rate": 2.8529326471786536e-05, "loss": 0.4178, "num_input_tokens_seen": 3959207363, "step": 1019, "train_runtime": 27561.8591, "train_tokens_per_second": 143648.052 }, { "epoch": 0.36183043632493794, "grad_norm": 0.3035302758216858, "learning_rate": 2.850911794766305e-05, "loss": 0.4133, "num_input_tokens_seen": 3963110205, "step": 1020, "train_runtime": 27589.8937, "train_tokens_per_second": 143643.548 }, { "epoch": 0.3621851720468251, "grad_norm": 0.347476601600647, "learning_rate": 2.8488898810377907e-05, "loss": 0.4215, "num_input_tokens_seen": 3966993672, "step": 1021, "train_runtime": 27616.4817, "train_tokens_per_second": 143645.875 }, { "epoch": 0.3625399077687123, "grad_norm": 0.353494256734848, "learning_rate": 2.8468669085149812e-05, "loss": 0.4156, "num_input_tokens_seen": 3970836955, "step": 1022, "train_runtime": 27646.5871, "train_tokens_per_second": 143628.468 }, { "epoch": 0.3628946434905995, "grad_norm": 0.29660457372665405, "learning_rate": 2.8448428797210673e-05, "loss": 0.4181, "num_input_tokens_seen": 3974742927, "step": 1023, "train_runtime": 27673.8466, "train_tokens_per_second": 143628.133 }, { "epoch": 0.3632493792124867, "grad_norm": 0.2958623468875885, "learning_rate": 2.8428177971805593e-05, "loss": 0.4123, "num_input_tokens_seen": 3978571880, "step": 1024, "train_runtime": 27709.1932, "train_tokens_per_second": 143583.101 }, { "epoch": 0.3636041149343739, "grad_norm": 0.3423788249492645, "learning_rate": 2.8407916634192802e-05, "loss": 0.4272, "num_input_tokens_seen": 3982588389, "step": 1025, "train_runtime": 27725.8725, "train_tokens_per_second": 143641.589 }, { "epoch": 0.3639588506562611, "grad_norm": 0.3430069386959076, "learning_rate": 2.8387644809643635e-05, "loss": 0.4158, "num_input_tokens_seen": 3986417074, "step": 1026, "train_runtime": 27752.0022, "train_tokens_per_second": 143644.305 }, { "epoch": 0.36431358637814826, "grad_norm": 0.2900354564189911, "learning_rate": 2.8367362523442534e-05, "loss": 0.4103, "num_input_tokens_seen": 3990250347, "step": 1027, "train_runtime": 27778.5192, "train_tokens_per_second": 143645.178 }, { "epoch": 0.36466832210003547, "grad_norm": 0.2975355386734009, "learning_rate": 2.8347069800886955e-05, "loss": 0.4173, "num_input_tokens_seen": 3994192059, "step": 1028, "train_runtime": 27806.8857, "train_tokens_per_second": 143640.395 }, { "epoch": 0.3650230578219227, "grad_norm": 0.3661717474460602, "learning_rate": 2.8326766667287394e-05, "loss": 0.4251, "num_input_tokens_seen": 3998067537, "step": 1029, "train_runtime": 27832.7475, "train_tokens_per_second": 143646.169 }, { "epoch": 0.36537779354380984, "grad_norm": 0.3340739607810974, "learning_rate": 2.830645314796733e-05, "loss": 0.4171, "num_input_tokens_seen": 4001885814, "step": 1030, "train_runtime": 27851.5976, "train_tokens_per_second": 143686.042 }, { "epoch": 0.36573252926569705, "grad_norm": 0.2818034887313843, "learning_rate": 2.8286129268263188e-05, "loss": 0.415, "num_input_tokens_seen": 4005751293, "step": 1031, "train_runtime": 27880.2119, "train_tokens_per_second": 143677.218 }, { "epoch": 0.36608726498758426, "grad_norm": 0.3315163850784302, "learning_rate": 2.826579505352432e-05, "loss": 0.4111, "num_input_tokens_seen": 4009655358, "step": 1032, "train_runtime": 27901.7813, "train_tokens_per_second": 143706.071 }, { "epoch": 0.36644200070947147, "grad_norm": 0.35817989706993103, "learning_rate": 2.824545052911297e-05, "loss": 0.4145, "num_input_tokens_seen": 4013537215, "step": 1033, "train_runtime": 27927.7833, "train_tokens_per_second": 143711.27 }, { "epoch": 0.3667967364313586, "grad_norm": 0.39178240299224854, "learning_rate": 2.8225095720404244e-05, "loss": 0.4168, "num_input_tokens_seen": 4017434580, "step": 1034, "train_runtime": 27949.3949, "train_tokens_per_second": 143739.591 }, { "epoch": 0.36715147215324584, "grad_norm": 0.3213486075401306, "learning_rate": 2.8204730652786056e-05, "loss": 0.4136, "num_input_tokens_seen": 4021358559, "step": 1035, "train_runtime": 27973.3737, "train_tokens_per_second": 143756.653 }, { "epoch": 0.36750620787513305, "grad_norm": 0.30480343103408813, "learning_rate": 2.818435535165914e-05, "loss": 0.419, "num_input_tokens_seen": 4025231146, "step": 1036, "train_runtime": 28000.0569, "train_tokens_per_second": 143757.963 }, { "epoch": 0.3678609435970202, "grad_norm": 0.273701936006546, "learning_rate": 2.8163969842436986e-05, "loss": 0.4175, "num_input_tokens_seen": 4029165799, "step": 1037, "train_runtime": 28020.7854, "train_tokens_per_second": 143792.037 }, { "epoch": 0.3682156793189074, "grad_norm": 0.370406836271286, "learning_rate": 2.8143574150545803e-05, "loss": 0.4237, "num_input_tokens_seen": 4033046519, "step": 1038, "train_runtime": 28052.6543, "train_tokens_per_second": 143767.02 }, { "epoch": 0.36857041504079463, "grad_norm": 0.5239799618721008, "learning_rate": 2.812316830142452e-05, "loss": 0.4081, "num_input_tokens_seen": 4036885846, "step": 1039, "train_runtime": 28080.4904, "train_tokens_per_second": 143761.23 }, { "epoch": 0.3689251507626818, "grad_norm": 0.3102260231971741, "learning_rate": 2.8102752320524725e-05, "loss": 0.4178, "num_input_tokens_seen": 4040772283, "step": 1040, "train_runtime": 28105.6397, "train_tokens_per_second": 143770.87 }, { "epoch": 0.369279886484569, "grad_norm": 0.3554935157299042, "learning_rate": 2.8082326233310636e-05, "loss": 0.4151, "num_input_tokens_seen": 4044638067, "step": 1041, "train_runtime": 28127.9275, "train_tokens_per_second": 143794.386 }, { "epoch": 0.3696346222064562, "grad_norm": 0.2865562438964844, "learning_rate": 2.8061890065259104e-05, "loss": 0.4177, "num_input_tokens_seen": 4048595192, "step": 1042, "train_runtime": 28147.9989, "train_tokens_per_second": 143832.434 }, { "epoch": 0.36998935792834337, "grad_norm": 0.32438114285469055, "learning_rate": 2.804144384185952e-05, "loss": 0.404, "num_input_tokens_seen": 4052425399, "step": 1043, "train_runtime": 28176.2567, "train_tokens_per_second": 143824.123 }, { "epoch": 0.3703440936502306, "grad_norm": 0.3007911145687103, "learning_rate": 2.802098758861383e-05, "loss": 0.4186, "num_input_tokens_seen": 4056340027, "step": 1044, "train_runtime": 28196.3575, "train_tokens_per_second": 143860.427 }, { "epoch": 0.3706988293721178, "grad_norm": 0.4041476547718048, "learning_rate": 2.8000521331036496e-05, "loss": 0.4102, "num_input_tokens_seen": 4060208527, "step": 1045, "train_runtime": 28219.8552, "train_tokens_per_second": 143877.724 }, { "epoch": 0.37105356509400494, "grad_norm": 0.27056336402893066, "learning_rate": 2.7980045094654458e-05, "loss": 0.4123, "num_input_tokens_seen": 4064144205, "step": 1046, "train_runtime": 28245.8995, "train_tokens_per_second": 143884.397 }, { "epoch": 0.37140830081589216, "grad_norm": 0.3359624147415161, "learning_rate": 2.795955890500708e-05, "loss": 0.4084, "num_input_tokens_seen": 4068074106, "step": 1047, "train_runtime": 28265.2993, "train_tokens_per_second": 143924.678 }, { "epoch": 0.37176303653777937, "grad_norm": 0.33044904470443726, "learning_rate": 2.793906278764617e-05, "loss": 0.4149, "num_input_tokens_seen": 4071921298, "step": 1048, "train_runtime": 28304.0951, "train_tokens_per_second": 143863.327 }, { "epoch": 0.3721177722596665, "grad_norm": 0.2970147430896759, "learning_rate": 2.7918556768135908e-05, "loss": 0.4202, "num_input_tokens_seen": 4075822912, "step": 1049, "train_runtime": 28325.9925, "train_tokens_per_second": 143889.854 }, { "epoch": 0.37247250798155374, "grad_norm": 0.36984196305274963, "learning_rate": 2.7898040872052815e-05, "loss": 0.4176, "num_input_tokens_seen": 4079731723, "step": 1050, "train_runtime": 28356.7563, "train_tokens_per_second": 143871.594 }, { "epoch": 0.37282724370344095, "grad_norm": 0.2885070741176605, "learning_rate": 2.7877515124985745e-05, "loss": 0.4088, "num_input_tokens_seen": 4083671172, "step": 1051, "train_runtime": 28376.0812, "train_tokens_per_second": 143912.443 }, { "epoch": 0.3731819794253281, "grad_norm": 0.39557456970214844, "learning_rate": 2.7856979552535835e-05, "loss": 0.4197, "num_input_tokens_seen": 4087472830, "step": 1052, "train_runtime": 28406.8791, "train_tokens_per_second": 143890.246 }, { "epoch": 0.3735367151472153, "grad_norm": 0.2640579640865326, "learning_rate": 2.783643418031646e-05, "loss": 0.4072, "num_input_tokens_seen": 4091345156, "step": 1053, "train_runtime": 28423.6803, "train_tokens_per_second": 143941.429 }, { "epoch": 0.3738914508691025, "grad_norm": 0.3196854293346405, "learning_rate": 2.7815879033953246e-05, "loss": 0.4226, "num_input_tokens_seen": 4095235188, "step": 1054, "train_runtime": 28442.9863, "train_tokens_per_second": 143980.493 }, { "epoch": 0.37424618659098974, "grad_norm": 0.29866960644721985, "learning_rate": 2.7795314139083992e-05, "loss": 0.4064, "num_input_tokens_seen": 4099162959, "step": 1055, "train_runtime": 28470.8279, "train_tokens_per_second": 143977.652 }, { "epoch": 0.3746009223128769, "grad_norm": 0.30159908533096313, "learning_rate": 2.777473952135866e-05, "loss": 0.4258, "num_input_tokens_seen": 4103081907, "step": 1056, "train_runtime": 28499.7646, "train_tokens_per_second": 143968.975 }, { "epoch": 0.3749556580347641, "grad_norm": 0.2887648046016693, "learning_rate": 2.7754155206439337e-05, "loss": 0.4114, "num_input_tokens_seen": 4106983245, "step": 1057, "train_runtime": 28524.6115, "train_tokens_per_second": 143980.34 }, { "epoch": 0.3753103937566513, "grad_norm": 0.6041732430458069, "learning_rate": 2.773356122000021e-05, "loss": 0.4155, "num_input_tokens_seen": 4110881984, "step": 1058, "train_runtime": 28545.8249, "train_tokens_per_second": 144009.921 }, { "epoch": 0.3756651294785385, "grad_norm": 0.2794676125049591, "learning_rate": 2.771295758772753e-05, "loss": 0.4174, "num_input_tokens_seen": 4114697291, "step": 1059, "train_runtime": 28565.4258, "train_tokens_per_second": 144044.668 }, { "epoch": 0.3760198652004257, "grad_norm": 0.31825336813926697, "learning_rate": 2.7692344335319564e-05, "loss": 0.41, "num_input_tokens_seen": 4118618905, "step": 1060, "train_runtime": 28593.5112, "train_tokens_per_second": 144040.334 }, { "epoch": 0.3763746009223129, "grad_norm": 0.2842598557472229, "learning_rate": 2.7671721488486593e-05, "loss": 0.4187, "num_input_tokens_seen": 4122496193, "step": 1061, "train_runtime": 28623.6982, "train_tokens_per_second": 144023.884 }, { "epoch": 0.37672933664420005, "grad_norm": 0.3959364593029022, "learning_rate": 2.7651089072950875e-05, "loss": 0.4141, "num_input_tokens_seen": 4126381019, "step": 1062, "train_runtime": 28644.1411, "train_tokens_per_second": 144056.72 }, { "epoch": 0.37708407236608726, "grad_norm": 0.2813427448272705, "learning_rate": 2.763044711444657e-05, "loss": 0.4229, "num_input_tokens_seen": 4130207024, "step": 1063, "train_runtime": 28665.8702, "train_tokens_per_second": 144080.992 }, { "epoch": 0.3774388080879745, "grad_norm": 0.3007519841194153, "learning_rate": 2.7609795638719767e-05, "loss": 0.4239, "num_input_tokens_seen": 4134132414, "step": 1064, "train_runtime": 28694.6685, "train_tokens_per_second": 144073.19 }, { "epoch": 0.37779354380986163, "grad_norm": 0.32703569531440735, "learning_rate": 2.758913467152842e-05, "loss": 0.434, "num_input_tokens_seen": 4138069105, "step": 1065, "train_runtime": 28719.1076, "train_tokens_per_second": 144087.663 }, { "epoch": 0.37814827953174884, "grad_norm": 0.30362701416015625, "learning_rate": 2.7568464238642314e-05, "loss": 0.4179, "num_input_tokens_seen": 4141919606, "step": 1066, "train_runtime": 28739.2364, "train_tokens_per_second": 144120.726 }, { "epoch": 0.37850301525363605, "grad_norm": 0.2945757210254669, "learning_rate": 2.7547784365843047e-05, "loss": 0.411, "num_input_tokens_seen": 4145735824, "step": 1067, "train_runtime": 28759.4417, "train_tokens_per_second": 144152.166 }, { "epoch": 0.3788577509755232, "grad_norm": 0.44982609152793884, "learning_rate": 2.7527095078923998e-05, "loss": 0.4108, "num_input_tokens_seen": 4149624262, "step": 1068, "train_runtime": 28787.9071, "train_tokens_per_second": 144144.701 }, { "epoch": 0.3792124866974104, "grad_norm": 0.3371615409851074, "learning_rate": 2.7506396403690265e-05, "loss": 0.4229, "num_input_tokens_seen": 4153503048, "step": 1069, "train_runtime": 28819.8849, "train_tokens_per_second": 144119.349 }, { "epoch": 0.37956722241929763, "grad_norm": 0.28694552183151245, "learning_rate": 2.748568836595868e-05, "loss": 0.4057, "num_input_tokens_seen": 4157413368, "step": 1070, "train_runtime": 28843.0993, "train_tokens_per_second": 144138.926 }, { "epoch": 0.3799219581411848, "grad_norm": 0.38943207263946533, "learning_rate": 2.7464970991557747e-05, "loss": 0.411, "num_input_tokens_seen": 4161300509, "step": 1071, "train_runtime": 28871.8104, "train_tokens_per_second": 144130.224 }, { "epoch": 0.380276693863072, "grad_norm": 0.3626099228858948, "learning_rate": 2.74442443063276e-05, "loss": 0.4092, "num_input_tokens_seen": 4165158133, "step": 1072, "train_runtime": 28892.1633, "train_tokens_per_second": 144162.21 }, { "epoch": 0.3806314295849592, "grad_norm": 0.3522633910179138, "learning_rate": 2.742350833612e-05, "loss": 0.4093, "num_input_tokens_seen": 4169049542, "step": 1073, "train_runtime": 28914.8201, "train_tokens_per_second": 144183.831 }, { "epoch": 0.3809861653068464, "grad_norm": 0.3053697645664215, "learning_rate": 2.7402763106798295e-05, "loss": 0.4073, "num_input_tokens_seen": 4172907290, "step": 1074, "train_runtime": 28946.6379, "train_tokens_per_second": 144158.617 }, { "epoch": 0.3813409010287336, "grad_norm": 0.33161184191703796, "learning_rate": 2.7382008644237357e-05, "loss": 0.4134, "num_input_tokens_seen": 4176830055, "step": 1075, "train_runtime": 28979.2991, "train_tokens_per_second": 144131.507 }, { "epoch": 0.3816956367506208, "grad_norm": 0.2917381823062897, "learning_rate": 2.7361244974323604e-05, "loss": 0.4307, "num_input_tokens_seen": 4180669021, "step": 1076, "train_runtime": 29016.8303, "train_tokens_per_second": 144077.385 }, { "epoch": 0.382050372472508, "grad_norm": 0.29322105646133423, "learning_rate": 2.7340472122954923e-05, "loss": 0.416, "num_input_tokens_seen": 4184490425, "step": 1077, "train_runtime": 29047.2425, "train_tokens_per_second": 144058.095 }, { "epoch": 0.38240510819439516, "grad_norm": 0.2688227891921997, "learning_rate": 2.731969011604065e-05, "loss": 0.4214, "num_input_tokens_seen": 4188424276, "step": 1078, "train_runtime": 29073.2774, "train_tokens_per_second": 144064.401 }, { "epoch": 0.38275984391628237, "grad_norm": 0.30244678258895874, "learning_rate": 2.7298898979501546e-05, "loss": 0.4181, "num_input_tokens_seen": 4192307958, "step": 1079, "train_runtime": 29102.1342, "train_tokens_per_second": 144055.0 }, { "epoch": 0.3831145796381696, "grad_norm": 0.40815064311027527, "learning_rate": 2.7278098739269757e-05, "loss": 0.4211, "num_input_tokens_seen": 4196206457, "step": 1080, "train_runtime": 29142.6449, "train_tokens_per_second": 143988.525 }, { "epoch": 0.38346931536005674, "grad_norm": 0.28092995285987854, "learning_rate": 2.725728942128878e-05, "loss": 0.413, "num_input_tokens_seen": 4200055169, "step": 1081, "train_runtime": 29168.5067, "train_tokens_per_second": 143992.807 }, { "epoch": 0.38382405108194395, "grad_norm": 0.25556737184524536, "learning_rate": 2.7236471051513444e-05, "loss": 0.4107, "num_input_tokens_seen": 4204013395, "step": 1082, "train_runtime": 29189.468, "train_tokens_per_second": 144025.009 }, { "epoch": 0.38417878680383116, "grad_norm": 0.2533196210861206, "learning_rate": 2.7215643655909865e-05, "loss": 0.4214, "num_input_tokens_seen": 4207903695, "step": 1083, "train_runtime": 29218.969, "train_tokens_per_second": 144012.737 }, { "epoch": 0.3845335225257183, "grad_norm": 0.4049537479877472, "learning_rate": 2.7194807260455403e-05, "loss": 0.4171, "num_input_tokens_seen": 4211742054, "step": 1084, "train_runtime": 29248.2218, "train_tokens_per_second": 143999.935 }, { "epoch": 0.38488825824760553, "grad_norm": 0.27035924792289734, "learning_rate": 2.7173961891138665e-05, "loss": 0.4115, "num_input_tokens_seen": 4215701485, "step": 1085, "train_runtime": 29274.0162, "train_tokens_per_second": 144008.306 }, { "epoch": 0.38524299396949274, "grad_norm": 0.28107696771621704, "learning_rate": 2.7153107573959444e-05, "loss": 0.4168, "num_input_tokens_seen": 4219592885, "step": 1086, "train_runtime": 29294.4684, "train_tokens_per_second": 144040.602 }, { "epoch": 0.3855977296913799, "grad_norm": 0.31568124890327454, "learning_rate": 2.713224433492868e-05, "loss": 0.4145, "num_input_tokens_seen": 4223440691, "step": 1087, "train_runtime": 29318.793, "train_tokens_per_second": 144052.339 }, { "epoch": 0.3859524654132671, "grad_norm": 0.26365429162979126, "learning_rate": 2.711137220006845e-05, "loss": 0.4099, "num_input_tokens_seen": 4227284111, "step": 1088, "train_runtime": 29338.7729, "train_tokens_per_second": 144085.239 }, { "epoch": 0.3863072011351543, "grad_norm": 0.28616955876350403, "learning_rate": 2.7090491195411934e-05, "loss": 0.431, "num_input_tokens_seen": 4231116982, "step": 1089, "train_runtime": 29368.5528, "train_tokens_per_second": 144069.645 }, { "epoch": 0.3866619368570415, "grad_norm": 0.3336220979690552, "learning_rate": 2.706960134700337e-05, "loss": 0.4194, "num_input_tokens_seen": 4234955627, "step": 1090, "train_runtime": 29390.0272, "train_tokens_per_second": 144094.988 }, { "epoch": 0.3870166725789287, "grad_norm": 0.328285276889801, "learning_rate": 2.704870268089802e-05, "loss": 0.4185, "num_input_tokens_seen": 4238859218, "step": 1091, "train_runtime": 29415.5608, "train_tokens_per_second": 144102.614 }, { "epoch": 0.3873714083008159, "grad_norm": 0.3175509572029114, "learning_rate": 2.702779522316214e-05, "loss": 0.4183, "num_input_tokens_seen": 4242748050, "step": 1092, "train_runtime": 29441.8378, "train_tokens_per_second": 144106.088 }, { "epoch": 0.3877261440227031, "grad_norm": 0.2880552411079407, "learning_rate": 2.7006878999872975e-05, "loss": 0.409, "num_input_tokens_seen": 4246646472, "step": 1093, "train_runtime": 29467.3471, "train_tokens_per_second": 144113.634 }, { "epoch": 0.38808087974459027, "grad_norm": 0.38579973578453064, "learning_rate": 2.698595403711868e-05, "loss": 0.4133, "num_input_tokens_seen": 4250608032, "step": 1094, "train_runtime": 29489.9401, "train_tokens_per_second": 144137.561 }, { "epoch": 0.3884356154664775, "grad_norm": 0.2738286256790161, "learning_rate": 2.6965020360998325e-05, "loss": 0.4082, "num_input_tokens_seen": 4254427808, "step": 1095, "train_runtime": 29510.0028, "train_tokens_per_second": 144169.007 }, { "epoch": 0.3887903511883647, "grad_norm": 0.2520446181297302, "learning_rate": 2.694407799762184e-05, "loss": 0.3995, "num_input_tokens_seen": 4258329589, "step": 1096, "train_runtime": 29533.6968, "train_tokens_per_second": 144185.458 }, { "epoch": 0.38914508691025185, "grad_norm": 0.3114929795265198, "learning_rate": 2.692312697310999e-05, "loss": 0.4017, "num_input_tokens_seen": 4262172190, "step": 1097, "train_runtime": 29561.5714, "train_tokens_per_second": 144179.487 }, { "epoch": 0.38949982263213906, "grad_norm": 0.30174484848976135, "learning_rate": 2.6902167313594346e-05, "loss": 0.4093, "num_input_tokens_seen": 4266072360, "step": 1098, "train_runtime": 29587.761, "train_tokens_per_second": 144183.683 }, { "epoch": 0.38985455835402627, "grad_norm": 0.3968474566936493, "learning_rate": 2.6881199045217248e-05, "loss": 0.4157, "num_input_tokens_seen": 4269972249, "step": 1099, "train_runtime": 29606.6321, "train_tokens_per_second": 144223.505 }, { "epoch": 0.3902092940759134, "grad_norm": 0.25231820344924927, "learning_rate": 2.686022219413177e-05, "loss": 0.4317, "num_input_tokens_seen": 4273889446, "step": 1100, "train_runtime": 29642.6902, "train_tokens_per_second": 144180.215 }, { "epoch": 0.39056402979780064, "grad_norm": 0.3623861074447632, "learning_rate": 2.68392367865017e-05, "loss": 0.4101, "num_input_tokens_seen": 4277734739, "step": 1101, "train_runtime": 29673.7068, "train_tokens_per_second": 144159.096 }, { "epoch": 0.39091876551968785, "grad_norm": 0.33853572607040405, "learning_rate": 2.6818242848501487e-05, "loss": 0.4041, "num_input_tokens_seen": 4281651230, "step": 1102, "train_runtime": 29704.9866, "train_tokens_per_second": 144139.14 }, { "epoch": 0.391273501241575, "grad_norm": 0.41319769620895386, "learning_rate": 2.679724040631623e-05, "loss": 0.4189, "num_input_tokens_seen": 4285554261, "step": 1103, "train_runtime": 29732.8648, "train_tokens_per_second": 144135.262 }, { "epoch": 0.3916282369634622, "grad_norm": 0.46715494990348816, "learning_rate": 2.677622948614163e-05, "loss": 0.4198, "num_input_tokens_seen": 4289497618, "step": 1104, "train_runtime": 29777.3314, "train_tokens_per_second": 144052.453 }, { "epoch": 0.39198297268534943, "grad_norm": 0.27947694063186646, "learning_rate": 2.675521011418397e-05, "loss": 0.4283, "num_input_tokens_seen": 4293362970, "step": 1105, "train_runtime": 29804.3237, "train_tokens_per_second": 144051.682 }, { "epoch": 0.3923377084072366, "grad_norm": 0.317179411649704, "learning_rate": 2.673418231666005e-05, "loss": 0.4198, "num_input_tokens_seen": 4297234327, "step": 1106, "train_runtime": 29827.5181, "train_tokens_per_second": 144069.457 }, { "epoch": 0.3926924441291238, "grad_norm": 0.23747405409812927, "learning_rate": 2.671314611979721e-05, "loss": 0.4024, "num_input_tokens_seen": 4301111191, "step": 1107, "train_runtime": 29854.4758, "train_tokens_per_second": 144069.225 }, { "epoch": 0.393047179851011, "grad_norm": 0.2455449253320694, "learning_rate": 2.669210154983325e-05, "loss": 0.4292, "num_input_tokens_seen": 4304956095, "step": 1108, "train_runtime": 29880.0961, "train_tokens_per_second": 144074.373 }, { "epoch": 0.39340191557289816, "grad_norm": 0.27323436737060547, "learning_rate": 2.6671048633016416e-05, "loss": 0.4063, "num_input_tokens_seen": 4308843970, "step": 1109, "train_runtime": 29913.675, "train_tokens_per_second": 144042.615 }, { "epoch": 0.3937566512947854, "grad_norm": 0.2642803490161896, "learning_rate": 2.6649987395605364e-05, "loss": 0.4064, "num_input_tokens_seen": 4312669773, "step": 1110, "train_runtime": 29946.8685, "train_tokens_per_second": 144010.709 }, { "epoch": 0.3941113870166726, "grad_norm": 0.3113062381744385, "learning_rate": 2.6628917863869128e-05, "loss": 0.418, "num_input_tokens_seen": 4316520553, "step": 1111, "train_runtime": 29971.6298, "train_tokens_per_second": 144020.214 }, { "epoch": 0.3944661227385598, "grad_norm": 0.2169419527053833, "learning_rate": 2.6607840064087088e-05, "loss": 0.4013, "num_input_tokens_seen": 4320433727, "step": 1112, "train_runtime": 30007.295, "train_tokens_per_second": 143979.447 }, { "epoch": 0.39482085846044696, "grad_norm": 0.328036367893219, "learning_rate": 2.658675402254894e-05, "loss": 0.4096, "num_input_tokens_seen": 4324268774, "step": 1113, "train_runtime": 30035.2949, "train_tokens_per_second": 143972.909 }, { "epoch": 0.39517559418233417, "grad_norm": 0.33042603731155396, "learning_rate": 2.6565659765554663e-05, "loss": 0.4125, "num_input_tokens_seen": 4328153666, "step": 1114, "train_runtime": 30061.8282, "train_tokens_per_second": 143975.065 }, { "epoch": 0.3955303299042214, "grad_norm": 0.34170234203338623, "learning_rate": 2.654455731941446e-05, "loss": 0.3963, "num_input_tokens_seen": 4331948854, "step": 1115, "train_runtime": 30081.3057, "train_tokens_per_second": 144008.006 }, { "epoch": 0.39588506562610853, "grad_norm": 0.32933542132377625, "learning_rate": 2.652344671044877e-05, "loss": 0.4111, "num_input_tokens_seen": 4335895477, "step": 1116, "train_runtime": 30115.0001, "train_tokens_per_second": 143977.933 }, { "epoch": 0.39623980134799575, "grad_norm": 0.2862233519554138, "learning_rate": 2.6502327964988217e-05, "loss": 0.4183, "num_input_tokens_seen": 4339783041, "step": 1117, "train_runtime": 30134.2777, "train_tokens_per_second": 144014.835 }, { "epoch": 0.39659453706988296, "grad_norm": 0.3204945921897888, "learning_rate": 2.6481201109373555e-05, "loss": 0.4132, "num_input_tokens_seen": 4343668380, "step": 1118, "train_runtime": 30167.5835, "train_tokens_per_second": 143984.631 }, { "epoch": 0.3969492727917701, "grad_norm": 0.5568520426750183, "learning_rate": 2.6460066169955668e-05, "loss": 0.4147, "num_input_tokens_seen": 4347578357, "step": 1119, "train_runtime": 30199.4651, "train_tokens_per_second": 143962.098 }, { "epoch": 0.3973040085136573, "grad_norm": 0.32541170716285706, "learning_rate": 2.6438923173095504e-05, "loss": 0.4184, "num_input_tokens_seen": 4351399268, "step": 1120, "train_runtime": 30226.6092, "train_tokens_per_second": 143959.226 }, { "epoch": 0.39765874423554454, "grad_norm": 0.3182770311832428, "learning_rate": 2.6417772145164096e-05, "loss": 0.4088, "num_input_tokens_seen": 4355347729, "step": 1121, "train_runtime": 30255.4895, "train_tokens_per_second": 143952.314 }, { "epoch": 0.3980134799574317, "grad_norm": 0.293206125497818, "learning_rate": 2.6396613112542455e-05, "loss": 0.4102, "num_input_tokens_seen": 4359168239, "step": 1122, "train_runtime": 30279.0322, "train_tokens_per_second": 143966.565 }, { "epoch": 0.3983682156793189, "grad_norm": 0.38074013590812683, "learning_rate": 2.63754461016216e-05, "loss": 0.4154, "num_input_tokens_seen": 4363027793, "step": 1123, "train_runtime": 30305.6929, "train_tokens_per_second": 143967.267 }, { "epoch": 0.3987229514012061, "grad_norm": 0.29285693168640137, "learning_rate": 2.6354271138802493e-05, "loss": 0.422, "num_input_tokens_seen": 4366963861, "step": 1124, "train_runtime": 30326.766, "train_tokens_per_second": 143997.018 }, { "epoch": 0.39907768712309327, "grad_norm": 0.2649383246898651, "learning_rate": 2.6333088250496012e-05, "loss": 0.4161, "num_input_tokens_seen": 4370830914, "step": 1125, "train_runtime": 30355.5315, "train_tokens_per_second": 143987.955 }, { "epoch": 0.3994324228449805, "grad_norm": 0.28976091742515564, "learning_rate": 2.631189746312293e-05, "loss": 0.4128, "num_input_tokens_seen": 4374729083, "step": 1126, "train_runtime": 30374.1008, "train_tokens_per_second": 144028.267 }, { "epoch": 0.3997871585668677, "grad_norm": 0.31683388352394104, "learning_rate": 2.6290698803113862e-05, "loss": 0.4069, "num_input_tokens_seen": 4378586762, "step": 1127, "train_runtime": 30393.2878, "train_tokens_per_second": 144064.268 }, { "epoch": 0.40014189428875485, "grad_norm": 0.3026888966560364, "learning_rate": 2.626949229690924e-05, "loss": 0.4113, "num_input_tokens_seen": 4382508824, "step": 1128, "train_runtime": 30425.6599, "train_tokens_per_second": 144039.894 }, { "epoch": 0.40049663001064206, "grad_norm": 0.3745909035205841, "learning_rate": 2.6248277970959296e-05, "loss": 0.4118, "num_input_tokens_seen": 4386289515, "step": 1129, "train_runtime": 30445.6325, "train_tokens_per_second": 144069.581 }, { "epoch": 0.4008513657325293, "grad_norm": 0.3526168763637543, "learning_rate": 2.6227055851724014e-05, "loss": 0.4094, "num_input_tokens_seen": 4390145455, "step": 1130, "train_runtime": 30466.3394, "train_tokens_per_second": 144098.226 }, { "epoch": 0.4012061014544165, "grad_norm": 0.36967024207115173, "learning_rate": 2.6205825965673075e-05, "loss": 0.4197, "num_input_tokens_seen": 4394036448, "step": 1131, "train_runtime": 30497.3161, "train_tokens_per_second": 144079.447 }, { "epoch": 0.40156083717630364, "grad_norm": 0.338208943605423, "learning_rate": 2.6184588339285878e-05, "loss": 0.4112, "num_input_tokens_seen": 4397878502, "step": 1132, "train_runtime": 30522.4911, "train_tokens_per_second": 144086.486 }, { "epoch": 0.40191557289819085, "grad_norm": 0.3080781400203705, "learning_rate": 2.6163342999051457e-05, "loss": 0.4148, "num_input_tokens_seen": 4401762904, "step": 1133, "train_runtime": 30554.936, "train_tokens_per_second": 144060.616 }, { "epoch": 0.40227030862007807, "grad_norm": 0.3017362952232361, "learning_rate": 2.6142089971468472e-05, "loss": 0.4078, "num_input_tokens_seen": 4405625623, "step": 1134, "train_runtime": 30580.7653, "train_tokens_per_second": 144065.251 }, { "epoch": 0.4026250443419652, "grad_norm": 0.46041059494018555, "learning_rate": 2.6120829283045172e-05, "loss": 0.4169, "num_input_tokens_seen": 4409498289, "step": 1135, "train_runtime": 30599.2548, "train_tokens_per_second": 144104.761 }, { "epoch": 0.40297978006385243, "grad_norm": 0.33199429512023926, "learning_rate": 2.6099560960299366e-05, "loss": 0.4073, "num_input_tokens_seen": 4413379088, "step": 1136, "train_runtime": 30627.3372, "train_tokens_per_second": 144099.34 }, { "epoch": 0.40333451578573964, "grad_norm": 0.6457142233848572, "learning_rate": 2.6078285029758378e-05, "loss": 0.4205, "num_input_tokens_seen": 4417260839, "step": 1137, "train_runtime": 30649.6267, "train_tokens_per_second": 144121.196 }, { "epoch": 0.4036892515076268, "grad_norm": 0.4084729254245758, "learning_rate": 2.6057001517959015e-05, "loss": 0.4146, "num_input_tokens_seen": 4421155472, "step": 1138, "train_runtime": 30679.8722, "train_tokens_per_second": 144106.059 }, { "epoch": 0.404043987229514, "grad_norm": 0.4934241771697998, "learning_rate": 2.603571045144756e-05, "loss": 0.4178, "num_input_tokens_seen": 4424990578, "step": 1139, "train_runtime": 30711.1999, "train_tokens_per_second": 144083.936 }, { "epoch": 0.4043987229514012, "grad_norm": 0.31345292925834656, "learning_rate": 2.6014411856779704e-05, "loss": 0.4151, "num_input_tokens_seen": 4428949837, "step": 1140, "train_runtime": 30742.3925, "train_tokens_per_second": 144066.531 }, { "epoch": 0.4047534586732884, "grad_norm": 0.3064870536327362, "learning_rate": 2.599310576052053e-05, "loss": 0.4086, "num_input_tokens_seen": 4432812161, "step": 1141, "train_runtime": 30762.2555, "train_tokens_per_second": 144099.062 }, { "epoch": 0.4051081943951756, "grad_norm": 0.30950385332107544, "learning_rate": 2.5971792189244473e-05, "loss": 0.4157, "num_input_tokens_seen": 4436714065, "step": 1142, "train_runtime": 30783.2478, "train_tokens_per_second": 144127.549 }, { "epoch": 0.4054629301170628, "grad_norm": 0.27737322449684143, "learning_rate": 2.5950471169535304e-05, "loss": 0.4136, "num_input_tokens_seen": 4440630454, "step": 1143, "train_runtime": 30808.2567, "train_tokens_per_second": 144137.674 }, { "epoch": 0.40581766583894996, "grad_norm": 0.2900916039943695, "learning_rate": 2.5929142727986065e-05, "loss": 0.4014, "num_input_tokens_seen": 4444444316, "step": 1144, "train_runtime": 30827.523, "train_tokens_per_second": 144171.308 }, { "epoch": 0.40617240156083717, "grad_norm": 0.3379303216934204, "learning_rate": 2.5907806891199077e-05, "loss": 0.4055, "num_input_tokens_seen": 4448363071, "step": 1145, "train_runtime": 30863.0123, "train_tokens_per_second": 144132.499 }, { "epoch": 0.4065271372827244, "grad_norm": 0.394167959690094, "learning_rate": 2.5886463685785873e-05, "loss": 0.4213, "num_input_tokens_seen": 4452147222, "step": 1146, "train_runtime": 30889.1845, "train_tokens_per_second": 144132.883 }, { "epoch": 0.40688187300461154, "grad_norm": 0.2633504569530487, "learning_rate": 2.5865113138367172e-05, "loss": 0.415, "num_input_tokens_seen": 4456023232, "step": 1147, "train_runtime": 30916.455, "train_tokens_per_second": 144131.118 }, { "epoch": 0.40723660872649875, "grad_norm": 0.2982477843761444, "learning_rate": 2.584375527557286e-05, "loss": 0.4068, "num_input_tokens_seen": 4459903949, "step": 1148, "train_runtime": 30944.3047, "train_tokens_per_second": 144126.811 }, { "epoch": 0.40759134444838596, "grad_norm": 0.33363062143325806, "learning_rate": 2.5822390124041956e-05, "loss": 0.4025, "num_input_tokens_seen": 4463844456, "step": 1149, "train_runtime": 30976.3298, "train_tokens_per_second": 144105.015 }, { "epoch": 0.4079460801702732, "grad_norm": 0.263336718082428, "learning_rate": 2.5801017710422537e-05, "loss": 0.402, "num_input_tokens_seen": 4467747853, "step": 1150, "train_runtime": 30998.5809, "train_tokens_per_second": 144127.496 }, { "epoch": 0.40830081589216033, "grad_norm": 0.5514596700668335, "learning_rate": 2.577963806137177e-05, "loss": 0.4154, "num_input_tokens_seen": 4471613071, "step": 1151, "train_runtime": 31017.3663, "train_tokens_per_second": 144164.821 }, { "epoch": 0.40865555161404754, "grad_norm": 0.28670480847358704, "learning_rate": 2.5758251203555834e-05, "loss": 0.4122, "num_input_tokens_seen": 4475528196, "step": 1152, "train_runtime": 31044.1376, "train_tokens_per_second": 144166.614 }, { "epoch": 0.40901028733593475, "grad_norm": 0.27259179949760437, "learning_rate": 2.57368571636499e-05, "loss": 0.4033, "num_input_tokens_seen": 4479438436, "step": 1153, "train_runtime": 31071.8285, "train_tokens_per_second": 144163.979 }, { "epoch": 0.4093650230578219, "grad_norm": 0.281355082988739, "learning_rate": 2.5715455968338092e-05, "loss": 0.4117, "num_input_tokens_seen": 4483308846, "step": 1154, "train_runtime": 31091.2966, "train_tokens_per_second": 144198.195 }, { "epoch": 0.4097197587797091, "grad_norm": 0.29880133271217346, "learning_rate": 2.5694047644313474e-05, "loss": 0.413, "num_input_tokens_seen": 4487141892, "step": 1155, "train_runtime": 31119.5146, "train_tokens_per_second": 144190.613 }, { "epoch": 0.41007449450159633, "grad_norm": 0.30011364817619324, "learning_rate": 2.567263221827798e-05, "loss": 0.4013, "num_input_tokens_seen": 4491105721, "step": 1156, "train_runtime": 31145.5169, "train_tokens_per_second": 144197.501 }, { "epoch": 0.4104292302234835, "grad_norm": 0.2331283539533615, "learning_rate": 2.5651209716942426e-05, "loss": 0.405, "num_input_tokens_seen": 4494944745, "step": 1157, "train_runtime": 31165.9356, "train_tokens_per_second": 144226.209 }, { "epoch": 0.4107839659453707, "grad_norm": 0.29876378178596497, "learning_rate": 2.5629780167026432e-05, "loss": 0.4241, "num_input_tokens_seen": 4498785552, "step": 1158, "train_runtime": 31186.693, "train_tokens_per_second": 144253.37 }, { "epoch": 0.4111387016672579, "grad_norm": 0.49370697140693665, "learning_rate": 2.560834359525842e-05, "loss": 0.4099, "num_input_tokens_seen": 4502733189, "step": 1159, "train_runtime": 31213.0352, "train_tokens_per_second": 144258.101 }, { "epoch": 0.41149343738914507, "grad_norm": 0.31477391719818115, "learning_rate": 2.558690002837557e-05, "loss": 0.4168, "num_input_tokens_seen": 4506554075, "step": 1160, "train_runtime": 31237.6044, "train_tokens_per_second": 144266.955 }, { "epoch": 0.4118481731110323, "grad_norm": 0.31607845425605774, "learning_rate": 2.5565449493123783e-05, "loss": 0.4084, "num_input_tokens_seen": 4510475588, "step": 1161, "train_runtime": 31271.5928, "train_tokens_per_second": 144235.557 }, { "epoch": 0.4122029088329195, "grad_norm": 0.27520954608917236, "learning_rate": 2.5543992016257652e-05, "loss": 0.4131, "num_input_tokens_seen": 4514402651, "step": 1162, "train_runtime": 31300.8344, "train_tokens_per_second": 144226.272 }, { "epoch": 0.41255764455480665, "grad_norm": 0.2806287407875061, "learning_rate": 2.5522527624540434e-05, "loss": 0.4129, "num_input_tokens_seen": 4518308494, "step": 1163, "train_runtime": 31337.3529, "train_tokens_per_second": 144182.839 }, { "epoch": 0.41291238027669386, "grad_norm": 0.24862787127494812, "learning_rate": 2.5501056344743997e-05, "loss": 0.4082, "num_input_tokens_seen": 4522174414, "step": 1164, "train_runtime": 31366.4182, "train_tokens_per_second": 144172.484 }, { "epoch": 0.41326711599858107, "grad_norm": 0.27473288774490356, "learning_rate": 2.5479578203648824e-05, "loss": 0.4083, "num_input_tokens_seen": 4526029725, "step": 1165, "train_runtime": 31383.2797, "train_tokens_per_second": 144217.869 }, { "epoch": 0.4136218517204682, "grad_norm": 0.29409557580947876, "learning_rate": 2.5458093228043926e-05, "loss": 0.4032, "num_input_tokens_seen": 4529945936, "step": 1166, "train_runtime": 31403.3694, "train_tokens_per_second": 144250.315 }, { "epoch": 0.41397658744235544, "grad_norm": 0.317661315202713, "learning_rate": 2.5436601444726862e-05, "loss": 0.4127, "num_input_tokens_seen": 4533773889, "step": 1167, "train_runtime": 31423.9194, "train_tokens_per_second": 144277.798 }, { "epoch": 0.41433132316424265, "grad_norm": 0.32830700278282166, "learning_rate": 2.541510288050367e-05, "loss": 0.4197, "num_input_tokens_seen": 4537671685, "step": 1168, "train_runtime": 31448.9981, "train_tokens_per_second": 144286.685 }, { "epoch": 0.41468605888612986, "grad_norm": 0.33016014099121094, "learning_rate": 2.539359756218885e-05, "loss": 0.412, "num_input_tokens_seen": 4541575791, "step": 1169, "train_runtime": 31481.187, "train_tokens_per_second": 144263.169 }, { "epoch": 0.415040794608017, "grad_norm": 0.26012682914733887, "learning_rate": 2.5372085516605333e-05, "loss": 0.4154, "num_input_tokens_seen": 4545380579, "step": 1170, "train_runtime": 31507.5879, "train_tokens_per_second": 144263.045 }, { "epoch": 0.41539553032990423, "grad_norm": 0.28809893131256104, "learning_rate": 2.5350566770584423e-05, "loss": 0.4088, "num_input_tokens_seen": 4549325238, "step": 1171, "train_runtime": 31526.3609, "train_tokens_per_second": 144302.264 }, { "epoch": 0.41575026605179144, "grad_norm": 0.31024470925331116, "learning_rate": 2.5329041350965794e-05, "loss": 0.4145, "num_input_tokens_seen": 4553222204, "step": 1172, "train_runtime": 31552.3726, "train_tokens_per_second": 144306.809 }, { "epoch": 0.4161050017736786, "grad_norm": 0.27126652002334595, "learning_rate": 2.5307509284597442e-05, "loss": 0.4091, "num_input_tokens_seen": 4557107041, "step": 1173, "train_runtime": 31573.9246, "train_tokens_per_second": 144331.346 }, { "epoch": 0.4164597374955658, "grad_norm": 0.5229272246360779, "learning_rate": 2.5285970598335654e-05, "loss": 0.4001, "num_input_tokens_seen": 4560964814, "step": 1174, "train_runtime": 31609.0623, "train_tokens_per_second": 144292.949 }, { "epoch": 0.416814473217453, "grad_norm": 0.2734823226928711, "learning_rate": 2.5264425319044968e-05, "loss": 0.4221, "num_input_tokens_seen": 4564879109, "step": 1175, "train_runtime": 31628.6902, "train_tokens_per_second": 144327.162 }, { "epoch": 0.4171692089393402, "grad_norm": 0.30360567569732666, "learning_rate": 2.5242873473598165e-05, "loss": 0.3972, "num_input_tokens_seen": 4568721362, "step": 1176, "train_runtime": 31647.9767, "train_tokens_per_second": 144360.614 }, { "epoch": 0.4175239446612274, "grad_norm": 0.24792782962322235, "learning_rate": 2.5221315088876175e-05, "loss": 0.4149, "num_input_tokens_seen": 4572652644, "step": 1177, "train_runtime": 31676.4655, "train_tokens_per_second": 144354.888 }, { "epoch": 0.4178786803831146, "grad_norm": 0.3008357584476471, "learning_rate": 2.519975019176813e-05, "loss": 0.4051, "num_input_tokens_seen": 4576546213, "step": 1178, "train_runtime": 31703.5807, "train_tokens_per_second": 144354.237 }, { "epoch": 0.41823341610500175, "grad_norm": 0.34042495489120483, "learning_rate": 2.5178178809171258e-05, "loss": 0.4208, "num_input_tokens_seen": 4580457268, "step": 1179, "train_runtime": 31736.4015, "train_tokens_per_second": 144328.186 }, { "epoch": 0.41858815182688897, "grad_norm": 0.3081004321575165, "learning_rate": 2.515660096799088e-05, "loss": 0.4119, "num_input_tokens_seen": 4584336600, "step": 1180, "train_runtime": 31763.3528, "train_tokens_per_second": 144327.856 }, { "epoch": 0.4189428875487762, "grad_norm": 0.28521934151649475, "learning_rate": 2.513501669514037e-05, "loss": 0.4222, "num_input_tokens_seen": 4588176175, "step": 1181, "train_runtime": 31789.904, "train_tokens_per_second": 144328.091 }, { "epoch": 0.41929762327066333, "grad_norm": 0.3037222623825073, "learning_rate": 2.511342601754114e-05, "loss": 0.4168, "num_input_tokens_seen": 4592095794, "step": 1182, "train_runtime": 31816.1843, "train_tokens_per_second": 144332.072 }, { "epoch": 0.41965235899255054, "grad_norm": 0.3653005361557007, "learning_rate": 2.5091828962122582e-05, "loss": 0.4048, "num_input_tokens_seen": 4595957900, "step": 1183, "train_runtime": 31842.7518, "train_tokens_per_second": 144332.937 }, { "epoch": 0.42000709471443776, "grad_norm": 0.29538774490356445, "learning_rate": 2.507022555582203e-05, "loss": 0.4224, "num_input_tokens_seen": 4599877761, "step": 1184, "train_runtime": 31871.7651, "train_tokens_per_second": 144324.538 }, { "epoch": 0.4203618304363249, "grad_norm": 0.24752765893936157, "learning_rate": 2.5048615825584755e-05, "loss": 0.4128, "num_input_tokens_seen": 4603811274, "step": 1185, "train_runtime": 31892.4084, "train_tokens_per_second": 144354.456 }, { "epoch": 0.4207165661582121, "grad_norm": 0.27755990624427795, "learning_rate": 2.5026999798363908e-05, "loss": 0.4241, "num_input_tokens_seen": 4607714146, "step": 1186, "train_runtime": 31918.8854, "train_tokens_per_second": 144356.988 }, { "epoch": 0.42107130188009934, "grad_norm": 0.3174995183944702, "learning_rate": 2.5005377501120497e-05, "loss": 0.4002, "num_input_tokens_seen": 4611560335, "step": 1187, "train_runtime": 31951.8184, "train_tokens_per_second": 144328.572 }, { "epoch": 0.42142603760198655, "grad_norm": 0.2634425163269043, "learning_rate": 2.4983748960823347e-05, "loss": 0.4014, "num_input_tokens_seen": 4615436496, "step": 1188, "train_runtime": 31976.6483, "train_tokens_per_second": 144337.72 }, { "epoch": 0.4217807733238737, "grad_norm": 0.2959415912628174, "learning_rate": 2.496211420444908e-05, "loss": 0.4136, "num_input_tokens_seen": 4619351556, "step": 1189, "train_runtime": 32004.3639, "train_tokens_per_second": 144335.053 }, { "epoch": 0.4221355090457609, "grad_norm": 0.2991218566894531, "learning_rate": 2.494047325898205e-05, "loss": 0.4161, "num_input_tokens_seen": 4623202613, "step": 1190, "train_runtime": 32047.5489, "train_tokens_per_second": 144260.724 }, { "epoch": 0.4224902447676481, "grad_norm": 0.3083164095878601, "learning_rate": 2.491882615141436e-05, "loss": 0.4074, "num_input_tokens_seen": 4627073701, "step": 1191, "train_runtime": 32069.9294, "train_tokens_per_second": 144280.757 }, { "epoch": 0.4228449804895353, "grad_norm": 0.3677341639995575, "learning_rate": 2.4897172908745782e-05, "loss": 0.4235, "num_input_tokens_seen": 4630898451, "step": 1192, "train_runtime": 32104.6908, "train_tokens_per_second": 144243.671 }, { "epoch": 0.4231997162114225, "grad_norm": 0.27863743901252747, "learning_rate": 2.4875513557983725e-05, "loss": 0.4156, "num_input_tokens_seen": 4634812662, "step": 1193, "train_runtime": 32137.3541, "train_tokens_per_second": 144218.863 }, { "epoch": 0.4235544519333097, "grad_norm": 0.4812796711921692, "learning_rate": 2.4853848126143244e-05, "loss": 0.4084, "num_input_tokens_seen": 4638691681, "step": 1194, "train_runtime": 32159.4411, "train_tokens_per_second": 144240.432 }, { "epoch": 0.42390918765519686, "grad_norm": 0.3099289536476135, "learning_rate": 2.4832176640246974e-05, "loss": 0.4147, "num_input_tokens_seen": 4642571609, "step": 1195, "train_runtime": 32188.5388, "train_tokens_per_second": 144230.58 }, { "epoch": 0.4242639233770841, "grad_norm": 0.2816953957080841, "learning_rate": 2.4810499127325077e-05, "loss": 0.4238, "num_input_tokens_seen": 4646503039, "step": 1196, "train_runtime": 32221.6333, "train_tokens_per_second": 144204.454 }, { "epoch": 0.4246186590989713, "grad_norm": 0.29463666677474976, "learning_rate": 2.4788815614415257e-05, "loss": 0.3978, "num_input_tokens_seen": 4650323478, "step": 1197, "train_runtime": 32251.2313, "train_tokens_per_second": 144190.571 }, { "epoch": 0.42497339482085844, "grad_norm": 0.3588016927242279, "learning_rate": 2.47671261285627e-05, "loss": 0.4129, "num_input_tokens_seen": 4654195634, "step": 1198, "train_runtime": 32270.9266, "train_tokens_per_second": 144222.559 }, { "epoch": 0.42532813054274565, "grad_norm": 0.265558123588562, "learning_rate": 2.4745430696820034e-05, "loss": 0.4053, "num_input_tokens_seen": 4658044932, "step": 1199, "train_runtime": 32294.8083, "train_tokens_per_second": 144235.101 }, { "epoch": 0.42568286626463286, "grad_norm": 0.4499177038669586, "learning_rate": 2.4723729346247297e-05, "loss": 0.4007, "num_input_tokens_seen": 4661929727, "step": 1200, "train_runtime": 32326.657, "train_tokens_per_second": 144213.171 }, { "epoch": 0.42603760198652, "grad_norm": 0.29890599846839905, "learning_rate": 2.4702022103911927e-05, "loss": 0.4107, "num_input_tokens_seen": 4665806332, "step": 1201, "train_runtime": 32453.926, "train_tokens_per_second": 143767.085 }, { "epoch": 0.42639233770840723, "grad_norm": 0.2622828781604767, "learning_rate": 2.4680308996888695e-05, "loss": 0.4084, "num_input_tokens_seen": 4669652782, "step": 1202, "train_runtime": 32480.9973, "train_tokens_per_second": 143765.684 }, { "epoch": 0.42674707343029444, "grad_norm": 0.2935852110385895, "learning_rate": 2.4658590052259697e-05, "loss": 0.4061, "num_input_tokens_seen": 4673547303, "step": 1203, "train_runtime": 32505.975, "train_tokens_per_second": 143775.023 }, { "epoch": 0.4271018091521816, "grad_norm": 0.32842063903808594, "learning_rate": 2.4636865297114308e-05, "loss": 0.4041, "num_input_tokens_seen": 4677434565, "step": 1204, "train_runtime": 32531.6493, "train_tokens_per_second": 143781.046 }, { "epoch": 0.4274565448740688, "grad_norm": 0.3300853371620178, "learning_rate": 2.461513475854914e-05, "loss": 0.4252, "num_input_tokens_seen": 4681327437, "step": 1205, "train_runtime": 32560.6223, "train_tokens_per_second": 143772.665 }, { "epoch": 0.427811280595956, "grad_norm": 0.29357653856277466, "learning_rate": 2.4593398463668036e-05, "loss": 0.4083, "num_input_tokens_seen": 4685197156, "step": 1206, "train_runtime": 32588.1474, "train_tokens_per_second": 143769.975 }, { "epoch": 0.42816601631784323, "grad_norm": 0.32908618450164795, "learning_rate": 2.4571656439581995e-05, "loss": 0.4187, "num_input_tokens_seen": 4689135238, "step": 1207, "train_runtime": 32614.3296, "train_tokens_per_second": 143775.307 }, { "epoch": 0.4285207520397304, "grad_norm": 0.30186375975608826, "learning_rate": 2.4549908713409196e-05, "loss": 0.4084, "num_input_tokens_seen": 4692971596, "step": 1208, "train_runtime": 32646.7608, "train_tokens_per_second": 143749.992 }, { "epoch": 0.4288754877616176, "grad_norm": 1.3700268268585205, "learning_rate": 2.45281553122749e-05, "loss": 0.4082, "num_input_tokens_seen": 4696899927, "step": 1209, "train_runtime": 32674.1688, "train_tokens_per_second": 143749.638 }, { "epoch": 0.4292302234835048, "grad_norm": 0.31876248121261597, "learning_rate": 2.4506396263311453e-05, "loss": 0.4065, "num_input_tokens_seen": 4700785137, "step": 1210, "train_runtime": 32705.0678, "train_tokens_per_second": 143732.622 }, { "epoch": 0.42958495920539197, "grad_norm": 0.3333556652069092, "learning_rate": 2.4484631593658258e-05, "loss": 0.3989, "num_input_tokens_seen": 4704689941, "step": 1211, "train_runtime": 32733.9268, "train_tokens_per_second": 143725.193 }, { "epoch": 0.4299396949272792, "grad_norm": 0.5238432884216309, "learning_rate": 2.4462861330461714e-05, "loss": 0.4016, "num_input_tokens_seen": 4708598512, "step": 1212, "train_runtime": 32757.8274, "train_tokens_per_second": 143739.646 }, { "epoch": 0.4302944306491664, "grad_norm": 0.34738603234291077, "learning_rate": 2.44410855008752e-05, "loss": 0.4014, "num_input_tokens_seen": 4712453850, "step": 1213, "train_runtime": 32784.0932, "train_tokens_per_second": 143742.083 }, { "epoch": 0.43064916637105355, "grad_norm": 0.2933795154094696, "learning_rate": 2.441930413205905e-05, "loss": 0.4176, "num_input_tokens_seen": 4716324404, "step": 1214, "train_runtime": 32805.693, "train_tokens_per_second": 143765.425 }, { "epoch": 0.43100390209294076, "grad_norm": 0.27811941504478455, "learning_rate": 2.4397517251180486e-05, "loss": 0.3983, "num_input_tokens_seen": 4720160216, "step": 1215, "train_runtime": 32834.0851, "train_tokens_per_second": 143757.933 }, { "epoch": 0.43135863781482797, "grad_norm": 0.27525874972343445, "learning_rate": 2.4375724885413616e-05, "loss": 0.4146, "num_input_tokens_seen": 4724054924, "step": 1216, "train_runtime": 32863.5516, "train_tokens_per_second": 143747.547 }, { "epoch": 0.43171337353671513, "grad_norm": 0.23570583760738373, "learning_rate": 2.4353927061939397e-05, "loss": 0.4078, "num_input_tokens_seen": 4727958570, "step": 1217, "train_runtime": 32897.3308, "train_tokens_per_second": 143718.607 }, { "epoch": 0.43206810925860234, "grad_norm": 0.28015071153640747, "learning_rate": 2.4332123807945575e-05, "loss": 0.3999, "num_input_tokens_seen": 4731866440, "step": 1218, "train_runtime": 32925.6351, "train_tokens_per_second": 143713.748 }, { "epoch": 0.43242284498048955, "grad_norm": 0.436252236366272, "learning_rate": 2.431031515062669e-05, "loss": 0.4139, "num_input_tokens_seen": 4735719025, "step": 1219, "train_runtime": 32962.7833, "train_tokens_per_second": 143668.663 }, { "epoch": 0.4327775807023767, "grad_norm": 0.25452709197998047, "learning_rate": 2.4288501117184012e-05, "loss": 0.4034, "num_input_tokens_seen": 4739554648, "step": 1220, "train_runtime": 32982.7741, "train_tokens_per_second": 143697.878 }, { "epoch": 0.4331323164242639, "grad_norm": 0.277667373418808, "learning_rate": 2.42666817348255e-05, "loss": 0.4187, "num_input_tokens_seen": 4743357966, "step": 1221, "train_runtime": 33003.0893, "train_tokens_per_second": 143724.665 }, { "epoch": 0.43348705214615113, "grad_norm": 0.39205873012542725, "learning_rate": 2.4244857030765813e-05, "loss": 0.4249, "num_input_tokens_seen": 4747248521, "step": 1222, "train_runtime": 33029.856, "train_tokens_per_second": 143725.983 }, { "epoch": 0.4338417878680383, "grad_norm": 0.2661028802394867, "learning_rate": 2.4223027032226235e-05, "loss": 0.4046, "num_input_tokens_seen": 4751186874, "step": 1223, "train_runtime": 33057.0001, "train_tokens_per_second": 143727.103 }, { "epoch": 0.4341965235899255, "grad_norm": 0.29254165291786194, "learning_rate": 2.4201191766434645e-05, "loss": 0.4174, "num_input_tokens_seen": 4755128721, "step": 1224, "train_runtime": 33079.555, "train_tokens_per_second": 143748.267 }, { "epoch": 0.4345512593118127, "grad_norm": 0.4405198395252228, "learning_rate": 2.417935126062551e-05, "loss": 0.4163, "num_input_tokens_seen": 4758996508, "step": 1225, "train_runtime": 33105.7376, "train_tokens_per_second": 143751.411 }, { "epoch": 0.43490599503369987, "grad_norm": 0.3299012780189514, "learning_rate": 2.4157505542039806e-05, "loss": 0.4165, "num_input_tokens_seen": 4762846601, "step": 1226, "train_runtime": 33133.1385, "train_tokens_per_second": 143748.731 }, { "epoch": 0.4352607307555871, "grad_norm": 0.254413902759552, "learning_rate": 2.4135654637925044e-05, "loss": 0.4088, "num_input_tokens_seen": 4766746436, "step": 1227, "train_runtime": 33158.7346, "train_tokens_per_second": 143755.378 }, { "epoch": 0.4356154664774743, "grad_norm": 0.3466357886791229, "learning_rate": 2.4113798575535185e-05, "loss": 0.4211, "num_input_tokens_seen": 4770575046, "step": 1228, "train_runtime": 33185.0284, "train_tokens_per_second": 143756.847 }, { "epoch": 0.4359702021993615, "grad_norm": 0.3809378445148468, "learning_rate": 2.4091937382130617e-05, "loss": 0.4099, "num_input_tokens_seen": 4774523944, "step": 1229, "train_runtime": 33207.6009, "train_tokens_per_second": 143778.045 }, { "epoch": 0.43632493792124866, "grad_norm": 0.26151373982429504, "learning_rate": 2.4070071084978136e-05, "loss": 0.4144, "num_input_tokens_seen": 4778373333, "step": 1230, "train_runtime": 33237.0134, "train_tokens_per_second": 143766.628 }, { "epoch": 0.43667967364313587, "grad_norm": 0.36808428168296814, "learning_rate": 2.4048199711350905e-05, "loss": 0.4067, "num_input_tokens_seen": 4782256683, "step": 1231, "train_runtime": 33266.2446, "train_tokens_per_second": 143757.035 }, { "epoch": 0.4370344093650231, "grad_norm": 0.32230043411254883, "learning_rate": 2.4026323288528424e-05, "loss": 0.4101, "num_input_tokens_seen": 4786153292, "step": 1232, "train_runtime": 33291.9392, "train_tokens_per_second": 143763.127 }, { "epoch": 0.43738914508691024, "grad_norm": 0.3105776906013489, "learning_rate": 2.4004441843796474e-05, "loss": 0.4175, "num_input_tokens_seen": 4790038940, "step": 1233, "train_runtime": 33317.1874, "train_tokens_per_second": 143770.808 }, { "epoch": 0.43774388080879745, "grad_norm": 0.276603639125824, "learning_rate": 2.3982555404447112e-05, "loss": 0.4094, "num_input_tokens_seen": 4793838682, "step": 1234, "train_runtime": 33338.7676, "train_tokens_per_second": 143791.718 }, { "epoch": 0.43809861653068466, "grad_norm": 0.26512864232063293, "learning_rate": 2.396066399777863e-05, "loss": 0.4111, "num_input_tokens_seen": 4797746495, "step": 1235, "train_runtime": 33360.571, "train_tokens_per_second": 143814.88 }, { "epoch": 0.4384533522525718, "grad_norm": 0.28836068511009216, "learning_rate": 2.3938767651095495e-05, "loss": 0.4108, "num_input_tokens_seen": 4801729585, "step": 1236, "train_runtime": 33376.9663, "train_tokens_per_second": 143863.572 }, { "epoch": 0.438808087974459, "grad_norm": 0.29483187198638916, "learning_rate": 2.3916866391708352e-05, "loss": 0.4065, "num_input_tokens_seen": 4805601275, "step": 1237, "train_runtime": 33396.6992, "train_tokens_per_second": 143894.498 }, { "epoch": 0.43916282369634624, "grad_norm": 0.4727686643600464, "learning_rate": 2.3894960246933975e-05, "loss": 0.4088, "num_input_tokens_seen": 4809496683, "step": 1238, "train_runtime": 33425.2842, "train_tokens_per_second": 143887.982 }, { "epoch": 0.4395175594182334, "grad_norm": 0.3152243196964264, "learning_rate": 2.3873049244095228e-05, "loss": 0.4076, "num_input_tokens_seen": 4813387107, "step": 1239, "train_runtime": 33457.3535, "train_tokens_per_second": 143866.343 }, { "epoch": 0.4398722951401206, "grad_norm": 0.25903385877609253, "learning_rate": 2.385113341052102e-05, "loss": 0.4033, "num_input_tokens_seen": 4817234820, "step": 1240, "train_runtime": 33479.7544, "train_tokens_per_second": 143885.011 }, { "epoch": 0.4402270308620078, "grad_norm": 0.3005770444869995, "learning_rate": 2.3829212773546305e-05, "loss": 0.4082, "num_input_tokens_seen": 4821193634, "step": 1241, "train_runtime": 33502.3194, "train_tokens_per_second": 143906.265 }, { "epoch": 0.440581766583895, "grad_norm": 0.2841758728027344, "learning_rate": 2.3807287360512028e-05, "loss": 0.4029, "num_input_tokens_seen": 4825095644, "step": 1242, "train_runtime": 33524.5079, "train_tokens_per_second": 143927.411 }, { "epoch": 0.4409365023057822, "grad_norm": 0.41492795944213867, "learning_rate": 2.378535719876507e-05, "loss": 0.4168, "num_input_tokens_seen": 4829000813, "step": 1243, "train_runtime": 33553.565, "train_tokens_per_second": 143919.158 }, { "epoch": 0.4412912380276694, "grad_norm": 0.7162259817123413, "learning_rate": 2.3763422315658256e-05, "loss": 0.4016, "num_input_tokens_seen": 4832856091, "step": 1244, "train_runtime": 33581.429, "train_tokens_per_second": 143914.545 }, { "epoch": 0.44164597374955655, "grad_norm": 0.28054094314575195, "learning_rate": 2.3741482738550294e-05, "loss": 0.413, "num_input_tokens_seen": 4836784888, "step": 1245, "train_runtime": 33619.8825, "train_tokens_per_second": 143866.799 }, { "epoch": 0.44200070947144376, "grad_norm": 0.2937318682670593, "learning_rate": 2.371953849480574e-05, "loss": 0.4177, "num_input_tokens_seen": 4840618066, "step": 1246, "train_runtime": 33645.2682, "train_tokens_per_second": 143872.179 }, { "epoch": 0.442355445193331, "grad_norm": 0.3462585210800171, "learning_rate": 2.369758961179498e-05, "loss": 0.4177, "num_input_tokens_seen": 4844523612, "step": 1247, "train_runtime": 33672.1411, "train_tokens_per_second": 143873.346 }, { "epoch": 0.4427101809152182, "grad_norm": 0.23972797393798828, "learning_rate": 2.3675636116894185e-05, "loss": 0.4022, "num_input_tokens_seen": 4848364261, "step": 1248, "train_runtime": 33698.9564, "train_tokens_per_second": 143872.831 }, { "epoch": 0.44306491663710534, "grad_norm": 0.46071964502334595, "learning_rate": 2.3653678037485267e-05, "loss": 0.4128, "num_input_tokens_seen": 4852334880, "step": 1249, "train_runtime": 33720.7033, "train_tokens_per_second": 143897.796 }, { "epoch": 0.44341965235899256, "grad_norm": 0.28775453567504883, "learning_rate": 2.3631715400955867e-05, "loss": 0.4009, "num_input_tokens_seen": 4856146162, "step": 1250, "train_runtime": 33746.8692, "train_tokens_per_second": 143899.161 }, { "epoch": 0.44377438808087977, "grad_norm": 0.2461645007133484, "learning_rate": 2.3609748234699308e-05, "loss": 0.4029, "num_input_tokens_seen": 4860058316, "step": 1251, "train_runtime": 33773.0511, "train_tokens_per_second": 143903.442 }, { "epoch": 0.4441291238027669, "grad_norm": 0.26986831426620483, "learning_rate": 2.3587776566114564e-05, "loss": 0.407, "num_input_tokens_seen": 4863909812, "step": 1252, "train_runtime": 33795.3019, "train_tokens_per_second": 143922.662 }, { "epoch": 0.44448385952465413, "grad_norm": 0.25189054012298584, "learning_rate": 2.3565800422606226e-05, "loss": 0.4174, "num_input_tokens_seen": 4867834760, "step": 1253, "train_runtime": 33824.2465, "train_tokens_per_second": 143915.542 }, { "epoch": 0.44483859524654135, "grad_norm": 0.2626068890094757, "learning_rate": 2.354381983158446e-05, "loss": 0.4056, "num_input_tokens_seen": 4871678083, "step": 1254, "train_runtime": 33852.0782, "train_tokens_per_second": 143910.754 }, { "epoch": 0.4451933309684285, "grad_norm": 0.28057220578193665, "learning_rate": 2.3521834820464978e-05, "loss": 0.4164, "num_input_tokens_seen": 4875537247, "step": 1255, "train_runtime": 33877.6487, "train_tokens_per_second": 143916.046 }, { "epoch": 0.4455480666903157, "grad_norm": 0.23324431478977203, "learning_rate": 2.3499845416669013e-05, "loss": 0.4093, "num_input_tokens_seen": 4879411262, "step": 1256, "train_runtime": 33900.5466, "train_tokens_per_second": 143933.115 }, { "epoch": 0.4459028024122029, "grad_norm": 0.24609394371509552, "learning_rate": 2.347785164762328e-05, "loss": 0.4081, "num_input_tokens_seen": 4883220368, "step": 1257, "train_runtime": 33923.6381, "train_tokens_per_second": 143947.426 }, { "epoch": 0.4462575381340901, "grad_norm": 0.2727622985839844, "learning_rate": 2.345585354075992e-05, "loss": 0.3965, "num_input_tokens_seen": 4887080838, "step": 1258, "train_runtime": 33954.9429, "train_tokens_per_second": 143928.407 }, { "epoch": 0.4466122738559773, "grad_norm": 0.2555004954338074, "learning_rate": 2.3433851123516508e-05, "loss": 0.4123, "num_input_tokens_seen": 4890986896, "step": 1259, "train_runtime": 33987.0854, "train_tokens_per_second": 143907.218 }, { "epoch": 0.4469670095778645, "grad_norm": 0.25071290135383606, "learning_rate": 2.3411844423335976e-05, "loss": 0.4053, "num_input_tokens_seen": 4894862158, "step": 1260, "train_runtime": 34011.2234, "train_tokens_per_second": 143919.027 }, { "epoch": 0.44732174529975166, "grad_norm": 0.232564315199852, "learning_rate": 2.3389833467666607e-05, "loss": 0.412, "num_input_tokens_seen": 4898746543, "step": 1261, "train_runtime": 34033.8216, "train_tokens_per_second": 143937.598 }, { "epoch": 0.4476764810216389, "grad_norm": 0.29182595014572144, "learning_rate": 2.336781828396199e-05, "loss": 0.4181, "num_input_tokens_seen": 4902648652, "step": 1262, "train_runtime": 34054.674, "train_tokens_per_second": 143964.046 }, { "epoch": 0.4480312167435261, "grad_norm": 0.2603941857814789, "learning_rate": 2.3345798899681e-05, "loss": 0.4052, "num_input_tokens_seen": 4906536684, "step": 1263, "train_runtime": 34080.4957, "train_tokens_per_second": 143969.053 }, { "epoch": 0.44838595246541324, "grad_norm": 0.2606193721294403, "learning_rate": 2.332377534228772e-05, "loss": 0.4112, "num_input_tokens_seen": 4910339933, "step": 1264, "train_runtime": 34101.756, "train_tokens_per_second": 143990.824 }, { "epoch": 0.44874068818730045, "grad_norm": 0.3912082016468048, "learning_rate": 2.330174763925147e-05, "loss": 0.396, "num_input_tokens_seen": 4914263980, "step": 1265, "train_runtime": 34134.6592, "train_tokens_per_second": 143966.986 }, { "epoch": 0.44909542390918766, "grad_norm": 0.27192461490631104, "learning_rate": 2.327971581804672e-05, "loss": 0.4206, "num_input_tokens_seen": 4918086733, "step": 1266, "train_runtime": 34159.5997, "train_tokens_per_second": 143973.781 }, { "epoch": 0.4494501596310749, "grad_norm": 0.2477947473526001, "learning_rate": 2.3257679906153094e-05, "loss": 0.4099, "num_input_tokens_seen": 4922030721, "step": 1267, "train_runtime": 34185.6185, "train_tokens_per_second": 143979.572 }, { "epoch": 0.44980489535296203, "grad_norm": 0.24683484435081482, "learning_rate": 2.32356399310553e-05, "loss": 0.3995, "num_input_tokens_seen": 4925896008, "step": 1268, "train_runtime": 34209.6923, "train_tokens_per_second": 143991.24 }, { "epoch": 0.45015963107484924, "grad_norm": 0.22945919632911682, "learning_rate": 2.3213595920243127e-05, "loss": 0.4107, "num_input_tokens_seen": 4929821106, "step": 1269, "train_runtime": 34231.4144, "train_tokens_per_second": 144014.532 }, { "epoch": 0.45051436679673645, "grad_norm": 0.24435026943683624, "learning_rate": 2.3191547901211385e-05, "loss": 0.4304, "num_input_tokens_seen": 4933677999, "step": 1270, "train_runtime": 34258.5931, "train_tokens_per_second": 144012.861 }, { "epoch": 0.4508691025186236, "grad_norm": 0.2522900104522705, "learning_rate": 2.3169495901459905e-05, "loss": 0.4066, "num_input_tokens_seen": 4937530029, "step": 1271, "train_runtime": 34278.8332, "train_tokens_per_second": 144040.201 }, { "epoch": 0.4512238382405108, "grad_norm": 0.418501615524292, "learning_rate": 2.3147439948493462e-05, "loss": 0.416, "num_input_tokens_seen": 4941455678, "step": 1272, "train_runtime": 34315.6442, "train_tokens_per_second": 144000.085 }, { "epoch": 0.45157857396239803, "grad_norm": 0.2566138505935669, "learning_rate": 2.3125380069821772e-05, "loss": 0.4026, "num_input_tokens_seen": 4945233125, "step": 1273, "train_runtime": 34346.6604, "train_tokens_per_second": 143980.028 }, { "epoch": 0.4519333096842852, "grad_norm": 0.24685513973236084, "learning_rate": 2.3103316292959437e-05, "loss": 0.4157, "num_input_tokens_seen": 4949112851, "step": 1274, "train_runtime": 34378.678, "train_tokens_per_second": 143958.789 }, { "epoch": 0.4522880454061724, "grad_norm": 0.34732431173324585, "learning_rate": 2.3081248645425936e-05, "loss": 0.4128, "num_input_tokens_seen": 4952971992, "step": 1275, "train_runtime": 34402.7478, "train_tokens_per_second": 143970.244 }, { "epoch": 0.4526427811280596, "grad_norm": 0.2914241850376129, "learning_rate": 2.3059177154745572e-05, "loss": 0.421, "num_input_tokens_seen": 4956845127, "step": 1276, "train_runtime": 34434.9555, "train_tokens_per_second": 143948.062 }, { "epoch": 0.45299751684994677, "grad_norm": 0.2743145227432251, "learning_rate": 2.303710184844743e-05, "loss": 0.4098, "num_input_tokens_seen": 4960764290, "step": 1277, "train_runtime": 34456.0817, "train_tokens_per_second": 143973.547 }, { "epoch": 0.453352252571834, "grad_norm": 0.26053953170776367, "learning_rate": 2.3015022754065358e-05, "loss": 0.4176, "num_input_tokens_seen": 4964529855, "step": 1278, "train_runtime": 34482.3253, "train_tokens_per_second": 143973.175 }, { "epoch": 0.4537069882937212, "grad_norm": 0.3364761471748352, "learning_rate": 2.2992939899137947e-05, "loss": 0.406, "num_input_tokens_seen": 4968470858, "step": 1279, "train_runtime": 34509.8784, "train_tokens_per_second": 143972.424 }, { "epoch": 0.45406172401560835, "grad_norm": 0.4305266737937927, "learning_rate": 2.2970853311208455e-05, "loss": 0.417, "num_input_tokens_seen": 4972336487, "step": 1280, "train_runtime": 34539.7176, "train_tokens_per_second": 143959.964 }, { "epoch": 0.45441645973749556, "grad_norm": 0.31449154019355774, "learning_rate": 2.2948763017824804e-05, "loss": 0.3961, "num_input_tokens_seen": 4976214638, "step": 1281, "train_runtime": 34562.3267, "train_tokens_per_second": 143977.999 }, { "epoch": 0.45477119545938277, "grad_norm": 0.24909436702728271, "learning_rate": 2.2926669046539548e-05, "loss": 0.4191, "num_input_tokens_seen": 4980101141, "step": 1282, "train_runtime": 34584.6378, "train_tokens_per_second": 143997.493 }, { "epoch": 0.4551259311812699, "grad_norm": 0.31953638792037964, "learning_rate": 2.290457142490981e-05, "loss": 0.4069, "num_input_tokens_seen": 4984007222, "step": 1283, "train_runtime": 34614.1575, "train_tokens_per_second": 143987.535 }, { "epoch": 0.45548066690315714, "grad_norm": 0.3114950954914093, "learning_rate": 2.2882470180497277e-05, "loss": 0.4108, "num_input_tokens_seen": 4987890758, "step": 1284, "train_runtime": 34634.6302, "train_tokens_per_second": 144014.552 }, { "epoch": 0.45583540262504435, "grad_norm": 0.6252780556678772, "learning_rate": 2.2860365340868162e-05, "loss": 0.424, "num_input_tokens_seen": 4991818512, "step": 1285, "train_runtime": 34653.4138, "train_tokens_per_second": 144049.834 }, { "epoch": 0.45619013834693156, "grad_norm": 0.24292562901973724, "learning_rate": 2.283825693359313e-05, "loss": 0.4002, "num_input_tokens_seen": 4995613342, "step": 1286, "train_runtime": 34685.0379, "train_tokens_per_second": 144027.905 }, { "epoch": 0.4565448740688187, "grad_norm": 0.2231011688709259, "learning_rate": 2.2816144986247342e-05, "loss": 0.405, "num_input_tokens_seen": 4999502933, "step": 1287, "train_runtime": 34716.6714, "train_tokens_per_second": 144008.706 }, { "epoch": 0.45689960979070593, "grad_norm": 0.23259273171424866, "learning_rate": 2.2794029526410348e-05, "loss": 0.4034, "num_input_tokens_seen": 5003402988, "step": 1288, "train_runtime": 34742.1501, "train_tokens_per_second": 144015.353 }, { "epoch": 0.45725434551259314, "grad_norm": 0.3179205656051636, "learning_rate": 2.2771910581666075e-05, "loss": 0.407, "num_input_tokens_seen": 5007331601, "step": 1289, "train_runtime": 34774.4877, "train_tokens_per_second": 143994.403 }, { "epoch": 0.4576090812344803, "grad_norm": 0.23604604601860046, "learning_rate": 2.2749788179602807e-05, "loss": 0.4112, "num_input_tokens_seen": 5011172613, "step": 1290, "train_runtime": 34806.1652, "train_tokens_per_second": 143973.706 }, { "epoch": 0.4579638169563675, "grad_norm": 0.24936999380588531, "learning_rate": 2.2727662347813136e-05, "loss": 0.4142, "num_input_tokens_seen": 5015097568, "step": 1291, "train_runtime": 34837.8975, "train_tokens_per_second": 143955.231 }, { "epoch": 0.4583185526782547, "grad_norm": 0.22957651317119598, "learning_rate": 2.2705533113893932e-05, "loss": 0.4132, "num_input_tokens_seen": 5018980388, "step": 1292, "train_runtime": 34864.6223, "train_tokens_per_second": 143956.253 }, { "epoch": 0.4586732884001419, "grad_norm": 0.23690378665924072, "learning_rate": 2.2683400505446312e-05, "loss": 0.4123, "num_input_tokens_seen": 5022846072, "step": 1293, "train_runtime": 34887.6596, "train_tokens_per_second": 143971.998 }, { "epoch": 0.4590280241220291, "grad_norm": 0.7049081921577454, "learning_rate": 2.26612645500756e-05, "loss": 0.4035, "num_input_tokens_seen": 5026753517, "step": 1294, "train_runtime": 34917.7525, "train_tokens_per_second": 143959.824 }, { "epoch": 0.4593827598439163, "grad_norm": 0.2592252790927887, "learning_rate": 2.263912527539129e-05, "loss": 0.408, "num_input_tokens_seen": 5030649170, "step": 1295, "train_runtime": 34945.6407, "train_tokens_per_second": 143956.415 }, { "epoch": 0.45973749556580346, "grad_norm": 0.287469744682312, "learning_rate": 2.261698270900702e-05, "loss": 0.4073, "num_input_tokens_seen": 5034574838, "step": 1296, "train_runtime": 34971.9326, "train_tokens_per_second": 143960.441 }, { "epoch": 0.46009223128769067, "grad_norm": 0.36348605155944824, "learning_rate": 2.2594836878540538e-05, "loss": 0.425, "num_input_tokens_seen": 5038425266, "step": 1297, "train_runtime": 34996.4211, "train_tokens_per_second": 143969.729 }, { "epoch": 0.4604469670095779, "grad_norm": 0.26596367359161377, "learning_rate": 2.2572687811613664e-05, "loss": 0.4172, "num_input_tokens_seen": 5042321974, "step": 1298, "train_runtime": 35023.0715, "train_tokens_per_second": 143971.438 }, { "epoch": 0.46080170273146503, "grad_norm": 0.3064494729042053, "learning_rate": 2.255053553585223e-05, "loss": 0.4098, "num_input_tokens_seen": 5046239902, "step": 1299, "train_runtime": 35053.7707, "train_tokens_per_second": 143957.121 }, { "epoch": 0.46115643845335225, "grad_norm": 0.25967180728912354, "learning_rate": 2.2528380078886104e-05, "loss": 0.4087, "num_input_tokens_seen": 5050136427, "step": 1300, "train_runtime": 35076.0695, "train_tokens_per_second": 143976.691 }, { "epoch": 0.46151117417523946, "grad_norm": 0.22883804142475128, "learning_rate": 2.250622146834911e-05, "loss": 0.3991, "num_input_tokens_seen": 5054016897, "step": 1301, "train_runtime": 35098.1714, "train_tokens_per_second": 143996.587 }, { "epoch": 0.4618659098971266, "grad_norm": 0.23243603110313416, "learning_rate": 2.2484059731878993e-05, "loss": 0.4062, "num_input_tokens_seen": 5057916017, "step": 1302, "train_runtime": 35117.2784, "train_tokens_per_second": 144029.271 }, { "epoch": 0.4622206456190138, "grad_norm": 0.2556648552417755, "learning_rate": 2.246189489711741e-05, "loss": 0.4021, "num_input_tokens_seen": 5061800858, "step": 1303, "train_runtime": 35148.3435, "train_tokens_per_second": 144012.501 }, { "epoch": 0.46257538134090104, "grad_norm": 0.2512328624725342, "learning_rate": 2.2439726991709877e-05, "loss": 0.4072, "num_input_tokens_seen": 5065713306, "step": 1304, "train_runtime": 35173.9197, "train_tokens_per_second": 144019.016 }, { "epoch": 0.46293011706278825, "grad_norm": 0.2443309873342514, "learning_rate": 2.2417556043305752e-05, "loss": 0.4132, "num_input_tokens_seen": 5069591181, "step": 1305, "train_runtime": 35193.4734, "train_tokens_per_second": 144049.186 }, { "epoch": 0.4632848527846754, "grad_norm": 0.2247077077627182, "learning_rate": 2.239538207955817e-05, "loss": 0.4083, "num_input_tokens_seen": 5073415764, "step": 1306, "train_runtime": 35215.2265, "train_tokens_per_second": 144068.81 }, { "epoch": 0.4636395885065626, "grad_norm": 0.3250524699687958, "learning_rate": 2.2373205128124038e-05, "loss": 0.4148, "num_input_tokens_seen": 5077299093, "step": 1307, "train_runtime": 35246.6507, "train_tokens_per_second": 144050.541 }, { "epoch": 0.46399432422844983, "grad_norm": 0.2622189223766327, "learning_rate": 2.2351025216663986e-05, "loss": 0.4115, "num_input_tokens_seen": 5081215138, "step": 1308, "train_runtime": 35270.0692, "train_tokens_per_second": 144065.925 }, { "epoch": 0.464349059950337, "grad_norm": 0.253925085067749, "learning_rate": 2.2328842372842335e-05, "loss": 0.4203, "num_input_tokens_seen": 5085025400, "step": 1309, "train_runtime": 35289.0732, "train_tokens_per_second": 144096.315 }, { "epoch": 0.4647037956722242, "grad_norm": 0.3282214403152466, "learning_rate": 2.230665662432707e-05, "loss": 0.3973, "num_input_tokens_seen": 5088893165, "step": 1310, "train_runtime": 35326.4505, "train_tokens_per_second": 144053.34 }, { "epoch": 0.4650585313941114, "grad_norm": 0.2549915313720703, "learning_rate": 2.2284467998789792e-05, "loss": 0.4063, "num_input_tokens_seen": 5092852019, "step": 1311, "train_runtime": 35350.263, "train_tokens_per_second": 144068.292 }, { "epoch": 0.46541326711599856, "grad_norm": 0.22722120583057404, "learning_rate": 2.226227652390569e-05, "loss": 0.4033, "num_input_tokens_seen": 5096708180, "step": 1312, "train_runtime": 35376.0956, "train_tokens_per_second": 144072.094 }, { "epoch": 0.4657680028378858, "grad_norm": 0.2596665322780609, "learning_rate": 2.224008222735351e-05, "loss": 0.406, "num_input_tokens_seen": 5100593403, "step": 1313, "train_runtime": 35407.9936, "train_tokens_per_second": 144052.031 }, { "epoch": 0.466122738559773, "grad_norm": 0.231379896402359, "learning_rate": 2.2217885136815518e-05, "loss": 0.4072, "num_input_tokens_seen": 5104510552, "step": 1314, "train_runtime": 35430.3449, "train_tokens_per_second": 144071.715 }, { "epoch": 0.46647747428166014, "grad_norm": 0.23279482126235962, "learning_rate": 2.2195685279977468e-05, "loss": 0.4042, "num_input_tokens_seen": 5108416699, "step": 1315, "train_runtime": 35454.5277, "train_tokens_per_second": 144083.62 }, { "epoch": 0.46683221000354735, "grad_norm": 0.31322941184043884, "learning_rate": 2.217348268452856e-05, "loss": 0.4101, "num_input_tokens_seen": 5112314934, "step": 1316, "train_runtime": 35475.8627, "train_tokens_per_second": 144106.853 }, { "epoch": 0.46718694572543457, "grad_norm": 0.24372750520706177, "learning_rate": 2.2151277378161396e-05, "loss": 0.4121, "num_input_tokens_seen": 5116195589, "step": 1317, "train_runtime": 35497.5683, "train_tokens_per_second": 144128.058 }, { "epoch": 0.4675416814473217, "grad_norm": 0.2643415927886963, "learning_rate": 2.212906938857199e-05, "loss": 0.4106, "num_input_tokens_seen": 5120074158, "step": 1318, "train_runtime": 35532.2572, "train_tokens_per_second": 144096.507 }, { "epoch": 0.46789641716920893, "grad_norm": 0.25321412086486816, "learning_rate": 2.2106858743459685e-05, "loss": 0.4053, "num_input_tokens_seen": 5123969207, "step": 1319, "train_runtime": 35563.1761, "train_tokens_per_second": 144080.753 }, { "epoch": 0.46825115289109615, "grad_norm": 0.24102304875850677, "learning_rate": 2.2084645470527122e-05, "loss": 0.4142, "num_input_tokens_seen": 5127856870, "step": 1320, "train_runtime": 35595.4244, "train_tokens_per_second": 144059.439 }, { "epoch": 0.4686058886129833, "grad_norm": 0.25037679076194763, "learning_rate": 2.206242959748025e-05, "loss": 0.4078, "num_input_tokens_seen": 5131688755, "step": 1321, "train_runtime": 35622.8113, "train_tokens_per_second": 144056.254 }, { "epoch": 0.4689606243348705, "grad_norm": 0.23294520378112793, "learning_rate": 2.2040211152028234e-05, "loss": 0.409, "num_input_tokens_seen": 5135541938, "step": 1322, "train_runtime": 35651.8906, "train_tokens_per_second": 144046.833 }, { "epoch": 0.4693153600567577, "grad_norm": 0.35393279790878296, "learning_rate": 2.2017990161883464e-05, "loss": 0.4079, "num_input_tokens_seen": 5139438346, "step": 1323, "train_runtime": 35672.8645, "train_tokens_per_second": 144071.367 }, { "epoch": 0.46967009577864494, "grad_norm": 0.22677886486053467, "learning_rate": 2.1995766654761504e-05, "loss": 0.4058, "num_input_tokens_seen": 5143350014, "step": 1324, "train_runtime": 35694.3989, "train_tokens_per_second": 144094.036 }, { "epoch": 0.4700248315005321, "grad_norm": 0.3117527365684509, "learning_rate": 2.1973540658381043e-05, "loss": 0.3962, "num_input_tokens_seen": 5147229150, "step": 1325, "train_runtime": 35727.8778, "train_tokens_per_second": 144067.587 }, { "epoch": 0.4703795672224193, "grad_norm": 0.24982531368732452, "learning_rate": 2.1951312200463897e-05, "loss": 0.406, "num_input_tokens_seen": 5151075818, "step": 1326, "train_runtime": 35753.6056, "train_tokens_per_second": 144071.506 }, { "epoch": 0.4707343029443065, "grad_norm": 0.23793800175189972, "learning_rate": 2.192908130873493e-05, "loss": 0.412, "num_input_tokens_seen": 5154938571, "step": 1327, "train_runtime": 35780.3589, "train_tokens_per_second": 144071.74 }, { "epoch": 0.47108903866619367, "grad_norm": 0.2331334948539734, "learning_rate": 2.1906848010922042e-05, "loss": 0.4023, "num_input_tokens_seen": 5158901823, "step": 1328, "train_runtime": 35797.9634, "train_tokens_per_second": 144111.601 }, { "epoch": 0.4714437743880809, "grad_norm": 0.2592419385910034, "learning_rate": 2.1884612334756166e-05, "loss": 0.4234, "num_input_tokens_seen": 5162722782, "step": 1329, "train_runtime": 35823.2137, "train_tokens_per_second": 144116.685 }, { "epoch": 0.4717985101099681, "grad_norm": 0.262740820646286, "learning_rate": 2.1862374307971162e-05, "loss": 0.4046, "num_input_tokens_seen": 5166656183, "step": 1330, "train_runtime": 35850.5866, "train_tokens_per_second": 144116.364 }, { "epoch": 0.47215324583185525, "grad_norm": 0.26203182339668274, "learning_rate": 2.1840133958303835e-05, "loss": 0.4068, "num_input_tokens_seen": 5170547710, "step": 1331, "train_runtime": 35878.713, "train_tokens_per_second": 144111.85 }, { "epoch": 0.47250798155374246, "grad_norm": 0.250895619392395, "learning_rate": 2.1817891313493902e-05, "loss": 0.4013, "num_input_tokens_seen": 5174431686, "step": 1332, "train_runtime": 35901.59, "train_tokens_per_second": 144128.204 }, { "epoch": 0.4728627172756297, "grad_norm": 0.24888338148593903, "learning_rate": 2.1795646401283933e-05, "loss": 0.4113, "num_input_tokens_seen": 5178333134, "step": 1333, "train_runtime": 35922.8651, "train_tokens_per_second": 144151.451 }, { "epoch": 0.47321745299751683, "grad_norm": 0.29383769631385803, "learning_rate": 2.1773399249419312e-05, "loss": 0.4145, "num_input_tokens_seen": 5182219603, "step": 1334, "train_runtime": 35945.1065, "train_tokens_per_second": 144170.378 }, { "epoch": 0.47357218871940404, "grad_norm": 0.2537422180175781, "learning_rate": 2.1751149885648237e-05, "loss": 0.4115, "num_input_tokens_seen": 5186152646, "step": 1335, "train_runtime": 35971.9546, "train_tokens_per_second": 144172.111 }, { "epoch": 0.47392692444129125, "grad_norm": 0.3778870701789856, "learning_rate": 2.1728898337721657e-05, "loss": 0.424, "num_input_tokens_seen": 5190017673, "step": 1336, "train_runtime": 36006.8728, "train_tokens_per_second": 144139.64 }, { "epoch": 0.4742816601631784, "grad_norm": 0.27395132184028625, "learning_rate": 2.170664463339324e-05, "loss": 0.4139, "num_input_tokens_seen": 5193886741, "step": 1337, "train_runtime": 36040.8612, "train_tokens_per_second": 144111.061 }, { "epoch": 0.4746363958850656, "grad_norm": 0.23147693276405334, "learning_rate": 2.168438880041935e-05, "loss": 0.4057, "num_input_tokens_seen": 5197803675, "step": 1338, "train_runtime": 36075.6419, "train_tokens_per_second": 144080.698 }, { "epoch": 0.47499113160695283, "grad_norm": 0.2853138744831085, "learning_rate": 2.1662130866559e-05, "loss": 0.4155, "num_input_tokens_seen": 5201678771, "step": 1339, "train_runtime": 36106.7189, "train_tokens_per_second": 144064.011 }, { "epoch": 0.47534586732884, "grad_norm": 0.35343456268310547, "learning_rate": 2.1639870859573836e-05, "loss": 0.398, "num_input_tokens_seen": 5205590980, "step": 1340, "train_runtime": 36140.6768, "train_tokens_per_second": 144036.898 }, { "epoch": 0.4757006030507272, "grad_norm": 0.3095698654651642, "learning_rate": 2.1617608807228087e-05, "loss": 0.4039, "num_input_tokens_seen": 5209492936, "step": 1341, "train_runtime": 36165.0146, "train_tokens_per_second": 144047.859 }, { "epoch": 0.4760553387726144, "grad_norm": 0.25755155086517334, "learning_rate": 2.1595344737288513e-05, "loss": 0.4067, "num_input_tokens_seen": 5213403624, "step": 1342, "train_runtime": 36189.4121, "train_tokens_per_second": 144058.809 }, { "epoch": 0.4764100744945016, "grad_norm": 0.3232269287109375, "learning_rate": 2.157307867752441e-05, "loss": 0.3995, "num_input_tokens_seen": 5217173064, "step": 1343, "train_runtime": 36217.5143, "train_tokens_per_second": 144051.108 }, { "epoch": 0.4767648102163888, "grad_norm": 0.26626676321029663, "learning_rate": 2.155081065570756e-05, "loss": 0.4091, "num_input_tokens_seen": 5221119767, "step": 1344, "train_runtime": 36246.582, "train_tokens_per_second": 144044.472 }, { "epoch": 0.477119545938276, "grad_norm": 0.41144442558288574, "learning_rate": 2.152854069961216e-05, "loss": 0.4188, "num_input_tokens_seen": 5224937854, "step": 1345, "train_runtime": 36272.2187, "train_tokens_per_second": 144047.926 }, { "epoch": 0.4774742816601632, "grad_norm": 0.3358058035373688, "learning_rate": 2.150626883701487e-05, "loss": 0.395, "num_input_tokens_seen": 5228863839, "step": 1346, "train_runtime": 36292.4072, "train_tokens_per_second": 144075.972 }, { "epoch": 0.47782901738205036, "grad_norm": 0.26901355385780334, "learning_rate": 2.148399509569468e-05, "loss": 0.4094, "num_input_tokens_seen": 5232818597, "step": 1347, "train_runtime": 36313.0749, "train_tokens_per_second": 144102.878 }, { "epoch": 0.47818375310393757, "grad_norm": 0.38011717796325684, "learning_rate": 2.1461719503432962e-05, "loss": 0.4083, "num_input_tokens_seen": 5236711191, "step": 1348, "train_runtime": 36333.5747, "train_tokens_per_second": 144128.708 }, { "epoch": 0.4785384888258248, "grad_norm": 0.3541699945926666, "learning_rate": 2.1439442088013362e-05, "loss": 0.4121, "num_input_tokens_seen": 5240570340, "step": 1349, "train_runtime": 36359.9158, "train_tokens_per_second": 144130.431 }, { "epoch": 0.47889322454771194, "grad_norm": 0.3266730010509491, "learning_rate": 2.1417162877221833e-05, "loss": 0.4082, "num_input_tokens_seen": 5244469067, "step": 1350, "train_runtime": 36380.9326, "train_tokens_per_second": 144154.333 }, { "epoch": 0.47924796026959915, "grad_norm": 0.2681606411933899, "learning_rate": 2.139488189884653e-05, "loss": 0.411, "num_input_tokens_seen": 5248338944, "step": 1351, "train_runtime": 36409.6764, "train_tokens_per_second": 144146.817 }, { "epoch": 0.47960269599148636, "grad_norm": 0.32533618807792664, "learning_rate": 2.1372599180677854e-05, "loss": 0.4079, "num_input_tokens_seen": 5252180271, "step": 1352, "train_runtime": 36449.2061, "train_tokens_per_second": 144095.876 }, { "epoch": 0.4799574317133735, "grad_norm": 0.26449185609817505, "learning_rate": 2.1350314750508345e-05, "loss": 0.4035, "num_input_tokens_seen": 5255967526, "step": 1353, "train_runtime": 36474.7183, "train_tokens_per_second": 144098.92 }, { "epoch": 0.48031216743526073, "grad_norm": 0.3368779122829437, "learning_rate": 2.132802863613269e-05, "loss": 0.4064, "num_input_tokens_seen": 5259900659, "step": 1354, "train_runtime": 36497.6429, "train_tokens_per_second": 144116.174 }, { "epoch": 0.48066690315714794, "grad_norm": 0.2846793830394745, "learning_rate": 2.1305740865347674e-05, "loss": 0.4102, "num_input_tokens_seen": 5263758657, "step": 1355, "train_runtime": 36525.8155, "train_tokens_per_second": 144110.64 }, { "epoch": 0.4810216388790351, "grad_norm": 0.3853268623352051, "learning_rate": 2.1283451465952153e-05, "loss": 0.4182, "num_input_tokens_seen": 5267646420, "step": 1356, "train_runtime": 36553.0662, "train_tokens_per_second": 144109.564 }, { "epoch": 0.4813763746009223, "grad_norm": 0.308368057012558, "learning_rate": 2.126116046574701e-05, "loss": 0.408, "num_input_tokens_seen": 5271529940, "step": 1357, "train_runtime": 36579.8945, "train_tokens_per_second": 144110.037 }, { "epoch": 0.4817311103228095, "grad_norm": 0.2527218461036682, "learning_rate": 2.1238867892535117e-05, "loss": 0.4094, "num_input_tokens_seen": 5275431597, "step": 1358, "train_runtime": 36609.2435, "train_tokens_per_second": 144101.082 }, { "epoch": 0.4820858460446967, "grad_norm": 0.3460901081562042, "learning_rate": 2.1216573774121333e-05, "loss": 0.3991, "num_input_tokens_seen": 5279306694, "step": 1359, "train_runtime": 36632.5055, "train_tokens_per_second": 144115.359 }, { "epoch": 0.4824405817665839, "grad_norm": 0.24704410135746002, "learning_rate": 2.1194278138312418e-05, "loss": 0.4008, "num_input_tokens_seen": 5283131299, "step": 1360, "train_runtime": 36655.0179, "train_tokens_per_second": 144131.189 }, { "epoch": 0.4827953174884711, "grad_norm": 0.2699699401855469, "learning_rate": 2.1171981012917034e-05, "loss": 0.4087, "num_input_tokens_seen": 5287038176, "step": 1361, "train_runtime": 36675.7398, "train_tokens_per_second": 144156.279 }, { "epoch": 0.4831500532103583, "grad_norm": 0.26377612352371216, "learning_rate": 2.1149682425745698e-05, "loss": 0.3942, "num_input_tokens_seen": 5290969354, "step": 1362, "train_runtime": 36703.6009, "train_tokens_per_second": 144153.958 }, { "epoch": 0.48350478893224547, "grad_norm": 0.2519405484199524, "learning_rate": 2.1127382404610764e-05, "loss": 0.4085, "num_input_tokens_seen": 5294825895, "step": 1363, "train_runtime": 36730.4579, "train_tokens_per_second": 144153.55 }, { "epoch": 0.4838595246541327, "grad_norm": 0.2838764190673828, "learning_rate": 2.1105080977326355e-05, "loss": 0.4156, "num_input_tokens_seen": 5298686455, "step": 1364, "train_runtime": 36749.0169, "train_tokens_per_second": 144185.802 }, { "epoch": 0.4842142603760199, "grad_norm": 0.2802467942237854, "learning_rate": 2.1082778171708355e-05, "loss": 0.3972, "num_input_tokens_seen": 5302541955, "step": 1365, "train_runtime": 36774.661, "train_tokens_per_second": 144190.097 }, { "epoch": 0.48456899609790705, "grad_norm": 0.271724134683609, "learning_rate": 2.1060474015574376e-05, "loss": 0.4115, "num_input_tokens_seen": 5306458612, "step": 1366, "train_runtime": 36799.5931, "train_tokens_per_second": 144198.839 }, { "epoch": 0.48492373181979426, "grad_norm": 0.32947617769241333, "learning_rate": 2.10381685367437e-05, "loss": 0.4075, "num_input_tokens_seen": 5310365303, "step": 1367, "train_runtime": 36831.0327, "train_tokens_per_second": 144181.819 }, { "epoch": 0.48527846754168147, "grad_norm": 0.2676451802253723, "learning_rate": 2.1015861763037274e-05, "loss": 0.401, "num_input_tokens_seen": 5314213346, "step": 1368, "train_runtime": 36858.9374, "train_tokens_per_second": 144177.063 }, { "epoch": 0.4856332032635686, "grad_norm": 0.2777988314628601, "learning_rate": 2.099355372227766e-05, "loss": 0.4197, "num_input_tokens_seen": 5318116259, "step": 1369, "train_runtime": 36878.9028, "train_tokens_per_second": 144204.839 }, { "epoch": 0.48598793898545584, "grad_norm": 0.34371787309646606, "learning_rate": 2.097124444228897e-05, "loss": 0.3941, "num_input_tokens_seen": 5322009962, "step": 1370, "train_runtime": 36906.4196, "train_tokens_per_second": 144202.825 }, { "epoch": 0.48634267470734305, "grad_norm": 0.24753466248512268, "learning_rate": 2.0948933950896895e-05, "loss": 0.4005, "num_input_tokens_seen": 5325896528, "step": 1371, "train_runtime": 36929.0015, "train_tokens_per_second": 144219.89 }, { "epoch": 0.4866974104292302, "grad_norm": 0.2790910303592682, "learning_rate": 2.092662227592863e-05, "loss": 0.4024, "num_input_tokens_seen": 5329797438, "step": 1372, "train_runtime": 36955.7562, "train_tokens_per_second": 144221.036 }, { "epoch": 0.4870521461511174, "grad_norm": 0.2540944218635559, "learning_rate": 2.090430944521284e-05, "loss": 0.4001, "num_input_tokens_seen": 5333681172, "step": 1373, "train_runtime": 36981.1656, "train_tokens_per_second": 144226.962 }, { "epoch": 0.4874068818730046, "grad_norm": 0.2579306364059448, "learning_rate": 2.0881995486579632e-05, "loss": 0.4015, "num_input_tokens_seen": 5337576568, "step": 1374, "train_runtime": 37007.2484, "train_tokens_per_second": 144230.571 }, { "epoch": 0.4877616175948918, "grad_norm": 0.26343291997909546, "learning_rate": 2.0859680427860524e-05, "loss": 0.4131, "num_input_tokens_seen": 5341494504, "step": 1375, "train_runtime": 37028.9585, "train_tokens_per_second": 144251.816 }, { "epoch": 0.488116353316779, "grad_norm": 0.2644374370574951, "learning_rate": 2.0837364296888402e-05, "loss": 0.4016, "num_input_tokens_seen": 5345331856, "step": 1376, "train_runtime": 37049.4192, "train_tokens_per_second": 144275.726 }, { "epoch": 0.4884710890386662, "grad_norm": 0.25947093963623047, "learning_rate": 2.081504712149749e-05, "loss": 0.3996, "num_input_tokens_seen": 5349171210, "step": 1377, "train_runtime": 37079.8475, "train_tokens_per_second": 144260.874 }, { "epoch": 0.48882582476055336, "grad_norm": 0.3295785188674927, "learning_rate": 2.0792728929523326e-05, "loss": 0.4084, "num_input_tokens_seen": 5353083370, "step": 1378, "train_runtime": 37101.9853, "train_tokens_per_second": 144280.241 }, { "epoch": 0.4891805604824406, "grad_norm": 0.2471219301223755, "learning_rate": 2.077040974880269e-05, "loss": 0.406, "num_input_tokens_seen": 5356924996, "step": 1379, "train_runtime": 37121.7516, "train_tokens_per_second": 144306.903 }, { "epoch": 0.4895352962043278, "grad_norm": 0.6253356337547302, "learning_rate": 2.0748089607173622e-05, "loss": 0.4069, "num_input_tokens_seen": 5360783720, "step": 1380, "train_runtime": 37156.4369, "train_tokens_per_second": 144276.044 }, { "epoch": 0.489890031926215, "grad_norm": 0.3539663255214691, "learning_rate": 2.0725768532475353e-05, "loss": 0.3926, "num_input_tokens_seen": 5364674098, "step": 1381, "train_runtime": 37189.6579, "train_tokens_per_second": 144251.773 }, { "epoch": 0.49024476764810215, "grad_norm": 0.48141607642173767, "learning_rate": 2.0703446552548257e-05, "loss": 0.4112, "num_input_tokens_seen": 5368535118, "step": 1382, "train_runtime": 37227.0175, "train_tokens_per_second": 144210.723 }, { "epoch": 0.49059950336998936, "grad_norm": 0.252078652381897, "learning_rate": 2.068112369523387e-05, "loss": 0.4097, "num_input_tokens_seen": 5372368247, "step": 1383, "train_runtime": 37246.5969, "train_tokens_per_second": 144237.828 }, { "epoch": 0.4909542390918766, "grad_norm": 0.3395332992076874, "learning_rate": 2.0658799988374798e-05, "loss": 0.4162, "num_input_tokens_seen": 5376310421, "step": 1384, "train_runtime": 37270.2493, "train_tokens_per_second": 144252.065 }, { "epoch": 0.49130897481376373, "grad_norm": 0.3525991439819336, "learning_rate": 2.0636475459814725e-05, "loss": 0.3986, "num_input_tokens_seen": 5380097215, "step": 1385, "train_runtime": 37296.3947, "train_tokens_per_second": 144252.474 }, { "epoch": 0.49166371053565094, "grad_norm": 0.8011724352836609, "learning_rate": 2.0614150137398346e-05, "loss": 0.3995, "num_input_tokens_seen": 5383993064, "step": 1386, "train_runtime": 37315.4969, "train_tokens_per_second": 144283.033 }, { "epoch": 0.49201844625753816, "grad_norm": 0.31038326025009155, "learning_rate": 2.059182404897135e-05, "loss": 0.4161, "num_input_tokens_seen": 5387842561, "step": 1387, "train_runtime": 37342.0886, "train_tokens_per_second": 144283.375 }, { "epoch": 0.4923731819794253, "grad_norm": 0.3020518720149994, "learning_rate": 2.0569497222380384e-05, "loss": 0.3973, "num_input_tokens_seen": 5391782595, "step": 1388, "train_runtime": 37364.3932, "train_tokens_per_second": 144302.694 }, { "epoch": 0.4927279177013125, "grad_norm": 0.3200208246707916, "learning_rate": 2.0547169685473004e-05, "loss": 0.4075, "num_input_tokens_seen": 5395618138, "step": 1389, "train_runtime": 37384.8521, "train_tokens_per_second": 144326.32 }, { "epoch": 0.49308265342319973, "grad_norm": 0.2724689245223999, "learning_rate": 2.052484146609767e-05, "loss": 0.4002, "num_input_tokens_seen": 5399520134, "step": 1390, "train_runtime": 37412.4116, "train_tokens_per_second": 144324.3 }, { "epoch": 0.4934373891450869, "grad_norm": 0.2326410561800003, "learning_rate": 2.0502512592103693e-05, "loss": 0.4092, "num_input_tokens_seen": 5403392659, "step": 1391, "train_runtime": 37433.3649, "train_tokens_per_second": 144346.966 }, { "epoch": 0.4937921248669741, "grad_norm": 0.4600861966609955, "learning_rate": 2.0480183091341164e-05, "loss": 0.4101, "num_input_tokens_seen": 5407205643, "step": 1392, "train_runtime": 37461.3707, "train_tokens_per_second": 144340.838 }, { "epoch": 0.4941468605888613, "grad_norm": 0.3090153932571411, "learning_rate": 2.0457852991661012e-05, "loss": 0.4088, "num_input_tokens_seen": 5411182588, "step": 1393, "train_runtime": 37495.875, "train_tokens_per_second": 144314.077 }, { "epoch": 0.49450159631074847, "grad_norm": 0.3785208761692047, "learning_rate": 2.0435522320914886e-05, "loss": 0.4063, "num_input_tokens_seen": 5415047334, "step": 1394, "train_runtime": 37522.0994, "train_tokens_per_second": 144316.214 }, { "epoch": 0.4948563320326357, "grad_norm": 0.28021863102912903, "learning_rate": 2.0413191106955123e-05, "loss": 0.4061, "num_input_tokens_seen": 5418987783, "step": 1395, "train_runtime": 37548.116, "train_tokens_per_second": 144321.164 }, { "epoch": 0.4952110677545229, "grad_norm": 0.38613012433052063, "learning_rate": 2.039085937763478e-05, "loss": 0.3985, "num_input_tokens_seen": 5422878372, "step": 1396, "train_runtime": 37577.5861, "train_tokens_per_second": 144311.515 }, { "epoch": 0.49556580347641005, "grad_norm": 0.4293982982635498, "learning_rate": 2.036852716080753e-05, "loss": 0.4054, "num_input_tokens_seen": 5426755228, "step": 1397, "train_runtime": 37600.7941, "train_tokens_per_second": 144325.548 }, { "epoch": 0.49592053919829726, "grad_norm": 0.9721444249153137, "learning_rate": 2.0346194484327658e-05, "loss": 0.4014, "num_input_tokens_seen": 5430624485, "step": 1398, "train_runtime": 37634.1113, "train_tokens_per_second": 144300.591 }, { "epoch": 0.4962752749201845, "grad_norm": 0.27938297390937805, "learning_rate": 2.0323861376050035e-05, "loss": 0.4122, "num_input_tokens_seen": 5434521556, "step": 1399, "train_runtime": 37653.8166, "train_tokens_per_second": 144328.571 }, { "epoch": 0.4966300106420717, "grad_norm": 0.2548452615737915, "learning_rate": 2.0301527863830054e-05, "loss": 0.4104, "num_input_tokens_seen": 5438417857, "step": 1400, "train_runtime": 37678.4893, "train_tokens_per_second": 144337.471 }, { "epoch": 0.49698474636395884, "grad_norm": 0.36232176423072815, "learning_rate": 2.0279193975523625e-05, "loss": 0.3829, "num_input_tokens_seen": 5442317412, "step": 1401, "train_runtime": 37825.2743, "train_tokens_per_second": 143880.448 }, { "epoch": 0.49733948208584605, "grad_norm": 0.4031754434108734, "learning_rate": 2.025685973898712e-05, "loss": 0.4005, "num_input_tokens_seen": 5446155348, "step": 1402, "train_runtime": 37852.0914, "train_tokens_per_second": 143879.906 }, { "epoch": 0.49769421780773326, "grad_norm": 0.3048885762691498, "learning_rate": 2.0234525182077344e-05, "loss": 0.4107, "num_input_tokens_seen": 5450062306, "step": 1403, "train_runtime": 37872.8063, "train_tokens_per_second": 143904.369 }, { "epoch": 0.4980489535296204, "grad_norm": 0.3332984149456024, "learning_rate": 2.0212190332651508e-05, "loss": 0.4083, "num_input_tokens_seen": 5453959028, "step": 1404, "train_runtime": 37905.731, "train_tokens_per_second": 143882.175 }, { "epoch": 0.49840368925150763, "grad_norm": 0.24496692419052124, "learning_rate": 2.0189855218567184e-05, "loss": 0.412, "num_input_tokens_seen": 5457872250, "step": 1405, "train_runtime": 37933.1629, "train_tokens_per_second": 143881.286 }, { "epoch": 0.49875842497339484, "grad_norm": 0.35448384284973145, "learning_rate": 2.016751986768227e-05, "loss": 0.4047, "num_input_tokens_seen": 5461730117, "step": 1406, "train_runtime": 37965.0884, "train_tokens_per_second": 143861.91 }, { "epoch": 0.499113160695282, "grad_norm": 0.28985846042633057, "learning_rate": 2.0145184307854966e-05, "loss": 0.408, "num_input_tokens_seen": 5465608802, "step": 1407, "train_runtime": 38000.474, "train_tokens_per_second": 143830.016 }, { "epoch": 0.4994678964171692, "grad_norm": 0.28923770785331726, "learning_rate": 2.012284856694373e-05, "loss": 0.4103, "num_input_tokens_seen": 5469493933, "step": 1408, "train_runtime": 38028.1344, "train_tokens_per_second": 143827.564 }, { "epoch": 0.4998226321390564, "grad_norm": 0.32351604104042053, "learning_rate": 2.010051267280725e-05, "loss": 0.4174, "num_input_tokens_seen": 5473337138, "step": 1409, "train_runtime": 38066.2618, "train_tokens_per_second": 143784.466 }, { "epoch": 0.5001773678609436, "grad_norm": 0.2792946994304657, "learning_rate": 2.0078176653304394e-05, "loss": 0.404, "num_input_tokens_seen": 5477284422, "step": 1410, "train_runtime": 38092.1811, "train_tokens_per_second": 143790.255 }, { "epoch": 0.5005321035828308, "grad_norm": 0.3025364279747009, "learning_rate": 2.00558405362942e-05, "loss": 0.4037, "num_input_tokens_seen": 5481156582, "step": 1411, "train_runtime": 38115.4402, "train_tokens_per_second": 143804.1 }, { "epoch": 0.500886839304718, "grad_norm": 0.26406988501548767, "learning_rate": 2.0033504349635825e-05, "loss": 0.4024, "num_input_tokens_seen": 5484980601, "step": 1412, "train_runtime": 38142.0197, "train_tokens_per_second": 143804.147 }, { "epoch": 0.5012415750266052, "grad_norm": 0.33043596148490906, "learning_rate": 2.0011168121188492e-05, "loss": 0.4126, "num_input_tokens_seen": 5488831233, "step": 1413, "train_runtime": 38166.1195, "train_tokens_per_second": 143814.234 }, { "epoch": 0.5015963107484924, "grad_norm": 0.42783525586128235, "learning_rate": 1.9988831878811515e-05, "loss": 0.4076, "num_input_tokens_seen": 5492753303, "step": 1414, "train_runtime": 38186.746, "train_tokens_per_second": 143839.261 }, { "epoch": 0.5019510464703796, "grad_norm": 0.3128156363964081, "learning_rate": 1.9966495650364185e-05, "loss": 0.4125, "num_input_tokens_seen": 5496677400, "step": 1415, "train_runtime": 38213.4113, "train_tokens_per_second": 143841.578 }, { "epoch": 0.5023057821922667, "grad_norm": 0.3217310607433319, "learning_rate": 1.9944159463705804e-05, "loss": 0.4107, "num_input_tokens_seen": 5500430269, "step": 1416, "train_runtime": 38240.4864, "train_tokens_per_second": 143837.874 }, { "epoch": 0.502660517914154, "grad_norm": 0.3517940640449524, "learning_rate": 1.992182334669561e-05, "loss": 0.4041, "num_input_tokens_seen": 5504288435, "step": 1417, "train_runtime": 38268.9704, "train_tokens_per_second": 143831.631 }, { "epoch": 0.5030152536360412, "grad_norm": 0.2717674672603607, "learning_rate": 1.9899487327192757e-05, "loss": 0.3979, "num_input_tokens_seen": 5508184724, "step": 1418, "train_runtime": 38288.116, "train_tokens_per_second": 143861.472 }, { "epoch": 0.5033699893579283, "grad_norm": 0.2727733850479126, "learning_rate": 1.9877151433056273e-05, "loss": 0.4065, "num_input_tokens_seen": 5512188729, "step": 1419, "train_runtime": 38308.4105, "train_tokens_per_second": 143889.779 }, { "epoch": 0.5037247250798156, "grad_norm": 0.2991688549518585, "learning_rate": 1.985481569214504e-05, "loss": 0.4028, "num_input_tokens_seen": 5516053811, "step": 1420, "train_runtime": 38334.6447, "train_tokens_per_second": 143892.134 }, { "epoch": 0.5040794608017027, "grad_norm": 0.2680800259113312, "learning_rate": 1.983248013231774e-05, "loss": 0.4143, "num_input_tokens_seen": 5519929764, "step": 1421, "train_runtime": 38366.388, "train_tokens_per_second": 143874.106 }, { "epoch": 0.5044341965235899, "grad_norm": 0.3648293912410736, "learning_rate": 1.9810144781432826e-05, "loss": 0.4098, "num_input_tokens_seen": 5523746154, "step": 1422, "train_runtime": 38392.4824, "train_tokens_per_second": 143875.723 }, { "epoch": 0.5047889322454772, "grad_norm": 0.42201557755470276, "learning_rate": 1.9787809667348496e-05, "loss": 0.406, "num_input_tokens_seen": 5527625768, "step": 1423, "train_runtime": 38414.3652, "train_tokens_per_second": 143894.758 }, { "epoch": 0.5051436679673643, "grad_norm": 0.3261992037296295, "learning_rate": 1.9765474817922662e-05, "loss": 0.4115, "num_input_tokens_seen": 5531540366, "step": 1424, "train_runtime": 38440.8634, "train_tokens_per_second": 143897.402 }, { "epoch": 0.5054984036892515, "grad_norm": 0.329378604888916, "learning_rate": 1.9743140261012884e-05, "loss": 0.4085, "num_input_tokens_seen": 5535427610, "step": 1425, "train_runtime": 38465.8317, "train_tokens_per_second": 143905.054 }, { "epoch": 0.5058531394111387, "grad_norm": 0.3409455120563507, "learning_rate": 1.972080602447638e-05, "loss": 0.3995, "num_input_tokens_seen": 5539400907, "step": 1426, "train_runtime": 38497.021, "train_tokens_per_second": 143891.677 }, { "epoch": 0.5062078751330259, "grad_norm": 0.3330079913139343, "learning_rate": 1.9698472136169953e-05, "loss": 0.4148, "num_input_tokens_seen": 5543202100, "step": 1427, "train_runtime": 38516.1551, "train_tokens_per_second": 143918.885 }, { "epoch": 0.506562610854913, "grad_norm": 0.31920886039733887, "learning_rate": 1.967613862394997e-05, "loss": 0.379, "num_input_tokens_seen": 5547120123, "step": 1428, "train_runtime": 38537.0931, "train_tokens_per_second": 143942.36 }, { "epoch": 0.5069173465768003, "grad_norm": 0.27143266797065735, "learning_rate": 1.965380551567235e-05, "loss": 0.4099, "num_input_tokens_seen": 5551039667, "step": 1429, "train_runtime": 38562.4874, "train_tokens_per_second": 143949.212 }, { "epoch": 0.5072720822986875, "grad_norm": 0.31961044669151306, "learning_rate": 1.963147283919248e-05, "loss": 0.4165, "num_input_tokens_seen": 5554905497, "step": 1430, "train_runtime": 38582.2789, "train_tokens_per_second": 143975.567 }, { "epoch": 0.5076268180205746, "grad_norm": 0.40358197689056396, "learning_rate": 1.9609140622365225e-05, "loss": 0.4051, "num_input_tokens_seen": 5558721425, "step": 1431, "train_runtime": 38605.1481, "train_tokens_per_second": 143989.123 }, { "epoch": 0.5079815537424619, "grad_norm": 0.27690017223358154, "learning_rate": 1.958680889304488e-05, "loss": 0.4091, "num_input_tokens_seen": 5562634124, "step": 1432, "train_runtime": 38631.0963, "train_tokens_per_second": 143993.69 }, { "epoch": 0.5083362894643491, "grad_norm": 0.38954538106918335, "learning_rate": 1.9564477679085127e-05, "loss": 0.4028, "num_input_tokens_seen": 5566507109, "step": 1433, "train_runtime": 38662.2231, "train_tokens_per_second": 143977.937 }, { "epoch": 0.5086910251862362, "grad_norm": 0.34156179428100586, "learning_rate": 1.954214700833899e-05, "loss": 0.4181, "num_input_tokens_seen": 5570415921, "step": 1434, "train_runtime": 38699.3272, "train_tokens_per_second": 143940.898 }, { "epoch": 0.5090457609081235, "grad_norm": 0.3504965305328369, "learning_rate": 1.951981690865884e-05, "loss": 0.4091, "num_input_tokens_seen": 5574283407, "step": 1435, "train_runtime": 38731.5269, "train_tokens_per_second": 143921.086 }, { "epoch": 0.5094004966300106, "grad_norm": 0.3118722438812256, "learning_rate": 1.9497487407896317e-05, "loss": 0.4071, "num_input_tokens_seen": 5578159614, "step": 1436, "train_runtime": 38753.532, "train_tokens_per_second": 143939.386 }, { "epoch": 0.5097552323518978, "grad_norm": 0.2730555832386017, "learning_rate": 1.947515853390233e-05, "loss": 0.401, "num_input_tokens_seen": 5582036482, "step": 1437, "train_runtime": 38781.0192, "train_tokens_per_second": 143937.333 }, { "epoch": 0.5101099680737851, "grad_norm": 0.32884350419044495, "learning_rate": 1.9452830314526996e-05, "loss": 0.3996, "num_input_tokens_seen": 5585921196, "step": 1438, "train_runtime": 38810.9412, "train_tokens_per_second": 143926.455 }, { "epoch": 0.5104647037956722, "grad_norm": 0.3353901505470276, "learning_rate": 1.943050277761963e-05, "loss": 0.402, "num_input_tokens_seen": 5589843986, "step": 1439, "train_runtime": 38830.6312, "train_tokens_per_second": 143954.497 }, { "epoch": 0.5108194395175594, "grad_norm": 0.28526920080184937, "learning_rate": 1.9408175951028657e-05, "loss": 0.4207, "num_input_tokens_seen": 5593687121, "step": 1440, "train_runtime": 38854.4094, "train_tokens_per_second": 143965.311 }, { "epoch": 0.5111741752394466, "grad_norm": 0.271196186542511, "learning_rate": 1.9385849862601657e-05, "loss": 0.4056, "num_input_tokens_seen": 5597563550, "step": 1441, "train_runtime": 38879.7012, "train_tokens_per_second": 143971.362 }, { "epoch": 0.5115289109613338, "grad_norm": 0.3360932469367981, "learning_rate": 1.9363524540185278e-05, "loss": 0.4135, "num_input_tokens_seen": 5601472107, "step": 1442, "train_runtime": 38905.3787, "train_tokens_per_second": 143976.805 }, { "epoch": 0.511883646683221, "grad_norm": 0.2699111998081207, "learning_rate": 1.9341200011625202e-05, "loss": 0.3976, "num_input_tokens_seen": 5605299151, "step": 1443, "train_runtime": 38926.0877, "train_tokens_per_second": 143998.524 }, { "epoch": 0.5122383824051082, "grad_norm": 0.336609810590744, "learning_rate": 1.9318876304766134e-05, "loss": 0.413, "num_input_tokens_seen": 5609281666, "step": 1444, "train_runtime": 38952.879, "train_tokens_per_second": 144001.722 }, { "epoch": 0.5125931181269954, "grad_norm": 0.28702065348625183, "learning_rate": 1.929655344745175e-05, "loss": 0.4065, "num_input_tokens_seen": 5613081430, "step": 1445, "train_runtime": 38986.5288, "train_tokens_per_second": 143974.896 }, { "epoch": 0.5129478538488825, "grad_norm": 0.3304135501384735, "learning_rate": 1.927423146752466e-05, "loss": 0.4122, "num_input_tokens_seen": 5616981789, "step": 1446, "train_runtime": 39016.1784, "train_tokens_per_second": 143965.453 }, { "epoch": 0.5133025895707698, "grad_norm": 0.2704916000366211, "learning_rate": 1.925191039282638e-05, "loss": 0.4076, "num_input_tokens_seen": 5620831267, "step": 1447, "train_runtime": 39040.9464, "train_tokens_per_second": 143972.721 }, { "epoch": 0.513657325292657, "grad_norm": 0.29761219024658203, "learning_rate": 1.9229590251197313e-05, "loss": 0.399, "num_input_tokens_seen": 5624681765, "step": 1448, "train_runtime": 39068.7367, "train_tokens_per_second": 143968.867 }, { "epoch": 0.5140120610145442, "grad_norm": 0.5597415566444397, "learning_rate": 1.920727107047668e-05, "loss": 0.4103, "num_input_tokens_seen": 5628618347, "step": 1449, "train_runtime": 39095.408, "train_tokens_per_second": 143971.342 }, { "epoch": 0.5143667967364314, "grad_norm": 0.24471384286880493, "learning_rate": 1.918495287850251e-05, "loss": 0.4044, "num_input_tokens_seen": 5632495502, "step": 1450, "train_runtime": 39116.2184, "train_tokens_per_second": 143993.866 }, { "epoch": 0.5147215324583185, "grad_norm": 0.2966848909854889, "learning_rate": 1.9162635703111608e-05, "loss": 0.4044, "num_input_tokens_seen": 5636345287, "step": 1451, "train_runtime": 39136.8884, "train_tokens_per_second": 144016.183 }, { "epoch": 0.5150762681802058, "grad_norm": 0.3883833587169647, "learning_rate": 1.9140319572139482e-05, "loss": 0.4138, "num_input_tokens_seen": 5640292082, "step": 1452, "train_runtime": 39158.0534, "train_tokens_per_second": 144039.133 }, { "epoch": 0.515431003902093, "grad_norm": 0.2775777280330658, "learning_rate": 1.9118004513420374e-05, "loss": 0.4023, "num_input_tokens_seen": 5644097113, "step": 1453, "train_runtime": 39184.1921, "train_tokens_per_second": 144040.155 }, { "epoch": 0.5157857396239801, "grad_norm": 0.2838282585144043, "learning_rate": 1.9095690554787167e-05, "loss": 0.3848, "num_input_tokens_seen": 5647954059, "step": 1454, "train_runtime": 39205.6922, "train_tokens_per_second": 144059.542 }, { "epoch": 0.5161404753458674, "grad_norm": 0.25941258668899536, "learning_rate": 1.9073377724071373e-05, "loss": 0.3995, "num_input_tokens_seen": 5651830230, "step": 1455, "train_runtime": 39230.1683, "train_tokens_per_second": 144068.468 }, { "epoch": 0.5164952110677545, "grad_norm": 0.33907854557037354, "learning_rate": 1.9051066049103105e-05, "loss": 0.4055, "num_input_tokens_seen": 5655755633, "step": 1456, "train_runtime": 39250.5032, "train_tokens_per_second": 144093.838 }, { "epoch": 0.5168499467896417, "grad_norm": 0.2564164698123932, "learning_rate": 1.9028755557711043e-05, "loss": 0.4084, "num_input_tokens_seen": 5659561955, "step": 1457, "train_runtime": 39276.1572, "train_tokens_per_second": 144096.632 }, { "epoch": 0.517204682511529, "grad_norm": 0.28565847873687744, "learning_rate": 1.9006446277722355e-05, "loss": 0.411, "num_input_tokens_seen": 5663442715, "step": 1458, "train_runtime": 39298.8054, "train_tokens_per_second": 144112.338 }, { "epoch": 0.5175594182334161, "grad_norm": 0.2758665680885315, "learning_rate": 1.898413823696273e-05, "loss": 0.4025, "num_input_tokens_seen": 5667332099, "step": 1459, "train_runtime": 39325.7485, "train_tokens_per_second": 144112.504 }, { "epoch": 0.5179141539553033, "grad_norm": 0.34617215394973755, "learning_rate": 1.8961831463256305e-05, "loss": 0.3935, "num_input_tokens_seen": 5671239799, "step": 1460, "train_runtime": 39352.3772, "train_tokens_per_second": 144114.288 }, { "epoch": 0.5182688896771905, "grad_norm": 0.8062020540237427, "learning_rate": 1.893952598442563e-05, "loss": 0.41, "num_input_tokens_seen": 5675158639, "step": 1461, "train_runtime": 39372.3969, "train_tokens_per_second": 144140.542 }, { "epoch": 0.5186236253990777, "grad_norm": 0.24629850685596466, "learning_rate": 1.8917221828291652e-05, "loss": 0.3976, "num_input_tokens_seen": 5679028034, "step": 1462, "train_runtime": 39402.2616, "train_tokens_per_second": 144129.494 }, { "epoch": 0.5189783611209648, "grad_norm": 0.368761271238327, "learning_rate": 1.8894919022673655e-05, "loss": 0.4081, "num_input_tokens_seen": 5682869073, "step": 1463, "train_runtime": 39429.6379, "train_tokens_per_second": 144126.839 }, { "epoch": 0.5193330968428521, "grad_norm": 0.2330440729856491, "learning_rate": 1.8872617595389246e-05, "loss": 0.3954, "num_input_tokens_seen": 5686764579, "step": 1464, "train_runtime": 39448.6788, "train_tokens_per_second": 144156.021 }, { "epoch": 0.5196878325647393, "grad_norm": 0.27651447057724, "learning_rate": 1.885031757425431e-05, "loss": 0.393, "num_input_tokens_seen": 5690651841, "step": 1465, "train_runtime": 39469.7227, "train_tokens_per_second": 144177.649 }, { "epoch": 0.5200425682866264, "grad_norm": 0.29954302310943604, "learning_rate": 1.8828018987082973e-05, "loss": 0.4049, "num_input_tokens_seen": 5694523072, "step": 1466, "train_runtime": 39496.2609, "train_tokens_per_second": 144178.789 }, { "epoch": 0.5203973040085137, "grad_norm": 0.2333865910768509, "learning_rate": 1.880572186168759e-05, "loss": 0.402, "num_input_tokens_seen": 5698389334, "step": 1467, "train_runtime": 39528.6179, "train_tokens_per_second": 144158.578 }, { "epoch": 0.5207520397304009, "grad_norm": 0.2650558650493622, "learning_rate": 1.878342622587867e-05, "loss": 0.41, "num_input_tokens_seen": 5702303871, "step": 1468, "train_runtime": 39560.7356, "train_tokens_per_second": 144140.492 }, { "epoch": 0.521106775452288, "grad_norm": 0.29068347811698914, "learning_rate": 1.8761132107464883e-05, "loss": 0.405, "num_input_tokens_seen": 5706239517, "step": 1469, "train_runtime": 39581.229, "train_tokens_per_second": 144165.294 }, { "epoch": 0.5214615111741753, "grad_norm": 0.2792675197124481, "learning_rate": 1.8738839534252998e-05, "loss": 0.4094, "num_input_tokens_seen": 5710034901, "step": 1470, "train_runtime": 39604.0404, "train_tokens_per_second": 144178.09 }, { "epoch": 0.5218162468960624, "grad_norm": 0.24885092675685883, "learning_rate": 1.8716548534047853e-05, "loss": 0.4033, "num_input_tokens_seen": 5714005508, "step": 1471, "train_runtime": 39631.624, "train_tokens_per_second": 144177.93 }, { "epoch": 0.5221709826179496, "grad_norm": 0.3278358280658722, "learning_rate": 1.869425913465233e-05, "loss": 0.3987, "num_input_tokens_seen": 5717834401, "step": 1472, "train_runtime": 39653.3411, "train_tokens_per_second": 144195.527 }, { "epoch": 0.5225257183398369, "grad_norm": 0.24406498670578003, "learning_rate": 1.8671971363867317e-05, "loss": 0.4043, "num_input_tokens_seen": 5721778509, "step": 1473, "train_runtime": 39685.2988, "train_tokens_per_second": 144178.794 }, { "epoch": 0.522880454061724, "grad_norm": 0.3074008822441101, "learning_rate": 1.8649685249491658e-05, "loss": 0.4111, "num_input_tokens_seen": 5725667983, "step": 1474, "train_runtime": 39707.5552, "train_tokens_per_second": 144195.933 }, { "epoch": 0.5232351897836112, "grad_norm": 0.3589054048061371, "learning_rate": 1.8627400819322146e-05, "loss": 0.406, "num_input_tokens_seen": 5729537812, "step": 1475, "train_runtime": 39730.6381, "train_tokens_per_second": 144209.559 }, { "epoch": 0.5235899255054984, "grad_norm": 0.238463893532753, "learning_rate": 1.8605118101153476e-05, "loss": 0.4069, "num_input_tokens_seen": 5733424413, "step": 1476, "train_runtime": 39769.256, "train_tokens_per_second": 144167.253 }, { "epoch": 0.5239446612273856, "grad_norm": 0.2561679780483246, "learning_rate": 1.858283712277818e-05, "loss": 0.399, "num_input_tokens_seen": 5737351608, "step": 1477, "train_runtime": 39788.0483, "train_tokens_per_second": 144197.865 }, { "epoch": 0.5242993969492727, "grad_norm": 0.29915305972099304, "learning_rate": 1.8560557911986644e-05, "loss": 0.4004, "num_input_tokens_seen": 5741215029, "step": 1478, "train_runtime": 39808.0045, "train_tokens_per_second": 144222.628 }, { "epoch": 0.52465413267116, "grad_norm": 0.43326207995414734, "learning_rate": 1.8538280496567045e-05, "loss": 0.3947, "num_input_tokens_seen": 5745045058, "step": 1479, "train_runtime": 39836.0283, "train_tokens_per_second": 144217.316 }, { "epoch": 0.5250088683930472, "grad_norm": 0.3759647607803345, "learning_rate": 1.8516004904305322e-05, "loss": 0.3964, "num_input_tokens_seen": 5748932914, "step": 1480, "train_runtime": 39867.771, "train_tokens_per_second": 144200.008 }, { "epoch": 0.5253636041149343, "grad_norm": 0.9726167917251587, "learning_rate": 1.8493731162985135e-05, "loss": 0.4089, "num_input_tokens_seen": 5752829584, "step": 1481, "train_runtime": 39892.9181, "train_tokens_per_second": 144206.788 }, { "epoch": 0.5257183398368216, "grad_norm": 0.2759690582752228, "learning_rate": 1.8471459300387846e-05, "loss": 0.4077, "num_input_tokens_seen": 5756702267, "step": 1482, "train_runtime": 39916.6555, "train_tokens_per_second": 144218.051 }, { "epoch": 0.5260730755587087, "grad_norm": 0.28375405073165894, "learning_rate": 1.8449189344292455e-05, "loss": 0.4116, "num_input_tokens_seen": 5760611415, "step": 1483, "train_runtime": 39949.5467, "train_tokens_per_second": 144197.166 }, { "epoch": 0.5264278112805959, "grad_norm": 0.2958325743675232, "learning_rate": 1.8426921322475596e-05, "loss": 0.4046, "num_input_tokens_seen": 5764530617, "step": 1484, "train_runtime": 39979.2917, "train_tokens_per_second": 144187.913 }, { "epoch": 0.5267825470024832, "grad_norm": 0.4105106592178345, "learning_rate": 1.8404655262711494e-05, "loss": 0.4146, "num_input_tokens_seen": 5768374425, "step": 1485, "train_runtime": 40010.8731, "train_tokens_per_second": 144170.171 }, { "epoch": 0.5271372827243703, "grad_norm": 0.2628815174102783, "learning_rate": 1.838239119277192e-05, "loss": 0.4075, "num_input_tokens_seen": 5772293368, "step": 1486, "train_runtime": 40035.3793, "train_tokens_per_second": 144179.81 }, { "epoch": 0.5274920184462576, "grad_norm": 0.31561917066574097, "learning_rate": 1.8360129140426163e-05, "loss": 0.4103, "num_input_tokens_seen": 5776130106, "step": 1487, "train_runtime": 40067.2874, "train_tokens_per_second": 144160.747 }, { "epoch": 0.5278467541681447, "grad_norm": 0.3631454408168793, "learning_rate": 1.8337869133441e-05, "loss": 0.4069, "num_input_tokens_seen": 5780003731, "step": 1488, "train_runtime": 40093.7386, "train_tokens_per_second": 144162.254 }, { "epoch": 0.5282014898900319, "grad_norm": 0.41451314091682434, "learning_rate": 1.831561119958066e-05, "loss": 0.4031, "num_input_tokens_seen": 5783892610, "step": 1489, "train_runtime": 40118.4814, "train_tokens_per_second": 144170.278 }, { "epoch": 0.5285562256119192, "grad_norm": 0.7642858028411865, "learning_rate": 1.829335536660677e-05, "loss": 0.4152, "num_input_tokens_seen": 5787778544, "step": 1490, "train_runtime": 40148.5313, "train_tokens_per_second": 144159.16 }, { "epoch": 0.5289109613338063, "grad_norm": 0.3261334300041199, "learning_rate": 1.827110166227835e-05, "loss": 0.4139, "num_input_tokens_seen": 5791684649, "step": 1491, "train_runtime": 40181.2626, "train_tokens_per_second": 144138.941 }, { "epoch": 0.5292656970556935, "grad_norm": 0.2963302731513977, "learning_rate": 1.8248850114351766e-05, "loss": 0.4042, "num_input_tokens_seen": 5795554438, "step": 1492, "train_runtime": 40207.0232, "train_tokens_per_second": 144142.838 }, { "epoch": 0.5296204327775808, "grad_norm": 0.3689391613006592, "learning_rate": 1.8226600750580688e-05, "loss": 0.4008, "num_input_tokens_seen": 5799350800, "step": 1493, "train_runtime": 40235.7868, "train_tokens_per_second": 144134.147 }, { "epoch": 0.5299751684994679, "grad_norm": 0.7913079261779785, "learning_rate": 1.820435359871607e-05, "loss": 0.3983, "num_input_tokens_seen": 5803250934, "step": 1494, "train_runtime": 40263.1901, "train_tokens_per_second": 144132.915 }, { "epoch": 0.5303299042213551, "grad_norm": 0.32644954323768616, "learning_rate": 1.8182108686506104e-05, "loss": 0.3982, "num_input_tokens_seen": 5807137190, "step": 1495, "train_runtime": 40289.7019, "train_tokens_per_second": 144134.529 }, { "epoch": 0.5306846399432423, "grad_norm": 0.2728475332260132, "learning_rate": 1.815986604169617e-05, "loss": 0.4003, "num_input_tokens_seen": 5810961476, "step": 1496, "train_runtime": 40317.385, "train_tokens_per_second": 144130.416 }, { "epoch": 0.5310393756651295, "grad_norm": 0.32384949922561646, "learning_rate": 1.8137625692028848e-05, "loss": 0.4088, "num_input_tokens_seen": 5814880449, "step": 1497, "train_runtime": 40340.4661, "train_tokens_per_second": 144145.098 }, { "epoch": 0.5313941113870166, "grad_norm": 0.29135560989379883, "learning_rate": 1.811538766524384e-05, "loss": 0.403, "num_input_tokens_seen": 5818748663, "step": 1498, "train_runtime": 40376.6468, "train_tokens_per_second": 144111.736 }, { "epoch": 0.5317488471089039, "grad_norm": 0.3274799585342407, "learning_rate": 1.8093151989077958e-05, "loss": 0.4133, "num_input_tokens_seen": 5822667913, "step": 1499, "train_runtime": 40398.8586, "train_tokens_per_second": 144129.515 }, { "epoch": 0.5321035828307911, "grad_norm": 0.27199772000312805, "learning_rate": 1.8070918691265075e-05, "loss": 0.4021, "num_input_tokens_seen": 5826540856, "step": 1500, "train_runtime": 40425.5812, "train_tokens_per_second": 144130.045 }, { "epoch": 0.5324583185526782, "grad_norm": 0.2876832187175751, "learning_rate": 1.8048687799536113e-05, "loss": 0.4131, "num_input_tokens_seen": 5830361254, "step": 1501, "train_runtime": 40455.6802, "train_tokens_per_second": 144117.247 }, { "epoch": 0.5328130542745655, "grad_norm": 0.30805331468582153, "learning_rate": 1.8026459341618964e-05, "loss": 0.4028, "num_input_tokens_seen": 5834296035, "step": 1502, "train_runtime": 40487.6821, "train_tokens_per_second": 144100.52 }, { "epoch": 0.5331677899964526, "grad_norm": 0.24543137848377228, "learning_rate": 1.8004233345238503e-05, "loss": 0.4161, "num_input_tokens_seen": 5838106199, "step": 1503, "train_runtime": 40506.7349, "train_tokens_per_second": 144126.803 }, { "epoch": 0.5335225257183398, "grad_norm": 0.28611651062965393, "learning_rate": 1.798200983811654e-05, "loss": 0.4129, "num_input_tokens_seen": 5841964495, "step": 1504, "train_runtime": 40532.4765, "train_tokens_per_second": 144130.46 }, { "epoch": 0.5338772614402271, "grad_norm": 0.26275157928466797, "learning_rate": 1.7959788847971772e-05, "loss": 0.4044, "num_input_tokens_seen": 5845872171, "step": 1505, "train_runtime": 40552.0083, "train_tokens_per_second": 144157.402 }, { "epoch": 0.5342319971621142, "grad_norm": 0.26633700728416443, "learning_rate": 1.793757040251976e-05, "loss": 0.3876, "num_input_tokens_seen": 5849802871, "step": 1506, "train_runtime": 40574.5495, "train_tokens_per_second": 144174.192 }, { "epoch": 0.5345867328840014, "grad_norm": 0.2590622007846832, "learning_rate": 1.7915354529472884e-05, "loss": 0.396, "num_input_tokens_seen": 5853743616, "step": 1507, "train_runtime": 40603.0445, "train_tokens_per_second": 144170.066 }, { "epoch": 0.5349414686058886, "grad_norm": 0.28848928213119507, "learning_rate": 1.7893141256540325e-05, "loss": 0.4007, "num_input_tokens_seen": 5857672706, "step": 1508, "train_runtime": 40631.9565, "train_tokens_per_second": 144164.18 }, { "epoch": 0.5352962043277758, "grad_norm": 0.2559303939342499, "learning_rate": 1.7870930611428013e-05, "loss": 0.4, "num_input_tokens_seen": 5861501596, "step": 1509, "train_runtime": 40652.1598, "train_tokens_per_second": 144186.72 }, { "epoch": 0.535650940049663, "grad_norm": 0.38295578956604004, "learning_rate": 1.7848722621838607e-05, "loss": 0.4052, "num_input_tokens_seen": 5865410054, "step": 1510, "train_runtime": 40682.962, "train_tokens_per_second": 144173.624 }, { "epoch": 0.5360056757715502, "grad_norm": 0.27155184745788574, "learning_rate": 1.7826517315471447e-05, "loss": 0.4095, "num_input_tokens_seen": 5869221418, "step": 1511, "train_runtime": 40707.1443, "train_tokens_per_second": 144181.605 }, { "epoch": 0.5363604114934374, "grad_norm": 0.24941319227218628, "learning_rate": 1.7804314720022532e-05, "loss": 0.4073, "num_input_tokens_seen": 5873156575, "step": 1512, "train_runtime": 40738.863, "train_tokens_per_second": 144165.942 }, { "epoch": 0.5367151472153245, "grad_norm": 0.2488413006067276, "learning_rate": 1.7782114863184485e-05, "loss": 0.4086, "num_input_tokens_seen": 5877008662, "step": 1513, "train_runtime": 40763.0363, "train_tokens_per_second": 144174.949 }, { "epoch": 0.5370698829372118, "grad_norm": 0.3095279037952423, "learning_rate": 1.7759917772646496e-05, "loss": 0.3885, "num_input_tokens_seen": 5880850551, "step": 1514, "train_runtime": 40786.1196, "train_tokens_per_second": 144187.547 }, { "epoch": 0.537424618659099, "grad_norm": 0.3074031472206116, "learning_rate": 1.7737723476094317e-05, "loss": 0.4025, "num_input_tokens_seen": 5884790339, "step": 1515, "train_runtime": 40817.7548, "train_tokens_per_second": 144172.318 }, { "epoch": 0.5377793543809861, "grad_norm": 0.394349068403244, "learning_rate": 1.7715532001210214e-05, "loss": 0.4142, "num_input_tokens_seen": 5888636612, "step": 1516, "train_runtime": 40845.3809, "train_tokens_per_second": 144168.973 }, { "epoch": 0.5381340901028734, "grad_norm": 0.36233124136924744, "learning_rate": 1.7693343375672932e-05, "loss": 0.4052, "num_input_tokens_seen": 5892532511, "step": 1517, "train_runtime": 40874.0688, "train_tokens_per_second": 144163.101 }, { "epoch": 0.5384888258247605, "grad_norm": 0.27233192324638367, "learning_rate": 1.767115762715767e-05, "loss": 0.4048, "num_input_tokens_seen": 5896359589, "step": 1518, "train_runtime": 40889.8349, "train_tokens_per_second": 144201.11 }, { "epoch": 0.5388435615466477, "grad_norm": 0.4003518521785736, "learning_rate": 1.7648974783336014e-05, "loss": 0.3954, "num_input_tokens_seen": 5900318505, "step": 1519, "train_runtime": 40910.5775, "train_tokens_per_second": 144224.767 }, { "epoch": 0.539198297268535, "grad_norm": 0.4997280538082123, "learning_rate": 1.762679487187597e-05, "loss": 0.4021, "num_input_tokens_seen": 5904150321, "step": 1520, "train_runtime": 40936.8578, "train_tokens_per_second": 144225.782 }, { "epoch": 0.5395530329904221, "grad_norm": 0.3076015114784241, "learning_rate": 1.7604617920441834e-05, "loss": 0.4117, "num_input_tokens_seen": 5908066855, "step": 1521, "train_runtime": 40962.1705, "train_tokens_per_second": 144232.271 }, { "epoch": 0.5399077687123093, "grad_norm": 0.26677802205085754, "learning_rate": 1.758244395669425e-05, "loss": 0.4124, "num_input_tokens_seen": 5911904429, "step": 1522, "train_runtime": 40987.1699, "train_tokens_per_second": 144237.927 }, { "epoch": 0.5402625044341965, "grad_norm": 0.38192254304885864, "learning_rate": 1.7560273008290126e-05, "loss": 0.4028, "num_input_tokens_seen": 5915776366, "step": 1523, "train_runtime": 41013.286, "train_tokens_per_second": 144240.487 }, { "epoch": 0.5406172401560837, "grad_norm": 0.2408948540687561, "learning_rate": 1.7538105102882598e-05, "loss": 0.4092, "num_input_tokens_seen": 5919638567, "step": 1524, "train_runtime": 41041.0166, "train_tokens_per_second": 144237.133 }, { "epoch": 0.5409719758779709, "grad_norm": 0.29991084337234497, "learning_rate": 1.7515940268121014e-05, "loss": 0.4133, "num_input_tokens_seen": 5923508463, "step": 1525, "train_runtime": 41067.7457, "train_tokens_per_second": 144237.487 }, { "epoch": 0.5413267115998581, "grad_norm": 0.23578938841819763, "learning_rate": 1.74937785316509e-05, "loss": 0.4076, "num_input_tokens_seen": 5927427291, "step": 1526, "train_runtime": 41098.6604, "train_tokens_per_second": 144224.343 }, { "epoch": 0.5416814473217453, "grad_norm": 0.4090564548969269, "learning_rate": 1.74716199211139e-05, "loss": 0.4025, "num_input_tokens_seen": 5931231311, "step": 1527, "train_runtime": 41121.8306, "train_tokens_per_second": 144235.586 }, { "epoch": 0.5420361830436325, "grad_norm": 0.23325422406196594, "learning_rate": 1.7449464464147774e-05, "loss": 0.4103, "num_input_tokens_seen": 5935114339, "step": 1528, "train_runtime": 41142.5046, "train_tokens_per_second": 144257.487 }, { "epoch": 0.5423909187655197, "grad_norm": 0.4540786147117615, "learning_rate": 1.7427312188386346e-05, "loss": 0.3956, "num_input_tokens_seen": 5939034105, "step": 1529, "train_runtime": 41168.9695, "train_tokens_per_second": 144259.965 }, { "epoch": 0.5427456544874069, "grad_norm": 0.3649185001850128, "learning_rate": 1.7405163121459462e-05, "loss": 0.3994, "num_input_tokens_seen": 5942999986, "step": 1530, "train_runtime": 41192.7236, "train_tokens_per_second": 144273.053 }, { "epoch": 0.5431003902092941, "grad_norm": 0.24790219962596893, "learning_rate": 1.7383017290992978e-05, "loss": 0.4057, "num_input_tokens_seen": 5946763053, "step": 1531, "train_runtime": 41225.2121, "train_tokens_per_second": 144250.636 }, { "epoch": 0.5434551259311813, "grad_norm": 0.23142871260643005, "learning_rate": 1.7360874724608715e-05, "loss": 0.4111, "num_input_tokens_seen": 5950721799, "step": 1532, "train_runtime": 41253.0042, "train_tokens_per_second": 144249.417 }, { "epoch": 0.5438098616530684, "grad_norm": 0.2961154282093048, "learning_rate": 1.7338735449924406e-05, "loss": 0.4047, "num_input_tokens_seen": 5954582565, "step": 1533, "train_runtime": 41281.8381, "train_tokens_per_second": 144242.186 }, { "epoch": 0.5441645973749557, "grad_norm": 0.2962127923965454, "learning_rate": 1.731659949455369e-05, "loss": 0.3958, "num_input_tokens_seen": 5958451681, "step": 1534, "train_runtime": 41308.235, "train_tokens_per_second": 144243.676 }, { "epoch": 0.5445193330968429, "grad_norm": 0.24817949533462524, "learning_rate": 1.729446688610607e-05, "loss": 0.4061, "num_input_tokens_seen": 5962390838, "step": 1535, "train_runtime": 41329.3727, "train_tokens_per_second": 144265.215 }, { "epoch": 0.54487406881873, "grad_norm": 0.2864895164966583, "learning_rate": 1.7272337652186867e-05, "loss": 0.3912, "num_input_tokens_seen": 5966270405, "step": 1536, "train_runtime": 41350.0616, "train_tokens_per_second": 144286.857 }, { "epoch": 0.5452288045406173, "grad_norm": 0.256242036819458, "learning_rate": 1.7250211820397196e-05, "loss": 0.4058, "num_input_tokens_seen": 5970186358, "step": 1537, "train_runtime": 41380.1541, "train_tokens_per_second": 144276.562 }, { "epoch": 0.5455835402625044, "grad_norm": 0.4587211310863495, "learning_rate": 1.7228089418333935e-05, "loss": 0.3836, "num_input_tokens_seen": 5974137748, "step": 1538, "train_runtime": 41407.5685, "train_tokens_per_second": 144276.468 }, { "epoch": 0.5459382759843916, "grad_norm": 0.3040269911289215, "learning_rate": 1.7205970473589662e-05, "loss": 0.4023, "num_input_tokens_seen": 5978019848, "step": 1539, "train_runtime": 41442.6199, "train_tokens_per_second": 144248.116 }, { "epoch": 0.5462930117062789, "grad_norm": 0.29923006892204285, "learning_rate": 1.7183855013752664e-05, "loss": 0.4117, "num_input_tokens_seen": 5981892873, "step": 1540, "train_runtime": 41469.8934, "train_tokens_per_second": 144246.642 }, { "epoch": 0.546647747428166, "grad_norm": 0.23424877226352692, "learning_rate": 1.7161743066406874e-05, "loss": 0.4076, "num_input_tokens_seen": 5985797252, "step": 1541, "train_runtime": 41502.2825, "train_tokens_per_second": 144228.146 }, { "epoch": 0.5470024831500532, "grad_norm": 0.25011181831359863, "learning_rate": 1.7139634659131848e-05, "loss": 0.4038, "num_input_tokens_seen": 5989734731, "step": 1542, "train_runtime": 41525.8224, "train_tokens_per_second": 144241.207 }, { "epoch": 0.5473572188719404, "grad_norm": 0.3316638171672821, "learning_rate": 1.7117529819502726e-05, "loss": 0.3953, "num_input_tokens_seen": 5993629241, "step": 1543, "train_runtime": 41558.0928, "train_tokens_per_second": 144222.914 }, { "epoch": 0.5477119545938276, "grad_norm": 0.2544938623905182, "learning_rate": 1.70954285750902e-05, "loss": 0.402, "num_input_tokens_seen": 5997527743, "step": 1544, "train_runtime": 41583.24, "train_tokens_per_second": 144229.448 }, { "epoch": 0.5480666903157148, "grad_norm": 0.25552380084991455, "learning_rate": 1.7073330953460462e-05, "loss": 0.4012, "num_input_tokens_seen": 6001442345, "step": 1545, "train_runtime": 41614.045, "train_tokens_per_second": 144216.75 }, { "epoch": 0.548421426037602, "grad_norm": 0.23428387939929962, "learning_rate": 1.70512369821752e-05, "loss": 0.4055, "num_input_tokens_seen": 6005317848, "step": 1546, "train_runtime": 41635.3499, "train_tokens_per_second": 144236.036 }, { "epoch": 0.5487761617594892, "grad_norm": 0.24128268659114838, "learning_rate": 1.702914668879155e-05, "loss": 0.3968, "num_input_tokens_seen": 6009260182, "step": 1547, "train_runtime": 41657.4982, "train_tokens_per_second": 144253.986 }, { "epoch": 0.5491308974813763, "grad_norm": 0.3031538128852844, "learning_rate": 1.700706010086206e-05, "loss": 0.406, "num_input_tokens_seen": 6013087915, "step": 1548, "train_runtime": 41681.026, "train_tokens_per_second": 144264.393 }, { "epoch": 0.5494856332032636, "grad_norm": 0.28149282932281494, "learning_rate": 1.6984977245934645e-05, "loss": 0.4077, "num_input_tokens_seen": 6017010424, "step": 1549, "train_runtime": 41712.5779, "train_tokens_per_second": 144249.306 }, { "epoch": 0.5498403689251508, "grad_norm": 0.35050463676452637, "learning_rate": 1.696289815155258e-05, "loss": 0.4032, "num_input_tokens_seen": 6020824506, "step": 1550, "train_runtime": 41732.4549, "train_tokens_per_second": 144271.994 }, { "epoch": 0.5501951046470379, "grad_norm": 0.25769004225730896, "learning_rate": 1.6940822845254438e-05, "loss": 0.4012, "num_input_tokens_seen": 6024751430, "step": 1551, "train_runtime": 41754.1343, "train_tokens_per_second": 144291.135 }, { "epoch": 0.5505498403689252, "grad_norm": 0.28601354360580444, "learning_rate": 1.6918751354574067e-05, "loss": 0.3959, "num_input_tokens_seen": 6028656679, "step": 1552, "train_runtime": 41780.4144, "train_tokens_per_second": 144293.846 }, { "epoch": 0.5509045760908123, "grad_norm": 0.2766757011413574, "learning_rate": 1.6896683707040567e-05, "loss": 0.4089, "num_input_tokens_seen": 6032479483, "step": 1553, "train_runtime": 41799.8264, "train_tokens_per_second": 144318.29 }, { "epoch": 0.5512593118126995, "grad_norm": 0.36491262912750244, "learning_rate": 1.6874619930178235e-05, "loss": 0.3997, "num_input_tokens_seen": 6036331939, "step": 1554, "train_runtime": 41826.2936, "train_tokens_per_second": 144319.074 }, { "epoch": 0.5516140475345868, "grad_norm": 0.2792539894580841, "learning_rate": 1.685256005150654e-05, "loss": 0.3933, "num_input_tokens_seen": 6040204173, "step": 1555, "train_runtime": 41858.7133, "train_tokens_per_second": 144299.805 }, { "epoch": 0.5519687832564739, "grad_norm": 0.25138598680496216, "learning_rate": 1.6830504098540098e-05, "loss": 0.3952, "num_input_tokens_seen": 6044174521, "step": 1556, "train_runtime": 41879.6063, "train_tokens_per_second": 144322.62 }, { "epoch": 0.5523235189783611, "grad_norm": 0.2944367527961731, "learning_rate": 1.6808452098788625e-05, "loss": 0.4086, "num_input_tokens_seen": 6048008047, "step": 1557, "train_runtime": 41899.1356, "train_tokens_per_second": 144346.845 }, { "epoch": 0.5526782547002483, "grad_norm": 0.4433019161224365, "learning_rate": 1.6786404079756883e-05, "loss": 0.3904, "num_input_tokens_seen": 6051898545, "step": 1558, "train_runtime": 41922.2114, "train_tokens_per_second": 144360.193 }, { "epoch": 0.5530329904221355, "grad_norm": 0.3129234313964844, "learning_rate": 1.6764360068944706e-05, "loss": 0.397, "num_input_tokens_seen": 6055808195, "step": 1559, "train_runtime": 41943.3955, "train_tokens_per_second": 144380.495 }, { "epoch": 0.5533877261440227, "grad_norm": 0.2574203610420227, "learning_rate": 1.6742320093846912e-05, "loss": 0.4175, "num_input_tokens_seen": 6059702671, "step": 1560, "train_runtime": 41969.4264, "train_tokens_per_second": 144383.738 }, { "epoch": 0.5537424618659099, "grad_norm": 0.27509605884552, "learning_rate": 1.6720284181953285e-05, "loss": 0.3974, "num_input_tokens_seen": 6063584982, "step": 1561, "train_runtime": 41992.7897, "train_tokens_per_second": 144395.86 }, { "epoch": 0.5540971975877971, "grad_norm": 0.20502659678459167, "learning_rate": 1.6698252360748535e-05, "loss": 0.4012, "num_input_tokens_seen": 6067545698, "step": 1562, "train_runtime": 42022.1438, "train_tokens_per_second": 144389.247 }, { "epoch": 0.5544519333096842, "grad_norm": 0.23883271217346191, "learning_rate": 1.6676224657712288e-05, "loss": 0.4134, "num_input_tokens_seen": 6071356102, "step": 1563, "train_runtime": 42044.675, "train_tokens_per_second": 144402.498 }, { "epoch": 0.5548066690315715, "grad_norm": 0.4201478660106659, "learning_rate": 1.665420110031901e-05, "loss": 0.3962, "num_input_tokens_seen": 6075266105, "step": 1564, "train_runtime": 42075.3402, "train_tokens_per_second": 144390.184 }, { "epoch": 0.5551614047534587, "grad_norm": 0.3998377025127411, "learning_rate": 1.6632181716038012e-05, "loss": 0.3981, "num_input_tokens_seen": 6079165689, "step": 1565, "train_runtime": 42101.0776, "train_tokens_per_second": 144394.539 }, { "epoch": 0.5555161404753459, "grad_norm": 0.21326018869876862, "learning_rate": 1.66101665323334e-05, "loss": 0.4013, "num_input_tokens_seen": 6083061265, "step": 1566, "train_runtime": 42129.0929, "train_tokens_per_second": 144390.986 }, { "epoch": 0.5558708761972331, "grad_norm": 0.27623307704925537, "learning_rate": 1.658815557666403e-05, "loss": 0.4018, "num_input_tokens_seen": 6086977612, "step": 1567, "train_runtime": 42156.4787, "train_tokens_per_second": 144390.087 }, { "epoch": 0.5562256119191202, "grad_norm": 0.5112008452415466, "learning_rate": 1.65661488764835e-05, "loss": 0.3901, "num_input_tokens_seen": 6090817810, "step": 1568, "train_runtime": 42173.2057, "train_tokens_per_second": 144423.875 }, { "epoch": 0.5565803476410075, "grad_norm": 0.2352496087551117, "learning_rate": 1.6544146459240087e-05, "loss": 0.3992, "num_input_tokens_seen": 6094723558, "step": 1569, "train_runtime": 42205.9992, "train_tokens_per_second": 144404.2 }, { "epoch": 0.5569350833628947, "grad_norm": 0.2269514799118042, "learning_rate": 1.652214835237673e-05, "loss": 0.4015, "num_input_tokens_seen": 6098627732, "step": 1570, "train_runtime": 42239.6504, "train_tokens_per_second": 144381.586 }, { "epoch": 0.5572898190847818, "grad_norm": 0.2572070360183716, "learning_rate": 1.650015458333099e-05, "loss": 0.4086, "num_input_tokens_seen": 6102516385, "step": 1571, "train_runtime": 42267.2486, "train_tokens_per_second": 144379.315 }, { "epoch": 0.5576445548066691, "grad_norm": 0.3000538647174835, "learning_rate": 1.647816517953503e-05, "loss": 0.3939, "num_input_tokens_seen": 6106364649, "step": 1572, "train_runtime": 42298.7223, "train_tokens_per_second": 144362.863 }, { "epoch": 0.5579992905285562, "grad_norm": 0.2900410592556, "learning_rate": 1.6456180168415546e-05, "loss": 0.3937, "num_input_tokens_seen": 6110272454, "step": 1573, "train_runtime": 42319.6258, "train_tokens_per_second": 144383.896 }, { "epoch": 0.5583540262504434, "grad_norm": 0.28004229068756104, "learning_rate": 1.6434199577393778e-05, "loss": 0.409, "num_input_tokens_seen": 6114196395, "step": 1574, "train_runtime": 42340.8839, "train_tokens_per_second": 144404.08 }, { "epoch": 0.5587087619723307, "grad_norm": 0.33586350083351135, "learning_rate": 1.6412223433885442e-05, "loss": 0.4119, "num_input_tokens_seen": 6118050546, "step": 1575, "train_runtime": 42361.831, "train_tokens_per_second": 144423.657 }, { "epoch": 0.5590634976942178, "grad_norm": 0.23077863454818726, "learning_rate": 1.63902517653007e-05, "loss": 0.397, "num_input_tokens_seen": 6121958724, "step": 1576, "train_runtime": 42389.0033, "train_tokens_per_second": 144423.276 }, { "epoch": 0.559418233416105, "grad_norm": 0.2997426390647888, "learning_rate": 1.636828459904414e-05, "loss": 0.3949, "num_input_tokens_seen": 6125817026, "step": 1577, "train_runtime": 42414.8348, "train_tokens_per_second": 144426.285 }, { "epoch": 0.5597729691379922, "grad_norm": 0.2171984314918518, "learning_rate": 1.634632196251474e-05, "loss": 0.3837, "num_input_tokens_seen": 6129732292, "step": 1578, "train_runtime": 42453.4619, "train_tokens_per_second": 144387.101 }, { "epoch": 0.5601277048598794, "grad_norm": 0.5718631744384766, "learning_rate": 1.6324363883105822e-05, "loss": 0.3989, "num_input_tokens_seen": 6133638171, "step": 1579, "train_runtime": 42477.999, "train_tokens_per_second": 144395.648 }, { "epoch": 0.5604824405817666, "grad_norm": 0.28490811586380005, "learning_rate": 1.630241038820502e-05, "loss": 0.4018, "num_input_tokens_seen": 6137496237, "step": 1580, "train_runtime": 42504.4845, "train_tokens_per_second": 144396.44 }, { "epoch": 0.5608371763036538, "grad_norm": 0.30123281478881836, "learning_rate": 1.628046150519426e-05, "loss": 0.4032, "num_input_tokens_seen": 6141418659, "step": 1581, "train_runtime": 42529.5452, "train_tokens_per_second": 144403.582 }, { "epoch": 0.561191912025541, "grad_norm": 0.3507060408592224, "learning_rate": 1.6258517261449716e-05, "loss": 0.4106, "num_input_tokens_seen": 6145297648, "step": 1582, "train_runtime": 42553.2045, "train_tokens_per_second": 144414.451 }, { "epoch": 0.5615466477474281, "grad_norm": 0.35466569662094116, "learning_rate": 1.623657768434175e-05, "loss": 0.404, "num_input_tokens_seen": 6149155607, "step": 1583, "train_runtime": 42583.7354, "train_tokens_per_second": 144401.508 }, { "epoch": 0.5619013834693154, "grad_norm": 0.27986398339271545, "learning_rate": 1.6214642801234937e-05, "loss": 0.4067, "num_input_tokens_seen": 6153115427, "step": 1584, "train_runtime": 42611.8436, "train_tokens_per_second": 144399.184 }, { "epoch": 0.5622561191912026, "grad_norm": 0.530474841594696, "learning_rate": 1.619271263948798e-05, "loss": 0.4008, "num_input_tokens_seen": 6156951694, "step": 1585, "train_runtime": 42641.001, "train_tokens_per_second": 144390.412 }, { "epoch": 0.5626108549130897, "grad_norm": 0.426332950592041, "learning_rate": 1.617078722645369e-05, "loss": 0.4141, "num_input_tokens_seen": 6160891661, "step": 1586, "train_runtime": 42672.549, "train_tokens_per_second": 144375.994 }, { "epoch": 0.562965590634977, "grad_norm": 0.3133285641670227, "learning_rate": 1.614886658947898e-05, "loss": 0.3994, "num_input_tokens_seen": 6164835474, "step": 1587, "train_runtime": 42695.1777, "train_tokens_per_second": 144391.845 }, { "epoch": 0.5633203263568641, "grad_norm": 0.2723548710346222, "learning_rate": 1.6126950755904785e-05, "loss": 0.4028, "num_input_tokens_seen": 6168698058, "step": 1588, "train_runtime": 42724.4696, "train_tokens_per_second": 144383.257 }, { "epoch": 0.5636750620787513, "grad_norm": 0.2812039256095886, "learning_rate": 1.6105039753066032e-05, "loss": 0.4064, "num_input_tokens_seen": 6172629613, "step": 1589, "train_runtime": 42758.6264, "train_tokens_per_second": 144359.867 }, { "epoch": 0.5640297978006386, "grad_norm": 0.3341984748840332, "learning_rate": 1.608313360829165e-05, "loss": 0.4045, "num_input_tokens_seen": 6176513585, "step": 1590, "train_runtime": 42778.5097, "train_tokens_per_second": 144383.561 }, { "epoch": 0.5643845335225257, "grad_norm": 0.2845912575721741, "learning_rate": 1.6061232348904515e-05, "loss": 0.394, "num_input_tokens_seen": 6180383137, "step": 1591, "train_runtime": 42804.9575, "train_tokens_per_second": 144384.751 }, { "epoch": 0.5647392692444129, "grad_norm": 0.24787037074565887, "learning_rate": 1.603933600222138e-05, "loss": 0.4102, "num_input_tokens_seen": 6184277946, "step": 1592, "train_runtime": 42836.2939, "train_tokens_per_second": 144370.051 }, { "epoch": 0.5650940049663001, "grad_norm": 0.44460466504096985, "learning_rate": 1.601744459555289e-05, "loss": 0.3905, "num_input_tokens_seen": 6188131940, "step": 1593, "train_runtime": 42865.4548, "train_tokens_per_second": 144361.747 }, { "epoch": 0.5654487406881873, "grad_norm": 0.22588367760181427, "learning_rate": 1.5995558156203536e-05, "loss": 0.3959, "num_input_tokens_seen": 6192062214, "step": 1594, "train_runtime": 42893.971, "train_tokens_per_second": 144357.402 }, { "epoch": 0.5658034764100744, "grad_norm": 0.2568758428096771, "learning_rate": 1.5973676711471586e-05, "loss": 0.3918, "num_input_tokens_seen": 6195962570, "step": 1595, "train_runtime": 42913.0187, "train_tokens_per_second": 144384.216 }, { "epoch": 0.5661582121319617, "grad_norm": 0.36549267172813416, "learning_rate": 1.59518002886491e-05, "loss": 0.4043, "num_input_tokens_seen": 6199945956, "step": 1596, "train_runtime": 42933.3477, "train_tokens_per_second": 144408.631 }, { "epoch": 0.5665129478538489, "grad_norm": 0.23572957515716553, "learning_rate": 1.592992891502187e-05, "loss": 0.3966, "num_input_tokens_seen": 6203850676, "step": 1597, "train_runtime": 42959.5653, "train_tokens_per_second": 144411.393 }, { "epoch": 0.566867683575736, "grad_norm": 0.21345362067222595, "learning_rate": 1.590806261786939e-05, "loss": 0.4045, "num_input_tokens_seen": 6207750342, "step": 1598, "train_runtime": 42979.6927, "train_tokens_per_second": 144434.498 }, { "epoch": 0.5672224192976233, "grad_norm": 0.3452296555042267, "learning_rate": 1.588620142446482e-05, "loss": 0.4053, "num_input_tokens_seen": 6211579235, "step": 1599, "train_runtime": 43005.0403, "train_tokens_per_second": 144438.401 }, { "epoch": 0.5675771550195104, "grad_norm": 0.24532006680965424, "learning_rate": 1.5864345362074963e-05, "loss": 0.4003, "num_input_tokens_seen": 6215483586, "step": 1600, "train_runtime": 43024.593, "train_tokens_per_second": 144463.507 }, { "epoch": 0.5679318907413976, "grad_norm": 0.25604483485221863, "learning_rate": 1.58424944579602e-05, "loss": 0.3941, "num_input_tokens_seen": 6219383931, "step": 1601, "train_runtime": 43138.3435, "train_tokens_per_second": 144172.989 }, { "epoch": 0.5682866264632849, "grad_norm": 0.26605191826820374, "learning_rate": 1.58206487393745e-05, "loss": 0.3964, "num_input_tokens_seen": 6223237324, "step": 1602, "train_runtime": 43163.9256, "train_tokens_per_second": 144176.815 }, { "epoch": 0.568641362185172, "grad_norm": 0.25254690647125244, "learning_rate": 1.5798808233565358e-05, "loss": 0.4029, "num_input_tokens_seen": 6227122530, "step": 1603, "train_runtime": 43185.7037, "train_tokens_per_second": 144194.073 }, { "epoch": 0.5689960979070593, "grad_norm": 0.261894166469574, "learning_rate": 1.577697296777377e-05, "loss": 0.3961, "num_input_tokens_seen": 6231013066, "step": 1604, "train_runtime": 43213.9812, "train_tokens_per_second": 144189.748 }, { "epoch": 0.5693508336289465, "grad_norm": 0.27477148175239563, "learning_rate": 1.5755142969234186e-05, "loss": 0.4016, "num_input_tokens_seen": 6234898049, "step": 1605, "train_runtime": 43236.8171, "train_tokens_per_second": 144203.447 }, { "epoch": 0.5697055693508336, "grad_norm": 0.20418761670589447, "learning_rate": 1.57333182651745e-05, "loss": 0.4089, "num_input_tokens_seen": 6238724186, "step": 1606, "train_runtime": 43264.817, "train_tokens_per_second": 144198.557 }, { "epoch": 0.5700603050727209, "grad_norm": 0.3419208824634552, "learning_rate": 1.5711498882815998e-05, "loss": 0.4022, "num_input_tokens_seen": 6242644879, "step": 1607, "train_runtime": 43291.3201, "train_tokens_per_second": 144200.844 }, { "epoch": 0.570415040794608, "grad_norm": 0.31883135437965393, "learning_rate": 1.5689684849373316e-05, "loss": 0.4016, "num_input_tokens_seen": 6246485024, "step": 1608, "train_runtime": 43316.6693, "train_tokens_per_second": 144205.109 }, { "epoch": 0.5707697765164952, "grad_norm": 0.22627516090869904, "learning_rate": 1.5667876192054428e-05, "loss": 0.3967, "num_input_tokens_seen": 6250430107, "step": 1609, "train_runtime": 43354.6487, "train_tokens_per_second": 144169.779 }, { "epoch": 0.5711245122383825, "grad_norm": 0.28581127524375916, "learning_rate": 1.564607293806061e-05, "loss": 0.4011, "num_input_tokens_seen": 6254305936, "step": 1610, "train_runtime": 43393.2872, "train_tokens_per_second": 144130.725 }, { "epoch": 0.5714792479602696, "grad_norm": 0.3458307981491089, "learning_rate": 1.562427511458639e-05, "loss": 0.3995, "num_input_tokens_seen": 6258201789, "step": 1611, "train_runtime": 43426.7307, "train_tokens_per_second": 144109.439 }, { "epoch": 0.5718339836821568, "grad_norm": 0.26382559537887573, "learning_rate": 1.560248274881952e-05, "loss": 0.3962, "num_input_tokens_seen": 6262027583, "step": 1612, "train_runtime": 43449.5008, "train_tokens_per_second": 144121.968 }, { "epoch": 0.572188719404044, "grad_norm": 0.26131895184516907, "learning_rate": 1.5580695867940957e-05, "loss": 0.3926, "num_input_tokens_seen": 6265964412, "step": 1613, "train_runtime": 43479.8028, "train_tokens_per_second": 144112.071 }, { "epoch": 0.5725434551259312, "grad_norm": 0.27407509088516235, "learning_rate": 1.5558914499124802e-05, "loss": 0.4025, "num_input_tokens_seen": 6269869134, "step": 1614, "train_runtime": 43512.0474, "train_tokens_per_second": 144095.015 }, { "epoch": 0.5728981908478183, "grad_norm": 0.3889545500278473, "learning_rate": 1.553713866953829e-05, "loss": 0.401, "num_input_tokens_seen": 6273782208, "step": 1615, "train_runtime": 43533.441, "train_tokens_per_second": 144114.089 }, { "epoch": 0.5732529265697056, "grad_norm": 0.26476407051086426, "learning_rate": 1.5515368406341745e-05, "loss": 0.4044, "num_input_tokens_seen": 6277703065, "step": 1616, "train_runtime": 43565.7298, "train_tokens_per_second": 144097.278 }, { "epoch": 0.5736076622915928, "grad_norm": 0.8898442387580872, "learning_rate": 1.5493603736688547e-05, "loss": 0.4087, "num_input_tokens_seen": 6281597809, "step": 1617, "train_runtime": 43584.1532, "train_tokens_per_second": 144125.728 }, { "epoch": 0.5739623980134799, "grad_norm": 0.22464619576931, "learning_rate": 1.5471844687725105e-05, "loss": 0.3969, "num_input_tokens_seen": 6285466897, "step": 1618, "train_runtime": 43617.0956, "train_tokens_per_second": 144105.581 }, { "epoch": 0.5743171337353672, "grad_norm": 0.23440738022327423, "learning_rate": 1.5450091286590808e-05, "loss": 0.4086, "num_input_tokens_seen": 6289345659, "step": 1619, "train_runtime": 43640.9978, "train_tokens_per_second": 144115.533 }, { "epoch": 0.5746718694572543, "grad_norm": 0.228241428732872, "learning_rate": 1.5428343560418008e-05, "loss": 0.3993, "num_input_tokens_seen": 6293223484, "step": 1620, "train_runtime": 43667.768, "train_tokens_per_second": 144115.987 }, { "epoch": 0.5750266051791415, "grad_norm": 0.3148879408836365, "learning_rate": 1.5406601536331974e-05, "loss": 0.4004, "num_input_tokens_seen": 6297069634, "step": 1621, "train_runtime": 43706.9456, "train_tokens_per_second": 144074.804 }, { "epoch": 0.5753813409010288, "grad_norm": 0.23982982337474823, "learning_rate": 1.5384865241450865e-05, "loss": 0.3965, "num_input_tokens_seen": 6300992325, "step": 1622, "train_runtime": 43741.2552, "train_tokens_per_second": 144051.475 }, { "epoch": 0.5757360766229159, "grad_norm": 0.20290322601795197, "learning_rate": 1.5363134702885695e-05, "loss": 0.4151, "num_input_tokens_seen": 6304944100, "step": 1623, "train_runtime": 43769.5986, "train_tokens_per_second": 144048.479 }, { "epoch": 0.5760908123448031, "grad_norm": 0.2524275481700897, "learning_rate": 1.5341409947740303e-05, "loss": 0.4003, "num_input_tokens_seen": 6308869259, "step": 1624, "train_runtime": 43790.4616, "train_tokens_per_second": 144069.485 }, { "epoch": 0.5764455480666903, "grad_norm": 0.20323196053504944, "learning_rate": 1.531969100311131e-05, "loss": 0.3992, "num_input_tokens_seen": 6312759236, "step": 1625, "train_runtime": 43816.3946, "train_tokens_per_second": 144072.996 }, { "epoch": 0.5768002837885775, "grad_norm": 0.2157827466726303, "learning_rate": 1.529797789608808e-05, "loss": 0.4166, "num_input_tokens_seen": 6316661850, "step": 1626, "train_runtime": 43835.9482, "train_tokens_per_second": 144097.758 }, { "epoch": 0.5771550195104647, "grad_norm": 0.2703179121017456, "learning_rate": 1.527627065375271e-05, "loss": 0.3932, "num_input_tokens_seen": 6320580335, "step": 1627, "train_runtime": 43860.2659, "train_tokens_per_second": 144107.205 }, { "epoch": 0.5775097552323519, "grad_norm": 2.3622887134552, "learning_rate": 1.5254569303179976e-05, "loss": 0.4053, "num_input_tokens_seen": 6324432355, "step": 1628, "train_runtime": 43880.5499, "train_tokens_per_second": 144128.375 }, { "epoch": 0.5778644909542391, "grad_norm": 0.2741285264492035, "learning_rate": 1.5232873871437304e-05, "loss": 0.3997, "num_input_tokens_seen": 6328353736, "step": 1629, "train_runtime": 43912.016, "train_tokens_per_second": 144114.398 }, { "epoch": 0.5782192266761262, "grad_norm": 0.2389143854379654, "learning_rate": 1.5211184385584745e-05, "loss": 0.403, "num_input_tokens_seen": 6332240685, "step": 1630, "train_runtime": 43944.23, "train_tokens_per_second": 144097.204 }, { "epoch": 0.5785739623980135, "grad_norm": 0.30222800374031067, "learning_rate": 1.5189500872674934e-05, "loss": 0.4003, "num_input_tokens_seen": 6336152836, "step": 1631, "train_runtime": 43971.4799, "train_tokens_per_second": 144096.875 }, { "epoch": 0.5789286981199007, "grad_norm": 0.3340497612953186, "learning_rate": 1.5167823359753038e-05, "loss": 0.4029, "num_input_tokens_seen": 6339998904, "step": 1632, "train_runtime": 43995.5462, "train_tokens_per_second": 144105.471 }, { "epoch": 0.5792834338417878, "grad_norm": 0.25181910395622253, "learning_rate": 1.514615187385676e-05, "loss": 0.4025, "num_input_tokens_seen": 6343797697, "step": 1633, "train_runtime": 44021.2102, "train_tokens_per_second": 144107.753 }, { "epoch": 0.5796381695636751, "grad_norm": 0.25329869985580444, "learning_rate": 1.5124486442016282e-05, "loss": 0.4005, "num_input_tokens_seen": 6347710708, "step": 1634, "train_runtime": 44048.7497, "train_tokens_per_second": 144106.49 }, { "epoch": 0.5799929052855622, "grad_norm": 0.43154966831207275, "learning_rate": 1.5102827091254227e-05, "loss": 0.3883, "num_input_tokens_seen": 6351571351, "step": 1635, "train_runtime": 44073.0316, "train_tokens_per_second": 144114.692 }, { "epoch": 0.5803476410074494, "grad_norm": 0.3511795401573181, "learning_rate": 1.5081173848585642e-05, "loss": 0.4016, "num_input_tokens_seen": 6355467474, "step": 1636, "train_runtime": 44101.7323, "train_tokens_per_second": 144109.248 }, { "epoch": 0.5807023767293367, "grad_norm": 0.4780612289905548, "learning_rate": 1.505952674101795e-05, "loss": 0.4039, "num_input_tokens_seen": 6359344492, "step": 1637, "train_runtime": 44123.3779, "train_tokens_per_second": 144126.42 }, { "epoch": 0.5810571124512238, "grad_norm": 0.22193492949008942, "learning_rate": 1.5037885795550928e-05, "loss": 0.4007, "num_input_tokens_seen": 6363202194, "step": 1638, "train_runtime": 44145.1391, "train_tokens_per_second": 144142.76 }, { "epoch": 0.581411848173111, "grad_norm": 0.22378170490264893, "learning_rate": 1.501625103917666e-05, "loss": 0.4003, "num_input_tokens_seen": 6367117044, "step": 1639, "train_runtime": 44166.18, "train_tokens_per_second": 144162.729 }, { "epoch": 0.5817665838949982, "grad_norm": 0.22087986767292023, "learning_rate": 1.499462249887951e-05, "loss": 0.414, "num_input_tokens_seen": 6371016602, "step": 1640, "train_runtime": 44195.0236, "train_tokens_per_second": 144156.878 }, { "epoch": 0.5821213196168854, "grad_norm": 0.2477138191461563, "learning_rate": 1.4973000201636097e-05, "loss": 0.4113, "num_input_tokens_seen": 6374888101, "step": 1641, "train_runtime": 44226.5696, "train_tokens_per_second": 144141.591 }, { "epoch": 0.5824760553387727, "grad_norm": 0.2544703185558319, "learning_rate": 1.495138417441525e-05, "loss": 0.4032, "num_input_tokens_seen": 6378748603, "step": 1642, "train_runtime": 44253.518, "train_tokens_per_second": 144141.051 }, { "epoch": 0.5828307910606598, "grad_norm": 0.22841998934745789, "learning_rate": 1.4929774444177972e-05, "loss": 0.3899, "num_input_tokens_seen": 6382665182, "step": 1643, "train_runtime": 44286.8811, "train_tokens_per_second": 144120.901 }, { "epoch": 0.583185526782547, "grad_norm": 0.2535979747772217, "learning_rate": 1.4908171037877426e-05, "loss": 0.4036, "num_input_tokens_seen": 6386601112, "step": 1644, "train_runtime": 44308.286, "train_tokens_per_second": 144140.107 }, { "epoch": 0.5835402625044342, "grad_norm": 0.28712576627731323, "learning_rate": 1.4886573982458862e-05, "loss": 0.4038, "num_input_tokens_seen": 6390408788, "step": 1645, "train_runtime": 44333.755, "train_tokens_per_second": 144143.188 }, { "epoch": 0.5838949982263214, "grad_norm": 0.2366296648979187, "learning_rate": 1.4864983304859632e-05, "loss": 0.3985, "num_input_tokens_seen": 6394313138, "step": 1646, "train_runtime": 44367.9196, "train_tokens_per_second": 144120.193 }, { "epoch": 0.5842497339482086, "grad_norm": 0.23507720232009888, "learning_rate": 1.4843399032009128e-05, "loss": 0.4077, "num_input_tokens_seen": 6398253436, "step": 1647, "train_runtime": 44393.6122, "train_tokens_per_second": 144125.542 }, { "epoch": 0.5846044696700958, "grad_norm": 0.23740364611148834, "learning_rate": 1.4821821190828747e-05, "loss": 0.3981, "num_input_tokens_seen": 6402084080, "step": 1648, "train_runtime": 44417.3022, "train_tokens_per_second": 144134.915 }, { "epoch": 0.584959205391983, "grad_norm": 0.24759545922279358, "learning_rate": 1.480024980823187e-05, "loss": 0.4109, "num_input_tokens_seen": 6405923091, "step": 1649, "train_runtime": 44442.4133, "train_tokens_per_second": 144139.857 }, { "epoch": 0.5853139411138701, "grad_norm": 0.2423948049545288, "learning_rate": 1.4778684911123833e-05, "loss": 0.3916, "num_input_tokens_seen": 6409835453, "step": 1650, "train_runtime": 44476.3595, "train_tokens_per_second": 144117.808 }, { "epoch": 0.5856686768357574, "grad_norm": 0.2725060284137726, "learning_rate": 1.4757126526401848e-05, "loss": 0.4054, "num_input_tokens_seen": 6413667463, "step": 1651, "train_runtime": 44502.9305, "train_tokens_per_second": 144117.868 }, { "epoch": 0.5860234125576446, "grad_norm": 0.2479783594608307, "learning_rate": 1.4735574680955034e-05, "loss": 0.4045, "num_input_tokens_seen": 6417565000, "step": 1652, "train_runtime": 44528.0501, "train_tokens_per_second": 144124.097 }, { "epoch": 0.5863781482795317, "grad_norm": 0.27830949425697327, "learning_rate": 1.4714029401664355e-05, "loss": 0.3948, "num_input_tokens_seen": 6421524141, "step": 1653, "train_runtime": 44561.7286, "train_tokens_per_second": 144104.018 }, { "epoch": 0.586732884001419, "grad_norm": 0.241390198469162, "learning_rate": 1.4692490715402565e-05, "loss": 0.3977, "num_input_tokens_seen": 6425396804, "step": 1654, "train_runtime": 44595.2854, "train_tokens_per_second": 144082.424 }, { "epoch": 0.5870876197233061, "grad_norm": 0.21539154648780823, "learning_rate": 1.4670958649034213e-05, "loss": 0.3995, "num_input_tokens_seen": 6429279872, "step": 1655, "train_runtime": 44620.5268, "train_tokens_per_second": 144087.942 }, { "epoch": 0.5874423554451933, "grad_norm": 0.3432818651199341, "learning_rate": 1.4649433229415588e-05, "loss": 0.402, "num_input_tokens_seen": 6433194258, "step": 1656, "train_runtime": 44647.506, "train_tokens_per_second": 144088.547 }, { "epoch": 0.5877970911670806, "grad_norm": 0.28937917947769165, "learning_rate": 1.4627914483394677e-05, "loss": 0.4051, "num_input_tokens_seen": 6437139089, "step": 1657, "train_runtime": 44673.4233, "train_tokens_per_second": 144093.258 }, { "epoch": 0.5881518268889677, "grad_norm": 0.22726382315158844, "learning_rate": 1.4606402437811156e-05, "loss": 0.3951, "num_input_tokens_seen": 6441049824, "step": 1658, "train_runtime": 44704.8945, "train_tokens_per_second": 144079.298 }, { "epoch": 0.5885065626108549, "grad_norm": 0.3624221086502075, "learning_rate": 1.4584897119496337e-05, "loss": 0.396, "num_input_tokens_seen": 6444893083, "step": 1659, "train_runtime": 44748.5915, "train_tokens_per_second": 144024.49 }, { "epoch": 0.5888612983327421, "grad_norm": 0.23202212154865265, "learning_rate": 1.4563398555273143e-05, "loss": 0.409, "num_input_tokens_seen": 6448748550, "step": 1660, "train_runtime": 44781.4293, "train_tokens_per_second": 144004.974 }, { "epoch": 0.5892160340546293, "grad_norm": 0.2076507806777954, "learning_rate": 1.4541906771956079e-05, "loss": 0.3952, "num_input_tokens_seen": 6452643042, "step": 1661, "train_runtime": 44806.3452, "train_tokens_per_second": 144011.814 }, { "epoch": 0.5895707697765165, "grad_norm": 0.24860623478889465, "learning_rate": 1.452042179635119e-05, "loss": 0.4067, "num_input_tokens_seen": 6456570188, "step": 1662, "train_runtime": 44840.0997, "train_tokens_per_second": 143990.986 }, { "epoch": 0.5899255054984037, "grad_norm": 0.2226317971944809, "learning_rate": 1.4498943655256006e-05, "loss": 0.3988, "num_input_tokens_seen": 6460427190, "step": 1663, "train_runtime": 44860.877, "train_tokens_per_second": 144010.274 }, { "epoch": 0.5902802412202909, "grad_norm": 0.22176897525787354, "learning_rate": 1.4477472375459573e-05, "loss": 0.3985, "num_input_tokens_seen": 6464348710, "step": 1664, "train_runtime": 44889.9793, "train_tokens_per_second": 144004.27 }, { "epoch": 0.590634976942178, "grad_norm": 0.28707846999168396, "learning_rate": 1.445600798374235e-05, "loss": 0.4053, "num_input_tokens_seen": 6468205069, "step": 1665, "train_runtime": 44921.6734, "train_tokens_per_second": 143988.515 }, { "epoch": 0.5909897126640653, "grad_norm": 0.23913680016994476, "learning_rate": 1.4434550506876219e-05, "loss": 0.3969, "num_input_tokens_seen": 6472176040, "step": 1666, "train_runtime": 44942.7705, "train_tokens_per_second": 144009.28 }, { "epoch": 0.5913444483859525, "grad_norm": 0.20977284014225006, "learning_rate": 1.4413099971624431e-05, "loss": 0.4, "num_input_tokens_seen": 6476037029, "step": 1667, "train_runtime": 44962.6692, "train_tokens_per_second": 144031.419 }, { "epoch": 0.5916991841078396, "grad_norm": 2.630610466003418, "learning_rate": 1.4391656404741578e-05, "loss": 0.4027, "num_input_tokens_seen": 6479983470, "step": 1668, "train_runtime": 44993.9817, "train_tokens_per_second": 144018.894 }, { "epoch": 0.5920539198297269, "grad_norm": 0.23201151192188263, "learning_rate": 1.4370219832973575e-05, "loss": 0.4011, "num_input_tokens_seen": 6483836084, "step": 1669, "train_runtime": 45013.9577, "train_tokens_per_second": 144040.569 }, { "epoch": 0.592408655551614, "grad_norm": 0.2534656822681427, "learning_rate": 1.4348790283057582e-05, "loss": 0.3954, "num_input_tokens_seen": 6487672187, "step": 1670, "train_runtime": 45044.7708, "train_tokens_per_second": 144027.2 }, { "epoch": 0.5927633912735012, "grad_norm": 0.2418633997440338, "learning_rate": 1.4327367781722025e-05, "loss": 0.391, "num_input_tokens_seen": 6491526392, "step": 1671, "train_runtime": 45064.6094, "train_tokens_per_second": 144049.321 }, { "epoch": 0.5931181269953885, "grad_norm": 0.2447713017463684, "learning_rate": 1.4305952355686535e-05, "loss": 0.3993, "num_input_tokens_seen": 6495413861, "step": 1672, "train_runtime": 45091.5411, "train_tokens_per_second": 144049.498 }, { "epoch": 0.5934728627172756, "grad_norm": 0.2137424796819687, "learning_rate": 1.4284544031661913e-05, "loss": 0.3903, "num_input_tokens_seen": 6499305120, "step": 1673, "train_runtime": 45119.6513, "train_tokens_per_second": 144045.996 }, { "epoch": 0.5938275984391628, "grad_norm": 1.1571125984191895, "learning_rate": 1.4263142836350107e-05, "loss": 0.3985, "num_input_tokens_seen": 6503186300, "step": 1674, "train_runtime": 45145.9475, "train_tokens_per_second": 144048.063 }, { "epoch": 0.59418233416105, "grad_norm": 0.3074084222316742, "learning_rate": 1.4241748796444175e-05, "loss": 0.3953, "num_input_tokens_seen": 6507039351, "step": 1675, "train_runtime": 45170.699, "train_tokens_per_second": 144054.431 }, { "epoch": 0.5945370698829372, "grad_norm": 0.22722487151622772, "learning_rate": 1.4220361938628236e-05, "loss": 0.4001, "num_input_tokens_seen": 6510946548, "step": 1676, "train_runtime": 45190.5328, "train_tokens_per_second": 144077.667 }, { "epoch": 0.5948918056048244, "grad_norm": 0.28759926557540894, "learning_rate": 1.419898228957747e-05, "loss": 0.4006, "num_input_tokens_seen": 6514793019, "step": 1677, "train_runtime": 45219.0389, "train_tokens_per_second": 144071.904 }, { "epoch": 0.5952465413267116, "grad_norm": 0.2593168020248413, "learning_rate": 1.4177609875958051e-05, "loss": 0.3946, "num_input_tokens_seen": 6518706148, "step": 1678, "train_runtime": 45238.3281, "train_tokens_per_second": 144096.973 }, { "epoch": 0.5956012770485988, "grad_norm": 0.26567956805229187, "learning_rate": 1.4156244724427138e-05, "loss": 0.4054, "num_input_tokens_seen": 6522623656, "step": 1679, "train_runtime": 45265.0775, "train_tokens_per_second": 144098.365 }, { "epoch": 0.595956012770486, "grad_norm": 0.3254028260707855, "learning_rate": 1.4134886861632828e-05, "loss": 0.398, "num_input_tokens_seen": 6526483956, "step": 1680, "train_runtime": 45298.3714, "train_tokens_per_second": 144077.673 }, { "epoch": 0.5963107484923732, "grad_norm": 0.2315923273563385, "learning_rate": 1.4113536314214136e-05, "loss": 0.4047, "num_input_tokens_seen": 6530419429, "step": 1681, "train_runtime": 45329.6243, "train_tokens_per_second": 144065.157 }, { "epoch": 0.5966654842142604, "grad_norm": 0.29940980672836304, "learning_rate": 1.4092193108800926e-05, "loss": 0.3948, "num_input_tokens_seen": 6534274757, "step": 1682, "train_runtime": 45352.6313, "train_tokens_per_second": 144077.082 }, { "epoch": 0.5970202199361476, "grad_norm": 0.2613833546638489, "learning_rate": 1.407085727201394e-05, "loss": 0.4006, "num_input_tokens_seen": 6538265670, "step": 1683, "train_runtime": 45377.4449, "train_tokens_per_second": 144086.246 }, { "epoch": 0.5973749556580348, "grad_norm": 0.6613319516181946, "learning_rate": 1.4049528830464705e-05, "loss": 0.3976, "num_input_tokens_seen": 6542110837, "step": 1684, "train_runtime": 45408.4858, "train_tokens_per_second": 144072.429 }, { "epoch": 0.5977296913799219, "grad_norm": 0.2528979182243347, "learning_rate": 1.402820781075553e-05, "loss": 0.3965, "num_input_tokens_seen": 6545977556, "step": 1685, "train_runtime": 45436.3193, "train_tokens_per_second": 144069.275 }, { "epoch": 0.5980844271018092, "grad_norm": 0.32407960295677185, "learning_rate": 1.4006894239479474e-05, "loss": 0.4039, "num_input_tokens_seen": 6549833671, "step": 1686, "train_runtime": 45464.3712, "train_tokens_per_second": 144065.199 }, { "epoch": 0.5984391628236964, "grad_norm": 0.22957119345664978, "learning_rate": 1.3985588143220303e-05, "loss": 0.3875, "num_input_tokens_seen": 6553729597, "step": 1687, "train_runtime": 45500.6041, "train_tokens_per_second": 144036.101 }, { "epoch": 0.5987938985455835, "grad_norm": 0.363802045583725, "learning_rate": 1.3964289548552446e-05, "loss": 0.4038, "num_input_tokens_seen": 6557669549, "step": 1688, "train_runtime": 45530.7869, "train_tokens_per_second": 144027.152 }, { "epoch": 0.5991486342674708, "grad_norm": 0.24421647191047668, "learning_rate": 1.394299848204099e-05, "loss": 0.398, "num_input_tokens_seen": 6561506878, "step": 1689, "train_runtime": 45556.5796, "train_tokens_per_second": 144029.84 }, { "epoch": 0.5995033699893579, "grad_norm": 0.2524860203266144, "learning_rate": 1.392171497024163e-05, "loss": 0.3997, "num_input_tokens_seen": 6565388156, "step": 1690, "train_runtime": 45584.0978, "train_tokens_per_second": 144028.038 }, { "epoch": 0.5998581057112451, "grad_norm": 0.4557822644710541, "learning_rate": 1.3900439039700641e-05, "loss": 0.4026, "num_input_tokens_seen": 6569241426, "step": 1691, "train_runtime": 45609.9715, "train_tokens_per_second": 144030.816 }, { "epoch": 0.6002128414331324, "grad_norm": 0.2629491090774536, "learning_rate": 1.3879170716954828e-05, "loss": 0.3894, "num_input_tokens_seen": 6573119320, "step": 1692, "train_runtime": 45638.2452, "train_tokens_per_second": 144026.557 }, { "epoch": 0.6005675771550195, "grad_norm": 0.23954999446868896, "learning_rate": 1.3857910028531538e-05, "loss": 0.4058, "num_input_tokens_seen": 6577095682, "step": 1693, "train_runtime": 45666.0045, "train_tokens_per_second": 144026.081 }, { "epoch": 0.6009223128769067, "grad_norm": 0.23758983612060547, "learning_rate": 1.3836657000948553e-05, "loss": 0.3956, "num_input_tokens_seen": 6580989056, "step": 1694, "train_runtime": 45688.3929, "train_tokens_per_second": 144040.721 }, { "epoch": 0.6012770485987939, "grad_norm": 0.4538305997848511, "learning_rate": 1.381541166071413e-05, "loss": 0.4049, "num_input_tokens_seen": 6584826436, "step": 1695, "train_runtime": 45707.3729, "train_tokens_per_second": 144064.863 }, { "epoch": 0.6016317843206811, "grad_norm": 0.37268364429473877, "learning_rate": 1.379417403432693e-05, "loss": 0.3964, "num_input_tokens_seen": 6588789204, "step": 1696, "train_runtime": 45739.1427, "train_tokens_per_second": 144051.436 }, { "epoch": 0.6019865200425683, "grad_norm": 0.6962811350822449, "learning_rate": 1.3772944148275996e-05, "loss": 0.4052, "num_input_tokens_seen": 6592604290, "step": 1697, "train_runtime": 45768.6197, "train_tokens_per_second": 144042.017 }, { "epoch": 0.6023412557644555, "grad_norm": 0.27605104446411133, "learning_rate": 1.3751722029040707e-05, "loss": 0.4, "num_input_tokens_seen": 6596532785, "step": 1698, "train_runtime": 45790.907, "train_tokens_per_second": 144057.701 }, { "epoch": 0.6026959914863427, "grad_norm": 0.31423744559288025, "learning_rate": 1.3730507703090763e-05, "loss": 0.3907, "num_input_tokens_seen": 6600426211, "step": 1699, "train_runtime": 45829.792, "train_tokens_per_second": 144020.427 }, { "epoch": 0.6030507272082298, "grad_norm": 0.2289421707391739, "learning_rate": 1.3709301196886146e-05, "loss": 0.4101, "num_input_tokens_seen": 6604249321, "step": 1700, "train_runtime": 45858.8379, "train_tokens_per_second": 144012.575 }, { "epoch": 0.6034054629301171, "grad_norm": 0.3014447093009949, "learning_rate": 1.3688102536877077e-05, "loss": 0.4003, "num_input_tokens_seen": 6608117642, "step": 1701, "train_runtime": 45896.9514, "train_tokens_per_second": 143977.267 }, { "epoch": 0.6037601986520043, "grad_norm": 0.25381818413734436, "learning_rate": 1.3666911749503991e-05, "loss": 0.4007, "num_input_tokens_seen": 6612064825, "step": 1702, "train_runtime": 45917.9217, "train_tokens_per_second": 143997.476 }, { "epoch": 0.6041149343738914, "grad_norm": 0.22297725081443787, "learning_rate": 1.3645728861197512e-05, "loss": 0.3932, "num_input_tokens_seen": 6615920006, "step": 1703, "train_runtime": 45945.297, "train_tokens_per_second": 143995.587 }, { "epoch": 0.6044696700957787, "grad_norm": 0.22876065969467163, "learning_rate": 1.3624553898378404e-05, "loss": 0.4064, "num_input_tokens_seen": 6619822831, "step": 1704, "train_runtime": 45974.6316, "train_tokens_per_second": 143988.6 }, { "epoch": 0.6048244058176658, "grad_norm": 0.3984639048576355, "learning_rate": 1.3603386887457548e-05, "loss": 0.4074, "num_input_tokens_seen": 6623687651, "step": 1705, "train_runtime": 45997.6276, "train_tokens_per_second": 144000.636 }, { "epoch": 0.605179141539553, "grad_norm": 0.26801055669784546, "learning_rate": 1.3582227854835914e-05, "loss": 0.3888, "num_input_tokens_seen": 6627537577, "step": 1706, "train_runtime": 46029.5753, "train_tokens_per_second": 143984.33 }, { "epoch": 0.6055338772614403, "grad_norm": 0.22577790915966034, "learning_rate": 1.3561076826904503e-05, "loss": 0.3975, "num_input_tokens_seen": 6631433741, "step": 1707, "train_runtime": 46056.7523, "train_tokens_per_second": 143983.964 }, { "epoch": 0.6058886129833274, "grad_norm": 0.3057527244091034, "learning_rate": 1.353993383004434e-05, "loss": 0.4051, "num_input_tokens_seen": 6635350879, "step": 1708, "train_runtime": 46084.4446, "train_tokens_per_second": 143982.442 }, { "epoch": 0.6062433487052146, "grad_norm": 1.2823753356933594, "learning_rate": 1.3518798890626448e-05, "loss": 0.4082, "num_input_tokens_seen": 6639213520, "step": 1709, "train_runtime": 46106.0065, "train_tokens_per_second": 143998.885 }, { "epoch": 0.6065980844271018, "grad_norm": 0.3872416019439697, "learning_rate": 1.3497672035011786e-05, "loss": 0.3956, "num_input_tokens_seen": 6643119254, "step": 1710, "train_runtime": 46127.7088, "train_tokens_per_second": 144015.808 }, { "epoch": 0.606952820148989, "grad_norm": 0.585898756980896, "learning_rate": 1.347655328955123e-05, "loss": 0.3937, "num_input_tokens_seen": 6647005187, "step": 1711, "train_runtime": 46154.0045, "train_tokens_per_second": 144017.952 }, { "epoch": 0.6073075558708761, "grad_norm": 0.319771409034729, "learning_rate": 1.3455442680585554e-05, "loss": 0.4016, "num_input_tokens_seen": 6650918323, "step": 1712, "train_runtime": 46173.8295, "train_tokens_per_second": 144040.865 }, { "epoch": 0.6076622915927634, "grad_norm": 0.273605614900589, "learning_rate": 1.343434023444535e-05, "loss": 0.406, "num_input_tokens_seen": 6654821187, "step": 1713, "train_runtime": 46195.1654, "train_tokens_per_second": 144058.824 }, { "epoch": 0.6080170273146506, "grad_norm": 0.27409878373146057, "learning_rate": 1.3413245977451066e-05, "loss": 0.407, "num_input_tokens_seen": 6658792138, "step": 1714, "train_runtime": 46215.6364, "train_tokens_per_second": 144080.936 }, { "epoch": 0.6083717630365377, "grad_norm": 0.2735706567764282, "learning_rate": 1.3392159935912919e-05, "loss": 0.4094, "num_input_tokens_seen": 6662585887, "step": 1715, "train_runtime": 46243.0823, "train_tokens_per_second": 144077.461 }, { "epoch": 0.608726498758425, "grad_norm": 0.43351587653160095, "learning_rate": 1.337108213613088e-05, "loss": 0.4131, "num_input_tokens_seen": 6666477420, "step": 1716, "train_runtime": 46282.4227, "train_tokens_per_second": 144039.076 }, { "epoch": 0.6090812344803122, "grad_norm": 0.26185905933380127, "learning_rate": 1.3350012604394642e-05, "loss": 0.4031, "num_input_tokens_seen": 6670345387, "step": 1717, "train_runtime": 46302.2263, "train_tokens_per_second": 144061.008 }, { "epoch": 0.6094359702021993, "grad_norm": 0.288223534822464, "learning_rate": 1.3328951366983594e-05, "loss": 0.3977, "num_input_tokens_seen": 6674271964, "step": 1718, "train_runtime": 46340.1782, "train_tokens_per_second": 144027.758 }, { "epoch": 0.6097907059240866, "grad_norm": 0.3409617841243744, "learning_rate": 1.3307898450166756e-05, "loss": 0.3972, "num_input_tokens_seen": 6678159693, "step": 1719, "train_runtime": 46366.3494, "train_tokens_per_second": 144030.31 }, { "epoch": 0.6101454416459737, "grad_norm": 0.2300737053155899, "learning_rate": 1.3286853880202798e-05, "loss": 0.3839, "num_input_tokens_seen": 6682038096, "step": 1720, "train_runtime": 46386.9086, "train_tokens_per_second": 144050.084 }, { "epoch": 0.610500177367861, "grad_norm": 0.22955051064491272, "learning_rate": 1.3265817683339955e-05, "loss": 0.403, "num_input_tokens_seen": 6685897762, "step": 1721, "train_runtime": 46414.8328, "train_tokens_per_second": 144046.577 }, { "epoch": 0.6108549130897482, "grad_norm": 0.2181219607591629, "learning_rate": 1.3244789885816039e-05, "loss": 0.3922, "num_input_tokens_seen": 6689768543, "step": 1722, "train_runtime": 46443.3482, "train_tokens_per_second": 144041.479 }, { "epoch": 0.6112096488116353, "grad_norm": 0.2532305419445038, "learning_rate": 1.322377051385837e-05, "loss": 0.4056, "num_input_tokens_seen": 6693623016, "step": 1723, "train_runtime": 46469.8732, "train_tokens_per_second": 144042.205 }, { "epoch": 0.6115643845335226, "grad_norm": 0.21047240495681763, "learning_rate": 1.3202759593683774e-05, "loss": 0.4111, "num_input_tokens_seen": 6697567658, "step": 1724, "train_runtime": 46504.5993, "train_tokens_per_second": 144019.468 }, { "epoch": 0.6119191202554097, "grad_norm": 0.2270185947418213, "learning_rate": 1.3181757151498518e-05, "loss": 0.3948, "num_input_tokens_seen": 6701389209, "step": 1725, "train_runtime": 46538.2109, "train_tokens_per_second": 143997.568 }, { "epoch": 0.6122738559772969, "grad_norm": 0.2058209925889969, "learning_rate": 1.3160763213498307e-05, "loss": 0.3942, "num_input_tokens_seen": 6705323071, "step": 1726, "train_runtime": 46564.9677, "train_tokens_per_second": 143999.307 }, { "epoch": 0.6126285916991842, "grad_norm": 0.3605073094367981, "learning_rate": 1.3139777805868235e-05, "loss": 0.4107, "num_input_tokens_seen": 6709205288, "step": 1727, "train_runtime": 46592.9529, "train_tokens_per_second": 143996.138 }, { "epoch": 0.6129833274210713, "grad_norm": 0.25537943840026855, "learning_rate": 1.3118800954782755e-05, "loss": 0.3901, "num_input_tokens_seen": 6713128745, "step": 1728, "train_runtime": 46628.5535, "train_tokens_per_second": 143970.341 }, { "epoch": 0.6133380631429585, "grad_norm": 0.26886075735092163, "learning_rate": 1.3097832686405655e-05, "loss": 0.402, "num_input_tokens_seen": 6716983041, "step": 1729, "train_runtime": 46661.9131, "train_tokens_per_second": 143950.014 }, { "epoch": 0.6136927988648457, "grad_norm": 0.22546818852424622, "learning_rate": 1.307687302689001e-05, "loss": 0.4042, "num_input_tokens_seen": 6720874186, "step": 1730, "train_runtime": 46680.4931, "train_tokens_per_second": 143976.075 }, { "epoch": 0.6140475345867329, "grad_norm": 0.21960383653640747, "learning_rate": 1.3055922002378165e-05, "loss": 0.4013, "num_input_tokens_seen": 6724785795, "step": 1731, "train_runtime": 46704.5044, "train_tokens_per_second": 143985.808 }, { "epoch": 0.61440227030862, "grad_norm": 0.32836639881134033, "learning_rate": 1.303497963900168e-05, "loss": 0.3971, "num_input_tokens_seen": 6728627451, "step": 1732, "train_runtime": 46724.4883, "train_tokens_per_second": 144006.445 }, { "epoch": 0.6147570060305073, "grad_norm": 0.22896228730678558, "learning_rate": 1.3014045962881322e-05, "loss": 0.3958, "num_input_tokens_seen": 6732501831, "step": 1733, "train_runtime": 46753.1716, "train_tokens_per_second": 144000.965 }, { "epoch": 0.6151117417523945, "grad_norm": 0.24461375176906586, "learning_rate": 1.2993121000127031e-05, "loss": 0.3949, "num_input_tokens_seen": 6736417926, "step": 1734, "train_runtime": 46785.4616, "train_tokens_per_second": 143985.283 }, { "epoch": 0.6154664774742816, "grad_norm": 0.24818579852581024, "learning_rate": 1.2972204776837865e-05, "loss": 0.392, "num_input_tokens_seen": 6740305101, "step": 1735, "train_runtime": 46814.4156, "train_tokens_per_second": 143979.264 }, { "epoch": 0.6158212131961689, "grad_norm": 0.24757389724254608, "learning_rate": 1.2951297319101985e-05, "loss": 0.3949, "num_input_tokens_seen": 6744184135, "step": 1736, "train_runtime": 46834.6875, "train_tokens_per_second": 143999.768 }, { "epoch": 0.616175948918056, "grad_norm": 0.29986482858657837, "learning_rate": 1.2930398652996639e-05, "loss": 0.3994, "num_input_tokens_seen": 6748053296, "step": 1737, "train_runtime": 46864.5906, "train_tokens_per_second": 143990.446 }, { "epoch": 0.6165306846399432, "grad_norm": 0.2546665370464325, "learning_rate": 1.2909508804588071e-05, "loss": 0.4028, "num_input_tokens_seen": 6751890887, "step": 1738, "train_runtime": 46890.0247, "train_tokens_per_second": 143994.185 }, { "epoch": 0.6168854203618305, "grad_norm": 0.27249735593795776, "learning_rate": 1.2888627799931555e-05, "loss": 0.3956, "num_input_tokens_seen": 6755799711, "step": 1739, "train_runtime": 46916.1669, "train_tokens_per_second": 143997.265 }, { "epoch": 0.6172401560837176, "grad_norm": 0.24627511203289032, "learning_rate": 1.2867755665071328e-05, "loss": 0.4011, "num_input_tokens_seen": 6759666937, "step": 1740, "train_runtime": 46942.6183, "train_tokens_per_second": 143998.507 }, { "epoch": 0.6175948918056048, "grad_norm": 0.23832634091377258, "learning_rate": 1.2846892426040563e-05, "loss": 0.4022, "num_input_tokens_seen": 6763556663, "step": 1741, "train_runtime": 46972.0329, "train_tokens_per_second": 143991.142 }, { "epoch": 0.617949627527492, "grad_norm": 0.19963128864765167, "learning_rate": 1.2826038108861335e-05, "loss": 0.3924, "num_input_tokens_seen": 6767478881, "step": 1742, "train_runtime": 47004.5318, "train_tokens_per_second": 143975.03 }, { "epoch": 0.6183043632493792, "grad_norm": 0.21688218414783478, "learning_rate": 1.2805192739544602e-05, "loss": 0.3856, "num_input_tokens_seen": 6771406274, "step": 1743, "train_runtime": 47035.3982, "train_tokens_per_second": 143964.047 }, { "epoch": 0.6186590989712664, "grad_norm": 0.22063183784484863, "learning_rate": 1.2784356344090145e-05, "loss": 0.3987, "num_input_tokens_seen": 6775374805, "step": 1744, "train_runtime": 47061.2528, "train_tokens_per_second": 143969.283 }, { "epoch": 0.6190138346931536, "grad_norm": 0.2149783968925476, "learning_rate": 1.2763528948486563e-05, "loss": 0.3915, "num_input_tokens_seen": 6779283872, "step": 1745, "train_runtime": 47087.2007, "train_tokens_per_second": 143972.965 }, { "epoch": 0.6193685704150408, "grad_norm": 0.23783527314662933, "learning_rate": 1.2742710578711226e-05, "loss": 0.4021, "num_input_tokens_seen": 6783198929, "step": 1746, "train_runtime": 47115.4814, "train_tokens_per_second": 143969.641 }, { "epoch": 0.6197233061369279, "grad_norm": 0.31937509775161743, "learning_rate": 1.2721901260730252e-05, "loss": 0.3915, "num_input_tokens_seen": 6787025540, "step": 1747, "train_runtime": 47141.1852, "train_tokens_per_second": 143972.314 }, { "epoch": 0.6200780418588152, "grad_norm": 0.22142767906188965, "learning_rate": 1.2701101020498459e-05, "loss": 0.4048, "num_input_tokens_seen": 6790912227, "step": 1748, "train_runtime": 47171.3355, "train_tokens_per_second": 143962.687 }, { "epoch": 0.6204327775807024, "grad_norm": 0.2654256820678711, "learning_rate": 1.2680309883959361e-05, "loss": 0.3965, "num_input_tokens_seen": 6794736597, "step": 1749, "train_runtime": 47190.7164, "train_tokens_per_second": 143984.604 }, { "epoch": 0.6207875133025895, "grad_norm": 0.23318524658679962, "learning_rate": 1.2659527877045087e-05, "loss": 0.4036, "num_input_tokens_seen": 6798603587, "step": 1750, "train_runtime": 47218.3907, "train_tokens_per_second": 143982.111 }, { "epoch": 0.6211422490244768, "grad_norm": 0.24187995493412018, "learning_rate": 1.2638755025676401e-05, "loss": 0.4031, "num_input_tokens_seen": 6802504156, "step": 1751, "train_runtime": 47244.1145, "train_tokens_per_second": 143986.277 }, { "epoch": 0.621496984746364, "grad_norm": 0.25943684577941895, "learning_rate": 1.2617991355762644e-05, "loss": 0.4039, "num_input_tokens_seen": 6806403813, "step": 1752, "train_runtime": 47275.6776, "train_tokens_per_second": 143972.634 }, { "epoch": 0.6218517204682511, "grad_norm": 0.2744631767272949, "learning_rate": 1.2597236893201712e-05, "loss": 0.3978, "num_input_tokens_seen": 6810313420, "step": 1753, "train_runtime": 47296.5169, "train_tokens_per_second": 143991.859 }, { "epoch": 0.6222064561901384, "grad_norm": 0.23202426731586456, "learning_rate": 1.2576491663880002e-05, "loss": 0.4036, "num_input_tokens_seen": 6814097035, "step": 1754, "train_runtime": 47318.256, "train_tokens_per_second": 144005.667 }, { "epoch": 0.6225611919120255, "grad_norm": 0.22693972289562225, "learning_rate": 1.2555755693672404e-05, "loss": 0.3943, "num_input_tokens_seen": 6817955863, "step": 1755, "train_runtime": 47339.7245, "train_tokens_per_second": 144021.875 }, { "epoch": 0.6229159276339127, "grad_norm": 0.3150312602519989, "learning_rate": 1.2535029008442266e-05, "loss": 0.3861, "num_input_tokens_seen": 6821812140, "step": 1756, "train_runtime": 47365.5967, "train_tokens_per_second": 144024.622 }, { "epoch": 0.6232706633558, "grad_norm": 0.23023925721645355, "learning_rate": 1.2514311634041325e-05, "loss": 0.3922, "num_input_tokens_seen": 6825697662, "step": 1757, "train_runtime": 47387.0643, "train_tokens_per_second": 144041.37 }, { "epoch": 0.6236253990776871, "grad_norm": 0.25059518218040466, "learning_rate": 1.2493603596309743e-05, "loss": 0.4086, "num_input_tokens_seen": 6829608361, "step": 1758, "train_runtime": 47410.879, "train_tokens_per_second": 144051.503 }, { "epoch": 0.6239801347995744, "grad_norm": 0.2647024989128113, "learning_rate": 1.247290492107601e-05, "loss": 0.3988, "num_input_tokens_seen": 6833492943, "step": 1759, "train_runtime": 47433.6525, "train_tokens_per_second": 144064.237 }, { "epoch": 0.6243348705214615, "grad_norm": 0.25038957595825195, "learning_rate": 1.2452215634156954e-05, "loss": 0.3951, "num_input_tokens_seen": 6837418684, "step": 1760, "train_runtime": 47453.634, "train_tokens_per_second": 144086.303 }, { "epoch": 0.6246896062433487, "grad_norm": 0.23461785912513733, "learning_rate": 1.243153576135769e-05, "loss": 0.3973, "num_input_tokens_seen": 6841283429, "step": 1761, "train_runtime": 47473.4783, "train_tokens_per_second": 144107.482 }, { "epoch": 0.625044341965236, "grad_norm": 0.23786494135856628, "learning_rate": 1.2410865328471589e-05, "loss": 0.4063, "num_input_tokens_seen": 6845177132, "step": 1762, "train_runtime": 47494.2249, "train_tokens_per_second": 144126.515 }, { "epoch": 0.6253990776871231, "grad_norm": 0.2626393735408783, "learning_rate": 1.239020436128024e-05, "loss": 0.4092, "num_input_tokens_seen": 6848940399, "step": 1763, "train_runtime": 47518.0908, "train_tokens_per_second": 144133.325 }, { "epoch": 0.6257538134090103, "grad_norm": 0.4577406048774719, "learning_rate": 1.2369552885553437e-05, "loss": 0.4078, "num_input_tokens_seen": 6852845145, "step": 1764, "train_runtime": 47548.8788, "train_tokens_per_second": 144122.118 }, { "epoch": 0.6261085491308975, "grad_norm": 0.23010632395744324, "learning_rate": 1.2348910927049133e-05, "loss": 0.4069, "num_input_tokens_seen": 6856697974, "step": 1765, "train_runtime": 47571.3573, "train_tokens_per_second": 144135.008 }, { "epoch": 0.6264632848527847, "grad_norm": 0.22792352735996246, "learning_rate": 1.2328278511513407e-05, "loss": 0.3901, "num_input_tokens_seen": 6860584209, "step": 1766, "train_runtime": 47594.1504, "train_tokens_per_second": 144147.635 }, { "epoch": 0.6268180205746718, "grad_norm": 0.38058555126190186, "learning_rate": 1.230765566468044e-05, "loss": 0.3883, "num_input_tokens_seen": 6864437789, "step": 1767, "train_runtime": 47613.6195, "train_tokens_per_second": 144169.628 }, { "epoch": 0.6271727562965591, "grad_norm": 0.26933202147483826, "learning_rate": 1.2287042412272482e-05, "loss": 0.3997, "num_input_tokens_seen": 6868302684, "step": 1768, "train_runtime": 47635.9519, "train_tokens_per_second": 144183.173 }, { "epoch": 0.6275274920184463, "grad_norm": 0.25938743352890015, "learning_rate": 1.2266438779999797e-05, "loss": 0.3914, "num_input_tokens_seen": 6872197192, "step": 1769, "train_runtime": 47655.041, "train_tokens_per_second": 144207.14 }, { "epoch": 0.6278822277403334, "grad_norm": 0.2465728223323822, "learning_rate": 1.2245844793560666e-05, "loss": 0.3966, "num_input_tokens_seen": 6876095014, "step": 1770, "train_runtime": 47674.3758, "train_tokens_per_second": 144230.415 }, { "epoch": 0.6282369634622207, "grad_norm": 0.32155898213386536, "learning_rate": 1.2225260478641343e-05, "loss": 0.406, "num_input_tokens_seen": 6879916385, "step": 1771, "train_runtime": 47706.6225, "train_tokens_per_second": 144213.026 }, { "epoch": 0.6285916991841078, "grad_norm": 0.24229075014591217, "learning_rate": 1.220468586091601e-05, "loss": 0.3933, "num_input_tokens_seen": 6883780358, "step": 1772, "train_runtime": 47742.8461, "train_tokens_per_second": 144184.541 }, { "epoch": 0.628946434905995, "grad_norm": 0.22721442580223083, "learning_rate": 1.2184120966046755e-05, "loss": 0.3987, "num_input_tokens_seen": 6887633685, "step": 1773, "train_runtime": 47768.645, "train_tokens_per_second": 144187.336 }, { "epoch": 0.6293011706278823, "grad_norm": 0.28792887926101685, "learning_rate": 1.2163565819683547e-05, "loss": 0.3855, "num_input_tokens_seen": 6891498539, "step": 1774, "train_runtime": 47788.7036, "train_tokens_per_second": 144207.69 }, { "epoch": 0.6296559063497694, "grad_norm": 0.8671011924743652, "learning_rate": 1.2143020447464177e-05, "loss": 0.4127, "num_input_tokens_seen": 6895416044, "step": 1775, "train_runtime": 47813.6781, "train_tokens_per_second": 144214.298 }, { "epoch": 0.6300106420716566, "grad_norm": 0.2454371303319931, "learning_rate": 1.2122484875014261e-05, "loss": 0.4014, "num_input_tokens_seen": 6899256957, "step": 1776, "train_runtime": 47845.7552, "train_tokens_per_second": 144197.89 }, { "epoch": 0.6303653777935438, "grad_norm": 0.2856403887271881, "learning_rate": 1.210195912794719e-05, "loss": 0.3859, "num_input_tokens_seen": 6903150300, "step": 1777, "train_runtime": 47870.3246, "train_tokens_per_second": 144205.212 }, { "epoch": 0.630720113515431, "grad_norm": 0.32632383704185486, "learning_rate": 1.2081443231864099e-05, "loss": 0.4011, "num_input_tokens_seen": 6907007373, "step": 1778, "train_runtime": 47890.3469, "train_tokens_per_second": 144225.461 }, { "epoch": 0.6310748492373182, "grad_norm": 0.21479660272598267, "learning_rate": 1.2060937212353834e-05, "loss": 0.3878, "num_input_tokens_seen": 6910876827, "step": 1779, "train_runtime": 47916.6085, "train_tokens_per_second": 144227.17 }, { "epoch": 0.6314295849592054, "grad_norm": 0.39925551414489746, "learning_rate": 1.204044109499293e-05, "loss": 0.4084, "num_input_tokens_seen": 6914816788, "step": 1780, "train_runtime": 47950.8613, "train_tokens_per_second": 144206.31 }, { "epoch": 0.6317843206810926, "grad_norm": 0.2552863657474518, "learning_rate": 1.2019954905345554e-05, "loss": 0.3913, "num_input_tokens_seen": 6918598370, "step": 1781, "train_runtime": 47969.7435, "train_tokens_per_second": 144228.38 }, { "epoch": 0.6321390564029797, "grad_norm": 0.21061669290065765, "learning_rate": 1.1999478668963509e-05, "loss": 0.4126, "num_input_tokens_seen": 6922512374, "step": 1782, "train_runtime": 47994.195, "train_tokens_per_second": 144236.451 }, { "epoch": 0.632493792124867, "grad_norm": 0.1928565949201584, "learning_rate": 1.1979012411386175e-05, "loss": 0.3929, "num_input_tokens_seen": 6926381250, "step": 1783, "train_runtime": 48020.8296, "train_tokens_per_second": 144237.018 }, { "epoch": 0.6328485278467542, "grad_norm": 0.239546000957489, "learning_rate": 1.1958556158140486e-05, "loss": 0.3901, "num_input_tokens_seen": 6930294649, "step": 1784, "train_runtime": 48043.042, "train_tokens_per_second": 144251.787 }, { "epoch": 0.6332032635686413, "grad_norm": 0.20935384929180145, "learning_rate": 1.1938109934740898e-05, "loss": 0.3875, "num_input_tokens_seen": 6934145766, "step": 1785, "train_runtime": 48066.7104, "train_tokens_per_second": 144260.876 }, { "epoch": 0.6335579992905286, "grad_norm": 0.2035689353942871, "learning_rate": 1.1917673766689362e-05, "loss": 0.3962, "num_input_tokens_seen": 6938037715, "step": 1786, "train_runtime": 48089.2176, "train_tokens_per_second": 144274.29 }, { "epoch": 0.6339127350124157, "grad_norm": 0.30531004071235657, "learning_rate": 1.1897247679475284e-05, "loss": 0.4062, "num_input_tokens_seen": 6941997110, "step": 1787, "train_runtime": 48120.8637, "train_tokens_per_second": 144261.69 }, { "epoch": 0.6342674707343029, "grad_norm": 0.21816061437129974, "learning_rate": 1.1876831698575486e-05, "loss": 0.3978, "num_input_tokens_seen": 6945868732, "step": 1788, "train_runtime": 48143.5133, "train_tokens_per_second": 144274.239 }, { "epoch": 0.6346222064561902, "grad_norm": 0.2119147777557373, "learning_rate": 1.1856425849454203e-05, "loss": 0.3959, "num_input_tokens_seen": 6949718407, "step": 1789, "train_runtime": 48172.2055, "train_tokens_per_second": 144268.221 }, { "epoch": 0.6349769421780773, "grad_norm": 0.3115083575248718, "learning_rate": 1.1836030157563021e-05, "loss": 0.4068, "num_input_tokens_seen": 6953604783, "step": 1790, "train_runtime": 48192.8515, "train_tokens_per_second": 144287.059 }, { "epoch": 0.6353316778999645, "grad_norm": 0.259899377822876, "learning_rate": 1.1815644648340862e-05, "loss": 0.3836, "num_input_tokens_seen": 6957554491, "step": 1791, "train_runtime": 48218.6552, "train_tokens_per_second": 144291.757 }, { "epoch": 0.6356864136218517, "grad_norm": 0.31616076827049255, "learning_rate": 1.1795269347213947e-05, "loss": 0.4028, "num_input_tokens_seen": 6961437846, "step": 1792, "train_runtime": 48256.6866, "train_tokens_per_second": 144258.513 }, { "epoch": 0.6360411493437389, "grad_norm": 0.2655629813671112, "learning_rate": 1.1774904279595766e-05, "loss": 0.4065, "num_input_tokens_seen": 6965315954, "step": 1793, "train_runtime": 48275.4427, "train_tokens_per_second": 144282.798 }, { "epoch": 0.636395885065626, "grad_norm": 0.27888140082359314, "learning_rate": 1.1754549470887033e-05, "loss": 0.3937, "num_input_tokens_seen": 6969218050, "step": 1794, "train_runtime": 48297.436, "train_tokens_per_second": 144297.889 }, { "epoch": 0.6367506207875133, "grad_norm": 0.20229539275169373, "learning_rate": 1.1734204946475685e-05, "loss": 0.3942, "num_input_tokens_seen": 6973129128, "step": 1795, "train_runtime": 48317.9122, "train_tokens_per_second": 144317.683 }, { "epoch": 0.6371053565094005, "grad_norm": 0.26510119438171387, "learning_rate": 1.1713870731736816e-05, "loss": 0.3914, "num_input_tokens_seen": 6976970236, "step": 1796, "train_runtime": 48346.375, "train_tokens_per_second": 144312.169 }, { "epoch": 0.6374600922312877, "grad_norm": 0.20387110114097595, "learning_rate": 1.1693546852032671e-05, "loss": 0.4052, "num_input_tokens_seen": 6980847909, "step": 1797, "train_runtime": 48368.3016, "train_tokens_per_second": 144326.918 }, { "epoch": 0.6378148279531749, "grad_norm": 0.3982866108417511, "learning_rate": 1.1673233332712606e-05, "loss": 0.3976, "num_input_tokens_seen": 6984726541, "step": 1798, "train_runtime": 48404.5008, "train_tokens_per_second": 144299.113 }, { "epoch": 0.6381695636750621, "grad_norm": 0.3397718071937561, "learning_rate": 1.1652930199113055e-05, "loss": 0.4001, "num_input_tokens_seen": 6988661916, "step": 1799, "train_runtime": 48431.0559, "train_tokens_per_second": 144301.25 }, { "epoch": 0.6385242993969493, "grad_norm": 1.3639914989471436, "learning_rate": 1.1632637476557476e-05, "loss": 0.3921, "num_input_tokens_seen": 6992553910, "step": 1800, "train_runtime": 48456.9746, "train_tokens_per_second": 144304.385 }, { "epoch": 0.6388790351188365, "grad_norm": 0.22068767249584198, "learning_rate": 1.1612355190356368e-05, "loss": 0.3874, "num_input_tokens_seen": 6996422275, "step": 1801, "train_runtime": 48586.127, "train_tokens_per_second": 144000.411 }, { "epoch": 0.6392337708407236, "grad_norm": 0.2780616283416748, "learning_rate": 1.1592083365807208e-05, "loss": 0.4096, "num_input_tokens_seen": 7000328997, "step": 1802, "train_runtime": 48615.6856, "train_tokens_per_second": 143993.218 }, { "epoch": 0.6395885065626109, "grad_norm": 0.27379634976387024, "learning_rate": 1.157182202819441e-05, "loss": 0.391, "num_input_tokens_seen": 7004281525, "step": 1803, "train_runtime": 48639.3071, "train_tokens_per_second": 144004.55 }, { "epoch": 0.6399432422844981, "grad_norm": 0.2500085234642029, "learning_rate": 1.155157120278933e-05, "loss": 0.3903, "num_input_tokens_seen": 7008133630, "step": 1804, "train_runtime": 48661.2776, "train_tokens_per_second": 144018.694 }, { "epoch": 0.6402979780063852, "grad_norm": 0.265468031167984, "learning_rate": 1.1531330914850204e-05, "loss": 0.3936, "num_input_tokens_seen": 7012064853, "step": 1805, "train_runtime": 48690.5007, "train_tokens_per_second": 144012.995 }, { "epoch": 0.6406527137282725, "grad_norm": 0.2874915897846222, "learning_rate": 1.1511101189622105e-05, "loss": 0.3803, "num_input_tokens_seen": 7015937797, "step": 1806, "train_runtime": 48717.8753, "train_tokens_per_second": 144011.572 }, { "epoch": 0.6410074494501596, "grad_norm": 0.24932405352592468, "learning_rate": 1.149088205233696e-05, "loss": 0.391, "num_input_tokens_seen": 7019802020, "step": 1807, "train_runtime": 48737.2749, "train_tokens_per_second": 144033.536 }, { "epoch": 0.6413621851720468, "grad_norm": 0.3057173490524292, "learning_rate": 1.1470673528213476e-05, "loss": 0.4004, "num_input_tokens_seen": 7023699629, "step": 1808, "train_runtime": 48758.5742, "train_tokens_per_second": 144050.554 }, { "epoch": 0.6417169208939341, "grad_norm": 0.24093875288963318, "learning_rate": 1.1450475642457124e-05, "loss": 0.3979, "num_input_tokens_seen": 7027569696, "step": 1809, "train_runtime": 48786.666, "train_tokens_per_second": 144046.935 }, { "epoch": 0.6420716566158212, "grad_norm": 0.3042769432067871, "learning_rate": 1.1430288420260106e-05, "loss": 0.3974, "num_input_tokens_seen": 7031451816, "step": 1810, "train_runtime": 48813.8247, "train_tokens_per_second": 144046.32 }, { "epoch": 0.6424263923377084, "grad_norm": 0.21607545018196106, "learning_rate": 1.141011188680133e-05, "loss": 0.3869, "num_input_tokens_seen": 7035380787, "step": 1811, "train_runtime": 48843.6185, "train_tokens_per_second": 144038.894 }, { "epoch": 0.6427811280595956, "grad_norm": 0.2613435983657837, "learning_rate": 1.1389946067246362e-05, "loss": 0.4029, "num_input_tokens_seen": 7039220030, "step": 1812, "train_runtime": 48873.6081, "train_tokens_per_second": 144029.064 }, { "epoch": 0.6431358637814828, "grad_norm": 0.21928001940250397, "learning_rate": 1.1369790986747412e-05, "loss": 0.3938, "num_input_tokens_seen": 7043093630, "step": 1813, "train_runtime": 48893.2775, "train_tokens_per_second": 144050.348 }, { "epoch": 0.64349059950337, "grad_norm": 0.25544679164886475, "learning_rate": 1.1349646670443293e-05, "loss": 0.394, "num_input_tokens_seen": 7047014929, "step": 1814, "train_runtime": 48919.1807, "train_tokens_per_second": 144054.23 }, { "epoch": 0.6438453352252572, "grad_norm": 0.2531129717826843, "learning_rate": 1.1329513143459391e-05, "loss": 0.385, "num_input_tokens_seen": 7050900902, "step": 1815, "train_runtime": 48951.4221, "train_tokens_per_second": 144038.735 }, { "epoch": 0.6442000709471444, "grad_norm": 1.2394721508026123, "learning_rate": 1.130939043090764e-05, "loss": 0.3811, "num_input_tokens_seen": 7054754401, "step": 1816, "train_runtime": 48971.9582, "train_tokens_per_second": 144057.021 }, { "epoch": 0.6445548066690315, "grad_norm": 0.2536279261112213, "learning_rate": 1.1289278557886476e-05, "loss": 0.3988, "num_input_tokens_seen": 7058701593, "step": 1817, "train_runtime": 49001.5206, "train_tokens_per_second": 144050.664 }, { "epoch": 0.6449095423909188, "grad_norm": 0.27858179807662964, "learning_rate": 1.1269177549480836e-05, "loss": 0.4005, "num_input_tokens_seen": 7062475341, "step": 1818, "train_runtime": 49022.5361, "train_tokens_per_second": 144065.891 }, { "epoch": 0.645264278112806, "grad_norm": 0.3159359395503998, "learning_rate": 1.124908743076207e-05, "loss": 0.4064, "num_input_tokens_seen": 7066419093, "step": 1819, "train_runtime": 49049.769, "train_tokens_per_second": 144066.307 }, { "epoch": 0.6456190138346931, "grad_norm": 0.23147493600845337, "learning_rate": 1.1229008226787976e-05, "loss": 0.4006, "num_input_tokens_seen": 7070292623, "step": 1820, "train_runtime": 49069.3237, "train_tokens_per_second": 144087.835 }, { "epoch": 0.6459737495565804, "grad_norm": 0.21261638402938843, "learning_rate": 1.1208939962602728e-05, "loss": 0.3864, "num_input_tokens_seen": 7074115171, "step": 1821, "train_runtime": 49098.0024, "train_tokens_per_second": 144081.527 }, { "epoch": 0.6463284852784675, "grad_norm": 0.6457936763763428, "learning_rate": 1.1188882663236855e-05, "loss": 0.4003, "num_input_tokens_seen": 7078075877, "step": 1822, "train_runtime": 49115.0855, "train_tokens_per_second": 144112.055 }, { "epoch": 0.6466832210003547, "grad_norm": 1.0460132360458374, "learning_rate": 1.1168836353707214e-05, "loss": 0.4012, "num_input_tokens_seen": 7081918223, "step": 1823, "train_runtime": 49151.6735, "train_tokens_per_second": 144082.952 }, { "epoch": 0.647037956722242, "grad_norm": 0.22445932030677795, "learning_rate": 1.1148801059016958e-05, "loss": 0.384, "num_input_tokens_seen": 7085859482, "step": 1824, "train_runtime": 49177.454, "train_tokens_per_second": 144087.563 }, { "epoch": 0.6473926924441291, "grad_norm": 0.41175857186317444, "learning_rate": 1.1128776804155479e-05, "loss": 0.3914, "num_input_tokens_seen": 7089697810, "step": 1825, "train_runtime": 49203.9099, "train_tokens_per_second": 144088.098 }, { "epoch": 0.6477474281660163, "grad_norm": 0.22464288771152496, "learning_rate": 1.1108763614098424e-05, "loss": 0.3962, "num_input_tokens_seen": 7093674140, "step": 1826, "train_runtime": 49234.8132, "train_tokens_per_second": 144078.421 }, { "epoch": 0.6481021638879035, "grad_norm": 0.3036128282546997, "learning_rate": 1.1088761513807622e-05, "loss": 0.3947, "num_input_tokens_seen": 7097527630, "step": 1827, "train_runtime": 49263.6984, "train_tokens_per_second": 144072.164 }, { "epoch": 0.6484568996097907, "grad_norm": 0.303561806678772, "learning_rate": 1.1068770528231094e-05, "loss": 0.3948, "num_input_tokens_seen": 7101321471, "step": 1828, "train_runtime": 49295.0502, "train_tokens_per_second": 144057.496 }, { "epoch": 0.6488116353316778, "grad_norm": 0.25108879804611206, "learning_rate": 1.1048790682302978e-05, "loss": 0.4022, "num_input_tokens_seen": 7105306336, "step": 1829, "train_runtime": 49315.6305, "train_tokens_per_second": 144078.181 }, { "epoch": 0.6491663710535651, "grad_norm": 0.49465397000312805, "learning_rate": 1.1028822000943529e-05, "loss": 0.3977, "num_input_tokens_seen": 7109170259, "step": 1830, "train_runtime": 49347.5553, "train_tokens_per_second": 144063.272 }, { "epoch": 0.6495211067754523, "grad_norm": 0.33978933095932007, "learning_rate": 1.1008864509059055e-05, "loss": 0.3945, "num_input_tokens_seen": 7113076593, "step": 1831, "train_runtime": 49374.0965, "train_tokens_per_second": 144064.947 }, { "epoch": 0.6498758424973394, "grad_norm": 0.6304144263267517, "learning_rate": 1.0988918231541928e-05, "loss": 0.3964, "num_input_tokens_seen": 7117020035, "step": 1832, "train_runtime": 49398.835, "train_tokens_per_second": 144072.629 }, { "epoch": 0.6502305782192267, "grad_norm": 0.19375288486480713, "learning_rate": 1.0968983193270531e-05, "loss": 0.4038, "num_input_tokens_seen": 7120951005, "step": 1833, "train_runtime": 49426.4145, "train_tokens_per_second": 144071.77 }, { "epoch": 0.6505853139411139, "grad_norm": 0.22690415382385254, "learning_rate": 1.0949059419109225e-05, "loss": 0.403, "num_input_tokens_seen": 7124818402, "step": 1834, "train_runtime": 49446.5472, "train_tokens_per_second": 144091.323 }, { "epoch": 0.6509400496630011, "grad_norm": 0.2230953723192215, "learning_rate": 1.0929146933908318e-05, "loss": 0.3976, "num_input_tokens_seen": 7128747520, "step": 1835, "train_runtime": 49473.1402, "train_tokens_per_second": 144093.29 }, { "epoch": 0.6512947853848883, "grad_norm": 0.2158123105764389, "learning_rate": 1.0909245762504055e-05, "loss": 0.3955, "num_input_tokens_seen": 7132740484, "step": 1836, "train_runtime": 49495.5577, "train_tokens_per_second": 144108.7 }, { "epoch": 0.6516495211067754, "grad_norm": 0.23956823348999023, "learning_rate": 1.0889355929718541e-05, "loss": 0.3984, "num_input_tokens_seen": 7136627598, "step": 1837, "train_runtime": 49522.0244, "train_tokens_per_second": 144110.175 }, { "epoch": 0.6520042568286627, "grad_norm": 0.26248666644096375, "learning_rate": 1.0869477460359756e-05, "loss": 0.3964, "num_input_tokens_seen": 7140530453, "step": 1838, "train_runtime": 49543.0452, "train_tokens_per_second": 144127.807 }, { "epoch": 0.6523589925505499, "grad_norm": 0.19771386682987213, "learning_rate": 1.0849610379221508e-05, "loss": 0.4016, "num_input_tokens_seen": 7144415603, "step": 1839, "train_runtime": 49570.9414, "train_tokens_per_second": 144125.074 }, { "epoch": 0.652713728272437, "grad_norm": 0.2419430911540985, "learning_rate": 1.08297547110834e-05, "loss": 0.4066, "num_input_tokens_seen": 7148253999, "step": 1840, "train_runtime": 49603.8157, "train_tokens_per_second": 144106.938 }, { "epoch": 0.6530684639943243, "grad_norm": 0.21368268132209778, "learning_rate": 1.0809910480710793e-05, "loss": 0.3996, "num_input_tokens_seen": 7152205398, "step": 1841, "train_runtime": 49635.5184, "train_tokens_per_second": 144094.504 }, { "epoch": 0.6534231997162114, "grad_norm": 0.26195481419563293, "learning_rate": 1.07900777128548e-05, "loss": 0.3964, "num_input_tokens_seen": 7156195984, "step": 1842, "train_runtime": 49669.7639, "train_tokens_per_second": 144075.498 }, { "epoch": 0.6537779354380986, "grad_norm": 0.22672998905181885, "learning_rate": 1.0770256432252193e-05, "loss": 0.3869, "num_input_tokens_seen": 7160033477, "step": 1843, "train_runtime": 49698.4992, "train_tokens_per_second": 144069.41 }, { "epoch": 0.6541326711599859, "grad_norm": 0.7583263516426086, "learning_rate": 1.0750446663625476e-05, "loss": 0.3844, "num_input_tokens_seen": 7163950031, "step": 1844, "train_runtime": 49717.9994, "train_tokens_per_second": 144091.679 }, { "epoch": 0.654487406881873, "grad_norm": 0.2126823216676712, "learning_rate": 1.0730648431682754e-05, "loss": 0.4044, "num_input_tokens_seen": 7167854530, "step": 1845, "train_runtime": 49743.2213, "train_tokens_per_second": 144097.112 }, { "epoch": 0.6548421426037602, "grad_norm": 0.22527992725372314, "learning_rate": 1.0710861761117756e-05, "loss": 0.3881, "num_input_tokens_seen": 7171712696, "step": 1846, "train_runtime": 49768.3195, "train_tokens_per_second": 144101.966 }, { "epoch": 0.6551968783256474, "grad_norm": 0.32248809933662415, "learning_rate": 1.069108667660979e-05, "loss": 0.4051, "num_input_tokens_seen": 7175577762, "step": 1847, "train_runtime": 49801.8499, "train_tokens_per_second": 144082.555 }, { "epoch": 0.6555516140475346, "grad_norm": 0.22402553260326385, "learning_rate": 1.067132320282371e-05, "loss": 0.3971, "num_input_tokens_seen": 7179492644, "step": 1848, "train_runtime": 49820.2739, "train_tokens_per_second": 144107.852 }, { "epoch": 0.6559063497694217, "grad_norm": 0.23994439840316772, "learning_rate": 1.0651571364409897e-05, "loss": 0.4055, "num_input_tokens_seen": 7183363782, "step": 1849, "train_runtime": 49840.1812, "train_tokens_per_second": 144127.963 }, { "epoch": 0.656261085491309, "grad_norm": 0.23013997077941895, "learning_rate": 1.06318311860042e-05, "loss": 0.3996, "num_input_tokens_seen": 7187233768, "step": 1850, "train_runtime": 49871.8501, "train_tokens_per_second": 144114.039 }, { "epoch": 0.6566158212131962, "grad_norm": 0.24056412279605865, "learning_rate": 1.0612102692227944e-05, "loss": 0.4056, "num_input_tokens_seen": 7191168169, "step": 1851, "train_runtime": 49895.1319, "train_tokens_per_second": 144125.647 }, { "epoch": 0.6569705569350833, "grad_norm": 0.20812740921974182, "learning_rate": 1.0592385907687875e-05, "loss": 0.4033, "num_input_tokens_seen": 7195066181, "step": 1852, "train_runtime": 49918.9591, "train_tokens_per_second": 144134.94 }, { "epoch": 0.6573252926569706, "grad_norm": 0.24930483102798462, "learning_rate": 1.0572680856976131e-05, "loss": 0.405, "num_input_tokens_seen": 7198911094, "step": 1853, "train_runtime": 49946.0005, "train_tokens_per_second": 144133.885 }, { "epoch": 0.6576800283788578, "grad_norm": 0.24172507226467133, "learning_rate": 1.0552987564670221e-05, "loss": 0.4029, "num_input_tokens_seen": 7202824749, "step": 1854, "train_runtime": 49965.565, "train_tokens_per_second": 144155.775 }, { "epoch": 0.6580347641007449, "grad_norm": 0.21902009844779968, "learning_rate": 1.053330605533299e-05, "loss": 0.3854, "num_input_tokens_seen": 7206697228, "step": 1855, "train_runtime": 49992.5589, "train_tokens_per_second": 144155.398 }, { "epoch": 0.6583894998226322, "grad_norm": 0.6218721270561218, "learning_rate": 1.0513636353512562e-05, "loss": 0.3902, "num_input_tokens_seen": 7210525386, "step": 1856, "train_runtime": 50012.2153, "train_tokens_per_second": 144175.285 }, { "epoch": 0.6587442355445193, "grad_norm": 0.26162290573120117, "learning_rate": 1.0493978483742362e-05, "loss": 0.3944, "num_input_tokens_seen": 7214483440, "step": 1857, "train_runtime": 50038.2917, "train_tokens_per_second": 144179.252 }, { "epoch": 0.6590989712664065, "grad_norm": 0.29853057861328125, "learning_rate": 1.047433247054105e-05, "loss": 0.3987, "num_input_tokens_seen": 7218246457, "step": 1858, "train_runtime": 50060.0449, "train_tokens_per_second": 144191.77 }, { "epoch": 0.6594537069882938, "grad_norm": 0.26954489946365356, "learning_rate": 1.0454698338412493e-05, "loss": 0.3928, "num_input_tokens_seen": 7222166962, "step": 1859, "train_runtime": 50079.4444, "train_tokens_per_second": 144214.199 }, { "epoch": 0.6598084427101809, "grad_norm": 0.2318904846906662, "learning_rate": 1.0435076111845741e-05, "loss": 0.4045, "num_input_tokens_seen": 7226027651, "step": 1860, "train_runtime": 50101.3925, "train_tokens_per_second": 144228.08 }, { "epoch": 0.6601631784320681, "grad_norm": 0.3221368193626404, "learning_rate": 1.0415465815314995e-05, "loss": 0.393, "num_input_tokens_seen": 7229880803, "step": 1861, "train_runtime": 50133.8408, "train_tokens_per_second": 144211.588 }, { "epoch": 0.6605179141539553, "grad_norm": 0.26431652903556824, "learning_rate": 1.0395867473279578e-05, "loss": 0.3991, "num_input_tokens_seen": 7233791833, "step": 1862, "train_runtime": 50159.9604, "train_tokens_per_second": 144214.464 }, { "epoch": 0.6608726498758425, "grad_norm": 0.22016966342926025, "learning_rate": 1.03762811101839e-05, "loss": 0.401, "num_input_tokens_seen": 7237660065, "step": 1863, "train_runtime": 50185.5154, "train_tokens_per_second": 144218.108 }, { "epoch": 0.6612273855977296, "grad_norm": 1.221964716911316, "learning_rate": 1.0356706750457429e-05, "loss": 0.394, "num_input_tokens_seen": 7241583794, "step": 1864, "train_runtime": 50206.7555, "train_tokens_per_second": 144235.247 }, { "epoch": 0.6615821213196169, "grad_norm": 0.2780468463897705, "learning_rate": 1.0337144418514664e-05, "loss": 0.3958, "num_input_tokens_seen": 7245491056, "step": 1865, "train_runtime": 50236.6579, "train_tokens_per_second": 144227.171 }, { "epoch": 0.6619368570415041, "grad_norm": 0.2430473417043686, "learning_rate": 1.0317594138755101e-05, "loss": 0.3983, "num_input_tokens_seen": 7249311090, "step": 1866, "train_runtime": 50261.9156, "train_tokens_per_second": 144230.696 }, { "epoch": 0.6622915927633912, "grad_norm": 0.2696076035499573, "learning_rate": 1.0298055935563212e-05, "loss": 0.3829, "num_input_tokens_seen": 7253239126, "step": 1867, "train_runtime": 50290.9657, "train_tokens_per_second": 144225.489 }, { "epoch": 0.6626463284852785, "grad_norm": 0.21783383190631866, "learning_rate": 1.0278529833308382e-05, "loss": 0.393, "num_input_tokens_seen": 7257083296, "step": 1868, "train_runtime": 50313.2552, "train_tokens_per_second": 144238.0 }, { "epoch": 0.6630010642071656, "grad_norm": 0.23600219190120697, "learning_rate": 1.0259015856344927e-05, "loss": 0.3897, "num_input_tokens_seen": 7260980252, "step": 1869, "train_runtime": 50338.4563, "train_tokens_per_second": 144243.205 }, { "epoch": 0.6633557999290528, "grad_norm": 0.30577191710472107, "learning_rate": 1.0239514029012035e-05, "loss": 0.3953, "num_input_tokens_seen": 7264856178, "step": 1870, "train_runtime": 50365.1823, "train_tokens_per_second": 144243.619 }, { "epoch": 0.6637105356509401, "grad_norm": 0.2903383672237396, "learning_rate": 1.0220024375633733e-05, "loss": 0.3952, "num_input_tokens_seen": 7268689045, "step": 1871, "train_runtime": 50384.6682, "train_tokens_per_second": 144263.906 }, { "epoch": 0.6640652713728272, "grad_norm": 0.9937180876731873, "learning_rate": 1.0200546920518865e-05, "loss": 0.3822, "num_input_tokens_seen": 7272549259, "step": 1872, "train_runtime": 50408.4695, "train_tokens_per_second": 144272.368 }, { "epoch": 0.6644200070947145, "grad_norm": 0.23687131702899933, "learning_rate": 1.0181081687961067e-05, "loss": 0.395, "num_input_tokens_seen": 7276379735, "step": 1873, "train_runtime": 50433.928, "train_tokens_per_second": 144275.491 }, { "epoch": 0.6647747428166016, "grad_norm": 0.3157918453216553, "learning_rate": 1.0161628702238736e-05, "loss": 0.3954, "num_input_tokens_seen": 7280276983, "step": 1874, "train_runtime": 50455.3285, "train_tokens_per_second": 144291.539 }, { "epoch": 0.6651294785384888, "grad_norm": 0.2506940960884094, "learning_rate": 1.0142187987614962e-05, "loss": 0.399, "num_input_tokens_seen": 7284189565, "step": 1875, "train_runtime": 50481.1715, "train_tokens_per_second": 144295.177 }, { "epoch": 0.6654842142603761, "grad_norm": 0.24229153990745544, "learning_rate": 1.0122759568337566e-05, "loss": 0.3912, "num_input_tokens_seen": 7288041573, "step": 1876, "train_runtime": 50508.6574, "train_tokens_per_second": 144292.918 }, { "epoch": 0.6658389499822632, "grad_norm": 0.29133346676826477, "learning_rate": 1.0103343468639016e-05, "loss": 0.3834, "num_input_tokens_seen": 7291929081, "step": 1877, "train_runtime": 50530.3597, "train_tokens_per_second": 144307.88 }, { "epoch": 0.6661936857041504, "grad_norm": 0.2528546452522278, "learning_rate": 1.0083939712736414e-05, "loss": 0.3919, "num_input_tokens_seen": 7295850599, "step": 1878, "train_runtime": 50550.6881, "train_tokens_per_second": 144327.424 }, { "epoch": 0.6665484214260377, "grad_norm": 0.2528163194656372, "learning_rate": 1.0064548324831475e-05, "loss": 0.3923, "num_input_tokens_seen": 7299660283, "step": 1879, "train_runtime": 50572.8779, "train_tokens_per_second": 144339.428 }, { "epoch": 0.6669031571479248, "grad_norm": 0.2777249813079834, "learning_rate": 1.004516932911048e-05, "loss": 0.3944, "num_input_tokens_seen": 7303595450, "step": 1880, "train_runtime": 50610.616, "train_tokens_per_second": 144309.555 }, { "epoch": 0.667257892869812, "grad_norm": 0.24569062888622284, "learning_rate": 1.0025802749744256e-05, "loss": 0.4013, "num_input_tokens_seen": 7307470214, "step": 1881, "train_runtime": 50637.5568, "train_tokens_per_second": 144309.297 }, { "epoch": 0.6676126285916992, "grad_norm": 0.2746468484401703, "learning_rate": 1.000644861088814e-05, "loss": 0.3937, "num_input_tokens_seen": 7311350243, "step": 1882, "train_runtime": 50664.0809, "train_tokens_per_second": 144310.33 }, { "epoch": 0.6679673643135864, "grad_norm": 0.21846577525138855, "learning_rate": 9.987106936681958e-06, "loss": 0.3931, "num_input_tokens_seen": 7315223870, "step": 1883, "train_runtime": 50681.2507, "train_tokens_per_second": 144337.872 }, { "epoch": 0.6683221000354735, "grad_norm": 0.3146246373653412, "learning_rate": 9.967777751249987e-06, "loss": 0.3996, "num_input_tokens_seen": 7319158965, "step": 1884, "train_runtime": 50703.919, "train_tokens_per_second": 144350.952 }, { "epoch": 0.6686768357573608, "grad_norm": 0.2759598195552826, "learning_rate": 9.948461078700926e-06, "loss": 0.3995, "num_input_tokens_seen": 7323072652, "step": 1885, "train_runtime": 50746.6084, "train_tokens_per_second": 144306.642 }, { "epoch": 0.669031571479248, "grad_norm": 0.2689201235771179, "learning_rate": 9.929156943127876e-06, "loss": 0.396, "num_input_tokens_seen": 7326935648, "step": 1886, "train_runtime": 50765.6302, "train_tokens_per_second": 144328.665 }, { "epoch": 0.6693863072011351, "grad_norm": 0.5129528641700745, "learning_rate": 9.909865368608275e-06, "loss": 0.3962, "num_input_tokens_seen": 7330865879, "step": 1887, "train_runtime": 50798.4352, "train_tokens_per_second": 144312.829 }, { "epoch": 0.6697410429230224, "grad_norm": 0.30360928177833557, "learning_rate": 9.890586379203923e-06, "loss": 0.3952, "num_input_tokens_seen": 7334618701, "step": 1888, "train_runtime": 50825.5607, "train_tokens_per_second": 144309.647 }, { "epoch": 0.6700957786449095, "grad_norm": 0.22872842848300934, "learning_rate": 9.87131999896091e-06, "loss": 0.3962, "num_input_tokens_seen": 7338532101, "step": 1889, "train_runtime": 50853.6308, "train_tokens_per_second": 144306.945 }, { "epoch": 0.6704505143667967, "grad_norm": 0.46613165736198425, "learning_rate": 9.852066251909603e-06, "loss": 0.3852, "num_input_tokens_seen": 7342432206, "step": 1890, "train_runtime": 50882.1022, "train_tokens_per_second": 144302.847 }, { "epoch": 0.670805250088684, "grad_norm": 0.2180221974849701, "learning_rate": 9.832825162064605e-06, "loss": 0.3923, "num_input_tokens_seen": 7346394499, "step": 1891, "train_runtime": 50914.2623, "train_tokens_per_second": 144289.521 }, { "epoch": 0.6711599858105711, "grad_norm": 0.32388031482696533, "learning_rate": 9.813596753424747e-06, "loss": 0.4003, "num_input_tokens_seen": 7350260963, "step": 1892, "train_runtime": 50938.7843, "train_tokens_per_second": 144295.964 }, { "epoch": 0.6715147215324583, "grad_norm": 0.28301820158958435, "learning_rate": 9.794381049973018e-06, "loss": 0.3914, "num_input_tokens_seen": 7354180104, "step": 1893, "train_runtime": 50963.9173, "train_tokens_per_second": 144301.704 }, { "epoch": 0.6718694572543455, "grad_norm": 0.2870126962661743, "learning_rate": 9.775178075676586e-06, "loss": 0.3993, "num_input_tokens_seen": 7358019513, "step": 1894, "train_runtime": 51003.7469, "train_tokens_per_second": 144264.294 }, { "epoch": 0.6722241929762327, "grad_norm": 0.26716479659080505, "learning_rate": 9.755987854486726e-06, "loss": 0.4021, "num_input_tokens_seen": 7361942489, "step": 1895, "train_runtime": 51021.6208, "train_tokens_per_second": 144290.643 }, { "epoch": 0.6725789286981199, "grad_norm": 0.2677048146724701, "learning_rate": 9.736810410338815e-06, "loss": 0.3977, "num_input_tokens_seen": 7365744745, "step": 1896, "train_runtime": 51045.0777, "train_tokens_per_second": 144298.825 }, { "epoch": 0.6729336644200071, "grad_norm": 0.2665366530418396, "learning_rate": 9.717645767152289e-06, "loss": 0.3991, "num_input_tokens_seen": 7369700246, "step": 1897, "train_runtime": 51068.2976, "train_tokens_per_second": 144310.67 }, { "epoch": 0.6732884001418943, "grad_norm": 0.2775816023349762, "learning_rate": 9.698493948830618e-06, "loss": 0.4066, "num_input_tokens_seen": 7373639407, "step": 1898, "train_runtime": 51088.1891, "train_tokens_per_second": 144331.587 }, { "epoch": 0.6736431358637814, "grad_norm": 0.2317071110010147, "learning_rate": 9.679354979261282e-06, "loss": 0.3971, "num_input_tokens_seen": 7377467142, "step": 1899, "train_runtime": 51106.9965, "train_tokens_per_second": 144353.369 }, { "epoch": 0.6739978715856687, "grad_norm": 0.4138202965259552, "learning_rate": 9.660228882315725e-06, "loss": 0.3999, "num_input_tokens_seen": 7381409039, "step": 1900, "train_runtime": 51129.3443, "train_tokens_per_second": 144367.371 }, { "epoch": 0.6743526073075559, "grad_norm": 0.2782776653766632, "learning_rate": 9.641115681849347e-06, "loss": 0.3908, "num_input_tokens_seen": 7385302694, "step": 1901, "train_runtime": 51158.9209, "train_tokens_per_second": 144360.017 }, { "epoch": 0.674707343029443, "grad_norm": 0.25928133726119995, "learning_rate": 9.62201540170145e-06, "loss": 0.3998, "num_input_tokens_seen": 7389144588, "step": 1902, "train_runtime": 51183.8901, "train_tokens_per_second": 144364.654 }, { "epoch": 0.6750620787513303, "grad_norm": 0.3207211196422577, "learning_rate": 9.602928065695229e-06, "loss": 0.3943, "num_input_tokens_seen": 7393021864, "step": 1903, "train_runtime": 51209.9761, "train_tokens_per_second": 144366.829 }, { "epoch": 0.6754168144732174, "grad_norm": 0.2594163715839386, "learning_rate": 9.583853697637734e-06, "loss": 0.3865, "num_input_tokens_seen": 7396869119, "step": 1904, "train_runtime": 51236.533, "train_tokens_per_second": 144367.089 }, { "epoch": 0.6757715501951046, "grad_norm": 0.3749329447746277, "learning_rate": 9.564792321319846e-06, "loss": 0.3948, "num_input_tokens_seen": 7400776811, "step": 1905, "train_runtime": 51268.4859, "train_tokens_per_second": 144353.333 }, { "epoch": 0.6761262859169919, "grad_norm": 0.34989309310913086, "learning_rate": 9.545743960516218e-06, "loss": 0.3913, "num_input_tokens_seen": 7404680327, "step": 1906, "train_runtime": 51287.8434, "train_tokens_per_second": 144374.96 }, { "epoch": 0.676481021638879, "grad_norm": 0.22281792759895325, "learning_rate": 9.526708638985296e-06, "loss": 0.3961, "num_input_tokens_seen": 7408593442, "step": 1907, "train_runtime": 51316.657, "train_tokens_per_second": 144370.15 }, { "epoch": 0.6768357573607662, "grad_norm": 0.8007925748825073, "learning_rate": 9.50768638046925e-06, "loss": 0.3923, "num_input_tokens_seen": 7412463568, "step": 1908, "train_runtime": 51348.7032, "train_tokens_per_second": 144355.419 }, { "epoch": 0.6771904930826534, "grad_norm": 0.22571665048599243, "learning_rate": 9.48867720869396e-06, "loss": 0.3881, "num_input_tokens_seen": 7416391924, "step": 1909, "train_runtime": 51378.124, "train_tokens_per_second": 144349.216 }, { "epoch": 0.6775452288045406, "grad_norm": 0.33733198046684265, "learning_rate": 9.469681147368982e-06, "loss": 0.3907, "num_input_tokens_seen": 7420278800, "step": 1910, "train_runtime": 51407.4942, "train_tokens_per_second": 144342.355 }, { "epoch": 0.6778999645264278, "grad_norm": 0.2416616529226303, "learning_rate": 9.450698220187528e-06, "loss": 0.3902, "num_input_tokens_seen": 7424175994, "step": 1911, "train_runtime": 51445.3796, "train_tokens_per_second": 144311.813 }, { "epoch": 0.678254700248315, "grad_norm": 0.358969122171402, "learning_rate": 9.431728450826408e-06, "loss": 0.4033, "num_input_tokens_seen": 7428051007, "step": 1912, "train_runtime": 51468.1465, "train_tokens_per_second": 144323.266 }, { "epoch": 0.6786094359702022, "grad_norm": 0.3551838994026184, "learning_rate": 9.412771862946038e-06, "loss": 0.4023, "num_input_tokens_seen": 7431954668, "step": 1913, "train_runtime": 51494.4222, "train_tokens_per_second": 144325.431 }, { "epoch": 0.6789641716920894, "grad_norm": 0.47825756669044495, "learning_rate": 9.393828480190387e-06, "loss": 0.397, "num_input_tokens_seen": 7435858239, "step": 1914, "train_runtime": 51525.5783, "train_tokens_per_second": 144313.921 }, { "epoch": 0.6793189074139766, "grad_norm": 0.23075219988822937, "learning_rate": 9.374898326186959e-06, "loss": 0.3927, "num_input_tokens_seen": 7439739962, "step": 1915, "train_runtime": 51541.042, "train_tokens_per_second": 144345.936 }, { "epoch": 0.6796736431358638, "grad_norm": 0.26134201884269714, "learning_rate": 9.35598142454675e-06, "loss": 0.3894, "num_input_tokens_seen": 7443641713, "step": 1916, "train_runtime": 51559.6574, "train_tokens_per_second": 144369.495 }, { "epoch": 0.680028378857751, "grad_norm": 0.42492061853408813, "learning_rate": 9.337077798864231e-06, "loss": 0.4026, "num_input_tokens_seen": 7447556076, "step": 1917, "train_runtime": 51578.2164, "train_tokens_per_second": 144393.44 }, { "epoch": 0.6803831145796382, "grad_norm": 0.257869154214859, "learning_rate": 9.318187472717319e-06, "loss": 0.3908, "num_input_tokens_seen": 7451436830, "step": 1918, "train_runtime": 51609.8933, "train_tokens_per_second": 144380.008 }, { "epoch": 0.6807378503015253, "grad_norm": 0.2848484516143799, "learning_rate": 9.299310469667333e-06, "loss": 0.4019, "num_input_tokens_seen": 7455316461, "step": 1919, "train_runtime": 51635.5173, "train_tokens_per_second": 144383.495 }, { "epoch": 0.6810925860234126, "grad_norm": 0.2645987868309021, "learning_rate": 9.280446813258982e-06, "loss": 0.3984, "num_input_tokens_seen": 7459206267, "step": 1920, "train_runtime": 51665.5188, "train_tokens_per_second": 144374.942 }, { "epoch": 0.6814473217452998, "grad_norm": 0.22541047632694244, "learning_rate": 9.261596527020324e-06, "loss": 0.3951, "num_input_tokens_seen": 7463037374, "step": 1921, "train_runtime": 51684.5387, "train_tokens_per_second": 144395.936 }, { "epoch": 0.6818020574671869, "grad_norm": 0.3084311783313751, "learning_rate": 9.242759634462743e-06, "loss": 0.3959, "num_input_tokens_seen": 7466933399, "step": 1922, "train_runtime": 51710.1743, "train_tokens_per_second": 144399.695 }, { "epoch": 0.6821567931890742, "grad_norm": 2.635568141937256, "learning_rate": 9.223936159080926e-06, "loss": 0.4017, "num_input_tokens_seen": 7470814354, "step": 1923, "train_runtime": 51736.8699, "train_tokens_per_second": 144400.2 }, { "epoch": 0.6825115289109613, "grad_norm": 0.2886437177658081, "learning_rate": 9.2051261243528e-06, "loss": 0.388, "num_input_tokens_seen": 7474768358, "step": 1924, "train_runtime": 51758.6108, "train_tokens_per_second": 144415.939 }, { "epoch": 0.6828662646328485, "grad_norm": 0.2525988221168518, "learning_rate": 9.186329553739552e-06, "loss": 0.3963, "num_input_tokens_seen": 7478637975, "step": 1925, "train_runtime": 51792.8417, "train_tokens_per_second": 144395.205 }, { "epoch": 0.6832210003547358, "grad_norm": 0.3055419921875, "learning_rate": 9.167546470685567e-06, "loss": 0.3965, "num_input_tokens_seen": 7482511044, "step": 1926, "train_runtime": 51817.1812, "train_tokens_per_second": 144402.124 }, { "epoch": 0.6835757360766229, "grad_norm": 0.27304020524024963, "learning_rate": 9.148776898618408e-06, "loss": 0.3868, "num_input_tokens_seen": 7486393916, "step": 1927, "train_runtime": 51848.856, "train_tokens_per_second": 144388.797 }, { "epoch": 0.6839304717985101, "grad_norm": 0.23826870322227478, "learning_rate": 9.130020860948786e-06, "loss": 0.3984, "num_input_tokens_seen": 7490241299, "step": 1928, "train_runtime": 51878.7364, "train_tokens_per_second": 144379.794 }, { "epoch": 0.6842852075203973, "grad_norm": 0.37184959650039673, "learning_rate": 9.111278381070538e-06, "loss": 0.3967, "num_input_tokens_seen": 7494188357, "step": 1929, "train_runtime": 51898.2136, "train_tokens_per_second": 144401.663 }, { "epoch": 0.6846399432422845, "grad_norm": 0.23322296142578125, "learning_rate": 9.092549482360571e-06, "loss": 0.3988, "num_input_tokens_seen": 7498093936, "step": 1930, "train_runtime": 51923.7999, "train_tokens_per_second": 144405.724 }, { "epoch": 0.6849946789641717, "grad_norm": 0.30996280908584595, "learning_rate": 9.073834188178872e-06, "loss": 0.3921, "num_input_tokens_seen": 7501988159, "step": 1931, "train_runtime": 51941.3989, "train_tokens_per_second": 144431.77 }, { "epoch": 0.6853494146860589, "grad_norm": 0.3017365336418152, "learning_rate": 9.055132521868452e-06, "loss": 0.4022, "num_input_tokens_seen": 7505925905, "step": 1932, "train_runtime": 51968.0944, "train_tokens_per_second": 144433.349 }, { "epoch": 0.6857041504079461, "grad_norm": 0.25434139370918274, "learning_rate": 9.036444506755322e-06, "loss": 0.4017, "num_input_tokens_seen": 7509712670, "step": 1933, "train_runtime": 51994.4534, "train_tokens_per_second": 144432.957 }, { "epoch": 0.6860588861298332, "grad_norm": 0.28697600960731506, "learning_rate": 9.01777016614848e-06, "loss": 0.3926, "num_input_tokens_seen": 7513675740, "step": 1934, "train_runtime": 52013.759, "train_tokens_per_second": 144455.542 }, { "epoch": 0.6864136218517205, "grad_norm": 0.22553640604019165, "learning_rate": 8.99910952333985e-06, "loss": 0.3952, "num_input_tokens_seen": 7517530643, "step": 1935, "train_runtime": 52037.6349, "train_tokens_per_second": 144463.342 }, { "epoch": 0.6867683575736077, "grad_norm": 0.2570045292377472, "learning_rate": 8.980462601604291e-06, "loss": 0.4035, "num_input_tokens_seen": 7521410280, "step": 1936, "train_runtime": 52063.4184, "train_tokens_per_second": 144466.317 }, { "epoch": 0.6871230932954948, "grad_norm": 0.2543066143989563, "learning_rate": 8.96182942419952e-06, "loss": 0.3935, "num_input_tokens_seen": 7525307642, "step": 1937, "train_runtime": 52081.96, "train_tokens_per_second": 144489.717 }, { "epoch": 0.6874778290173821, "grad_norm": 0.27995166182518005, "learning_rate": 8.943210014366131e-06, "loss": 0.3935, "num_input_tokens_seen": 7529210747, "step": 1938, "train_runtime": 52107.9999, "train_tokens_per_second": 144492.415 }, { "epoch": 0.6878325647392692, "grad_norm": 0.2574532628059387, "learning_rate": 8.924604395327546e-06, "loss": 0.396, "num_input_tokens_seen": 7533072936, "step": 1939, "train_runtime": 52127.2541, "train_tokens_per_second": 144513.135 }, { "epoch": 0.6881873004611564, "grad_norm": 0.33331355452537537, "learning_rate": 8.90601259028998e-06, "loss": 0.4048, "num_input_tokens_seen": 7536992185, "step": 1940, "train_runtime": 52165.7855, "train_tokens_per_second": 144481.524 }, { "epoch": 0.6885420361830437, "grad_norm": 0.25276580452919006, "learning_rate": 8.887434622442426e-06, "loss": 0.3901, "num_input_tokens_seen": 7540845404, "step": 1941, "train_runtime": 52185.3032, "train_tokens_per_second": 144501.324 }, { "epoch": 0.6888967719049308, "grad_norm": 0.2527956962585449, "learning_rate": 8.868870514956613e-06, "loss": 0.3952, "num_input_tokens_seen": 7544755832, "step": 1942, "train_runtime": 52206.6769, "train_tokens_per_second": 144517.067 }, { "epoch": 0.689251507626818, "grad_norm": 0.2085837572813034, "learning_rate": 8.850320290986973e-06, "loss": 0.3908, "num_input_tokens_seen": 7548657621, "step": 1943, "train_runtime": 52235.6343, "train_tokens_per_second": 144511.648 }, { "epoch": 0.6896062433487052, "grad_norm": 0.21445277333259583, "learning_rate": 8.831783973670638e-06, "loss": 0.4019, "num_input_tokens_seen": 7552558705, "step": 1944, "train_runtime": 52255.9461, "train_tokens_per_second": 144530.131 }, { "epoch": 0.6899609790705924, "grad_norm": 0.1980811059474945, "learning_rate": 8.813261586127386e-06, "loss": 0.3884, "num_input_tokens_seen": 7556432792, "step": 1945, "train_runtime": 52283.823, "train_tokens_per_second": 144527.166 }, { "epoch": 0.6903157147924796, "grad_norm": 0.32807567715644836, "learning_rate": 8.794753151459626e-06, "loss": 0.3973, "num_input_tokens_seen": 7560345288, "step": 1946, "train_runtime": 52300.8464, "train_tokens_per_second": 144554.932 }, { "epoch": 0.6906704505143668, "grad_norm": 0.2720105051994324, "learning_rate": 8.776258692752355e-06, "loss": 0.4008, "num_input_tokens_seen": 7564204537, "step": 1947, "train_runtime": 52326.6355, "train_tokens_per_second": 144557.441 }, { "epoch": 0.691025186236254, "grad_norm": 0.26367998123168945, "learning_rate": 8.757778233073153e-06, "loss": 0.3951, "num_input_tokens_seen": 7568084302, "step": 1948, "train_runtime": 52352.6808, "train_tokens_per_second": 144559.633 }, { "epoch": 0.6913799219581411, "grad_norm": 0.28478091955184937, "learning_rate": 8.739311795472112e-06, "loss": 0.3888, "num_input_tokens_seen": 7571977307, "step": 1949, "train_runtime": 52377.2142, "train_tokens_per_second": 144566.247 }, { "epoch": 0.6917346576800284, "grad_norm": 0.2640671133995056, "learning_rate": 8.72085940298187e-06, "loss": 0.3988, "num_input_tokens_seen": 7575847553, "step": 1950, "train_runtime": 52409.0693, "train_tokens_per_second": 144552.224 }, { "epoch": 0.6920893934019156, "grad_norm": 0.20318703353405, "learning_rate": 8.702421078617525e-06, "loss": 0.3864, "num_input_tokens_seen": 7579750497, "step": 1951, "train_runtime": 52444.4172, "train_tokens_per_second": 144529.216 }, { "epoch": 0.6924441291238028, "grad_norm": 0.309520959854126, "learning_rate": 8.68399684537663e-06, "loss": 0.3947, "num_input_tokens_seen": 7583677903, "step": 1952, "train_runtime": 52470.8545, "train_tokens_per_second": 144531.245 }, { "epoch": 0.69279886484569, "grad_norm": 0.3409821093082428, "learning_rate": 8.66558672623917e-06, "loss": 0.3972, "num_input_tokens_seen": 7587544662, "step": 1953, "train_runtime": 52492.3424, "train_tokens_per_second": 144545.743 }, { "epoch": 0.6931536005675771, "grad_norm": 0.2848144471645355, "learning_rate": 8.647190744167528e-06, "loss": 0.3926, "num_input_tokens_seen": 7591399421, "step": 1954, "train_runtime": 52525.2951, "train_tokens_per_second": 144528.449 }, { "epoch": 0.6935083362894644, "grad_norm": 0.3258538544178009, "learning_rate": 8.628808922106432e-06, "loss": 0.3861, "num_input_tokens_seen": 7595381688, "step": 1955, "train_runtime": 52558.4031, "train_tokens_per_second": 144513.175 }, { "epoch": 0.6938630720113516, "grad_norm": 0.2491893768310547, "learning_rate": 8.610441282982971e-06, "loss": 0.3917, "num_input_tokens_seen": 7599242150, "step": 1956, "train_runtime": 52582.0393, "train_tokens_per_second": 144521.632 }, { "epoch": 0.6942178077332387, "grad_norm": 0.30618560314178467, "learning_rate": 8.59208784970654e-06, "loss": 0.3938, "num_input_tokens_seen": 7603146262, "step": 1957, "train_runtime": 52614.8995, "train_tokens_per_second": 144505.574 }, { "epoch": 0.694572543455126, "grad_norm": 0.2446652501821518, "learning_rate": 8.573748645168811e-06, "loss": 0.3871, "num_input_tokens_seen": 7606988056, "step": 1958, "train_runtime": 52644.5115, "train_tokens_per_second": 144497.267 }, { "epoch": 0.6949272791770131, "grad_norm": 0.27547210454940796, "learning_rate": 8.555423692243711e-06, "loss": 0.3952, "num_input_tokens_seen": 7610963268, "step": 1959, "train_runtime": 52663.5935, "train_tokens_per_second": 144520.394 }, { "epoch": 0.6952820148989003, "grad_norm": 0.2784478962421417, "learning_rate": 8.5371130137874e-06, "loss": 0.3911, "num_input_tokens_seen": 7614799378, "step": 1960, "train_runtime": 52686.8197, "train_tokens_per_second": 144529.494 }, { "epoch": 0.6956367506207876, "grad_norm": 0.28784358501434326, "learning_rate": 8.51881663263821e-06, "loss": 0.402, "num_input_tokens_seen": 7618702214, "step": 1961, "train_runtime": 52709.226, "train_tokens_per_second": 144542.1 }, { "epoch": 0.6959914863426747, "grad_norm": 1.2510405778884888, "learning_rate": 8.500534571616663e-06, "loss": 0.3943, "num_input_tokens_seen": 7622572561, "step": 1962, "train_runtime": 52729.1533, "train_tokens_per_second": 144560.875 }, { "epoch": 0.6963462220645619, "grad_norm": 0.3084670305252075, "learning_rate": 8.482266853525411e-06, "loss": 0.4023, "num_input_tokens_seen": 7626405778, "step": 1963, "train_runtime": 52748.9272, "train_tokens_per_second": 144579.353 }, { "epoch": 0.6967009577864491, "grad_norm": 0.42808717489242554, "learning_rate": 8.464013501149221e-06, "loss": 0.3927, "num_input_tokens_seen": 7630313497, "step": 1964, "train_runtime": 52768.5655, "train_tokens_per_second": 144599.601 }, { "epoch": 0.6970556935083363, "grad_norm": 0.294259637594223, "learning_rate": 8.445774537254938e-06, "loss": 0.4014, "num_input_tokens_seen": 7634130316, "step": 1965, "train_runtime": 52800.112, "train_tokens_per_second": 144585.495 }, { "epoch": 0.6974104292302234, "grad_norm": 0.2687172293663025, "learning_rate": 8.427549984591461e-06, "loss": 0.4037, "num_input_tokens_seen": 7638017014, "step": 1966, "train_runtime": 52825.4745, "train_tokens_per_second": 144589.653 }, { "epoch": 0.6977651649521107, "grad_norm": 0.24645614624023438, "learning_rate": 8.409339865889716e-06, "loss": 0.4012, "num_input_tokens_seen": 7641887677, "step": 1967, "train_runtime": 52844.5842, "train_tokens_per_second": 144610.612 }, { "epoch": 0.6981199006739979, "grad_norm": 0.28496053814888, "learning_rate": 8.391144203862625e-06, "loss": 0.3881, "num_input_tokens_seen": 7645779150, "step": 1968, "train_runtime": 52869.2588, "train_tokens_per_second": 144616.727 }, { "epoch": 0.698474636395885, "grad_norm": 0.2358340620994568, "learning_rate": 8.37296302120508e-06, "loss": 0.3947, "num_input_tokens_seen": 7649701344, "step": 1969, "train_runtime": 52890.4702, "train_tokens_per_second": 144632.886 }, { "epoch": 0.6988293721177723, "grad_norm": 0.4968493580818176, "learning_rate": 8.354796340593909e-06, "loss": 0.3889, "num_input_tokens_seen": 7653571340, "step": 1970, "train_runtime": 52911.9191, "train_tokens_per_second": 144647.397 }, { "epoch": 0.6991841078396595, "grad_norm": 0.26885905861854553, "learning_rate": 8.33664418468786e-06, "loss": 0.396, "num_input_tokens_seen": 7657464394, "step": 1971, "train_runtime": 52938.6337, "train_tokens_per_second": 144647.942 }, { "epoch": 0.6995388435615466, "grad_norm": 0.264447420835495, "learning_rate": 8.318506576127557e-06, "loss": 0.3957, "num_input_tokens_seen": 7661333668, "step": 1972, "train_runtime": 52957.9797, "train_tokens_per_second": 144668.164 }, { "epoch": 0.6998935792834339, "grad_norm": 0.2743479907512665, "learning_rate": 8.300383537535485e-06, "loss": 0.385, "num_input_tokens_seen": 7665214383, "step": 1973, "train_runtime": 52977.8529, "train_tokens_per_second": 144687.147 }, { "epoch": 0.700248315005321, "grad_norm": 0.26376375555992126, "learning_rate": 8.28227509151595e-06, "loss": 0.4017, "num_input_tokens_seen": 7669089287, "step": 1974, "train_runtime": 53002.6322, "train_tokens_per_second": 144692.612 }, { "epoch": 0.7006030507272082, "grad_norm": 0.6479829549789429, "learning_rate": 8.26418126065506e-06, "loss": 0.4037, "num_input_tokens_seen": 7672948700, "step": 1975, "train_runtime": 53027.6259, "train_tokens_per_second": 144697.195 }, { "epoch": 0.7009577864490955, "grad_norm": 0.43820086121559143, "learning_rate": 8.246102067520699e-06, "loss": 0.4127, "num_input_tokens_seen": 7676838742, "step": 1976, "train_runtime": 53055.1987, "train_tokens_per_second": 144695.316 }, { "epoch": 0.7013125221709826, "grad_norm": 0.268923282623291, "learning_rate": 8.228037534662485e-06, "loss": 0.395, "num_input_tokens_seen": 7680711710, "step": 1977, "train_runtime": 53082.5499, "train_tokens_per_second": 144693.722 }, { "epoch": 0.7016672578928698, "grad_norm": 0.24014711380004883, "learning_rate": 8.20998768461176e-06, "loss": 0.3996, "num_input_tokens_seen": 7684679373, "step": 1978, "train_runtime": 53105.8732, "train_tokens_per_second": 144704.887 }, { "epoch": 0.702021993614757, "grad_norm": 0.25220751762390137, "learning_rate": 8.191952539881554e-06, "loss": 0.3939, "num_input_tokens_seen": 7688543021, "step": 1979, "train_runtime": 53128.9872, "train_tokens_per_second": 144714.654 }, { "epoch": 0.7023767293366442, "grad_norm": 0.3192920684814453, "learning_rate": 8.173932122966535e-06, "loss": 0.3872, "num_input_tokens_seen": 7692439292, "step": 1980, "train_runtime": 53153.7959, "train_tokens_per_second": 144720.413 }, { "epoch": 0.7027314650585313, "grad_norm": 1.5334653854370117, "learning_rate": 8.155926456343022e-06, "loss": 0.4106, "num_input_tokens_seen": 7696345137, "step": 1981, "train_runtime": 53179.7675, "train_tokens_per_second": 144723.181 }, { "epoch": 0.7030862007804186, "grad_norm": 0.6414632201194763, "learning_rate": 8.13793556246893e-06, "loss": 0.3908, "num_input_tokens_seen": 7700228944, "step": 1982, "train_runtime": 53205.4188, "train_tokens_per_second": 144726.404 }, { "epoch": 0.7034409365023058, "grad_norm": 0.8949713706970215, "learning_rate": 8.119959463783754e-06, "loss": 0.3945, "num_input_tokens_seen": 7704173835, "step": 1983, "train_runtime": 53234.5259, "train_tokens_per_second": 144721.376 }, { "epoch": 0.7037956722241929, "grad_norm": 0.24833470582962036, "learning_rate": 8.101998182708521e-06, "loss": 0.3946, "num_input_tokens_seen": 7708015502, "step": 1984, "train_runtime": 53255.9199, "train_tokens_per_second": 144735.374 }, { "epoch": 0.7041504079460802, "grad_norm": 0.21736283600330353, "learning_rate": 8.084051741645796e-06, "loss": 0.3882, "num_input_tokens_seen": 7711925062, "step": 1985, "train_runtime": 53272.8425, "train_tokens_per_second": 144762.785 }, { "epoch": 0.7045051436679673, "grad_norm": 0.3580147624015808, "learning_rate": 8.066120162979615e-06, "loss": 0.4091, "num_input_tokens_seen": 7715819401, "step": 1986, "train_runtime": 53290.6693, "train_tokens_per_second": 144787.437 }, { "epoch": 0.7048598793898545, "grad_norm": 0.25669851899147034, "learning_rate": 8.048203469075493e-06, "loss": 0.396, "num_input_tokens_seen": 7719703291, "step": 1987, "train_runtime": 53307.4776, "train_tokens_per_second": 144814.642 }, { "epoch": 0.7052146151117418, "grad_norm": 0.28441521525382996, "learning_rate": 8.030301682280366e-06, "loss": 0.3889, "num_input_tokens_seen": 7723590462, "step": 1988, "train_runtime": 53327.0144, "train_tokens_per_second": 144834.481 }, { "epoch": 0.7055693508336289, "grad_norm": 0.32989004254341125, "learning_rate": 8.012414824922587e-06, "loss": 0.3869, "num_input_tokens_seen": 7727487012, "step": 1989, "train_runtime": 53359.3529, "train_tokens_per_second": 144819.729 }, { "epoch": 0.7059240865555162, "grad_norm": 0.20341108739376068, "learning_rate": 7.994542919311886e-06, "loss": 0.3943, "num_input_tokens_seen": 7731400631, "step": 1990, "train_runtime": 53382.4502, "train_tokens_per_second": 144830.382 }, { "epoch": 0.7062788222774034, "grad_norm": 0.2019575536251068, "learning_rate": 7.976685987739338e-06, "loss": 0.398, "num_input_tokens_seen": 7735304621, "step": 1991, "train_runtime": 53404.2265, "train_tokens_per_second": 144844.428 }, { "epoch": 0.7066335579992905, "grad_norm": 0.20933978259563446, "learning_rate": 7.958844052477356e-06, "loss": 0.3858, "num_input_tokens_seen": 7739165071, "step": 1992, "train_runtime": 53430.5001, "train_tokens_per_second": 144845.454 }, { "epoch": 0.7069882937211778, "grad_norm": 0.3293900191783905, "learning_rate": 7.941017135779623e-06, "loss": 0.3984, "num_input_tokens_seen": 7743056767, "step": 1993, "train_runtime": 53456.4136, "train_tokens_per_second": 144848.041 }, { "epoch": 0.7073430294430649, "grad_norm": 0.2303933948278427, "learning_rate": 7.92320525988111e-06, "loss": 0.4102, "num_input_tokens_seen": 7746926170, "step": 1994, "train_runtime": 53478.362, "train_tokens_per_second": 144860.947 }, { "epoch": 0.7076977651649521, "grad_norm": 0.21454940736293793, "learning_rate": 7.905408446998027e-06, "loss": 0.4009, "num_input_tokens_seen": 7750814996, "step": 1995, "train_runtime": 53501.818, "train_tokens_per_second": 144870.124 }, { "epoch": 0.7080525008868394, "grad_norm": 1.3477704524993896, "learning_rate": 7.88762671932779e-06, "loss": 0.3979, "num_input_tokens_seen": 7754704046, "step": 1996, "train_runtime": 53533.2497, "train_tokens_per_second": 144857.712 }, { "epoch": 0.7084072366087265, "grad_norm": 0.25351855158805847, "learning_rate": 7.869860099049003e-06, "loss": 0.3972, "num_input_tokens_seen": 7758598913, "step": 1997, "train_runtime": 53553.7927, "train_tokens_per_second": 144874.873 }, { "epoch": 0.7087619723306137, "grad_norm": 0.20360183715820312, "learning_rate": 7.852108608321432e-06, "loss": 0.3946, "num_input_tokens_seen": 7762453567, "step": 1998, "train_runtime": 53585.9543, "train_tokens_per_second": 144859.855 }, { "epoch": 0.7091167080525009, "grad_norm": 0.22703570127487183, "learning_rate": 7.834372269285954e-06, "loss": 0.3901, "num_input_tokens_seen": 7766344978, "step": 1999, "train_runtime": 53616.8046, "train_tokens_per_second": 144849.083 }, { "epoch": 0.7094714437743881, "grad_norm": 0.3580891191959381, "learning_rate": 7.816651104064567e-06, "loss": 0.4023, "num_input_tokens_seen": 7770097869, "step": 2000, "train_runtime": 53644.3086, "train_tokens_per_second": 144844.776 }, { "epoch": 0.7098261794962752, "grad_norm": 0.2715013325214386, "learning_rate": 7.798945134760337e-06, "loss": 0.4094, "num_input_tokens_seen": 7774061033, "step": 2001, "train_runtime": 53792.818, "train_tokens_per_second": 144518.568 }, { "epoch": 0.7101809152181625, "grad_norm": 0.3807271420955658, "learning_rate": 7.781254383457376e-06, "loss": 0.3942, "num_input_tokens_seen": 7777926497, "step": 2002, "train_runtime": 53813.771, "train_tokens_per_second": 144534.129 }, { "epoch": 0.7105356509400497, "grad_norm": 0.2548302412033081, "learning_rate": 7.763578872220811e-06, "loss": 0.3907, "num_input_tokens_seen": 7781737897, "step": 2003, "train_runtime": 53841.3068, "train_tokens_per_second": 144531.0 }, { "epoch": 0.7108903866619368, "grad_norm": 0.21804042160511017, "learning_rate": 7.745918623096773e-06, "loss": 0.3924, "num_input_tokens_seen": 7785608372, "step": 2004, "train_runtime": 53863.7056, "train_tokens_per_second": 144542.755 }, { "epoch": 0.7112451223838241, "grad_norm": 0.20746053755283356, "learning_rate": 7.728273658112341e-06, "loss": 0.3979, "num_input_tokens_seen": 7789540971, "step": 2005, "train_runtime": 53895.7087, "train_tokens_per_second": 144529.892 }, { "epoch": 0.7115998581057112, "grad_norm": 0.25886571407318115, "learning_rate": 7.710643999275542e-06, "loss": 0.3926, "num_input_tokens_seen": 7793402666, "step": 2006, "train_runtime": 53926.7549, "train_tokens_per_second": 144518.295 }, { "epoch": 0.7119545938275984, "grad_norm": 0.5712423324584961, "learning_rate": 7.69302966857531e-06, "loss": 0.3922, "num_input_tokens_seen": 7797337184, "step": 2007, "train_runtime": 53950.042, "train_tokens_per_second": 144528.844 }, { "epoch": 0.7123093295494857, "grad_norm": 0.2043018937110901, "learning_rate": 7.675430687981454e-06, "loss": 0.3938, "num_input_tokens_seen": 7801228520, "step": 2008, "train_runtime": 53988.478, "train_tokens_per_second": 144498.026 }, { "epoch": 0.7126640652713728, "grad_norm": 0.3361000716686249, "learning_rate": 7.657847079444647e-06, "loss": 0.3767, "num_input_tokens_seen": 7805104499, "step": 2009, "train_runtime": 54017.0792, "train_tokens_per_second": 144493.272 }, { "epoch": 0.71301880099326, "grad_norm": 0.2903348505496979, "learning_rate": 7.640278864896386e-06, "loss": 0.4038, "num_input_tokens_seen": 7809017144, "step": 2010, "train_runtime": 54045.341, "train_tokens_per_second": 144490.108 }, { "epoch": 0.7133735367151472, "grad_norm": 0.23660556972026825, "learning_rate": 7.62272606624896e-06, "loss": 0.3889, "num_input_tokens_seen": 7812872893, "step": 2011, "train_runtime": 54064.6849, "train_tokens_per_second": 144509.728 }, { "epoch": 0.7137282724370344, "grad_norm": 0.23524829745292664, "learning_rate": 7.605188705395434e-06, "loss": 0.3846, "num_input_tokens_seen": 7816738177, "step": 2012, "train_runtime": 54099.631, "train_tokens_per_second": 144487.828 }, { "epoch": 0.7140830081589216, "grad_norm": 0.36851754784584045, "learning_rate": 7.587666804209628e-06, "loss": 0.4021, "num_input_tokens_seen": 7820700903, "step": 2013, "train_runtime": 54136.8823, "train_tokens_per_second": 144461.605 }, { "epoch": 0.7144377438808088, "grad_norm": 0.22280041873455048, "learning_rate": 7.5701603845460654e-06, "loss": 0.3838, "num_input_tokens_seen": 7824524644, "step": 2014, "train_runtime": 54164.5264, "train_tokens_per_second": 144458.471 }, { "epoch": 0.714792479602696, "grad_norm": 0.23926280438899994, "learning_rate": 7.552669468239971e-06, "loss": 0.4051, "num_input_tokens_seen": 7828534696, "step": 2015, "train_runtime": 54183.4729, "train_tokens_per_second": 144481.966 }, { "epoch": 0.7151472153245831, "grad_norm": 0.2245815545320511, "learning_rate": 7.535194077107228e-06, "loss": 0.3862, "num_input_tokens_seen": 7832400563, "step": 2016, "train_runtime": 54204.9039, "train_tokens_per_second": 144496.162 }, { "epoch": 0.7155019510464704, "grad_norm": 0.2322535216808319, "learning_rate": 7.517734232944349e-06, "loss": 0.3985, "num_input_tokens_seen": 7836281931, "step": 2017, "train_runtime": 54236.6825, "train_tokens_per_second": 144483.062 }, { "epoch": 0.7158566867683576, "grad_norm": 0.24673093855381012, "learning_rate": 7.500289957528466e-06, "loss": 0.396, "num_input_tokens_seen": 7840106783, "step": 2018, "train_runtime": 54255.8893, "train_tokens_per_second": 144502.41 }, { "epoch": 0.7162114224902447, "grad_norm": 0.23032085597515106, "learning_rate": 7.482861272617288e-06, "loss": 0.4056, "num_input_tokens_seen": 7844060875, "step": 2019, "train_runtime": 54277.1375, "train_tokens_per_second": 144518.691 }, { "epoch": 0.716566158212132, "grad_norm": 0.331269770860672, "learning_rate": 7.465448199949077e-06, "loss": 0.3937, "num_input_tokens_seen": 7847979633, "step": 2020, "train_runtime": 54298.5983, "train_tokens_per_second": 144533.743 }, { "epoch": 0.7169208939340191, "grad_norm": 0.3581739068031311, "learning_rate": 7.448050761242627e-06, "loss": 0.3995, "num_input_tokens_seen": 7851837707, "step": 2021, "train_runtime": 54323.5657, "train_tokens_per_second": 144538.335 }, { "epoch": 0.7172756296559063, "grad_norm": 0.22423292696475983, "learning_rate": 7.430668978197222e-06, "loss": 0.3865, "num_input_tokens_seen": 7855734578, "step": 2022, "train_runtime": 54343.4194, "train_tokens_per_second": 144557.238 }, { "epoch": 0.7176303653777936, "grad_norm": 0.2587428092956543, "learning_rate": 7.413302872492645e-06, "loss": 0.3938, "num_input_tokens_seen": 7859622645, "step": 2023, "train_runtime": 54372.9364, "train_tokens_per_second": 144550.27 }, { "epoch": 0.7179851010996807, "grad_norm": 0.2044743299484253, "learning_rate": 7.395952465789089e-06, "loss": 0.3949, "num_input_tokens_seen": 7863536284, "step": 2024, "train_runtime": 54394.6535, "train_tokens_per_second": 144564.507 }, { "epoch": 0.7183398368215679, "grad_norm": 0.30927830934524536, "learning_rate": 7.378617779727188e-06, "loss": 0.3947, "num_input_tokens_seen": 7867383002, "step": 2025, "train_runtime": 54422.4904, "train_tokens_per_second": 144561.246 }, { "epoch": 0.7186945725434551, "grad_norm": 0.22779671847820282, "learning_rate": 7.3612988359279655e-06, "loss": 0.3897, "num_input_tokens_seen": 7871281050, "step": 2026, "train_runtime": 54451.3687, "train_tokens_per_second": 144556.165 }, { "epoch": 0.7190493082653423, "grad_norm": 0.23337316513061523, "learning_rate": 7.3439956559928085e-06, "loss": 0.4009, "num_input_tokens_seen": 7875175045, "step": 2027, "train_runtime": 54472.1229, "train_tokens_per_second": 144572.574 }, { "epoch": 0.7194040439872296, "grad_norm": 0.28351259231567383, "learning_rate": 7.3267082615034415e-06, "loss": 0.3892, "num_input_tokens_seen": 7879083926, "step": 2028, "train_runtime": 54494.4499, "train_tokens_per_second": 144585.071 }, { "epoch": 0.7197587797091167, "grad_norm": 0.23351530730724335, "learning_rate": 7.309436674021908e-06, "loss": 0.3977, "num_input_tokens_seen": 7882967972, "step": 2029, "train_runtime": 54526.7442, "train_tokens_per_second": 144570.671 }, { "epoch": 0.7201135154310039, "grad_norm": 0.25138944387435913, "learning_rate": 7.292180915090516e-06, "loss": 0.3988, "num_input_tokens_seen": 7886882131, "step": 2030, "train_runtime": 54556.867, "train_tokens_per_second": 144562.592 }, { "epoch": 0.7204682511528911, "grad_norm": 0.5543501973152161, "learning_rate": 7.27494100623185e-06, "loss": 0.4085, "num_input_tokens_seen": 7890761413, "step": 2031, "train_runtime": 54581.9263, "train_tokens_per_second": 144567.294 }, { "epoch": 0.7208229868747783, "grad_norm": 0.5483025312423706, "learning_rate": 7.257716968948721e-06, "loss": 0.3895, "num_input_tokens_seen": 7894732564, "step": 2032, "train_runtime": 54603.7914, "train_tokens_per_second": 144582.132 }, { "epoch": 0.7211777225966655, "grad_norm": 0.5158199667930603, "learning_rate": 7.24050882472414e-06, "loss": 0.3911, "num_input_tokens_seen": 7898540242, "step": 2033, "train_runtime": 54629.6401, "train_tokens_per_second": 144583.421 }, { "epoch": 0.7215324583185527, "grad_norm": 0.3932812511920929, "learning_rate": 7.223316595021297e-06, "loss": 0.3851, "num_input_tokens_seen": 7902461737, "step": 2034, "train_runtime": 54661.5461, "train_tokens_per_second": 144570.769 }, { "epoch": 0.7218871940404399, "grad_norm": 0.2620951235294342, "learning_rate": 7.206140301283542e-06, "loss": 0.3948, "num_input_tokens_seen": 7906255531, "step": 2035, "train_runtime": 54682.9484, "train_tokens_per_second": 144583.563 }, { "epoch": 0.722241929762327, "grad_norm": 0.24123968183994293, "learning_rate": 7.188979964934322e-06, "loss": 0.3789, "num_input_tokens_seen": 7910168163, "step": 2036, "train_runtime": 54703.9959, "train_tokens_per_second": 144599.458 }, { "epoch": 0.7225966654842143, "grad_norm": 0.2594224214553833, "learning_rate": 7.171835607377206e-06, "loss": 0.3938, "num_input_tokens_seen": 7914016888, "step": 2037, "train_runtime": 54728.125, "train_tokens_per_second": 144606.03 }, { "epoch": 0.7229514012061015, "grad_norm": 0.2257164716720581, "learning_rate": 7.15470724999582e-06, "loss": 0.4096, "num_input_tokens_seen": 7917869992, "step": 2038, "train_runtime": 54759.8224, "train_tokens_per_second": 144592.689 }, { "epoch": 0.7233061369279886, "grad_norm": 0.2438397854566574, "learning_rate": 7.137594914153852e-06, "loss": 0.4031, "num_input_tokens_seen": 7921768229, "step": 2039, "train_runtime": 54783.0201, "train_tokens_per_second": 144602.62 }, { "epoch": 0.7236608726498759, "grad_norm": 0.21126988530158997, "learning_rate": 7.1204986211949826e-06, "loss": 0.3846, "num_input_tokens_seen": 7925650887, "step": 2040, "train_runtime": 54819.9468, "train_tokens_per_second": 144576.041 }, { "epoch": 0.724015608371763, "grad_norm": 0.24295693635940552, "learning_rate": 7.103418392442902e-06, "loss": 0.398, "num_input_tokens_seen": 7929610632, "step": 2041, "train_runtime": 54849.2555, "train_tokens_per_second": 144570.98 }, { "epoch": 0.7243703440936502, "grad_norm": 0.39859265089035034, "learning_rate": 7.086354249201244e-06, "loss": 0.4073, "num_input_tokens_seen": 7933405512, "step": 2042, "train_runtime": 54873.7, "train_tokens_per_second": 144575.735 }, { "epoch": 0.7247250798155375, "grad_norm": 0.32809942960739136, "learning_rate": 7.069306212753595e-06, "loss": 0.3968, "num_input_tokens_seen": 7937280047, "step": 2043, "train_runtime": 54899.3517, "train_tokens_per_second": 144578.757 }, { "epoch": 0.7250798155374246, "grad_norm": 0.2638392150402069, "learning_rate": 7.052274304363449e-06, "loss": 0.4044, "num_input_tokens_seen": 7941141053, "step": 2044, "train_runtime": 54921.5266, "train_tokens_per_second": 144590.683 }, { "epoch": 0.7254345512593118, "grad_norm": 0.5597684979438782, "learning_rate": 7.0352585452741796e-06, "loss": 0.3937, "num_input_tokens_seen": 7945082070, "step": 2045, "train_runtime": 54944.203, "train_tokens_per_second": 144602.736 }, { "epoch": 0.725789286981199, "grad_norm": 0.3087954521179199, "learning_rate": 7.018258956709025e-06, "loss": 0.3913, "num_input_tokens_seen": 7948951851, "step": 2046, "train_runtime": 54973.1209, "train_tokens_per_second": 144597.064 }, { "epoch": 0.7261440227030862, "grad_norm": 0.2263558954000473, "learning_rate": 7.001275559871057e-06, "loss": 0.3967, "num_input_tokens_seen": 7952856508, "step": 2047, "train_runtime": 54996.8353, "train_tokens_per_second": 144605.712 }, { "epoch": 0.7264987584249734, "grad_norm": 0.23647542297840118, "learning_rate": 6.984308375943128e-06, "loss": 0.3883, "num_input_tokens_seen": 7956693484, "step": 2048, "train_runtime": 55018.4024, "train_tokens_per_second": 144618.766 }, { "epoch": 0.7268534941468606, "grad_norm": 0.23434311151504517, "learning_rate": 6.967357426087895e-06, "loss": 0.399, "num_input_tokens_seen": 7960686229, "step": 2049, "train_runtime": 55044.6084, "train_tokens_per_second": 144622.452 }, { "epoch": 0.7272082298687478, "grad_norm": 0.2871394455432892, "learning_rate": 6.950422731447759e-06, "loss": 0.404, "num_input_tokens_seen": 7964526737, "step": 2050, "train_runtime": 55068.9174, "train_tokens_per_second": 144628.351 }, { "epoch": 0.7275629655906349, "grad_norm": 0.24556158483028412, "learning_rate": 6.933504313144841e-06, "loss": 0.3885, "num_input_tokens_seen": 7968380653, "step": 2051, "train_runtime": 55097.3732, "train_tokens_per_second": 144623.603 }, { "epoch": 0.7279177013125222, "grad_norm": 0.430499792098999, "learning_rate": 6.916602192280965e-06, "loss": 0.4031, "num_input_tokens_seen": 7972252698, "step": 2052, "train_runtime": 55118.2296, "train_tokens_per_second": 144639.129 }, { "epoch": 0.7282724370344094, "grad_norm": 0.21470613777637482, "learning_rate": 6.899716389937634e-06, "loss": 0.4009, "num_input_tokens_seen": 7976154754, "step": 2053, "train_runtime": 55146.5934, "train_tokens_per_second": 144635.493 }, { "epoch": 0.7286271727562965, "grad_norm": 0.2511955201625824, "learning_rate": 6.882846927175984e-06, "loss": 0.3927, "num_input_tokens_seen": 7980120467, "step": 2054, "train_runtime": 55172.8421, "train_tokens_per_second": 144638.561 }, { "epoch": 0.7289819084781838, "grad_norm": 0.31511223316192627, "learning_rate": 6.865993825036781e-06, "loss": 0.3865, "num_input_tokens_seen": 7983999455, "step": 2055, "train_runtime": 55191.9339, "train_tokens_per_second": 144658.81 }, { "epoch": 0.7293366442000709, "grad_norm": 0.26505520939826965, "learning_rate": 6.849157104540381e-06, "loss": 0.3888, "num_input_tokens_seen": 7987821001, "step": 2056, "train_runtime": 55217.5799, "train_tokens_per_second": 144660.831 }, { "epoch": 0.7296913799219581, "grad_norm": 0.40556931495666504, "learning_rate": 6.832336786686711e-06, "loss": 0.3838, "num_input_tokens_seen": 7991710325, "step": 2057, "train_runtime": 55244.4976, "train_tokens_per_second": 144660.748 }, { "epoch": 0.7300461156438454, "grad_norm": 0.2412620484828949, "learning_rate": 6.815532892455234e-06, "loss": 0.3955, "num_input_tokens_seen": 7995593204, "step": 2058, "train_runtime": 55262.6778, "train_tokens_per_second": 144683.42 }, { "epoch": 0.7304008513657325, "grad_norm": 0.2983127534389496, "learning_rate": 6.798745442804935e-06, "loss": 0.3999, "num_input_tokens_seen": 7999541117, "step": 2059, "train_runtime": 55283.3826, "train_tokens_per_second": 144700.645 }, { "epoch": 0.7307555870876197, "grad_norm": 0.44995439052581787, "learning_rate": 6.781974458674287e-06, "loss": 0.3896, "num_input_tokens_seen": 8003422796, "step": 2060, "train_runtime": 55303.4503, "train_tokens_per_second": 144718.327 }, { "epoch": 0.7311103228095069, "grad_norm": 0.2795678675174713, "learning_rate": 6.765219960981215e-06, "loss": 0.3808, "num_input_tokens_seen": 8007276768, "step": 2061, "train_runtime": 55336.6533, "train_tokens_per_second": 144701.139 }, { "epoch": 0.7314650585313941, "grad_norm": 0.5778995752334595, "learning_rate": 6.7484819706230955e-06, "loss": 0.3865, "num_input_tokens_seen": 8011192662, "step": 2062, "train_runtime": 55359.3735, "train_tokens_per_second": 144712.488 }, { "epoch": 0.7318197942532813, "grad_norm": 0.2870245575904846, "learning_rate": 6.7317605084767076e-06, "loss": 0.3876, "num_input_tokens_seen": 8015105248, "step": 2063, "train_runtime": 55385.1467, "train_tokens_per_second": 144715.79 }, { "epoch": 0.7321745299751685, "grad_norm": 0.2466004341840744, "learning_rate": 6.715055595398219e-06, "loss": 0.4156, "num_input_tokens_seen": 8019033071, "step": 2064, "train_runtime": 55411.9063, "train_tokens_per_second": 144716.788 }, { "epoch": 0.7325292656970557, "grad_norm": 0.2039877325296402, "learning_rate": 6.698367252223157e-06, "loss": 0.3927, "num_input_tokens_seen": 8022897683, "step": 2065, "train_runtime": 55435.8604, "train_tokens_per_second": 144723.968 }, { "epoch": 0.7328840014189429, "grad_norm": 0.24683688580989838, "learning_rate": 6.681695499766383e-06, "loss": 0.3858, "num_input_tokens_seen": 8026791915, "step": 2066, "train_runtime": 55457.7366, "train_tokens_per_second": 144737.099 }, { "epoch": 0.7332387371408301, "grad_norm": 0.2208670973777771, "learning_rate": 6.665040358822053e-06, "loss": 0.3878, "num_input_tokens_seen": 8030689410, "step": 2067, "train_runtime": 55477.6551, "train_tokens_per_second": 144755.387 }, { "epoch": 0.7335934728627173, "grad_norm": 0.3005874752998352, "learning_rate": 6.6484018501636174e-06, "loss": 0.4034, "num_input_tokens_seen": 8034514309, "step": 2068, "train_runtime": 55497.3468, "train_tokens_per_second": 144772.944 }, { "epoch": 0.7339482085846045, "grad_norm": 0.21432772278785706, "learning_rate": 6.631779994543776e-06, "loss": 0.3875, "num_input_tokens_seen": 8038465814, "step": 2069, "train_runtime": 55528.9228, "train_tokens_per_second": 144761.782 }, { "epoch": 0.7343029443064917, "grad_norm": 0.28990399837493896, "learning_rate": 6.615174812694461e-06, "loss": 0.3967, "num_input_tokens_seen": 8042312689, "step": 2070, "train_runtime": 55548.6044, "train_tokens_per_second": 144779.743 }, { "epoch": 0.7346576800283788, "grad_norm": 0.23503434658050537, "learning_rate": 6.598586325326808e-06, "loss": 0.4028, "num_input_tokens_seen": 8046208114, "step": 2071, "train_runtime": 55569.4533, "train_tokens_per_second": 144795.524 }, { "epoch": 0.7350124157502661, "grad_norm": 0.233891099691391, "learning_rate": 6.5820145531311245e-06, "loss": 0.3991, "num_input_tokens_seen": 8050093859, "step": 2072, "train_runtime": 55595.5232, "train_tokens_per_second": 144797.52 }, { "epoch": 0.7353671514721533, "grad_norm": 0.22332656383514404, "learning_rate": 6.565459516776875e-06, "loss": 0.3833, "num_input_tokens_seen": 8053894303, "step": 2073, "train_runtime": 55624.0049, "train_tokens_per_second": 144791.701 }, { "epoch": 0.7357218871940404, "grad_norm": 0.23937483131885529, "learning_rate": 6.548921236912646e-06, "loss": 0.3924, "num_input_tokens_seen": 8057858989, "step": 2074, "train_runtime": 55663.9769, "train_tokens_per_second": 144758.952 }, { "epoch": 0.7360766229159277, "grad_norm": 0.28991904854774475, "learning_rate": 6.532399734166132e-06, "loss": 0.3841, "num_input_tokens_seen": 8061693636, "step": 2075, "train_runtime": 55687.4014, "train_tokens_per_second": 144766.921 }, { "epoch": 0.7364313586378148, "grad_norm": 0.2478332221508026, "learning_rate": 6.515895029144092e-06, "loss": 0.391, "num_input_tokens_seen": 8065512867, "step": 2076, "train_runtime": 55705.6039, "train_tokens_per_second": 144788.178 }, { "epoch": 0.736786094359702, "grad_norm": 0.42628228664398193, "learning_rate": 6.499407142432339e-06, "loss": 0.3918, "num_input_tokens_seen": 8069386146, "step": 2077, "train_runtime": 55726.0941, "train_tokens_per_second": 144804.445 }, { "epoch": 0.7371408300815893, "grad_norm": 0.2355152666568756, "learning_rate": 6.482936094595713e-06, "loss": 0.3924, "num_input_tokens_seen": 8073336799, "step": 2078, "train_runtime": 55755.0072, "train_tokens_per_second": 144800.211 }, { "epoch": 0.7374955658034764, "grad_norm": 0.1871626228094101, "learning_rate": 6.466481906178037e-06, "loss": 0.3884, "num_input_tokens_seen": 8077212164, "step": 2079, "train_runtime": 55776.7722, "train_tokens_per_second": 144813.187 }, { "epoch": 0.7378503015253636, "grad_norm": 0.2263989895582199, "learning_rate": 6.450044597702118e-06, "loss": 0.4013, "num_input_tokens_seen": 8081087703, "step": 2080, "train_runtime": 55809.67, "train_tokens_per_second": 144797.267 }, { "epoch": 0.7382050372472508, "grad_norm": 0.5152593851089478, "learning_rate": 6.433624189669709e-06, "loss": 0.4021, "num_input_tokens_seen": 8084929556, "step": 2081, "train_runtime": 55833.6826, "train_tokens_per_second": 144803.803 }, { "epoch": 0.738559772969138, "grad_norm": 0.23284099996089935, "learning_rate": 6.41722070256148e-06, "loss": 0.3935, "num_input_tokens_seen": 8088820267, "step": 2082, "train_runtime": 55861.1553, "train_tokens_per_second": 144802.237 }, { "epoch": 0.7389145086910252, "grad_norm": 0.2181706726551056, "learning_rate": 6.400834156836997e-06, "loss": 0.4014, "num_input_tokens_seen": 8092707323, "step": 2083, "train_runtime": 55887.1924, "train_tokens_per_second": 144804.328 }, { "epoch": 0.7392692444129124, "grad_norm": 0.2086559683084488, "learning_rate": 6.384464572934694e-06, "loss": 0.4034, "num_input_tokens_seen": 8096576391, "step": 2084, "train_runtime": 55916.5535, "train_tokens_per_second": 144797.486 }, { "epoch": 0.7396239801347996, "grad_norm": 0.25475406646728516, "learning_rate": 6.36811197127186e-06, "loss": 0.3937, "num_input_tokens_seen": 8100506067, "step": 2085, "train_runtime": 55942.5281, "train_tokens_per_second": 144800.5 }, { "epoch": 0.7399787158566867, "grad_norm": 0.2305385172367096, "learning_rate": 6.351776372244578e-06, "loss": 0.3948, "num_input_tokens_seen": 8104338069, "step": 2086, "train_runtime": 55962.1564, "train_tokens_per_second": 144818.188 }, { "epoch": 0.740333451578574, "grad_norm": 0.28171491622924805, "learning_rate": 6.335457796227749e-06, "loss": 0.3817, "num_input_tokens_seen": 8108271762, "step": 2087, "train_runtime": 55996.4098, "train_tokens_per_second": 144799.85 }, { "epoch": 0.7406881873004612, "grad_norm": 0.2537763714790344, "learning_rate": 6.319156263575026e-06, "loss": 0.3951, "num_input_tokens_seen": 8112175241, "step": 2088, "train_runtime": 56021.279, "train_tokens_per_second": 144805.249 }, { "epoch": 0.7410429230223483, "grad_norm": 0.4992367923259735, "learning_rate": 6.302871794618817e-06, "loss": 0.3817, "num_input_tokens_seen": 8116084792, "step": 2089, "train_runtime": 56037.9002, "train_tokens_per_second": 144832.065 }, { "epoch": 0.7413976587442356, "grad_norm": 0.23577454686164856, "learning_rate": 6.286604409670236e-06, "loss": 0.3963, "num_input_tokens_seen": 8120015040, "step": 2090, "train_runtime": 56058.065, "train_tokens_per_second": 144850.077 }, { "epoch": 0.7417523944661227, "grad_norm": 0.30589860677719116, "learning_rate": 6.2703541290190964e-06, "loss": 0.3858, "num_input_tokens_seen": 8123820818, "step": 2091, "train_runtime": 56090.0004, "train_tokens_per_second": 144835.457 }, { "epoch": 0.7421071301880099, "grad_norm": 0.2509048581123352, "learning_rate": 6.25412097293387e-06, "loss": 0.394, "num_input_tokens_seen": 8127722603, "step": 2092, "train_runtime": 56110.4502, "train_tokens_per_second": 144852.208 }, { "epoch": 0.7424618659098972, "grad_norm": 0.31317466497421265, "learning_rate": 6.237904961661678e-06, "loss": 0.3986, "num_input_tokens_seen": 8131590592, "step": 2093, "train_runtime": 56132.115, "train_tokens_per_second": 144865.209 }, { "epoch": 0.7428166016317843, "grad_norm": 0.28476420044898987, "learning_rate": 6.221706115428252e-06, "loss": 0.3918, "num_input_tokens_seen": 8135481937, "step": 2094, "train_runtime": 56154.9975, "train_tokens_per_second": 144875.475 }, { "epoch": 0.7431713373536715, "grad_norm": 0.2395438700914383, "learning_rate": 6.2055244544379145e-06, "loss": 0.3955, "num_input_tokens_seen": 8139285027, "step": 2095, "train_runtime": 56186.1659, "train_tokens_per_second": 144862.795 }, { "epoch": 0.7435260730755587, "grad_norm": 0.2532247304916382, "learning_rate": 6.1893599988735564e-06, "loss": 0.3855, "num_input_tokens_seen": 8143129169, "step": 2096, "train_runtime": 56207.1075, "train_tokens_per_second": 144877.214 }, { "epoch": 0.7438808087974459, "grad_norm": 0.2282903641462326, "learning_rate": 6.173212768896615e-06, "loss": 0.3908, "num_input_tokens_seen": 8147037237, "step": 2097, "train_runtime": 56238.9022, "train_tokens_per_second": 144864.799 }, { "epoch": 0.744235544519333, "grad_norm": 0.2956813871860504, "learning_rate": 6.15708278464702e-06, "loss": 0.3895, "num_input_tokens_seen": 8150951130, "step": 2098, "train_runtime": 56264.8269, "train_tokens_per_second": 144867.612 }, { "epoch": 0.7445902802412203, "grad_norm": 0.277876079082489, "learning_rate": 6.14097006624321e-06, "loss": 0.3857, "num_input_tokens_seen": 8154861868, "step": 2099, "train_runtime": 56298.627, "train_tokens_per_second": 144850.102 }, { "epoch": 0.7449450159631075, "grad_norm": 0.33384984731674194, "learning_rate": 6.124874633782088e-06, "loss": 0.3824, "num_input_tokens_seen": 8158744538, "step": 2100, "train_runtime": 56318.4918, "train_tokens_per_second": 144867.951 }, { "epoch": 0.7452997516849946, "grad_norm": 0.25378331542015076, "learning_rate": 6.10879650733899e-06, "loss": 0.3971, "num_input_tokens_seen": 8162655190, "step": 2101, "train_runtime": 56345.0182, "train_tokens_per_second": 144869.155 }, { "epoch": 0.7456544874068819, "grad_norm": 0.3142505884170532, "learning_rate": 6.0927357069676715e-06, "loss": 0.3997, "num_input_tokens_seen": 8166527491, "step": 2102, "train_runtime": 56366.1763, "train_tokens_per_second": 144883.475 }, { "epoch": 0.746009223128769, "grad_norm": 0.29426783323287964, "learning_rate": 6.07669225270028e-06, "loss": 0.3936, "num_input_tokens_seen": 8170459797, "step": 2103, "train_runtime": 56393.6412, "train_tokens_per_second": 144882.643 }, { "epoch": 0.7463639588506562, "grad_norm": 0.3105940520763397, "learning_rate": 6.060666164547315e-06, "loss": 0.384, "num_input_tokens_seen": 8174324614, "step": 2104, "train_runtime": 56421.693, "train_tokens_per_second": 144879.109 }, { "epoch": 0.7467186945725435, "grad_norm": 1.4184739589691162, "learning_rate": 6.044657462497628e-06, "loss": 0.3932, "num_input_tokens_seen": 8178231373, "step": 2105, "train_runtime": 56442.0335, "train_tokens_per_second": 144896.115 }, { "epoch": 0.7470734302944306, "grad_norm": 0.2652340531349182, "learning_rate": 6.02866616651838e-06, "loss": 0.4043, "num_input_tokens_seen": 8182124043, "step": 2106, "train_runtime": 56463.9208, "train_tokens_per_second": 144908.889 }, { "epoch": 0.7474281660163179, "grad_norm": 0.23727355897426605, "learning_rate": 6.012692296555027e-06, "loss": 0.3917, "num_input_tokens_seen": 8185994257, "step": 2107, "train_runtime": 56496.4307, "train_tokens_per_second": 144894.008 }, { "epoch": 0.747782901738205, "grad_norm": 0.27461501955986023, "learning_rate": 5.996735872531283e-06, "loss": 0.3844, "num_input_tokens_seen": 8189866693, "step": 2108, "train_runtime": 56522.8287, "train_tokens_per_second": 144894.848 }, { "epoch": 0.7481376374600922, "grad_norm": 0.3084505498409271, "learning_rate": 5.980796914349111e-06, "loss": 0.3898, "num_input_tokens_seen": 8193769785, "step": 2109, "train_runtime": 56546.7258, "train_tokens_per_second": 144902.639 }, { "epoch": 0.7484923731819795, "grad_norm": 0.2053784877061844, "learning_rate": 5.96487544188868e-06, "loss": 0.3882, "num_input_tokens_seen": 8197681444, "step": 2110, "train_runtime": 56567.8045, "train_tokens_per_second": 144917.794 }, { "epoch": 0.7488471089038666, "grad_norm": 0.1991582065820694, "learning_rate": 5.948971475008359e-06, "loss": 0.3841, "num_input_tokens_seen": 8201526460, "step": 2111, "train_runtime": 56594.3916, "train_tokens_per_second": 144917.654 }, { "epoch": 0.7492018446257538, "grad_norm": 0.31353476643562317, "learning_rate": 5.9330850335446695e-06, "loss": 0.3995, "num_input_tokens_seen": 8205482488, "step": 2112, "train_runtime": 56621.1842, "train_tokens_per_second": 144918.949 }, { "epoch": 0.749556580347641, "grad_norm": 0.5841571688652039, "learning_rate": 5.917216137312292e-06, "loss": 0.3842, "num_input_tokens_seen": 8209362945, "step": 2113, "train_runtime": 56654.1143, "train_tokens_per_second": 144903.209 }, { "epoch": 0.7499113160695282, "grad_norm": 0.2393309324979782, "learning_rate": 5.901364806104007e-06, "loss": 0.4053, "num_input_tokens_seen": 8213254413, "step": 2114, "train_runtime": 56676.7011, "train_tokens_per_second": 144914.123 }, { "epoch": 0.7502660517914154, "grad_norm": 0.22194220125675201, "learning_rate": 5.885531059690694e-06, "loss": 0.3865, "num_input_tokens_seen": 8217178866, "step": 2115, "train_runtime": 56696.5251, "train_tokens_per_second": 144932.672 }, { "epoch": 0.7506207875133026, "grad_norm": 0.22551754117012024, "learning_rate": 5.869714917821307e-06, "loss": 0.4005, "num_input_tokens_seen": 8221043169, "step": 2116, "train_runtime": 56717.5722, "train_tokens_per_second": 144947.022 }, { "epoch": 0.7509755232351898, "grad_norm": 0.24377596378326416, "learning_rate": 5.853916400222823e-06, "loss": 0.404, "num_input_tokens_seen": 8224949973, "step": 2117, "train_runtime": 56740.118, "train_tokens_per_second": 144958.281 }, { "epoch": 0.751330258957077, "grad_norm": 0.23138779401779175, "learning_rate": 5.838135526600253e-06, "loss": 0.3915, "num_input_tokens_seen": 8228813776, "step": 2118, "train_runtime": 56771.4636, "train_tokens_per_second": 144946.303 }, { "epoch": 0.7516849946789642, "grad_norm": 0.30676788091659546, "learning_rate": 5.822372316636593e-06, "loss": 0.3978, "num_input_tokens_seen": 8232704038, "step": 2119, "train_runtime": 56794.0852, "train_tokens_per_second": 144957.067 }, { "epoch": 0.7520397304008514, "grad_norm": 0.44161951541900635, "learning_rate": 5.806626789992818e-06, "loss": 0.3981, "num_input_tokens_seen": 8236619060, "step": 2120, "train_runtime": 56815.3826, "train_tokens_per_second": 144971.638 }, { "epoch": 0.7523944661227385, "grad_norm": 0.4190133213996887, "learning_rate": 5.790898966307834e-06, "loss": 0.3925, "num_input_tokens_seen": 8240494776, "step": 2121, "train_runtime": 56843.6072, "train_tokens_per_second": 144967.837 }, { "epoch": 0.7527492018446258, "grad_norm": 0.22665494680404663, "learning_rate": 5.775188865198483e-06, "loss": 0.3956, "num_input_tokens_seen": 8244370211, "step": 2122, "train_runtime": 56875.5456, "train_tokens_per_second": 144954.569 }, { "epoch": 0.753103937566513, "grad_norm": 0.22368016839027405, "learning_rate": 5.759496506259476e-06, "loss": 0.3919, "num_input_tokens_seen": 8248254152, "step": 2123, "train_runtime": 56901.3168, "train_tokens_per_second": 144957.175 }, { "epoch": 0.7534586732884001, "grad_norm": 0.24678783118724823, "learning_rate": 5.7438219090634205e-06, "loss": 0.3953, "num_input_tokens_seen": 8252145573, "step": 2124, "train_runtime": 56924.448, "train_tokens_per_second": 144966.633 }, { "epoch": 0.7538134090102874, "grad_norm": 0.276946097612381, "learning_rate": 5.728165093160758e-06, "loss": 0.3893, "num_input_tokens_seen": 8256088662, "step": 2125, "train_runtime": 56952.0717, "train_tokens_per_second": 144965.555 }, { "epoch": 0.7541681447321745, "grad_norm": 0.43951794505119324, "learning_rate": 5.712526078079754e-06, "loss": 0.4011, "num_input_tokens_seen": 8259941660, "step": 2126, "train_runtime": 56979.4789, "train_tokens_per_second": 144963.447 }, { "epoch": 0.7545228804540617, "grad_norm": 0.26078101992607117, "learning_rate": 5.696904883326475e-06, "loss": 0.386, "num_input_tokens_seen": 8263896695, "step": 2127, "train_runtime": 57004.411, "train_tokens_per_second": 144969.425 }, { "epoch": 0.754877616175949, "grad_norm": 0.27300819754600525, "learning_rate": 5.681301528384755e-06, "loss": 0.396, "num_input_tokens_seen": 8267745003, "step": 2128, "train_runtime": 57039.8078, "train_tokens_per_second": 144946.93 }, { "epoch": 0.7552323518978361, "grad_norm": 0.37689685821533203, "learning_rate": 5.6657160327161794e-06, "loss": 0.394, "num_input_tokens_seen": 8271615500, "step": 2129, "train_runtime": 57062.6882, "train_tokens_per_second": 144956.639 }, { "epoch": 0.7555870876197233, "grad_norm": 0.22618719935417175, "learning_rate": 5.650148415760062e-06, "loss": 0.3934, "num_input_tokens_seen": 8275469522, "step": 2130, "train_runtime": 57086.389, "train_tokens_per_second": 144963.969 }, { "epoch": 0.7559418233416105, "grad_norm": 0.2186545580625534, "learning_rate": 5.634598696933411e-06, "loss": 0.3984, "num_input_tokens_seen": 8279427849, "step": 2131, "train_runtime": 57111.6965, "train_tokens_per_second": 144969.041 }, { "epoch": 0.7562965590634977, "grad_norm": 0.2357320934534073, "learning_rate": 5.619066895630913e-06, "loss": 0.3768, "num_input_tokens_seen": 8283292283, "step": 2132, "train_runtime": 57138.8277, "train_tokens_per_second": 144967.837 }, { "epoch": 0.7566512947853848, "grad_norm": 0.2841896116733551, "learning_rate": 5.6035530312249105e-06, "loss": 0.3837, "num_input_tokens_seen": 8287179140, "step": 2133, "train_runtime": 57160.9035, "train_tokens_per_second": 144979.849 }, { "epoch": 0.7570060305072721, "grad_norm": 0.34705254435539246, "learning_rate": 5.5880571230653735e-06, "loss": 0.3978, "num_input_tokens_seen": 8291059634, "step": 2134, "train_runtime": 57185.7588, "train_tokens_per_second": 144984.692 }, { "epoch": 0.7573607662291593, "grad_norm": 0.306009978055954, "learning_rate": 5.5725791904798585e-06, "loss": 0.3951, "num_input_tokens_seen": 8294951212, "step": 2135, "train_runtime": 57206.4702, "train_tokens_per_second": 145000.228 }, { "epoch": 0.7577155019510464, "grad_norm": 0.33970922231674194, "learning_rate": 5.557119252773529e-06, "loss": 0.399, "num_input_tokens_seen": 8298847375, "step": 2136, "train_runtime": 57229.1418, "train_tokens_per_second": 145010.865 }, { "epoch": 0.7580702376729337, "grad_norm": 0.21725642681121826, "learning_rate": 5.541677329229083e-06, "loss": 0.3909, "num_input_tokens_seen": 8302783514, "step": 2137, "train_runtime": 57255.9848, "train_tokens_per_second": 145011.627 }, { "epoch": 0.7584249733948208, "grad_norm": 0.26501017808914185, "learning_rate": 5.526253439106761e-06, "loss": 0.389, "num_input_tokens_seen": 8306670261, "step": 2138, "train_runtime": 57278.7086, "train_tokens_per_second": 145021.954 }, { "epoch": 0.758779709116708, "grad_norm": 0.27969542145729065, "learning_rate": 5.510847601644309e-06, "loss": 0.3897, "num_input_tokens_seen": 8310601856, "step": 2139, "train_runtime": 57305.7487, "train_tokens_per_second": 145022.132 }, { "epoch": 0.7591344448385953, "grad_norm": 0.3559422194957733, "learning_rate": 5.495459836056953e-06, "loss": 0.377, "num_input_tokens_seen": 8314543656, "step": 2140, "train_runtime": 57334.0068, "train_tokens_per_second": 145019.407 }, { "epoch": 0.7594891805604824, "grad_norm": 0.3380289375782013, "learning_rate": 5.480090161537388e-06, "loss": 0.3925, "num_input_tokens_seen": 8318404248, "step": 2141, "train_runtime": 57356.2341, "train_tokens_per_second": 145030.516 }, { "epoch": 0.7598439162823696, "grad_norm": 0.2811984419822693, "learning_rate": 5.464738597255727e-06, "loss": 0.4033, "num_input_tokens_seen": 8322290539, "step": 2142, "train_runtime": 57380.583, "train_tokens_per_second": 145036.702 }, { "epoch": 0.7601986520042568, "grad_norm": 0.22717882692813873, "learning_rate": 5.449405162359507e-06, "loss": 0.3875, "num_input_tokens_seen": 8326175629, "step": 2143, "train_runtime": 57399.3144, "train_tokens_per_second": 145057.057 }, { "epoch": 0.760553387726144, "grad_norm": 0.3828713297843933, "learning_rate": 5.434089875973649e-06, "loss": 0.401, "num_input_tokens_seen": 8330049727, "step": 2144, "train_runtime": 57436.8775, "train_tokens_per_second": 145029.641 }, { "epoch": 0.7609081234480313, "grad_norm": 0.3157232701778412, "learning_rate": 5.418792757200448e-06, "loss": 0.3893, "num_input_tokens_seen": 8333979992, "step": 2145, "train_runtime": 57456.7975, "train_tokens_per_second": 145047.764 }, { "epoch": 0.7612628591699184, "grad_norm": 0.2282640039920807, "learning_rate": 5.403513825119526e-06, "loss": 0.3931, "num_input_tokens_seen": 8337846508, "step": 2146, "train_runtime": 57486.7511, "train_tokens_per_second": 145039.445 }, { "epoch": 0.7616175948918056, "grad_norm": 0.22403956949710846, "learning_rate": 5.388253098787834e-06, "loss": 0.3891, "num_input_tokens_seen": 8341705451, "step": 2147, "train_runtime": 57513.4588, "train_tokens_per_second": 145039.189 }, { "epoch": 0.7619723306136928, "grad_norm": 0.2480609118938446, "learning_rate": 5.373010597239592e-06, "loss": 0.4003, "num_input_tokens_seen": 8345615996, "step": 2148, "train_runtime": 57547.176, "train_tokens_per_second": 145022.164 }, { "epoch": 0.76232706633558, "grad_norm": 0.3313763737678528, "learning_rate": 5.357786339486315e-06, "loss": 0.3865, "num_input_tokens_seen": 8349520580, "step": 2149, "train_runtime": 57569.0061, "train_tokens_per_second": 145034.996 }, { "epoch": 0.7626818020574672, "grad_norm": 0.25636664032936096, "learning_rate": 5.342580344516748e-06, "loss": 0.3908, "num_input_tokens_seen": 8353403178, "step": 2150, "train_runtime": 57594.5667, "train_tokens_per_second": 145038.042 }, { "epoch": 0.7630365377793544, "grad_norm": 0.5690871477127075, "learning_rate": 5.327392631296864e-06, "loss": 0.3786, "num_input_tokens_seen": 8357247056, "step": 2151, "train_runtime": 57628.3992, "train_tokens_per_second": 145019.594 }, { "epoch": 0.7633912735012416, "grad_norm": 0.3009524345397949, "learning_rate": 5.31222321876983e-06, "loss": 0.3915, "num_input_tokens_seen": 8361135039, "step": 2152, "train_runtime": 57655.4711, "train_tokens_per_second": 145018.935 }, { "epoch": 0.7637460092231287, "grad_norm": 0.23467910289764404, "learning_rate": 5.297072125855998e-06, "loss": 0.3894, "num_input_tokens_seen": 8365030068, "step": 2153, "train_runtime": 57681.6015, "train_tokens_per_second": 145020.767 }, { "epoch": 0.764100744945016, "grad_norm": 0.29082736372947693, "learning_rate": 5.281939371452844e-06, "loss": 0.3869, "num_input_tokens_seen": 8368833005, "step": 2154, "train_runtime": 57714.0576, "train_tokens_per_second": 145005.105 }, { "epoch": 0.7644554806669032, "grad_norm": 0.3895636200904846, "learning_rate": 5.266824974434998e-06, "loss": 0.3923, "num_input_tokens_seen": 8372726894, "step": 2155, "train_runtime": 57739.8158, "train_tokens_per_second": 145007.856 }, { "epoch": 0.7648102163887903, "grad_norm": 0.4017200767993927, "learning_rate": 5.251728953654185e-06, "loss": 0.3817, "num_input_tokens_seen": 8376597504, "step": 2156, "train_runtime": 57769.4983, "train_tokens_per_second": 145000.35 }, { "epoch": 0.7651649521106776, "grad_norm": 0.20641431212425232, "learning_rate": 5.2366513279392066e-06, "loss": 0.3886, "num_input_tokens_seen": 8380521383, "step": 2157, "train_runtime": 57798.2122, "train_tokens_per_second": 144996.204 }, { "epoch": 0.7655196878325647, "grad_norm": 0.30230531096458435, "learning_rate": 5.22159211609592e-06, "loss": 0.3815, "num_input_tokens_seen": 8384366717, "step": 2158, "train_runtime": 57822.4587, "train_tokens_per_second": 145001.906 }, { "epoch": 0.7658744235544519, "grad_norm": 0.2868739664554596, "learning_rate": 5.206551336907224e-06, "loss": 0.3958, "num_input_tokens_seen": 8388249719, "step": 2159, "train_runtime": 57852.1511, "train_tokens_per_second": 144994.604 }, { "epoch": 0.7662291592763392, "grad_norm": 0.2407056987285614, "learning_rate": 5.191529009133007e-06, "loss": 0.3883, "num_input_tokens_seen": 8392161107, "step": 2160, "train_runtime": 57871.6156, "train_tokens_per_second": 145013.424 }, { "epoch": 0.7665838949982263, "grad_norm": 0.2584916353225708, "learning_rate": 5.1765251515101745e-06, "loss": 0.3921, "num_input_tokens_seen": 8396041179, "step": 2161, "train_runtime": 57901.8996, "train_tokens_per_second": 145004.589 }, { "epoch": 0.7669386307201135, "grad_norm": 0.2274174690246582, "learning_rate": 5.1615397827525694e-06, "loss": 0.3878, "num_input_tokens_seen": 8399962460, "step": 2162, "train_runtime": 57930.6342, "train_tokens_per_second": 145000.354 }, { "epoch": 0.7672933664420007, "grad_norm": 0.22321905195713043, "learning_rate": 5.1465729215509826e-06, "loss": 0.3987, "num_input_tokens_seen": 8403831916, "step": 2163, "train_runtime": 57952.5448, "train_tokens_per_second": 145012.302 }, { "epoch": 0.7676481021638879, "grad_norm": 0.2289804369211197, "learning_rate": 5.131624586573123e-06, "loss": 0.392, "num_input_tokens_seen": 8407785672, "step": 2164, "train_runtime": 57972.2167, "train_tokens_per_second": 145031.295 }, { "epoch": 0.7680028378857751, "grad_norm": 0.3151598274707794, "learning_rate": 5.116694796463593e-06, "loss": 0.3813, "num_input_tokens_seen": 8411629092, "step": 2165, "train_runtime": 57997.7989, "train_tokens_per_second": 145033.592 }, { "epoch": 0.7683575736076623, "grad_norm": 0.3708524703979492, "learning_rate": 5.101783569843852e-06, "loss": 0.3989, "num_input_tokens_seen": 8415495706, "step": 2166, "train_runtime": 58018.1446, "train_tokens_per_second": 145049.377 }, { "epoch": 0.7687123093295495, "grad_norm": 0.2897185683250427, "learning_rate": 5.08689092531222e-06, "loss": 0.3815, "num_input_tokens_seen": 8419312844, "step": 2167, "train_runtime": 58057.8629, "train_tokens_per_second": 145015.893 }, { "epoch": 0.7690670450514366, "grad_norm": 0.23263615369796753, "learning_rate": 5.0720168814438356e-06, "loss": 0.386, "num_input_tokens_seen": 8423268162, "step": 2168, "train_runtime": 58089.9937, "train_tokens_per_second": 145003.771 }, { "epoch": 0.7694217807733239, "grad_norm": 0.36944344639778137, "learning_rate": 5.057161456790638e-06, "loss": 0.3932, "num_input_tokens_seen": 8427159051, "step": 2169, "train_runtime": 58111.8577, "train_tokens_per_second": 145016.17 }, { "epoch": 0.7697765164952111, "grad_norm": 0.25439754128456116, "learning_rate": 5.04232466988134e-06, "loss": 0.3825, "num_input_tokens_seen": 8431032331, "step": 2170, "train_runtime": 58133.9414, "train_tokens_per_second": 145027.709 }, { "epoch": 0.7701312522170982, "grad_norm": 0.3357224762439728, "learning_rate": 5.027506539221414e-06, "loss": 0.4043, "num_input_tokens_seen": 8434945345, "step": 2171, "train_runtime": 58155.0433, "train_tokens_per_second": 145042.371 }, { "epoch": 0.7704859879389855, "grad_norm": 0.7128701210021973, "learning_rate": 5.012707083293062e-06, "loss": 0.393, "num_input_tokens_seen": 8438798536, "step": 2172, "train_runtime": 58185.6664, "train_tokens_per_second": 145032.257 }, { "epoch": 0.7708407236608726, "grad_norm": 0.21048827469348907, "learning_rate": 4.997926320555184e-06, "loss": 0.3941, "num_input_tokens_seen": 8442694609, "step": 2173, "train_runtime": 58212.6715, "train_tokens_per_second": 145031.904 }, { "epoch": 0.7711954593827598, "grad_norm": 0.21522316336631775, "learning_rate": 4.9831642694433725e-06, "loss": 0.3861, "num_input_tokens_seen": 8446584092, "step": 2174, "train_runtime": 58236.6148, "train_tokens_per_second": 145039.064 }, { "epoch": 0.7715501951046471, "grad_norm": 0.26975390315055847, "learning_rate": 4.968420948369886e-06, "loss": 0.3957, "num_input_tokens_seen": 8450458831, "step": 2175, "train_runtime": 58265.0676, "train_tokens_per_second": 145034.738 }, { "epoch": 0.7719049308265342, "grad_norm": 0.24595288932323456, "learning_rate": 4.9536963757236115e-06, "loss": 0.3981, "num_input_tokens_seen": 8454309390, "step": 2176, "train_runtime": 58285.5716, "train_tokens_per_second": 145049.781 }, { "epoch": 0.7722596665484214, "grad_norm": 0.2588481903076172, "learning_rate": 4.938990569870057e-06, "loss": 0.3842, "num_input_tokens_seen": 8458202097, "step": 2177, "train_runtime": 58307.6811, "train_tokens_per_second": 145061.541 }, { "epoch": 0.7726144022703086, "grad_norm": 0.25222480297088623, "learning_rate": 4.9243035491513255e-06, "loss": 0.3937, "num_input_tokens_seen": 8462029232, "step": 2178, "train_runtime": 58341.5813, "train_tokens_per_second": 145042.85 }, { "epoch": 0.7729691379921958, "grad_norm": 0.22644692659378052, "learning_rate": 4.909635331886087e-06, "loss": 0.3855, "num_input_tokens_seen": 8465988643, "step": 2179, "train_runtime": 58369.3089, "train_tokens_per_second": 145041.783 }, { "epoch": 0.773323873714083, "grad_norm": 0.2512996196746826, "learning_rate": 4.894985936369558e-06, "loss": 0.4042, "num_input_tokens_seen": 8469880668, "step": 2180, "train_runtime": 58396.2708, "train_tokens_per_second": 145041.465 }, { "epoch": 0.7736786094359702, "grad_norm": 0.21414124965667725, "learning_rate": 4.880355380873478e-06, "loss": 0.3922, "num_input_tokens_seen": 8473816270, "step": 2181, "train_runtime": 58429.7636, "train_tokens_per_second": 145025.681 }, { "epoch": 0.7740333451578574, "grad_norm": 0.43090176582336426, "learning_rate": 4.865743683646094e-06, "loss": 0.4005, "num_input_tokens_seen": 8477684123, "step": 2182, "train_runtime": 58447.963, "train_tokens_per_second": 145046.699 }, { "epoch": 0.7743880808797446, "grad_norm": 0.28061360120773315, "learning_rate": 4.851150862912124e-06, "loss": 0.3862, "num_input_tokens_seen": 8481647781, "step": 2183, "train_runtime": 58467.6607, "train_tokens_per_second": 145065.626 }, { "epoch": 0.7747428166016318, "grad_norm": 0.19799232482910156, "learning_rate": 4.836576936872752e-06, "loss": 0.3954, "num_input_tokens_seen": 8485532213, "step": 2184, "train_runtime": 58486.9623, "train_tokens_per_second": 145084.167 }, { "epoch": 0.775097552323519, "grad_norm": 0.548132598400116, "learning_rate": 4.822021923705577e-06, "loss": 0.3889, "num_input_tokens_seen": 8489397048, "step": 2185, "train_runtime": 58510.7122, "train_tokens_per_second": 145091.33 }, { "epoch": 0.7754522880454062, "grad_norm": 0.2831798791885376, "learning_rate": 4.807485841564625e-06, "loss": 0.3935, "num_input_tokens_seen": 8493286726, "step": 2186, "train_runtime": 58548.806, "train_tokens_per_second": 145063.363 }, { "epoch": 0.7758070237672934, "grad_norm": 1.4148951768875122, "learning_rate": 4.79296870858031e-06, "loss": 0.3954, "num_input_tokens_seen": 8497189702, "step": 2187, "train_runtime": 58574.345, "train_tokens_per_second": 145066.747 }, { "epoch": 0.7761617594891805, "grad_norm": 0.2583080530166626, "learning_rate": 4.778470542859399e-06, "loss": 0.4019, "num_input_tokens_seen": 8501082314, "step": 2188, "train_runtime": 58612.3613, "train_tokens_per_second": 145039.069 }, { "epoch": 0.7765164952110678, "grad_norm": 0.36854287981987, "learning_rate": 4.763991362485016e-06, "loss": 0.3915, "num_input_tokens_seen": 8504935254, "step": 2189, "train_runtime": 58639.4088, "train_tokens_per_second": 145037.875 }, { "epoch": 0.776871230932955, "grad_norm": 0.3119100034236908, "learning_rate": 4.7495311855166e-06, "loss": 0.3893, "num_input_tokens_seen": 8508827294, "step": 2190, "train_runtime": 58668.8757, "train_tokens_per_second": 145031.368 }, { "epoch": 0.7772259666548421, "grad_norm": 0.2663511037826538, "learning_rate": 4.735090029989875e-06, "loss": 0.3943, "num_input_tokens_seen": 8512717758, "step": 2191, "train_runtime": 58690.4923, "train_tokens_per_second": 145044.239 }, { "epoch": 0.7775807023767294, "grad_norm": 0.4165090322494507, "learning_rate": 4.7206679139168585e-06, "loss": 0.3948, "num_input_tokens_seen": 8516584758, "step": 2192, "train_runtime": 58715.481, "train_tokens_per_second": 145048.369 }, { "epoch": 0.7779354380986165, "grad_norm": 0.23924389481544495, "learning_rate": 4.706264855285811e-06, "loss": 0.395, "num_input_tokens_seen": 8520469148, "step": 2193, "train_runtime": 58741.5125, "train_tokens_per_second": 145050.217 }, { "epoch": 0.7782901738205037, "grad_norm": 0.2347801774740219, "learning_rate": 4.691880872061228e-06, "loss": 0.3861, "num_input_tokens_seen": 8524336136, "step": 2194, "train_runtime": 58761.5485, "train_tokens_per_second": 145066.567 }, { "epoch": 0.778644909542391, "grad_norm": 0.2829698324203491, "learning_rate": 4.677515982183806e-06, "loss": 0.4079, "num_input_tokens_seen": 8528289218, "step": 2195, "train_runtime": 58798.225, "train_tokens_per_second": 145043.311 }, { "epoch": 0.7789996452642781, "grad_norm": 0.2721082270145416, "learning_rate": 4.663170203570435e-06, "loss": 0.3895, "num_input_tokens_seen": 8532109848, "step": 2196, "train_runtime": 58824.5039, "train_tokens_per_second": 145043.465 }, { "epoch": 0.7793543809861653, "grad_norm": 0.2197941541671753, "learning_rate": 4.648843554114166e-06, "loss": 0.381, "num_input_tokens_seen": 8535983059, "step": 2197, "train_runtime": 58858.0736, "train_tokens_per_second": 145026.545 }, { "epoch": 0.7797091167080525, "grad_norm": 0.22824542224407196, "learning_rate": 4.634536051684184e-06, "loss": 0.3876, "num_input_tokens_seen": 8539930388, "step": 2198, "train_runtime": 58878.9127, "train_tokens_per_second": 145042.257 }, { "epoch": 0.7800638524299397, "grad_norm": 0.35497772693634033, "learning_rate": 4.6202477141258025e-06, "loss": 0.391, "num_input_tokens_seen": 8543840872, "step": 2199, "train_runtime": 58906.9858, "train_tokens_per_second": 145039.519 }, { "epoch": 0.7804185881518269, "grad_norm": 0.3125051259994507, "learning_rate": 4.605978559260422e-06, "loss": 0.3931, "num_input_tokens_seen": 8547643367, "step": 2200, "train_runtime": 58935.7843, "train_tokens_per_second": 145033.166 }, { "epoch": 0.7807733238737141, "grad_norm": 0.2580440938472748, "learning_rate": 4.591728604885526e-06, "loss": 0.397, "num_input_tokens_seen": 8551623752, "step": 2201, "train_runtime": 59063.2928, "train_tokens_per_second": 144787.453 }, { "epoch": 0.7811280595956013, "grad_norm": 0.29855403304100037, "learning_rate": 4.5774978687746405e-06, "loss": 0.3965, "num_input_tokens_seen": 8555446151, "step": 2202, "train_runtime": 59089.4151, "train_tokens_per_second": 144788.134 }, { "epoch": 0.7814827953174884, "grad_norm": 0.2206319272518158, "learning_rate": 4.5632863686773355e-06, "loss": 0.3998, "num_input_tokens_seen": 8559355643, "step": 2203, "train_runtime": 59115.2417, "train_tokens_per_second": 144791.012 }, { "epoch": 0.7818375310393757, "grad_norm": 0.2588633894920349, "learning_rate": 4.549094122319166e-06, "loss": 0.3981, "num_input_tokens_seen": 8563204425, "step": 2204, "train_runtime": 59140.6003, "train_tokens_per_second": 144794.006 }, { "epoch": 0.7821922667612629, "grad_norm": 0.26628434658050537, "learning_rate": 4.5349211474016894e-06, "loss": 0.3872, "num_input_tokens_seen": 8567070846, "step": 2205, "train_runtime": 59162.9251, "train_tokens_per_second": 144804.721 }, { "epoch": 0.78254700248315, "grad_norm": 0.3602887690067291, "learning_rate": 4.520767461602426e-06, "loss": 0.3897, "num_input_tokens_seen": 8570982025, "step": 2206, "train_runtime": 59183.0481, "train_tokens_per_second": 144821.572 }, { "epoch": 0.7829017382050373, "grad_norm": 0.30310681462287903, "learning_rate": 4.506633082574832e-06, "loss": 0.3934, "num_input_tokens_seen": 8574902988, "step": 2207, "train_runtime": 59208.3747, "train_tokens_per_second": 144825.847 }, { "epoch": 0.7832564739269244, "grad_norm": 0.20686829090118408, "learning_rate": 4.492518027948283e-06, "loss": 0.4045, "num_input_tokens_seen": 8578793533, "step": 2208, "train_runtime": 59234.6553, "train_tokens_per_second": 144827.272 }, { "epoch": 0.7836112096488116, "grad_norm": 0.2886744737625122, "learning_rate": 4.478422315328059e-06, "loss": 0.3746, "num_input_tokens_seen": 8582683307, "step": 2209, "train_runtime": 59266.5139, "train_tokens_per_second": 144815.052 }, { "epoch": 0.7839659453706989, "grad_norm": 0.30395036935806274, "learning_rate": 4.464345962295302e-06, "loss": 0.4006, "num_input_tokens_seen": 8586590244, "step": 2210, "train_runtime": 59292.766, "train_tokens_per_second": 144816.827 }, { "epoch": 0.784320681092586, "grad_norm": 0.5298042297363281, "learning_rate": 4.450288986407019e-06, "loss": 0.3901, "num_input_tokens_seen": 8590411699, "step": 2211, "train_runtime": 59319.6328, "train_tokens_per_second": 144815.659 }, { "epoch": 0.7846754168144732, "grad_norm": 0.2571088671684265, "learning_rate": 4.4362514051960415e-06, "loss": 0.3972, "num_input_tokens_seen": 8594339684, "step": 2212, "train_runtime": 59348.0354, "train_tokens_per_second": 144812.539 }, { "epoch": 0.7850301525363604, "grad_norm": 0.22527194023132324, "learning_rate": 4.422233236171018e-06, "loss": 0.3938, "num_input_tokens_seen": 8598197451, "step": 2213, "train_runtime": 59374.9081, "train_tokens_per_second": 144811.971 }, { "epoch": 0.7853848882582476, "grad_norm": 0.24053174257278442, "learning_rate": 4.408234496816377e-06, "loss": 0.3958, "num_input_tokens_seen": 8602151535, "step": 2214, "train_runtime": 59402.3151, "train_tokens_per_second": 144811.722 }, { "epoch": 0.7857396239801347, "grad_norm": 0.20176084339618683, "learning_rate": 4.394255204592322e-06, "loss": 0.3823, "num_input_tokens_seen": 8606067355, "step": 2215, "train_runtime": 59422.7485, "train_tokens_per_second": 144827.824 }, { "epoch": 0.786094359702022, "grad_norm": 0.24210692942142487, "learning_rate": 4.380295376934787e-06, "loss": 0.3916, "num_input_tokens_seen": 8609887187, "step": 2216, "train_runtime": 59451.4885, "train_tokens_per_second": 144822.063 }, { "epoch": 0.7864490954239092, "grad_norm": 0.3064371645450592, "learning_rate": 4.3663550312554425e-06, "loss": 0.3878, "num_input_tokens_seen": 8613797418, "step": 2217, "train_runtime": 59483.4844, "train_tokens_per_second": 144809.9 }, { "epoch": 0.7868038311457963, "grad_norm": 0.29869136214256287, "learning_rate": 4.352434184941654e-06, "loss": 0.3842, "num_input_tokens_seen": 8617692068, "step": 2218, "train_runtime": 59507.0484, "train_tokens_per_second": 144818.006 }, { "epoch": 0.7871585668676836, "grad_norm": 0.23322618007659912, "learning_rate": 4.338532855356463e-06, "loss": 0.3929, "num_input_tokens_seen": 8621576744, "step": 2219, "train_runtime": 59533.5413, "train_tokens_per_second": 144818.812 }, { "epoch": 0.7875133025895708, "grad_norm": 0.20375767350196838, "learning_rate": 4.324651059838572e-06, "loss": 0.3869, "num_input_tokens_seen": 8625456452, "step": 2220, "train_runtime": 59553.7763, "train_tokens_per_second": 144834.753 }, { "epoch": 0.787868038311458, "grad_norm": 0.25743934512138367, "learning_rate": 4.310788815702325e-06, "loss": 0.3903, "num_input_tokens_seen": 8629360074, "step": 2221, "train_runtime": 59575.3387, "train_tokens_per_second": 144847.856 }, { "epoch": 0.7882227740333452, "grad_norm": 0.23560672998428345, "learning_rate": 4.296946140237661e-06, "loss": 0.3956, "num_input_tokens_seen": 8633192123, "step": 2222, "train_runtime": 59597.755, "train_tokens_per_second": 144857.673 }, { "epoch": 0.7885775097552323, "grad_norm": 0.24105753004550934, "learning_rate": 4.283123050710132e-06, "loss": 0.4069, "num_input_tokens_seen": 8637133334, "step": 2223, "train_runtime": 59615.795, "train_tokens_per_second": 144879.949 }, { "epoch": 0.7889322454771196, "grad_norm": 0.263855904340744, "learning_rate": 4.269319564360852e-06, "loss": 0.3921, "num_input_tokens_seen": 8641043030, "step": 2224, "train_runtime": 59641.5024, "train_tokens_per_second": 144883.054 }, { "epoch": 0.7892869811990068, "grad_norm": 0.2788633406162262, "learning_rate": 4.255535698406488e-06, "loss": 0.4094, "num_input_tokens_seen": 8644962595, "step": 2225, "train_runtime": 59664.1305, "train_tokens_per_second": 144893.8 }, { "epoch": 0.7896417169208939, "grad_norm": 0.8912442326545715, "learning_rate": 4.2417714700392285e-06, "loss": 0.3852, "num_input_tokens_seen": 8648820310, "step": 2226, "train_runtime": 59692.7726, "train_tokens_per_second": 144888.902 }, { "epoch": 0.7899964526427812, "grad_norm": 0.29677677154541016, "learning_rate": 4.22802689642678e-06, "loss": 0.3801, "num_input_tokens_seen": 8652688914, "step": 2227, "train_runtime": 59725.8174, "train_tokens_per_second": 144873.512 }, { "epoch": 0.7903511883646683, "grad_norm": 0.22658199071884155, "learning_rate": 4.21430199471232e-06, "loss": 0.3935, "num_input_tokens_seen": 8656642691, "step": 2228, "train_runtime": 59752.0747, "train_tokens_per_second": 144876.019 }, { "epoch": 0.7907059240865555, "grad_norm": 0.32928627729415894, "learning_rate": 4.200596782014499e-06, "loss": 0.3931, "num_input_tokens_seen": 8660505624, "step": 2229, "train_runtime": 59785.0853, "train_tokens_per_second": 144860.638 }, { "epoch": 0.7910606598084428, "grad_norm": 0.28952696919441223, "learning_rate": 4.18691127542741e-06, "loss": 0.385, "num_input_tokens_seen": 8664342812, "step": 2230, "train_runtime": 59806.9633, "train_tokens_per_second": 144871.807 }, { "epoch": 0.7914153955303299, "grad_norm": 0.2025330662727356, "learning_rate": 4.173245492020563e-06, "loss": 0.3831, "num_input_tokens_seen": 8668340224, "step": 2231, "train_runtime": 59839.039, "train_tokens_per_second": 144860.953 }, { "epoch": 0.7917701312522171, "grad_norm": 0.3654575049877167, "learning_rate": 4.159599448838874e-06, "loss": 0.3887, "num_input_tokens_seen": 8672175270, "step": 2232, "train_runtime": 59867.957, "train_tokens_per_second": 144855.04 }, { "epoch": 0.7921248669741043, "grad_norm": 0.24086712300777435, "learning_rate": 4.145973162902626e-06, "loss": 0.3942, "num_input_tokens_seen": 8676054930, "step": 2233, "train_runtime": 59899.2605, "train_tokens_per_second": 144844.108 }, { "epoch": 0.7924796026959915, "grad_norm": 0.629715085029602, "learning_rate": 4.132366651207482e-06, "loss": 0.4014, "num_input_tokens_seen": 8679959881, "step": 2234, "train_runtime": 59925.4672, "train_tokens_per_second": 144845.928 }, { "epoch": 0.7928343384178786, "grad_norm": 0.2194431573152542, "learning_rate": 4.118779930724412e-06, "loss": 0.3914, "num_input_tokens_seen": 8683840567, "step": 2235, "train_runtime": 59950.9869, "train_tokens_per_second": 144849.001 }, { "epoch": 0.7931890741397659, "grad_norm": 0.23389799892902374, "learning_rate": 4.105213018399723e-06, "loss": 0.3866, "num_input_tokens_seen": 8687727219, "step": 2236, "train_runtime": 59981.8306, "train_tokens_per_second": 144839.314 }, { "epoch": 0.7935438098616531, "grad_norm": 0.2211068719625473, "learning_rate": 4.091665931155005e-06, "loss": 0.3968, "num_input_tokens_seen": 8691629187, "step": 2237, "train_runtime": 60011.2235, "train_tokens_per_second": 144833.394 }, { "epoch": 0.7938985455835402, "grad_norm": 0.305769145488739, "learning_rate": 4.078138685887125e-06, "loss": 0.3827, "num_input_tokens_seen": 8695451500, "step": 2238, "train_runtime": 60041.4152, "train_tokens_per_second": 144824.226 }, { "epoch": 0.7942532813054275, "grad_norm": 0.3425058126449585, "learning_rate": 4.0646312994682e-06, "loss": 0.3905, "num_input_tokens_seen": 8699322740, "step": 2239, "train_runtime": 60072.5612, "train_tokens_per_second": 144813.582 }, { "epoch": 0.7946080170273147, "grad_norm": 0.37642157077789307, "learning_rate": 4.051143788745588e-06, "loss": 0.3867, "num_input_tokens_seen": 8703221284, "step": 2240, "train_runtime": 60099.539, "train_tokens_per_second": 144813.445 }, { "epoch": 0.7949627527492018, "grad_norm": 0.2782459855079651, "learning_rate": 4.037676170541831e-06, "loss": 0.3845, "num_input_tokens_seen": 8707070373, "step": 2241, "train_runtime": 60127.0176, "train_tokens_per_second": 144811.28 }, { "epoch": 0.7953174884710891, "grad_norm": 0.3055364191532135, "learning_rate": 4.024228461654685e-06, "loss": 0.3862, "num_input_tokens_seen": 8710912267, "step": 2242, "train_runtime": 60161.7857, "train_tokens_per_second": 144791.451 }, { "epoch": 0.7956722241929762, "grad_norm": 0.3468208312988281, "learning_rate": 4.0108006788570634e-06, "loss": 0.3908, "num_input_tokens_seen": 8714807439, "step": 2243, "train_runtime": 60178.6766, "train_tokens_per_second": 144815.538 }, { "epoch": 0.7960269599148634, "grad_norm": 0.2503373920917511, "learning_rate": 3.997392838897027e-06, "loss": 0.3925, "num_input_tokens_seen": 8718628840, "step": 2244, "train_runtime": 60206.1455, "train_tokens_per_second": 144812.939 }, { "epoch": 0.7963816956367507, "grad_norm": 0.2570567727088928, "learning_rate": 3.984004958497765e-06, "loss": 0.388, "num_input_tokens_seen": 8722531298, "step": 2245, "train_runtime": 60231.4542, "train_tokens_per_second": 144816.88 }, { "epoch": 0.7967364313586378, "grad_norm": 0.24168933928012848, "learning_rate": 3.970637054357571e-06, "loss": 0.3791, "num_input_tokens_seen": 8726458182, "step": 2246, "train_runtime": 60264.559, "train_tokens_per_second": 144802.49 }, { "epoch": 0.797091167080525, "grad_norm": 0.21626192331314087, "learning_rate": 3.957289143149816e-06, "loss": 0.3811, "num_input_tokens_seen": 8730331120, "step": 2247, "train_runtime": 60290.005, "train_tokens_per_second": 144805.613 }, { "epoch": 0.7974459028024122, "grad_norm": 0.48704102635383606, "learning_rate": 3.943961241522942e-06, "loss": 0.3904, "num_input_tokens_seen": 8734215699, "step": 2248, "train_runtime": 60321.4924, "train_tokens_per_second": 144794.423 }, { "epoch": 0.7978006385242994, "grad_norm": 0.22186553478240967, "learning_rate": 3.930653366100425e-06, "loss": 0.3907, "num_input_tokens_seen": 8738069870, "step": 2249, "train_runtime": 60355.2668, "train_tokens_per_second": 144777.255 }, { "epoch": 0.7981553742461865, "grad_norm": 0.21680143475532532, "learning_rate": 3.917365533480786e-06, "loss": 0.3979, "num_input_tokens_seen": 8741944531, "step": 2250, "train_runtime": 60378.1455, "train_tokens_per_second": 144786.569 }, { "epoch": 0.7985101099680738, "grad_norm": 0.22756017744541168, "learning_rate": 3.90409776023752e-06, "loss": 0.394, "num_input_tokens_seen": 8745765130, "step": 2251, "train_runtime": 60410.0737, "train_tokens_per_second": 144773.29 }, { "epoch": 0.798864845689961, "grad_norm": 0.435754656791687, "learning_rate": 3.89085006291912e-06, "loss": 0.3941, "num_input_tokens_seen": 8749786378, "step": 2252, "train_runtime": 60442.4211, "train_tokens_per_second": 144762.341 }, { "epoch": 0.7992195814118481, "grad_norm": 0.20055285096168518, "learning_rate": 3.8776224580490245e-06, "loss": 0.3878, "num_input_tokens_seen": 8753646815, "step": 2253, "train_runtime": 60476.6274, "train_tokens_per_second": 144744.295 }, { "epoch": 0.7995743171337354, "grad_norm": 0.23923376202583313, "learning_rate": 3.86441496212562e-06, "loss": 0.399, "num_input_tokens_seen": 8757505679, "step": 2254, "train_runtime": 60512.5888, "train_tokens_per_second": 144722.046 }, { "epoch": 0.7999290528556225, "grad_norm": 0.2961578369140625, "learning_rate": 3.851227591622213e-06, "loss": 0.3846, "num_input_tokens_seen": 8761420686, "step": 2255, "train_runtime": 60538.0477, "train_tokens_per_second": 144725.854 }, { "epoch": 0.8002837885775097, "grad_norm": 0.42868301272392273, "learning_rate": 3.838060362987006e-06, "loss": 0.3978, "num_input_tokens_seen": 8765258527, "step": 2256, "train_runtime": 60570.1045, "train_tokens_per_second": 144712.62 }, { "epoch": 0.800638524299397, "grad_norm": 0.26374271512031555, "learning_rate": 3.82491329264308e-06, "loss": 0.3758, "num_input_tokens_seen": 8769123605, "step": 2257, "train_runtime": 60587.2016, "train_tokens_per_second": 144735.577 }, { "epoch": 0.8009932600212841, "grad_norm": 0.2502101957798004, "learning_rate": 3.8117863969883704e-06, "loss": 0.3891, "num_input_tokens_seen": 8773043830, "step": 2258, "train_runtime": 60606.3866, "train_tokens_per_second": 144754.445 }, { "epoch": 0.8013479957431713, "grad_norm": 0.3009650409221649, "learning_rate": 3.7986796923956594e-06, "loss": 0.4072, "num_input_tokens_seen": 8776958248, "step": 2259, "train_runtime": 60632.1341, "train_tokens_per_second": 144757.535 }, { "epoch": 0.8017027314650585, "grad_norm": 0.34717386960983276, "learning_rate": 3.7855931952125225e-06, "loss": 0.3929, "num_input_tokens_seen": 8780789424, "step": 2260, "train_runtime": 60665.3426, "train_tokens_per_second": 144741.446 }, { "epoch": 0.8020574671869457, "grad_norm": 0.2743281126022339, "learning_rate": 3.7725269217613547e-06, "loss": 0.4031, "num_input_tokens_seen": 8784683851, "step": 2261, "train_runtime": 60697.7616, "train_tokens_per_second": 144728.3 }, { "epoch": 0.802412202908833, "grad_norm": 0.22928832471370697, "learning_rate": 3.7594808883393175e-06, "loss": 0.3893, "num_input_tokens_seen": 8788613196, "step": 2262, "train_runtime": 60719.1235, "train_tokens_per_second": 144742.096 }, { "epoch": 0.8027669386307201, "grad_norm": 0.9097374081611633, "learning_rate": 3.7464551112183257e-06, "loss": 0.3845, "num_input_tokens_seen": 8792461055, "step": 2263, "train_runtime": 60744.9689, "train_tokens_per_second": 144743.856 }, { "epoch": 0.8031216743526073, "grad_norm": 0.26189205050468445, "learning_rate": 3.7334496066450345e-06, "loss": 0.3934, "num_input_tokens_seen": 8796402746, "step": 2264, "train_runtime": 60767.9877, "train_tokens_per_second": 144753.892 }, { "epoch": 0.8034764100744946, "grad_norm": 0.2699904143810272, "learning_rate": 3.7204643908408102e-06, "loss": 0.3942, "num_input_tokens_seen": 8800316390, "step": 2265, "train_runtime": 60796.4791, "train_tokens_per_second": 144750.428 }, { "epoch": 0.8038311457963817, "grad_norm": 0.2863009572029114, "learning_rate": 3.7074994800017105e-06, "loss": 0.3889, "num_input_tokens_seen": 8804155394, "step": 2266, "train_runtime": 60825.5298, "train_tokens_per_second": 144744.41 }, { "epoch": 0.8041858815182689, "grad_norm": 0.20842048525810242, "learning_rate": 3.694554890298474e-06, "loss": 0.3857, "num_input_tokens_seen": 8808025933, "step": 2267, "train_runtime": 60846.2127, "train_tokens_per_second": 144758.82 }, { "epoch": 0.8045406172401561, "grad_norm": 0.24499846994876862, "learning_rate": 3.68163063787649e-06, "loss": 0.397, "num_input_tokens_seen": 8811926262, "step": 2268, "train_runtime": 60879.5674, "train_tokens_per_second": 144743.575 }, { "epoch": 0.8048953529620433, "grad_norm": 0.3129425048828125, "learning_rate": 3.668726738855779e-06, "loss": 0.3945, "num_input_tokens_seen": 8815834839, "step": 2269, "train_runtime": 60913.5251, "train_tokens_per_second": 144727.051 }, { "epoch": 0.8052500886839304, "grad_norm": 0.36292099952697754, "learning_rate": 3.6558432093309825e-06, "loss": 0.3879, "num_input_tokens_seen": 8819741084, "step": 2270, "train_runtime": 60937.223, "train_tokens_per_second": 144734.871 }, { "epoch": 0.8056048244058177, "grad_norm": 0.31213030219078064, "learning_rate": 3.642980065371333e-06, "loss": 0.3778, "num_input_tokens_seen": 8823579281, "step": 2271, "train_runtime": 60956.5631, "train_tokens_per_second": 144751.916 }, { "epoch": 0.8059595601277049, "grad_norm": 0.2756778299808502, "learning_rate": 3.6301373230206284e-06, "loss": 0.3826, "num_input_tokens_seen": 8827553039, "step": 2272, "train_runtime": 60983.2207, "train_tokens_per_second": 144753.802 }, { "epoch": 0.806314295849592, "grad_norm": 0.43608558177948, "learning_rate": 3.61731499829723e-06, "loss": 0.3887, "num_input_tokens_seen": 8831447034, "step": 2273, "train_runtime": 61014.2974, "train_tokens_per_second": 144743.895 }, { "epoch": 0.8066690315714793, "grad_norm": 0.4411071538925171, "learning_rate": 3.6045131071940275e-06, "loss": 0.4008, "num_input_tokens_seen": 8835333311, "step": 2274, "train_runtime": 61039.806, "train_tokens_per_second": 144747.074 }, { "epoch": 0.8070237672933664, "grad_norm": 0.3238903880119324, "learning_rate": 3.591731665678433e-06, "loss": 0.3807, "num_input_tokens_seen": 8839246017, "step": 2275, "train_runtime": 61062.0374, "train_tokens_per_second": 144758.452 }, { "epoch": 0.8073785030152536, "grad_norm": 0.29062649607658386, "learning_rate": 3.5789706896923404e-06, "loss": 0.3919, "num_input_tokens_seen": 8843126772, "step": 2276, "train_runtime": 61085.1084, "train_tokens_per_second": 144767.309 }, { "epoch": 0.8077332387371409, "grad_norm": 0.7504581212997437, "learning_rate": 3.5662301951521294e-06, "loss": 0.389, "num_input_tokens_seen": 8846961004, "step": 2277, "train_runtime": 61111.1497, "train_tokens_per_second": 144768.361 }, { "epoch": 0.808087974459028, "grad_norm": 0.2815552055835724, "learning_rate": 3.553510197948622e-06, "loss": 0.3849, "num_input_tokens_seen": 8850878826, "step": 2278, "train_runtime": 61136.8984, "train_tokens_per_second": 144771.473 }, { "epoch": 0.8084427101809152, "grad_norm": 0.5101130604743958, "learning_rate": 3.5408107139470805e-06, "loss": 0.3923, "num_input_tokens_seen": 8854739610, "step": 2279, "train_runtime": 61170.1966, "train_tokens_per_second": 144755.781 }, { "epoch": 0.8087974459028024, "grad_norm": 0.28146547079086304, "learning_rate": 3.5281317589871834e-06, "loss": 0.3811, "num_input_tokens_seen": 8858612773, "step": 2280, "train_runtime": 61190.9704, "train_tokens_per_second": 144769.934 }, { "epoch": 0.8091521816246896, "grad_norm": 0.23049302399158478, "learning_rate": 3.515473348883003e-06, "loss": 0.398, "num_input_tokens_seen": 8862528266, "step": 2281, "train_runtime": 61218.2949, "train_tokens_per_second": 144769.277 }, { "epoch": 0.8095069173465768, "grad_norm": 0.2857791781425476, "learning_rate": 3.502835499422983e-06, "loss": 0.3846, "num_input_tokens_seen": 8866406014, "step": 2282, "train_runtime": 61244.8318, "train_tokens_per_second": 144769.865 }, { "epoch": 0.809861653068464, "grad_norm": 0.29558953642845154, "learning_rate": 3.490218226369928e-06, "loss": 0.3948, "num_input_tokens_seen": 8870297928, "step": 2283, "train_runtime": 61268.9988, "train_tokens_per_second": 144776.283 }, { "epoch": 0.8102163887903512, "grad_norm": 0.31136295199394226, "learning_rate": 3.477621545460974e-06, "loss": 0.38, "num_input_tokens_seen": 8874216278, "step": 2284, "train_runtime": 61291.5586, "train_tokens_per_second": 144786.925 }, { "epoch": 0.8105711245122383, "grad_norm": 0.3574710786342621, "learning_rate": 3.4650454724075754e-06, "loss": 0.3937, "num_input_tokens_seen": 8878086483, "step": 2285, "train_runtime": 61316.5691, "train_tokens_per_second": 144790.986 }, { "epoch": 0.8109258602341256, "grad_norm": 0.2831103205680847, "learning_rate": 3.452490022895483e-06, "loss": 0.4063, "num_input_tokens_seen": 8882047689, "step": 2286, "train_runtime": 61343.2301, "train_tokens_per_second": 144792.631 }, { "epoch": 0.8112805959560128, "grad_norm": 0.21033398807048798, "learning_rate": 3.4399552125847202e-06, "loss": 0.3886, "num_input_tokens_seen": 8885922561, "step": 2287, "train_runtime": 61363.2339, "train_tokens_per_second": 144808.577 }, { "epoch": 0.8116353316778999, "grad_norm": 0.27951714396476746, "learning_rate": 3.4274410571095728e-06, "loss": 0.3853, "num_input_tokens_seen": 8889832497, "step": 2288, "train_runtime": 61381.9818, "train_tokens_per_second": 144828.046 }, { "epoch": 0.8119900673997872, "grad_norm": 0.32550159096717834, "learning_rate": 3.414947572078564e-06, "loss": 0.3961, "num_input_tokens_seen": 8893689932, "step": 2289, "train_runtime": 61413.629, "train_tokens_per_second": 144816.225 }, { "epoch": 0.8123448031216743, "grad_norm": 0.300696462392807, "learning_rate": 3.40247477307444e-06, "loss": 0.3904, "num_input_tokens_seen": 8897492250, "step": 2290, "train_runtime": 61433.4994, "train_tokens_per_second": 144831.278 }, { "epoch": 0.8126995388435615, "grad_norm": 0.3008787930011749, "learning_rate": 3.3900226756541275e-06, "loss": 0.3941, "num_input_tokens_seen": 8901352538, "step": 2291, "train_runtime": 61459.0989, "train_tokens_per_second": 144833.763 }, { "epoch": 0.8130542745654488, "grad_norm": 0.21326257288455963, "learning_rate": 3.37759129534875e-06, "loss": 0.3986, "num_input_tokens_seen": 8905193366, "step": 2292, "train_runtime": 61481.2709, "train_tokens_per_second": 144844.003 }, { "epoch": 0.8134090102873359, "grad_norm": 0.23931531608104706, "learning_rate": 3.365180647663591e-06, "loss": 0.3879, "num_input_tokens_seen": 8909132351, "step": 2293, "train_runtime": 61513.9878, "train_tokens_per_second": 144831.0 }, { "epoch": 0.8137637460092231, "grad_norm": 0.523747980594635, "learning_rate": 3.3527907480780674e-06, "loss": 0.3977, "num_input_tokens_seen": 8913031972, "step": 2294, "train_runtime": 61538.2538, "train_tokens_per_second": 144837.259 }, { "epoch": 0.8141184817311103, "grad_norm": 0.3039131462574005, "learning_rate": 3.340421612045719e-06, "loss": 0.3889, "num_input_tokens_seen": 8916951161, "step": 2295, "train_runtime": 61570.345, "train_tokens_per_second": 144825.421 }, { "epoch": 0.8144732174529975, "grad_norm": 0.22775550186634064, "learning_rate": 3.3280732549941974e-06, "loss": 0.3866, "num_input_tokens_seen": 8920913609, "step": 2296, "train_runtime": 61593.9534, "train_tokens_per_second": 144834.243 }, { "epoch": 0.8148279531748847, "grad_norm": 0.35279691219329834, "learning_rate": 3.315745692325221e-06, "loss": 0.3881, "num_input_tokens_seen": 8924786965, "step": 2297, "train_runtime": 61617.0466, "train_tokens_per_second": 144842.823 }, { "epoch": 0.8151826888967719, "grad_norm": 0.2518133521080017, "learning_rate": 3.3034389394145806e-06, "loss": 0.4013, "num_input_tokens_seen": 8928670832, "step": 2298, "train_runtime": 61649.7264, "train_tokens_per_second": 144829.042 }, { "epoch": 0.8155374246186591, "grad_norm": 0.2334630936384201, "learning_rate": 3.2911530116121137e-06, "loss": 0.3963, "num_input_tokens_seen": 8932450849, "step": 2299, "train_runtime": 61688.8843, "train_tokens_per_second": 144798.385 }, { "epoch": 0.8158921603405463, "grad_norm": 0.23774664103984833, "learning_rate": 3.2788879242416804e-06, "loss": 0.3906, "num_input_tokens_seen": 8936411808, "step": 2300, "train_runtime": 61711.713, "train_tokens_per_second": 144809.006 }, { "epoch": 0.8162468960624335, "grad_norm": 0.520298182964325, "learning_rate": 3.26664369260115e-06, "loss": 0.3982, "num_input_tokens_seen": 8940268254, "step": 2301, "train_runtime": 61747.9326, "train_tokens_per_second": 144786.52 }, { "epoch": 0.8166016317843207, "grad_norm": 0.3326541483402252, "learning_rate": 3.2544203319623733e-06, "loss": 0.3911, "num_input_tokens_seen": 8944103290, "step": 2302, "train_runtime": 61775.4551, "train_tokens_per_second": 144784.094 }, { "epoch": 0.8169563675062079, "grad_norm": 0.32906240224838257, "learning_rate": 3.242217857571177e-06, "loss": 0.4016, "num_input_tokens_seen": 8948080753, "step": 2303, "train_runtime": 61795.0915, "train_tokens_per_second": 144802.452 }, { "epoch": 0.8173111032280951, "grad_norm": 0.2230798304080963, "learning_rate": 3.2300362846473287e-06, "loss": 0.3981, "num_input_tokens_seen": 8951950676, "step": 2304, "train_runtime": 61814.6225, "train_tokens_per_second": 144819.305 }, { "epoch": 0.8176658389499822, "grad_norm": 0.2944362461566925, "learning_rate": 3.2178756283845346e-06, "loss": 0.3965, "num_input_tokens_seen": 8955770448, "step": 2305, "train_runtime": 61839.5039, "train_tokens_per_second": 144822.806 }, { "epoch": 0.8180205746718695, "grad_norm": 0.2776722311973572, "learning_rate": 3.2057359039504044e-06, "loss": 0.3917, "num_input_tokens_seen": 8959649872, "step": 2306, "train_runtime": 61861.2052, "train_tokens_per_second": 144834.712 }, { "epoch": 0.8183753103937567, "grad_norm": 0.3155571222305298, "learning_rate": 3.1936171264864437e-06, "loss": 0.4057, "num_input_tokens_seen": 8963623198, "step": 2307, "train_runtime": 61891.0825, "train_tokens_per_second": 144828.994 }, { "epoch": 0.8187300461156438, "grad_norm": 0.3682713806629181, "learning_rate": 3.181519311108041e-06, "loss": 0.3865, "num_input_tokens_seen": 8967483288, "step": 2308, "train_runtime": 61916.3371, "train_tokens_per_second": 144832.264 }, { "epoch": 0.8190847818375311, "grad_norm": 0.24654977023601532, "learning_rate": 3.169442472904416e-06, "loss": 0.3808, "num_input_tokens_seen": 8971384211, "step": 2309, "train_runtime": 61937.3974, "train_tokens_per_second": 144845.999 }, { "epoch": 0.8194395175594182, "grad_norm": 0.21702373027801514, "learning_rate": 3.1573866269386434e-06, "loss": 0.3931, "num_input_tokens_seen": 8975260205, "step": 2310, "train_runtime": 61975.287, "train_tokens_per_second": 144819.986 }, { "epoch": 0.8197942532813054, "grad_norm": 0.21949508786201477, "learning_rate": 3.1453517882476127e-06, "loss": 0.3899, "num_input_tokens_seen": 8979154579, "step": 2311, "train_runtime": 62009.152, "train_tokens_per_second": 144803.699 }, { "epoch": 0.8201489890031927, "grad_norm": 0.20219112932682037, "learning_rate": 3.1333379718420074e-06, "loss": 0.3914, "num_input_tokens_seen": 8983043305, "step": 2312, "train_runtime": 62039.5582, "train_tokens_per_second": 144795.411 }, { "epoch": 0.8205037247250798, "grad_norm": 0.249370738863945, "learning_rate": 3.121345192706291e-06, "loss": 0.3936, "num_input_tokens_seen": 8986971295, "step": 2313, "train_runtime": 62068.0153, "train_tokens_per_second": 144792.31 }, { "epoch": 0.820858460446967, "grad_norm": 0.26327213644981384, "learning_rate": 3.1093734657986953e-06, "loss": 0.3965, "num_input_tokens_seen": 8990827621, "step": 2314, "train_runtime": 62092.8173, "train_tokens_per_second": 144796.581 }, { "epoch": 0.8212131961688542, "grad_norm": 0.2837604582309723, "learning_rate": 3.097422806051178e-06, "loss": 0.3974, "num_input_tokens_seen": 8994737208, "step": 2315, "train_runtime": 62109.5775, "train_tokens_per_second": 144820.454 }, { "epoch": 0.8215679318907414, "grad_norm": 0.247392475605011, "learning_rate": 3.0854932283694338e-06, "loss": 0.3919, "num_input_tokens_seen": 8998645151, "step": 2316, "train_runtime": 62131.6538, "train_tokens_per_second": 144831.895 }, { "epoch": 0.8219226676126286, "grad_norm": 0.2281252145767212, "learning_rate": 3.0735847476328585e-06, "loss": 0.3836, "num_input_tokens_seen": 9002555007, "step": 2317, "train_runtime": 62150.5924, "train_tokens_per_second": 144850.671 }, { "epoch": 0.8222774033345158, "grad_norm": 0.24129034578800201, "learning_rate": 3.061697378694537e-06, "loss": 0.39, "num_input_tokens_seen": 9006473863, "step": 2318, "train_runtime": 62183.1689, "train_tokens_per_second": 144837.808 }, { "epoch": 0.822632139056403, "grad_norm": 0.24658972024917603, "learning_rate": 3.0498311363812185e-06, "loss": 0.3994, "num_input_tokens_seen": 9010354253, "step": 2319, "train_runtime": 62202.8415, "train_tokens_per_second": 144854.383 }, { "epoch": 0.8229868747782901, "grad_norm": 0.3694169223308563, "learning_rate": 3.0379860354933012e-06, "loss": 0.3873, "num_input_tokens_seen": 9014225684, "step": 2320, "train_runtime": 62236.0819, "train_tokens_per_second": 144839.222 }, { "epoch": 0.8233416105001774, "grad_norm": 0.26651954650878906, "learning_rate": 3.0261620908048184e-06, "loss": 0.4014, "num_input_tokens_seen": 9018036535, "step": 2321, "train_runtime": 62256.3775, "train_tokens_per_second": 144853.217 }, { "epoch": 0.8236963462220646, "grad_norm": 0.3189883232116699, "learning_rate": 3.014359317063413e-06, "loss": 0.3868, "num_input_tokens_seen": 9021924049, "step": 2322, "train_runtime": 62277.4157, "train_tokens_per_second": 144866.706 }, { "epoch": 0.8240510819439517, "grad_norm": 0.26129069924354553, "learning_rate": 3.0025777289903212e-06, "loss": 0.3886, "num_input_tokens_seen": 9025764072, "step": 2323, "train_runtime": 62301.3342, "train_tokens_per_second": 144872.725 }, { "epoch": 0.824405817665839, "grad_norm": 0.3109435737133026, "learning_rate": 2.9908173412803564e-06, "loss": 0.3942, "num_input_tokens_seen": 9029637777, "step": 2324, "train_runtime": 62326.8164, "train_tokens_per_second": 144875.646 }, { "epoch": 0.8247605533877261, "grad_norm": 0.36247873306274414, "learning_rate": 2.9790781686018897e-06, "loss": 0.3915, "num_input_tokens_seen": 9033535134, "step": 2325, "train_runtime": 62358.2449, "train_tokens_per_second": 144865.128 }, { "epoch": 0.8251152891096133, "grad_norm": 0.3161031901836395, "learning_rate": 2.9673602255968336e-06, "loss": 0.374, "num_input_tokens_seen": 9037400671, "step": 2326, "train_runtime": 62379.9662, "train_tokens_per_second": 144876.652 }, { "epoch": 0.8254700248315006, "grad_norm": 0.23324695229530334, "learning_rate": 2.9556635268806165e-06, "loss": 0.3858, "num_input_tokens_seen": 9041305179, "step": 2327, "train_runtime": 62399.3463, "train_tokens_per_second": 144894.229 }, { "epoch": 0.8258247605533877, "grad_norm": 0.42883968353271484, "learning_rate": 2.943988087042169e-06, "loss": 0.3913, "num_input_tokens_seen": 9045119447, "step": 2328, "train_runtime": 62429.6708, "train_tokens_per_second": 144884.945 }, { "epoch": 0.8261794962752749, "grad_norm": 0.27072733640670776, "learning_rate": 2.932333920643913e-06, "loss": 0.3858, "num_input_tokens_seen": 9049093773, "step": 2329, "train_runtime": 62453.9688, "train_tokens_per_second": 144892.213 }, { "epoch": 0.8265342319971621, "grad_norm": 0.22166064381599426, "learning_rate": 2.9207010422217274e-06, "loss": 0.3901, "num_input_tokens_seen": 9052906146, "step": 2330, "train_runtime": 62472.4448, "train_tokens_per_second": 144910.387 }, { "epoch": 0.8268889677190493, "grad_norm": 0.2573656141757965, "learning_rate": 2.909089466284949e-06, "loss": 0.3933, "num_input_tokens_seen": 9056825055, "step": 2331, "train_runtime": 62508.7701, "train_tokens_per_second": 144888.87 }, { "epoch": 0.8272437034409365, "grad_norm": 0.28824442625045776, "learning_rate": 2.897499207316339e-06, "loss": 0.3898, "num_input_tokens_seen": 9060740001, "step": 2332, "train_runtime": 62529.9931, "train_tokens_per_second": 144902.303 }, { "epoch": 0.8275984391628237, "grad_norm": 0.2788163721561432, "learning_rate": 2.8859302797720756e-06, "loss": 0.3989, "num_input_tokens_seen": 9064598349, "step": 2333, "train_runtime": 62552.2516, "train_tokens_per_second": 144912.423 }, { "epoch": 0.8279531748847109, "grad_norm": 0.2694867253303528, "learning_rate": 2.8743826980817193e-06, "loss": 0.3852, "num_input_tokens_seen": 9068444768, "step": 2334, "train_runtime": 62574.6636, "train_tokens_per_second": 144921.99 }, { "epoch": 0.828307910606598, "grad_norm": 0.24592271447181702, "learning_rate": 2.8628564766482193e-06, "loss": 0.3795, "num_input_tokens_seen": 9072352632, "step": 2335, "train_runtime": 62605.4152, "train_tokens_per_second": 144913.225 }, { "epoch": 0.8286626463284853, "grad_norm": 0.23494239151477814, "learning_rate": 2.8513516298478737e-06, "loss": 0.3911, "num_input_tokens_seen": 9076229053, "step": 2336, "train_runtime": 62625.287, "train_tokens_per_second": 144929.141 }, { "epoch": 0.8290173820503725, "grad_norm": 0.3249126374721527, "learning_rate": 2.8398681720303313e-06, "loss": 0.3853, "num_input_tokens_seen": 9080132589, "step": 2337, "train_runtime": 62644.7908, "train_tokens_per_second": 144946.331 }, { "epoch": 0.8293721177722597, "grad_norm": 0.28514835238456726, "learning_rate": 2.828406117518545e-06, "loss": 0.3864, "num_input_tokens_seen": 9083962513, "step": 2338, "train_runtime": 62683.9295, "train_tokens_per_second": 144916.928 }, { "epoch": 0.8297268534941469, "grad_norm": 0.30452480912208557, "learning_rate": 2.816965480608802e-06, "loss": 0.3869, "num_input_tokens_seen": 9087824689, "step": 2339, "train_runtime": 62704.2941, "train_tokens_per_second": 144931.457 }, { "epoch": 0.830081589216034, "grad_norm": 0.4818491041660309, "learning_rate": 2.805546275570643e-06, "loss": 0.3821, "num_input_tokens_seen": 9091786534, "step": 2340, "train_runtime": 62730.5114, "train_tokens_per_second": 144934.041 }, { "epoch": 0.8304363249379213, "grad_norm": 0.24927091598510742, "learning_rate": 2.794148516646897e-06, "loss": 0.3953, "num_input_tokens_seen": 9095605607, "step": 2341, "train_runtime": 62751.0085, "train_tokens_per_second": 144947.56 }, { "epoch": 0.8307910606598085, "grad_norm": 0.36131930351257324, "learning_rate": 2.7827722180536355e-06, "loss": 0.3754, "num_input_tokens_seen": 9099532712, "step": 2342, "train_runtime": 62778.3671, "train_tokens_per_second": 144946.948 }, { "epoch": 0.8311457963816956, "grad_norm": 0.3661107122898102, "learning_rate": 2.7714173939801714e-06, "loss": 0.3891, "num_input_tokens_seen": 9103388705, "step": 2343, "train_runtime": 62798.6643, "train_tokens_per_second": 144961.502 }, { "epoch": 0.8315005321035829, "grad_norm": 0.2173675000667572, "learning_rate": 2.7600840585890255e-06, "loss": 0.3779, "num_input_tokens_seen": 9107308682, "step": 2344, "train_runtime": 62830.8114, "train_tokens_per_second": 144949.723 }, { "epoch": 0.83185526782547, "grad_norm": 0.26679566502571106, "learning_rate": 2.7487722260159256e-06, "loss": 0.3779, "num_input_tokens_seen": 9111175234, "step": 2345, "train_runtime": 62864.5181, "train_tokens_per_second": 144933.51 }, { "epoch": 0.8322100035473572, "grad_norm": 0.20934396982192993, "learning_rate": 2.7374819103697615e-06, "loss": 0.382, "num_input_tokens_seen": 9115127625, "step": 2346, "train_runtime": 62895.9472, "train_tokens_per_second": 144923.926 }, { "epoch": 0.8325647392692445, "grad_norm": 0.23315493762493134, "learning_rate": 2.726213125732602e-06, "loss": 0.3805, "num_input_tokens_seen": 9118946426, "step": 2347, "train_runtime": 62916.0893, "train_tokens_per_second": 144938.227 }, { "epoch": 0.8329194749911316, "grad_norm": 0.2695687711238861, "learning_rate": 2.7149658861596593e-06, "loss": 0.386, "num_input_tokens_seen": 9122842338, "step": 2348, "train_runtime": 62944.053, "train_tokens_per_second": 144935.731 }, { "epoch": 0.8332742107130188, "grad_norm": 0.3175129294395447, "learning_rate": 2.7037402056792684e-06, "loss": 0.389, "num_input_tokens_seen": 9126743750, "step": 2349, "train_runtime": 62975.3545, "train_tokens_per_second": 144925.643 }, { "epoch": 0.833628946434906, "grad_norm": 0.24553725123405457, "learning_rate": 2.6925360982928772e-06, "loss": 0.3963, "num_input_tokens_seen": 9130614951, "step": 2350, "train_runtime": 63001.8307, "train_tokens_per_second": 144926.185 }, { "epoch": 0.8339836821567932, "grad_norm": 0.36412733793258667, "learning_rate": 2.6813535779750212e-06, "loss": 0.3777, "num_input_tokens_seen": 9134514620, "step": 2351, "train_runtime": 63040.0637, "train_tokens_per_second": 144900.149 }, { "epoch": 0.8343384178786803, "grad_norm": 0.24937084317207336, "learning_rate": 2.6701926586733227e-06, "loss": 0.3807, "num_input_tokens_seen": 9138425612, "step": 2352, "train_runtime": 63066.1377, "train_tokens_per_second": 144902.256 }, { "epoch": 0.8346931536005676, "grad_norm": 0.29104122519493103, "learning_rate": 2.659053354308445e-06, "loss": 0.3873, "num_input_tokens_seen": 9142261264, "step": 2353, "train_runtime": 63087.2304, "train_tokens_per_second": 144914.608 }, { "epoch": 0.8350478893224548, "grad_norm": 0.35300636291503906, "learning_rate": 2.6479356787741006e-06, "loss": 0.3929, "num_input_tokens_seen": 9146115282, "step": 2354, "train_runtime": 63104.6207, "train_tokens_per_second": 144935.746 }, { "epoch": 0.8354026250443419, "grad_norm": 0.32984381914138794, "learning_rate": 2.6368396459370328e-06, "loss": 0.3995, "num_input_tokens_seen": 9150001079, "step": 2355, "train_runtime": 63128.2452, "train_tokens_per_second": 144943.061 }, { "epoch": 0.8357573607662292, "grad_norm": 0.23188185691833496, "learning_rate": 2.6257652696369773e-06, "loss": 0.3844, "num_input_tokens_seen": 9153805124, "step": 2356, "train_runtime": 63153.8719, "train_tokens_per_second": 144944.48 }, { "epoch": 0.8361120964881164, "grad_norm": 0.2410576194524765, "learning_rate": 2.6147125636866655e-06, "loss": 0.3869, "num_input_tokens_seen": 9157702079, "step": 2357, "train_runtime": 63179.6168, "train_tokens_per_second": 144947.098 }, { "epoch": 0.8364668322100035, "grad_norm": 1.5020517110824585, "learning_rate": 2.603681541871803e-06, "loss": 0.394, "num_input_tokens_seen": 9161629374, "step": 2358, "train_runtime": 63204.1487, "train_tokens_per_second": 144952.975 }, { "epoch": 0.8368215679318908, "grad_norm": 0.45133015513420105, "learning_rate": 2.5926722179510334e-06, "loss": 0.384, "num_input_tokens_seen": 9165529996, "step": 2359, "train_runtime": 63234.3944, "train_tokens_per_second": 144945.327 }, { "epoch": 0.8371763036537779, "grad_norm": 0.33665281534194946, "learning_rate": 2.5816846056559564e-06, "loss": 0.389, "num_input_tokens_seen": 9169401020, "step": 2360, "train_runtime": 63266.0968, "train_tokens_per_second": 144933.882 }, { "epoch": 0.8375310393756651, "grad_norm": 0.4281953275203705, "learning_rate": 2.5707187186910808e-06, "loss": 0.3904, "num_input_tokens_seen": 9173339312, "step": 2361, "train_runtime": 63290.9856, "train_tokens_per_second": 144939.113 }, { "epoch": 0.8378857750975524, "grad_norm": 0.2651159465312958, "learning_rate": 2.559774570733822e-06, "loss": 0.387, "num_input_tokens_seen": 9177159833, "step": 2362, "train_runtime": 63309.9232, "train_tokens_per_second": 144956.104 }, { "epoch": 0.8382405108194395, "grad_norm": 0.27285856008529663, "learning_rate": 2.5488521754344797e-06, "loss": 0.3805, "num_input_tokens_seen": 9181040703, "step": 2363, "train_runtime": 63333.7403, "train_tokens_per_second": 144962.869 }, { "epoch": 0.8385952465413267, "grad_norm": 0.2801761031150818, "learning_rate": 2.5379515464162285e-06, "loss": 0.3879, "num_input_tokens_seen": 9184908426, "step": 2364, "train_runtime": 63358.8888, "train_tokens_per_second": 144966.375 }, { "epoch": 0.8389499822632139, "grad_norm": 0.21748310327529907, "learning_rate": 2.5270726972750768e-06, "loss": 0.3868, "num_input_tokens_seen": 9188784624, "step": 2365, "train_runtime": 63383.9467, "train_tokens_per_second": 144970.219 }, { "epoch": 0.8393047179851011, "grad_norm": 0.2225230485200882, "learning_rate": 2.5162156415798843e-06, "loss": 0.4004, "num_input_tokens_seen": 9192719875, "step": 2366, "train_runtime": 63422.8378, "train_tokens_per_second": 144943.371 }, { "epoch": 0.8396594537069882, "grad_norm": 0.2676551043987274, "learning_rate": 2.505380392872325e-06, "loss": 0.3971, "num_input_tokens_seen": 9196525235, "step": 2367, "train_runtime": 63444.9672, "train_tokens_per_second": 144952.794 }, { "epoch": 0.8400141894288755, "grad_norm": 0.2471899688243866, "learning_rate": 2.4945669646668693e-06, "loss": 0.3848, "num_input_tokens_seen": 9200438003, "step": 2368, "train_runtime": 63477.8787, "train_tokens_per_second": 144939.28 }, { "epoch": 0.8403689251507627, "grad_norm": 0.3509639799594879, "learning_rate": 2.4837753704507783e-06, "loss": 0.3847, "num_input_tokens_seen": 9204345345, "step": 2369, "train_runtime": 63500.2638, "train_tokens_per_second": 144949.718 }, { "epoch": 0.8407236608726498, "grad_norm": 0.23189543187618256, "learning_rate": 2.4730056236840727e-06, "loss": 0.3946, "num_input_tokens_seen": 9208233675, "step": 2370, "train_runtime": 63522.9385, "train_tokens_per_second": 144959.189 }, { "epoch": 0.8410783965945371, "grad_norm": 0.2460927516222, "learning_rate": 2.462257737799527e-06, "loss": 0.3949, "num_input_tokens_seen": 9212196069, "step": 2371, "train_runtime": 63543.0748, "train_tokens_per_second": 144975.611 }, { "epoch": 0.8414331323164242, "grad_norm": 0.22761312127113342, "learning_rate": 2.4515317262026516e-06, "loss": 0.3919, "num_input_tokens_seen": 9216058125, "step": 2372, "train_runtime": 63568.2591, "train_tokens_per_second": 144978.929 }, { "epoch": 0.8417878680383114, "grad_norm": 0.26683786511421204, "learning_rate": 2.4408276022716714e-06, "loss": 0.3982, "num_input_tokens_seen": 9219930794, "step": 2373, "train_runtime": 63594.4663, "train_tokens_per_second": 144980.08 }, { "epoch": 0.8421426037601987, "grad_norm": 0.20171226561069489, "learning_rate": 2.4301453793575112e-06, "loss": 0.3843, "num_input_tokens_seen": 9223704400, "step": 2374, "train_runtime": 63613.1699, "train_tokens_per_second": 144996.774 }, { "epoch": 0.8424973394820858, "grad_norm": 0.29553118348121643, "learning_rate": 2.41948507078378e-06, "loss": 0.3765, "num_input_tokens_seen": 9227663720, "step": 2375, "train_runtime": 63636.2236, "train_tokens_per_second": 145006.463 }, { "epoch": 0.8428520752039731, "grad_norm": 0.20284101366996765, "learning_rate": 2.408846689846751e-06, "loss": 0.3888, "num_input_tokens_seen": 9231562761, "step": 2376, "train_runtime": 63654.7312, "train_tokens_per_second": 145025.556 }, { "epoch": 0.8432068109258603, "grad_norm": 0.31054115295410156, "learning_rate": 2.3982302498153587e-06, "loss": 0.3838, "num_input_tokens_seen": 9235393167, "step": 2377, "train_runtime": 63681.1033, "train_tokens_per_second": 145025.646 }, { "epoch": 0.8435615466477474, "grad_norm": 0.29871729016304016, "learning_rate": 2.387635763931151e-06, "loss": 0.3782, "num_input_tokens_seen": 9239317118, "step": 2378, "train_runtime": 63706.7754, "train_tokens_per_second": 145028.799 }, { "epoch": 0.8439162823696347, "grad_norm": 0.2610792815685272, "learning_rate": 2.377063245408311e-06, "loss": 0.3973, "num_input_tokens_seen": 9243262652, "step": 2379, "train_runtime": 63738.8969, "train_tokens_per_second": 145017.612 }, { "epoch": 0.8442710180915218, "grad_norm": 0.30532464385032654, "learning_rate": 2.3665127074336145e-06, "loss": 0.3746, "num_input_tokens_seen": 9247255426, "step": 2380, "train_runtime": 63766.4006, "train_tokens_per_second": 145017.679 }, { "epoch": 0.844625753813409, "grad_norm": 0.34844011068344116, "learning_rate": 2.355984163166427e-06, "loss": 0.387, "num_input_tokens_seen": 9251131187, "step": 2381, "train_runtime": 63786.1391, "train_tokens_per_second": 145033.566 }, { "epoch": 0.8449804895352963, "grad_norm": 0.32958468794822693, "learning_rate": 2.345477625738677e-06, "loss": 0.4016, "num_input_tokens_seen": 9254989435, "step": 2382, "train_runtime": 63811.2553, "train_tokens_per_second": 145036.944 }, { "epoch": 0.8453352252571834, "grad_norm": 0.39983633160591125, "learning_rate": 2.3349931082548507e-06, "loss": 0.3977, "num_input_tokens_seen": 9258890091, "step": 2383, "train_runtime": 63839.452, "train_tokens_per_second": 145033.984 }, { "epoch": 0.8456899609790706, "grad_norm": 0.35043928027153015, "learning_rate": 2.324530623791954e-06, "loss": 0.3934, "num_input_tokens_seen": 9262759699, "step": 2384, "train_runtime": 63865.8026, "train_tokens_per_second": 145034.734 }, { "epoch": 0.8460446967009578, "grad_norm": 0.2417929470539093, "learning_rate": 2.3140901853995313e-06, "loss": 0.3857, "num_input_tokens_seen": 9266611259, "step": 2385, "train_runtime": 63902.9259, "train_tokens_per_second": 145010.751 }, { "epoch": 0.846399432422845, "grad_norm": 0.303269624710083, "learning_rate": 2.3036718060996167e-06, "loss": 0.3892, "num_input_tokens_seen": 9270531353, "step": 2386, "train_runtime": 63931.7685, "train_tokens_per_second": 145006.646 }, { "epoch": 0.8467541681447321, "grad_norm": 0.23743359744548798, "learning_rate": 2.293275498886738e-06, "loss": 0.3934, "num_input_tokens_seen": 9274433320, "step": 2387, "train_runtime": 63959.6483, "train_tokens_per_second": 145004.445 }, { "epoch": 0.8471089038666194, "grad_norm": 0.28829994797706604, "learning_rate": 2.282901276727889e-06, "loss": 0.3905, "num_input_tokens_seen": 9278298808, "step": 2388, "train_runtime": 63985.111, "train_tokens_per_second": 145007.153 }, { "epoch": 0.8474636395885066, "grad_norm": 0.24972280859947205, "learning_rate": 2.272549152562518e-06, "loss": 0.3793, "num_input_tokens_seen": 9282189878, "step": 2389, "train_runtime": 64010.1143, "train_tokens_per_second": 145011.3 }, { "epoch": 0.8478183753103937, "grad_norm": 0.7858405709266663, "learning_rate": 2.262219139302513e-06, "loss": 0.3789, "num_input_tokens_seen": 9286078988, "step": 2390, "train_runtime": 64029.9748, "train_tokens_per_second": 145027.06 }, { "epoch": 0.848173111032281, "grad_norm": 0.2630370259284973, "learning_rate": 2.2519112498321836e-06, "loss": 0.3916, "num_input_tokens_seen": 9289977397, "step": 2391, "train_runtime": 64054.9987, "train_tokens_per_second": 145031.264 }, { "epoch": 0.8485278467541681, "grad_norm": 0.2992914617061615, "learning_rate": 2.2416254970082394e-06, "loss": 0.3815, "num_input_tokens_seen": 9293865417, "step": 2392, "train_runtime": 64085.062, "train_tokens_per_second": 145023.897 }, { "epoch": 0.8488825824760553, "grad_norm": 0.5925807356834412, "learning_rate": 2.231361893659789e-06, "loss": 0.3895, "num_input_tokens_seen": 9297743450, "step": 2393, "train_runtime": 64118.3376, "train_tokens_per_second": 145009.116 }, { "epoch": 0.8492373181979426, "grad_norm": 0.2763619124889374, "learning_rate": 2.221120452588308e-06, "loss": 0.384, "num_input_tokens_seen": 9301644145, "step": 2394, "train_runtime": 64141.3064, "train_tokens_per_second": 145018.003 }, { "epoch": 0.8495920539198297, "grad_norm": 0.22321288287639618, "learning_rate": 2.2109011865676334e-06, "loss": 0.3808, "num_input_tokens_seen": 9305606892, "step": 2395, "train_runtime": 64166.4264, "train_tokens_per_second": 145022.988 }, { "epoch": 0.8499467896417169, "grad_norm": 0.2904931604862213, "learning_rate": 2.2007041083439386e-06, "loss": 0.3864, "num_input_tokens_seen": 9309473273, "step": 2396, "train_runtime": 64198.2195, "train_tokens_per_second": 145011.393 }, { "epoch": 0.8503015253636041, "grad_norm": 0.23686327040195465, "learning_rate": 2.1905292306357274e-06, "loss": 0.3861, "num_input_tokens_seen": 9313375145, "step": 2397, "train_runtime": 64219.6381, "train_tokens_per_second": 145023.787 }, { "epoch": 0.8506562610854913, "grad_norm": 0.351521760225296, "learning_rate": 2.1803765661338107e-06, "loss": 0.3968, "num_input_tokens_seen": 9317272087, "step": 2398, "train_runtime": 64246.6103, "train_tokens_per_second": 145023.559 }, { "epoch": 0.8510109968073785, "grad_norm": 0.2705916166305542, "learning_rate": 2.170246127501301e-06, "loss": 0.4002, "num_input_tokens_seen": 9321150084, "step": 2399, "train_runtime": 64274.6408, "train_tokens_per_second": 145020.648 }, { "epoch": 0.8513657325292657, "grad_norm": 0.24254034459590912, "learning_rate": 2.160137927373578e-06, "loss": 0.3915, "num_input_tokens_seen": 9325035191, "step": 2400, "train_runtime": 64309.8212, "train_tokens_per_second": 145001.728 }, { "epoch": 0.8517204682511529, "grad_norm": 0.2652006149291992, "learning_rate": 2.1500519783582962e-06, "loss": 0.3742, "num_input_tokens_seen": 9328898028, "step": 2401, "train_runtime": 64434.0356, "train_tokens_per_second": 144782.147 }, { "epoch": 0.85207520397304, "grad_norm": 0.2372075617313385, "learning_rate": 2.13998829303534e-06, "loss": 0.3949, "num_input_tokens_seen": 9332802938, "step": 2402, "train_runtime": 64460.0967, "train_tokens_per_second": 144784.191 }, { "epoch": 0.8524299396949273, "grad_norm": 0.2638118863105774, "learning_rate": 2.1299468839568436e-06, "loss": 0.391, "num_input_tokens_seen": 9336663377, "step": 2403, "train_runtime": 64484.2519, "train_tokens_per_second": 144789.822 }, { "epoch": 0.8527846754168145, "grad_norm": 0.20981386303901672, "learning_rate": 2.1199277636471427e-06, "loss": 0.3969, "num_input_tokens_seen": 9340630810, "step": 2404, "train_runtime": 64512.7393, "train_tokens_per_second": 144787.385 }, { "epoch": 0.8531394111387016, "grad_norm": 0.36527878046035767, "learning_rate": 2.109930944602783e-06, "loss": 0.3915, "num_input_tokens_seen": 9344403368, "step": 2405, "train_runtime": 64536.9818, "train_tokens_per_second": 144791.453 }, { "epoch": 0.8534941468605889, "grad_norm": 0.26998433470726013, "learning_rate": 2.0999564392924897e-06, "loss": 0.389, "num_input_tokens_seen": 9348342762, "step": 2406, "train_runtime": 64563.5459, "train_tokens_per_second": 144792.896 }, { "epoch": 0.853848882582476, "grad_norm": 0.29893964529037476, "learning_rate": 2.0900042601571548e-06, "loss": 0.3801, "num_input_tokens_seen": 9352185947, "step": 2407, "train_runtime": 64584.6858, "train_tokens_per_second": 144805.008 }, { "epoch": 0.8542036183043632, "grad_norm": 0.28778746724128723, "learning_rate": 2.0800744196098276e-06, "loss": 0.3935, "num_input_tokens_seen": 9356132312, "step": 2408, "train_runtime": 64605.6861, "train_tokens_per_second": 144819.023 }, { "epoch": 0.8545583540262505, "grad_norm": 0.284683495759964, "learning_rate": 2.0701669300356908e-06, "loss": 0.3845, "num_input_tokens_seen": 9360000791, "step": 2409, "train_runtime": 64635.4361, "train_tokens_per_second": 144812.217 }, { "epoch": 0.8549130897481376, "grad_norm": 0.2604880928993225, "learning_rate": 2.060281803792057e-06, "loss": 0.3767, "num_input_tokens_seen": 9363873401, "step": 2410, "train_runtime": 64654.687, "train_tokens_per_second": 144828.996 }, { "epoch": 0.8552678254700248, "grad_norm": 0.24045820534229279, "learning_rate": 2.050419053208337e-06, "loss": 0.3894, "num_input_tokens_seen": 9367782356, "step": 2411, "train_runtime": 64684.559, "train_tokens_per_second": 144822.543 }, { "epoch": 0.855622561191912, "grad_norm": 0.23311877250671387, "learning_rate": 2.040578690586037e-06, "loss": 0.3763, "num_input_tokens_seen": 9371657132, "step": 2412, "train_runtime": 64705.7891, "train_tokens_per_second": 144834.91 }, { "epoch": 0.8559772969137992, "grad_norm": 0.2761721611022949, "learning_rate": 2.030760728198742e-06, "loss": 0.3851, "num_input_tokens_seen": 9375616211, "step": 2413, "train_runtime": 64737.4866, "train_tokens_per_second": 144825.15 }, { "epoch": 0.8563320326356865, "grad_norm": 0.23860573768615723, "learning_rate": 2.020965178292096e-06, "loss": 0.3798, "num_input_tokens_seen": 9379468458, "step": 2414, "train_runtime": 64759.8911, "train_tokens_per_second": 144834.531 }, { "epoch": 0.8566867683575736, "grad_norm": 0.28962865471839905, "learning_rate": 2.0111920530837815e-06, "loss": 0.3859, "num_input_tokens_seen": 9383396622, "step": 2415, "train_runtime": 64782.6063, "train_tokens_per_second": 144844.383 }, { "epoch": 0.8570415040794608, "grad_norm": 0.1958659142255783, "learning_rate": 2.001441364763521e-06, "loss": 0.3851, "num_input_tokens_seen": 9387297475, "step": 2416, "train_runtime": 64803.8352, "train_tokens_per_second": 144857.128 }, { "epoch": 0.857396239801348, "grad_norm": 0.5294010043144226, "learning_rate": 1.991713125493049e-06, "loss": 0.3886, "num_input_tokens_seen": 9391197208, "step": 2417, "train_runtime": 64826.1682, "train_tokens_per_second": 144867.381 }, { "epoch": 0.8577509755232352, "grad_norm": 0.2330729365348816, "learning_rate": 1.982007347406101e-06, "loss": 0.3769, "num_input_tokens_seen": 9395009319, "step": 2418, "train_runtime": 64846.8576, "train_tokens_per_second": 144879.947 }, { "epoch": 0.8581057112451224, "grad_norm": 0.2581097185611725, "learning_rate": 1.972324042608393e-06, "loss": 0.4019, "num_input_tokens_seen": 9398870909, "step": 2419, "train_runtime": 64873.5878, "train_tokens_per_second": 144879.777 }, { "epoch": 0.8584604469670096, "grad_norm": 0.5652875900268555, "learning_rate": 1.962663223177621e-06, "loss": 0.389, "num_input_tokens_seen": 9402762414, "step": 2420, "train_runtime": 64899.0504, "train_tokens_per_second": 144882.897 }, { "epoch": 0.8588151826888968, "grad_norm": 0.21880018711090088, "learning_rate": 1.9530249011634183e-06, "loss": 0.3941, "num_input_tokens_seen": 9406631633, "step": 2421, "train_runtime": 64929.2694, "train_tokens_per_second": 144875.057 }, { "epoch": 0.8591699184107839, "grad_norm": 0.32937800884246826, "learning_rate": 1.9434090885873736e-06, "loss": 0.3788, "num_input_tokens_seen": 9410534034, "step": 2422, "train_runtime": 64957.0824, "train_tokens_per_second": 144873.102 }, { "epoch": 0.8595246541326712, "grad_norm": 0.25910529494285583, "learning_rate": 1.933815797442995e-06, "loss": 0.39, "num_input_tokens_seen": 9414387026, "step": 2423, "train_runtime": 64981.4997, "train_tokens_per_second": 144877.959 }, { "epoch": 0.8598793898545584, "grad_norm": 0.2347790151834488, "learning_rate": 1.9242450396957e-06, "loss": 0.3896, "num_input_tokens_seen": 9418322709, "step": 2424, "train_runtime": 65014.0142, "train_tokens_per_second": 144866.039 }, { "epoch": 0.8602341255764455, "grad_norm": 0.21789060533046722, "learning_rate": 1.9146968272828003e-06, "loss": 0.3855, "num_input_tokens_seen": 9422187286, "step": 2425, "train_runtime": 65044.3298, "train_tokens_per_second": 144857.935 }, { "epoch": 0.8605888612983328, "grad_norm": 0.30356982350349426, "learning_rate": 1.905171172113487e-06, "loss": 0.394, "num_input_tokens_seen": 9426026542, "step": 2426, "train_runtime": 65070.7523, "train_tokens_per_second": 144858.115 }, { "epoch": 0.8609435970202199, "grad_norm": 0.2124083936214447, "learning_rate": 1.8956680860688203e-06, "loss": 0.3877, "num_input_tokens_seen": 9429976135, "step": 2427, "train_runtime": 65102.5137, "train_tokens_per_second": 144848.111 }, { "epoch": 0.8612983327421071, "grad_norm": 0.2240905612707138, "learning_rate": 1.8861875810017061e-06, "loss": 0.3867, "num_input_tokens_seen": 9433838859, "step": 2428, "train_runtime": 65133.2551, "train_tokens_per_second": 144839.051 }, { "epoch": 0.8616530684639944, "grad_norm": 0.22795191407203674, "learning_rate": 1.8767296687368875e-06, "loss": 0.3865, "num_input_tokens_seen": 9437712022, "step": 2429, "train_runtime": 65164.9574, "train_tokens_per_second": 144828.024 }, { "epoch": 0.8620078041858815, "grad_norm": 0.24732759594917297, "learning_rate": 1.8672943610709283e-06, "loss": 0.3891, "num_input_tokens_seen": 9441647448, "step": 2430, "train_runtime": 65189.2784, "train_tokens_per_second": 144834.361 }, { "epoch": 0.8623625399077687, "grad_norm": 0.22709886729717255, "learning_rate": 1.8578816697721969e-06, "loss": 0.3971, "num_input_tokens_seen": 9445445245, "step": 2431, "train_runtime": 65210.1343, "train_tokens_per_second": 144846.278 }, { "epoch": 0.8627172756296559, "grad_norm": 0.20601673424243927, "learning_rate": 1.8484916065808622e-06, "loss": 0.3798, "num_input_tokens_seen": 9449349629, "step": 2432, "train_runtime": 65239.5155, "train_tokens_per_second": 144840.892 }, { "epoch": 0.8630720113515431, "grad_norm": 0.29615288972854614, "learning_rate": 1.8391241832088492e-06, "loss": 0.3902, "num_input_tokens_seen": 9453251817, "step": 2433, "train_runtime": 65264.0811, "train_tokens_per_second": 144846.164 }, { "epoch": 0.8634267470734303, "grad_norm": 0.37092867493629456, "learning_rate": 1.8297794113398648e-06, "loss": 0.3883, "num_input_tokens_seen": 9457128046, "step": 2434, "train_runtime": 65286.8079, "train_tokens_per_second": 144855.115 }, { "epoch": 0.8637814827953175, "grad_norm": 0.3100888133049011, "learning_rate": 1.8204573026293526e-06, "loss": 0.3956, "num_input_tokens_seen": 9461005272, "step": 2435, "train_runtime": 65323.4901, "train_tokens_per_second": 144833.126 }, { "epoch": 0.8641362185172047, "grad_norm": 0.2154722809791565, "learning_rate": 1.8111578687044984e-06, "loss": 0.3941, "num_input_tokens_seen": 9464918247, "step": 2436, "train_runtime": 65360.8092, "train_tokens_per_second": 144810.298 }, { "epoch": 0.8644909542390918, "grad_norm": 0.24278314411640167, "learning_rate": 1.801881121164195e-06, "loss": 0.3812, "num_input_tokens_seen": 9468804603, "step": 2437, "train_runtime": 65392.3373, "train_tokens_per_second": 144799.911 }, { "epoch": 0.8648456899609791, "grad_norm": 0.2084408849477768, "learning_rate": 1.7926270715790472e-06, "loss": 0.3876, "num_input_tokens_seen": 9472771677, "step": 2438, "train_runtime": 65423.193, "train_tokens_per_second": 144792.256 }, { "epoch": 0.8652004256828663, "grad_norm": 0.2973061800003052, "learning_rate": 1.783395731491351e-06, "loss": 0.3841, "num_input_tokens_seen": 9476625940, "step": 2439, "train_runtime": 65444.1488, "train_tokens_per_second": 144804.786 }, { "epoch": 0.8655551614047534, "grad_norm": 0.23325474560260773, "learning_rate": 1.7741871124150666e-06, "loss": 0.3997, "num_input_tokens_seen": 9480539611, "step": 2440, "train_runtime": 65465.1588, "train_tokens_per_second": 144818.095 }, { "epoch": 0.8659098971266407, "grad_norm": 0.24528615176677704, "learning_rate": 1.7650012258358252e-06, "loss": 0.3977, "num_input_tokens_seen": 9484423810, "step": 2441, "train_runtime": 65485.2666, "train_tokens_per_second": 144832.942 }, { "epoch": 0.8662646328485278, "grad_norm": 0.46584033966064453, "learning_rate": 1.7558380832109012e-06, "loss": 0.4042, "num_input_tokens_seen": 9488266237, "step": 2442, "train_runtime": 65510.9248, "train_tokens_per_second": 144834.869 }, { "epoch": 0.866619368570415, "grad_norm": 0.26026445627212524, "learning_rate": 1.7466976959691994e-06, "loss": 0.391, "num_input_tokens_seen": 9492185962, "step": 2443, "train_runtime": 65537.1356, "train_tokens_per_second": 144836.754 }, { "epoch": 0.8669741042923023, "grad_norm": 0.22883379459381104, "learning_rate": 1.7375800755112425e-06, "loss": 0.3909, "num_input_tokens_seen": 9496092527, "step": 2444, "train_runtime": 65557.5788, "train_tokens_per_second": 144851.178 }, { "epoch": 0.8673288400141894, "grad_norm": 0.2760745882987976, "learning_rate": 1.7284852332091694e-06, "loss": 0.397, "num_input_tokens_seen": 9500051618, "step": 2445, "train_runtime": 65593.4033, "train_tokens_per_second": 144832.424 }, { "epoch": 0.8676835757360766, "grad_norm": 0.22152838110923767, "learning_rate": 1.7194131804066882e-06, "loss": 0.3785, "num_input_tokens_seen": 9503900331, "step": 2446, "train_runtime": 65614.8301, "train_tokens_per_second": 144843.785 }, { "epoch": 0.8680383114579638, "grad_norm": 0.24765466153621674, "learning_rate": 1.710363928419092e-06, "loss": 0.385, "num_input_tokens_seen": 9507792942, "step": 2447, "train_runtime": 65640.5016, "train_tokens_per_second": 144846.439 }, { "epoch": 0.868393047179851, "grad_norm": 0.22121743857860565, "learning_rate": 1.701337488533239e-06, "loss": 0.3983, "num_input_tokens_seen": 9511719004, "step": 2448, "train_runtime": 65665.9646, "train_tokens_per_second": 144850.061 }, { "epoch": 0.8687477829017382, "grad_norm": 0.2655586004257202, "learning_rate": 1.6923338720075277e-06, "loss": 0.3784, "num_input_tokens_seen": 9515542266, "step": 2449, "train_runtime": 65689.0841, "train_tokens_per_second": 144857.283 }, { "epoch": 0.8691025186236254, "grad_norm": 0.24966448545455933, "learning_rate": 1.6833530900718931e-06, "loss": 0.384, "num_input_tokens_seen": 9519453778, "step": 2450, "train_runtime": 65726.4236, "train_tokens_per_second": 144834.501 }, { "epoch": 0.8694572543455126, "grad_norm": 0.2092888355255127, "learning_rate": 1.6743951539277947e-06, "loss": 0.3904, "num_input_tokens_seen": 9523332936, "step": 2451, "train_runtime": 65759.4771, "train_tokens_per_second": 144820.691 }, { "epoch": 0.8698119900673997, "grad_norm": 0.8306344747543335, "learning_rate": 1.6654600747481819e-06, "loss": 0.3847, "num_input_tokens_seen": 9527264881, "step": 2452, "train_runtime": 65790.1743, "train_tokens_per_second": 144812.884 }, { "epoch": 0.870166725789287, "grad_norm": 0.35813745856285095, "learning_rate": 1.656547863677509e-06, "loss": 0.3892, "num_input_tokens_seen": 9531116012, "step": 2453, "train_runtime": 65817.3879, "train_tokens_per_second": 144811.52 }, { "epoch": 0.8705214615111742, "grad_norm": 0.6059327125549316, "learning_rate": 1.6476585318317018e-06, "loss": 0.383, "num_input_tokens_seen": 9534999072, "step": 2454, "train_runtime": 65842.5072, "train_tokens_per_second": 144815.249 }, { "epoch": 0.8708761972330614, "grad_norm": 0.27296847105026245, "learning_rate": 1.6387920902981535e-06, "loss": 0.3831, "num_input_tokens_seen": 9538920428, "step": 2455, "train_runtime": 65879.9852, "train_tokens_per_second": 144792.389 }, { "epoch": 0.8712309329549486, "grad_norm": 0.20469455420970917, "learning_rate": 1.6299485501357048e-06, "loss": 0.392, "num_input_tokens_seen": 9542778912, "step": 2456, "train_runtime": 65901.8485, "train_tokens_per_second": 144802.902 }, { "epoch": 0.8715856686768357, "grad_norm": 0.2095031887292862, "learning_rate": 1.6211279223746345e-06, "loss": 0.4016, "num_input_tokens_seen": 9546710335, "step": 2457, "train_runtime": 65927.428, "train_tokens_per_second": 144806.352 }, { "epoch": 0.871940404398723, "grad_norm": 0.22982187569141388, "learning_rate": 1.6123302180166312e-06, "loss": 0.4047, "num_input_tokens_seen": 9550551436, "step": 2458, "train_runtime": 65965.5176, "train_tokens_per_second": 144780.967 }, { "epoch": 0.8722951401206102, "grad_norm": 0.509998083114624, "learning_rate": 1.6035554480348102e-06, "loss": 0.3829, "num_input_tokens_seen": 9554497185, "step": 2459, "train_runtime": 65992.146, "train_tokens_per_second": 144782.338 }, { "epoch": 0.8726498758424973, "grad_norm": 0.6097367405891418, "learning_rate": 1.5948036233736642e-06, "loss": 0.4008, "num_input_tokens_seen": 9558330106, "step": 2460, "train_runtime": 66017.3864, "train_tokens_per_second": 144785.043 }, { "epoch": 0.8730046115643846, "grad_norm": 0.2200324535369873, "learning_rate": 1.5860747549490873e-06, "loss": 0.3977, "num_input_tokens_seen": 9562305584, "step": 2461, "train_runtime": 66034.0605, "train_tokens_per_second": 144808.687 }, { "epoch": 0.8733593472862717, "grad_norm": 0.2656064033508301, "learning_rate": 1.5773688536483223e-06, "loss": 0.3737, "num_input_tokens_seen": 9566173051, "step": 2462, "train_runtime": 66054.6982, "train_tokens_per_second": 144821.993 }, { "epoch": 0.8737140830081589, "grad_norm": 0.2307184487581253, "learning_rate": 1.5686859303299762e-06, "loss": 0.385, "num_input_tokens_seen": 9569993050, "step": 2463, "train_runtime": 66086.2099, "train_tokens_per_second": 144810.741 }, { "epoch": 0.8740688187300462, "grad_norm": 0.2738270163536072, "learning_rate": 1.560025995823986e-06, "loss": 0.3981, "num_input_tokens_seen": 9573918252, "step": 2464, "train_runtime": 66114.1875, "train_tokens_per_second": 144808.832 }, { "epoch": 0.8744235544519333, "grad_norm": 0.22412772476673126, "learning_rate": 1.5513890609316295e-06, "loss": 0.3875, "num_input_tokens_seen": 9577777356, "step": 2465, "train_runtime": 66134.2247, "train_tokens_per_second": 144823.311 }, { "epoch": 0.8747782901738205, "grad_norm": 0.3034251630306244, "learning_rate": 1.542775136425485e-06, "loss": 0.3926, "num_input_tokens_seen": 9581629644, "step": 2466, "train_runtime": 66159.4674, "train_tokens_per_second": 144826.281 }, { "epoch": 0.8751330258957077, "grad_norm": 0.26545441150665283, "learning_rate": 1.5341842330494383e-06, "loss": 0.3929, "num_input_tokens_seen": 9585536556, "step": 2467, "train_runtime": 66188.0797, "train_tokens_per_second": 144822.702 }, { "epoch": 0.8754877616175949, "grad_norm": 0.3181459605693817, "learning_rate": 1.5256163615186626e-06, "loss": 0.3949, "num_input_tokens_seen": 9589392442, "step": 2468, "train_runtime": 66212.4767, "train_tokens_per_second": 144827.575 }, { "epoch": 0.875842497339482, "grad_norm": 0.5673038363456726, "learning_rate": 1.5170715325195962e-06, "loss": 0.3767, "num_input_tokens_seen": 9593301766, "step": 2469, "train_runtime": 66232.4109, "train_tokens_per_second": 144843.01 }, { "epoch": 0.8761972330613693, "grad_norm": 0.19511640071868896, "learning_rate": 1.5085497567099471e-06, "loss": 0.3896, "num_input_tokens_seen": 9597133800, "step": 2470, "train_runtime": 66259.4287, "train_tokens_per_second": 144841.783 }, { "epoch": 0.8765519687832565, "grad_norm": 0.20549079775810242, "learning_rate": 1.50005104471866e-06, "loss": 0.3959, "num_input_tokens_seen": 9600988460, "step": 2471, "train_runtime": 66282.0261, "train_tokens_per_second": 144850.558 }, { "epoch": 0.8769067045051436, "grad_norm": 0.22855764627456665, "learning_rate": 1.4915754071459176e-06, "loss": 0.3947, "num_input_tokens_seen": 9604917820, "step": 2472, "train_runtime": 66309.0983, "train_tokens_per_second": 144850.678 }, { "epoch": 0.8772614402270309, "grad_norm": 0.35530564188957214, "learning_rate": 1.483122854563126e-06, "loss": 0.3938, "num_input_tokens_seen": 9608742242, "step": 2473, "train_runtime": 66326.1216, "train_tokens_per_second": 144871.161 }, { "epoch": 0.877616175948918, "grad_norm": 0.2368943691253662, "learning_rate": 1.474693397512892e-06, "loss": 0.396, "num_input_tokens_seen": 9612707161, "step": 2474, "train_runtime": 66351.1345, "train_tokens_per_second": 144876.304 }, { "epoch": 0.8779709116708052, "grad_norm": 0.33446577191352844, "learning_rate": 1.4662870465090207e-06, "loss": 0.387, "num_input_tokens_seen": 9616581204, "step": 2475, "train_runtime": 66378.3069, "train_tokens_per_second": 144875.361 }, { "epoch": 0.8783256473926925, "grad_norm": 0.3293631076812744, "learning_rate": 1.457903812036492e-06, "loss": 0.3864, "num_input_tokens_seen": 9620424466, "step": 2476, "train_runtime": 66402.5562, "train_tokens_per_second": 144880.333 }, { "epoch": 0.8786803831145796, "grad_norm": 0.33619219064712524, "learning_rate": 1.4495437045514616e-06, "loss": 0.3917, "num_input_tokens_seen": 9624262104, "step": 2477, "train_runtime": 66430.9084, "train_tokens_per_second": 144876.268 }, { "epoch": 0.8790351188364668, "grad_norm": 0.24092774093151093, "learning_rate": 1.441206734481233e-06, "loss": 0.4013, "num_input_tokens_seen": 9628124039, "step": 2478, "train_runtime": 66456.1451, "train_tokens_per_second": 144879.364 }, { "epoch": 0.8793898545583541, "grad_norm": 0.2560415267944336, "learning_rate": 1.4328929122242507e-06, "loss": 0.391, "num_input_tokens_seen": 9632036005, "step": 2479, "train_runtime": 66484.7852, "train_tokens_per_second": 144875.793 }, { "epoch": 0.8797445902802412, "grad_norm": 0.26726076006889343, "learning_rate": 1.4246022481500955e-06, "loss": 0.3871, "num_input_tokens_seen": 9635878409, "step": 2480, "train_runtime": 66514.9983, "train_tokens_per_second": 144867.754 }, { "epoch": 0.8800993260021284, "grad_norm": 0.24334903061389923, "learning_rate": 1.416334752599453e-06, "loss": 0.3901, "num_input_tokens_seen": 9639850790, "step": 2481, "train_runtime": 66540.5086, "train_tokens_per_second": 144871.913 }, { "epoch": 0.8804540617240156, "grad_norm": 0.31444355845451355, "learning_rate": 1.4080904358841218e-06, "loss": 0.3909, "num_input_tokens_seen": 9643649006, "step": 2482, "train_runtime": 66564.3791, "train_tokens_per_second": 144877.022 }, { "epoch": 0.8808087974459028, "grad_norm": 0.20502640306949615, "learning_rate": 1.3998693082869807e-06, "loss": 0.3846, "num_input_tokens_seen": 9647556323, "step": 2483, "train_runtime": 66592.0645, "train_tokens_per_second": 144875.465 }, { "epoch": 0.88116353316779, "grad_norm": 1.5613919496536255, "learning_rate": 1.3916713800619896e-06, "loss": 0.3862, "num_input_tokens_seen": 9651486963, "step": 2484, "train_runtime": 66618.6788, "train_tokens_per_second": 144876.589 }, { "epoch": 0.8815182688896772, "grad_norm": 0.2697506546974182, "learning_rate": 1.383496661434176e-06, "loss": 0.388, "num_input_tokens_seen": 9655273828, "step": 2485, "train_runtime": 66645.1272, "train_tokens_per_second": 144875.916 }, { "epoch": 0.8818730046115644, "grad_norm": 0.3437880873680115, "learning_rate": 1.3753451625996128e-06, "loss": 0.3742, "num_input_tokens_seen": 9659276547, "step": 2486, "train_runtime": 66675.8896, "train_tokens_per_second": 144869.106 }, { "epoch": 0.8822277403334515, "grad_norm": 0.3457321524620056, "learning_rate": 1.3672168937254138e-06, "loss": 0.3935, "num_input_tokens_seen": 9663115305, "step": 2487, "train_runtime": 66696.5744, "train_tokens_per_second": 144881.733 }, { "epoch": 0.8825824760553388, "grad_norm": 0.31999436020851135, "learning_rate": 1.359111864949727e-06, "loss": 0.3941, "num_input_tokens_seen": 9666998444, "step": 2488, "train_runtime": 66717.7755, "train_tokens_per_second": 144893.896 }, { "epoch": 0.882937211777226, "grad_norm": 0.3032386898994446, "learning_rate": 1.351030086381695e-06, "loss": 0.3824, "num_input_tokens_seen": 9670929825, "step": 2489, "train_runtime": 66739.385, "train_tokens_per_second": 144905.888 }, { "epoch": 0.8832919474991131, "grad_norm": 0.2715248465538025, "learning_rate": 1.3429715681014765e-06, "loss": 0.3916, "num_input_tokens_seen": 9674826103, "step": 2490, "train_runtime": 66763.0461, "train_tokens_per_second": 144912.892 }, { "epoch": 0.8836466832210004, "grad_norm": 0.5115591287612915, "learning_rate": 1.334936320160216e-06, "loss": 0.3859, "num_input_tokens_seen": 9678705298, "step": 2491, "train_runtime": 66782.2628, "train_tokens_per_second": 144929.281 }, { "epoch": 0.8840014189428875, "grad_norm": 0.22861982882022858, "learning_rate": 1.3269243525800301e-06, "loss": 0.3944, "num_input_tokens_seen": 9682599605, "step": 2492, "train_runtime": 66816.4207, "train_tokens_per_second": 144913.473 }, { "epoch": 0.8843561546647748, "grad_norm": 0.3418673574924469, "learning_rate": 1.3189356753540006e-06, "loss": 0.3888, "num_input_tokens_seen": 9686448859, "step": 2493, "train_runtime": 66840.3249, "train_tokens_per_second": 144919.237 }, { "epoch": 0.884710890386662, "grad_norm": 0.2649252414703369, "learning_rate": 1.3109702984461592e-06, "loss": 0.3811, "num_input_tokens_seen": 9690285327, "step": 2494, "train_runtime": 66877.6784, "train_tokens_per_second": 144895.66 }, { "epoch": 0.8850656261085491, "grad_norm": 0.2252597212791443, "learning_rate": 1.3030282317914766e-06, "loss": 0.3921, "num_input_tokens_seen": 9694182465, "step": 2495, "train_runtime": 66909.3953, "train_tokens_per_second": 144885.22 }, { "epoch": 0.8854203618304364, "grad_norm": 0.26488229632377625, "learning_rate": 1.295109485295847e-06, "loss": 0.3955, "num_input_tokens_seen": 9697992897, "step": 2496, "train_runtime": 66937.5461, "train_tokens_per_second": 144881.213 }, { "epoch": 0.8857750975523235, "grad_norm": 0.31922823190689087, "learning_rate": 1.2872140688360801e-06, "loss": 0.3851, "num_input_tokens_seen": 9701974705, "step": 2497, "train_runtime": 66963.6856, "train_tokens_per_second": 144884.121 }, { "epoch": 0.8861298332742107, "grad_norm": 0.20921534299850464, "learning_rate": 1.2793419922598882e-06, "loss": 0.3816, "num_input_tokens_seen": 9705904153, "step": 2498, "train_runtime": 66990.0423, "train_tokens_per_second": 144885.774 }, { "epoch": 0.886484568996098, "grad_norm": 0.2699640691280365, "learning_rate": 1.2714932653858704e-06, "loss": 0.3855, "num_input_tokens_seen": 9709828928, "step": 2499, "train_runtime": 67019.8959, "train_tokens_per_second": 144879.797 }, { "epoch": 0.8868393047179851, "grad_norm": 0.24078643321990967, "learning_rate": 1.2636678980035021e-06, "loss": 0.3973, "num_input_tokens_seen": 9713741757, "step": 2500, "train_runtime": 67042.168, "train_tokens_per_second": 144890.03 }, { "epoch": 0.8871940404398723, "grad_norm": 0.2246292531490326, "learning_rate": 1.2558658998731298e-06, "loss": 0.3885, "num_input_tokens_seen": 9717595799, "step": 2501, "train_runtime": 67071.073, "train_tokens_per_second": 144885.05 }, { "epoch": 0.8875487761617595, "grad_norm": 0.4093039333820343, "learning_rate": 1.2480872807259337e-06, "loss": 0.3776, "num_input_tokens_seen": 9721508258, "step": 2502, "train_runtime": 67090.7753, "train_tokens_per_second": 144900.818 }, { "epoch": 0.8879035118836467, "grad_norm": 0.2331242859363556, "learning_rate": 1.240332050263957e-06, "loss": 0.384, "num_input_tokens_seen": 9725346919, "step": 2503, "train_runtime": 67112.0292, "train_tokens_per_second": 144912.127 }, { "epoch": 0.8882582476055338, "grad_norm": 0.3715978264808655, "learning_rate": 1.232600218160056e-06, "loss": 0.4037, "num_input_tokens_seen": 9729266974, "step": 2504, "train_runtime": 67129.6829, "train_tokens_per_second": 144932.414 }, { "epoch": 0.8886129833274211, "grad_norm": 0.37216997146606445, "learning_rate": 1.2248917940579097e-06, "loss": 0.3912, "num_input_tokens_seen": 9733121008, "step": 2505, "train_runtime": 67166.9932, "train_tokens_per_second": 144909.286 }, { "epoch": 0.8889677190493083, "grad_norm": 0.2506546080112457, "learning_rate": 1.2172067875719962e-06, "loss": 0.3935, "num_input_tokens_seen": 9737007061, "step": 2506, "train_runtime": 67197.4916, "train_tokens_per_second": 144901.347 }, { "epoch": 0.8893224547711954, "grad_norm": 0.224944069981575, "learning_rate": 1.2095452082875947e-06, "loss": 0.392, "num_input_tokens_seen": 9740917059, "step": 2507, "train_runtime": 67217.2636, "train_tokens_per_second": 144916.894 }, { "epoch": 0.8896771904930827, "grad_norm": 0.20817303657531738, "learning_rate": 1.2019070657607523e-06, "loss": 0.3921, "num_input_tokens_seen": 9744831380, "step": 2508, "train_runtime": 67237.2396, "train_tokens_per_second": 144932.056 }, { "epoch": 0.8900319262149698, "grad_norm": 0.31403225660324097, "learning_rate": 1.1942923695182883e-06, "loss": 0.3807, "num_input_tokens_seen": 9748783456, "step": 2509, "train_runtime": 67264.7061, "train_tokens_per_second": 144931.629 }, { "epoch": 0.890386661936857, "grad_norm": 0.20825207233428955, "learning_rate": 1.1867011290577835e-06, "loss": 0.3952, "num_input_tokens_seen": 9752668909, "step": 2510, "train_runtime": 67290.6525, "train_tokens_per_second": 144933.487 }, { "epoch": 0.8907413976587443, "grad_norm": 0.6766046285629272, "learning_rate": 1.1791333538475569e-06, "loss": 0.3756, "num_input_tokens_seen": 9756529320, "step": 2511, "train_runtime": 67322.9611, "train_tokens_per_second": 144921.274 }, { "epoch": 0.8910961333806314, "grad_norm": 0.38873663544654846, "learning_rate": 1.1715890533266626e-06, "loss": 0.3946, "num_input_tokens_seen": 9760439086, "step": 2512, "train_runtime": 67342.3553, "train_tokens_per_second": 144937.596 }, { "epoch": 0.8914508691025186, "grad_norm": 0.2168574184179306, "learning_rate": 1.1640682369048783e-06, "loss": 0.3853, "num_input_tokens_seen": 9764381843, "step": 2513, "train_runtime": 67368.2507, "train_tokens_per_second": 144940.409 }, { "epoch": 0.8918056048244059, "grad_norm": 0.22484587132930756, "learning_rate": 1.1565709139626825e-06, "loss": 0.3922, "num_input_tokens_seen": 9768206613, "step": 2514, "train_runtime": 67394.0539, "train_tokens_per_second": 144941.668 }, { "epoch": 0.892160340546293, "grad_norm": 0.30770155787467957, "learning_rate": 1.149097093851259e-06, "loss": 0.3836, "num_input_tokens_seen": 9772171251, "step": 2515, "train_runtime": 67413.8583, "train_tokens_per_second": 144957.899 }, { "epoch": 0.8925150762681802, "grad_norm": 0.2820533812046051, "learning_rate": 1.141646785892474e-06, "loss": 0.3893, "num_input_tokens_seen": 9775965667, "step": 2516, "train_runtime": 67442.1442, "train_tokens_per_second": 144953.364 }, { "epoch": 0.8928698119900674, "grad_norm": 0.26916080713272095, "learning_rate": 1.1342199993788695e-06, "loss": 0.3828, "num_input_tokens_seen": 9779819582, "step": 2517, "train_runtime": 67466.6655, "train_tokens_per_second": 144957.803 }, { "epoch": 0.8932245477119546, "grad_norm": 0.2522388994693756, "learning_rate": 1.1268167435736444e-06, "loss": 0.3827, "num_input_tokens_seen": 9783703683, "step": 2518, "train_runtime": 67492.5935, "train_tokens_per_second": 144959.664 }, { "epoch": 0.8935792834338417, "grad_norm": 0.2654381990432739, "learning_rate": 1.119437027710657e-06, "loss": 0.3848, "num_input_tokens_seen": 9787584836, "step": 2519, "train_runtime": 67514.3833, "train_tokens_per_second": 144970.366 }, { "epoch": 0.893934019155729, "grad_norm": 0.22847065329551697, "learning_rate": 1.1120808609943956e-06, "loss": 0.368, "num_input_tokens_seen": 9791507809, "step": 2520, "train_runtime": 67539.7915, "train_tokens_per_second": 144973.912 }, { "epoch": 0.8942887548776162, "grad_norm": 0.2910318076610565, "learning_rate": 1.1047482525999832e-06, "loss": 0.383, "num_input_tokens_seen": 9795376525, "step": 2521, "train_runtime": 67571.6844, "train_tokens_per_second": 144962.74 }, { "epoch": 0.8946434905995033, "grad_norm": 0.25811129808425903, "learning_rate": 1.0974392116731592e-06, "loss": 0.395, "num_input_tokens_seen": 9799278909, "step": 2522, "train_runtime": 67597.8142, "train_tokens_per_second": 144964.435 }, { "epoch": 0.8949982263213906, "grad_norm": 0.4762890636920929, "learning_rate": 1.0901537473302626e-06, "loss": 0.3971, "num_input_tokens_seen": 9803182126, "step": 2523, "train_runtime": 67624.6328, "train_tokens_per_second": 144964.663 }, { "epoch": 0.8953529620432777, "grad_norm": 0.26774102449417114, "learning_rate": 1.0828918686582312e-06, "loss": 0.3954, "num_input_tokens_seen": 9807061495, "step": 2524, "train_runtime": 67652.1276, "train_tokens_per_second": 144963.091 }, { "epoch": 0.8957076977651649, "grad_norm": 0.32458558678627014, "learning_rate": 1.0756535847145844e-06, "loss": 0.3912, "num_input_tokens_seen": 9810976676, "step": 2525, "train_runtime": 67677.9199, "train_tokens_per_second": 144965.695 }, { "epoch": 0.8960624334870522, "grad_norm": 0.22812645137310028, "learning_rate": 1.0684389045274134e-06, "loss": 0.3977, "num_input_tokens_seen": 9814794391, "step": 2526, "train_runtime": 67710.5495, "train_tokens_per_second": 144952.219 }, { "epoch": 0.8964171692089393, "grad_norm": 0.33155229687690735, "learning_rate": 1.0612478370953627e-06, "loss": 0.3845, "num_input_tokens_seen": 9818657873, "step": 2527, "train_runtime": 67745.5684, "train_tokens_per_second": 144934.32 }, { "epoch": 0.8967719049308265, "grad_norm": 0.388156920671463, "learning_rate": 1.0540803913876308e-06, "loss": 0.3949, "num_input_tokens_seen": 9822577704, "step": 2528, "train_runtime": 67766.7541, "train_tokens_per_second": 144946.852 }, { "epoch": 0.8971266406527137, "grad_norm": 0.3085254728794098, "learning_rate": 1.0469365763439532e-06, "loss": 0.3893, "num_input_tokens_seen": 9826437194, "step": 2529, "train_runtime": 67797.2729, "train_tokens_per_second": 144938.532 }, { "epoch": 0.8974813763746009, "grad_norm": 0.23404604196548462, "learning_rate": 1.0398164008745916e-06, "loss": 0.3907, "num_input_tokens_seen": 9830329715, "step": 2530, "train_runtime": 67822.3309, "train_tokens_per_second": 144942.375 }, { "epoch": 0.8978361120964882, "grad_norm": 0.2299780249595642, "learning_rate": 1.0327198738603238e-06, "loss": 0.3887, "num_input_tokens_seen": 9834230767, "step": 2531, "train_runtime": 67860.5341, "train_tokens_per_second": 144918.263 }, { "epoch": 0.8981908478183753, "grad_norm": 0.22902683913707733, "learning_rate": 1.025647004152428e-06, "loss": 0.3944, "num_input_tokens_seen": 9838116798, "step": 2532, "train_runtime": 67886.6256, "train_tokens_per_second": 144919.809 }, { "epoch": 0.8985455835402625, "grad_norm": 0.19220133125782013, "learning_rate": 1.0185978005726804e-06, "loss": 0.3827, "num_input_tokens_seen": 9842023944, "step": 2533, "train_runtime": 67923.7019, "train_tokens_per_second": 144898.226 }, { "epoch": 0.8989003192621497, "grad_norm": 0.7201579213142395, "learning_rate": 1.0115722719133324e-06, "loss": 0.3886, "num_input_tokens_seen": 9845917288, "step": 2534, "train_runtime": 67955.4843, "train_tokens_per_second": 144887.751 }, { "epoch": 0.8992550549840369, "grad_norm": 0.3171398937702179, "learning_rate": 1.0045704269371125e-06, "loss": 0.3773, "num_input_tokens_seen": 9849837383, "step": 2535, "train_runtime": 67975.6426, "train_tokens_per_second": 144902.453 }, { "epoch": 0.8996097907059241, "grad_norm": 0.28875136375427246, "learning_rate": 9.97592274377206e-07, "loss": 0.4031, "num_input_tokens_seen": 9853667607, "step": 2536, "train_runtime": 68008.0779, "train_tokens_per_second": 144889.665 }, { "epoch": 0.8999645264278113, "grad_norm": 0.4499320983886719, "learning_rate": 9.906378229372505e-07, "loss": 0.3841, "num_input_tokens_seen": 9857499040, "step": 2537, "train_runtime": 68033.4738, "train_tokens_per_second": 144891.896 }, { "epoch": 0.9003192621496985, "grad_norm": 0.2517848312854767, "learning_rate": 9.837070812913208e-07, "loss": 0.3801, "num_input_tokens_seen": 9861427272, "step": 2538, "train_runtime": 68057.1453, "train_tokens_per_second": 144899.22 }, { "epoch": 0.9006739978715856, "grad_norm": 0.22726863622665405, "learning_rate": 9.768000580839131e-07, "loss": 0.3963, "num_input_tokens_seen": 9865321206, "step": 2539, "train_runtime": 68083.0977, "train_tokens_per_second": 144901.18 }, { "epoch": 0.9010287335934729, "grad_norm": 0.4794916808605194, "learning_rate": 9.699167619299477e-07, "loss": 0.3861, "num_input_tokens_seen": 9869226166, "step": 2540, "train_runtime": 68110.8415, "train_tokens_per_second": 144899.489 }, { "epoch": 0.9013834693153601, "grad_norm": 0.4152539372444153, "learning_rate": 9.6305720141475e-07, "loss": 0.3953, "num_input_tokens_seen": 9873014807, "step": 2541, "train_runtime": 68137.7168, "train_tokens_per_second": 144897.94 }, { "epoch": 0.9017382050372472, "grad_norm": 0.2417723685503006, "learning_rate": 9.562213850940382e-07, "loss": 0.3853, "num_input_tokens_seen": 9876976288, "step": 2542, "train_runtime": 68168.2056, "train_tokens_per_second": 144891.247 }, { "epoch": 0.9020929407591345, "grad_norm": 0.3102905750274658, "learning_rate": 9.494093214939171e-07, "loss": 0.3844, "num_input_tokens_seen": 9880847930, "step": 2543, "train_runtime": 68198.9309, "train_tokens_per_second": 144882.739 }, { "epoch": 0.9024476764810216, "grad_norm": 0.26932328939437866, "learning_rate": 9.426210191108654e-07, "loss": 0.3843, "num_input_tokens_seen": 9884816946, "step": 2544, "train_runtime": 68223.7792, "train_tokens_per_second": 144888.147 }, { "epoch": 0.9028024122029088, "grad_norm": 0.8683258295059204, "learning_rate": 9.358564864117215e-07, "loss": 0.3917, "num_input_tokens_seen": 9888608162, "step": 2545, "train_runtime": 68244.0886, "train_tokens_per_second": 144900.582 }, { "epoch": 0.9031571479247961, "grad_norm": 0.24109287559986115, "learning_rate": 9.291157318336785e-07, "loss": 0.3891, "num_input_tokens_seen": 9892493878, "step": 2546, "train_runtime": 68282.9277, "train_tokens_per_second": 144875.069 }, { "epoch": 0.9035118836466832, "grad_norm": 0.2805735468864441, "learning_rate": 9.22398763784278e-07, "loss": 0.3832, "num_input_tokens_seen": 9896385402, "step": 2547, "train_runtime": 68303.7075, "train_tokens_per_second": 144887.968 }, { "epoch": 0.9038666193685704, "grad_norm": 0.25294730067253113, "learning_rate": 9.157055906413847e-07, "loss": 0.3874, "num_input_tokens_seen": 9900216914, "step": 2548, "train_runtime": 68324.1289, "train_tokens_per_second": 144900.741 }, { "epoch": 0.9042213550904576, "grad_norm": 0.553794264793396, "learning_rate": 9.090362207531833e-07, "loss": 0.3823, "num_input_tokens_seen": 9904185733, "step": 2549, "train_runtime": 68350.8477, "train_tokens_per_second": 144902.164 }, { "epoch": 0.9045760908123448, "grad_norm": 0.4072200655937195, "learning_rate": 9.023906624381862e-07, "loss": 0.3887, "num_input_tokens_seen": 9908083192, "step": 2550, "train_runtime": 68370.9838, "train_tokens_per_second": 144916.493 }, { "epoch": 0.904930826534232, "grad_norm": 0.24507489800453186, "learning_rate": 8.957689239851852e-07, "loss": 0.393, "num_input_tokens_seen": 9911958081, "step": 2551, "train_runtime": 68397.1357, "train_tokens_per_second": 144917.736 }, { "epoch": 0.9052855622561192, "grad_norm": 0.2504776418209076, "learning_rate": 8.891710136532738e-07, "loss": 0.3824, "num_input_tokens_seen": 9915852841, "step": 2552, "train_runtime": 68418.2613, "train_tokens_per_second": 144929.916 }, { "epoch": 0.9056402979780064, "grad_norm": 0.23728527128696442, "learning_rate": 8.825969396718226e-07, "loss": 0.3764, "num_input_tokens_seen": 9919797893, "step": 2553, "train_runtime": 68445.1512, "train_tokens_per_second": 144930.615 }, { "epoch": 0.9059950336998935, "grad_norm": 0.2915765047073364, "learning_rate": 8.760467102404724e-07, "loss": 0.3968, "num_input_tokens_seen": 9923677156, "step": 2554, "train_runtime": 68474.4544, "train_tokens_per_second": 144925.246 }, { "epoch": 0.9063497694217808, "grad_norm": 0.20637954771518707, "learning_rate": 8.695203335291235e-07, "loss": 0.3877, "num_input_tokens_seen": 9927607559, "step": 2555, "train_runtime": 68505.6012, "train_tokens_per_second": 144916.728 }, { "epoch": 0.906704505143668, "grad_norm": 0.22493894398212433, "learning_rate": 8.630178176779269e-07, "loss": 0.3775, "num_input_tokens_seen": 9931465890, "step": 2556, "train_runtime": 68532.1796, "train_tokens_per_second": 144916.825 }, { "epoch": 0.9070592408655551, "grad_norm": 0.2801229953765869, "learning_rate": 8.565391707972704e-07, "loss": 0.3929, "num_input_tokens_seen": 9935428891, "step": 2557, "train_runtime": 68564.4742, "train_tokens_per_second": 144906.367 }, { "epoch": 0.9074139765874424, "grad_norm": 0.22795584797859192, "learning_rate": 8.500844009677678e-07, "loss": 0.3893, "num_input_tokens_seen": 9939274044, "step": 2558, "train_runtime": 68589.9419, "train_tokens_per_second": 144908.623 }, { "epoch": 0.9077687123093295, "grad_norm": 0.2467738389968872, "learning_rate": 8.436535162402592e-07, "loss": 0.3787, "num_input_tokens_seen": 9943176028, "step": 2559, "train_runtime": 68618.9151, "train_tokens_per_second": 144904.302 }, { "epoch": 0.9081234480312167, "grad_norm": 0.9294378161430359, "learning_rate": 8.372465246357885e-07, "loss": 0.4025, "num_input_tokens_seen": 9947082633, "step": 2560, "train_runtime": 68643.2908, "train_tokens_per_second": 144909.758 }, { "epoch": 0.908478183753104, "grad_norm": 0.3013227880001068, "learning_rate": 8.30863434145599e-07, "loss": 0.3976, "num_input_tokens_seen": 9950893395, "step": 2561, "train_runtime": 68664.9966, "train_tokens_per_second": 144919.448 }, { "epoch": 0.9088329194749911, "grad_norm": 0.3043067753314972, "learning_rate": 8.245042527311264e-07, "loss": 0.3941, "num_input_tokens_seen": 9954781376, "step": 2562, "train_runtime": 68685.9197, "train_tokens_per_second": 144931.908 }, { "epoch": 0.9091876551968783, "grad_norm": 0.18161830306053162, "learning_rate": 8.181689883239819e-07, "loss": 0.3937, "num_input_tokens_seen": 9958726354, "step": 2563, "train_runtime": 68704.2111, "train_tokens_per_second": 144950.742 }, { "epoch": 0.9095423909187655, "grad_norm": 0.2527953088283539, "learning_rate": 8.118576488259422e-07, "loss": 0.3857, "num_input_tokens_seen": 9962571620, "step": 2564, "train_runtime": 68735.427, "train_tokens_per_second": 144940.856 }, { "epoch": 0.9098971266406527, "grad_norm": 0.30561894178390503, "learning_rate": 8.055702421089462e-07, "loss": 0.3958, "num_input_tokens_seen": 9966454341, "step": 2565, "train_runtime": 68760.2076, "train_tokens_per_second": 144945.088 }, { "epoch": 0.9102518623625399, "grad_norm": 0.3279886543750763, "learning_rate": 7.993067760150918e-07, "loss": 0.3895, "num_input_tokens_seen": 9970359806, "step": 2566, "train_runtime": 68784.0338, "train_tokens_per_second": 144951.659 }, { "epoch": 0.9106065980844271, "grad_norm": 0.2122339904308319, "learning_rate": 7.930672583566035e-07, "loss": 0.3862, "num_input_tokens_seen": 9974278027, "step": 2567, "train_runtime": 68811.5119, "train_tokens_per_second": 144950.717 }, { "epoch": 0.9109613338063143, "grad_norm": 0.25118324160575867, "learning_rate": 7.868516969158402e-07, "loss": 0.3795, "num_input_tokens_seen": 9978102285, "step": 2568, "train_runtime": 68850.3142, "train_tokens_per_second": 144924.572 }, { "epoch": 0.9113160695282015, "grad_norm": 2.658940553665161, "learning_rate": 7.806600994452829e-07, "loss": 0.3967, "num_input_tokens_seen": 9982022264, "step": 2569, "train_runtime": 68870.78, "train_tokens_per_second": 144938.423 }, { "epoch": 0.9116708052500887, "grad_norm": 0.2659049928188324, "learning_rate": 7.744924736675164e-07, "loss": 0.3928, "num_input_tokens_seen": 9985870740, "step": 2570, "train_runtime": 68904.3312, "train_tokens_per_second": 144923.702 }, { "epoch": 0.9120255409719759, "grad_norm": 0.20502400398254395, "learning_rate": 7.683488272752337e-07, "loss": 0.3782, "num_input_tokens_seen": 9989803641, "step": 2571, "train_runtime": 68926.5653, "train_tokens_per_second": 144934.012 }, { "epoch": 0.9123802766938631, "grad_norm": 0.24243022501468658, "learning_rate": 7.622291679312166e-07, "loss": 0.3876, "num_input_tokens_seen": 9993646469, "step": 2572, "train_runtime": 68949.0526, "train_tokens_per_second": 144942.477 }, { "epoch": 0.9127350124157503, "grad_norm": 0.36113572120666504, "learning_rate": 7.561335032683303e-07, "loss": 0.3891, "num_input_tokens_seen": 9997538030, "step": 2573, "train_runtime": 68968.8817, "train_tokens_per_second": 144957.23 }, { "epoch": 0.9130897481376374, "grad_norm": 0.21084316074848175, "learning_rate": 7.500618408895044e-07, "loss": 0.3845, "num_input_tokens_seen": 10001384320, "step": 2574, "train_runtime": 68997.5762, "train_tokens_per_second": 144952.691 }, { "epoch": 0.9134444838595247, "grad_norm": 0.2476728856563568, "learning_rate": 7.440141883677454e-07, "loss": 0.3896, "num_input_tokens_seen": 10005235079, "step": 2575, "train_runtime": 69022.5814, "train_tokens_per_second": 144955.968 }, { "epoch": 0.9137992195814119, "grad_norm": 0.2344910055398941, "learning_rate": 7.379905532460974e-07, "loss": 0.3751, "num_input_tokens_seen": 10009164917, "step": 2576, "train_runtime": 69045.8316, "train_tokens_per_second": 144964.072 }, { "epoch": 0.914153955303299, "grad_norm": 0.23052029311656952, "learning_rate": 7.319909430376593e-07, "loss": 0.3856, "num_input_tokens_seen": 10013002625, "step": 2577, "train_runtime": 69073.1761, "train_tokens_per_second": 144962.244 }, { "epoch": 0.9145086910251863, "grad_norm": 0.33733877539634705, "learning_rate": 7.260153652255586e-07, "loss": 0.3938, "num_input_tokens_seen": 10016882939, "step": 2578, "train_runtime": 69091.8367, "train_tokens_per_second": 144979.254 }, { "epoch": 0.9148634267470734, "grad_norm": 0.28692781925201416, "learning_rate": 7.200638272629512e-07, "loss": 0.3939, "num_input_tokens_seen": 10020779673, "step": 2579, "train_runtime": 69113.0091, "train_tokens_per_second": 144991.222 }, { "epoch": 0.9152181624689606, "grad_norm": 0.2352931946516037, "learning_rate": 7.141363365730081e-07, "loss": 0.3824, "num_input_tokens_seen": 10024673084, "step": 2580, "train_runtime": 69142.3429, "train_tokens_per_second": 144986.02 }, { "epoch": 0.9155728981908479, "grad_norm": 0.2269282341003418, "learning_rate": 7.082329005489063e-07, "loss": 0.3838, "num_input_tokens_seen": 10028572874, "step": 2581, "train_runtime": 69161.6608, "train_tokens_per_second": 145001.909 }, { "epoch": 0.915927633912735, "grad_norm": 0.2335023581981659, "learning_rate": 7.023535265538207e-07, "loss": 0.3756, "num_input_tokens_seen": 10032494221, "step": 2582, "train_runtime": 69189.1646, "train_tokens_per_second": 145000.945 }, { "epoch": 0.9162823696346222, "grad_norm": 0.2414119988679886, "learning_rate": 6.964982219209137e-07, "loss": 0.3994, "num_input_tokens_seen": 10036424391, "step": 2583, "train_runtime": 69209.6421, "train_tokens_per_second": 145014.829 }, { "epoch": 0.9166371053565094, "grad_norm": 0.22317950427532196, "learning_rate": 6.906669939533262e-07, "loss": 0.3845, "num_input_tokens_seen": 10040243014, "step": 2584, "train_runtime": 69236.9266, "train_tokens_per_second": 145012.835 }, { "epoch": 0.9169918410783966, "grad_norm": 0.22529137134552002, "learning_rate": 6.848598499241732e-07, "loss": 0.3852, "num_input_tokens_seen": 10044188838, "step": 2585, "train_runtime": 69261.4371, "train_tokens_per_second": 145018.487 }, { "epoch": 0.9173465768002838, "grad_norm": 0.24142858386039734, "learning_rate": 6.790767970765211e-07, "loss": 0.3884, "num_input_tokens_seen": 10047997727, "step": 2586, "train_runtime": 69279.0297, "train_tokens_per_second": 145036.641 }, { "epoch": 0.917701312522171, "grad_norm": 0.2705199718475342, "learning_rate": 6.733178426234e-07, "loss": 0.3869, "num_input_tokens_seen": 10051854314, "step": 2587, "train_runtime": 69307.5795, "train_tokens_per_second": 145032.54 }, { "epoch": 0.9180560482440582, "grad_norm": 0.292214035987854, "learning_rate": 6.675829937477773e-07, "loss": 0.3831, "num_input_tokens_seen": 10055670482, "step": 2588, "train_runtime": 69338.8103, "train_tokens_per_second": 145022.253 }, { "epoch": 0.9184107839659453, "grad_norm": 0.26025161147117615, "learning_rate": 6.618722576025472e-07, "loss": 0.3918, "num_input_tokens_seen": 10059601486, "step": 2589, "train_runtime": 69361.6951, "train_tokens_per_second": 145031.079 }, { "epoch": 0.9187655196878326, "grad_norm": 0.2318294793367386, "learning_rate": 6.561856413105428e-07, "loss": 0.3904, "num_input_tokens_seen": 10063461106, "step": 2590, "train_runtime": 69383.3589, "train_tokens_per_second": 145041.423 }, { "epoch": 0.9191202554097198, "grad_norm": 0.26421114802360535, "learning_rate": 6.505231519645017e-07, "loss": 0.3963, "num_input_tokens_seen": 10067368010, "step": 2591, "train_runtime": 69405.9006, "train_tokens_per_second": 145050.607 }, { "epoch": 0.9194749911316069, "grad_norm": 0.23046068847179413, "learning_rate": 6.448847966270743e-07, "loss": 0.3848, "num_input_tokens_seen": 10071210509, "step": 2592, "train_runtime": 69424.8351, "train_tokens_per_second": 145066.394 }, { "epoch": 0.9198297268534942, "grad_norm": 0.5261188745498657, "learning_rate": 6.392705823308087e-07, "loss": 0.3717, "num_input_tokens_seen": 10075111368, "step": 2593, "train_runtime": 69462.8666, "train_tokens_per_second": 145043.127 }, { "epoch": 0.9201844625753813, "grad_norm": 0.21874447166919708, "learning_rate": 6.336805160781456e-07, "loss": 0.3931, "num_input_tokens_seen": 10079066801, "step": 2594, "train_runtime": 69484.9075, "train_tokens_per_second": 145054.043 }, { "epoch": 0.9205391982972685, "grad_norm": 0.25193682312965393, "learning_rate": 6.281146048413967e-07, "loss": 0.3836, "num_input_tokens_seen": 10082888145, "step": 2595, "train_runtime": 69511.622, "train_tokens_per_second": 145053.271 }, { "epoch": 0.9208939340191558, "grad_norm": 0.5098807215690613, "learning_rate": 6.22572855562753e-07, "loss": 0.3752, "num_input_tokens_seen": 10086721525, "step": 2596, "train_runtime": 69530.4073, "train_tokens_per_second": 145069.214 }, { "epoch": 0.9212486697410429, "grad_norm": 0.37726029753685, "learning_rate": 6.170552751542724e-07, "loss": 0.3858, "num_input_tokens_seen": 10090591824, "step": 2597, "train_runtime": 69560.6879, "train_tokens_per_second": 145061.703 }, { "epoch": 0.9216034054629301, "grad_norm": 0.27637284994125366, "learning_rate": 6.115618704978609e-07, "loss": 0.4038, "num_input_tokens_seen": 10094492998, "step": 2598, "train_runtime": 69582.0969, "train_tokens_per_second": 145073.136 }, { "epoch": 0.9219581411848173, "grad_norm": 0.3719647526741028, "learning_rate": 6.060926484452734e-07, "loss": 0.3855, "num_input_tokens_seen": 10098417137, "step": 2599, "train_runtime": 69601.8752, "train_tokens_per_second": 145088.291 }, { "epoch": 0.9223128769067045, "grad_norm": 0.20983225107192993, "learning_rate": 6.006476158181041e-07, "loss": 0.3908, "num_input_tokens_seen": 10102260465, "step": 2600, "train_runtime": 69624.1526, "train_tokens_per_second": 145097.069 }, { "epoch": 0.9226676126285916, "grad_norm": 0.30133503675460815, "learning_rate": 5.952267794077737e-07, "loss": 0.3882, "num_input_tokens_seen": 10106096104, "step": 2601, "train_runtime": 69763.7948, "train_tokens_per_second": 144861.617 }, { "epoch": 0.9230223483504789, "grad_norm": 0.36285343766212463, "learning_rate": 5.898301459755251e-07, "loss": 0.3902, "num_input_tokens_seen": 10109961189, "step": 2602, "train_runtime": 69789.113, "train_tokens_per_second": 144864.446 }, { "epoch": 0.9233770840723661, "grad_norm": 0.2264968603849411, "learning_rate": 5.844577222524162e-07, "loss": 0.3912, "num_input_tokens_seen": 10113883743, "step": 2603, "train_runtime": 69811.457, "train_tokens_per_second": 144874.268 }, { "epoch": 0.9237318197942532, "grad_norm": 0.30939117074012756, "learning_rate": 5.791095149393022e-07, "loss": 0.3901, "num_input_tokens_seen": 10117767349, "step": 2604, "train_runtime": 69840.3961, "train_tokens_per_second": 144869.845 }, { "epoch": 0.9240865555161405, "grad_norm": 0.6367536783218384, "learning_rate": 5.737855307068408e-07, "loss": 0.3789, "num_input_tokens_seen": 10121643333, "step": 2605, "train_runtime": 69867.9001, "train_tokens_per_second": 144868.292 }, { "epoch": 0.9244412912380277, "grad_norm": 0.21330414712429047, "learning_rate": 5.684857761954754e-07, "loss": 0.3871, "num_input_tokens_seen": 10125501806, "step": 2606, "train_runtime": 69904.4669, "train_tokens_per_second": 144847.708 }, { "epoch": 0.9247960269599149, "grad_norm": 0.21590569615364075, "learning_rate": 5.632102580154231e-07, "loss": 0.3848, "num_input_tokens_seen": 10129424447, "step": 2607, "train_runtime": 69925.2662, "train_tokens_per_second": 144860.721 }, { "epoch": 0.9251507626818021, "grad_norm": 0.26435887813568115, "learning_rate": 5.57958982746678e-07, "loss": 0.3957, "num_input_tokens_seen": 10133313642, "step": 2608, "train_runtime": 69956.5193, "train_tokens_per_second": 144851.598 }, { "epoch": 0.9255054984036892, "grad_norm": 0.5623168349266052, "learning_rate": 5.527319569389944e-07, "loss": 0.383, "num_input_tokens_seen": 10137141066, "step": 2609, "train_runtime": 69979.8309, "train_tokens_per_second": 144858.039 }, { "epoch": 0.9258602341255765, "grad_norm": 0.30865055322647095, "learning_rate": 5.475291871118793e-07, "loss": 0.3938, "num_input_tokens_seen": 10141076191, "step": 2610, "train_runtime": 70014.5215, "train_tokens_per_second": 144842.469 }, { "epoch": 0.9262149698474637, "grad_norm": 0.4324350953102112, "learning_rate": 5.423506797545907e-07, "loss": 0.3958, "num_input_tokens_seen": 10144965501, "step": 2611, "train_runtime": 70039.5054, "train_tokens_per_second": 144846.333 }, { "epoch": 0.9265697055693508, "grad_norm": 0.229365274310112, "learning_rate": 5.371964413261243e-07, "loss": 0.3843, "num_input_tokens_seen": 10148850507, "step": 2612, "train_runtime": 70059.0868, "train_tokens_per_second": 144861.302 }, { "epoch": 0.9269244412912381, "grad_norm": 0.2743283808231354, "learning_rate": 5.320664782551954e-07, "loss": 0.3827, "num_input_tokens_seen": 10152788577, "step": 2613, "train_runtime": 70084.0981, "train_tokens_per_second": 144865.795 }, { "epoch": 0.9272791770131252, "grad_norm": 0.23952221870422363, "learning_rate": 5.269607969402546e-07, "loss": 0.373, "num_input_tokens_seen": 10156666965, "step": 2614, "train_runtime": 70116.8185, "train_tokens_per_second": 144853.506 }, { "epoch": 0.9276339127350124, "grad_norm": 0.18808157742023468, "learning_rate": 5.218794037494634e-07, "loss": 0.3852, "num_input_tokens_seen": 10160631499, "step": 2615, "train_runtime": 70148.6929, "train_tokens_per_second": 144844.203 }, { "epoch": 0.9279886484568997, "grad_norm": 0.3494366407394409, "learning_rate": 5.168223050206833e-07, "loss": 0.3937, "num_input_tokens_seen": 10164487012, "step": 2616, "train_runtime": 70171.7498, "train_tokens_per_second": 144851.554 }, { "epoch": 0.9283433841787868, "grad_norm": 2.441261053085327, "learning_rate": 5.117895070614797e-07, "loss": 0.3865, "num_input_tokens_seen": 10168456968, "step": 2617, "train_runtime": 70199.412, "train_tokens_per_second": 144851.028 }, { "epoch": 0.928698119900674, "grad_norm": 0.2595275044441223, "learning_rate": 5.067810161491094e-07, "loss": 0.3973, "num_input_tokens_seen": 10172312982, "step": 2618, "train_runtime": 70224.4304, "train_tokens_per_second": 144854.332 }, { "epoch": 0.9290528556225612, "grad_norm": 0.2388169765472412, "learning_rate": 5.017968385305061e-07, "loss": 0.3837, "num_input_tokens_seen": 10176194565, "step": 2619, "train_runtime": 70249.8698, "train_tokens_per_second": 144857.131 }, { "epoch": 0.9294075913444484, "grad_norm": 0.23133264482021332, "learning_rate": 4.968369804222817e-07, "loss": 0.3933, "num_input_tokens_seen": 10180042649, "step": 2620, "train_runtime": 70274.9937, "train_tokens_per_second": 144860.101 }, { "epoch": 0.9297623270663355, "grad_norm": 0.284017413854599, "learning_rate": 4.919014480107187e-07, "loss": 0.391, "num_input_tokens_seen": 10183986574, "step": 2621, "train_runtime": 70303.4702, "train_tokens_per_second": 144857.523 }, { "epoch": 0.9301170627882228, "grad_norm": 0.22432176768779755, "learning_rate": 4.869902474517529e-07, "loss": 0.3786, "num_input_tokens_seen": 10187841913, "step": 2622, "train_runtime": 70328.16, "train_tokens_per_second": 144861.488 }, { "epoch": 0.93047179851011, "grad_norm": 0.428864985704422, "learning_rate": 4.821033848709733e-07, "loss": 0.3995, "num_input_tokens_seen": 10191778927, "step": 2623, "train_runtime": 70355.9586, "train_tokens_per_second": 144860.21 }, { "epoch": 0.9308265342319971, "grad_norm": 0.2769404649734497, "learning_rate": 4.77240866363613e-07, "loss": 0.3821, "num_input_tokens_seen": 10195626668, "step": 2624, "train_runtime": 70375.0822, "train_tokens_per_second": 144875.52 }, { "epoch": 0.9311812699538844, "grad_norm": 0.34178629517555237, "learning_rate": 4.724026979945473e-07, "loss": 0.3906, "num_input_tokens_seen": 10199552567, "step": 2625, "train_runtime": 70399.9829, "train_tokens_per_second": 144880.043 }, { "epoch": 0.9315360056757716, "grad_norm": 0.30035486817359924, "learning_rate": 4.675888857982669e-07, "loss": 0.3914, "num_input_tokens_seen": 10203460143, "step": 2626, "train_runtime": 70425.1096, "train_tokens_per_second": 144883.838 }, { "epoch": 0.9318907413976587, "grad_norm": 0.29316869378089905, "learning_rate": 4.6279943577889783e-07, "loss": 0.3777, "num_input_tokens_seen": 10207318031, "step": 2627, "train_runtime": 70451.8845, "train_tokens_per_second": 144883.534 }, { "epoch": 0.932245477119546, "grad_norm": 0.3057233691215515, "learning_rate": 4.580343539101728e-07, "loss": 0.3914, "num_input_tokens_seen": 10211231428, "step": 2628, "train_runtime": 70475.9004, "train_tokens_per_second": 144889.691 }, { "epoch": 0.9326002128414331, "grad_norm": 2.2024576663970947, "learning_rate": 4.5329364613543093e-07, "loss": 0.3856, "num_input_tokens_seen": 10215139848, "step": 2629, "train_runtime": 70505.5021, "train_tokens_per_second": 144884.293 }, { "epoch": 0.9329549485633203, "grad_norm": 0.3101314902305603, "learning_rate": 4.4857731836761343e-07, "loss": 0.3844, "num_input_tokens_seen": 10218931474, "step": 2630, "train_runtime": 70535.6801, "train_tokens_per_second": 144876.061 }, { "epoch": 0.9333096842852076, "grad_norm": 0.2908758521080017, "learning_rate": 4.4388537648925035e-07, "loss": 0.391, "num_input_tokens_seen": 10222917373, "step": 2631, "train_runtime": 70561.9481, "train_tokens_per_second": 144878.616 }, { "epoch": 0.9336644200070947, "grad_norm": 0.24409601092338562, "learning_rate": 4.3921782635245823e-07, "loss": 0.392, "num_input_tokens_seen": 10226758138, "step": 2632, "train_runtime": 70586.0086, "train_tokens_per_second": 144883.644 }, { "epoch": 0.9340191557289819, "grad_norm": 0.3175964951515198, "learning_rate": 4.345746737789247e-07, "loss": 0.3859, "num_input_tokens_seen": 10230651742, "step": 2633, "train_runtime": 70608.4961, "train_tokens_per_second": 144892.645 }, { "epoch": 0.9343738914508691, "grad_norm": 0.30444222688674927, "learning_rate": 4.29955924559915e-07, "loss": 0.3827, "num_input_tokens_seen": 10234558054, "step": 2634, "train_runtime": 70629.6452, "train_tokens_per_second": 144904.566 }, { "epoch": 0.9347286271727563, "grad_norm": 0.1860361546278, "learning_rate": 4.25361584456252e-07, "loss": 0.3877, "num_input_tokens_seen": 10238430473, "step": 2635, "train_runtime": 70654.8984, "train_tokens_per_second": 144907.582 }, { "epoch": 0.9350833628946434, "grad_norm": 0.2431369125843048, "learning_rate": 4.2079165919831633e-07, "loss": 0.3854, "num_input_tokens_seen": 10242333072, "step": 2636, "train_runtime": 70683.4185, "train_tokens_per_second": 144904.325 }, { "epoch": 0.9354380986165307, "grad_norm": 0.2722305655479431, "learning_rate": 4.1624615448603524e-07, "loss": 0.3901, "num_input_tokens_seen": 10246273145, "step": 2637, "train_runtime": 70702.4927, "train_tokens_per_second": 144920.96 }, { "epoch": 0.9357928343384179, "grad_norm": 0.24090099334716797, "learning_rate": 4.11725075988878e-07, "loss": 0.3839, "num_input_tokens_seen": 10250142764, "step": 2638, "train_runtime": 70727.8539, "train_tokens_per_second": 144923.707 }, { "epoch": 0.936147570060305, "grad_norm": 0.2673729360103607, "learning_rate": 4.072284293458451e-07, "loss": 0.3978, "num_input_tokens_seen": 10254037798, "step": 2639, "train_runtime": 70757.1324, "train_tokens_per_second": 144918.787 }, { "epoch": 0.9365023057821923, "grad_norm": 0.2082284688949585, "learning_rate": 4.0275622016546554e-07, "loss": 0.3819, "num_input_tokens_seen": 10257913632, "step": 2640, "train_runtime": 70788.9562, "train_tokens_per_second": 144908.389 }, { "epoch": 0.9368570415040794, "grad_norm": 0.21167321503162384, "learning_rate": 3.9830845402579086e-07, "loss": 0.3827, "num_input_tokens_seen": 10261741575, "step": 2641, "train_runtime": 70815.2445, "train_tokens_per_second": 144908.651 }, { "epoch": 0.9372117772259666, "grad_norm": 0.25889360904693604, "learning_rate": 3.938851364743812e-07, "loss": 0.391, "num_input_tokens_seen": 10265649717, "step": 2642, "train_runtime": 70849.3392, "train_tokens_per_second": 144894.079 }, { "epoch": 0.9375665129478539, "grad_norm": 0.24919907748699188, "learning_rate": 3.894862730283078e-07, "loss": 0.3891, "num_input_tokens_seen": 10269534452, "step": 2643, "train_runtime": 70869.5799, "train_tokens_per_second": 144907.511 }, { "epoch": 0.937921248669741, "grad_norm": 0.44342687726020813, "learning_rate": 3.8511186917413736e-07, "loss": 0.3884, "num_input_tokens_seen": 10273398462, "step": 2644, "train_runtime": 70894.5205, "train_tokens_per_second": 144911.037 }, { "epoch": 0.9382759843916282, "grad_norm": 2.1034765243530273, "learning_rate": 3.8076193036792553e-07, "loss": 0.3977, "num_input_tokens_seen": 10277349068, "step": 2645, "train_runtime": 70919.7573, "train_tokens_per_second": 144915.175 }, { "epoch": 0.9386307201135154, "grad_norm": 0.3282065689563751, "learning_rate": 3.7643646203521897e-07, "loss": 0.3909, "num_input_tokens_seen": 10281223090, "step": 2646, "train_runtime": 70938.7516, "train_tokens_per_second": 144930.984 }, { "epoch": 0.9389854558354026, "grad_norm": 0.35821694135665894, "learning_rate": 3.721354695710444e-07, "loss": 0.3941, "num_input_tokens_seen": 10285111954, "step": 2647, "train_runtime": 70960.3119, "train_tokens_per_second": 144941.752 }, { "epoch": 0.9393401915572899, "grad_norm": 0.24755671620368958, "learning_rate": 3.6785895833989727e-07, "loss": 0.3812, "num_input_tokens_seen": 10289027821, "step": 2648, "train_runtime": 70995.7293, "train_tokens_per_second": 144924.602 }, { "epoch": 0.939694927279177, "grad_norm": 0.23035390675067902, "learning_rate": 3.636069336757353e-07, "loss": 0.3814, "num_input_tokens_seen": 10292917188, "step": 2649, "train_runtime": 71027.423, "train_tokens_per_second": 144914.693 }, { "epoch": 0.9400496630010642, "grad_norm": 0.2851038873195648, "learning_rate": 3.593794008819829e-07, "loss": 0.3956, "num_input_tokens_seen": 10296767972, "step": 2650, "train_runtime": 71048.5307, "train_tokens_per_second": 144925.84 }, { "epoch": 0.9404043987229515, "grad_norm": 0.2776734530925751, "learning_rate": 3.551763652315088e-07, "loss": 0.3767, "num_input_tokens_seen": 10300696762, "step": 2651, "train_runtime": 71081.9715, "train_tokens_per_second": 144912.93 }, { "epoch": 0.9407591344448386, "grad_norm": 0.26736220717430115, "learning_rate": 3.5099783196663074e-07, "loss": 0.4061, "num_input_tokens_seen": 10304599791, "step": 2652, "train_runtime": 71112.3746, "train_tokens_per_second": 144905.86 }, { "epoch": 0.9411138701667258, "grad_norm": 0.5615634918212891, "learning_rate": 3.468438062991042e-07, "loss": 0.3876, "num_input_tokens_seen": 10308518019, "step": 2653, "train_runtime": 71132.5167, "train_tokens_per_second": 144919.911 }, { "epoch": 0.941468605888613, "grad_norm": 0.22342520952224731, "learning_rate": 3.427142934101202e-07, "loss": 0.3883, "num_input_tokens_seen": 10312370628, "step": 2654, "train_runtime": 71164.15, "train_tokens_per_second": 144909.63 }, { "epoch": 0.9418233416105002, "grad_norm": 0.3472968637943268, "learning_rate": 3.38609298450292e-07, "loss": 0.4011, "num_input_tokens_seen": 10316232024, "step": 2655, "train_runtime": 71190.5129, "train_tokens_per_second": 144910.208 }, { "epoch": 0.9421780773323873, "grad_norm": 0.23839351534843445, "learning_rate": 3.3452882653965734e-07, "loss": 0.386, "num_input_tokens_seen": 10320126595, "step": 2656, "train_runtime": 71223.1758, "train_tokens_per_second": 144898.433 }, { "epoch": 0.9425328130542746, "grad_norm": 0.31396040320396423, "learning_rate": 3.3047288276765844e-07, "loss": 0.3744, "num_input_tokens_seen": 10324017847, "step": 2657, "train_runtime": 71251.45, "train_tokens_per_second": 144895.547 }, { "epoch": 0.9428875487761618, "grad_norm": 0.24904032051563263, "learning_rate": 3.2644147219315305e-07, "loss": 0.3936, "num_input_tokens_seen": 10327918035, "step": 2658, "train_runtime": 71271.5486, "train_tokens_per_second": 144909.41 }, { "epoch": 0.9432422844980489, "grad_norm": 0.26939594745635986, "learning_rate": 3.224345998443923e-07, "loss": 0.3892, "num_input_tokens_seen": 10331803169, "step": 2659, "train_runtime": 71300.8622, "train_tokens_per_second": 144904.323 }, { "epoch": 0.9435970202199362, "grad_norm": 0.32192716002464294, "learning_rate": 3.184522707190296e-07, "loss": 0.3813, "num_input_tokens_seen": 10335616358, "step": 2660, "train_runtime": 71330.8559, "train_tokens_per_second": 144896.85 }, { "epoch": 0.9439517559418233, "grad_norm": 0.39788445830345154, "learning_rate": 3.1449448978409627e-07, "loss": 0.3894, "num_input_tokens_seen": 10339579856, "step": 2661, "train_runtime": 71359.5863, "train_tokens_per_second": 144894.055 }, { "epoch": 0.9443064916637105, "grad_norm": 0.22267885506153107, "learning_rate": 3.10561261976019e-07, "loss": 0.384, "num_input_tokens_seen": 10343456825, "step": 2662, "train_runtime": 71386.6011, "train_tokens_per_second": 144893.533 }, { "epoch": 0.9446612273855978, "grad_norm": 0.405804306268692, "learning_rate": 3.0665259220058475e-07, "loss": 0.3907, "num_input_tokens_seen": 10347345786, "step": 2663, "train_runtime": 71406.721, "train_tokens_per_second": 144907.169 }, { "epoch": 0.9450159631074849, "grad_norm": 0.27042219042778015, "learning_rate": 3.0276848533295824e-07, "loss": 0.3877, "num_input_tokens_seen": 10351207454, "step": 2664, "train_runtime": 71430.238, "train_tokens_per_second": 144913.523 }, { "epoch": 0.9453706988293721, "grad_norm": 0.21212929487228394, "learning_rate": 2.9890894621767085e-07, "loss": 0.3906, "num_input_tokens_seen": 10355133451, "step": 2665, "train_runtime": 71461.7109, "train_tokens_per_second": 144904.639 }, { "epoch": 0.9457254345512593, "grad_norm": 0.26186075806617737, "learning_rate": 2.9507397966860306e-07, "loss": 0.4002, "num_input_tokens_seen": 10358987293, "step": 2666, "train_runtime": 71482.5469, "train_tokens_per_second": 144916.315 }, { "epoch": 0.9460801702731465, "grad_norm": 0.3264133036136627, "learning_rate": 2.912635904689931e-07, "loss": 0.3899, "num_input_tokens_seen": 10362864193, "step": 2667, "train_runtime": 71510.3151, "train_tokens_per_second": 144914.257 }, { "epoch": 0.9464349059950337, "grad_norm": 0.2700214385986328, "learning_rate": 2.874777833714237e-07, "loss": 0.4031, "num_input_tokens_seen": 10366791437, "step": 2668, "train_runtime": 71537.5236, "train_tokens_per_second": 144914.038 }, { "epoch": 0.9467896417169209, "grad_norm": 0.19952738285064697, "learning_rate": 2.837165630978134e-07, "loss": 0.3764, "num_input_tokens_seen": 10370701917, "step": 2669, "train_runtime": 71563.5262, "train_tokens_per_second": 144916.027 }, { "epoch": 0.9471443774388081, "grad_norm": 0.2698543071746826, "learning_rate": 2.799799343394161e-07, "loss": 0.3772, "num_input_tokens_seen": 10374571723, "step": 2670, "train_runtime": 71594.9268, "train_tokens_per_second": 144906.52 }, { "epoch": 0.9474991131606952, "grad_norm": 0.27793875336647034, "learning_rate": 2.7626790175681704e-07, "loss": 0.3821, "num_input_tokens_seen": 10378527720, "step": 2671, "train_runtime": 71625.726, "train_tokens_per_second": 144899.442 }, { "epoch": 0.9478538488825825, "grad_norm": 0.2408047467470169, "learning_rate": 2.725804699799195e-07, "loss": 0.3818, "num_input_tokens_seen": 10382373320, "step": 2672, "train_runtime": 71650.6987, "train_tokens_per_second": 144902.611 }, { "epoch": 0.9482085846044697, "grad_norm": 0.38596034049987793, "learning_rate": 2.6891764360794437e-07, "loss": 0.3913, "num_input_tokens_seen": 10386327583, "step": 2673, "train_runtime": 71688.1013, "train_tokens_per_second": 144882.169 }, { "epoch": 0.9485633203263568, "grad_norm": 0.4173130393028259, "learning_rate": 2.6527942720941946e-07, "loss": 0.3844, "num_input_tokens_seen": 10390191654, "step": 2674, "train_runtime": 71720.9315, "train_tokens_per_second": 144869.725 }, { "epoch": 0.9489180560482441, "grad_norm": 0.3144417703151703, "learning_rate": 2.61665825322186e-07, "loss": 0.3841, "num_input_tokens_seen": 10394124119, "step": 2675, "train_runtime": 71743.3254, "train_tokens_per_second": 144879.319 }, { "epoch": 0.9492727917701312, "grad_norm": 0.5667002201080322, "learning_rate": 2.5807684245337414e-07, "loss": 0.386, "num_input_tokens_seen": 10398053178, "step": 2676, "train_runtime": 71768.7374, "train_tokens_per_second": 144882.766 }, { "epoch": 0.9496275274920184, "grad_norm": 0.2162804752588272, "learning_rate": 2.5451248307941434e-07, "loss": 0.3883, "num_input_tokens_seen": 10401917741, "step": 2677, "train_runtime": 71794.2338, "train_tokens_per_second": 144885.142 }, { "epoch": 0.9499822632139057, "grad_norm": 0.23378798365592957, "learning_rate": 2.509727516460214e-07, "loss": 0.3862, "num_input_tokens_seen": 10405778426, "step": 2678, "train_runtime": 71814.144, "train_tokens_per_second": 144898.732 }, { "epoch": 0.9503369989357928, "grad_norm": 0.2494104951620102, "learning_rate": 2.4745765256819264e-07, "loss": 0.3951, "num_input_tokens_seen": 10409599810, "step": 2679, "train_runtime": 71835.5865, "train_tokens_per_second": 144908.677 }, { "epoch": 0.95069173465768, "grad_norm": 0.26876693964004517, "learning_rate": 2.439671902302076e-07, "loss": 0.3878, "num_input_tokens_seen": 10413534315, "step": 2680, "train_runtime": 71859.1571, "train_tokens_per_second": 144915.898 }, { "epoch": 0.9510464703795672, "grad_norm": 0.21101875603199005, "learning_rate": 2.405013689856084e-07, "loss": 0.3808, "num_input_tokens_seen": 10417461443, "step": 2681, "train_runtime": 71890.5644, "train_tokens_per_second": 144907.215 }, { "epoch": 0.9514012061014544, "grad_norm": 0.25656288862228394, "learning_rate": 2.3706019315721029e-07, "loss": 0.399, "num_input_tokens_seen": 10421348653, "step": 2682, "train_runtime": 71918.7518, "train_tokens_per_second": 144904.47 }, { "epoch": 0.9517559418233416, "grad_norm": 0.21558383107185364, "learning_rate": 2.336436670370845e-07, "loss": 0.3836, "num_input_tokens_seen": 10425245462, "step": 2683, "train_runtime": 71944.9546, "train_tokens_per_second": 144905.859 }, { "epoch": 0.9521106775452288, "grad_norm": 0.2952481508255005, "learning_rate": 2.3025179488655792e-07, "loss": 0.3795, "num_input_tokens_seen": 10429133271, "step": 2684, "train_runtime": 71972.1379, "train_tokens_per_second": 144905.148 }, { "epoch": 0.952465413267116, "grad_norm": 0.2305695116519928, "learning_rate": 2.2688458093621302e-07, "loss": 0.3857, "num_input_tokens_seen": 10432971702, "step": 2685, "train_runtime": 71996.815, "train_tokens_per_second": 144908.795 }, { "epoch": 0.9528201489890032, "grad_norm": 0.22327451407909393, "learning_rate": 2.235420293858681e-07, "loss": 0.3849, "num_input_tokens_seen": 10436894200, "step": 2686, "train_runtime": 72027.962, "train_tokens_per_second": 144900.59 }, { "epoch": 0.9531748847108904, "grad_norm": 0.19164524972438812, "learning_rate": 2.2022414440458605e-07, "loss": 0.3861, "num_input_tokens_seen": 10440814810, "step": 2687, "train_runtime": 72050.1131, "train_tokens_per_second": 144910.457 }, { "epoch": 0.9535296204327776, "grad_norm": 0.25427180528640747, "learning_rate": 2.1693093013066546e-07, "loss": 0.3871, "num_input_tokens_seen": 10444698157, "step": 2688, "train_runtime": 72081.6523, "train_tokens_per_second": 144900.926 }, { "epoch": 0.9538843561546648, "grad_norm": 0.23946800827980042, "learning_rate": 2.1366239067162953e-07, "loss": 0.393, "num_input_tokens_seen": 10448563205, "step": 2689, "train_runtime": 72112.5909, "train_tokens_per_second": 144892.356 }, { "epoch": 0.954239091876552, "grad_norm": 0.30598118901252747, "learning_rate": 2.1041853010422607e-07, "loss": 0.3905, "num_input_tokens_seen": 10452474943, "step": 2690, "train_runtime": 72136.5732, "train_tokens_per_second": 144898.413 }, { "epoch": 0.9545938275984391, "grad_norm": 0.2556592524051666, "learning_rate": 2.0719935247442536e-07, "loss": 0.3932, "num_input_tokens_seen": 10456381660, "step": 2691, "train_runtime": 72158.6115, "train_tokens_per_second": 144908.299 }, { "epoch": 0.9549485633203264, "grad_norm": 0.28527888655662537, "learning_rate": 2.0400486179740886e-07, "loss": 0.3949, "num_input_tokens_seen": 10460214044, "step": 2692, "train_runtime": 72190.0727, "train_tokens_per_second": 144898.234 }, { "epoch": 0.9553032990422136, "grad_norm": 0.2589302957057953, "learning_rate": 2.0083506205756497e-07, "loss": 0.3941, "num_input_tokens_seen": 10464158418, "step": 2693, "train_runtime": 72216.0402, "train_tokens_per_second": 144900.75 }, { "epoch": 0.9556580347641007, "grad_norm": 0.41096231341362, "learning_rate": 1.9768995720848895e-07, "loss": 0.38, "num_input_tokens_seen": 10468017051, "step": 2694, "train_runtime": 72244.1727, "train_tokens_per_second": 144897.736 }, { "epoch": 0.956012770485988, "grad_norm": 0.356736958026886, "learning_rate": 1.945695511729695e-07, "loss": 0.3979, "num_input_tokens_seen": 10471860312, "step": 2695, "train_runtime": 72268.2744, "train_tokens_per_second": 144902.592 }, { "epoch": 0.9563675062078751, "grad_norm": 0.25347328186035156, "learning_rate": 1.9147384784300005e-07, "loss": 0.3854, "num_input_tokens_seen": 10475740566, "step": 2696, "train_runtime": 72291.8626, "train_tokens_per_second": 144908.987 }, { "epoch": 0.9567222419297623, "grad_norm": 0.22672107815742493, "learning_rate": 1.8840285107975198e-07, "loss": 0.3941, "num_input_tokens_seen": 10479624415, "step": 2697, "train_runtime": 72321.8781, "train_tokens_per_second": 144902.548 }, { "epoch": 0.9570769776516496, "grad_norm": 0.2895939350128174, "learning_rate": 1.8535656471358576e-07, "loss": 0.383, "num_input_tokens_seen": 10483508529, "step": 2698, "train_runtime": 72347.9632, "train_tokens_per_second": 144903.99 }, { "epoch": 0.9574317133735367, "grad_norm": 0.33159223198890686, "learning_rate": 1.8233499254404208e-07, "loss": 0.3976, "num_input_tokens_seen": 10487384746, "step": 2699, "train_runtime": 72378.2361, "train_tokens_per_second": 144896.937 }, { "epoch": 0.9577864490954239, "grad_norm": 0.24166008830070496, "learning_rate": 1.7933813833983294e-07, "loss": 0.3842, "num_input_tokens_seen": 10491266014, "step": 2700, "train_runtime": 72405.0236, "train_tokens_per_second": 144896.935 }, { "epoch": 0.9581411848173111, "grad_norm": 0.23321770131587982, "learning_rate": 1.7636600583884388e-07, "loss": 0.384, "num_input_tokens_seen": 10495141577, "step": 2701, "train_runtime": 72436.933, "train_tokens_per_second": 144886.609 }, { "epoch": 0.9584959205391983, "grad_norm": 0.31199875473976135, "learning_rate": 1.734185987481274e-07, "loss": 0.3855, "num_input_tokens_seen": 10499084681, "step": 2702, "train_runtime": 72459.7654, "train_tokens_per_second": 144895.372 }, { "epoch": 0.9588506562610855, "grad_norm": 0.21532680094242096, "learning_rate": 1.7049592074388942e-07, "loss": 0.3896, "num_input_tokens_seen": 10502946631, "step": 2703, "train_runtime": 72481.6083, "train_tokens_per_second": 144904.989 }, { "epoch": 0.9592053919829727, "grad_norm": 0.30736786127090454, "learning_rate": 1.6759797547149849e-07, "loss": 0.3945, "num_input_tokens_seen": 10506916647, "step": 2704, "train_runtime": 72514.4313, "train_tokens_per_second": 144894.147 }, { "epoch": 0.9595601277048599, "grad_norm": 0.20293380320072174, "learning_rate": 1.647247665454721e-07, "loss": 0.3901, "num_input_tokens_seen": 10510788503, "step": 2705, "train_runtime": 72532.9329, "train_tokens_per_second": 144910.568 }, { "epoch": 0.959914863426747, "grad_norm": 0.22338633239269257, "learning_rate": 1.6187629754947697e-07, "loss": 0.3834, "num_input_tokens_seen": 10514720714, "step": 2706, "train_runtime": 72554.1838, "train_tokens_per_second": 144922.321 }, { "epoch": 0.9602695991486343, "grad_norm": 0.242214173078537, "learning_rate": 1.5905257203632008e-07, "loss": 0.3912, "num_input_tokens_seen": 10518586842, "step": 2707, "train_runtime": 72575.4696, "train_tokens_per_second": 144933.087 }, { "epoch": 0.9606243348705215, "grad_norm": 0.2549039423465729, "learning_rate": 1.5625359352794857e-07, "loss": 0.4003, "num_input_tokens_seen": 10522461052, "step": 2708, "train_runtime": 72605.7408, "train_tokens_per_second": 144926.02 }, { "epoch": 0.9609790705924086, "grad_norm": 0.4040926694869995, "learning_rate": 1.5347936551544096e-07, "loss": 0.3767, "num_input_tokens_seen": 10526321379, "step": 2709, "train_runtime": 72635.5398, "train_tokens_per_second": 144919.71 }, { "epoch": 0.9613338063142959, "grad_norm": 0.2840788662433624, "learning_rate": 1.5072989145900718e-07, "loss": 0.38, "num_input_tokens_seen": 10530219062, "step": 2710, "train_runtime": 72664.1053, "train_tokens_per_second": 144916.38 }, { "epoch": 0.961688542036183, "grad_norm": 0.2671756446361542, "learning_rate": 1.480051747879818e-07, "loss": 0.387, "num_input_tokens_seen": 10534125593, "step": 2711, "train_runtime": 72690.6528, "train_tokens_per_second": 144917.196 }, { "epoch": 0.9620432777580702, "grad_norm": 0.21615466475486755, "learning_rate": 1.4530521890082193e-07, "loss": 0.3907, "num_input_tokens_seen": 10537997034, "step": 2712, "train_runtime": 72722.5623, "train_tokens_per_second": 144906.845 }, { "epoch": 0.9623980134799575, "grad_norm": 0.22870895266532898, "learning_rate": 1.42630027165096e-07, "loss": 0.3864, "num_input_tokens_seen": 10541886252, "step": 2713, "train_runtime": 72741.2025, "train_tokens_per_second": 144923.178 }, { "epoch": 0.9627527492018446, "grad_norm": 0.22245989739894867, "learning_rate": 1.399796029174927e-07, "loss": 0.3855, "num_input_tokens_seen": 10545867387, "step": 2714, "train_runtime": 72774.2397, "train_tokens_per_second": 144912.093 }, { "epoch": 0.9631074849237318, "grad_norm": 0.24891076982021332, "learning_rate": 1.3735394946380321e-07, "loss": 0.3888, "num_input_tokens_seen": 10549746474, "step": 2715, "train_runtime": 72794.2211, "train_tokens_per_second": 144925.604 }, { "epoch": 0.963462220645619, "grad_norm": 0.20614062249660492, "learning_rate": 1.3475307007892346e-07, "loss": 0.3956, "num_input_tokens_seen": 10553640573, "step": 2716, "train_runtime": 72813.8009, "train_tokens_per_second": 144940.114 }, { "epoch": 0.9638169563675062, "grad_norm": 0.22330544888973236, "learning_rate": 1.3217696800684966e-07, "loss": 0.3908, "num_input_tokens_seen": 10557473663, "step": 2717, "train_runtime": 72844.7499, "train_tokens_per_second": 144931.154 }, { "epoch": 0.9641716920893934, "grad_norm": 0.22157253324985504, "learning_rate": 1.2962564646068043e-07, "loss": 0.3916, "num_input_tokens_seen": 10561343233, "step": 2718, "train_runtime": 72875.9387, "train_tokens_per_second": 144922.226 }, { "epoch": 0.9645264278112806, "grad_norm": 0.23463773727416992, "learning_rate": 1.2709910862259477e-07, "loss": 0.3851, "num_input_tokens_seen": 10565225667, "step": 2719, "train_runtime": 72901.4174, "train_tokens_per_second": 144924.832 }, { "epoch": 0.9648811635331678, "grad_norm": 0.3645741939544678, "learning_rate": 1.245973576438675e-07, "loss": 0.3842, "num_input_tokens_seen": 10569160222, "step": 2720, "train_runtime": 72926.1668, "train_tokens_per_second": 144929.601 }, { "epoch": 0.9652358992550549, "grad_norm": 0.4616822600364685, "learning_rate": 1.2212039664485586e-07, "loss": 0.394, "num_input_tokens_seen": 10572952013, "step": 2721, "train_runtime": 72951.6063, "train_tokens_per_second": 144931.038 }, { "epoch": 0.9655906349769422, "grad_norm": 0.23334328830242157, "learning_rate": 1.196682287149975e-07, "loss": 0.3894, "num_input_tokens_seen": 10576832620, "step": 2722, "train_runtime": 72983.366, "train_tokens_per_second": 144921.14 }, { "epoch": 0.9659453706988294, "grad_norm": 0.24147841334342957, "learning_rate": 1.1724085691280806e-07, "loss": 0.3876, "num_input_tokens_seen": 10580704115, "step": 2723, "train_runtime": 73014.5435, "train_tokens_per_second": 144912.282 }, { "epoch": 0.9663001064207166, "grad_norm": 0.33494699001312256, "learning_rate": 1.148382842658724e-07, "loss": 0.3805, "num_input_tokens_seen": 10584568924, "step": 2724, "train_runtime": 73039.3449, "train_tokens_per_second": 144915.989 }, { "epoch": 0.9666548421426038, "grad_norm": 0.24496065080165863, "learning_rate": 1.1246051377084233e-07, "loss": 0.3887, "num_input_tokens_seen": 10588464798, "step": 2725, "train_runtime": 73070.5557, "train_tokens_per_second": 144907.408 }, { "epoch": 0.9670095778644909, "grad_norm": 0.32468292117118835, "learning_rate": 1.1010754839344328e-07, "loss": 0.3889, "num_input_tokens_seen": 10592362607, "step": 2726, "train_runtime": 73092.3575, "train_tokens_per_second": 144917.512 }, { "epoch": 0.9673643135863782, "grad_norm": 0.2041717916727066, "learning_rate": 1.0777939106845658e-07, "loss": 0.388, "num_input_tokens_seen": 10596147306, "step": 2727, "train_runtime": 73112.3971, "train_tokens_per_second": 144929.557 }, { "epoch": 0.9677190493082654, "grad_norm": 0.5890065431594849, "learning_rate": 1.0547604469971717e-07, "loss": 0.3812, "num_input_tokens_seen": 10600076098, "step": 2728, "train_runtime": 73131.483, "train_tokens_per_second": 144945.455 }, { "epoch": 0.9680737850301525, "grad_norm": 0.18573741614818573, "learning_rate": 1.031975121601203e-07, "loss": 0.382, "num_input_tokens_seen": 10603998977, "step": 2729, "train_runtime": 73158.5681, "train_tokens_per_second": 144945.414 }, { "epoch": 0.9684285207520398, "grad_norm": 0.26559585332870483, "learning_rate": 1.0094379629161044e-07, "loss": 0.393, "num_input_tokens_seen": 10607878188, "step": 2730, "train_runtime": 73186.7643, "train_tokens_per_second": 144942.577 }, { "epoch": 0.9687832564739269, "grad_norm": 0.2976178526878357, "learning_rate": 9.871489990517902e-08, "loss": 0.3768, "num_input_tokens_seen": 10611784638, "step": 2731, "train_runtime": 73206.1229, "train_tokens_per_second": 144957.61 }, { "epoch": 0.9691379921958141, "grad_norm": 0.2358769029378891, "learning_rate": 9.65108257808578e-08, "loss": 0.3854, "num_input_tokens_seen": 10615607390, "step": 2732, "train_runtime": 73230.8715, "train_tokens_per_second": 144960.823 }, { "epoch": 0.9694927279177014, "grad_norm": 0.21163062751293182, "learning_rate": 9.433157666772109e-08, "loss": 0.3832, "num_input_tokens_seen": 10619522315, "step": 2733, "train_runtime": 73253.3252, "train_tokens_per_second": 144969.833 }, { "epoch": 0.9698474636395885, "grad_norm": 0.2663170099258423, "learning_rate": 9.217715528387905e-08, "loss": 0.3834, "num_input_tokens_seen": 10623326411, "step": 2734, "train_runtime": 73279.0036, "train_tokens_per_second": 144970.945 }, { "epoch": 0.9702021993614757, "grad_norm": 0.28996577858924866, "learning_rate": 9.004756431647776e-08, "loss": 0.3923, "num_input_tokens_seen": 10627260067, "step": 2735, "train_runtime": 73299.2006, "train_tokens_per_second": 144984.665 }, { "epoch": 0.9705569350833629, "grad_norm": 0.4292827844619751, "learning_rate": 8.794280642169029e-08, "loss": 0.3988, "num_input_tokens_seen": 10631100382, "step": 2736, "train_runtime": 73324.8879, "train_tokens_per_second": 144986.248 }, { "epoch": 0.9709116708052501, "grad_norm": 0.22719381749629974, "learning_rate": 8.586288422471667e-08, "loss": 0.3988, "num_input_tokens_seen": 10635011897, "step": 2737, "train_runtime": 73349.2645, "train_tokens_per_second": 144991.391 }, { "epoch": 0.9712664065271372, "grad_norm": 0.370496928691864, "learning_rate": 8.380780031977509e-08, "loss": 0.3863, "num_input_tokens_seen": 10638957003, "step": 2738, "train_runtime": 73369.7071, "train_tokens_per_second": 145004.763 }, { "epoch": 0.9716211422490245, "grad_norm": 0.26827988028526306, "learning_rate": 8.177755727011515e-08, "loss": 0.3957, "num_input_tokens_seen": 10642835946, "step": 2739, "train_runtime": 73392.5539, "train_tokens_per_second": 145012.476 }, { "epoch": 0.9719758779709117, "grad_norm": 0.23580950498580933, "learning_rate": 7.977215760799351e-08, "loss": 0.3829, "num_input_tokens_seen": 10646647352, "step": 2740, "train_runtime": 73416.9717, "train_tokens_per_second": 145016.16 }, { "epoch": 0.9723306136927988, "grad_norm": 0.21447663009166718, "learning_rate": 7.779160383468488e-08, "loss": 0.3866, "num_input_tokens_seen": 10650514759, "step": 2741, "train_runtime": 73442.5749, "train_tokens_per_second": 145018.265 }, { "epoch": 0.9726853494146861, "grad_norm": 0.26273050904273987, "learning_rate": 7.583589842047101e-08, "loss": 0.3959, "num_input_tokens_seen": 10654401100, "step": 2742, "train_runtime": 73459.9171, "train_tokens_per_second": 145036.933 }, { "epoch": 0.9730400851365733, "grad_norm": 0.2745969295501709, "learning_rate": 7.390504380464514e-08, "loss": 0.3734, "num_input_tokens_seen": 10658299372, "step": 2743, "train_runtime": 73487.7881, "train_tokens_per_second": 145034.973 }, { "epoch": 0.9733948208584604, "grad_norm": 0.2892249524593353, "learning_rate": 7.199904239550304e-08, "loss": 0.3866, "num_input_tokens_seen": 10662170950, "step": 2744, "train_runtime": 73507.3847, "train_tokens_per_second": 145048.977 }, { "epoch": 0.9737495565803477, "grad_norm": 0.7271156907081604, "learning_rate": 7.011789657034085e-08, "loss": 0.3944, "num_input_tokens_seen": 10666104610, "step": 2745, "train_runtime": 73538.8481, "train_tokens_per_second": 145040.409 }, { "epoch": 0.9741042923022348, "grad_norm": 0.25853443145751953, "learning_rate": 6.826160867545284e-08, "loss": 0.3916, "num_input_tokens_seen": 10669968817, "step": 2746, "train_runtime": 73563.9655, "train_tokens_per_second": 145043.416 }, { "epoch": 0.974459028024122, "grad_norm": 0.563653290271759, "learning_rate": 6.643018102613363e-08, "loss": 0.3858, "num_input_tokens_seen": 10673852929, "step": 2747, "train_runtime": 73594.7427, "train_tokens_per_second": 145035.536 }, { "epoch": 0.9748137637460093, "grad_norm": 0.21643507480621338, "learning_rate": 6.462361590666488e-08, "loss": 0.3839, "num_input_tokens_seen": 10677692335, "step": 2748, "train_runtime": 73616.9962, "train_tokens_per_second": 145043.847 }, { "epoch": 0.9751684994678964, "grad_norm": 2.472262144088745, "learning_rate": 6.284191557031971e-08, "loss": 0.3915, "num_input_tokens_seen": 10681645315, "step": 2749, "train_runtime": 73637.102, "train_tokens_per_second": 145057.926 }, { "epoch": 0.9755232351897836, "grad_norm": 0.19923138618469238, "learning_rate": 6.108508223935605e-08, "loss": 0.3808, "num_input_tokens_seen": 10685555029, "step": 2750, "train_runtime": 73664.9199, "train_tokens_per_second": 145056.223 }, { "epoch": 0.9758779709116708, "grad_norm": 0.4935762286186218, "learning_rate": 5.935311810502331e-08, "loss": 0.3933, "num_input_tokens_seen": 10689375396, "step": 2751, "train_runtime": 73684.2065, "train_tokens_per_second": 145070.103 }, { "epoch": 0.976232706633558, "grad_norm": 0.22405782341957092, "learning_rate": 5.764602532754015e-08, "loss": 0.3972, "num_input_tokens_seen": 10693261308, "step": 2752, "train_runtime": 73710.5589, "train_tokens_per_second": 145070.957 }, { "epoch": 0.9765874423554451, "grad_norm": 0.2812807559967041, "learning_rate": 5.596380603611895e-08, "loss": 0.3932, "num_input_tokens_seen": 10697200818, "step": 2753, "train_runtime": 73735.7641, "train_tokens_per_second": 145074.794 }, { "epoch": 0.9769421780773324, "grad_norm": 0.3529277741909027, "learning_rate": 5.430646232893466e-08, "loss": 0.3975, "num_input_tokens_seen": 10701072395, "step": 2754, "train_runtime": 73760.3643, "train_tokens_per_second": 145078.898 }, { "epoch": 0.9772969137992196, "grad_norm": 0.23230616748332977, "learning_rate": 5.2673996273142626e-08, "loss": 0.3841, "num_input_tokens_seen": 10704981311, "step": 2755, "train_runtime": 73789.0861, "train_tokens_per_second": 145075.402 }, { "epoch": 0.9776516495211067, "grad_norm": 0.24899335205554962, "learning_rate": 5.106640990486744e-08, "loss": 0.3912, "num_input_tokens_seen": 10708884928, "step": 2756, "train_runtime": 73819.4483, "train_tokens_per_second": 145068.612 }, { "epoch": 0.978006385242994, "grad_norm": 0.22449573874473572, "learning_rate": 4.948370522920298e-08, "loss": 0.3965, "num_input_tokens_seen": 10712742153, "step": 2757, "train_runtime": 73841.9567, "train_tokens_per_second": 145076.629 }, { "epoch": 0.9783611209648811, "grad_norm": 0.23597358167171478, "learning_rate": 4.7925884220207944e-08, "loss": 0.3901, "num_input_tokens_seen": 10716700933, "step": 2758, "train_runtime": 73868.5571, "train_tokens_per_second": 145077.978 }, { "epoch": 0.9787158566867683, "grad_norm": 0.2554972767829895, "learning_rate": 4.6392948820903616e-08, "loss": 0.3895, "num_input_tokens_seen": 10720572024, "step": 2759, "train_runtime": 73894.3448, "train_tokens_per_second": 145079.736 }, { "epoch": 0.9790705924086556, "grad_norm": 0.24046213924884796, "learning_rate": 4.488490094327391e-08, "loss": 0.3854, "num_input_tokens_seen": 10724406628, "step": 2760, "train_runtime": 73915.8242, "train_tokens_per_second": 145089.455 }, { "epoch": 0.9794253281305427, "grad_norm": 0.30712324380874634, "learning_rate": 4.340174246826312e-08, "loss": 0.3887, "num_input_tokens_seen": 10728290682, "step": 2761, "train_runtime": 73948.626, "train_tokens_per_second": 145077.62 }, { "epoch": 0.97978006385243, "grad_norm": 0.23902933299541473, "learning_rate": 4.194347524576703e-08, "loss": 0.3771, "num_input_tokens_seen": 10732169108, "step": 2762, "train_runtime": 73979.7331, "train_tokens_per_second": 145069.043 }, { "epoch": 0.9801347995743172, "grad_norm": 0.2203664630651474, "learning_rate": 4.0510101094637376e-08, "loss": 0.3833, "num_input_tokens_seen": 10736147954, "step": 2763, "train_runtime": 74009.3968, "train_tokens_per_second": 145064.66 }, { "epoch": 0.9804895352962043, "grad_norm": 0.24352440237998962, "learning_rate": 3.91016218026774e-08, "loss": 0.3967, "num_input_tokens_seen": 10740070978, "step": 2764, "train_runtime": 74028.3446, "train_tokens_per_second": 145080.523 }, { "epoch": 0.9808442710180916, "grad_norm": 0.2981015741825104, "learning_rate": 3.771803912663963e-08, "loss": 0.3817, "num_input_tokens_seen": 10743990819, "step": 2765, "train_runtime": 74059.373, "train_tokens_per_second": 145072.668 }, { "epoch": 0.9811990067399787, "grad_norm": 0.2420603334903717, "learning_rate": 3.635935479222807e-08, "loss": 0.394, "num_input_tokens_seen": 10747899592, "step": 2766, "train_runtime": 74082.3073, "train_tokens_per_second": 145080.519 }, { "epoch": 0.9815537424618659, "grad_norm": 0.2316364049911499, "learning_rate": 3.5025570494084946e-08, "loss": 0.3869, "num_input_tokens_seen": 10751771624, "step": 2767, "train_runtime": 74114.5174, "train_tokens_per_second": 145069.711 }, { "epoch": 0.9819084781837532, "grad_norm": 0.24885252118110657, "learning_rate": 3.3716687895797295e-08, "loss": 0.3963, "num_input_tokens_seen": 10755644945, "step": 2768, "train_runtime": 74141.831, "train_tokens_per_second": 145068.51 }, { "epoch": 0.9822632139056403, "grad_norm": 0.25024041533470154, "learning_rate": 3.2432708629897e-08, "loss": 0.4011, "num_input_tokens_seen": 10759547207, "step": 2769, "train_runtime": 74169.5536, "train_tokens_per_second": 145066.9 }, { "epoch": 0.9826179496275275, "grad_norm": 0.19837725162506104, "learning_rate": 3.11736342978497e-08, "loss": 0.3865, "num_input_tokens_seen": 10763456881, "step": 2770, "train_runtime": 74189.4067, "train_tokens_per_second": 145080.779 }, { "epoch": 0.9829726853494147, "grad_norm": 0.2240581065416336, "learning_rate": 2.993946647006141e-08, "loss": 0.3796, "num_input_tokens_seen": 10767373765, "step": 2771, "train_runtime": 74210.9196, "train_tokens_per_second": 145091.502 }, { "epoch": 0.9833274210713019, "grad_norm": 0.2740337550640106, "learning_rate": 2.873020668587412e-08, "loss": 0.3858, "num_input_tokens_seen": 10771159492, "step": 2772, "train_runtime": 74230.3552, "train_tokens_per_second": 145104.512 }, { "epoch": 0.983682156793189, "grad_norm": 0.2217368185520172, "learning_rate": 2.7545856453554675e-08, "loss": 0.398, "num_input_tokens_seen": 10775052362, "step": 2773, "train_runtime": 74257.5867, "train_tokens_per_second": 145103.724 }, { "epoch": 0.9840368925150763, "grad_norm": 0.35490232706069946, "learning_rate": 2.6386417250312546e-08, "loss": 0.3878, "num_input_tokens_seen": 10778954063, "step": 2774, "train_runtime": 74278.8628, "train_tokens_per_second": 145114.689 }, { "epoch": 0.9843916282369635, "grad_norm": 0.19074490666389465, "learning_rate": 2.525189052227761e-08, "loss": 0.3829, "num_input_tokens_seen": 10782816782, "step": 2775, "train_runtime": 74300.353, "train_tokens_per_second": 145124.705 }, { "epoch": 0.9847463639588506, "grad_norm": 0.24867363274097443, "learning_rate": 2.414227768450905e-08, "loss": 0.3875, "num_input_tokens_seen": 10786713883, "step": 2776, "train_runtime": 74321.8297, "train_tokens_per_second": 145135.204 }, { "epoch": 0.9851010996807379, "grad_norm": 0.30506452918052673, "learning_rate": 2.305758012099535e-08, "loss": 0.3929, "num_input_tokens_seen": 10790613726, "step": 2777, "train_runtime": 74347.1879, "train_tokens_per_second": 145138.156 }, { "epoch": 0.985455835402625, "grad_norm": 0.4996855854988098, "learning_rate": 2.1997799184647616e-08, "loss": 0.3927, "num_input_tokens_seen": 10794527944, "step": 2778, "train_runtime": 74385.1211, "train_tokens_per_second": 145116.762 }, { "epoch": 0.9858105711245122, "grad_norm": 0.3572838604450226, "learning_rate": 2.096293619729295e-08, "loss": 0.3844, "num_input_tokens_seen": 10798367336, "step": 2779, "train_runtime": 74420.7957, "train_tokens_per_second": 145098.789 }, { "epoch": 0.9861653068463995, "grad_norm": 0.2312585860490799, "learning_rate": 1.9952992449689955e-08, "loss": 0.3919, "num_input_tokens_seen": 10802251685, "step": 2780, "train_runtime": 74451.676, "train_tokens_per_second": 145090.779 }, { "epoch": 0.9865200425682866, "grad_norm": 0.33985471725463867, "learning_rate": 1.8967969201508786e-08, "loss": 0.3893, "num_input_tokens_seen": 10806037064, "step": 2781, "train_runtime": 74476.38, "train_tokens_per_second": 145093.479 }, { "epoch": 0.9868747782901738, "grad_norm": 0.3408084213733673, "learning_rate": 1.8007867681335557e-08, "loss": 0.3849, "num_input_tokens_seen": 10809964230, "step": 2782, "train_runtime": 74497.4704, "train_tokens_per_second": 145105.118 }, { "epoch": 0.987229514012061, "grad_norm": 0.30118387937545776, "learning_rate": 1.707268908667903e-08, "loss": 0.3967, "num_input_tokens_seen": 10813813626, "step": 2783, "train_runtime": 74522.6588, "train_tokens_per_second": 145107.727 }, { "epoch": 0.9875842497339482, "grad_norm": 0.24711917340755463, "learning_rate": 1.6162434583957278e-08, "loss": 0.3956, "num_input_tokens_seen": 10817666460, "step": 2784, "train_runtime": 74548.5939, "train_tokens_per_second": 145108.927 }, { "epoch": 0.9879389854558354, "grad_norm": 0.34452980756759644, "learning_rate": 1.5277105308504347e-08, "loss": 0.393, "num_input_tokens_seen": 10821562441, "step": 2785, "train_runtime": 74568.3843, "train_tokens_per_second": 145122.662 }, { "epoch": 0.9882937211777226, "grad_norm": 0.34005793929100037, "learning_rate": 1.4416702364561386e-08, "loss": 0.3767, "num_input_tokens_seen": 10825403947, "step": 2786, "train_runtime": 74586.925, "train_tokens_per_second": 145138.092 }, { "epoch": 0.9886484568996098, "grad_norm": 0.22436240315437317, "learning_rate": 1.3581226825285509e-08, "loss": 0.392, "num_input_tokens_seen": 10829306482, "step": 2787, "train_runtime": 74616.6173, "train_tokens_per_second": 145132.638 }, { "epoch": 0.9890031926214969, "grad_norm": 0.19429348409175873, "learning_rate": 1.2770679732738711e-08, "loss": 0.382, "num_input_tokens_seen": 10833189669, "step": 2788, "train_runtime": 74637.8019, "train_tokens_per_second": 145143.471 }, { "epoch": 0.9893579283433842, "grad_norm": 0.3541187047958374, "learning_rate": 1.1985062097890077e-08, "loss": 0.3819, "num_input_tokens_seen": 10837083990, "step": 2789, "train_runtime": 74665.2079, "train_tokens_per_second": 145142.353 }, { "epoch": 0.9897126640652714, "grad_norm": 0.42917752265930176, "learning_rate": 1.1224374900615787e-08, "loss": 0.3971, "num_input_tokens_seen": 10840925383, "step": 2790, "train_runtime": 74690.3836, "train_tokens_per_second": 145144.862 }, { "epoch": 0.9900673997871585, "grad_norm": 0.21065708994865417, "learning_rate": 1.0488619089701335e-08, "loss": 0.391, "num_input_tokens_seen": 10844797514, "step": 2791, "train_runtime": 74712.4795, "train_tokens_per_second": 145153.763 }, { "epoch": 0.9904221355090458, "grad_norm": 0.25038889050483704, "learning_rate": 9.77779558283043e-09, "loss": 0.3862, "num_input_tokens_seen": 10848721994, "step": 2792, "train_runtime": 74731.3824, "train_tokens_per_second": 145169.561 }, { "epoch": 0.9907768712309329, "grad_norm": 0.35527920722961426, "learning_rate": 9.09190526659165e-09, "loss": 0.3912, "num_input_tokens_seen": 10852595709, "step": 2793, "train_runtime": 74756.6546, "train_tokens_per_second": 145172.303 }, { "epoch": 0.9911316069528201, "grad_norm": 0.2593751847743988, "learning_rate": 8.430948996474008e-09, "loss": 0.3951, "num_input_tokens_seen": 10856444691, "step": 2794, "train_runtime": 74783.9148, "train_tokens_per_second": 145170.853 }, { "epoch": 0.9914863426747074, "grad_norm": 0.21614111959934235, "learning_rate": 7.794927596869172e-09, "loss": 0.3896, "num_input_tokens_seen": 10860329454, "step": 2795, "train_runtime": 74808.5521, "train_tokens_per_second": 145174.972 }, { "epoch": 0.9918410783965945, "grad_norm": 0.2078436315059662, "learning_rate": 7.1838418610648e-09, "loss": 0.3813, "num_input_tokens_seen": 10864166991, "step": 2796, "train_runtime": 74827.8209, "train_tokens_per_second": 145188.873 }, { "epoch": 0.9921958141184817, "grad_norm": 0.24233268201351166, "learning_rate": 6.59769255125342e-09, "loss": 0.3922, "num_input_tokens_seen": 10868067209, "step": 2797, "train_runtime": 74863.0034, "train_tokens_per_second": 145172.738 }, { "epoch": 0.992550549840369, "grad_norm": 0.2365359514951706, "learning_rate": 6.036480398516898e-09, "loss": 0.3928, "num_input_tokens_seen": 10871960243, "step": 2798, "train_runtime": 74884.2089, "train_tokens_per_second": 145183.616 }, { "epoch": 0.9929052855622561, "grad_norm": 0.3567480742931366, "learning_rate": 5.500206102841965e-09, "loss": 0.3927, "num_input_tokens_seen": 10875881880, "step": 2799, "train_runtime": 74912.8887, "train_tokens_per_second": 145180.383 }, { "epoch": 0.9932600212841434, "grad_norm": 0.3678499758243561, "learning_rate": 4.988870333102469e-09, "loss": 0.3812, "num_input_tokens_seen": 10879651656, "step": 2800, "train_runtime": 74939.308, "train_tokens_per_second": 145179.505 }, { "epoch": 0.9936147570060305, "grad_norm": 0.6157177090644836, "learning_rate": 4.502473727074908e-09, "loss": 0.3744, "num_input_tokens_seen": 10883606407, "step": 2801, "train_runtime": 75084.0623, "train_tokens_per_second": 144952.285 }, { "epoch": 0.9939694927279177, "grad_norm": 0.44416946172714233, "learning_rate": 4.041016891427329e-09, "loss": 0.3827, "num_input_tokens_seen": 10887476639, "step": 2802, "train_runtime": 75111.8409, "train_tokens_per_second": 144950.204 }, { "epoch": 0.994324228449805, "grad_norm": 0.22886939346790314, "learning_rate": 3.6045004017193353e-09, "loss": 0.3907, "num_input_tokens_seen": 10891403924, "step": 2803, "train_runtime": 75142.8995, "train_tokens_per_second": 144942.556 }, { "epoch": 0.9946789641716921, "grad_norm": 0.22026100754737854, "learning_rate": 3.1929248024042956e-09, "loss": 0.3966, "num_input_tokens_seen": 10895219958, "step": 2804, "train_runtime": 75162.6148, "train_tokens_per_second": 144955.308 }, { "epoch": 0.9950336998935793, "grad_norm": 0.217874214053154, "learning_rate": 2.8062906068293537e-09, "loss": 0.3828, "num_input_tokens_seen": 10899126909, "step": 2805, "train_runtime": 75187.3036, "train_tokens_per_second": 144959.673 }, { "epoch": 0.9953884356154665, "grad_norm": 0.2713411748409271, "learning_rate": 2.444598297228762e-09, "loss": 0.3878, "num_input_tokens_seen": 10903023004, "step": 2806, "train_runtime": 75214.4966, "train_tokens_per_second": 144959.064 }, { "epoch": 0.9957431713373537, "grad_norm": 0.3075081408023834, "learning_rate": 2.107848324732764e-09, "loss": 0.3888, "num_input_tokens_seen": 10906944883, "step": 2807, "train_runtime": 75239.3081, "train_tokens_per_second": 144963.386 }, { "epoch": 0.9960979070592408, "grad_norm": 0.5299983620643616, "learning_rate": 1.796041109358715e-09, "loss": 0.3931, "num_input_tokens_seen": 10910841748, "step": 2808, "train_runtime": 75268.4549, "train_tokens_per_second": 144959.024 }, { "epoch": 0.9964526427811281, "grad_norm": 0.2318962812423706, "learning_rate": 1.5091770400132988e-09, "loss": 0.3867, "num_input_tokens_seen": 10914640443, "step": 2809, "train_runtime": 75286.5297, "train_tokens_per_second": 144974.678 }, { "epoch": 0.9968073785030153, "grad_norm": 0.5047751665115356, "learning_rate": 1.247256474492531e-09, "loss": 0.4035, "num_input_tokens_seen": 10918592275, "step": 2810, "train_runtime": 75304.7542, "train_tokens_per_second": 144992.071 }, { "epoch": 0.9971621142249024, "grad_norm": 0.19155722856521606, "learning_rate": 1.0102797394861974e-09, "loss": 0.3849, "num_input_tokens_seen": 10922440375, "step": 2811, "train_runtime": 75327.7181, "train_tokens_per_second": 144998.955 }, { "epoch": 0.9975168499467897, "grad_norm": 0.43026021122932434, "learning_rate": 7.98247130562313e-10, "loss": 0.3871, "num_input_tokens_seen": 10926305915, "step": 2812, "train_runtime": 75353.3043, "train_tokens_per_second": 145001.019 }, { "epoch": 0.9978715856686768, "grad_norm": 0.20562836527824402, "learning_rate": 6.111589121871042e-10, "loss": 0.4046, "num_input_tokens_seen": 10930165727, "step": 2813, "train_runtime": 75379.5814, "train_tokens_per_second": 145001.677 }, { "epoch": 0.998226321390564, "grad_norm": 0.18939965963363647, "learning_rate": 4.4901531770946694e-10, "loss": 0.3867, "num_input_tokens_seen": 10934094627, "step": 2814, "train_runtime": 75398.9764, "train_tokens_per_second": 145016.486 }, { "epoch": 0.9985810571124513, "grad_norm": 0.23067033290863037, "learning_rate": 3.1181654936318637e-10, "loss": 0.3939, "num_input_tokens_seen": 10937971574, "step": 2815, "train_runtime": 75428.8629, "train_tokens_per_second": 145010.426 }, { "epoch": 0.9989357928343384, "grad_norm": 0.3685580790042877, "learning_rate": 1.9956277827359872e-10, "loss": 0.3883, "num_input_tokens_seen": 10941847471, "step": 2816, "train_runtime": 75450.1107, "train_tokens_per_second": 145020.96 }, { "epoch": 0.9992905285562256, "grad_norm": 0.2531599998474121, "learning_rate": 1.1225414445092952e-10, "loss": 0.395, "num_input_tokens_seen": 10945672192, "step": 2817, "train_runtime": 75477.2765, "train_tokens_per_second": 145019.438 }, { "epoch": 0.9996452642781128, "grad_norm": 0.37901538610458374, "learning_rate": 4.989075679473487e-11, "loss": 0.3876, "num_input_tokens_seen": 10949580645, "step": 2818, "train_runtime": 75512.3193, "train_tokens_per_second": 145003.898 }, { "epoch": 1.0, "grad_norm": 0.2679806649684906, "learning_rate": 1.2472693087239862e-11, "loss": 0.3895, "num_input_tokens_seen": 10953473797, "step": 2819, "train_runtime": 75534.9661, "train_tokens_per_second": 145011.964 } ], "logging_steps": 1, "max_steps": 2819, "num_input_samples_seen": 13300361, "num_input_tokens_seen": 10953473797, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.7780491925279316e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }