diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,4683 +1,14088 @@ { - "best_metric": 0.0036811623722314835, - "best_model_checkpoint": "./results/checkpoint-1830", - "epoch": 2.0, + "best_metric": 0.002703184960409999, + "best_model_checkpoint": "./results/checkpoint-8540", + "epoch": 1.0, "eval_steps": 10, - "global_step": 3118, + "global_step": 9386, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.006414368184733804, - "grad_norm": 4.846098899841309, - "learning_rate": 1.9935856318152666e-05, - "loss": 0.5328, + "epoch": 0.001065416577881952, + "grad_norm": 2.598374366760254, + "learning_rate": 1.9978691668442363e-05, + "loss": 0.6478, "step": 10 }, { - "epoch": 0.006414368184733804, - "eval_loss": 0.4104921221733093, - "eval_runtime": 2.8279, - "eval_samples_per_second": 4408.52, - "eval_steps_per_second": 137.91, + "epoch": 0.001065416577881952, + "eval_loss": 0.5497280955314636, + "eval_runtime": 34.7845, + "eval_samples_per_second": 4317.273, + "eval_steps_per_second": 67.473, "step": 10 }, { - "epoch": 0.012828736369467608, - "grad_norm": 2.26491641998291, - "learning_rate": 1.9871712636305324e-05, - "loss": 0.3602, + "epoch": 0.002130833155763904, + "grad_norm": 2.8029165267944336, + "learning_rate": 1.9957383336884725e-05, + "loss": 0.4639, "step": 20 }, { - "epoch": 0.012828736369467608, - "eval_loss": 0.29217612743377686, - "eval_runtime": 2.7561, - "eval_samples_per_second": 4523.399, - "eval_steps_per_second": 141.504, + "epoch": 0.002130833155763904, + "eval_loss": 0.3493800461292267, + "eval_runtime": 34.8493, + "eval_samples_per_second": 4309.239, + "eval_steps_per_second": 67.347, "step": 20 }, { - "epoch": 0.01924310455420141, - "grad_norm": 1.9946489334106445, - "learning_rate": 1.9807568954457988e-05, - "loss": 0.2535, + "epoch": 0.0031962497336458554, + "grad_norm": 1.549603819847107, + "learning_rate": 1.9936075005327084e-05, + "loss": 0.2637, "step": 30 }, { - "epoch": 0.01924310455420141, - "eval_loss": 0.20272396504878998, - "eval_runtime": 2.7868, - "eval_samples_per_second": 4473.656, - "eval_steps_per_second": 139.948, + "epoch": 0.0031962497336458554, + "eval_loss": 0.17470526695251465, + "eval_runtime": 35.0922, + "eval_samples_per_second": 4279.407, + "eval_steps_per_second": 66.881, "step": 30 }, { - "epoch": 0.025657472738935216, - "grad_norm": 1.5090171098709106, - "learning_rate": 1.974342527261065e-05, - "loss": 0.1843, + "epoch": 0.004261666311527808, + "grad_norm": 0.5110528469085693, + "learning_rate": 1.9914766673769446e-05, + "loss": 0.1509, "step": 40 }, { - "epoch": 0.025657472738935216, - "eval_loss": 0.13557858765125275, - "eval_runtime": 2.7501, - "eval_samples_per_second": 4533.352, - "eval_steps_per_second": 141.815, + "epoch": 0.004261666311527808, + "eval_loss": 0.10934165120124817, + "eval_runtime": 34.8506, + "eval_samples_per_second": 4309.077, + "eval_steps_per_second": 67.345, "step": 40 }, { - "epoch": 0.03207184092366902, - "grad_norm": 1.2906427383422852, - "learning_rate": 1.967928159076331e-05, - "loss": 0.1196, + "epoch": 0.005327082889409759, + "grad_norm": 0.5021756887435913, + "learning_rate": 1.9893458342211807e-05, + "loss": 0.1135, "step": 50 }, { - "epoch": 0.03207184092366902, - "eval_loss": 0.09848916530609131, - "eval_runtime": 2.8508, - "eval_samples_per_second": 4373.15, - "eval_steps_per_second": 136.803, + "epoch": 0.005327082889409759, + "eval_loss": 0.08590664714574814, + "eval_runtime": 34.8851, + "eval_samples_per_second": 4304.813, + "eval_steps_per_second": 67.278, "step": 50 }, { - "epoch": 0.03848620910840282, - "grad_norm": 1.8510209321975708, - "learning_rate": 1.9615137908915974e-05, - "loss": 0.077, + "epoch": 0.006392499467291711, + "grad_norm": 0.8349336385726929, + "learning_rate": 1.987215001065417e-05, + "loss": 0.1037, "step": 60 }, { - "epoch": 0.03848620910840282, - "eval_loss": 0.0683838352560997, - "eval_runtime": 2.7704, - "eval_samples_per_second": 4500.147, - "eval_steps_per_second": 140.776, + "epoch": 0.006392499467291711, + "eval_loss": 0.07590685039758682, + "eval_runtime": 34.9244, + "eval_samples_per_second": 4299.977, + "eval_steps_per_second": 67.202, "step": 60 }, { - "epoch": 0.04490057729313662, - "grad_norm": 0.5595826506614685, - "learning_rate": 1.9550994227068635e-05, - "loss": 0.0694, + "epoch": 0.007457916045173663, + "grad_norm": 0.5514059066772461, + "learning_rate": 1.9850841679096528e-05, + "loss": 0.0652, "step": 70 }, { - "epoch": 0.04490057729313662, - "eval_loss": 0.05297030881047249, - "eval_runtime": 2.754, - "eval_samples_per_second": 4526.939, - "eval_steps_per_second": 141.614, + "epoch": 0.007457916045173663, + "eval_loss": 0.06897564232349396, + "eval_runtime": 34.8575, + "eval_samples_per_second": 4308.227, + "eval_steps_per_second": 67.331, "step": 70 }, { - "epoch": 0.05131494547787043, - "grad_norm": 0.836297869682312, - "learning_rate": 1.9486850545221296e-05, - "loss": 0.0452, + "epoch": 0.008523332623055616, + "grad_norm": 0.4209994673728943, + "learning_rate": 1.982953334753889e-05, + "loss": 0.0596, "step": 80 }, { - "epoch": 0.05131494547787043, - "eval_loss": 0.04719825088977814, - "eval_runtime": 2.7488, - "eval_samples_per_second": 4535.467, - "eval_steps_per_second": 141.881, + "epoch": 0.008523332623055616, + "eval_loss": 0.060382526367902756, + "eval_runtime": 34.9303, + "eval_samples_per_second": 4299.245, + "eval_steps_per_second": 67.191, "step": 80 }, { - "epoch": 0.05772931366260423, - "grad_norm": 3.999357223510742, - "learning_rate": 1.942270686337396e-05, - "loss": 0.0533, + "epoch": 0.009588749200937566, + "grad_norm": 0.4787921905517578, + "learning_rate": 1.980822501598125e-05, + "loss": 0.0729, "step": 90 }, { - "epoch": 0.05772931366260423, - "eval_loss": 0.035153161734342575, - "eval_runtime": 2.7687, - "eval_samples_per_second": 4502.755, - "eval_steps_per_second": 140.858, + "epoch": 0.009588749200937566, + "eval_loss": 0.04821014031767845, + "eval_runtime": 34.9624, + "eval_samples_per_second": 4295.297, + "eval_steps_per_second": 67.129, "step": 90 }, { - "epoch": 0.06414368184733804, - "grad_norm": 0.37607356905937195, - "learning_rate": 1.935856318152662e-05, - "loss": 0.0376, + "epoch": 0.010654165778819518, + "grad_norm": 0.5219190120697021, + "learning_rate": 1.9786916684423613e-05, + "loss": 0.0312, "step": 100 }, { - "epoch": 0.06414368184733804, - "eval_loss": 0.02929052524268627, - "eval_runtime": 2.7416, - "eval_samples_per_second": 4547.36, - "eval_steps_per_second": 142.253, + "epoch": 0.010654165778819518, + "eval_loss": 0.0431891568005085, + "eval_runtime": 34.9505, + "eval_samples_per_second": 4296.761, + "eval_steps_per_second": 67.152, "step": 100 }, { - "epoch": 0.07055805003207184, - "grad_norm": 0.2907819151878357, - "learning_rate": 1.9294419499679282e-05, - "loss": 0.0199, + "epoch": 0.01171958235670147, + "grad_norm": 1.1521271467208862, + "learning_rate": 1.976560835286597e-05, + "loss": 0.037, "step": 110 }, { - "epoch": 0.07055805003207184, - "eval_loss": 0.024354618042707443, - "eval_runtime": 2.7981, - "eval_samples_per_second": 4455.483, - "eval_steps_per_second": 139.379, + "epoch": 0.01171958235670147, + "eval_loss": 0.03519212082028389, + "eval_runtime": 34.9299, + "eval_samples_per_second": 4299.301, + "eval_steps_per_second": 67.192, "step": 110 }, { - "epoch": 0.07697241821680564, - "grad_norm": 0.2949336767196655, - "learning_rate": 1.9230275817831943e-05, - "loss": 0.0175, + "epoch": 0.012784998934583422, + "grad_norm": 0.20447871088981628, + "learning_rate": 1.9744300021308334e-05, + "loss": 0.0279, "step": 120 }, { - "epoch": 0.07697241821680564, - "eval_loss": 0.021745959296822548, - "eval_runtime": 2.7494, - "eval_samples_per_second": 4534.52, - "eval_steps_per_second": 141.852, + "epoch": 0.012784998934583422, + "eval_loss": 0.02899288199841976, + "eval_runtime": 34.9533, + "eval_samples_per_second": 4296.421, + "eval_steps_per_second": 67.147, "step": 120 }, { - "epoch": 0.08338678640153944, - "grad_norm": 0.2311653196811676, - "learning_rate": 1.9166132135984608e-05, - "loss": 0.0203, + "epoch": 0.013850415512465374, + "grad_norm": 0.18650375306606293, + "learning_rate": 1.9722991689750695e-05, + "loss": 0.0215, "step": 130 }, { - "epoch": 0.08338678640153944, - "eval_loss": 0.0190635584294796, - "eval_runtime": 2.7863, - "eval_samples_per_second": 4474.414, - "eval_steps_per_second": 139.971, + "epoch": 0.013850415512465374, + "eval_loss": 0.025864260271191597, + "eval_runtime": 35.0071, + "eval_samples_per_second": 4289.821, + "eval_steps_per_second": 67.044, "step": 130 }, { - "epoch": 0.08980115458627325, - "grad_norm": 0.2001882642507553, - "learning_rate": 1.910198845413727e-05, - "loss": 0.0168, + "epoch": 0.014915832090347326, + "grad_norm": 0.32700005173683167, + "learning_rate": 1.9701683358193057e-05, + "loss": 0.0259, "step": 140 }, { - "epoch": 0.08980115458627325, - "eval_loss": 0.017704442143440247, - "eval_runtime": 2.7325, - "eval_samples_per_second": 4562.549, - "eval_steps_per_second": 142.728, + "epoch": 0.014915832090347326, + "eval_loss": 0.0220870953053236, + "eval_runtime": 34.9891, + "eval_samples_per_second": 4292.02, + "eval_steps_per_second": 67.078, "step": 140 }, { - "epoch": 0.09621552277100706, - "grad_norm": 0.21124917268753052, - "learning_rate": 1.903784477228993e-05, - "loss": 0.019, + "epoch": 0.015981248668229277, + "grad_norm": 0.601952850818634, + "learning_rate": 1.9680375026635416e-05, + "loss": 0.0227, "step": 150 }, { - "epoch": 0.09621552277100706, - "eval_loss": 0.01564132049679756, - "eval_runtime": 2.795, - "eval_samples_per_second": 4460.445, - "eval_steps_per_second": 139.534, + "epoch": 0.015981248668229277, + "eval_loss": 0.01984524168074131, + "eval_runtime": 34.9806, + "eval_samples_per_second": 4293.068, + "eval_steps_per_second": 67.094, "step": 150 }, { - "epoch": 0.10262989095574086, - "grad_norm": 0.1619035303592682, - "learning_rate": 1.8973701090442594e-05, - "loss": 0.0116, + "epoch": 0.01704666524611123, + "grad_norm": 0.5832983255386353, + "learning_rate": 1.9659066695077777e-05, + "loss": 0.0129, "step": 160 }, { - "epoch": 0.10262989095574086, - "eval_loss": 0.014719261787831783, - "eval_runtime": 2.7377, - "eval_samples_per_second": 4553.795, - "eval_steps_per_second": 142.454, + "epoch": 0.01704666524611123, + "eval_loss": 0.018882030621170998, + "eval_runtime": 34.9919, + "eval_samples_per_second": 4291.685, + "eval_steps_per_second": 67.073, "step": 160 }, { - "epoch": 0.10904425914047466, - "grad_norm": 5.417097091674805, - "learning_rate": 1.8909557408595255e-05, - "loss": 0.0126, + "epoch": 0.01811208182399318, + "grad_norm": 0.5878477096557617, + "learning_rate": 1.963775836352014e-05, + "loss": 0.0142, "step": 170 }, { - "epoch": 0.10904425914047466, - "eval_loss": 0.014089061878621578, - "eval_runtime": 2.7909, - "eval_samples_per_second": 4467.003, - "eval_steps_per_second": 139.739, + "epoch": 0.01811208182399318, + "eval_loss": 0.01820511370897293, + "eval_runtime": 34.9938, + "eval_samples_per_second": 4291.445, + "eval_steps_per_second": 67.069, "step": 170 }, { - "epoch": 0.11545862732520847, - "grad_norm": 0.23231153190135956, - "learning_rate": 1.8845413726747916e-05, - "loss": 0.0095, + "epoch": 0.01917749840187513, + "grad_norm": 0.11928685754537582, + "learning_rate": 1.96164500319625e-05, + "loss": 0.0112, "step": 180 }, { - "epoch": 0.11545862732520847, - "eval_loss": 0.01361811999231577, - "eval_runtime": 2.678, - "eval_samples_per_second": 4655.296, - "eval_steps_per_second": 145.63, + "epoch": 0.01917749840187513, + "eval_loss": 0.015623863786458969, + "eval_runtime": 34.9739, + "eval_samples_per_second": 4293.893, + "eval_steps_per_second": 67.107, "step": 180 }, { - "epoch": 0.12187299550994227, - "grad_norm": 0.12273150682449341, - "learning_rate": 1.878127004490058e-05, - "loss": 0.0077, + "epoch": 0.020242914979757085, + "grad_norm": 0.16800431907176971, + "learning_rate": 1.959514170040486e-05, + "loss": 0.0204, "step": 190 }, { - "epoch": 0.12187299550994227, - "eval_loss": 0.012749651446938515, - "eval_runtime": 2.4162, - "eval_samples_per_second": 5159.82, - "eval_steps_per_second": 161.413, + "epoch": 0.020242914979757085, + "eval_loss": 0.014880867674946785, + "eval_runtime": 34.9967, + "eval_samples_per_second": 4291.092, + "eval_steps_per_second": 67.063, "step": 190 }, { - "epoch": 0.12828736369467608, - "grad_norm": 0.1562490612268448, - "learning_rate": 1.871712636305324e-05, - "loss": 0.013, + "epoch": 0.021308331557639035, + "grad_norm": 0.18934427201747894, + "learning_rate": 1.957383336884722e-05, + "loss": 0.0217, "step": 200 }, { - "epoch": 0.12828736369467608, - "eval_loss": 0.01220065075904131, - "eval_runtime": 2.4215, - "eval_samples_per_second": 5148.566, - "eval_steps_per_second": 161.06, + "epoch": 0.021308331557639035, + "eval_loss": 0.015829147771000862, + "eval_runtime": 35.0483, + "eval_samples_per_second": 4284.768, + "eval_steps_per_second": 66.965, "step": 200 }, { - "epoch": 0.13470173187940987, - "grad_norm": 0.10928566008806229, - "learning_rate": 1.8652982681205902e-05, - "loss": 0.0067, + "epoch": 0.02237374813552099, + "grad_norm": 0.13677361607551575, + "learning_rate": 1.9552525037289583e-05, + "loss": 0.0242, "step": 210 }, { - "epoch": 0.13470173187940987, - "eval_loss": 0.011816415004432201, - "eval_runtime": 2.4392, - "eval_samples_per_second": 5111.048, - "eval_steps_per_second": 159.887, + "epoch": 0.02237374813552099, + "eval_loss": 0.013792283833026886, + "eval_runtime": 35.0652, + "eval_samples_per_second": 4282.709, + "eval_steps_per_second": 66.932, "step": 210 }, { - "epoch": 0.14111610006414368, - "grad_norm": 0.14731262624263763, - "learning_rate": 1.8588838999358566e-05, - "loss": 0.0058, + "epoch": 0.02343916471340294, + "grad_norm": 0.09131798893213272, + "learning_rate": 1.9531216705731945e-05, + "loss": 0.0149, "step": 220 }, { - "epoch": 0.14111610006414368, - "eval_loss": 0.011285252869129181, - "eval_runtime": 2.592, - "eval_samples_per_second": 4809.839, - "eval_steps_per_second": 150.464, + "epoch": 0.02343916471340294, + "eval_loss": 0.012414357624948025, + "eval_runtime": 35.0415, + "eval_samples_per_second": 4285.601, + "eval_steps_per_second": 66.978, "step": 220 }, { - "epoch": 0.14753046824887747, - "grad_norm": 0.20062246918678284, - "learning_rate": 1.8524695317511227e-05, - "loss": 0.0046, + "epoch": 0.024504581291284893, + "grad_norm": 1.9855362176895142, + "learning_rate": 1.9509908374174304e-05, + "loss": 0.0213, "step": 230 }, { - "epoch": 0.14753046824887747, - "eval_loss": 0.011024047620594501, - "eval_runtime": 2.7581, - "eval_samples_per_second": 4520.102, - "eval_steps_per_second": 141.4, + "epoch": 0.024504581291284893, + "eval_loss": 0.013272976502776146, + "eval_runtime": 35.023, + "eval_samples_per_second": 4287.865, + "eval_steps_per_second": 67.013, "step": 230 }, { - "epoch": 0.1539448364336113, - "grad_norm": 0.08004695922136307, - "learning_rate": 1.8460551635663888e-05, - "loss": 0.0068, + "epoch": 0.025569997869166843, + "grad_norm": 0.9082479476928711, + "learning_rate": 1.9488600042616665e-05, + "loss": 0.0199, "step": 240 }, { - "epoch": 0.1539448364336113, - "eval_loss": 0.01156590785831213, - "eval_runtime": 2.781, - "eval_samples_per_second": 4482.909, - "eval_steps_per_second": 140.237, + "epoch": 0.025569997869166843, + "eval_loss": 0.01087925210595131, + "eval_runtime": 35.0338, + "eval_samples_per_second": 4286.545, + "eval_steps_per_second": 66.992, "step": 240 }, { - "epoch": 0.1603592046183451, - "grad_norm": 0.10583271086215973, - "learning_rate": 1.839640795381655e-05, - "loss": 0.0185, + "epoch": 0.026635414447048797, + "grad_norm": 0.14924249053001404, + "learning_rate": 1.9467291711059027e-05, + "loss": 0.0169, "step": 250 }, { - "epoch": 0.1603592046183451, - "eval_loss": 0.010944708250463009, - "eval_runtime": 2.751, - "eval_samples_per_second": 4531.758, - "eval_steps_per_second": 141.765, + "epoch": 0.026635414447048797, + "eval_loss": 0.012179452925920486, + "eval_runtime": 35.0707, + "eval_samples_per_second": 4282.032, + "eval_steps_per_second": 66.922, "step": 250 }, { - "epoch": 0.1667735728030789, - "grad_norm": 0.08356133848428726, - "learning_rate": 1.8332264271969214e-05, - "loss": 0.0047, + "epoch": 0.027700831024930747, + "grad_norm": 0.08655331283807755, + "learning_rate": 1.944598337950139e-05, + "loss": 0.01, "step": 260 }, { - "epoch": 0.1667735728030789, - "eval_loss": 0.010701462626457214, - "eval_runtime": 2.7856, - "eval_samples_per_second": 4475.594, - "eval_steps_per_second": 140.008, + "epoch": 0.027700831024930747, + "eval_loss": 0.00992455706000328, + "eval_runtime": 35.0605, + "eval_samples_per_second": 4283.286, + "eval_steps_per_second": 66.941, "step": 260 }, { - "epoch": 0.1731879409878127, - "grad_norm": 0.07829653471708298, - "learning_rate": 1.8268120590121874e-05, - "loss": 0.0063, + "epoch": 0.0287662476028127, + "grad_norm": 0.0683509036898613, + "learning_rate": 1.9424675047943748e-05, + "loss": 0.0064, "step": 270 }, { - "epoch": 0.1731879409878127, - "eval_loss": 0.010307610966265202, - "eval_runtime": 2.7448, - "eval_samples_per_second": 4541.989, - "eval_steps_per_second": 142.085, + "epoch": 0.0287662476028127, + "eval_loss": 0.011564863845705986, + "eval_runtime": 35.1547, + "eval_samples_per_second": 4271.81, + "eval_steps_per_second": 66.762, "step": 270 }, { - "epoch": 0.1796023091725465, - "grad_norm": 0.09023799002170563, - "learning_rate": 1.8203976908274535e-05, - "loss": 0.012, + "epoch": 0.02983166418069465, + "grad_norm": 0.07224300503730774, + "learning_rate": 1.940336671638611e-05, + "loss": 0.0053, "step": 280 }, { - "epoch": 0.1796023091725465, - "eval_loss": 0.010143229737877846, - "eval_runtime": 2.7579, - "eval_samples_per_second": 4520.523, - "eval_steps_per_second": 141.414, + "epoch": 0.02983166418069465, + "eval_loss": 0.011479129083454609, + "eval_runtime": 35.0371, + "eval_samples_per_second": 4286.14, + "eval_steps_per_second": 66.986, "step": 280 }, { - "epoch": 0.1860166773572803, - "grad_norm": 0.09888483583927155, - "learning_rate": 1.81398332264272e-05, - "loss": 0.0061, + "epoch": 0.030897080758576605, + "grad_norm": 0.07006030529737473, + "learning_rate": 1.938205838482847e-05, + "loss": 0.0096, "step": 290 }, { - "epoch": 0.1860166773572803, - "eval_loss": 0.010016077198088169, - "eval_runtime": 2.7506, - "eval_samples_per_second": 4532.456, - "eval_steps_per_second": 141.787, + "epoch": 0.030897080758576605, + "eval_loss": 0.009303942322731018, + "eval_runtime": 35.042, + "eval_samples_per_second": 4285.543, + "eval_steps_per_second": 66.977, "step": 290 }, { - "epoch": 0.19243104554201412, - "grad_norm": 0.19275423884391785, - "learning_rate": 1.807568954457986e-05, - "loss": 0.004, + "epoch": 0.031962497336458555, + "grad_norm": 0.13934771716594696, + "learning_rate": 1.9360750053270833e-05, + "loss": 0.0038, "step": 300 }, { - "epoch": 0.19243104554201412, - "eval_loss": 0.010140118189156055, - "eval_runtime": 2.7433, - "eval_samples_per_second": 4544.529, - "eval_steps_per_second": 142.165, + "epoch": 0.031962497336458555, + "eval_loss": 0.009287681430578232, + "eval_runtime": 35.0082, + "eval_samples_per_second": 4289.686, + "eval_steps_per_second": 67.042, "step": 300 }, { - "epoch": 0.1988454137267479, - "grad_norm": 0.6617489457130432, - "learning_rate": 1.801154586273252e-05, - "loss": 0.004, + "epoch": 0.033027913914340505, + "grad_norm": 0.13616764545440674, + "learning_rate": 1.933944172171319e-05, + "loss": 0.0062, "step": 310 }, { - "epoch": 0.1988454137267479, - "eval_loss": 0.00996321253478527, - "eval_runtime": 2.7928, - "eval_samples_per_second": 4464.0, - "eval_steps_per_second": 139.645, + "epoch": 0.033027913914340505, + "eval_loss": 0.009377561509609222, + "eval_runtime": 34.9955, + "eval_samples_per_second": 4291.232, + "eval_steps_per_second": 67.066, "step": 310 }, { - "epoch": 0.20525978191148173, - "grad_norm": 0.08185740560293198, - "learning_rate": 1.7947402180885186e-05, - "loss": 0.0033, + "epoch": 0.03409333049222246, + "grad_norm": 0.05875202640891075, + "learning_rate": 1.9318133390155553e-05, + "loss": 0.0049, "step": 320 }, { - "epoch": 0.20525978191148173, - "eval_loss": 0.009846839122474194, - "eval_runtime": 2.7484, - "eval_samples_per_second": 4536.056, - "eval_steps_per_second": 141.9, + "epoch": 0.03409333049222246, + "eval_loss": 0.008939397521317005, + "eval_runtime": 35.0043, + "eval_samples_per_second": 4290.158, + "eval_steps_per_second": 67.049, "step": 320 }, { - "epoch": 0.2116741500962155, - "grad_norm": 0.07048241049051285, - "learning_rate": 1.7883258499037847e-05, - "loss": 0.0035, + "epoch": 0.03515874707010441, + "grad_norm": 1.2608754634857178, + "learning_rate": 1.9296825058597915e-05, + "loss": 0.0047, "step": 330 }, { - "epoch": 0.2116741500962155, - "eval_loss": 0.009670664556324482, - "eval_runtime": 2.8146, - "eval_samples_per_second": 4429.476, - "eval_steps_per_second": 138.565, + "epoch": 0.03515874707010441, + "eval_loss": 0.00868003349751234, + "eval_runtime": 35.0087, + "eval_samples_per_second": 4289.62, + "eval_steps_per_second": 67.04, "step": 330 }, { - "epoch": 0.21808851828094933, - "grad_norm": 0.05294159799814224, - "learning_rate": 1.7819114817190508e-05, - "loss": 0.0063, + "epoch": 0.03622416364798636, + "grad_norm": 0.045649394392967224, + "learning_rate": 1.9275516727040277e-05, + "loss": 0.0039, "step": 340 }, { - "epoch": 0.21808851828094933, - "eval_loss": 0.00945183914154768, - "eval_runtime": 2.7473, - "eval_samples_per_second": 4537.971, - "eval_steps_per_second": 141.959, + "epoch": 0.03622416364798636, + "eval_loss": 0.00883927196264267, + "eval_runtime": 34.9953, + "eval_samples_per_second": 4291.265, + "eval_steps_per_second": 67.066, "step": 340 }, { - "epoch": 0.22450288646568314, - "grad_norm": 0.08298569917678833, - "learning_rate": 1.7754971135343172e-05, + "epoch": 0.03728958022586831, + "grad_norm": 0.052812762558460236, + "learning_rate": 1.9254208395482635e-05, "loss": 0.0029, "step": 350 }, { - "epoch": 0.22450288646568314, - "eval_loss": 0.00936791580170393, - "eval_runtime": 2.7685, - "eval_samples_per_second": 4503.121, - "eval_steps_per_second": 140.869, + "epoch": 0.03728958022586831, + "eval_loss": 0.008632567711174488, + "eval_runtime": 35.0069, + "eval_samples_per_second": 4289.834, + "eval_steps_per_second": 67.044, "step": 350 }, { - "epoch": 0.23091725465041693, - "grad_norm": 0.05115268751978874, - "learning_rate": 1.769082745349583e-05, - "loss": 0.0033, + "epoch": 0.03835499680375026, + "grad_norm": 0.36764615774154663, + "learning_rate": 1.9232900063924994e-05, + "loss": 0.0082, "step": 360 }, { - "epoch": 0.23091725465041693, - "eval_loss": 0.009066049940884113, - "eval_runtime": 2.7558, - "eval_samples_per_second": 4523.868, - "eval_steps_per_second": 141.518, + "epoch": 0.03835499680375026, + "eval_loss": 0.008467404171824455, + "eval_runtime": 34.9954, + "eval_samples_per_second": 4291.254, + "eval_steps_per_second": 67.066, "step": 360 }, { - "epoch": 0.23733162283515075, - "grad_norm": 0.05516692250967026, - "learning_rate": 1.7626683771648494e-05, - "loss": 0.0028, + "epoch": 0.03942041338163222, + "grad_norm": 0.046024467796087265, + "learning_rate": 1.9211591732367356e-05, + "loss": 0.0091, "step": 370 }, { - "epoch": 0.23733162283515075, - "eval_loss": 0.008907307870686054, - "eval_runtime": 2.7915, - "eval_samples_per_second": 4466.028, - "eval_steps_per_second": 139.709, + "epoch": 0.03942041338163222, + "eval_loss": 0.008618910796940327, + "eval_runtime": 34.9555, + "eval_samples_per_second": 4296.153, + "eval_steps_per_second": 67.143, "step": 370 }, { - "epoch": 0.24374599101988453, - "grad_norm": 0.04760660603642464, - "learning_rate": 1.7562540089801155e-05, - "loss": 0.0196, + "epoch": 0.04048582995951417, + "grad_norm": 2.4632503986358643, + "learning_rate": 1.9190283400809718e-05, + "loss": 0.0088, "step": 380 }, { - "epoch": 0.24374599101988453, - "eval_loss": 0.008900969289243221, - "eval_runtime": 2.746, - "eval_samples_per_second": 4540.006, - "eval_steps_per_second": 142.023, + "epoch": 0.04048582995951417, + "eval_loss": 0.010347607545554638, + "eval_runtime": 34.9756, + "eval_samples_per_second": 4293.684, + "eval_steps_per_second": 67.104, "step": 380 }, { - "epoch": 0.2501603592046183, - "grad_norm": 0.08314153552055359, - "learning_rate": 1.7498396407953816e-05, - "loss": 0.0031, + "epoch": 0.04155124653739612, + "grad_norm": 0.060661379247903824, + "learning_rate": 1.916897506925208e-05, + "loss": 0.0144, "step": 390 }, { - "epoch": 0.2501603592046183, - "eval_loss": 0.009380661882460117, - "eval_runtime": 2.7831, - "eval_samples_per_second": 4479.473, - "eval_steps_per_second": 140.129, + "epoch": 0.04155124653739612, + "eval_loss": 0.007851127535104752, + "eval_runtime": 35.016, + "eval_samples_per_second": 4288.73, + "eval_steps_per_second": 67.027, "step": 390 }, { - "epoch": 0.25657472738935216, - "grad_norm": 0.06383411586284637, - "learning_rate": 1.743425272610648e-05, - "loss": 0.0097, + "epoch": 0.04261666311527807, + "grad_norm": 0.0612405426800251, + "learning_rate": 1.9147666737694438e-05, + "loss": 0.0212, "step": 400 }, { - "epoch": 0.25657472738935216, - "eval_loss": 0.007914945483207703, - "eval_runtime": 2.4093, - "eval_samples_per_second": 5174.549, - "eval_steps_per_second": 161.873, + "epoch": 0.04261666311527807, + "eval_loss": 0.008376965299248695, + "eval_runtime": 35.0394, + "eval_samples_per_second": 4285.857, + "eval_steps_per_second": 66.982, "step": 400 }, { - "epoch": 0.26298909557408595, - "grad_norm": 0.05818384885787964, - "learning_rate": 1.737010904425914e-05, - "loss": 0.0284, + "epoch": 0.04368207969316003, + "grad_norm": 0.05073362961411476, + "learning_rate": 1.91263584061368e-05, + "loss": 0.008, "step": 410 }, { - "epoch": 0.26298909557408595, - "eval_loss": 0.0076097771525382996, - "eval_runtime": 2.4156, - "eval_samples_per_second": 5160.954, - "eval_steps_per_second": 161.448, + "epoch": 0.04368207969316003, + "eval_loss": 0.008724682033061981, + "eval_runtime": 35.0067, + "eval_samples_per_second": 4289.87, + "eval_steps_per_second": 67.044, "step": 410 }, { - "epoch": 0.26940346375881974, - "grad_norm": 0.04634961113333702, - "learning_rate": 1.7305965362411802e-05, - "loss": 0.0028, + "epoch": 0.04474749627104198, + "grad_norm": 0.06536891311407089, + "learning_rate": 1.910505007457916e-05, + "loss": 0.0055, "step": 420 }, { - "epoch": 0.26940346375881974, - "eval_loss": 0.007537364028394222, - "eval_runtime": 2.4317, - "eval_samples_per_second": 5126.827, - "eval_steps_per_second": 160.38, + "epoch": 0.04474749627104198, + "eval_loss": 0.007841785438358784, + "eval_runtime": 35.0751, + "eval_samples_per_second": 4281.496, + "eval_steps_per_second": 66.914, "step": 420 }, { - "epoch": 0.2758178319435536, - "grad_norm": 0.05479798465967178, - "learning_rate": 1.7241821680564467e-05, - "loss": 0.0267, + "epoch": 0.04581291284892393, + "grad_norm": 0.1056961938738823, + "learning_rate": 1.9083741743021523e-05, + "loss": 0.0028, "step": 430 }, { - "epoch": 0.2758178319435536, - "eval_loss": 0.007072314620018005, - "eval_runtime": 2.4166, - "eval_samples_per_second": 5158.936, - "eval_steps_per_second": 161.385, + "epoch": 0.04581291284892393, + "eval_loss": 0.0076719019562006, + "eval_runtime": 34.9762, + "eval_samples_per_second": 4293.602, + "eval_steps_per_second": 67.103, "step": 430 }, { - "epoch": 0.28223220012828737, - "grad_norm": 0.05063086375594139, - "learning_rate": 1.7177677998717128e-05, - "loss": 0.0053, + "epoch": 0.04687832942680588, + "grad_norm": 0.0451618917286396, + "learning_rate": 1.9062433411463882e-05, + "loss": 0.0133, "step": 440 }, { - "epoch": 0.28223220012828737, - "eval_loss": 0.007745321840047836, - "eval_runtime": 2.4878, - "eval_samples_per_second": 5011.194, - "eval_steps_per_second": 156.763, + "epoch": 0.04687832942680588, + "eval_loss": 0.007590805646032095, + "eval_runtime": 35.0087, + "eval_samples_per_second": 4289.616, + "eval_steps_per_second": 67.04, "step": 440 }, { - "epoch": 0.28864656831302116, - "grad_norm": 0.05067560821771622, - "learning_rate": 1.711353431686979e-05, - "loss": 0.0154, + "epoch": 0.047943746004687836, + "grad_norm": 1.1059614419937134, + "learning_rate": 1.9041125079906244e-05, + "loss": 0.0168, "step": 450 }, { - "epoch": 0.28864656831302116, - "eval_loss": 0.007782284170389175, - "eval_runtime": 2.7555, - "eval_samples_per_second": 4524.446, - "eval_steps_per_second": 141.536, + "epoch": 0.047943746004687836, + "eval_loss": 0.007430546451359987, + "eval_runtime": 35.0188, + "eval_samples_per_second": 4288.385, + "eval_steps_per_second": 67.021, "step": 450 }, { - "epoch": 0.29506093649775494, - "grad_norm": 0.0449526272714138, - "learning_rate": 1.7049390635022453e-05, - "loss": 0.0024, + "epoch": 0.049009162582569786, + "grad_norm": 0.06677515804767609, + "learning_rate": 1.9019816748348605e-05, + "loss": 0.0069, "step": 460 }, { - "epoch": 0.29506093649775494, - "eval_loss": 0.007555678952485323, - "eval_runtime": 2.7641, - "eval_samples_per_second": 4510.352, - "eval_steps_per_second": 141.095, + "epoch": 0.049009162582569786, + "eval_loss": 0.008023254573345184, + "eval_runtime": 34.9882, + "eval_samples_per_second": 4292.131, + "eval_steps_per_second": 67.08, "step": 460 }, { - "epoch": 0.3014753046824888, - "grad_norm": 0.051270872354507446, - "learning_rate": 1.6985246953175114e-05, - "loss": 0.0026, + "epoch": 0.050074579160451736, + "grad_norm": 0.04259790852665901, + "learning_rate": 1.8998508416790967e-05, + "loss": 0.0047, "step": 470 }, { - "epoch": 0.3014753046824888, - "eval_loss": 0.007208110298961401, - "eval_runtime": 2.7392, - "eval_samples_per_second": 4551.376, - "eval_steps_per_second": 142.379, + "epoch": 0.050074579160451736, + "eval_loss": 0.007901841774582863, + "eval_runtime": 35.0652, + "eval_samples_per_second": 4282.71, + "eval_steps_per_second": 66.932, "step": 470 }, { - "epoch": 0.3078896728672226, - "grad_norm": 0.03896138072013855, - "learning_rate": 1.6921103271327775e-05, - "loss": 0.008, + "epoch": 0.051139995738333686, + "grad_norm": 0.12410833686590195, + "learning_rate": 1.8977200085233326e-05, + "loss": 0.0066, "step": 480 }, { - "epoch": 0.3078896728672226, - "eval_loss": 0.007339117117226124, - "eval_runtime": 2.7424, - "eval_samples_per_second": 4546.013, - "eval_steps_per_second": 142.211, + "epoch": 0.051139995738333686, + "eval_loss": 0.00759873166680336, + "eval_runtime": 35.0487, + "eval_samples_per_second": 4284.728, + "eval_steps_per_second": 66.964, "step": 480 }, { - "epoch": 0.31430404105195636, - "grad_norm": 0.16523483395576477, - "learning_rate": 1.6856959589480436e-05, - "loss": 0.0088, + "epoch": 0.052205412316215644, + "grad_norm": 0.08035538345575333, + "learning_rate": 1.8955891753675688e-05, + "loss": 0.0023, "step": 490 }, { - "epoch": 0.31430404105195636, - "eval_loss": 0.010751989670097828, - "eval_runtime": 2.7719, - "eval_samples_per_second": 4497.586, - "eval_steps_per_second": 140.696, + "epoch": 0.052205412316215644, + "eval_loss": 0.007343141362071037, + "eval_runtime": 35.0739, + "eval_samples_per_second": 4281.64, + "eval_steps_per_second": 66.916, "step": 490 }, { - "epoch": 0.3207184092366902, - "grad_norm": 0.037684451788663864, - "learning_rate": 1.67928159076331e-05, - "loss": 0.0031, + "epoch": 0.053270828894097594, + "grad_norm": 0.04620998725295067, + "learning_rate": 1.893458342211805e-05, + "loss": 0.0087, "step": 500 }, { - "epoch": 0.3207184092366902, - "eval_loss": 0.007628251798450947, - "eval_runtime": 2.7331, - "eval_samples_per_second": 4561.511, - "eval_steps_per_second": 142.696, + "epoch": 0.053270828894097594, + "eval_loss": 0.007286736276000738, + "eval_runtime": 35.0515, + "eval_samples_per_second": 4284.378, + "eval_steps_per_second": 66.959, "step": 500 }, { - "epoch": 0.327132777421424, - "grad_norm": 0.06561500579118729, - "learning_rate": 1.672867222578576e-05, - "loss": 0.0022, + "epoch": 0.054336245471979544, + "grad_norm": 0.03123115375638008, + "learning_rate": 1.891327509056041e-05, + "loss": 0.0029, "step": 510 }, { - "epoch": 0.327132777421424, - "eval_loss": 0.007246845401823521, - "eval_runtime": 2.7607, - "eval_samples_per_second": 4515.928, - "eval_steps_per_second": 141.27, + "epoch": 0.054336245471979544, + "eval_loss": 0.007131603546440601, + "eval_runtime": 35.0687, + "eval_samples_per_second": 4282.275, + "eval_steps_per_second": 66.926, "step": 510 }, { - "epoch": 0.3335471456061578, - "grad_norm": 0.041504278779029846, - "learning_rate": 1.6664528543938422e-05, - "loss": 0.0019, + "epoch": 0.055401662049861494, + "grad_norm": 0.06445208191871643, + "learning_rate": 1.889196675900277e-05, + "loss": 0.0107, "step": 520 }, { - "epoch": 0.3335471456061578, - "eval_loss": 0.0074407560750842094, - "eval_runtime": 2.7513, - "eval_samples_per_second": 4531.364, - "eval_steps_per_second": 141.753, + "epoch": 0.055401662049861494, + "eval_loss": 0.007308864034712315, + "eval_runtime": 35.0344, + "eval_samples_per_second": 4286.475, + "eval_steps_per_second": 66.991, "step": 520 }, { - "epoch": 0.3399615137908916, - "grad_norm": 0.03790243715047836, - "learning_rate": 1.6600384862091086e-05, - "loss": 0.0019, + "epoch": 0.056467078627743444, + "grad_norm": 0.026168525218963623, + "learning_rate": 1.887065842744513e-05, + "loss": 0.0074, "step": 530 }, { - "epoch": 0.3399615137908916, - "eval_loss": 0.007536882068961859, - "eval_runtime": 2.769, - "eval_samples_per_second": 4502.413, - "eval_steps_per_second": 140.847, + "epoch": 0.056467078627743444, + "eval_loss": 0.007467833813279867, + "eval_runtime": 35.1394, + "eval_samples_per_second": 4273.663, + "eval_steps_per_second": 66.791, "step": 530 }, { - "epoch": 0.3463758819756254, - "grad_norm": 0.0355646014213562, - "learning_rate": 1.6536241180243747e-05, - "loss": 0.0019, + "epoch": 0.0575324952056254, + "grad_norm": 0.06097254157066345, + "learning_rate": 1.8849350095887493e-05, + "loss": 0.0079, "step": 540 }, { - "epoch": 0.3463758819756254, - "eval_loss": 0.0075461240485310555, - "eval_runtime": 2.413, - "eval_samples_per_second": 5166.639, - "eval_steps_per_second": 161.626, + "epoch": 0.0575324952056254, + "eval_loss": 0.006862245034426451, + "eval_runtime": 35.0843, + "eval_samples_per_second": 4280.379, + "eval_steps_per_second": 66.896, "step": 540 }, { - "epoch": 0.3527902501603592, - "grad_norm": 0.04195011779665947, - "learning_rate": 1.6472097498396408e-05, - "loss": 0.0018, + "epoch": 0.05859791178350735, + "grad_norm": 1.4416311979293823, + "learning_rate": 1.8828041764329855e-05, + "loss": 0.003, "step": 550 }, { - "epoch": 0.3527902501603592, - "eval_loss": 0.007501190062612295, - "eval_runtime": 2.4208, - "eval_samples_per_second": 5150.019, - "eval_steps_per_second": 161.106, + "epoch": 0.05859791178350735, + "eval_loss": 0.006868099793791771, + "eval_runtime": 35.0895, + "eval_samples_per_second": 4279.74, + "eval_steps_per_second": 66.886, "step": 550 }, { - "epoch": 0.359204618345093, - "grad_norm": 0.28111106157302856, - "learning_rate": 1.6407953816549073e-05, - "loss": 0.0132, + "epoch": 0.0596633283613893, + "grad_norm": 0.02442428097128868, + "learning_rate": 1.8806733432772214e-05, + "loss": 0.007, "step": 560 }, { - "epoch": 0.359204618345093, - "eval_loss": 0.006708688568323851, - "eval_runtime": 2.4465, - "eval_samples_per_second": 5095.942, - "eval_steps_per_second": 159.414, + "epoch": 0.0596633283613893, + "eval_loss": 0.0072341980412602425, + "eval_runtime": 35.1045, + "eval_samples_per_second": 4277.91, + "eval_steps_per_second": 66.857, "step": 560 }, { - "epoch": 0.36561898652982683, - "grad_norm": 0.030719993636012077, - "learning_rate": 1.6343810134701734e-05, - "loss": 0.0018, + "epoch": 0.06072874493927125, + "grad_norm": 0.024394547566771507, + "learning_rate": 1.8785425101214576e-05, + "loss": 0.0202, "step": 570 }, { - "epoch": 0.36561898652982683, - "eval_loss": 0.00691966200247407, - "eval_runtime": 2.4163, - "eval_samples_per_second": 5159.519, - "eval_steps_per_second": 161.403, + "epoch": 0.06072874493927125, + "eval_loss": 0.009562548249959946, + "eval_runtime": 35.0609, + "eval_samples_per_second": 4283.231, + "eval_steps_per_second": 66.941, "step": 570 }, { - "epoch": 0.3720333547145606, - "grad_norm": 0.028909320011734962, - "learning_rate": 1.6279666452854395e-05, - "loss": 0.0018, + "epoch": 0.06179416151715321, + "grad_norm": 0.14542266726493835, + "learning_rate": 1.8764116769656937e-05, + "loss": 0.0023, "step": 580 }, { - "epoch": 0.3720333547145606, - "eval_loss": 0.006832743063569069, - "eval_runtime": 2.4073, - "eval_samples_per_second": 5178.73, - "eval_steps_per_second": 162.004, + "epoch": 0.06179416151715321, + "eval_loss": 0.0071628945879638195, + "eval_runtime": 35.1203, + "eval_samples_per_second": 4275.993, + "eval_steps_per_second": 66.828, "step": 580 }, { - "epoch": 0.3784477228992944, - "grad_norm": 0.5586157441139221, - "learning_rate": 1.621552277100706e-05, - "loss": 0.0017, + "epoch": 0.06285957809503516, + "grad_norm": 0.027121223509311676, + "learning_rate": 1.87428084380993e-05, + "loss": 0.007, "step": 590 }, { - "epoch": 0.3784477228992944, - "eval_loss": 0.0066373394802212715, - "eval_runtime": 2.4151, - "eval_samples_per_second": 5162.11, - "eval_steps_per_second": 161.484, + "epoch": 0.06285957809503516, + "eval_loss": 0.00703906686976552, + "eval_runtime": 35.0628, + "eval_samples_per_second": 4283.001, + "eval_steps_per_second": 66.937, "step": 590 }, { - "epoch": 0.38486209108402825, - "grad_norm": 0.029421871528029442, - "learning_rate": 1.615137908915972e-05, - "loss": 0.0015, + "epoch": 0.06392499467291711, + "grad_norm": 0.052346475422382355, + "learning_rate": 1.8721500106541658e-05, + "loss": 0.0104, "step": 600 }, { - "epoch": 0.38486209108402825, - "eval_loss": 0.0064350636675953865, - "eval_runtime": 2.4015, - "eval_samples_per_second": 5191.282, - "eval_steps_per_second": 162.397, + "epoch": 0.06392499467291711, + "eval_loss": 0.006780738476663828, + "eval_runtime": 35.0665, + "eval_samples_per_second": 4282.547, + "eval_steps_per_second": 66.93, "step": 600 }, { - "epoch": 0.39127645926876203, - "grad_norm": 0.028947781771421432, - "learning_rate": 1.608723540731238e-05, - "loss": 0.0022, + "epoch": 0.06499041125079906, + "grad_norm": 0.026305731385946274, + "learning_rate": 1.870019177498402e-05, + "loss": 0.0021, "step": 610 }, { - "epoch": 0.39127645926876203, - "eval_loss": 0.00638926774263382, - "eval_runtime": 2.414, - "eval_samples_per_second": 5164.477, - "eval_steps_per_second": 161.558, + "epoch": 0.06499041125079906, + "eval_loss": 0.006772972177714109, + "eval_runtime": 35.0776, + "eval_samples_per_second": 4281.194, + "eval_steps_per_second": 66.909, "step": 610 }, { - "epoch": 0.3976908274534958, - "grad_norm": 0.06888407468795776, - "learning_rate": 1.6023091725465042e-05, - "loss": 0.0016, + "epoch": 0.06605582782868101, + "grad_norm": 0.029565811157226562, + "learning_rate": 1.867888344342638e-05, + "loss": 0.0033, "step": 620 }, { - "epoch": 0.3976908274534958, - "eval_loss": 0.006634836550801992, - "eval_runtime": 2.4135, - "eval_samples_per_second": 5165.479, - "eval_steps_per_second": 161.59, + "epoch": 0.06605582782868101, + "eval_loss": 0.00664109131321311, + "eval_runtime": 35.0977, + "eval_samples_per_second": 4278.741, + "eval_steps_per_second": 66.87, "step": 620 }, { - "epoch": 0.4041051956382296, - "grad_norm": 0.04158307611942291, - "learning_rate": 1.5958948043617706e-05, - "loss": 0.0187, + "epoch": 0.06712124440656296, + "grad_norm": 0.020861292257905006, + "learning_rate": 1.8657575111868743e-05, + "loss": 0.0016, "step": 630 }, { - "epoch": 0.4041051956382296, - "eval_loss": 0.006342691835016012, - "eval_runtime": 2.4083, - "eval_samples_per_second": 5176.599, - "eval_steps_per_second": 161.937, + "epoch": 0.06712124440656296, + "eval_loss": 0.006744803860783577, + "eval_runtime": 35.0927, + "eval_samples_per_second": 4279.354, + "eval_steps_per_second": 66.88, "step": 630 }, { - "epoch": 0.41051956382296345, - "grad_norm": 0.02934635430574417, - "learning_rate": 1.5894804361770367e-05, - "loss": 0.0015, + "epoch": 0.06818666098444492, + "grad_norm": 0.08481285721063614, + "learning_rate": 1.86362667803111e-05, + "loss": 0.0021, "step": 640 }, { - "epoch": 0.41051956382296345, - "eval_loss": 0.00618335185572505, - "eval_runtime": 2.4101, - "eval_samples_per_second": 5172.867, - "eval_steps_per_second": 161.821, + "epoch": 0.06818666098444492, + "eval_loss": 0.007654301356524229, + "eval_runtime": 35.0712, + "eval_samples_per_second": 4281.973, + "eval_steps_per_second": 66.921, "step": 640 }, { - "epoch": 0.41693393200769724, - "grad_norm": 0.030286366119980812, - "learning_rate": 1.5830660679923028e-05, - "loss": 0.0015, + "epoch": 0.06925207756232687, + "grad_norm": 0.0536779910326004, + "learning_rate": 1.8614958448753463e-05, + "loss": 0.0182, "step": 650 }, { - "epoch": 0.41693393200769724, - "eval_loss": 0.006189233157783747, - "eval_runtime": 2.4471, - "eval_samples_per_second": 5094.534, - "eval_steps_per_second": 159.37, + "epoch": 0.06925207756232687, + "eval_loss": 0.00671360595151782, + "eval_runtime": 35.107, + "eval_samples_per_second": 4277.604, + "eval_steps_per_second": 66.853, "step": 650 }, { - "epoch": 0.423348300192431, - "grad_norm": 0.030619481578469276, - "learning_rate": 1.5766516998075692e-05, - "loss": 0.0016, + "epoch": 0.07031749414020882, + "grad_norm": 0.5193659067153931, + "learning_rate": 1.8593650117195825e-05, + "loss": 0.0069, "step": 660 }, { - "epoch": 0.423348300192431, - "eval_loss": 0.0063001601956784725, - "eval_runtime": 2.417, - "eval_samples_per_second": 5157.96, - "eval_steps_per_second": 161.354, + "epoch": 0.07031749414020882, + "eval_loss": 0.00693098409101367, + "eval_runtime": 35.1433, + "eval_samples_per_second": 4273.191, + "eval_steps_per_second": 66.784, "step": 660 }, { - "epoch": 0.42976266837716487, - "grad_norm": 11.238213539123535, - "learning_rate": 1.5702373316228353e-05, - "loss": 0.0194, + "epoch": 0.07138291071809078, + "grad_norm": 0.020888999104499817, + "learning_rate": 1.8572341785638187e-05, + "loss": 0.0013, "step": 670 }, { - "epoch": 0.42976266837716487, - "eval_loss": 0.006720089819282293, - "eval_runtime": 2.6201, - "eval_samples_per_second": 4758.272, - "eval_steps_per_second": 148.851, + "epoch": 0.07138291071809078, + "eval_loss": 0.006469358690083027, + "eval_runtime": 35.1021, + "eval_samples_per_second": 4278.206, + "eval_steps_per_second": 66.862, "step": 670 }, { - "epoch": 0.43617703656189866, - "grad_norm": 0.02961154095828533, - "learning_rate": 1.5638229634381014e-05, + "epoch": 0.07244832729597273, + "grad_norm": 0.019426677376031876, + "learning_rate": 1.8551033454080546e-05, "loss": 0.0014, "step": 680 }, { - "epoch": 0.43617703656189866, - "eval_loss": 0.006322733126580715, - "eval_runtime": 2.7635, - "eval_samples_per_second": 4511.389, - "eval_steps_per_second": 141.128, + "epoch": 0.07244832729597273, + "eval_loss": 0.006414386909455061, + "eval_runtime": 35.1154, + "eval_samples_per_second": 4276.585, + "eval_steps_per_second": 66.837, "step": 680 }, { - "epoch": 0.44259140474663244, - "grad_norm": 0.03423071652650833, - "learning_rate": 1.557408595253368e-05, - "loss": 0.013, + "epoch": 0.07351374387385468, + "grad_norm": 0.019121970981359482, + "learning_rate": 1.8529725122522907e-05, + "loss": 0.0015, "step": 690 }, { - "epoch": 0.44259140474663244, - "eval_loss": 0.005984561517834663, - "eval_runtime": 2.7417, - "eval_samples_per_second": 4547.152, - "eval_steps_per_second": 142.247, + "epoch": 0.07351374387385468, + "eval_loss": 0.00648439209908247, + "eval_runtime": 35.1, + "eval_samples_per_second": 4278.46, + "eval_steps_per_second": 66.866, "step": 690 }, { - "epoch": 0.4490057729313663, - "grad_norm": 0.04294706881046295, - "learning_rate": 1.550994227068634e-05, - "loss": 0.0226, + "epoch": 0.07457916045173663, + "grad_norm": 0.02870786562561989, + "learning_rate": 1.850841679096527e-05, + "loss": 0.0161, "step": 700 }, { - "epoch": 0.4490057729313663, - "eval_loss": 0.006303212605416775, - "eval_runtime": 2.783, - "eval_samples_per_second": 4479.762, - "eval_steps_per_second": 140.139, + "epoch": 0.07457916045173663, + "eval_loss": 0.006600781809538603, + "eval_runtime": 35.1446, + "eval_samples_per_second": 4273.032, + "eval_steps_per_second": 66.781, "step": 700 }, { - "epoch": 0.4554201411161001, - "grad_norm": 0.04019453376531601, - "learning_rate": 1.5445798588839e-05, - "loss": 0.0016, + "epoch": 0.07564457702961858, + "grad_norm": 0.048932578414678574, + "learning_rate": 1.848710845940763e-05, + "loss": 0.0033, "step": 710 }, { - "epoch": 0.4554201411161001, - "eval_loss": 0.0058699618093669415, - "eval_runtime": 2.7406, - "eval_samples_per_second": 4549.084, - "eval_steps_per_second": 142.307, + "epoch": 0.07564457702961858, + "eval_loss": 0.00685811135917902, + "eval_runtime": 35.0851, + "eval_samples_per_second": 4280.274, + "eval_steps_per_second": 66.894, "step": 710 }, { - "epoch": 0.46183450930083386, - "grad_norm": 0.0327487550675869, - "learning_rate": 1.5381654906991665e-05, - "loss": 0.0018, + "epoch": 0.07670999360750053, + "grad_norm": 0.020688414573669434, + "learning_rate": 1.846580012784999e-05, + "loss": 0.0022, "step": 720 }, { - "epoch": 0.46183450930083386, - "eval_loss": 0.0058822124265134335, - "eval_runtime": 2.7933, - "eval_samples_per_second": 4463.192, - "eval_steps_per_second": 139.62, + "epoch": 0.07670999360750053, + "eval_loss": 0.006442280951887369, + "eval_runtime": 35.0598, + "eval_samples_per_second": 4283.366, + "eval_steps_per_second": 66.943, "step": 720 }, { - "epoch": 0.46824887748556765, - "grad_norm": 0.025891833007335663, - "learning_rate": 1.5317511225144322e-05, - "loss": 0.0013, + "epoch": 0.07777541018538249, + "grad_norm": 0.02903449535369873, + "learning_rate": 1.844449179629235e-05, + "loss": 0.0167, "step": 730 }, { - "epoch": 0.46824887748556765, - "eval_loss": 0.00716716842725873, - "eval_runtime": 2.7388, - "eval_samples_per_second": 4551.919, - "eval_steps_per_second": 142.396, + "epoch": 0.07777541018538249, + "eval_loss": 0.0070753456093370914, + "eval_runtime": 35.0516, + "eval_samples_per_second": 4284.367, + "eval_steps_per_second": 66.958, "step": 730 }, { - "epoch": 0.4746632456703015, - "grad_norm": 0.03438916057348251, - "learning_rate": 1.5253367543296987e-05, - "loss": 0.0015, + "epoch": 0.07884082676326444, + "grad_norm": 0.02733609639108181, + "learning_rate": 1.8423183464734713e-05, + "loss": 0.0014, "step": 740 }, { - "epoch": 0.4746632456703015, - "eval_loss": 0.007040859200060368, - "eval_runtime": 2.7963, - "eval_samples_per_second": 4458.353, - "eval_steps_per_second": 139.469, + "epoch": 0.07884082676326444, + "eval_loss": 0.007332003675401211, + "eval_runtime": 35.1395, + "eval_samples_per_second": 4273.647, + "eval_steps_per_second": 66.791, "step": 740 }, { - "epoch": 0.4810776138550353, - "grad_norm": 0.02543172985315323, - "learning_rate": 1.518922386144965e-05, - "loss": 0.0025, + "epoch": 0.07990624334114639, + "grad_norm": 0.2964134216308594, + "learning_rate": 1.8401875133177075e-05, + "loss": 0.0061, "step": 750 }, { - "epoch": 0.4810776138550353, - "eval_loss": 0.006040054839104414, - "eval_runtime": 2.7444, - "eval_samples_per_second": 4542.754, - "eval_steps_per_second": 142.109, + "epoch": 0.07990624334114639, + "eval_loss": 0.006313994061201811, + "eval_runtime": 35.0697, + "eval_samples_per_second": 4282.154, + "eval_steps_per_second": 66.924, "step": 750 }, { - "epoch": 0.48749198203976907, - "grad_norm": 0.02564307488501072, - "learning_rate": 1.5125080179602309e-05, + "epoch": 0.08097165991902834, + "grad_norm": 0.01741037145256996, + "learning_rate": 1.8380566801619433e-05, "loss": 0.0012, "step": 760 }, { - "epoch": 0.48749198203976907, - "eval_loss": 0.006054045632481575, - "eval_runtime": 2.7961, - "eval_samples_per_second": 4458.733, - "eval_steps_per_second": 139.481, + "epoch": 0.08097165991902834, + "eval_loss": 0.006638936698436737, + "eval_runtime": 35.0747, + "eval_samples_per_second": 4281.544, + "eval_steps_per_second": 66.914, "step": 760 }, { - "epoch": 0.4939063502245029, - "grad_norm": 0.026773959398269653, - "learning_rate": 1.5060936497754971e-05, - "loss": 0.0084, + "epoch": 0.08203707649691029, + "grad_norm": 0.016884565353393555, + "learning_rate": 1.8359258470061795e-05, + "loss": 0.0072, "step": 770 }, { - "epoch": 0.4939063502245029, - "eval_loss": 0.006040018983185291, - "eval_runtime": 2.7423, - "eval_samples_per_second": 4546.213, - "eval_steps_per_second": 142.217, + "epoch": 0.08203707649691029, + "eval_loss": 0.006715176161378622, + "eval_runtime": 35.0656, + "eval_samples_per_second": 4282.664, + "eval_steps_per_second": 66.932, "step": 770 }, { - "epoch": 0.5003207184092366, - "grad_norm": 0.02138395607471466, - "learning_rate": 1.4996792815907636e-05, - "loss": 0.0022, + "epoch": 0.08310249307479224, + "grad_norm": 0.023544272407889366, + "learning_rate": 1.8337950138504157e-05, + "loss": 0.0115, "step": 780 }, { - "epoch": 0.5003207184092366, - "eval_loss": 0.006401837337762117, - "eval_runtime": 2.7962, - "eval_samples_per_second": 4458.477, - "eval_steps_per_second": 139.473, + "epoch": 0.08310249307479224, + "eval_loss": 0.006417686585336924, + "eval_runtime": 35.0831, + "eval_samples_per_second": 4280.519, + "eval_steps_per_second": 66.898, "step": 780 }, { - "epoch": 0.5067350865939705, - "grad_norm": 0.021893974393606186, - "learning_rate": 1.4932649134060295e-05, - "loss": 0.0032, + "epoch": 0.08416790965267419, + "grad_norm": 0.03194332495331764, + "learning_rate": 1.831664180694652e-05, + "loss": 0.0023, "step": 790 }, { - "epoch": 0.5067350865939705, - "eval_loss": 0.006350088398903608, - "eval_runtime": 2.7416, - "eval_samples_per_second": 4547.347, - "eval_steps_per_second": 142.253, + "epoch": 0.08416790965267419, + "eval_loss": 0.00630133505910635, + "eval_runtime": 35.0749, + "eval_samples_per_second": 4281.519, + "eval_steps_per_second": 66.914, "step": 790 }, { - "epoch": 0.5131494547787043, - "grad_norm": 0.026742270216345787, - "learning_rate": 1.4868505452212958e-05, - "loss": 0.0492, + "epoch": 0.08523332623055614, + "grad_norm": 0.017488490790128708, + "learning_rate": 1.8295333475388877e-05, + "loss": 0.0117, "step": 800 }, { - "epoch": 0.5131494547787043, - "eval_loss": 0.005826563574373722, - "eval_runtime": 2.7898, - "eval_samples_per_second": 4468.753, - "eval_steps_per_second": 139.794, + "epoch": 0.08523332623055614, + "eval_loss": 0.006475712638348341, + "eval_runtime": 35.0689, + "eval_samples_per_second": 4282.252, + "eval_steps_per_second": 66.925, "step": 800 }, { - "epoch": 0.5195638229634381, - "grad_norm": 0.03277752548456192, - "learning_rate": 1.480436177036562e-05, - "loss": 0.0014, + "epoch": 0.0862987428084381, + "grad_norm": 0.8892996311187744, + "learning_rate": 1.827402514383124e-05, + "loss": 0.0157, "step": 810 }, { - "epoch": 0.5195638229634381, - "eval_loss": 0.005615294445306063, - "eval_runtime": 2.743, - "eval_samples_per_second": 4545.047, - "eval_steps_per_second": 142.181, + "epoch": 0.0862987428084381, + "eval_loss": 0.005997061729431152, + "eval_runtime": 35.0766, + "eval_samples_per_second": 4281.311, + "eval_steps_per_second": 66.911, "step": 810 }, { - "epoch": 0.5259781911481719, - "grad_norm": 0.03740512579679489, - "learning_rate": 1.4740218088518281e-05, - "loss": 0.0014, + "epoch": 0.08736415938632006, + "grad_norm": 0.019312532618641853, + "learning_rate": 1.82527168122736e-05, + "loss": 0.0204, "step": 820 }, { - "epoch": 0.5259781911481719, - "eval_loss": 0.0055384160950779915, - "eval_runtime": 2.7995, - "eval_samples_per_second": 4453.26, - "eval_steps_per_second": 139.309, + "epoch": 0.08736415938632006, + "eval_loss": 0.0064412918873131275, + "eval_runtime": 35.0849, + "eval_samples_per_second": 4280.301, + "eval_steps_per_second": 66.895, "step": 820 }, { - "epoch": 0.5323925593329057, - "grad_norm": 0.02818211168050766, - "learning_rate": 1.4676074406670944e-05, - "loss": 0.0016, + "epoch": 0.088429575964202, + "grad_norm": 0.07955110818147659, + "learning_rate": 1.8231408480715963e-05, + "loss": 0.0084, "step": 830 }, { - "epoch": 0.5323925593329057, - "eval_loss": 0.006212199572473764, - "eval_runtime": 2.747, - "eval_samples_per_second": 4538.341, - "eval_steps_per_second": 141.971, + "epoch": 0.088429575964202, + "eval_loss": 0.006166558247059584, + "eval_runtime": 35.1204, + "eval_samples_per_second": 4275.979, + "eval_steps_per_second": 66.827, "step": 830 }, { - "epoch": 0.5388069275176395, - "grad_norm": 0.02874821238219738, - "learning_rate": 1.4611930724823606e-05, - "loss": 0.0023, + "epoch": 0.08949499254208396, + "grad_norm": 0.2547236680984497, + "learning_rate": 1.821010014915832e-05, + "loss": 0.0036, "step": 840 }, { - "epoch": 0.5388069275176395, - "eval_loss": 0.0062833670526742935, - "eval_runtime": 2.7804, - "eval_samples_per_second": 4483.918, - "eval_steps_per_second": 140.269, + "epoch": 0.08949499254208396, + "eval_loss": 0.0059661865234375, + "eval_runtime": 35.1305, + "eval_samples_per_second": 4274.751, + "eval_steps_per_second": 66.808, "step": 840 }, { - "epoch": 0.5452212957023733, - "grad_norm": 15.998806953430176, - "learning_rate": 1.4547787042976269e-05, - "loss": 0.0069, + "epoch": 0.0905604091199659, + "grad_norm": 0.028440352529287338, + "learning_rate": 1.8188791817600683e-05, + "loss": 0.0026, "step": 850 }, { - "epoch": 0.5452212957023733, - "eval_loss": 0.005439049564301968, - "eval_runtime": 2.7374, - "eval_samples_per_second": 4554.31, - "eval_steps_per_second": 142.471, + "epoch": 0.0905604091199659, + "eval_loss": 0.0058583482168614864, + "eval_runtime": 35.0777, + "eval_samples_per_second": 4281.186, + "eval_steps_per_second": 66.909, "step": 850 }, { - "epoch": 0.5516356638871072, - "grad_norm": 0.13971057534217834, - "learning_rate": 1.448364336112893e-05, - "loss": 0.0104, + "epoch": 0.09162582569784786, + "grad_norm": 0.015186217613518238, + "learning_rate": 1.8167483486043045e-05, + "loss": 0.0053, "step": 860 }, { - "epoch": 0.5516356638871072, - "eval_loss": 0.007052511442452669, - "eval_runtime": 2.7895, - "eval_samples_per_second": 4469.333, - "eval_steps_per_second": 139.812, + "epoch": 0.09162582569784786, + "eval_loss": 0.0058418079279363155, + "eval_runtime": 35.0918, + "eval_samples_per_second": 4279.466, + "eval_steps_per_second": 66.882, "step": 860 }, { - "epoch": 0.5580500320718409, - "grad_norm": 1.057011604309082, - "learning_rate": 1.4419499679281593e-05, - "loss": 0.0205, + "epoch": 0.09269124227572981, + "grad_norm": 0.0264381542801857, + "learning_rate": 1.8146175154485407e-05, + "loss": 0.0071, "step": 870 }, { - "epoch": 0.5580500320718409, - "eval_loss": 0.006652463227510452, - "eval_runtime": 2.4224, - "eval_samples_per_second": 5146.536, - "eval_steps_per_second": 160.997, + "epoch": 0.09269124227572981, + "eval_loss": 0.0058640833012759686, + "eval_runtime": 35.0547, + "eval_samples_per_second": 4283.985, + "eval_steps_per_second": 66.952, "step": 870 }, { - "epoch": 0.5644644002565747, - "grad_norm": 0.02542971819639206, - "learning_rate": 1.4355355997434255e-05, - "loss": 0.0033, + "epoch": 0.09375665885361176, + "grad_norm": 0.04548242315649986, + "learning_rate": 1.8124866822927765e-05, + "loss": 0.0047, "step": 880 }, { - "epoch": 0.5644644002565747, - "eval_loss": 0.005239278543740511, - "eval_runtime": 2.4966, - "eval_samples_per_second": 4993.593, - "eval_steps_per_second": 156.213, + "epoch": 0.09375665885361176, + "eval_loss": 0.005854357033967972, + "eval_runtime": 35.0612, + "eval_samples_per_second": 4283.196, + "eval_steps_per_second": 66.94, "step": 880 }, { - "epoch": 0.5708787684413086, - "grad_norm": 0.024873876944184303, - "learning_rate": 1.4291212315586915e-05, - "loss": 0.0011, + "epoch": 0.09482207543149371, + "grad_norm": 0.05402829125523567, + "learning_rate": 1.8103558491370127e-05, + "loss": 0.0017, "step": 890 }, { - "epoch": 0.5708787684413086, - "eval_loss": 0.006826899945735931, - "eval_runtime": 2.4524, - "eval_samples_per_second": 5083.67, - "eval_steps_per_second": 159.03, + "epoch": 0.09482207543149371, + "eval_loss": 0.0061132400296628475, + "eval_runtime": 35.0448, + "eval_samples_per_second": 4285.206, + "eval_steps_per_second": 66.972, "step": 890 }, { - "epoch": 0.5772931366260423, - "grad_norm": 0.022238241508603096, - "learning_rate": 1.4227068633739577e-05, - "loss": 0.0012, + "epoch": 0.09588749200937567, + "grad_norm": 0.23751606047153473, + "learning_rate": 1.808225015981249e-05, + "loss": 0.006, "step": 900 }, { - "epoch": 0.5772931366260423, - "eval_loss": 0.0071626221761107445, - "eval_runtime": 2.4183, - "eval_samples_per_second": 5155.357, - "eval_steps_per_second": 161.273, + "epoch": 0.09588749200937567, + "eval_loss": 0.006291827652603388, + "eval_runtime": 35.0656, + "eval_samples_per_second": 4282.655, + "eval_steps_per_second": 66.932, "step": 900 }, { - "epoch": 0.5837075048107762, - "grad_norm": 0.029160836711525917, - "learning_rate": 1.416292495189224e-05, - "loss": 0.0012, + "epoch": 0.09695290858725762, + "grad_norm": 0.03594108670949936, + "learning_rate": 1.806094182825485e-05, + "loss": 0.0021, "step": 910 }, { - "epoch": 0.5837075048107762, - "eval_loss": 0.007146148942410946, - "eval_runtime": 2.4316, - "eval_samples_per_second": 5127.12, - "eval_steps_per_second": 160.39, + "epoch": 0.09695290858725762, + "eval_loss": 0.006167920306324959, + "eval_runtime": 35.0452, + "eval_samples_per_second": 4285.146, + "eval_steps_per_second": 66.971, "step": 910 }, { - "epoch": 0.5901218729955099, - "grad_norm": 0.020332586020231247, - "learning_rate": 1.40987812700449e-05, - "loss": 0.001, + "epoch": 0.09801832516513957, + "grad_norm": 0.013563692569732666, + "learning_rate": 1.803963349669721e-05, + "loss": 0.0057, "step": 920 }, { - "epoch": 0.5901218729955099, - "eval_loss": 0.007053141016513109, - "eval_runtime": 2.4123, - "eval_samples_per_second": 5168.073, - "eval_steps_per_second": 161.671, + "epoch": 0.09801832516513957, + "eval_loss": 0.006407948210835457, + "eval_runtime": 35.0591, + "eval_samples_per_second": 4283.448, + "eval_steps_per_second": 66.944, "step": 920 }, { - "epoch": 0.5965362411802437, - "grad_norm": 0.019170017912983894, - "learning_rate": 1.4034637588197563e-05, - "loss": 0.001, + "epoch": 0.09908374174302152, + "grad_norm": 0.013167720288038254, + "learning_rate": 1.801832516513957e-05, + "loss": 0.0043, "step": 930 }, { - "epoch": 0.5965362411802437, - "eval_loss": 0.006995479576289654, - "eval_runtime": 2.4138, - "eval_samples_per_second": 5164.946, - "eval_steps_per_second": 161.573, + "epoch": 0.09908374174302152, + "eval_loss": 0.005963355768471956, + "eval_runtime": 34.9944, + "eval_samples_per_second": 4291.378, + "eval_steps_per_second": 67.068, "step": 930 }, { - "epoch": 0.6029506093649776, - "grad_norm": 0.01940876618027687, - "learning_rate": 1.3970493906350226e-05, - "loss": 0.001, + "epoch": 0.10014915832090347, + "grad_norm": 0.6629673838615417, + "learning_rate": 1.7997016833581933e-05, + "loss": 0.0263, "step": 940 }, { - "epoch": 0.6029506093649776, - "eval_loss": 0.006961450912058353, - "eval_runtime": 2.4248, - "eval_samples_per_second": 5141.427, - "eval_steps_per_second": 160.837, + "epoch": 0.10014915832090347, + "eval_loss": 0.00560146477073431, + "eval_runtime": 35.028, + "eval_samples_per_second": 4287.256, + "eval_steps_per_second": 67.004, "step": 940 }, { - "epoch": 0.6093649775497113, - "grad_norm": 0.023615067824721336, - "learning_rate": 1.3906350224502887e-05, - "loss": 0.0205, + "epoch": 0.10121457489878542, + "grad_norm": 0.01741507463157177, + "learning_rate": 1.7975708502024295e-05, + "loss": 0.0075, "step": 950 }, { - "epoch": 0.6093649775497113, - "eval_loss": 0.006625923793762922, - "eval_runtime": 2.4163, - "eval_samples_per_second": 5159.503, - "eval_steps_per_second": 161.403, + "epoch": 0.10121457489878542, + "eval_loss": 0.006059127859771252, + "eval_runtime": 35.0757, + "eval_samples_per_second": 4281.429, + "eval_steps_per_second": 66.912, "step": 950 }, { - "epoch": 0.6157793457344451, - "grad_norm": 0.35211464762687683, - "learning_rate": 1.384220654265555e-05, - "loss": 0.0025, + "epoch": 0.10227999147666737, + "grad_norm": 0.11731712520122528, + "learning_rate": 1.7954400170466653e-05, + "loss": 0.0034, "step": 960 }, { - "epoch": 0.6157793457344451, - "eval_loss": 0.005356335546821356, - "eval_runtime": 2.4192, - "eval_samples_per_second": 5153.285, - "eval_steps_per_second": 161.208, + "epoch": 0.10227999147666737, + "eval_loss": 0.005581183824688196, + "eval_runtime": 35.0641, + "eval_samples_per_second": 4282.843, + "eval_steps_per_second": 66.935, "step": 960 }, { - "epoch": 0.622193713919179, - "grad_norm": 0.028275813907384872, - "learning_rate": 1.3778062860808212e-05, - "loss": 0.0127, + "epoch": 0.10334540805454932, + "grad_norm": 0.15285035967826843, + "learning_rate": 1.7933091838909015e-05, + "loss": 0.0138, "step": 970 }, { - "epoch": 0.622193713919179, - "eval_loss": 0.0050421105697751045, - "eval_runtime": 2.4233, - "eval_samples_per_second": 5144.725, - "eval_steps_per_second": 160.94, + "epoch": 0.10334540805454932, + "eval_loss": 0.005551150534301996, + "eval_runtime": 35.0458, + "eval_samples_per_second": 4285.083, + "eval_steps_per_second": 66.97, "step": 970 }, { - "epoch": 0.6286080821039127, - "grad_norm": 0.026007099077105522, - "learning_rate": 1.3713919178960873e-05, - "loss": 0.001, + "epoch": 0.10441082463243129, + "grad_norm": 0.0942121297121048, + "learning_rate": 1.7911783507351377e-05, + "loss": 0.0033, "step": 980 }, { - "epoch": 0.6286080821039127, - "eval_loss": 0.004812104627490044, - "eval_runtime": 2.4411, - "eval_samples_per_second": 5107.113, - "eval_steps_per_second": 159.764, + "epoch": 0.10441082463243129, + "eval_loss": 0.005906397942453623, + "eval_runtime": 35.0753, + "eval_samples_per_second": 4281.47, + "eval_steps_per_second": 66.913, "step": 980 }, { - "epoch": 0.6350224502886466, - "grad_norm": 0.030545761808753014, - "learning_rate": 1.3649775497113536e-05, - "loss": 0.001, + "epoch": 0.10547624121031324, + "grad_norm": 0.01376664824783802, + "learning_rate": 1.789047517579374e-05, + "loss": 0.0013, "step": 990 }, { - "epoch": 0.6350224502886466, - "eval_loss": 0.004728221334517002, - "eval_runtime": 2.4142, - "eval_samples_per_second": 5164.069, - "eval_steps_per_second": 161.545, + "epoch": 0.10547624121031324, + "eval_loss": 0.006121132522821426, + "eval_runtime": 35.0395, + "eval_samples_per_second": 4285.847, + "eval_steps_per_second": 66.982, "step": 990 }, { - "epoch": 0.6414368184733804, - "grad_norm": 0.018988870084285736, - "learning_rate": 1.3585631815266199e-05, - "loss": 0.0018, + "epoch": 0.10654165778819519, + "grad_norm": 1.00563645362854, + "learning_rate": 1.7869166844236097e-05, + "loss": 0.005, "step": 1000 }, { - "epoch": 0.6414368184733804, - "eval_loss": 0.004783857148140669, - "eval_runtime": 2.4228, - "eval_samples_per_second": 5145.609, - "eval_steps_per_second": 160.968, + "epoch": 0.10654165778819519, + "eval_loss": 0.005701792426407337, + "eval_runtime": 34.9988, + "eval_samples_per_second": 4290.838, + "eval_steps_per_second": 67.06, "step": 1000 }, { - "epoch": 0.6478511866581141, - "grad_norm": 0.022379985079169273, - "learning_rate": 1.3521488133418858e-05, - "loss": 0.0009, + "epoch": 0.10760707436607714, + "grad_norm": 0.012580779381096363, + "learning_rate": 1.784785851267846e-05, + "loss": 0.017, "step": 1010 }, { - "epoch": 0.6478511866581141, - "eval_loss": 0.005252769682556391, - "eval_runtime": 2.6978, - "eval_samples_per_second": 4621.196, - "eval_steps_per_second": 144.563, + "epoch": 0.10760707436607714, + "eval_loss": 0.006706910207867622, + "eval_runtime": 35.1083, + "eval_samples_per_second": 4277.451, + "eval_steps_per_second": 66.85, "step": 1010 }, { - "epoch": 0.654265554842848, - "grad_norm": 0.04967594891786575, - "learning_rate": 1.345734445157152e-05, - "loss": 0.001, + "epoch": 0.10867249094395909, + "grad_norm": 0.6065902709960938, + "learning_rate": 1.782655018112082e-05, + "loss": 0.0074, "step": 1020 }, { - "epoch": 0.654265554842848, - "eval_loss": 0.005410985555499792, - "eval_runtime": 2.7624, - "eval_samples_per_second": 4513.09, - "eval_steps_per_second": 141.181, + "epoch": 0.10867249094395909, + "eval_loss": 0.005461184773594141, + "eval_runtime": 35.2265, + "eval_samples_per_second": 4263.103, + "eval_steps_per_second": 66.626, "step": 1020 }, { - "epoch": 0.6606799230275818, - "grad_norm": 0.021347397938370705, - "learning_rate": 1.3393200769724183e-05, - "loss": 0.001, + "epoch": 0.10973790752184104, + "grad_norm": 0.5651019215583801, + "learning_rate": 1.780524184956318e-05, + "loss": 0.0072, "step": 1030 }, { - "epoch": 0.6606799230275818, - "eval_loss": 0.005168409552425146, - "eval_runtime": 2.811, - "eval_samples_per_second": 4435.142, - "eval_steps_per_second": 138.743, + "epoch": 0.10973790752184104, + "eval_loss": 0.005571197718381882, + "eval_runtime": 35.0715, + "eval_samples_per_second": 4281.944, + "eval_steps_per_second": 66.921, "step": 1030 }, { - "epoch": 0.6670942912123156, - "grad_norm": 0.038585614413022995, - "learning_rate": 1.3329057087876844e-05, - "loss": 0.0082, + "epoch": 0.11080332409972299, + "grad_norm": 0.015482685528695583, + "learning_rate": 1.778393351800554e-05, + "loss": 0.0089, "step": 1040 }, { - "epoch": 0.6670942912123156, - "eval_loss": 0.006633167155086994, - "eval_runtime": 2.7415, - "eval_samples_per_second": 4547.528, - "eval_steps_per_second": 142.258, + "epoch": 0.11080332409972299, + "eval_loss": 0.008133814670145512, + "eval_runtime": 35.2657, + "eval_samples_per_second": 4258.363, + "eval_steps_per_second": 66.552, "step": 1040 }, { - "epoch": 0.6735086593970494, - "grad_norm": 0.02415415085852146, - "learning_rate": 1.3264913406029507e-05, - "loss": 0.0009, + "epoch": 0.11186874067760494, + "grad_norm": 0.016248241066932678, + "learning_rate": 1.7762625186447903e-05, + "loss": 0.004, "step": 1050 }, { - "epoch": 0.6735086593970494, - "eval_loss": 0.00695871701464057, - "eval_runtime": 2.7897, - "eval_samples_per_second": 4468.88, - "eval_steps_per_second": 139.798, + "epoch": 0.11186874067760494, + "eval_loss": 0.00686853239312768, + "eval_runtime": 35.1303, + "eval_samples_per_second": 4274.77, + "eval_steps_per_second": 66.808, "step": 1050 }, { - "epoch": 0.6799230275817832, - "grad_norm": 0.03703638166189194, - "learning_rate": 1.320076972418217e-05, - "loss": 0.0033, + "epoch": 0.11293415725548689, + "grad_norm": 0.16063354909420013, + "learning_rate": 1.774131685489026e-05, + "loss": 0.018, "step": 1060 }, { - "epoch": 0.6799230275817832, - "eval_loss": 0.004926735535264015, - "eval_runtime": 2.747, - "eval_samples_per_second": 4538.411, - "eval_steps_per_second": 141.973, + "epoch": 0.11293415725548689, + "eval_loss": 0.005687262862920761, + "eval_runtime": 35.0941, + "eval_samples_per_second": 4279.181, + "eval_steps_per_second": 66.877, "step": 1060 }, { - "epoch": 0.686337395766517, - "grad_norm": 0.019467538222670555, - "learning_rate": 1.313662604233483e-05, - "loss": 0.0009, + "epoch": 0.11399957383336885, + "grad_norm": 0.012475158087909222, + "learning_rate": 1.7720008523332623e-05, + "loss": 0.0026, "step": 1070 }, { - "epoch": 0.686337395766517, - "eval_loss": 0.004651096649467945, - "eval_runtime": 2.8053, - "eval_samples_per_second": 4444.041, - "eval_steps_per_second": 139.021, + "epoch": 0.11399957383336885, + "eval_loss": 0.0057092043571174145, + "eval_runtime": 35.285, + "eval_samples_per_second": 4256.028, + "eval_steps_per_second": 66.515, "step": 1070 }, { - "epoch": 0.6927517639512508, - "grad_norm": 0.019495300948619843, - "learning_rate": 1.3072482360487493e-05, - "loss": 0.0008, + "epoch": 0.1150649904112508, + "grad_norm": 0.12960150837898254, + "learning_rate": 1.7698700191774985e-05, + "loss": 0.0081, "step": 1080 }, { - "epoch": 0.6927517639512508, - "eval_loss": 0.004747034516185522, - "eval_runtime": 2.7655, - "eval_samples_per_second": 4508.104, - "eval_steps_per_second": 141.025, + "epoch": 0.1150649904112508, + "eval_loss": 0.005676012486219406, + "eval_runtime": 35.0659, + "eval_samples_per_second": 4282.619, + "eval_steps_per_second": 66.931, "step": 1080 }, { - "epoch": 0.6991661321359846, - "grad_norm": 0.017888143658638, - "learning_rate": 1.3008338678640156e-05, - "loss": 0.0025, + "epoch": 0.11613040698913275, + "grad_norm": 0.015063290484249592, + "learning_rate": 1.7677391860217347e-05, + "loss": 0.0109, "step": 1090 }, { - "epoch": 0.6991661321359846, - "eval_loss": 0.004640842322260141, - "eval_runtime": 2.764, - "eval_samples_per_second": 4510.484, - "eval_steps_per_second": 141.1, + "epoch": 0.11613040698913275, + "eval_loss": 0.005491800140589476, + "eval_runtime": 35.1395, + "eval_samples_per_second": 4273.656, + "eval_steps_per_second": 66.791, "step": 1090 }, { - "epoch": 0.7055805003207184, - "grad_norm": 0.016517043113708496, - "learning_rate": 1.2944194996792817e-05, - "loss": 0.0028, + "epoch": 0.1171958235670147, + "grad_norm": 0.35197392106056213, + "learning_rate": 1.7656083528659705e-05, + "loss": 0.0015, "step": 1100 }, { - "epoch": 0.7055805003207184, - "eval_loss": 0.004688124172389507, - "eval_runtime": 2.7296, - "eval_samples_per_second": 4567.403, - "eval_steps_per_second": 142.88, + "epoch": 0.1171958235670147, + "eval_loss": 0.005604551173746586, + "eval_runtime": 35.0678, + "eval_samples_per_second": 4282.389, + "eval_steps_per_second": 66.927, "step": 1100 }, { - "epoch": 0.7119948685054522, - "grad_norm": 0.021053675562143326, - "learning_rate": 1.288005131494548e-05, - "loss": 0.0208, + "epoch": 0.11826124014489665, + "grad_norm": 0.37558114528656006, + "learning_rate": 1.7634775197102067e-05, + "loss": 0.0109, "step": 1110 }, { - "epoch": 0.7119948685054522, - "eval_loss": 0.005344197154045105, - "eval_runtime": 2.7735, - "eval_samples_per_second": 4494.962, - "eval_steps_per_second": 140.614, + "epoch": 0.11826124014489665, + "eval_loss": 0.005907374434173107, + "eval_runtime": 35.1031, + "eval_samples_per_second": 4278.078, + "eval_steps_per_second": 66.86, "step": 1110 }, { - "epoch": 0.718409236690186, - "grad_norm": 0.01877407915890217, - "learning_rate": 1.2815907633098142e-05, - "loss": 0.0008, + "epoch": 0.1193266567227786, + "grad_norm": 0.040187984704971313, + "learning_rate": 1.761346686554443e-05, + "loss": 0.0014, "step": 1120 }, { - "epoch": 0.718409236690186, - "eval_loss": 0.005765383131802082, - "eval_runtime": 2.7453, - "eval_samples_per_second": 4541.15, - "eval_steps_per_second": 142.059, + "epoch": 0.1193266567227786, + "eval_loss": 0.0068566263653337955, + "eval_runtime": 35.0588, + "eval_samples_per_second": 4283.485, + "eval_steps_per_second": 66.945, "step": 1120 }, { - "epoch": 0.7248236048749198, - "grad_norm": 0.018694570288062096, - "learning_rate": 1.2751763951250801e-05, - "loss": 0.0118, + "epoch": 0.12039207330066055, + "grad_norm": 0.029192611575126648, + "learning_rate": 1.759215853398679e-05, + "loss": 0.0068, "step": 1130 }, { - "epoch": 0.7248236048749198, - "eval_loss": 0.005010578315705061, - "eval_runtime": 2.7586, - "eval_samples_per_second": 4519.321, - "eval_steps_per_second": 141.376, + "epoch": 0.12039207330066055, + "eval_loss": 0.005835311952978373, + "eval_runtime": 35.0615, + "eval_samples_per_second": 4283.161, + "eval_steps_per_second": 66.94, "step": 1130 }, { - "epoch": 0.7312379730596537, - "grad_norm": 0.024368854239583015, - "learning_rate": 1.2687620269403464e-05, - "loss": 0.0012, + "epoch": 0.1214574898785425, + "grad_norm": 0.03659944236278534, + "learning_rate": 1.757085020242915e-05, + "loss": 0.0141, "step": 1140 }, { - "epoch": 0.7312379730596537, - "eval_loss": 0.0046744393184781075, - "eval_runtime": 2.7454, - "eval_samples_per_second": 4541.114, - "eval_steps_per_second": 142.058, + "epoch": 0.1214574898785425, + "eval_loss": 0.0052527920342981815, + "eval_runtime": 35.0493, + "eval_samples_per_second": 4284.648, + "eval_steps_per_second": 66.963, "step": 1140 }, { - "epoch": 0.7376523412443874, - "grad_norm": 0.01632387563586235, - "learning_rate": 1.2623476587556126e-05, - "loss": 0.0008, + "epoch": 0.12252290645642447, + "grad_norm": 0.011439694091677666, + "learning_rate": 1.754954187087151e-05, + "loss": 0.0015, "step": 1150 }, { - "epoch": 0.7376523412443874, - "eval_loss": 0.005472252145409584, - "eval_runtime": 2.7352, - "eval_samples_per_second": 4557.999, - "eval_steps_per_second": 142.586, + "epoch": 0.12252290645642447, + "eval_loss": 0.005540814250707626, + "eval_runtime": 35.0762, + "eval_samples_per_second": 4281.361, + "eval_steps_per_second": 66.911, "step": 1150 }, { - "epoch": 0.7440667094291212, - "grad_norm": 0.01977686956524849, - "learning_rate": 1.2559332905708787e-05, - "loss": 0.0113, + "epoch": 0.12358832303430642, + "grad_norm": 0.010304667986929417, + "learning_rate": 1.7528233539313873e-05, + "loss": 0.0044, "step": 1160 }, { - "epoch": 0.7440667094291212, - "eval_loss": 0.004667737055569887, - "eval_runtime": 2.7817, - "eval_samples_per_second": 4481.825, - "eval_steps_per_second": 140.203, + "epoch": 0.12358832303430642, + "eval_loss": 0.005671035032719374, + "eval_runtime": 35.1111, + "eval_samples_per_second": 4277.113, + "eval_steps_per_second": 66.845, "step": 1160 }, { - "epoch": 0.7504810776138551, - "grad_norm": 0.01677733100950718, - "learning_rate": 1.249518922386145e-05, - "loss": 0.0008, + "epoch": 0.12465373961218837, + "grad_norm": 1.2378696203231812, + "learning_rate": 1.7506925207756235e-05, + "loss": 0.0167, "step": 1170 }, { - "epoch": 0.7504810776138551, - "eval_loss": 0.004439252428710461, - "eval_runtime": 2.7489, - "eval_samples_per_second": 4535.208, - "eval_steps_per_second": 141.873, + "epoch": 0.12465373961218837, + "eval_loss": 0.005685662850737572, + "eval_runtime": 35.0507, + "eval_samples_per_second": 4284.477, + "eval_steps_per_second": 66.96, "step": 1170 }, { - "epoch": 0.7568954457985888, - "grad_norm": 0.01814711093902588, - "learning_rate": 1.2431045542014113e-05, - "loss": 0.0008, + "epoch": 0.12571915619007032, + "grad_norm": 0.029286779463291168, + "learning_rate": 1.7485616876198593e-05, + "loss": 0.0092, "step": 1180 }, { - "epoch": 0.7568954457985888, - "eval_loss": 0.0044731213711202145, - "eval_runtime": 2.7808, - "eval_samples_per_second": 4483.222, - "eval_steps_per_second": 140.247, + "epoch": 0.12571915619007032, + "eval_loss": 0.0064590792171657085, + "eval_runtime": 35.0487, + "eval_samples_per_second": 4284.729, + "eval_steps_per_second": 66.964, "step": 1180 }, { - "epoch": 0.7633098139833226, - "grad_norm": 0.01768704131245613, - "learning_rate": 1.2366901860166775e-05, - "loss": 0.0008, + "epoch": 0.12678457276795227, + "grad_norm": 0.08294548094272614, + "learning_rate": 1.7464308544640955e-05, + "loss": 0.002, "step": 1190 }, { - "epoch": 0.7633098139833226, - "eval_loss": 0.00443003186956048, - "eval_runtime": 2.7322, - "eval_samples_per_second": 4562.948, - "eval_steps_per_second": 142.741, + "epoch": 0.12678457276795227, + "eval_loss": 0.0068722073920071125, + "eval_runtime": 35.0733, + "eval_samples_per_second": 4281.721, + "eval_steps_per_second": 66.917, "step": 1190 }, { - "epoch": 0.7697241821680565, - "grad_norm": 0.018905559554696083, - "learning_rate": 1.2302758178319436e-05, - "loss": 0.0009, + "epoch": 0.12784998934583422, + "grad_norm": 0.10596469789743423, + "learning_rate": 1.7443000213083317e-05, + "loss": 0.0144, "step": 1200 }, { - "epoch": 0.7697241821680565, - "eval_loss": 0.0043729268945753574, - "eval_runtime": 2.7422, - "eval_samples_per_second": 4546.366, - "eval_steps_per_second": 142.222, + "epoch": 0.12784998934583422, + "eval_loss": 0.005687403492629528, + "eval_runtime": 35.0846, + "eval_samples_per_second": 4280.338, + "eval_steps_per_second": 66.895, "step": 1200 }, { - "epoch": 0.7761385503527902, - "grad_norm": 0.016700776293873787, - "learning_rate": 1.2238614496472099e-05, - "loss": 0.0007, + "epoch": 0.12891540592371617, + "grad_norm": 0.03988677263259888, + "learning_rate": 1.742169188152568e-05, + "loss": 0.0016, "step": 1210 }, { - "epoch": 0.7761385503527902, - "eval_loss": 0.004335304256528616, - "eval_runtime": 2.7415, - "eval_samples_per_second": 4547.513, - "eval_steps_per_second": 142.258, + "epoch": 0.12891540592371617, + "eval_loss": 0.005167008843272924, + "eval_runtime": 35.0969, + "eval_samples_per_second": 4278.845, + "eval_steps_per_second": 66.872, "step": 1210 }, { - "epoch": 0.7825529185375241, - "grad_norm": 0.017289504408836365, - "learning_rate": 1.2174470814624762e-05, - "loss": 0.0007, + "epoch": 0.12998082250159812, + "grad_norm": 0.16653232276439667, + "learning_rate": 1.7400383549968037e-05, + "loss": 0.0045, "step": 1220 }, { - "epoch": 0.7825529185375241, - "eval_loss": 0.004327945411205292, - "eval_runtime": 2.7495, - "eval_samples_per_second": 4534.227, - "eval_steps_per_second": 141.842, + "epoch": 0.12998082250159812, + "eval_loss": 0.005050142761319876, + "eval_runtime": 35.0847, + "eval_samples_per_second": 4280.322, + "eval_steps_per_second": 66.895, "step": 1220 }, { - "epoch": 0.7889672867222579, - "grad_norm": 0.01972338743507862, - "learning_rate": 1.2110327132777423e-05, - "loss": 0.0008, + "epoch": 0.13104623907948007, + "grad_norm": 1.8148539066314697, + "learning_rate": 1.73790752184104e-05, + "loss": 0.0052, "step": 1230 }, { - "epoch": 0.7889672867222579, - "eval_loss": 0.004332894925028086, - "eval_runtime": 2.7828, - "eval_samples_per_second": 4480.07, - "eval_steps_per_second": 140.148, + "epoch": 0.13104623907948007, + "eval_loss": 0.0050388118252158165, + "eval_runtime": 35.0834, + "eval_samples_per_second": 4280.482, + "eval_steps_per_second": 66.898, "step": 1230 }, { - "epoch": 0.7953816549069916, - "grad_norm": 0.7810164093971252, - "learning_rate": 1.2046183450930085e-05, - "loss": 0.001, + "epoch": 0.13211165565736202, + "grad_norm": 0.027562782168388367, + "learning_rate": 1.735776688685276e-05, + "loss": 0.0068, "step": 1240 }, { - "epoch": 0.7953816549069916, - "eval_loss": 0.0043570250272750854, - "eval_runtime": 2.7393, - "eval_samples_per_second": 4551.141, - "eval_steps_per_second": 142.371, + "epoch": 0.13211165565736202, + "eval_loss": 0.005193131044507027, + "eval_runtime": 35.0886, + "eval_samples_per_second": 4279.855, + "eval_steps_per_second": 66.888, "step": 1240 }, { - "epoch": 0.8017960230917255, - "grad_norm": 0.013027627021074295, - "learning_rate": 1.1982039769082748e-05, - "loss": 0.001, + "epoch": 0.13317707223524397, + "grad_norm": 2.252390146255493, + "learning_rate": 1.7336458555295123e-05, + "loss": 0.0084, "step": 1250 }, { - "epoch": 0.8017960230917255, - "eval_loss": 0.00467626703903079, - "eval_runtime": 2.7636, - "eval_samples_per_second": 4511.123, - "eval_steps_per_second": 141.12, + "epoch": 0.13317707223524397, + "eval_loss": 0.005359725095331669, + "eval_runtime": 35.056, + "eval_samples_per_second": 4283.831, + "eval_steps_per_second": 66.95, "step": 1250 }, { - "epoch": 0.8082103912764592, - "grad_norm": 0.019782286137342453, - "learning_rate": 1.1917896087235407e-05, - "loss": 0.0009, + "epoch": 0.13424248881312592, + "grad_norm": 0.13568538427352905, + "learning_rate": 1.731515022373748e-05, + "loss": 0.0055, "step": 1260 }, { - "epoch": 0.8082103912764592, - "eval_loss": 0.004564644303172827, - "eval_runtime": 2.7335, - "eval_samples_per_second": 4560.869, - "eval_steps_per_second": 142.676, + "epoch": 0.13424248881312592, + "eval_loss": 0.005222136154770851, + "eval_runtime": 35.0994, + "eval_samples_per_second": 4278.531, + "eval_steps_per_second": 66.867, "step": 1260 }, { - "epoch": 0.8146247594611931, - "grad_norm": 0.019999559968709946, - "learning_rate": 1.185375240538807e-05, - "loss": 0.0007, + "epoch": 0.13530790539100787, + "grad_norm": 0.020430419594049454, + "learning_rate": 1.7293841892179843e-05, + "loss": 0.0052, "step": 1270 }, { - "epoch": 0.8146247594611931, - "eval_loss": 0.004904980771243572, - "eval_runtime": 2.732, - "eval_samples_per_second": 4563.357, - "eval_steps_per_second": 142.754, + "epoch": 0.13530790539100787, + "eval_loss": 0.005026769824326038, + "eval_runtime": 35.0367, + "eval_samples_per_second": 4286.186, + "eval_steps_per_second": 66.987, "step": 1270 }, { - "epoch": 0.8210391276459269, - "grad_norm": 0.02157980017364025, - "learning_rate": 1.1789608723540732e-05, - "loss": 0.0213, + "epoch": 0.13637332196888985, + "grad_norm": 0.033307794481515884, + "learning_rate": 1.7272533560622205e-05, + "loss": 0.0111, "step": 1280 }, { - "epoch": 0.8210391276459269, - "eval_loss": 0.005200786981731653, - "eval_runtime": 2.7722, - "eval_samples_per_second": 4497.167, - "eval_steps_per_second": 140.683, + "epoch": 0.13637332196888985, + "eval_loss": 0.004983577877283096, + "eval_runtime": 35.0924, + "eval_samples_per_second": 4279.39, + "eval_steps_per_second": 66.881, "step": 1280 }, { - "epoch": 0.8274534958306606, - "grad_norm": 0.01738838665187359, - "learning_rate": 1.1725465041693393e-05, - "loss": 0.0008, + "epoch": 0.1374387385467718, + "grad_norm": 0.010311348363757133, + "learning_rate": 1.7251225229064567e-05, + "loss": 0.0017, "step": 1290 }, { - "epoch": 0.8274534958306606, - "eval_loss": 0.005184350069612265, - "eval_runtime": 2.7378, - "eval_samples_per_second": 4553.624, - "eval_steps_per_second": 142.449, + "epoch": 0.1374387385467718, + "eval_loss": 0.005127367097884417, + "eval_runtime": 35.0723, + "eval_samples_per_second": 4281.841, + "eval_steps_per_second": 66.919, "step": 1290 }, { - "epoch": 0.8338678640153945, - "grad_norm": 0.014976155944168568, - "learning_rate": 1.1661321359846056e-05, - "loss": 0.0186, + "epoch": 0.13850415512465375, + "grad_norm": 0.009298436343669891, + "learning_rate": 1.7229916897506925e-05, + "loss": 0.0055, "step": 1300 }, { - "epoch": 0.8338678640153945, - "eval_loss": 0.005122625734657049, - "eval_runtime": 2.7545, - "eval_samples_per_second": 4526.014, - "eval_steps_per_second": 141.585, + "epoch": 0.13850415512465375, + "eval_loss": 0.0049048615619540215, + "eval_runtime": 35.062, + "eval_samples_per_second": 4283.093, + "eval_steps_per_second": 66.938, "step": 1300 }, { - "epoch": 0.8402822322001283, - "grad_norm": 0.022448979318141937, - "learning_rate": 1.1597177677998719e-05, - "loss": 0.0009, + "epoch": 0.1395695717025357, + "grad_norm": 0.009401198476552963, + "learning_rate": 1.7208608565949287e-05, + "loss": 0.0041, "step": 1310 }, { - "epoch": 0.8402822322001283, - "eval_loss": 0.005329828709363937, - "eval_runtime": 2.757, - "eval_samples_per_second": 4521.993, - "eval_steps_per_second": 141.46, + "epoch": 0.1395695717025357, + "eval_loss": 0.005013017915189266, + "eval_runtime": 35.0683, + "eval_samples_per_second": 4282.332, + "eval_steps_per_second": 66.927, "step": 1310 }, { - "epoch": 0.846696600384862, - "grad_norm": 0.022593596950173378, - "learning_rate": 1.153303399615138e-05, - "loss": 0.0009, + "epoch": 0.14063498828041765, + "grad_norm": 0.030792182311415672, + "learning_rate": 1.718730023439165e-05, + "loss": 0.0054, "step": 1320 }, { - "epoch": 0.846696600384862, - "eval_loss": 0.005285405088216066, - "eval_runtime": 2.7287, - "eval_samples_per_second": 4568.789, - "eval_steps_per_second": 142.924, + "epoch": 0.14063498828041765, + "eval_loss": 0.0054897707886993885, + "eval_runtime": 35.0864, + "eval_samples_per_second": 4280.117, + "eval_steps_per_second": 66.892, "step": 1320 }, { - "epoch": 0.8531109685695959, - "grad_norm": 0.013608959503471851, - "learning_rate": 1.1468890314304042e-05, - "loss": 0.0009, + "epoch": 0.1417004048582996, + "grad_norm": 0.5939790606498718, + "learning_rate": 1.716599190283401e-05, + "loss": 0.0137, "step": 1330 }, { - "epoch": 0.8531109685695959, - "eval_loss": 0.0051283277571201324, - "eval_runtime": 2.4129, - "eval_samples_per_second": 5166.744, - "eval_steps_per_second": 161.629, + "epoch": 0.1417004048582996, + "eval_loss": 0.005527508445084095, + "eval_runtime": 35.068, + "eval_samples_per_second": 4282.366, + "eval_steps_per_second": 66.927, "step": 1330 }, { - "epoch": 0.8595253367543297, - "grad_norm": 1.3526451587677002, - "learning_rate": 1.1404746632456705e-05, - "loss": 0.0196, + "epoch": 0.14276582143618155, + "grad_norm": 0.2677154242992401, + "learning_rate": 1.714468357127637e-05, + "loss": 0.0039, "step": 1340 }, { - "epoch": 0.8595253367543297, - "eval_loss": 0.004866322036832571, - "eval_runtime": 2.3974, - "eval_samples_per_second": 5200.213, - "eval_steps_per_second": 162.676, + "epoch": 0.14276582143618155, + "eval_loss": 0.004874934908002615, + "eval_runtime": 35.0254, + "eval_samples_per_second": 4287.575, + "eval_steps_per_second": 67.009, "step": 1340 }, { - "epoch": 0.8659397049390635, - "grad_norm": 0.025603530928492546, - "learning_rate": 1.1340602950609366e-05, - "loss": 0.0008, + "epoch": 0.1438312380140635, + "grad_norm": 1.0665801763534546, + "learning_rate": 1.712337523971873e-05, + "loss": 0.0044, "step": 1350 }, { - "epoch": 0.8659397049390635, - "eval_loss": 0.004412362352013588, - "eval_runtime": 2.3987, - "eval_samples_per_second": 5197.483, - "eval_steps_per_second": 162.591, + "epoch": 0.1438312380140635, + "eval_loss": 0.004903439898043871, + "eval_runtime": 35.0969, + "eval_samples_per_second": 4278.835, + "eval_steps_per_second": 66.872, "step": 1350 }, { - "epoch": 0.8723540731237973, - "grad_norm": 0.02262856625020504, - "learning_rate": 1.1276459268762029e-05, - "loss": 0.0008, + "epoch": 0.14489665459194545, + "grad_norm": 0.016081418842077255, + "learning_rate": 1.7102066908161093e-05, + "loss": 0.0006, "step": 1360 }, { - "epoch": 0.8723540731237973, - "eval_loss": 0.0042611462995409966, - "eval_runtime": 2.3923, - "eval_samples_per_second": 5211.293, - "eval_steps_per_second": 163.023, + "epoch": 0.14489665459194545, + "eval_loss": 0.004955473821610212, + "eval_runtime": 35.0973, + "eval_samples_per_second": 4278.788, + "eval_steps_per_second": 66.871, "step": 1360 }, { - "epoch": 0.8787684413085312, - "grad_norm": 0.018682410940527916, - "learning_rate": 1.1212315586914691e-05, - "loss": 0.0008, + "epoch": 0.1459620711698274, + "grad_norm": 0.012007024139165878, + "learning_rate": 1.7080758576603455e-05, + "loss": 0.0009, "step": 1370 }, { - "epoch": 0.8787684413085312, - "eval_loss": 0.004274127539247274, - "eval_runtime": 2.4094, - "eval_samples_per_second": 5174.395, - "eval_steps_per_second": 161.868, + "epoch": 0.1459620711698274, + "eval_loss": 0.005029810592532158, + "eval_runtime": 35.1077, + "eval_samples_per_second": 4277.52, + "eval_steps_per_second": 66.851, "step": 1370 }, { - "epoch": 0.8851828094932649, - "grad_norm": 0.02168123796582222, - "learning_rate": 1.114817190506735e-05, - "loss": 0.0008, + "epoch": 0.14702748774770935, + "grad_norm": 0.01866872049868107, + "learning_rate": 1.7059450245045813e-05, + "loss": 0.0005, "step": 1380 }, { - "epoch": 0.8851828094932649, - "eval_loss": 0.004351937212049961, - "eval_runtime": 2.4398, - "eval_samples_per_second": 5109.791, - "eval_steps_per_second": 159.847, + "epoch": 0.14702748774770935, + "eval_loss": 0.005090104416012764, + "eval_runtime": 35.1091, + "eval_samples_per_second": 4277.357, + "eval_steps_per_second": 66.849, "step": 1380 }, { - "epoch": 0.8915971776779987, - "grad_norm": 0.01584063470363617, - "learning_rate": 1.1084028223220013e-05, + "epoch": 0.1480929043255913, + "grad_norm": 0.030831903219223022, + "learning_rate": 1.7038141913488175e-05, "loss": 0.0008, "step": 1390 }, { - "epoch": 0.8915971776779987, - "eval_loss": 0.004328163340687752, - "eval_runtime": 2.4187, - "eval_samples_per_second": 5154.385, - "eval_steps_per_second": 161.242, + "epoch": 0.1480929043255913, + "eval_loss": 0.005153083708137274, + "eval_runtime": 35.0702, + "eval_samples_per_second": 4282.1, + "eval_steps_per_second": 66.923, "step": 1390 }, { - "epoch": 0.8980115458627326, - "grad_norm": 0.02418622002005577, - "learning_rate": 1.1019884541372676e-05, - "loss": 0.0179, + "epoch": 0.14915832090347325, + "grad_norm": 0.012699014507234097, + "learning_rate": 1.7016833581930537e-05, + "loss": 0.0068, "step": 1400 }, { - "epoch": 0.8980115458627326, - "eval_loss": 0.004465954378247261, - "eval_runtime": 2.4217, - "eval_samples_per_second": 5148.09, - "eval_steps_per_second": 161.046, + "epoch": 0.14915832090347325, + "eval_loss": 0.005071562714874744, + "eval_runtime": 35.1126, + "eval_samples_per_second": 4276.932, + "eval_steps_per_second": 66.842, "step": 1400 }, { - "epoch": 0.9044259140474663, - "grad_norm": 0.03038007766008377, - "learning_rate": 1.0955740859525337e-05, - "loss": 0.0008, + "epoch": 0.1502237374813552, + "grad_norm": 0.022656958550214767, + "learning_rate": 1.69955252503729e-05, + "loss": 0.0035, "step": 1410 }, { - "epoch": 0.9044259140474663, - "eval_loss": 0.004557340405881405, - "eval_runtime": 2.4249, - "eval_samples_per_second": 5141.298, - "eval_steps_per_second": 160.833, + "epoch": 0.1502237374813552, + "eval_loss": 0.005125128198415041, + "eval_runtime": 35.0598, + "eval_samples_per_second": 4283.37, + "eval_steps_per_second": 66.943, "step": 1410 }, { - "epoch": 0.9108402822322001, - "grad_norm": 0.022822652012109756, - "learning_rate": 1.0891597177678e-05, + "epoch": 0.15128915405923715, + "grad_norm": 0.00814993865787983, + "learning_rate": 1.6974216918815257e-05, "loss": 0.001, "step": 1420 }, { - "epoch": 0.9108402822322001, - "eval_loss": 0.0044282288290560246, - "eval_runtime": 2.4459, - "eval_samples_per_second": 5097.083, - "eval_steps_per_second": 159.45, + "epoch": 0.15128915405923715, + "eval_loss": 0.005287368781864643, + "eval_runtime": 35.0555, + "eval_samples_per_second": 4283.894, + "eval_steps_per_second": 66.951, "step": 1420 }, { - "epoch": 0.9172546504169339, - "grad_norm": 0.019525406882166862, - "learning_rate": 1.0827453495830662e-05, - "loss": 0.0009, + "epoch": 0.1523545706371191, + "grad_norm": 0.06181171163916588, + "learning_rate": 1.695290858725762e-05, + "loss": 0.0006, "step": 1430 }, { - "epoch": 0.9172546504169339, - "eval_loss": 0.004077858291566372, - "eval_runtime": 2.4255, - "eval_samples_per_second": 5139.973, - "eval_steps_per_second": 160.792, + "epoch": 0.1523545706371191, + "eval_loss": 0.005389242433011532, + "eval_runtime": 35.0758, + "eval_samples_per_second": 4281.412, + "eval_steps_per_second": 66.912, "step": 1430 }, { - "epoch": 0.9236690186016677, - "grad_norm": 0.02353576384484768, - "learning_rate": 1.0763309813983323e-05, - "loss": 0.033, + "epoch": 0.15341998721500105, + "grad_norm": 0.018909545615315437, + "learning_rate": 1.693160025569998e-05, + "loss": 0.0101, "step": 1440 }, { - "epoch": 0.9236690186016677, - "eval_loss": 0.0042343405075371265, - "eval_runtime": 2.4218, - "eval_samples_per_second": 5147.784, - "eval_steps_per_second": 161.036, + "epoch": 0.15341998721500105, + "eval_loss": 0.0051095616072416306, + "eval_runtime": 35.0847, + "eval_samples_per_second": 4280.327, + "eval_steps_per_second": 66.895, "step": 1440 }, { - "epoch": 0.9300833867864016, - "grad_norm": 0.019818825647234917, - "learning_rate": 1.0699166132135986e-05, + "epoch": 0.15448540379288303, + "grad_norm": 0.007403901778161526, + "learning_rate": 1.6910291924142343e-05, "loss": 0.001, "step": 1450 }, { - "epoch": 0.9300833867864016, - "eval_loss": 0.004364923574030399, - "eval_runtime": 2.419, - "eval_samples_per_second": 5153.833, - "eval_steps_per_second": 161.225, + "epoch": 0.15448540379288303, + "eval_loss": 0.0049713412299752235, + "eval_runtime": 35.051, + "eval_samples_per_second": 4284.439, + "eval_steps_per_second": 66.96, "step": 1450 }, { - "epoch": 0.9364977549711353, - "grad_norm": 0.08867379277944565, - "learning_rate": 1.0635022450288648e-05, - "loss": 0.001, + "epoch": 0.15555082037076498, + "grad_norm": 0.21636687219142914, + "learning_rate": 1.68889835925847e-05, + "loss": 0.0056, "step": 1460 }, { - "epoch": 0.9364977549711353, - "eval_loss": 0.004264220129698515, - "eval_runtime": 2.4136, - "eval_samples_per_second": 5165.211, - "eval_steps_per_second": 161.581, + "epoch": 0.15555082037076498, + "eval_loss": 0.0050105685368180275, + "eval_runtime": 35.0503, + "eval_samples_per_second": 4284.527, + "eval_steps_per_second": 66.961, "step": 1460 }, { - "epoch": 0.9429121231558691, - "grad_norm": 0.014765486121177673, - "learning_rate": 1.0570878768441307e-05, - "loss": 0.0008, + "epoch": 0.15661623694864693, + "grad_norm": 0.021923823282122612, + "learning_rate": 1.6867675261027063e-05, + "loss": 0.0173, "step": 1470 }, { - "epoch": 0.9429121231558691, - "eval_loss": 0.004075978416949511, - "eval_runtime": 2.4088, - "eval_samples_per_second": 5175.684, - "eval_steps_per_second": 161.909, + "epoch": 0.15661623694864693, + "eval_loss": 0.004856941290199757, + "eval_runtime": 35.1061, + "eval_samples_per_second": 4277.715, + "eval_steps_per_second": 66.854, "step": 1470 }, { - "epoch": 0.949326491340603, - "grad_norm": 0.06277275830507278, - "learning_rate": 1.0506735086593972e-05, - "loss": 0.0008, + "epoch": 0.15768165352652888, + "grad_norm": 0.16257305443286896, + "learning_rate": 1.6846366929469425e-05, + "loss": 0.0056, "step": 1480 }, { - "epoch": 0.949326491340603, - "eval_loss": 0.003993914928287268, - "eval_runtime": 2.4044, - "eval_samples_per_second": 5185.179, - "eval_steps_per_second": 162.206, + "epoch": 0.15768165352652888, + "eval_loss": 0.005984555929899216, + "eval_runtime": 35.0991, + "eval_samples_per_second": 4278.575, + "eval_steps_per_second": 66.868, "step": 1480 }, { - "epoch": 0.9557408595253367, - "grad_norm": 0.015658292919397354, - "learning_rate": 1.0442591404746634e-05, - "loss": 0.0007, + "epoch": 0.15874707010441083, + "grad_norm": 0.3750154674053192, + "learning_rate": 1.6825058597911787e-05, + "loss": 0.0187, "step": 1490 }, { - "epoch": 0.9557408595253367, - "eval_loss": 0.003960499074310064, - "eval_runtime": 2.4068, - "eval_samples_per_second": 5179.913, - "eval_steps_per_second": 162.041, + "epoch": 0.15874707010441083, + "eval_loss": 0.006785502657294273, + "eval_runtime": 35.0856, + "eval_samples_per_second": 4280.213, + "eval_steps_per_second": 66.893, "step": 1490 }, { - "epoch": 0.9621552277100706, - "grad_norm": 0.01588534004986286, - "learning_rate": 1.0378447722899297e-05, - "loss": 0.0006, + "epoch": 0.15981248668229278, + "grad_norm": 0.009664146229624748, + "learning_rate": 1.6803750266354145e-05, + "loss": 0.0037, "step": 1500 }, { - "epoch": 0.9621552277100706, - "eval_loss": 0.003956847358494997, - "eval_runtime": 2.4027, - "eval_samples_per_second": 5188.731, - "eval_steps_per_second": 162.317, + "epoch": 0.15981248668229278, + "eval_loss": 0.0051004113629460335, + "eval_runtime": 35.0502, + "eval_samples_per_second": 4284.534, + "eval_steps_per_second": 66.961, "step": 1500 }, { - "epoch": 0.9685695958948044, - "grad_norm": 0.02051517553627491, - "learning_rate": 1.0314304041051956e-05, - "loss": 0.0007, + "epoch": 0.16087790326017473, + "grad_norm": 0.019265178591012955, + "learning_rate": 1.6782441934796507e-05, + "loss": 0.0048, "step": 1510 }, { - "epoch": 0.9685695958948044, - "eval_loss": 0.003954595420509577, - "eval_runtime": 2.4096, - "eval_samples_per_second": 5173.928, - "eval_steps_per_second": 161.854, + "epoch": 0.16087790326017473, + "eval_loss": 0.004774358589202166, + "eval_runtime": 35.0608, + "eval_samples_per_second": 4283.242, + "eval_steps_per_second": 66.941, "step": 1510 }, { - "epoch": 0.9749839640795381, - "grad_norm": 0.014356785453855991, - "learning_rate": 1.0250160359204619e-05, + "epoch": 0.16194331983805668, + "grad_norm": 0.02663380466401577, + "learning_rate": 1.676113360323887e-05, "loss": 0.0008, "step": 1520 }, { - "epoch": 0.9749839640795381, - "eval_loss": 0.003977358806878328, - "eval_runtime": 2.4108, - "eval_samples_per_second": 5171.403, - "eval_steps_per_second": 161.775, + "epoch": 0.16194331983805668, + "eval_loss": 0.004892702680081129, + "eval_runtime": 35.1044, + "eval_samples_per_second": 4277.93, + "eval_steps_per_second": 66.858, "step": 1520 }, { - "epoch": 0.981398332264272, - "grad_norm": 0.017719434574246407, - "learning_rate": 1.0186016677357282e-05, - "loss": 0.0041, + "epoch": 0.16300873641593863, + "grad_norm": 0.559873104095459, + "learning_rate": 1.673982527168123e-05, + "loss": 0.0085, "step": 1530 }, { - "epoch": 0.981398332264272, - "eval_loss": 0.0038570731412619352, - "eval_runtime": 2.4041, - "eval_samples_per_second": 5185.718, - "eval_steps_per_second": 162.223, + "epoch": 0.16300873641593863, + "eval_loss": 0.005092754494398832, + "eval_runtime": 35.0253, + "eval_samples_per_second": 4287.587, + "eval_steps_per_second": 67.009, "step": 1530 }, { - "epoch": 0.9878127004490058, - "grad_norm": 0.013949839398264885, - "learning_rate": 1.0121872995509943e-05, - "loss": 0.0006, + "epoch": 0.16407415299382058, + "grad_norm": 0.2393077164888382, + "learning_rate": 1.671851694012359e-05, + "loss": 0.0045, "step": 1540 }, { - "epoch": 0.9878127004490058, - "eval_loss": 0.003924272954463959, - "eval_runtime": 2.4092, - "eval_samples_per_second": 5174.835, - "eval_steps_per_second": 161.882, + "epoch": 0.16407415299382058, + "eval_loss": 0.005027854815125465, + "eval_runtime": 35.0738, + "eval_samples_per_second": 4281.653, + "eval_steps_per_second": 66.916, "step": 1540 }, { - "epoch": 0.9942270686337396, - "grad_norm": 0.014598234556615353, - "learning_rate": 1.0057729313662605e-05, - "loss": 0.0006, + "epoch": 0.16513956957170253, + "grad_norm": 0.01315162144601345, + "learning_rate": 1.669720860856595e-05, + "loss": 0.0017, "step": 1550 }, { - "epoch": 0.9942270686337396, - "eval_loss": 0.003942632116377354, - "eval_runtime": 2.4412, - "eval_samples_per_second": 5106.919, - "eval_steps_per_second": 159.758, + "epoch": 0.16513956957170253, + "eval_loss": 0.004932031966745853, + "eval_runtime": 35.05, + "eval_samples_per_second": 4284.571, + "eval_steps_per_second": 66.962, "step": 1550 }, { - "epoch": 1.0006414368184733, - "grad_norm": 0.011457240208983421, - "learning_rate": 9.993585631815266e-06, - "loss": 0.004, + "epoch": 0.16620498614958448, + "grad_norm": 1.2454622983932495, + "learning_rate": 1.6675900277008313e-05, + "loss": 0.0032, "step": 1560 }, { - "epoch": 1.0006414368184733, - "eval_loss": 0.004661895334720612, - "eval_runtime": 2.4112, - "eval_samples_per_second": 5170.457, - "eval_steps_per_second": 161.745, + "epoch": 0.16620498614958448, + "eval_loss": 0.005039810668677092, + "eval_runtime": 35.064, + "eval_samples_per_second": 4282.858, + "eval_steps_per_second": 66.935, "step": 1560 }, { - "epoch": 1.0070558050032072, - "grad_norm": 0.01190961617976427, - "learning_rate": 9.929441949967929e-06, - "loss": 0.0007, + "epoch": 0.16727040272746643, + "grad_norm": 0.07791124284267426, + "learning_rate": 1.6654591945450675e-05, + "loss": 0.0041, "step": 1570 }, { - "epoch": 1.0070558050032072, - "eval_loss": 0.004823943134397268, - "eval_runtime": 2.4047, - "eval_samples_per_second": 5184.371, - "eval_steps_per_second": 162.181, + "epoch": 0.16727040272746643, + "eval_loss": 0.004877708852291107, + "eval_runtime": 35.1101, + "eval_samples_per_second": 4277.232, + "eval_steps_per_second": 66.847, "step": 1570 }, { - "epoch": 1.013470173187941, - "grad_norm": 0.33191996812820435, - "learning_rate": 9.865298268120592e-06, - "loss": 0.0007, + "epoch": 0.16833581930534838, + "grad_norm": 0.006707167252898216, + "learning_rate": 1.6633283613893033e-05, + "loss": 0.0098, "step": 1580 }, { - "epoch": 1.013470173187941, - "eval_loss": 0.004680118057876825, - "eval_runtime": 2.4166, - "eval_samples_per_second": 5159.007, - "eval_steps_per_second": 161.387, + "epoch": 0.16833581930534838, + "eval_loss": 0.005282689351588488, + "eval_runtime": 35.0557, + "eval_samples_per_second": 4283.873, + "eval_steps_per_second": 66.951, "step": 1580 }, { - "epoch": 1.0198845413726747, - "grad_norm": 0.011999332346022129, - "learning_rate": 9.801154586273252e-06, - "loss": 0.0006, + "epoch": 0.16940123588323033, + "grad_norm": 0.12985199689865112, + "learning_rate": 1.6611975282335395e-05, + "loss": 0.006, "step": 1590 }, { - "epoch": 1.0198845413726747, - "eval_loss": 0.004358895123004913, - "eval_runtime": 2.4225, - "eval_samples_per_second": 5146.254, - "eval_steps_per_second": 160.988, + "epoch": 0.16940123588323033, + "eval_loss": 0.00591092836111784, + "eval_runtime": 35.0663, + "eval_samples_per_second": 4282.579, + "eval_steps_per_second": 66.93, "step": 1590 }, { - "epoch": 1.0262989095574087, - "grad_norm": 0.012122240848839283, - "learning_rate": 9.737010904425915e-06, - "loss": 0.0006, + "epoch": 0.17046665246111228, + "grad_norm": 0.006378485355526209, + "learning_rate": 1.6590666950777757e-05, + "loss": 0.0061, "step": 1600 }, { - "epoch": 1.0262989095574087, - "eval_loss": 0.0042406474240124226, - "eval_runtime": 2.4348, - "eval_samples_per_second": 5120.284, - "eval_steps_per_second": 160.176, + "epoch": 0.17046665246111228, + "eval_loss": 0.005602375138550997, + "eval_runtime": 35.0822, + "eval_samples_per_second": 4280.633, + "eval_steps_per_second": 66.9, "step": 1600 }, { - "epoch": 1.0327132777421424, - "grad_norm": 0.011430976912379265, - "learning_rate": 9.672867222578576e-06, - "loss": 0.0006, + "epoch": 0.17153206903899423, + "grad_norm": 0.27390819787979126, + "learning_rate": 1.656935861922012e-05, + "loss": 0.0026, "step": 1610 }, { - "epoch": 1.0327132777421424, - "eval_loss": 0.0041888197883963585, - "eval_runtime": 2.4379, - "eval_samples_per_second": 5113.784, - "eval_steps_per_second": 159.972, + "epoch": 0.17153206903899423, + "eval_loss": 0.005227269604802132, + "eval_runtime": 35.064, + "eval_samples_per_second": 4282.849, + "eval_steps_per_second": 66.935, "step": 1610 }, { - "epoch": 1.0391276459268761, - "grad_norm": 0.013485722243785858, - "learning_rate": 9.608723540731239e-06, - "loss": 0.0006, + "epoch": 0.1725974856168762, + "grad_norm": 0.01282795425504446, + "learning_rate": 1.6548050287662477e-05, + "loss": 0.0013, "step": 1620 }, { - "epoch": 1.0391276459268761, - "eval_loss": 0.004160948097705841, - "eval_runtime": 2.432, - "eval_samples_per_second": 5126.259, - "eval_steps_per_second": 160.363, + "epoch": 0.1725974856168762, + "eval_loss": 0.005013093817979097, + "eval_runtime": 35.0468, + "eval_samples_per_second": 4284.953, + "eval_steps_per_second": 66.968, "step": 1620 }, { - "epoch": 1.04554201411161, - "grad_norm": 0.012653365731239319, - "learning_rate": 9.544579858883901e-06, - "loss": 0.0006, + "epoch": 0.17366290219475816, + "grad_norm": 0.0058741201646625996, + "learning_rate": 1.652674195610484e-05, + "loss": 0.0015, "step": 1630 }, { - "epoch": 1.04554201411161, - "eval_loss": 0.004139183554798365, - "eval_runtime": 2.4264, - "eval_samples_per_second": 5137.96, - "eval_steps_per_second": 160.729, + "epoch": 0.17366290219475816, + "eval_loss": 0.005092192441225052, + "eval_runtime": 35.0896, + "eval_samples_per_second": 4279.726, + "eval_steps_per_second": 66.886, "step": 1630 }, { - "epoch": 1.0519563822963438, - "grad_norm": 0.011114208027720451, - "learning_rate": 9.480436177036562e-06, - "loss": 0.0221, + "epoch": 0.1747283187726401, + "grad_norm": 0.022938504815101624, + "learning_rate": 1.65054336245472e-05, + "loss": 0.0006, "step": 1640 }, { - "epoch": 1.0519563822963438, - "eval_loss": 0.0040536741726100445, - "eval_runtime": 2.4288, - "eval_samples_per_second": 5132.976, - "eval_steps_per_second": 160.573, + "epoch": 0.1747283187726401, + "eval_loss": 0.005200070794671774, + "eval_runtime": 35.0531, + "eval_samples_per_second": 4284.182, + "eval_steps_per_second": 66.956, "step": 1640 }, { - "epoch": 1.0583707504810775, - "grad_norm": 0.012626360170543194, - "learning_rate": 9.416292495189225e-06, - "loss": 0.0006, + "epoch": 0.17579373535052206, + "grad_norm": 0.03323187306523323, + "learning_rate": 1.6484125292989562e-05, + "loss": 0.0047, "step": 1650 }, { - "epoch": 1.0583707504810775, - "eval_loss": 0.0038833727594465017, - "eval_runtime": 2.4227, - "eval_samples_per_second": 5145.84, - "eval_steps_per_second": 160.975, + "epoch": 0.17579373535052206, + "eval_loss": 0.005280145909637213, + "eval_runtime": 35.0383, + "eval_samples_per_second": 4285.991, + "eval_steps_per_second": 66.984, "step": 1650 }, { - "epoch": 1.0647851186658115, - "grad_norm": 0.014159155078232288, - "learning_rate": 9.352148813341888e-06, - "loss": 0.0006, + "epoch": 0.176859151928404, + "grad_norm": 0.005984348710626364, + "learning_rate": 1.646281696143192e-05, + "loss": 0.005, "step": 1660 }, { - "epoch": 1.0647851186658115, - "eval_loss": 0.0038352308329194784, - "eval_runtime": 2.4498, - "eval_samples_per_second": 5088.937, - "eval_steps_per_second": 159.195, + "epoch": 0.176859151928404, + "eval_loss": 0.005126793868839741, + "eval_runtime": 35.0851, + "eval_samples_per_second": 4280.28, + "eval_steps_per_second": 66.895, "step": 1660 }, { - "epoch": 1.0711994868505452, - "grad_norm": 0.012477426789700985, - "learning_rate": 9.288005131494549e-06, - "loss": 0.0006, + "epoch": 0.17792456850628596, + "grad_norm": 0.02346760779619217, + "learning_rate": 1.6441508629874283e-05, + "loss": 0.0042, "step": 1670 }, { - "epoch": 1.0711994868505452, - "eval_loss": 0.003819151548668742, - "eval_runtime": 2.4787, - "eval_samples_per_second": 5029.751, - "eval_steps_per_second": 157.344, + "epoch": 0.17792456850628596, + "eval_loss": 0.005286376923322678, + "eval_runtime": 35.0619, + "eval_samples_per_second": 4283.111, + "eval_steps_per_second": 66.939, "step": 1670 }, { - "epoch": 1.077613855035279, - "grad_norm": 0.015881817787885666, - "learning_rate": 9.22386144964721e-06, - "loss": 0.0006, + "epoch": 0.1789899850841679, + "grad_norm": 0.00583941163495183, + "learning_rate": 1.6420200298316645e-05, + "loss": 0.0059, "step": 1680 }, { - "epoch": 1.077613855035279, - "eval_loss": 0.0038124537095427513, - "eval_runtime": 2.7882, - "eval_samples_per_second": 4471.31, - "eval_steps_per_second": 139.874, + "epoch": 0.1789899850841679, + "eval_loss": 0.0052011506631970406, + "eval_runtime": 35.0433, + "eval_samples_per_second": 4285.389, + "eval_steps_per_second": 66.974, "step": 1680 }, { - "epoch": 1.084028223220013, - "grad_norm": 0.012099322862923145, - "learning_rate": 9.159717767799872e-06, - "loss": 0.0006, + "epoch": 0.18005540166204986, + "grad_norm": 0.020624622702598572, + "learning_rate": 1.6398891966759006e-05, + "loss": 0.0019, "step": 1690 }, { - "epoch": 1.084028223220013, - "eval_loss": 0.003807657863944769, - "eval_runtime": 2.7594, - "eval_samples_per_second": 4518.067, - "eval_steps_per_second": 141.337, + "epoch": 0.18005540166204986, + "eval_loss": 0.004804234951734543, + "eval_runtime": 35.0973, + "eval_samples_per_second": 4278.789, + "eval_steps_per_second": 66.871, "step": 1690 }, { - "epoch": 1.0904425914047466, - "grad_norm": 0.013776600360870361, - "learning_rate": 9.095574085952535e-06, - "loss": 0.0006, + "epoch": 0.1811208182399318, + "grad_norm": 1.2170437574386597, + "learning_rate": 1.6377583635201365e-05, + "loss": 0.0071, "step": 1700 }, { - "epoch": 1.0904425914047466, - "eval_loss": 0.003805548418313265, - "eval_runtime": 2.7956, - "eval_samples_per_second": 4459.47, - "eval_steps_per_second": 139.504, + "epoch": 0.1811208182399318, + "eval_loss": 0.004888548050075769, + "eval_runtime": 35.1137, + "eval_samples_per_second": 4276.79, + "eval_steps_per_second": 66.84, "step": 1700 }, { - "epoch": 1.0968569595894804, - "grad_norm": 0.011048905551433563, - "learning_rate": 9.031430404105196e-06, - "loss": 0.0005, + "epoch": 0.18218623481781376, + "grad_norm": 0.030709806829690933, + "learning_rate": 1.6356275303643723e-05, + "loss": 0.0025, "step": 1710 }, { - "epoch": 1.0968569595894804, - "eval_loss": 0.0038041335064917803, - "eval_runtime": 2.7701, - "eval_samples_per_second": 4500.54, - "eval_steps_per_second": 140.789, + "epoch": 0.18218623481781376, + "eval_loss": 0.004803914111107588, + "eval_runtime": 35.0555, + "eval_samples_per_second": 4283.895, + "eval_steps_per_second": 66.951, "step": 1710 }, { - "epoch": 1.1032713277742143, - "grad_norm": 0.015901437029242516, - "learning_rate": 8.967286722257858e-06, - "loss": 0.0005, + "epoch": 0.1832516513956957, + "grad_norm": 4.421119213104248, + "learning_rate": 1.6334966972086085e-05, + "loss": 0.0183, "step": 1720 }, { - "epoch": 1.1032713277742143, - "eval_loss": 0.0038053819444030523, - "eval_runtime": 2.8155, - "eval_samples_per_second": 4428.021, - "eval_steps_per_second": 138.52, + "epoch": 0.1832516513956957, + "eval_loss": 0.004751955159008503, + "eval_runtime": 35.0418, + "eval_samples_per_second": 4285.571, + "eval_steps_per_second": 66.977, "step": 1720 }, { - "epoch": 1.109685695958948, - "grad_norm": 0.01117046270519495, - "learning_rate": 8.90314304041052e-06, - "loss": 0.0006, + "epoch": 0.18431706797357766, + "grad_norm": 0.009466009214520454, + "learning_rate": 1.6313658640528447e-05, + "loss": 0.0035, "step": 1730 }, { - "epoch": 1.109685695958948, - "eval_loss": 0.0038178153336048126, - "eval_runtime": 2.7607, - "eval_samples_per_second": 4515.815, - "eval_steps_per_second": 141.266, + "epoch": 0.18431706797357766, + "eval_loss": 0.0049493880942463875, + "eval_runtime": 35.0694, + "eval_samples_per_second": 4282.201, + "eval_steps_per_second": 66.925, "step": 1730 }, { - "epoch": 1.1161000641436818, - "grad_norm": 0.0106582622975111, - "learning_rate": 8.838999358563182e-06, - "loss": 0.0005, + "epoch": 0.18538248455145961, + "grad_norm": 0.259084016084671, + "learning_rate": 1.629235030897081e-05, + "loss": 0.0062, "step": 1740 }, { - "epoch": 1.1161000641436818, - "eval_loss": 0.0038231906946748495, - "eval_runtime": 2.7615, - "eval_samples_per_second": 4514.606, - "eval_steps_per_second": 141.229, + "epoch": 0.18538248455145961, + "eval_loss": 0.00492563983425498, + "eval_runtime": 35.0548, + "eval_samples_per_second": 4283.98, + "eval_steps_per_second": 66.952, "step": 1740 }, { - "epoch": 1.1225144323284157, - "grad_norm": 0.011909229680895805, - "learning_rate": 8.774855676715845e-06, - "loss": 0.0005, + "epoch": 0.18644790112934156, + "grad_norm": 0.07872737944126129, + "learning_rate": 1.6271041977413167e-05, + "loss": 0.0044, "step": 1750 }, { - "epoch": 1.1225144323284157, - "eval_loss": 0.0038271723315119743, - "eval_runtime": 2.7512, - "eval_samples_per_second": 4531.54, - "eval_steps_per_second": 141.758, + "epoch": 0.18644790112934156, + "eval_loss": 0.004815262276679277, + "eval_runtime": 35.0635, + "eval_samples_per_second": 4282.919, + "eval_steps_per_second": 66.936, "step": 1750 }, { - "epoch": 1.1289288005131495, - "grad_norm": 0.011566118337213993, - "learning_rate": 8.710711994868506e-06, - "loss": 0.0006, + "epoch": 0.18751331770722351, + "grad_norm": 0.008825350552797318, + "learning_rate": 1.624973364585553e-05, + "loss": 0.001, "step": 1760 }, { - "epoch": 1.1289288005131495, - "eval_loss": 0.0038472446613013744, - "eval_runtime": 2.7699, - "eval_samples_per_second": 4500.962, - "eval_steps_per_second": 140.802, + "epoch": 0.18751331770722351, + "eval_loss": 0.00481291301548481, + "eval_runtime": 35.0737, + "eval_samples_per_second": 4281.672, + "eval_steps_per_second": 66.916, "step": 1760 }, { - "epoch": 1.1353431686978832, - "grad_norm": 0.011672618798911572, - "learning_rate": 8.646568313021168e-06, - "loss": 0.0006, + "epoch": 0.18857873428510546, + "grad_norm": 0.03307470306754112, + "learning_rate": 1.622842531429789e-05, + "loss": 0.0128, "step": 1770 }, { - "epoch": 1.1353431686978832, - "eval_loss": 0.0039085946045815945, - "eval_runtime": 2.7447, - "eval_samples_per_second": 4542.214, - "eval_steps_per_second": 142.092, + "epoch": 0.18857873428510546, + "eval_loss": 0.004740755073726177, + "eval_runtime": 35.0789, + "eval_samples_per_second": 4281.038, + "eval_steps_per_second": 66.906, "step": 1770 }, { - "epoch": 1.1417575368826172, - "grad_norm": 0.011934245936572552, - "learning_rate": 8.582424631173831e-06, - "loss": 0.0027, + "epoch": 0.18964415086298742, + "grad_norm": 0.058342017233371735, + "learning_rate": 1.6207116982740253e-05, + "loss": 0.0021, "step": 1780 }, { - "epoch": 1.1417575368826172, - "eval_loss": 0.003827283624559641, - "eval_runtime": 2.7992, - "eval_samples_per_second": 4453.718, - "eval_steps_per_second": 139.324, + "epoch": 0.18964415086298742, + "eval_loss": 0.00447422219440341, + "eval_runtime": 35.0637, + "eval_samples_per_second": 4282.894, + "eval_steps_per_second": 66.935, "step": 1780 }, { - "epoch": 1.148171905067351, - "grad_norm": 0.011731648817658424, - "learning_rate": 8.518280949326492e-06, - "loss": 0.021, + "epoch": 0.1907095674408694, + "grad_norm": 0.5684050917625427, + "learning_rate": 1.618580865118261e-05, + "loss": 0.0051, "step": 1790 }, { - "epoch": 1.148171905067351, - "eval_loss": 0.00375727703794837, - "eval_runtime": 2.7483, - "eval_samples_per_second": 4536.229, - "eval_steps_per_second": 141.905, + "epoch": 0.1907095674408694, + "eval_loss": 0.004538466222584248, + "eval_runtime": 35.0598, + "eval_samples_per_second": 4283.37, + "eval_steps_per_second": 66.943, "step": 1790 }, { - "epoch": 1.1545862732520846, - "grad_norm": 0.015154370106756687, - "learning_rate": 8.454137267479155e-06, - "loss": 0.0006, + "epoch": 0.19177498401875134, + "grad_norm": 4.642019748687744, + "learning_rate": 1.6164500319624973e-05, + "loss": 0.0041, "step": 1800 }, { - "epoch": 1.1545862732520846, - "eval_loss": 0.003766178386285901, - "eval_runtime": 2.7826, - "eval_samples_per_second": 4480.277, - "eval_steps_per_second": 140.155, + "epoch": 0.19177498401875134, + "eval_loss": 0.004466844256967306, + "eval_runtime": 35.066, + "eval_samples_per_second": 4282.609, + "eval_steps_per_second": 66.931, "step": 1800 }, { - "epoch": 1.1610006414368184, - "grad_norm": 0.017620213329792023, - "learning_rate": 8.389993585631815e-06, - "loss": 0.0169, + "epoch": 0.1928404005966333, + "grad_norm": 0.08696554601192474, + "learning_rate": 1.6143191988067335e-05, + "loss": 0.001, "step": 1810 }, { - "epoch": 1.1610006414368184, - "eval_loss": 0.0037458380684256554, - "eval_runtime": 2.7342, - "eval_samples_per_second": 4559.694, - "eval_steps_per_second": 142.639, + "epoch": 0.1928404005966333, + "eval_loss": 0.004406987689435482, + "eval_runtime": 35.0911, + "eval_samples_per_second": 4279.541, + "eval_steps_per_second": 66.883, "step": 1810 }, { - "epoch": 1.1674150096215523, - "grad_norm": 0.014909962192177773, - "learning_rate": 8.325849903784478e-06, - "loss": 0.0006, + "epoch": 0.19390581717451524, + "grad_norm": 2.991973876953125, + "learning_rate": 1.6121883656509697e-05, + "loss": 0.0039, "step": 1820 }, { - "epoch": 1.1674150096215523, - "eval_loss": 0.0036850275937467813, - "eval_runtime": 2.7781, - "eval_samples_per_second": 4487.627, - "eval_steps_per_second": 140.385, + "epoch": 0.19390581717451524, + "eval_loss": 0.004455452784895897, + "eval_runtime": 35.0798, + "eval_samples_per_second": 4280.926, + "eval_steps_per_second": 66.905, "step": 1820 }, { - "epoch": 1.173829377806286, - "grad_norm": 0.03852078691124916, - "learning_rate": 8.26170622193714e-06, + "epoch": 0.1949712337523972, + "grad_norm": 0.005264118313789368, + "learning_rate": 1.6100575324952055e-05, "loss": 0.0006, "step": 1830 }, { - "epoch": 1.173829377806286, - "eval_loss": 0.0036811623722314835, - "eval_runtime": 2.7264, - "eval_samples_per_second": 4572.666, - "eval_steps_per_second": 143.045, + "epoch": 0.1949712337523972, + "eval_loss": 0.004717789124697447, + "eval_runtime": 35.0708, + "eval_samples_per_second": 4282.025, + "eval_steps_per_second": 66.922, "step": 1830 }, { - "epoch": 1.18024374599102, - "grad_norm": 0.013608761131763458, - "learning_rate": 8.197562540089802e-06, + "epoch": 0.19603665033027914, + "grad_norm": 0.03296063467860222, + "learning_rate": 1.6079266993394417e-05, "loss": 0.0006, "step": 1840 }, { - "epoch": 1.18024374599102, - "eval_loss": 0.0036842951085418463, - "eval_runtime": 2.7401, - "eval_samples_per_second": 4549.854, - "eval_steps_per_second": 142.331, + "epoch": 0.19603665033027914, + "eval_loss": 0.004698717035353184, + "eval_runtime": 35.1299, + "eval_samples_per_second": 4274.824, + "eval_steps_per_second": 66.809, "step": 1840 }, { - "epoch": 1.1866581141757537, - "grad_norm": 0.01463257521390915, - "learning_rate": 8.133418858242463e-06, - "loss": 0.0006, + "epoch": 0.1971020669081611, + "grad_norm": 0.0412713959813118, + "learning_rate": 1.605795866183678e-05, + "loss": 0.0032, "step": 1850 }, { - "epoch": 1.1866581141757537, - "eval_loss": 0.0036834382917732, - "eval_runtime": 2.7684, - "eval_samples_per_second": 4503.347, - "eval_steps_per_second": 140.876, + "epoch": 0.1971020669081611, + "eval_loss": 0.0045752511359751225, + "eval_runtime": 35.0645, + "eval_samples_per_second": 4282.79, + "eval_steps_per_second": 66.934, "step": 1850 }, { - "epoch": 1.1930724823604875, - "grad_norm": 0.015022198669612408, - "learning_rate": 8.069275176395125e-06, - "loss": 0.0006, + "epoch": 0.19816748348604304, + "grad_norm": 0.03118061274290085, + "learning_rate": 1.603665033027914e-05, + "loss": 0.0009, "step": 1860 }, { - "epoch": 1.1930724823604875, - "eval_loss": 0.003694625571370125, - "eval_runtime": 2.7501, - "eval_samples_per_second": 4533.313, - "eval_steps_per_second": 141.814, + "epoch": 0.19816748348604304, + "eval_loss": 0.0048514497466385365, + "eval_runtime": 35.0481, + "eval_samples_per_second": 4284.795, + "eval_steps_per_second": 66.965, "step": 1860 }, { - "epoch": 1.1994868505452212, - "grad_norm": 0.011660662479698658, - "learning_rate": 8.005131494547788e-06, - "loss": 0.0006, + "epoch": 0.199232900063925, + "grad_norm": 0.778282105922699, + "learning_rate": 1.6015341998721503e-05, + "loss": 0.0098, "step": 1870 }, { - "epoch": 1.1994868505452212, - "eval_loss": 0.003708133939653635, - "eval_runtime": 2.7645, - "eval_samples_per_second": 4509.744, - "eval_steps_per_second": 141.076, + "epoch": 0.199232900063925, + "eval_loss": 0.00497164111584425, + "eval_runtime": 35.0663, + "eval_samples_per_second": 4282.574, + "eval_steps_per_second": 66.93, "step": 1870 }, { - "epoch": 1.2059012187299551, - "grad_norm": 0.01334298774600029, - "learning_rate": 7.940987812700449e-06, + "epoch": 0.20029831664180694, + "grad_norm": 0.007694170344620943, + "learning_rate": 1.599403366716386e-05, "loss": 0.0005, "step": 1880 }, { - "epoch": 1.2059012187299551, - "eval_loss": 0.003715616650879383, - "eval_runtime": 2.7401, - "eval_samples_per_second": 4549.905, - "eval_steps_per_second": 142.333, + "epoch": 0.20029831664180694, + "eval_loss": 0.004604881163686514, + "eval_runtime": 35.0134, + "eval_samples_per_second": 4289.046, + "eval_steps_per_second": 67.032, "step": 1880 }, { - "epoch": 1.2123155869146889, - "grad_norm": 0.012951839715242386, - "learning_rate": 7.876844130853112e-06, - "loss": 0.0006, + "epoch": 0.2013637332196889, + "grad_norm": 0.9302027821540833, + "learning_rate": 1.5972725335606223e-05, + "loss": 0.0048, "step": 1890 }, { - "epoch": 1.2123155869146889, - "eval_loss": 0.0037382924929261208, - "eval_runtime": 2.7682, - "eval_samples_per_second": 4503.721, - "eval_steps_per_second": 140.888, + "epoch": 0.2013637332196889, + "eval_loss": 0.004474525805562735, + "eval_runtime": 35.0823, + "eval_samples_per_second": 4280.622, + "eval_steps_per_second": 66.9, "step": 1890 }, { - "epoch": 1.2187299550994226, - "grad_norm": 0.009976466186344624, - "learning_rate": 7.812700449005774e-06, - "loss": 0.0005, + "epoch": 0.20242914979757085, + "grad_norm": 0.00496167317032814, + "learning_rate": 1.5951417004048585e-05, + "loss": 0.0038, "step": 1900 }, { - "epoch": 1.2187299550994226, - "eval_loss": 0.0037695877254009247, - "eval_runtime": 2.7309, - "eval_samples_per_second": 4565.132, - "eval_steps_per_second": 142.809, + "epoch": 0.20242914979757085, + "eval_loss": 0.004204958211630583, + "eval_runtime": 35.0994, + "eval_samples_per_second": 4278.533, + "eval_steps_per_second": 66.867, "step": 1900 }, { - "epoch": 1.2251443232841566, - "grad_norm": 0.012881353497505188, - "learning_rate": 7.748556767158437e-06, - "loss": 0.0005, + "epoch": 0.2034945663754528, + "grad_norm": 0.24391968548297882, + "learning_rate": 1.5930108672490947e-05, + "loss": 0.0014, "step": 1910 }, { - "epoch": 1.2251443232841566, - "eval_loss": 0.003783053020015359, - "eval_runtime": 2.5865, - "eval_samples_per_second": 4820.049, - "eval_steps_per_second": 150.784, + "epoch": 0.2034945663754528, + "eval_loss": 0.004156021401286125, + "eval_runtime": 35.055, + "eval_samples_per_second": 4283.949, + "eval_steps_per_second": 66.952, "step": 1910 }, { - "epoch": 1.2315586914688903, - "grad_norm": 0.01235857605934143, - "learning_rate": 7.684413085311098e-06, - "loss": 0.0005, + "epoch": 0.20455998295333475, + "grad_norm": 0.21614207327365875, + "learning_rate": 1.5908800340933305e-05, + "loss": 0.0038, "step": 1920 }, { - "epoch": 1.2315586914688903, - "eval_loss": 0.0037915727589279413, - "eval_runtime": 2.4029, - "eval_samples_per_second": 5188.243, - "eval_steps_per_second": 162.302, + "epoch": 0.20455998295333475, + "eval_loss": 0.004162695724517107, + "eval_runtime": 35.0833, + "eval_samples_per_second": 4280.504, + "eval_steps_per_second": 66.898, "step": 1920 }, { - "epoch": 1.237973059653624, - "grad_norm": 0.01356592122465372, - "learning_rate": 7.620269403463759e-06, - "loss": 0.0006, + "epoch": 0.2056253995312167, + "grad_norm": 0.08646874129772186, + "learning_rate": 1.5887492009375667e-05, + "loss": 0.0007, "step": 1930 }, { - "epoch": 1.237973059653624, - "eval_loss": 0.003795850556343794, - "eval_runtime": 2.4046, - "eval_samples_per_second": 5184.593, - "eval_steps_per_second": 162.187, + "epoch": 0.2056253995312167, + "eval_loss": 0.00422044238075614, + "eval_runtime": 35.0779, + "eval_samples_per_second": 4281.163, + "eval_steps_per_second": 66.908, "step": 1930 }, { - "epoch": 1.244387427838358, - "grad_norm": 0.01199817843735218, - "learning_rate": 7.556125721616422e-06, - "loss": 0.0005, + "epoch": 0.20669081610909865, + "grad_norm": 0.004879661835730076, + "learning_rate": 1.586618367781803e-05, + "loss": 0.0123, "step": 1940 }, { - "epoch": 1.244387427838358, - "eval_loss": 0.003800132079049945, - "eval_runtime": 2.4031, - "eval_samples_per_second": 5187.901, - "eval_steps_per_second": 162.291, + "epoch": 0.20669081610909865, + "eval_loss": 0.004559625405818224, + "eval_runtime": 35.0448, + "eval_samples_per_second": 4285.202, + "eval_steps_per_second": 66.971, "step": 1940 }, { - "epoch": 1.2508017960230917, - "grad_norm": 0.00958781223744154, - "learning_rate": 7.491982039769083e-06, - "loss": 0.0005, + "epoch": 0.2077562326869806, + "grad_norm": 0.15028773248195648, + "learning_rate": 1.584487534626039e-05, + "loss": 0.0124, "step": 1950 }, { - "epoch": 1.2508017960230917, - "eval_loss": 0.0038033805321902037, - "eval_runtime": 2.4027, - "eval_samples_per_second": 5188.694, - "eval_steps_per_second": 162.316, + "epoch": 0.2077562326869806, + "eval_loss": 0.005765740759670734, + "eval_runtime": 35.0383, + "eval_samples_per_second": 4285.999, + "eval_steps_per_second": 66.984, "step": 1950 }, { - "epoch": 1.2572161642078254, - "grad_norm": 0.011594674549996853, - "learning_rate": 7.427838357921745e-06, - "loss": 0.0005, + "epoch": 0.20882164926486257, + "grad_norm": 0.004790129140019417, + "learning_rate": 1.582356701470275e-05, + "loss": 0.0089, "step": 1960 }, { - "epoch": 1.2572161642078254, - "eval_loss": 0.003806302323937416, - "eval_runtime": 2.39, - "eval_samples_per_second": 5216.258, - "eval_steps_per_second": 163.178, + "epoch": 0.20882164926486257, + "eval_loss": 0.004500931594520807, + "eval_runtime": 35.0794, + "eval_samples_per_second": 4280.971, + "eval_steps_per_second": 66.905, "step": 1960 }, { - "epoch": 1.2636305323925594, - "grad_norm": 0.012927345000207424, - "learning_rate": 7.363694676074408e-06, - "loss": 0.0005, + "epoch": 0.20988706584274452, + "grad_norm": 3.749185562133789, + "learning_rate": 1.580225868314511e-05, + "loss": 0.0115, "step": 1970 }, { - "epoch": 1.2636305323925594, - "eval_loss": 0.0038109412416815758, - "eval_runtime": 2.3902, - "eval_samples_per_second": 5215.912, - "eval_steps_per_second": 163.167, + "epoch": 0.20988706584274452, + "eval_loss": 0.004156744107604027, + "eval_runtime": 35.0591, + "eval_samples_per_second": 4283.456, + "eval_steps_per_second": 66.944, "step": 1970 }, { - "epoch": 1.2700449005772931, - "grad_norm": 0.010677590034902096, - "learning_rate": 7.2995509942270695e-06, - "loss": 0.0005, + "epoch": 0.21095248242062647, + "grad_norm": 0.009407439269125462, + "learning_rate": 1.5780950351587473e-05, + "loss": 0.001, "step": 1980 }, { - "epoch": 1.2700449005772931, - "eval_loss": 0.0038150884211063385, - "eval_runtime": 2.4019, - "eval_samples_per_second": 5190.418, - "eval_steps_per_second": 162.37, + "epoch": 0.21095248242062647, + "eval_loss": 0.005346408113837242, + "eval_runtime": 35.0873, + "eval_samples_per_second": 4280.009, + "eval_steps_per_second": 66.89, "step": 1980 }, { - "epoch": 1.2764592687620269, - "grad_norm": 0.009424679912626743, - "learning_rate": 7.2354073123797304e-06, - "loss": 0.0209, + "epoch": 0.21201789899850843, + "grad_norm": 4.62002420425415, + "learning_rate": 1.5759642020029834e-05, + "loss": 0.0068, "step": 1990 }, { - "epoch": 1.2764592687620269, - "eval_loss": 0.0037959227338433266, - "eval_runtime": 2.4118, - "eval_samples_per_second": 5169.261, - "eval_steps_per_second": 161.708, + "epoch": 0.21201789899850843, + "eval_loss": 0.006435367278754711, + "eval_runtime": 35.1066, + "eval_samples_per_second": 4277.661, + "eval_steps_per_second": 66.854, "step": 1990 }, { - "epoch": 1.2828736369467608, - "grad_norm": 0.014829086139798164, - "learning_rate": 7.171263630532393e-06, - "loss": 0.0006, + "epoch": 0.21308331557639038, + "grad_norm": 0.005438864231109619, + "learning_rate": 1.5738333688472193e-05, + "loss": 0.0027, "step": 2000 }, { - "epoch": 1.2828736369467608, - "eval_loss": 0.00383403105661273, - "eval_runtime": 2.4131, - "eval_samples_per_second": 5166.39, - "eval_steps_per_second": 161.618, + "epoch": 0.21308331557639038, + "eval_loss": 0.00453655980527401, + "eval_runtime": 35.0641, + "eval_samples_per_second": 4282.841, + "eval_steps_per_second": 66.935, "step": 2000 }, { - "epoch": 1.2892880051314946, - "grad_norm": 0.01788959838449955, - "learning_rate": 7.107119948685055e-06, - "loss": 0.0005, + "epoch": 0.21414873215427233, + "grad_norm": 0.05276772007346153, + "learning_rate": 1.5717025356914555e-05, + "loss": 0.0049, "step": 2010 }, { - "epoch": 1.2892880051314946, - "eval_loss": 0.0038744392804801464, - "eval_runtime": 2.4295, - "eval_samples_per_second": 5131.522, - "eval_steps_per_second": 160.527, + "epoch": 0.21414873215427233, + "eval_loss": 0.004296323750168085, + "eval_runtime": 35.0599, + "eval_samples_per_second": 4283.356, + "eval_steps_per_second": 66.943, "step": 2010 }, { - "epoch": 1.2957023733162283, - "grad_norm": 0.013126869685947895, - "learning_rate": 7.042976266837717e-06, - "loss": 0.0006, + "epoch": 0.21521414873215428, + "grad_norm": 0.0866408571600914, + "learning_rate": 1.5695717025356917e-05, + "loss": 0.0005, "step": 2020 }, { - "epoch": 1.2957023733162283, - "eval_loss": 0.0038970729801803827, - "eval_runtime": 2.4135, - "eval_samples_per_second": 5165.481, - "eval_steps_per_second": 161.59, + "epoch": 0.21521414873215428, + "eval_loss": 0.0043297079391777515, + "eval_runtime": 35.0108, + "eval_samples_per_second": 4289.364, + "eval_steps_per_second": 67.036, "step": 2020 }, { - "epoch": 1.3021167415009622, - "grad_norm": 0.010365926660597324, - "learning_rate": 6.978832584990379e-06, - "loss": 0.0005, + "epoch": 0.21627956531003623, + "grad_norm": 0.03794199973344803, + "learning_rate": 1.567440869379928e-05, + "loss": 0.0007, "step": 2030 }, { - "epoch": 1.3021167415009622, - "eval_loss": 0.00391175365075469, - "eval_runtime": 2.4159, - "eval_samples_per_second": 5160.436, - "eval_steps_per_second": 161.432, + "epoch": 0.21627956531003623, + "eval_loss": 0.004399747122079134, + "eval_runtime": 35.043, + "eval_samples_per_second": 4285.424, + "eval_steps_per_second": 66.975, "step": 2030 }, { - "epoch": 1.308531109685696, - "grad_norm": 0.014084501191973686, - "learning_rate": 6.914688903143041e-06, - "loss": 0.0005, + "epoch": 0.21734498188791818, + "grad_norm": 0.1419890820980072, + "learning_rate": 1.5653100362241637e-05, + "loss": 0.011, "step": 2040 }, { - "epoch": 1.308531109685696, - "eval_loss": 0.003916487097740173, - "eval_runtime": 2.412, - "eval_samples_per_second": 5168.686, - "eval_steps_per_second": 161.69, + "epoch": 0.21734498188791818, + "eval_loss": 0.004798985552042723, + "eval_runtime": 35.2407, + "eval_samples_per_second": 4261.377, + "eval_steps_per_second": 66.599, "step": 2040 }, { - "epoch": 1.3149454778704297, - "grad_norm": 0.009805840440094471, - "learning_rate": 6.850545221295702e-06, - "loss": 0.0005, + "epoch": 0.21841039846580013, + "grad_norm": 0.004484011325985193, + "learning_rate": 1.5631792030684e-05, + "loss": 0.0051, "step": 2050 }, { - "epoch": 1.3149454778704297, - "eval_loss": 0.003910783212631941, - "eval_runtime": 2.5001, - "eval_samples_per_second": 4986.515, - "eval_steps_per_second": 155.991, + "epoch": 0.21841039846580013, + "eval_loss": 0.0046847849152982235, + "eval_runtime": 35.045, + "eval_samples_per_second": 4285.182, + "eval_steps_per_second": 66.971, "step": 2050 }, { - "epoch": 1.3213598460551634, - "grad_norm": 0.0099485469982028, - "learning_rate": 6.786401539448365e-06, - "loss": 0.0005, + "epoch": 0.21947581504368208, + "grad_norm": 0.04429204761981964, + "learning_rate": 1.561048369912636e-05, + "loss": 0.0012, "step": 2060 }, { - "epoch": 1.3213598460551634, - "eval_loss": 0.003903586184605956, - "eval_runtime": 2.7919, - "eval_samples_per_second": 4465.368, - "eval_steps_per_second": 139.688, + "epoch": 0.21947581504368208, + "eval_loss": 0.004873516503721476, + "eval_runtime": 35.0605, + "eval_samples_per_second": 4283.283, + "eval_steps_per_second": 66.941, "step": 2060 }, { - "epoch": 1.3277742142398974, - "grad_norm": 0.01362858060747385, - "learning_rate": 6.7222578576010265e-06, - "loss": 0.0221, + "epoch": 0.22054123162156403, + "grad_norm": 2.6432743072509766, + "learning_rate": 1.5589175367568722e-05, + "loss": 0.0134, "step": 2070 }, { - "epoch": 1.3277742142398974, - "eval_loss": 0.003886124351993203, - "eval_runtime": 2.7347, - "eval_samples_per_second": 4558.826, - "eval_steps_per_second": 142.612, + "epoch": 0.22054123162156403, + "eval_loss": 0.004669446498155594, + "eval_runtime": 35.0459, + "eval_samples_per_second": 4285.071, + "eval_steps_per_second": 66.969, "step": 2070 }, { - "epoch": 1.3341885824246311, - "grad_norm": 0.010772572830319405, - "learning_rate": 6.658114175753689e-06, - "loss": 0.0005, + "epoch": 0.22160664819944598, + "grad_norm": 0.004335370380431414, + "learning_rate": 1.556786703601108e-05, + "loss": 0.0006, "step": 2080 }, { - "epoch": 1.3341885824246311, - "eval_loss": 0.003885190933942795, - "eval_runtime": 2.775, - "eval_samples_per_second": 4492.608, - "eval_steps_per_second": 140.54, + "epoch": 0.22160664819944598, + "eval_loss": 0.0043738046661019325, + "eval_runtime": 35.0814, + "eval_samples_per_second": 4280.727, + "eval_steps_per_second": 66.902, "step": 2080 }, { - "epoch": 1.340602950609365, - "grad_norm": 0.0132959159091115, - "learning_rate": 6.593970493906351e-06, + "epoch": 0.22267206477732793, + "grad_norm": 0.09280374646186829, + "learning_rate": 1.5546558704453443e-05, "loss": 0.0006, "step": 2090 }, { - "epoch": 1.340602950609365, - "eval_loss": 0.0038879828061908484, - "eval_runtime": 2.7484, - "eval_samples_per_second": 4536.089, - "eval_steps_per_second": 141.901, + "epoch": 0.22267206477732793, + "eval_loss": 0.0046281940303742886, + "eval_runtime": 35.0542, + "eval_samples_per_second": 4284.053, + "eval_steps_per_second": 66.953, "step": 2090 }, { - "epoch": 1.3470173187940988, - "grad_norm": 0.028292661532759666, - "learning_rate": 6.529826812059013e-06, - "loss": 0.0006, + "epoch": 0.22373748135520988, + "grad_norm": 0.005173602141439915, + "learning_rate": 1.5525250372895804e-05, + "loss": 0.0012, "step": 2100 }, { - "epoch": 1.3470173187940988, - "eval_loss": 0.0038887602277100086, - "eval_runtime": 2.7721, - "eval_samples_per_second": 4497.362, - "eval_steps_per_second": 140.689, + "epoch": 0.22373748135520988, + "eval_loss": 0.004610604140907526, + "eval_runtime": 35.0581, + "eval_samples_per_second": 4283.58, + "eval_steps_per_second": 66.946, "step": 2100 }, { - "epoch": 1.3534316869788325, - "grad_norm": 0.014790890738368034, - "learning_rate": 6.4656831302116754e-06, - "loss": 0.0006, + "epoch": 0.22480289793309183, + "grad_norm": 2.3304152488708496, + "learning_rate": 1.5503942041338166e-05, + "loss": 0.006, "step": 2110 }, { - "epoch": 1.3534316869788325, - "eval_loss": 0.003893056884407997, - "eval_runtime": 2.7258, - "eval_samples_per_second": 4573.656, - "eval_steps_per_second": 143.076, + "epoch": 0.22480289793309183, + "eval_loss": 0.004738961812108755, + "eval_runtime": 35.0641, + "eval_samples_per_second": 4282.84, + "eval_steps_per_second": 66.935, "step": 2110 }, { - "epoch": 1.3598460551635663, - "grad_norm": 0.018104661256074905, - "learning_rate": 6.401539448364336e-06, - "loss": 0.0005, + "epoch": 0.22586831451097378, + "grad_norm": 0.004037824459373951, + "learning_rate": 1.5482633709780525e-05, + "loss": 0.0004, "step": 2120 }, { - "epoch": 1.3598460551635663, - "eval_loss": 0.0038974089547991753, - "eval_runtime": 2.7467, - "eval_samples_per_second": 4538.939, - "eval_steps_per_second": 141.99, + "epoch": 0.22586831451097378, + "eval_loss": 0.004827072378247976, + "eval_runtime": 35.0285, + "eval_samples_per_second": 4287.2, + "eval_steps_per_second": 67.003, "step": 2120 }, { - "epoch": 1.3662604233483002, - "grad_norm": 0.011067189276218414, - "learning_rate": 6.337395766516998e-06, - "loss": 0.0005, + "epoch": 0.22693373108885576, + "grad_norm": 0.018360449001193047, + "learning_rate": 1.5461325378222887e-05, + "loss": 0.0176, "step": 2130 }, { - "epoch": 1.3662604233483002, - "eval_loss": 0.00390303460881114, - "eval_runtime": 2.7776, - "eval_samples_per_second": 4488.473, - "eval_steps_per_second": 140.411, + "epoch": 0.22693373108885576, + "eval_loss": 0.004639809485524893, + "eval_runtime": 35.0055, + "eval_samples_per_second": 4290.006, + "eval_steps_per_second": 67.047, "step": 2130 }, { - "epoch": 1.372674791533034, - "grad_norm": 0.022579031065106392, - "learning_rate": 6.273252084669661e-06, - "loss": 0.0005, + "epoch": 0.2279991476667377, + "grad_norm": 0.14667311310768127, + "learning_rate": 1.544001704666525e-05, + "loss": 0.0166, "step": 2140 }, { - "epoch": 1.372674791533034, - "eval_loss": 0.003907787147909403, - "eval_runtime": 2.7255, - "eval_samples_per_second": 4574.164, - "eval_steps_per_second": 143.092, + "epoch": 0.2279991476667377, + "eval_loss": 0.004509914666414261, + "eval_runtime": 35.0516, + "eval_samples_per_second": 4284.375, + "eval_steps_per_second": 66.959, "step": 2140 }, { - "epoch": 1.379089159717768, - "grad_norm": 0.009309990331530571, - "learning_rate": 6.209108402822323e-06, - "loss": 0.0005, + "epoch": 0.22906456424461966, + "grad_norm": 0.1163237988948822, + "learning_rate": 1.541870871510761e-05, + "loss": 0.0022, "step": 2150 }, { - "epoch": 1.379089159717768, - "eval_loss": 0.003909664694219828, - "eval_runtime": 2.7365, - "eval_samples_per_second": 4555.883, - "eval_steps_per_second": 142.52, + "epoch": 0.22906456424461966, + "eval_loss": 0.004260140936821699, + "eval_runtime": 35.0677, + "eval_samples_per_second": 4282.404, + "eval_steps_per_second": 66.928, "step": 2150 }, { - "epoch": 1.3855035279025016, - "grad_norm": 0.019414927810430527, - "learning_rate": 6.1449647209749844e-06, - "loss": 0.018, + "epoch": 0.2301299808225016, + "grad_norm": 0.11981041729450226, + "learning_rate": 1.539740038354997e-05, + "loss": 0.0021, "step": 2160 }, { - "epoch": 1.3855035279025016, - "eval_loss": 0.003788945497944951, - "eval_runtime": 2.5986, - "eval_samples_per_second": 4797.494, - "eval_steps_per_second": 150.078, + "epoch": 0.2301299808225016, + "eval_loss": 0.0041451407596468925, + "eval_runtime": 35.0698, + "eval_samples_per_second": 4282.144, + "eval_steps_per_second": 66.924, "step": 2160 }, { - "epoch": 1.3919178960872354, - "grad_norm": 0.01467384584248066, - "learning_rate": 6.080821039127647e-06, - "loss": 0.0006, + "epoch": 0.23119539740038356, + "grad_norm": 0.009077006950974464, + "learning_rate": 1.537609205199233e-05, + "loss": 0.0037, "step": 2170 }, { - "epoch": 1.3919178960872354, - "eval_loss": 0.003753120545297861, - "eval_runtime": 2.483, - "eval_samples_per_second": 5020.994, - "eval_steps_per_second": 157.07, + "epoch": 0.23119539740038356, + "eval_loss": 0.00424983911216259, + "eval_runtime": 35.0442, + "eval_samples_per_second": 4285.276, + "eval_steps_per_second": 66.973, "step": 2170 }, { - "epoch": 1.398332264271969, - "grad_norm": 0.012767287902534008, - "learning_rate": 6.016677357280308e-06, - "loss": 0.0006, + "epoch": 0.2322608139782655, + "grad_norm": 0.8403615355491638, + "learning_rate": 1.5354783720434692e-05, + "loss": 0.0086, "step": 2180 }, { - "epoch": 1.398332264271969, - "eval_loss": 0.0037547799292951822, - "eval_runtime": 2.7563, - "eval_samples_per_second": 4523.089, - "eval_steps_per_second": 141.494, + "epoch": 0.2322608139782655, + "eval_loss": 0.004347871523350477, + "eval_runtime": 35.0585, + "eval_samples_per_second": 4283.529, + "eval_steps_per_second": 66.945, "step": 2180 }, { - "epoch": 1.404746632456703, - "grad_norm": 0.015485835261642933, - "learning_rate": 5.95253367543297e-06, - "loss": 0.0005, + "epoch": 0.23332623055614746, + "grad_norm": 0.7031656503677368, + "learning_rate": 1.5333475388877054e-05, + "loss": 0.0087, "step": 2190 }, { - "epoch": 1.404746632456703, - "eval_loss": 0.0037627057172358036, - "eval_runtime": 2.7383, - "eval_samples_per_second": 4552.902, - "eval_steps_per_second": 142.427, + "epoch": 0.23332623055614746, + "eval_loss": 0.004369079601019621, + "eval_runtime": 35.0569, + "eval_samples_per_second": 4283.716, + "eval_steps_per_second": 66.948, "step": 2190 }, { - "epoch": 1.4111610006414368, - "grad_norm": 0.01081483718007803, - "learning_rate": 5.8883899935856325e-06, - "loss": 0.0005, + "epoch": 0.2343916471340294, + "grad_norm": 0.004467003047466278, + "learning_rate": 1.5312167057319413e-05, + "loss": 0.0013, "step": 2200 }, { - "epoch": 1.4111610006414368, - "eval_loss": 0.0037731672637164593, - "eval_runtime": 2.7497, - "eval_samples_per_second": 4533.983, - "eval_steps_per_second": 141.835, + "epoch": 0.2343916471340294, + "eval_loss": 0.004685032181441784, + "eval_runtime": 35.0162, + "eval_samples_per_second": 4288.697, + "eval_steps_per_second": 67.026, "step": 2200 }, { - "epoch": 1.4175753688261707, - "grad_norm": 0.009181569330394268, - "learning_rate": 5.824246311738294e-06, - "loss": 0.0213, + "epoch": 0.23545706371191136, + "grad_norm": 0.3929450809955597, + "learning_rate": 1.5290858725761775e-05, + "loss": 0.0034, "step": 2210 }, { - "epoch": 1.4175753688261707, - "eval_loss": 0.003773350967094302, - "eval_runtime": 2.7961, - "eval_samples_per_second": 4458.641, - "eval_steps_per_second": 139.478, + "epoch": 0.23545706371191136, + "eval_loss": 0.0044579585082829, + "eval_runtime": 35.0558, + "eval_samples_per_second": 4283.855, + "eval_steps_per_second": 66.95, "step": 2210 }, { - "epoch": 1.4239897370109045, - "grad_norm": 0.01457177009433508, - "learning_rate": 5.760102629890956e-06, - "loss": 0.0006, + "epoch": 0.2365224802897933, + "grad_norm": 0.004594275262206793, + "learning_rate": 1.5269550394204136e-05, + "loss": 0.0065, "step": 2220 }, { - "epoch": 1.4239897370109045, - "eval_loss": 0.0037766669411212206, - "eval_runtime": 2.7373, - "eval_samples_per_second": 4554.509, - "eval_steps_per_second": 142.477, + "epoch": 0.2365224802897933, + "eval_loss": 0.005013572052121162, + "eval_runtime": 35.0748, + "eval_samples_per_second": 4281.54, + "eval_steps_per_second": 66.914, "step": 2220 }, { - "epoch": 1.4304041051956382, - "grad_norm": 0.017023414373397827, - "learning_rate": 5.695958948043619e-06, - "loss": 0.0005, + "epoch": 0.23758789686767526, + "grad_norm": 0.0050141457468271255, + "learning_rate": 1.5248242062646496e-05, + "loss": 0.0054, "step": 2230 }, { - "epoch": 1.4304041051956382, - "eval_loss": 0.003780537284910679, - "eval_runtime": 2.7984, - "eval_samples_per_second": 4455.052, - "eval_steps_per_second": 139.366, + "epoch": 0.23758789686767526, + "eval_loss": 0.004857253283262253, + "eval_runtime": 35.057, + "eval_samples_per_second": 4283.714, + "eval_steps_per_second": 66.948, "step": 2230 }, { - "epoch": 1.436818473380372, - "grad_norm": 0.024216625839471817, - "learning_rate": 5.63181526619628e-06, - "loss": 0.0006, + "epoch": 0.2386533134455572, + "grad_norm": 0.041468288749456406, + "learning_rate": 1.5226933731088858e-05, + "loss": 0.0007, "step": 2240 }, { - "epoch": 1.436818473380372, - "eval_loss": 0.0037873839028179646, - "eval_runtime": 2.7322, - "eval_samples_per_second": 4562.914, - "eval_steps_per_second": 142.74, + "epoch": 0.2386533134455572, + "eval_loss": 0.004677619785070419, + "eval_runtime": 35.0514, + "eval_samples_per_second": 4284.389, + "eval_steps_per_second": 66.959, "step": 2240 }, { - "epoch": 1.443232841565106, - "grad_norm": 0.013345190323889256, - "learning_rate": 5.567671584348942e-06, - "loss": 0.0005, + "epoch": 0.23971873002343916, + "grad_norm": 0.004301860462874174, + "learning_rate": 1.5205625399531218e-05, + "loss": 0.0009, "step": 2250 }, { - "epoch": 1.443232841565106, - "eval_loss": 0.0037942323833703995, - "eval_runtime": 2.7887, - "eval_samples_per_second": 4470.565, - "eval_steps_per_second": 139.851, + "epoch": 0.23971873002343916, + "eval_loss": 0.004459399729967117, + "eval_runtime": 35.0712, + "eval_samples_per_second": 4281.976, + "eval_steps_per_second": 66.921, "step": 2250 }, { - "epoch": 1.4496472097498396, - "grad_norm": 0.015292412601411343, - "learning_rate": 5.503527902501604e-06, - "loss": 0.0006, + "epoch": 0.2407841466013211, + "grad_norm": 0.38491347432136536, + "learning_rate": 1.518431706797358e-05, + "loss": 0.0018, "step": 2260 }, { - "epoch": 1.4496472097498396, - "eval_loss": 0.0038015488535165787, - "eval_runtime": 2.726, - "eval_samples_per_second": 4573.445, - "eval_steps_per_second": 143.069, + "epoch": 0.2407841466013211, + "eval_loss": 0.004341489169746637, + "eval_runtime": 35.09, + "eval_samples_per_second": 4279.681, + "eval_steps_per_second": 66.885, "step": 2260 }, { - "epoch": 1.4560615779345736, - "grad_norm": 0.008000009693205357, - "learning_rate": 5.439384220654266e-06, - "loss": 0.0005, + "epoch": 0.24184956317920306, + "grad_norm": 0.0182588379830122, + "learning_rate": 1.516300873641594e-05, + "loss": 0.0049, "step": 2270 }, { - "epoch": 1.4560615779345736, - "eval_loss": 0.003808696521446109, - "eval_runtime": 2.7534, - "eval_samples_per_second": 4527.914, - "eval_steps_per_second": 141.645, + "epoch": 0.24184956317920306, + "eval_loss": 0.00429992750287056, + "eval_runtime": 35.0536, + "eval_samples_per_second": 4284.123, + "eval_steps_per_second": 66.955, "step": 2270 }, { - "epoch": 1.4624759461193073, - "grad_norm": 0.011347589083015919, - "learning_rate": 5.375240538806929e-06, - "loss": 0.0308, + "epoch": 0.242914979757085, + "grad_norm": 0.0038155666552484035, + "learning_rate": 1.5141700404858302e-05, + "loss": 0.0012, "step": 2280 }, { - "epoch": 1.4624759461193073, - "eval_loss": 0.003851969027891755, - "eval_runtime": 2.5855, - "eval_samples_per_second": 4821.875, - "eval_steps_per_second": 150.841, + "epoch": 0.242914979757085, + "eval_loss": 0.0042260088957846165, + "eval_runtime": 35.0614, + "eval_samples_per_second": 4283.169, + "eval_steps_per_second": 66.94, "step": 2280 }, { - "epoch": 1.468890314304041, - "grad_norm": 0.017860205844044685, - "learning_rate": 5.31109685695959e-06, - "loss": 0.0006, + "epoch": 0.24398039633496696, + "grad_norm": 0.003924189601093531, + "learning_rate": 1.5120392073300662e-05, + "loss": 0.0077, "step": 2290 }, { - "epoch": 1.468890314304041, - "eval_loss": 0.00419242400676012, - "eval_runtime": 2.4698, - "eval_samples_per_second": 5047.81, - "eval_steps_per_second": 157.909, + "epoch": 0.24398039633496696, + "eval_loss": 0.004574434366077185, + "eval_runtime": 35.0275, + "eval_samples_per_second": 4287.317, + "eval_steps_per_second": 67.004, "step": 2290 }, { - "epoch": 1.4753046824887748, - "grad_norm": 0.014923288486897945, - "learning_rate": 5.246953175112251e-06, - "loss": 0.0005, + "epoch": 0.24504581291284894, + "grad_norm": 0.025482522323727608, + "learning_rate": 1.5099083741743024e-05, + "loss": 0.0071, "step": 2300 }, { - "epoch": 1.4753046824887748, - "eval_loss": 0.004501288756728172, - "eval_runtime": 2.7337, - "eval_samples_per_second": 4560.461, - "eval_steps_per_second": 142.663, + "epoch": 0.24504581291284894, + "eval_loss": 0.005042645614594221, + "eval_runtime": 35.027, + "eval_samples_per_second": 4287.374, + "eval_steps_per_second": 67.005, "step": 2300 }, { - "epoch": 1.4817190506735087, - "grad_norm": 0.05106737092137337, - "learning_rate": 5.182809493264914e-06, - "loss": 0.0006, + "epoch": 0.2461112294907309, + "grad_norm": 0.003832248505204916, + "learning_rate": 1.5077775410185384e-05, + "loss": 0.0026, "step": 2310 }, { - "epoch": 1.4817190506735087, - "eval_loss": 0.0046123480424284935, - "eval_runtime": 2.7714, - "eval_samples_per_second": 4498.443, - "eval_steps_per_second": 140.723, + "epoch": 0.2461112294907309, + "eval_loss": 0.0048546576872467995, + "eval_runtime": 35.0534, + "eval_samples_per_second": 4284.146, + "eval_steps_per_second": 66.955, "step": 2310 }, { - "epoch": 1.4881334188582425, - "grad_norm": 0.018469417467713356, - "learning_rate": 5.118665811417576e-06, - "loss": 0.0006, + "epoch": 0.24717664606861284, + "grad_norm": 0.0844670832157135, + "learning_rate": 1.5056467078627746e-05, + "loss": 0.0047, "step": 2320 }, { - "epoch": 1.4881334188582425, - "eval_loss": 0.004673714749515057, - "eval_runtime": 2.7182, - "eval_samples_per_second": 4586.408, - "eval_steps_per_second": 143.475, + "epoch": 0.24717664606861284, + "eval_loss": 0.00422197300940752, + "eval_runtime": 35.0619, + "eval_samples_per_second": 4283.111, + "eval_steps_per_second": 66.939, "step": 2320 }, { - "epoch": 1.4945477870429762, - "grad_norm": 0.010447741486132145, - "learning_rate": 5.054522129570238e-06, - "loss": 0.0007, + "epoch": 0.2482420626464948, + "grad_norm": 0.039526067674160004, + "learning_rate": 1.5035158747070106e-05, + "loss": 0.0008, "step": 2330 }, { - "epoch": 1.4945477870429762, - "eval_loss": 0.004349403083324432, - "eval_runtime": 2.7507, - "eval_samples_per_second": 4532.368, - "eval_steps_per_second": 141.784, + "epoch": 0.2482420626464948, + "eval_loss": 0.004363663960248232, + "eval_runtime": 35.0057, + "eval_samples_per_second": 4289.986, + "eval_steps_per_second": 67.046, "step": 2330 }, { - "epoch": 1.5009621552277101, - "grad_norm": 0.012541081756353378, - "learning_rate": 4.990378447722899e-06, - "loss": 0.0005, + "epoch": 0.24930747922437674, + "grad_norm": 0.036807432770729065, + "learning_rate": 1.5013850415512468e-05, + "loss": 0.005, "step": 2340 }, { - "epoch": 1.5009621552277101, - "eval_loss": 0.004171546548604965, - "eval_runtime": 2.7531, - "eval_samples_per_second": 4528.382, - "eval_steps_per_second": 141.659, + "epoch": 0.24930747922437674, + "eval_loss": 0.00450093112885952, + "eval_runtime": 35.0239, + "eval_samples_per_second": 4287.753, + "eval_steps_per_second": 67.011, "step": 2340 }, { - "epoch": 1.5073765234124439, - "grad_norm": 2.703684091567993, - "learning_rate": 4.926234765875561e-06, - "loss": 0.0142, + "epoch": 0.2503728958022587, + "grad_norm": 0.013508515432476997, + "learning_rate": 1.4992542083954828e-05, + "loss": 0.0011, "step": 2350 }, { - "epoch": 1.5073765234124439, - "eval_loss": 0.004026424139738083, - "eval_runtime": 2.7277, - "eval_samples_per_second": 4570.578, - "eval_steps_per_second": 142.98, + "epoch": 0.2503728958022587, + "eval_loss": 0.004736943170428276, + "eval_runtime": 35.0146, + "eval_samples_per_second": 4288.902, + "eval_steps_per_second": 67.029, "step": 2350 }, { - "epoch": 1.5137908915971776, - "grad_norm": 0.02144004963338375, - "learning_rate": 4.862091084028224e-06, - "loss": 0.0005, + "epoch": 0.25143831238014064, + "grad_norm": 0.13655096292495728, + "learning_rate": 1.497123375239719e-05, + "loss": 0.0006, "step": 2360 }, { - "epoch": 1.5137908915971776, - "eval_loss": 0.0038558936212211847, - "eval_runtime": 2.7816, - "eval_samples_per_second": 4481.995, - "eval_steps_per_second": 140.208, + "epoch": 0.25143831238014064, + "eval_loss": 0.004960217047482729, + "eval_runtime": 35.0563, + "eval_samples_per_second": 4283.792, + "eval_steps_per_second": 66.949, "step": 2360 }, { - "epoch": 1.5202052597819113, - "grad_norm": 0.0292587261646986, - "learning_rate": 4.797947402180886e-06, - "loss": 0.0005, + "epoch": 0.2525037289580226, + "grad_norm": 1.2185442447662354, + "learning_rate": 1.494992542083955e-05, + "loss": 0.0104, "step": 2370 }, { - "epoch": 1.5202052597819113, - "eval_loss": 0.0038230891805142164, - "eval_runtime": 2.7173, - "eval_samples_per_second": 4588.073, - "eval_steps_per_second": 143.527, + "epoch": 0.2525037289580226, + "eval_loss": 0.004864447750151157, + "eval_runtime": 35.0658, + "eval_samples_per_second": 4282.637, + "eval_steps_per_second": 66.931, "step": 2370 }, { - "epoch": 1.5266196279666453, - "grad_norm": 0.007578797172755003, - "learning_rate": 4.7338037203335474e-06, - "loss": 0.0005, + "epoch": 0.25356914553590454, + "grad_norm": 0.036553967744112015, + "learning_rate": 1.4928617089281912e-05, + "loss": 0.0074, "step": 2380 }, { - "epoch": 1.5266196279666453, - "eval_loss": 0.003819518955424428, - "eval_runtime": 2.7184, - "eval_samples_per_second": 4586.149, - "eval_steps_per_second": 143.467, + "epoch": 0.25356914553590454, + "eval_loss": 0.004570557735860348, + "eval_runtime": 35.027, + "eval_samples_per_second": 4287.378, + "eval_steps_per_second": 67.005, "step": 2380 }, { - "epoch": 1.5330339961513793, - "grad_norm": 0.013671874068677425, - "learning_rate": 4.669660038486209e-06, - "loss": 0.0005, + "epoch": 0.2546345621137865, + "grad_norm": 0.027491575106978416, + "learning_rate": 1.490730875772427e-05, + "loss": 0.0011, "step": 2390 }, { - "epoch": 1.5330339961513793, - "eval_loss": 0.003823925508186221, - "eval_runtime": 2.7743, - "eval_samples_per_second": 4493.785, - "eval_steps_per_second": 140.577, + "epoch": 0.2546345621137865, + "eval_loss": 0.004696827381849289, + "eval_runtime": 35.0112, + "eval_samples_per_second": 4289.31, + "eval_steps_per_second": 67.036, "step": 2390 }, { - "epoch": 1.539448364336113, - "grad_norm": 0.01323388610035181, - "learning_rate": 4.605516356638872e-06, - "loss": 0.0005, + "epoch": 0.25569997869166844, + "grad_norm": 0.23158523440361023, + "learning_rate": 1.488600042616663e-05, + "loss": 0.0104, "step": 2400 }, { - "epoch": 1.539448364336113, - "eval_loss": 0.003829265246167779, - "eval_runtime": 2.7303, - "eval_samples_per_second": 4566.097, - "eval_steps_per_second": 142.839, + "epoch": 0.25569997869166844, + "eval_loss": 0.004320676904171705, + "eval_runtime": 35.0305, + "eval_samples_per_second": 4286.953, + "eval_steps_per_second": 66.999, "step": 2400 }, { - "epoch": 1.5458627325208467, - "grad_norm": 0.015381712466478348, - "learning_rate": 4.541372674791533e-06, - "loss": 0.0005, + "epoch": 0.2567653952695504, + "grad_norm": 0.007454337552189827, + "learning_rate": 1.4864692094608993e-05, + "loss": 0.0093, "step": 2410 }, { - "epoch": 1.5458627325208467, - "eval_loss": 0.003835107199847698, - "eval_runtime": 2.6429, - "eval_samples_per_second": 4717.103, - "eval_steps_per_second": 147.563, + "epoch": 0.2567653952695504, + "eval_loss": 0.004152194131165743, + "eval_runtime": 35.0245, + "eval_samples_per_second": 4287.684, + "eval_steps_per_second": 67.01, "step": 2410 }, { - "epoch": 1.5522771007055804, - "grad_norm": 0.010174254886806011, - "learning_rate": 4.4772289929441955e-06, - "loss": 0.0005, + "epoch": 0.25783081184743234, + "grad_norm": 0.006590006407350302, + "learning_rate": 1.4843383763051353e-05, + "loss": 0.0007, "step": 2420 }, { - "epoch": 1.5522771007055804, - "eval_loss": 0.0038460749201476574, - "eval_runtime": 2.634, - "eval_samples_per_second": 4733.166, - "eval_steps_per_second": 148.066, + "epoch": 0.25783081184743234, + "eval_loss": 0.004091034177690744, + "eval_runtime": 35.0369, + "eval_samples_per_second": 4286.17, + "eval_steps_per_second": 66.987, "step": 2420 }, { - "epoch": 1.5586914688903142, - "grad_norm": 0.008752675727009773, - "learning_rate": 4.413085311096857e-06, - "loss": 0.0004, + "epoch": 0.2588962284253143, + "grad_norm": 0.08318906277418137, + "learning_rate": 1.4822075431493715e-05, + "loss": 0.0032, "step": 2430 }, { - "epoch": 1.5586914688903142, - "eval_loss": 0.003855367423966527, - "eval_runtime": 2.7267, - "eval_samples_per_second": 4572.183, - "eval_steps_per_second": 143.03, + "epoch": 0.2588962284253143, + "eval_loss": 0.004115572199225426, + "eval_runtime": 35.0695, + "eval_samples_per_second": 4282.183, + "eval_steps_per_second": 66.924, "step": 2430 }, { - "epoch": 1.5651058370750481, - "grad_norm": 0.010081687942147255, - "learning_rate": 4.348941629249519e-06, - "loss": 0.0005, + "epoch": 0.25996164500319624, + "grad_norm": 0.25250810384750366, + "learning_rate": 1.4800767099936075e-05, + "loss": 0.0025, "step": 2440 }, { - "epoch": 1.5651058370750481, - "eval_loss": 0.003865085309371352, - "eval_runtime": 2.7222, - "eval_samples_per_second": 4579.785, - "eval_steps_per_second": 143.268, + "epoch": 0.25996164500319624, + "eval_loss": 0.004098709672689438, + "eval_runtime": 35.0339, + "eval_samples_per_second": 4286.54, + "eval_steps_per_second": 66.992, "step": 2440 }, { - "epoch": 1.5715202052597819, - "grad_norm": 0.010632511228322983, - "learning_rate": 4.284797947402181e-06, - "loss": 0.0005, + "epoch": 0.2610270615810782, + "grad_norm": 0.004992151632905006, + "learning_rate": 1.4779458768378437e-05, + "loss": 0.0019, "step": 2450 }, { - "epoch": 1.5715202052597819, - "eval_loss": 0.003874831600114703, - "eval_runtime": 2.7831, - "eval_samples_per_second": 4479.547, - "eval_steps_per_second": 140.132, + "epoch": 0.2610270615810782, + "eval_loss": 0.00410530436784029, + "eval_runtime": 35.0414, + "eval_samples_per_second": 4285.617, + "eval_steps_per_second": 66.978, "step": 2450 }, { - "epoch": 1.5779345734445158, - "grad_norm": 0.015775036066770554, - "learning_rate": 4.2206542655548435e-06, - "loss": 0.0005, + "epoch": 0.26209247815896014, + "grad_norm": 2.30206036567688, + "learning_rate": 1.4758150436820797e-05, + "loss": 0.0107, "step": 2460 }, { - "epoch": 1.5779345734445158, - "eval_loss": 0.00388737628236413, - "eval_runtime": 2.7237, - "eval_samples_per_second": 4577.207, - "eval_steps_per_second": 143.187, + "epoch": 0.26209247815896014, + "eval_loss": 0.004170614294707775, + "eval_runtime": 35.0183, + "eval_samples_per_second": 4288.444, + "eval_steps_per_second": 67.022, "step": 2460 }, { - "epoch": 1.5843489416292496, - "grad_norm": 0.011216863058507442, - "learning_rate": 4.156510583707505e-06, - "loss": 0.0005, + "epoch": 0.2631578947368421, + "grad_norm": 0.07904893159866333, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.0032, "step": 2470 }, { - "epoch": 1.5843489416292496, - "eval_loss": 0.0038962597027420998, - "eval_runtime": 2.7328, - "eval_samples_per_second": 4561.93, - "eval_steps_per_second": 142.709, + "epoch": 0.2631578947368421, + "eval_loss": 0.004105927422642708, + "eval_runtime": 35.0424, + "eval_samples_per_second": 4285.494, + "eval_steps_per_second": 66.976, "step": 2470 }, { - "epoch": 1.5907633098139833, - "grad_norm": 0.009197092615067959, - "learning_rate": 4.092366901860167e-06, - "loss": 0.0004, + "epoch": 0.26422331131472404, + "grad_norm": 0.0035470998845994473, + "learning_rate": 1.4715533773705519e-05, + "loss": 0.0074, "step": 2480 }, { - "epoch": 1.5907633098139833, - "eval_loss": 0.0039031601045280695, - "eval_runtime": 2.7575, - "eval_samples_per_second": 4521.111, - "eval_steps_per_second": 141.432, + "epoch": 0.26422331131472404, + "eval_loss": 0.004295279737561941, + "eval_runtime": 35.0454, + "eval_samples_per_second": 4285.128, + "eval_steps_per_second": 66.97, "step": 2480 }, { - "epoch": 1.597177677998717, - "grad_norm": 0.007920138537883759, - "learning_rate": 4.028223220012829e-06, - "loss": 0.0004, + "epoch": 0.265288727892606, + "grad_norm": 0.4439772665500641, + "learning_rate": 1.469422544214788e-05, + "loss": 0.0048, "step": 2490 }, { - "epoch": 1.597177677998717, - "eval_loss": 0.003908549435436726, - "eval_runtime": 2.7164, - "eval_samples_per_second": 4589.533, - "eval_steps_per_second": 143.572, + "epoch": 0.265288727892606, + "eval_loss": 0.004732625558972359, + "eval_runtime": 35.0313, + "eval_samples_per_second": 4286.848, + "eval_steps_per_second": 66.997, "step": 2490 }, { - "epoch": 1.603592046183451, - "grad_norm": 0.01130605023354292, - "learning_rate": 3.964079538165492e-06, - "loss": 0.0202, + "epoch": 0.26635414447048794, + "grad_norm": 0.0037416014820337296, + "learning_rate": 1.467291711059024e-05, + "loss": 0.0026, "step": 2500 }, { - "epoch": 1.603592046183451, - "eval_loss": 0.003887481288984418, - "eval_runtime": 2.7368, - "eval_samples_per_second": 4555.355, - "eval_steps_per_second": 142.503, + "epoch": 0.26635414447048794, + "eval_loss": 0.004636832047253847, + "eval_runtime": 35.0307, + "eval_samples_per_second": 4286.927, + "eval_steps_per_second": 66.998, "step": 2500 }, { - "epoch": 1.6100064143681847, - "grad_norm": 0.010281969793140888, - "learning_rate": 3.8999358563181525e-06, - "loss": 0.0004, + "epoch": 0.2674195610483699, + "grad_norm": 0.003717947518453002, + "learning_rate": 1.4651608779032603e-05, + "loss": 0.0048, "step": 2510 }, { - "epoch": 1.6100064143681847, - "eval_loss": 0.0038782022893428802, - "eval_runtime": 2.7779, - "eval_samples_per_second": 4487.867, - "eval_steps_per_second": 140.392, + "epoch": 0.2674195610483699, + "eval_loss": 0.004365737084299326, + "eval_runtime": 35.0714, + "eval_samples_per_second": 4281.948, + "eval_steps_per_second": 66.921, "step": 2510 }, { - "epoch": 1.6164207825529187, - "grad_norm": 0.009838576428592205, - "learning_rate": 3.835792174470815e-06, - "loss": 0.0145, + "epoch": 0.26848497762625184, + "grad_norm": 0.031179407611489296, + "learning_rate": 1.4630300447474963e-05, + "loss": 0.0009, "step": 2520 }, { - "epoch": 1.6164207825529187, - "eval_loss": 0.0037884835619479418, - "eval_runtime": 2.7139, - "eval_samples_per_second": 4593.753, - "eval_steps_per_second": 143.704, + "epoch": 0.26848497762625184, + "eval_loss": 0.004146276507526636, + "eval_runtime": 35.0595, + "eval_samples_per_second": 4283.409, + "eval_steps_per_second": 66.943, "step": 2520 }, { - "epoch": 1.6228351507376524, - "grad_norm": 0.010711363516747952, - "learning_rate": 3.771648492623477e-06, - "loss": 0.0005, + "epoch": 0.2695503942041338, + "grad_norm": 0.05639449879527092, + "learning_rate": 1.4608992115917325e-05, + "loss": 0.0079, "step": 2530 }, { - "epoch": 1.6228351507376524, - "eval_loss": 0.003752094926312566, - "eval_runtime": 2.3995, - "eval_samples_per_second": 5195.686, - "eval_steps_per_second": 162.534, + "epoch": 0.2695503942041338, + "eval_loss": 0.004130475223064423, + "eval_runtime": 35.0379, + "eval_samples_per_second": 4286.044, + "eval_steps_per_second": 66.985, "step": 2530 }, { - "epoch": 1.6292495189223861, - "grad_norm": 0.010661286301910877, - "learning_rate": 3.707504810776139e-06, - "loss": 0.0005, + "epoch": 0.27061581078201574, + "grad_norm": 0.003375578671693802, + "learning_rate": 1.4587683784359685e-05, + "loss": 0.0008, "step": 2540 }, { - "epoch": 1.6292495189223861, - "eval_loss": 0.0037525563966482878, - "eval_runtime": 2.4005, - "eval_samples_per_second": 5193.403, - "eval_steps_per_second": 162.463, + "epoch": 0.27061581078201574, + "eval_loss": 0.004228705074638128, + "eval_runtime": 35.0234, + "eval_samples_per_second": 4287.822, + "eval_steps_per_second": 67.012, "step": 2540 }, { - "epoch": 1.6356638871071199, - "grad_norm": 0.010547863319516182, - "learning_rate": 3.6433611289288006e-06, - "loss": 0.0005, + "epoch": 0.2716812273598977, + "grad_norm": 0.004005058668553829, + "learning_rate": 1.4566375452802046e-05, + "loss": 0.0138, "step": 2550 }, { - "epoch": 1.6356638871071199, - "eval_loss": 0.0037555524613708258, - "eval_runtime": 2.3918, - "eval_samples_per_second": 5212.477, - "eval_steps_per_second": 163.06, + "epoch": 0.2716812273598977, + "eval_loss": 0.004597960971295834, + "eval_runtime": 35.0266, + "eval_samples_per_second": 4287.433, + "eval_steps_per_second": 67.006, "step": 2550 }, { - "epoch": 1.6420782552918538, - "grad_norm": 0.009853300638496876, - "learning_rate": 3.579217447081463e-06, - "loss": 0.0005, + "epoch": 0.2727466439377797, + "grad_norm": 0.0035304948687553406, + "learning_rate": 1.4545067121244407e-05, + "loss": 0.0055, "step": 2560 }, { - "epoch": 1.6420782552918538, - "eval_loss": 0.0037585473619401455, - "eval_runtime": 2.4095, - "eval_samples_per_second": 5174.209, - "eval_steps_per_second": 161.863, + "epoch": 0.2727466439377797, + "eval_loss": 0.005362317897379398, + "eval_runtime": 35.0017, + "eval_samples_per_second": 4290.48, + "eval_steps_per_second": 67.054, "step": 2560 }, { - "epoch": 1.6484926234765875, - "grad_norm": 0.015307929366827011, - "learning_rate": 3.515073765234125e-06, - "loss": 0.0004, + "epoch": 0.27381206051566165, + "grad_norm": 0.00467054545879364, + "learning_rate": 1.4523758789686768e-05, + "loss": 0.0132, "step": 2570 }, { - "epoch": 1.6484926234765875, - "eval_loss": 0.0037612884771078825, - "eval_runtime": 2.3983, - "eval_samples_per_second": 5198.286, - "eval_steps_per_second": 162.616, + "epoch": 0.27381206051566165, + "eval_loss": 0.005131016951054335, + "eval_runtime": 35.0481, + "eval_samples_per_second": 4284.791, + "eval_steps_per_second": 66.965, "step": 2570 }, { - "epoch": 1.6549069916613215, - "grad_norm": 0.011234630830585957, - "learning_rate": 3.4509300833867864e-06, - "loss": 0.0005, + "epoch": 0.2748774770935436, + "grad_norm": 0.025946978479623795, + "learning_rate": 1.4502450458129129e-05, + "loss": 0.0017, "step": 2580 }, { - "epoch": 1.6549069916613215, - "eval_loss": 0.0037648940924555063, - "eval_runtime": 2.3961, - "eval_samples_per_second": 5202.987, - "eval_steps_per_second": 162.763, + "epoch": 0.2748774770935436, + "eval_loss": 0.011761846020817757, + "eval_runtime": 35.0511, + "eval_samples_per_second": 4284.43, + "eval_steps_per_second": 66.959, "step": 2580 }, { - "epoch": 1.6613213598460552, - "grad_norm": 0.09665284305810928, - "learning_rate": 3.3867864015394486e-06, - "loss": 0.0006, + "epoch": 0.27594289367142555, + "grad_norm": 0.01493908278644085, + "learning_rate": 1.448114212657149e-05, + "loss": 0.0043, "step": 2590 }, { - "epoch": 1.6613213598460552, - "eval_loss": 0.003771902294829488, - "eval_runtime": 2.396, - "eval_samples_per_second": 5203.268, - "eval_steps_per_second": 162.772, + "epoch": 0.27594289367142555, + "eval_loss": 0.006850136443972588, + "eval_runtime": 35.0353, + "eval_samples_per_second": 4286.368, + "eval_steps_per_second": 66.99, "step": 2590 }, { - "epoch": 1.667735728030789, - "grad_norm": 0.009439531713724136, - "learning_rate": 3.322642719692111e-06, - "loss": 0.0005, + "epoch": 0.2770083102493075, + "grad_norm": 0.5396614670753479, + "learning_rate": 1.445983379501385e-05, + "loss": 0.0047, "step": 2600 }, { - "epoch": 1.667735728030789, - "eval_loss": 0.0037818914279341698, - "eval_runtime": 2.4026, - "eval_samples_per_second": 5189.052, - "eval_steps_per_second": 162.327, + "epoch": 0.2770083102493075, + "eval_loss": 0.004564850591123104, + "eval_runtime": 35.0642, + "eval_samples_per_second": 4282.829, + "eval_steps_per_second": 66.934, "step": 2600 }, { - "epoch": 1.6741500962155227, - "grad_norm": 0.011505583301186562, - "learning_rate": 3.2584990378447722e-06, - "loss": 0.0004, + "epoch": 0.27807372682718945, + "grad_norm": 0.003329735714942217, + "learning_rate": 1.4438525463456212e-05, + "loss": 0.0052, "step": 2610 }, { - "epoch": 1.6741500962155227, - "eval_loss": 0.0037876018323004246, - "eval_runtime": 2.4217, - "eval_samples_per_second": 5148.034, - "eval_steps_per_second": 161.044, + "epoch": 0.27807372682718945, + "eval_loss": 0.004351920913904905, + "eval_runtime": 35.0437, + "eval_samples_per_second": 4285.335, + "eval_steps_per_second": 66.974, "step": 2610 }, { - "epoch": 1.6805644644002564, - "grad_norm": 0.01173364743590355, - "learning_rate": 3.1943553559974345e-06, - "loss": 0.0004, + "epoch": 0.2791391434050714, + "grad_norm": 0.007534320000559092, + "learning_rate": 1.4417217131898573e-05, + "loss": 0.0039, "step": 2620 }, { - "epoch": 1.6805644644002564, - "eval_loss": 0.003792904084548354, - "eval_runtime": 2.4025, - "eval_samples_per_second": 5189.132, - "eval_steps_per_second": 162.329, + "epoch": 0.2791391434050714, + "eval_loss": 0.004768616519868374, + "eval_runtime": 35.078, + "eval_samples_per_second": 4281.139, + "eval_steps_per_second": 66.908, "step": 2620 }, { - "epoch": 1.6869788325849904, - "grad_norm": 0.009778267703950405, - "learning_rate": 3.1302116741500967e-06, - "loss": 0.0005, + "epoch": 0.28020455998295335, + "grad_norm": 0.0036227928940206766, + "learning_rate": 1.4395908800340934e-05, + "loss": 0.0197, "step": 2630 }, { - "epoch": 1.6869788325849904, - "eval_loss": 0.0037972936406731606, - "eval_runtime": 2.4016, - "eval_samples_per_second": 5191.166, - "eval_steps_per_second": 162.393, + "epoch": 0.28020455998295335, + "eval_loss": 0.004250204190611839, + "eval_runtime": 35.0223, + "eval_samples_per_second": 4287.952, + "eval_steps_per_second": 67.014, "step": 2630 }, { - "epoch": 1.6933932007697243, - "grad_norm": 0.012008159421384335, - "learning_rate": 3.0660679923027585e-06, - "loss": 0.0004, + "epoch": 0.2812699765608353, + "grad_norm": 0.057612668722867966, + "learning_rate": 1.4374600468783295e-05, + "loss": 0.0017, "step": 2640 }, { - "epoch": 1.6933932007697243, - "eval_loss": 0.0038023728411644697, - "eval_runtime": 2.4092, - "eval_samples_per_second": 5174.654, - "eval_steps_per_second": 161.877, + "epoch": 0.2812699765608353, + "eval_loss": 0.004129552282392979, + "eval_runtime": 35.0617, + "eval_samples_per_second": 4283.136, + "eval_steps_per_second": 66.939, "step": 2640 }, { - "epoch": 1.699807568954458, - "grad_norm": 0.008908640593290329, - "learning_rate": 3.0019243104554203e-06, - "loss": 0.0005, + "epoch": 0.28233539313871725, + "grad_norm": 0.05967571586370468, + "learning_rate": 1.4353292137225656e-05, + "loss": 0.0034, "step": 2650 }, { - "epoch": 1.699807568954458, - "eval_loss": 0.00380841176956892, - "eval_runtime": 2.4007, - "eval_samples_per_second": 5193.082, - "eval_steps_per_second": 162.453, + "epoch": 0.28233539313871725, + "eval_loss": 0.004171199630945921, + "eval_runtime": 35.0514, + "eval_samples_per_second": 4284.394, + "eval_steps_per_second": 66.959, "step": 2650 }, { - "epoch": 1.7062219371391918, - "grad_norm": 0.01192167866975069, - "learning_rate": 2.9377806286080825e-06, - "loss": 0.0004, + "epoch": 0.2834008097165992, + "grad_norm": 0.027145517989993095, + "learning_rate": 1.4331983805668017e-05, + "loss": 0.0016, "step": 2660 }, { - "epoch": 1.7062219371391918, - "eval_loss": 0.003814821597188711, - "eval_runtime": 2.4035, - "eval_samples_per_second": 5187.046, - "eval_steps_per_second": 162.264, + "epoch": 0.2834008097165992, + "eval_loss": 0.004205272998660803, + "eval_runtime": 35.0565, + "eval_samples_per_second": 4283.776, + "eval_steps_per_second": 66.949, "step": 2660 }, { - "epoch": 1.7126363053239255, - "grad_norm": 0.008039736188948154, - "learning_rate": 2.8736369467607443e-06, - "loss": 0.0004, + "epoch": 0.28446622629448115, + "grad_norm": 0.028178216889500618, + "learning_rate": 1.4310675474110378e-05, + "loss": 0.0008, "step": 2670 }, { - "epoch": 1.7126363053239255, - "eval_loss": 0.0038202591240406036, - "eval_runtime": 2.4055, - "eval_samples_per_second": 5182.655, - "eval_steps_per_second": 162.127, + "epoch": 0.28446622629448115, + "eval_loss": 0.004260431043803692, + "eval_runtime": 35.049, + "eval_samples_per_second": 4284.69, + "eval_steps_per_second": 66.963, "step": 2670 }, { - "epoch": 1.7190506735086593, - "grad_norm": 0.013746132142841816, - "learning_rate": 2.809493264913406e-06, - "loss": 0.0004, + "epoch": 0.2855316428723631, + "grad_norm": 0.17948974668979645, + "learning_rate": 1.4289367142552739e-05, + "loss": 0.0183, "step": 2680 }, { - "epoch": 1.7190506735086593, - "eval_loss": 0.003824560670182109, - "eval_runtime": 2.4046, - "eval_samples_per_second": 5184.652, - "eval_steps_per_second": 162.189, + "epoch": 0.2855316428723631, + "eval_loss": 0.004343624692410231, + "eval_runtime": 35.0373, + "eval_samples_per_second": 4286.113, + "eval_steps_per_second": 66.986, "step": 2680 }, { - "epoch": 1.7254650416933932, - "grad_norm": 0.011107255704700947, - "learning_rate": 2.7453495830660683e-06, - "loss": 0.0004, + "epoch": 0.28659705945024505, + "grad_norm": 0.06010470911860466, + "learning_rate": 1.42680588109951e-05, + "loss": 0.0017, "step": 2690 }, { - "epoch": 1.7254650416933932, - "eval_loss": 0.0038276948034763336, - "eval_runtime": 2.446, - "eval_samples_per_second": 5096.936, - "eval_steps_per_second": 159.445, + "epoch": 0.28659705945024505, + "eval_loss": 0.004411335103213787, + "eval_runtime": 35.0836, + "eval_samples_per_second": 4280.467, + "eval_steps_per_second": 66.897, "step": 2690 }, { - "epoch": 1.7318794098781272, - "grad_norm": 0.009707199409604073, - "learning_rate": 2.68120590121873e-06, - "loss": 0.0005, + "epoch": 0.287662476028127, + "grad_norm": 0.005281396675854921, + "learning_rate": 1.424675047943746e-05, + "loss": 0.0007, "step": 2700 }, { - "epoch": 1.7318794098781272, - "eval_loss": 0.003818488446995616, - "eval_runtime": 2.7582, - "eval_samples_per_second": 4520.011, - "eval_steps_per_second": 141.398, + "epoch": 0.287662476028127, + "eval_loss": 0.0045102485455572605, + "eval_runtime": 35.0122, + "eval_samples_per_second": 4289.185, + "eval_steps_per_second": 67.034, "step": 2700 }, { - "epoch": 1.738293778062861, - "grad_norm": 0.007962013594806194, - "learning_rate": 2.6170622193713924e-06, - "loss": 0.0004, + "epoch": 0.28872789260600895, + "grad_norm": 0.3176427185535431, + "learning_rate": 1.4225442147879822e-05, + "loss": 0.0024, "step": 2710 }, { - "epoch": 1.738293778062861, - "eval_loss": 0.003793991869315505, - "eval_runtime": 2.7217, - "eval_samples_per_second": 4580.669, - "eval_steps_per_second": 143.295, + "epoch": 0.28872789260600895, + "eval_loss": 0.004357383586466312, + "eval_runtime": 35.0509, + "eval_samples_per_second": 4284.451, + "eval_steps_per_second": 66.96, "step": 2710 }, { - "epoch": 1.7447081462475946, - "grad_norm": 0.013016624376177788, - "learning_rate": 2.5529185375240537e-06, - "loss": 0.0005, + "epoch": 0.2897933091838909, + "grad_norm": 0.3456588685512543, + "learning_rate": 1.4204133816322182e-05, + "loss": 0.0013, "step": 2720 }, { - "epoch": 1.7447081462475946, - "eval_loss": 0.003789684269577265, - "eval_runtime": 2.7327, - "eval_samples_per_second": 4562.147, - "eval_steps_per_second": 142.716, + "epoch": 0.2897933091838909, + "eval_loss": 0.004329455550760031, + "eval_runtime": 35.0464, + "eval_samples_per_second": 4285.011, + "eval_steps_per_second": 66.968, "step": 2720 }, { - "epoch": 1.7511225144323284, - "grad_norm": 0.007025664672255516, - "learning_rate": 2.488774855676716e-06, - "loss": 0.0154, + "epoch": 0.29085872576177285, + "grad_norm": 0.003269694047048688, + "learning_rate": 1.4182825484764544e-05, + "loss": 0.0006, "step": 2730 }, { - "epoch": 1.7511225144323284, - "eval_loss": 0.0038092422764748335, - "eval_runtime": 2.744, - "eval_samples_per_second": 4543.305, - "eval_steps_per_second": 142.126, + "epoch": 0.29085872576177285, + "eval_loss": 0.004743185359984636, + "eval_runtime": 35.0411, + "eval_samples_per_second": 4285.649, + "eval_steps_per_second": 66.978, "step": 2730 }, { - "epoch": 1.757536882617062, - "grad_norm": 0.0099189393222332, - "learning_rate": 2.4246311738293778e-06, - "loss": 0.0004, + "epoch": 0.2919241423396548, + "grad_norm": 0.0067397127859294415, + "learning_rate": 1.4161517153206904e-05, + "loss": 0.0035, "step": 2740 }, { - "epoch": 1.757536882617062, - "eval_loss": 0.0038577597588300705, - "eval_runtime": 2.7325, - "eval_samples_per_second": 4562.439, - "eval_steps_per_second": 142.725, + "epoch": 0.2919241423396548, + "eval_loss": 0.0047289966605603695, + "eval_runtime": 35.0343, + "eval_samples_per_second": 4286.487, + "eval_steps_per_second": 66.992, "step": 2740 }, { - "epoch": 1.763951250801796, - "grad_norm": 0.011446350254118443, - "learning_rate": 2.36048749198204e-06, - "loss": 0.0004, + "epoch": 0.29298955891753675, + "grad_norm": 0.0032168785110116005, + "learning_rate": 1.4140208821649266e-05, + "loss": 0.0003, "step": 2750 }, { - "epoch": 1.763951250801796, - "eval_loss": 0.003888155333697796, - "eval_runtime": 2.7408, - "eval_samples_per_second": 4548.595, - "eval_steps_per_second": 142.292, + "epoch": 0.29298955891753675, + "eval_loss": 0.004747165832668543, + "eval_runtime": 35.0422, + "eval_samples_per_second": 4285.518, + "eval_steps_per_second": 66.976, "step": 2750 }, { - "epoch": 1.7703656189865298, - "grad_norm": 0.006927170790731907, - "learning_rate": 2.2963438101347018e-06, - "loss": 0.0004, + "epoch": 0.2940549754954187, + "grad_norm": 0.014863832853734493, + "learning_rate": 1.4118900490091626e-05, + "loss": 0.0012, "step": 2760 }, { - "epoch": 1.7703656189865298, - "eval_loss": 0.0039006902370601892, - "eval_runtime": 2.7693, - "eval_samples_per_second": 4501.888, - "eval_steps_per_second": 140.831, + "epoch": 0.2940549754954187, + "eval_loss": 0.004687744192779064, + "eval_runtime": 35.0762, + "eval_samples_per_second": 4281.363, + "eval_steps_per_second": 66.911, "step": 2760 }, { - "epoch": 1.7767799871712637, - "grad_norm": 0.010873212479054928, - "learning_rate": 2.232200128287364e-06, - "loss": 0.0005, + "epoch": 0.29512039207330065, + "grad_norm": 0.011120929382741451, + "learning_rate": 1.4097592158533988e-05, + "loss": 0.0007, "step": 2770 }, { - "epoch": 1.7767799871712637, - "eval_loss": 0.003896691370755434, - "eval_runtime": 2.7253, - "eval_samples_per_second": 4574.621, - "eval_steps_per_second": 143.106, + "epoch": 0.29512039207330065, + "eval_loss": 0.0050058220513165, + "eval_runtime": 35.0543, + "eval_samples_per_second": 4284.039, + "eval_steps_per_second": 66.953, "step": 2770 }, { - "epoch": 1.7831943553559975, - "grad_norm": 0.008327585645020008, - "learning_rate": 2.168056446440026e-06, - "loss": 0.0004, + "epoch": 0.2961858086511826, + "grad_norm": 0.006813787389546633, + "learning_rate": 1.4076283826976348e-05, + "loss": 0.0033, "step": 2780 }, { - "epoch": 1.7831943553559975, - "eval_loss": 0.003881297539919615, - "eval_runtime": 2.6677, - "eval_samples_per_second": 4673.4, - "eval_steps_per_second": 146.196, + "epoch": 0.2961858086511826, + "eval_loss": 0.006108899600803852, + "eval_runtime": 35.0497, + "eval_samples_per_second": 4284.604, + "eval_steps_per_second": 66.962, "step": 2780 }, { - "epoch": 1.7896087235407312, - "grad_norm": 0.009809192270040512, - "learning_rate": 2.1039127645926876e-06, - "loss": 0.0004, + "epoch": 0.29725122522906455, + "grad_norm": 0.004185474012047052, + "learning_rate": 1.405497549541871e-05, + "loss": 0.001, "step": 2790 }, { - "epoch": 1.7896087235407312, - "eval_loss": 0.0038780542090535164, - "eval_runtime": 2.3819, - "eval_samples_per_second": 5234.003, - "eval_steps_per_second": 163.733, + "epoch": 0.29725122522906455, + "eval_loss": 0.006051088683307171, + "eval_runtime": 35.0464, + "eval_samples_per_second": 4285.005, + "eval_steps_per_second": 66.968, "step": 2790 }, { - "epoch": 1.796023091725465, - "grad_norm": 0.008661613799631596, - "learning_rate": 2.03976908274535e-06, - "loss": 0.0004, + "epoch": 0.2983166418069465, + "grad_norm": 0.0032837213948369026, + "learning_rate": 1.403366716386107e-05, + "loss": 0.0027, "step": 2800 }, { - "epoch": 1.796023091725465, - "eval_loss": 0.0038783461786806583, - "eval_runtime": 2.3989, - "eval_samples_per_second": 5196.882, - "eval_steps_per_second": 162.572, + "epoch": 0.2983166418069465, + "eval_loss": 0.005179966799914837, + "eval_runtime": 35.0247, + "eval_samples_per_second": 4287.657, + "eval_steps_per_second": 67.01, "step": 2800 }, { - "epoch": 1.8024374599101989, - "grad_norm": 0.008230826817452908, - "learning_rate": 1.9756254008980116e-06, - "loss": 0.0004, + "epoch": 0.29938205838482845, + "grad_norm": 0.018226496875286102, + "learning_rate": 1.4012358832303432e-05, + "loss": 0.0003, "step": 2810 }, { - "epoch": 1.8024374599101989, - "eval_loss": 0.0038756639696657658, - "eval_runtime": 2.3947, - "eval_samples_per_second": 5206.034, - "eval_steps_per_second": 162.858, + "epoch": 0.29938205838482845, + "eval_loss": 0.00502545852214098, + "eval_runtime": 35.0582, + "eval_samples_per_second": 4283.567, + "eval_steps_per_second": 66.946, "step": 2810 }, { - "epoch": 1.8088518280949326, - "grad_norm": 0.010815752670168877, - "learning_rate": 1.911481719050674e-06, - "loss": 0.001, + "epoch": 0.3004474749627104, + "grad_norm": 0.013967903330922127, + "learning_rate": 1.3991050500745792e-05, + "loss": 0.0109, "step": 2820 }, { - "epoch": 1.8088518280949326, - "eval_loss": 0.0038313877303153276, - "eval_runtime": 2.3985, - "eval_samples_per_second": 5197.74, - "eval_steps_per_second": 162.599, + "epoch": 0.3004474749627104, + "eval_loss": 0.004795020446181297, + "eval_runtime": 35.0562, + "eval_samples_per_second": 4283.809, + "eval_steps_per_second": 66.95, "step": 2820 }, { - "epoch": 1.8152661962796666, - "grad_norm": 0.008790099062025547, - "learning_rate": 1.8473380372033357e-06, - "loss": 0.0004, + "epoch": 0.30151289154059235, + "grad_norm": 0.00310189975425601, + "learning_rate": 1.3969742169188154e-05, + "loss": 0.0116, "step": 2830 }, { - "epoch": 1.8152661962796666, - "eval_loss": 0.0038214183878153563, - "eval_runtime": 2.4048, - "eval_samples_per_second": 5184.222, - "eval_steps_per_second": 162.176, + "epoch": 0.30151289154059235, + "eval_loss": 0.005139603745192289, + "eval_runtime": 35.0512, + "eval_samples_per_second": 4284.424, + "eval_steps_per_second": 66.959, "step": 2830 }, { - "epoch": 1.8216805644644003, - "grad_norm": 0.008266700431704521, - "learning_rate": 1.7831943553559975e-06, - "loss": 0.0124, + "epoch": 0.3025783081184743, + "grad_norm": 0.003773763542994857, + "learning_rate": 1.3948433837630514e-05, + "loss": 0.0115, "step": 2840 }, { - "epoch": 1.8216805644644003, - "eval_loss": 0.0038049728609621525, - "eval_runtime": 2.4016, - "eval_samples_per_second": 5191.199, - "eval_steps_per_second": 162.394, + "epoch": 0.3025783081184743, + "eval_loss": 0.004601133055984974, + "eval_runtime": 35.0881, + "eval_samples_per_second": 4279.908, + "eval_steps_per_second": 66.889, "step": 2840 }, { - "epoch": 1.828094932649134, - "grad_norm": 0.007836179807782173, - "learning_rate": 1.7190506735086595e-06, - "loss": 0.0004, + "epoch": 0.30364372469635625, + "grad_norm": 0.00316253793425858, + "learning_rate": 1.3927125506072876e-05, + "loss": 0.0058, "step": 2850 }, { - "epoch": 1.828094932649134, - "eval_loss": 0.0037862639874219894, - "eval_runtime": 2.4046, - "eval_samples_per_second": 5184.681, - "eval_steps_per_second": 162.19, + "epoch": 0.30364372469635625, + "eval_loss": 0.004557873122394085, + "eval_runtime": 35.0701, + "eval_samples_per_second": 4282.113, + "eval_steps_per_second": 66.923, "step": 2850 }, { - "epoch": 1.8345093008338678, - "grad_norm": 0.010235415771603584, - "learning_rate": 1.6549069916613215e-06, - "loss": 0.0003, + "epoch": 0.3047091412742382, + "grad_norm": 0.010574285872280598, + "learning_rate": 1.3905817174515236e-05, + "loss": 0.0058, "step": 2860 }, { - "epoch": 1.8345093008338678, - "eval_loss": 0.003780810162425041, - "eval_runtime": 2.4032, - "eval_samples_per_second": 5187.659, - "eval_steps_per_second": 162.283, + "epoch": 0.3047091412742382, + "eval_loss": 0.005399353802204132, + "eval_runtime": 35.0344, + "eval_samples_per_second": 4286.476, + "eval_steps_per_second": 66.991, "step": 2860 }, { - "epoch": 1.8409236690186017, - "grad_norm": 0.008026237599551678, - "learning_rate": 1.5907633098139835e-06, - "loss": 0.0004, + "epoch": 0.30577455785212015, + "grad_norm": 0.019116273149847984, + "learning_rate": 1.3884508842957598e-05, + "loss": 0.0027, "step": 2870 }, { - "epoch": 1.8409236690186017, - "eval_loss": 0.003780545899644494, - "eval_runtime": 2.4055, - "eval_samples_per_second": 5182.623, - "eval_steps_per_second": 162.126, + "epoch": 0.30577455785212015, + "eval_loss": 0.004699068609625101, + "eval_runtime": 35.0451, + "eval_samples_per_second": 4285.167, + "eval_steps_per_second": 66.971, "step": 2870 }, { - "epoch": 1.8473380372033354, - "grad_norm": 0.0085107097402215, - "learning_rate": 1.5266196279666453e-06, - "loss": 0.0004, + "epoch": 0.3068399744300021, + "grad_norm": 0.03568415716290474, + "learning_rate": 1.3863200511399958e-05, + "loss": 0.009, "step": 2880 }, { - "epoch": 1.8473380372033354, - "eval_loss": 0.003781872568652034, - "eval_runtime": 2.4133, - "eval_samples_per_second": 5166.047, - "eval_steps_per_second": 161.607, + "epoch": 0.3068399744300021, + "eval_loss": 0.0043353792279958725, + "eval_runtime": 35.0693, + "eval_samples_per_second": 4282.212, + "eval_steps_per_second": 66.925, "step": 2880 }, { - "epoch": 1.8537524053880694, - "grad_norm": 0.012739639729261398, - "learning_rate": 1.4624759461193075e-06, - "loss": 0.0004, + "epoch": 0.30790539100788406, + "grad_norm": 2.934440851211548, + "learning_rate": 1.384189217984232e-05, + "loss": 0.004, "step": 2890 }, { - "epoch": 1.8537524053880694, - "eval_loss": 0.003783388528972864, - "eval_runtime": 2.3985, - "eval_samples_per_second": 5197.939, - "eval_steps_per_second": 162.605, + "epoch": 0.30790539100788406, + "eval_loss": 0.0042236242443323135, + "eval_runtime": 35.0591, + "eval_samples_per_second": 4283.447, + "eval_steps_per_second": 66.944, "step": 2890 }, { - "epoch": 1.8601667735728031, - "grad_norm": 0.007168211042881012, - "learning_rate": 1.3983322642719693e-06, - "loss": 0.0173, + "epoch": 0.30897080758576606, + "grad_norm": 0.4145413637161255, + "learning_rate": 1.382058384828468e-05, + "loss": 0.0038, "step": 2900 }, { - "epoch": 1.8601667735728031, - "eval_loss": 0.0037781535647809505, - "eval_runtime": 2.4089, - "eval_samples_per_second": 5175.415, - "eval_steps_per_second": 161.9, + "epoch": 0.30897080758576606, + "eval_loss": 0.004244114272296429, + "eval_runtime": 35.0187, + "eval_samples_per_second": 4288.391, + "eval_steps_per_second": 67.021, "step": 2900 }, { - "epoch": 1.8665811417575369, - "grad_norm": 0.008200396783649921, - "learning_rate": 1.3341885824246311e-06, - "loss": 0.0004, + "epoch": 0.310036224163648, + "grad_norm": 0.0071656289510428905, + "learning_rate": 1.3799275516727042e-05, + "loss": 0.001, "step": 2910 }, { - "epoch": 1.8665811417575369, - "eval_loss": 0.003767445683479309, - "eval_runtime": 2.405, - "eval_samples_per_second": 5183.787, - "eval_steps_per_second": 162.162, + "epoch": 0.310036224163648, + "eval_loss": 0.004367951303720474, + "eval_runtime": 35.0279, + "eval_samples_per_second": 4287.263, + "eval_steps_per_second": 67.004, "step": 2910 }, { - "epoch": 1.8729955099422706, - "grad_norm": 0.007957600988447666, - "learning_rate": 1.2700449005772933e-06, - "loss": 0.0156, + "epoch": 0.31110164074152996, + "grad_norm": 1.2467246055603027, + "learning_rate": 1.3777967185169402e-05, + "loss": 0.0074, "step": 2920 }, { - "epoch": 1.8729955099422706, - "eval_loss": 0.0037655681371688843, - "eval_runtime": 2.4106, - "eval_samples_per_second": 5171.768, - "eval_steps_per_second": 161.786, + "epoch": 0.31110164074152996, + "eval_loss": 0.003949224948883057, + "eval_runtime": 35.0452, + "eval_samples_per_second": 4285.157, + "eval_steps_per_second": 66.971, "step": 2920 }, { - "epoch": 1.8794098781270043, - "grad_norm": 0.011560726910829544, - "learning_rate": 1.2059012187299551e-06, - "loss": 0.0004, + "epoch": 0.3121670573194119, + "grad_norm": 0.0029619967099279165, + "learning_rate": 1.3756658853611764e-05, + "loss": 0.0005, "step": 2930 }, { - "epoch": 1.8794098781270043, - "eval_loss": 0.003768111579120159, - "eval_runtime": 2.4108, - "eval_samples_per_second": 5171.259, - "eval_steps_per_second": 161.77, + "epoch": 0.3121670573194119, + "eval_loss": 0.004097965080291033, + "eval_runtime": 35.0575, + "eval_samples_per_second": 4283.649, + "eval_steps_per_second": 66.947, "step": 2930 }, { - "epoch": 1.8858242463117383, - "grad_norm": 0.041454609483480453, - "learning_rate": 1.1417575368826172e-06, - "loss": 0.0004, + "epoch": 0.31323247389729386, + "grad_norm": 0.003615755122154951, + "learning_rate": 1.3735350522054124e-05, + "loss": 0.0098, "step": 2940 }, { - "epoch": 1.8858242463117383, - "eval_loss": 0.0037701409310102463, - "eval_runtime": 2.4098, - "eval_samples_per_second": 5173.398, - "eval_steps_per_second": 161.837, + "epoch": 0.31323247389729386, + "eval_loss": 0.00408256845548749, + "eval_runtime": 35.0611, + "eval_samples_per_second": 4283.203, + "eval_steps_per_second": 66.94, "step": 2940 }, { - "epoch": 1.8922386144964722, - "grad_norm": 0.013258632272481918, - "learning_rate": 1.0776138550352792e-06, - "loss": 0.0004, + "epoch": 0.3142978904751758, + "grad_norm": 0.0028381391894072294, + "learning_rate": 1.3714042190496486e-05, + "loss": 0.0006, "step": 2950 }, { - "epoch": 1.8922386144964722, - "eval_loss": 0.003771682735532522, - "eval_runtime": 2.3911, - "eval_samples_per_second": 5213.929, - "eval_steps_per_second": 163.105, + "epoch": 0.3142978904751758, + "eval_loss": 0.003918228670954704, + "eval_runtime": 35.0452, + "eval_samples_per_second": 4285.146, + "eval_steps_per_second": 66.971, "step": 2950 }, { - "epoch": 1.898652982681206, - "grad_norm": 0.007926654070615768, - "learning_rate": 1.0134701731879412e-06, - "loss": 0.0151, + "epoch": 0.31536330705305776, + "grad_norm": 0.06406796723604202, + "learning_rate": 1.3692733858938846e-05, + "loss": 0.0007, "step": 2960 }, { - "epoch": 1.898652982681206, - "eval_loss": 0.0037694782949984074, - "eval_runtime": 2.3961, - "eval_samples_per_second": 5203.0, - "eval_steps_per_second": 162.763, + "epoch": 0.31536330705305776, + "eval_loss": 0.003923286683857441, + "eval_runtime": 35.0548, + "eval_samples_per_second": 4283.984, + "eval_steps_per_second": 66.952, "step": 2960 }, { - "epoch": 1.9050673508659397, - "grad_norm": 0.009811542928218842, - "learning_rate": 9.493264913406031e-07, - "loss": 0.0004, + "epoch": 0.3164287236309397, + "grad_norm": 0.16647638380527496, + "learning_rate": 1.3671425527381208e-05, + "loss": 0.0009, "step": 2970 }, { - "epoch": 1.9050673508659397, - "eval_loss": 0.00376922101713717, - "eval_runtime": 2.3914, - "eval_samples_per_second": 5213.272, - "eval_steps_per_second": 163.085, + "epoch": 0.3164287236309397, + "eval_loss": 0.003978800494223833, + "eval_runtime": 35.0109, + "eval_samples_per_second": 4289.344, + "eval_steps_per_second": 67.036, "step": 2970 }, { - "epoch": 1.9114817190506734, - "grad_norm": 0.010268312878906727, - "learning_rate": 8.85182809493265e-07, - "loss": 0.0004, + "epoch": 0.31749414020882166, + "grad_norm": 1.183781623840332, + "learning_rate": 1.3650117195823568e-05, + "loss": 0.0041, "step": 2980 }, { - "epoch": 1.9114817190506734, - "eval_loss": 0.0037699865642935038, - "eval_runtime": 2.3966, - "eval_samples_per_second": 5201.847, - "eval_steps_per_second": 162.727, + "epoch": 0.31749414020882166, + "eval_loss": 0.004037255886942148, + "eval_runtime": 35.0257, + "eval_samples_per_second": 4287.532, + "eval_steps_per_second": 67.008, "step": 2980 }, { - "epoch": 1.9178960872354072, - "grad_norm": 0.009732135571539402, - "learning_rate": 8.210391276459269e-07, - "loss": 0.0004, + "epoch": 0.3185595567867036, + "grad_norm": 0.002885080175474286, + "learning_rate": 1.362880886426593e-05, + "loss": 0.0102, "step": 2990 }, { - "epoch": 1.9178960872354072, - "eval_loss": 0.0037714012432843447, - "eval_runtime": 2.4004, - "eval_samples_per_second": 5193.805, - "eval_steps_per_second": 162.476, + "epoch": 0.3185595567867036, + "eval_loss": 0.003956990782171488, + "eval_runtime": 35.0222, + "eval_samples_per_second": 4287.965, + "eval_steps_per_second": 67.015, "step": 2990 }, { - "epoch": 1.9243104554201411, - "grad_norm": 0.00821003783494234, - "learning_rate": 7.568954457985889e-07, - "loss": 0.0004, + "epoch": 0.31962497336458556, + "grad_norm": 0.13790345191955566, + "learning_rate": 1.360750053270829e-05, + "loss": 0.0014, "step": 3000 }, { - "epoch": 1.9243104554201411, - "eval_loss": 0.0037729167379438877, - "eval_runtime": 2.4092, - "eval_samples_per_second": 5174.836, - "eval_steps_per_second": 161.882, + "epoch": 0.31962497336458556, + "eval_loss": 0.004563441965728998, + "eval_runtime": 35.0586, + "eval_samples_per_second": 4283.514, + "eval_steps_per_second": 66.945, "step": 3000 }, { - "epoch": 1.930724823604875, - "grad_norm": 0.008221003226935863, - "learning_rate": 6.927517639512508e-07, - "loss": 0.0004, + "epoch": 0.3206903899424675, + "grad_norm": 0.23745372891426086, + "learning_rate": 1.3586192201150652e-05, + "loss": 0.0033, "step": 3010 }, { - "epoch": 1.930724823604875, - "eval_loss": 0.0037741470150649548, - "eval_runtime": 2.4854, - "eval_samples_per_second": 5016.031, - "eval_steps_per_second": 156.914, + "epoch": 0.3206903899424675, + "eval_loss": 0.004914409015327692, + "eval_runtime": 35.0379, + "eval_samples_per_second": 4286.05, + "eval_steps_per_second": 66.985, "step": 3010 }, { - "epoch": 1.9371391917896088, - "grad_norm": 0.0076448554173111916, - "learning_rate": 6.286080821039128e-07, - "loss": 0.0004, + "epoch": 0.32175580652034946, + "grad_norm": 0.043057333678007126, + "learning_rate": 1.3564883869593012e-05, + "loss": 0.0017, "step": 3020 }, { - "epoch": 1.9371391917896088, - "eval_loss": 0.0037749563343822956, - "eval_runtime": 2.7299, - "eval_samples_per_second": 4566.865, - "eval_steps_per_second": 142.863, + "epoch": 0.32175580652034946, + "eval_loss": 0.004360364284366369, + "eval_runtime": 35.0253, + "eval_samples_per_second": 4287.587, + "eval_steps_per_second": 67.009, "step": 3020 }, { - "epoch": 1.9435535599743425, - "grad_norm": 0.011786909773945808, - "learning_rate": 5.644644002565747e-07, - "loss": 0.0096, + "epoch": 0.3228212230982314, + "grad_norm": 0.037039484828710556, + "learning_rate": 1.3543575538035374e-05, + "loss": 0.0013, "step": 3030 }, { - "epoch": 1.9435535599743425, - "eval_loss": 0.0037560511846095324, - "eval_runtime": 2.7211, - "eval_samples_per_second": 4581.635, - "eval_steps_per_second": 143.325, + "epoch": 0.3228212230982314, + "eval_loss": 0.004530046600848436, + "eval_runtime": 35.0216, + "eval_samples_per_second": 4288.035, + "eval_steps_per_second": 67.016, "step": 3030 }, { - "epoch": 1.9499679281590763, - "grad_norm": 0.009966257959604263, - "learning_rate": 5.003207184092367e-07, - "loss": 0.0004, + "epoch": 0.32388663967611336, + "grad_norm": 3.6495485305786133, + "learning_rate": 1.3522267206477734e-05, + "loss": 0.021, "step": 3040 }, { - "epoch": 1.9499679281590763, - "eval_loss": 0.0037503966595977545, - "eval_runtime": 2.7541, - "eval_samples_per_second": 4526.77, - "eval_steps_per_second": 141.609, + "epoch": 0.32388663967611336, + "eval_loss": 0.006539896596223116, + "eval_runtime": 35.0111, + "eval_samples_per_second": 4289.326, + "eval_steps_per_second": 67.036, "step": 3040 }, { - "epoch": 1.95638229634381, - "grad_norm": 0.008138866163790226, - "learning_rate": 4.361770365618987e-07, - "loss": 0.0004, + "epoch": 0.3249520562539953, + "grad_norm": 4.4830756187438965, + "learning_rate": 1.3500958874920096e-05, + "loss": 0.0152, "step": 3050 }, { - "epoch": 1.95638229634381, - "eval_loss": 0.0037492290139198303, - "eval_runtime": 2.7213, - "eval_samples_per_second": 4581.226, - "eval_steps_per_second": 143.313, + "epoch": 0.3249520562539953, + "eval_loss": 0.004404969979077578, + "eval_runtime": 35.0765, + "eval_samples_per_second": 4281.333, + "eval_steps_per_second": 66.911, "step": 3050 }, { - "epoch": 1.962796664528544, - "grad_norm": 0.010643853805959225, - "learning_rate": 3.720333547145606e-07, - "loss": 0.0004, + "epoch": 0.32601747283187726, + "grad_norm": 0.06523173302412033, + "learning_rate": 1.3479650543362456e-05, + "loss": 0.0013, "step": 3060 }, { - "epoch": 1.962796664528544, - "eval_loss": 0.00374912703409791, - "eval_runtime": 2.7173, - "eval_samples_per_second": 4587.941, - "eval_steps_per_second": 143.523, + "epoch": 0.32601747283187726, + "eval_loss": 0.004014772828668356, + "eval_runtime": 35.1746, + "eval_samples_per_second": 4269.389, + "eval_steps_per_second": 66.724, "step": 3060 }, { - "epoch": 1.9692110327132777, - "grad_norm": 0.010715777054429054, - "learning_rate": 3.078896728672226e-07, - "loss": 0.0004, + "epoch": 0.3270828894097592, + "grad_norm": 0.0030045281164348125, + "learning_rate": 1.3458342211804816e-05, + "loss": 0.001, "step": 3070 }, { - "epoch": 1.9692110327132777, - "eval_loss": 0.0037495270371437073, - "eval_runtime": 2.7194, - "eval_samples_per_second": 4584.515, - "eval_steps_per_second": 143.415, + "epoch": 0.3270828894097592, + "eval_loss": 0.003926475998014212, + "eval_runtime": 35.0278, + "eval_samples_per_second": 4287.284, + "eval_steps_per_second": 67.004, "step": 3070 }, { - "epoch": 1.9756254008980116, - "grad_norm": 0.0092789800837636, - "learning_rate": 2.4374599101988453e-07, - "loss": 0.0011, + "epoch": 0.32814830598764116, + "grad_norm": 0.19878143072128296, + "learning_rate": 1.3437033880247176e-05, + "loss": 0.0038, "step": 3080 }, { - "epoch": 1.9756254008980116, - "eval_loss": 0.003745446214452386, - "eval_runtime": 2.7756, - "eval_samples_per_second": 4491.701, - "eval_steps_per_second": 140.512, + "epoch": 0.32814830598764116, + "eval_loss": 0.0038352280389517546, + "eval_runtime": 35.0379, + "eval_samples_per_second": 4286.044, + "eval_steps_per_second": 66.985, "step": 3080 }, { - "epoch": 1.9820397690827454, - "grad_norm": 0.008841835893690586, - "learning_rate": 1.7960230917254652e-07, - "loss": 0.0004, + "epoch": 0.3292137225655231, + "grad_norm": 0.010688831098377705, + "learning_rate": 1.3415725548689538e-05, + "loss": 0.0012, "step": 3090 }, { - "epoch": 1.9820397690827454, - "eval_loss": 0.0037423004396259785, - "eval_runtime": 2.7196, - "eval_samples_per_second": 4584.057, - "eval_steps_per_second": 143.401, + "epoch": 0.3292137225655231, + "eval_loss": 0.0038455103058367968, + "eval_runtime": 35.0172, + "eval_samples_per_second": 4288.572, + "eval_steps_per_second": 67.024, "step": 3090 }, { - "epoch": 1.988454137267479, - "grad_norm": 0.010190588422119617, - "learning_rate": 1.1545862732520848e-07, - "loss": 0.0004, + "epoch": 0.33027913914340506, + "grad_norm": 2.0539698600769043, + "learning_rate": 1.3394417217131898e-05, + "loss": 0.0102, "step": 3100 }, { - "epoch": 1.988454137267479, - "eval_loss": 0.003741658991202712, - "eval_runtime": 2.6701, - "eval_samples_per_second": 4669.199, - "eval_steps_per_second": 146.065, + "epoch": 0.33027913914340506, + "eval_loss": 0.0038040748331695795, + "eval_runtime": 35.0546, + "eval_samples_per_second": 4284.007, + "eval_steps_per_second": 66.953, "step": 3100 }, { - "epoch": 1.9948685054522128, - "grad_norm": 0.008077690377831459, - "learning_rate": 5.131494547787043e-08, - "loss": 0.0004, + "epoch": 0.331344555721287, + "grad_norm": 0.01796787604689598, + "learning_rate": 1.337310888557426e-05, + "loss": 0.0054, "step": 3110 }, { - "epoch": 1.9948685054522128, - "eval_loss": 0.003741599852219224, - "eval_runtime": 2.4727, - "eval_samples_per_second": 5041.765, - "eval_steps_per_second": 157.719, + "epoch": 0.331344555721287, + "eval_loss": 0.003991218749433756, + "eval_runtime": 35.0314, + "eval_samples_per_second": 4286.841, + "eval_steps_per_second": 66.997, "step": 3110 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 0.40324315428733826, + "learning_rate": 1.335180055401662e-05, + "loss": 0.0131, + "step": 3120 + }, + { + "epoch": 0.33240997229916897, + "eval_loss": 0.004118100740015507, + "eval_runtime": 35.0329, + "eval_samples_per_second": 4286.658, + "eval_steps_per_second": 66.994, + "step": 3120 + }, + { + "epoch": 0.3334753888770509, + "grad_norm": 0.14840951561927795, + "learning_rate": 1.3330492222458982e-05, + "loss": 0.0075, + "step": 3130 + }, + { + "epoch": 0.3334753888770509, + "eval_loss": 0.003930113278329372, + "eval_runtime": 35.0237, + "eval_samples_per_second": 4287.785, + "eval_steps_per_second": 67.012, + "step": 3130 + }, + { + "epoch": 0.33454080545493287, + "grad_norm": 0.053780388087034225, + "learning_rate": 1.3309183890901342e-05, + "loss": 0.0009, + "step": 3140 + }, + { + "epoch": 0.33454080545493287, + "eval_loss": 0.0038190835621207952, + "eval_runtime": 34.9943, + "eval_samples_per_second": 4291.383, + "eval_steps_per_second": 67.068, + "step": 3140 + }, + { + "epoch": 0.3356062220328148, + "grad_norm": 0.002771625993773341, + "learning_rate": 1.3287875559343704e-05, + "loss": 0.0037, + "step": 3150 + }, + { + "epoch": 0.3356062220328148, + "eval_loss": 0.0038646007888019085, + "eval_runtime": 35.023, + "eval_samples_per_second": 4287.864, + "eval_steps_per_second": 67.013, + "step": 3150 + }, + { + "epoch": 0.33667163861069677, + "grad_norm": 0.003614110639318824, + "learning_rate": 1.3266567227786064e-05, + "loss": 0.0025, + "step": 3160 + }, + { + "epoch": 0.33667163861069677, + "eval_loss": 0.0038535690400749445, + "eval_runtime": 35.0472, + "eval_samples_per_second": 4284.901, + "eval_steps_per_second": 66.967, + "step": 3160 + }, + { + "epoch": 0.3377370551885787, + "grad_norm": 0.05751369893550873, + "learning_rate": 1.3245258896228426e-05, + "loss": 0.0078, + "step": 3170 + }, + { + "epoch": 0.3377370551885787, + "eval_loss": 0.0038923989050090313, + "eval_runtime": 35.0433, + "eval_samples_per_second": 4285.383, + "eval_steps_per_second": 66.974, + "step": 3170 + }, + { + "epoch": 0.33880247176646067, + "grad_norm": 0.05214925855398178, + "learning_rate": 1.3223950564670786e-05, + "loss": 0.0004, + "step": 3180 + }, + { + "epoch": 0.33880247176646067, + "eval_loss": 0.003956311382353306, + "eval_runtime": 35.0271, + "eval_samples_per_second": 4287.364, + "eval_steps_per_second": 67.005, + "step": 3180 + }, + { + "epoch": 0.3398678883443426, + "grad_norm": 0.1537170559167862, + "learning_rate": 1.3202642233113148e-05, + "loss": 0.004, + "step": 3190 + }, + { + "epoch": 0.3398678883443426, + "eval_loss": 0.003928401041775942, + "eval_runtime": 35.0635, + "eval_samples_per_second": 4282.914, + "eval_steps_per_second": 66.936, + "step": 3190 + }, + { + "epoch": 0.34093330492222457, + "grad_norm": 0.12932927906513214, + "learning_rate": 1.3181333901555508e-05, + "loss": 0.0033, + "step": 3200 + }, + { + "epoch": 0.34093330492222457, + "eval_loss": 0.0038969647139310837, + "eval_runtime": 35.079, + "eval_samples_per_second": 4281.024, + "eval_steps_per_second": 66.906, + "step": 3200 + }, + { + "epoch": 0.3419987215001065, + "grad_norm": 0.5066677927970886, + "learning_rate": 1.316002556999787e-05, + "loss": 0.0059, + "step": 3210 + }, + { + "epoch": 0.3419987215001065, + "eval_loss": 0.0044798399321734905, + "eval_runtime": 34.9897, + "eval_samples_per_second": 4291.945, + "eval_steps_per_second": 67.077, + "step": 3210 + }, + { + "epoch": 0.34306413807798847, + "grad_norm": 0.7912442088127136, + "learning_rate": 1.313871723844023e-05, + "loss": 0.0104, + "step": 3220 + }, + { + "epoch": 0.34306413807798847, + "eval_loss": 0.004379452206194401, + "eval_runtime": 35.0419, + "eval_samples_per_second": 4285.558, + "eval_steps_per_second": 66.977, + "step": 3220 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 0.0026291459798812866, + "learning_rate": 1.3117408906882592e-05, + "loss": 0.0149, + "step": 3230 + }, + { + "epoch": 0.3441295546558704, + "eval_loss": 0.0038042503874748945, + "eval_runtime": 35.032, + "eval_samples_per_second": 4286.769, + "eval_steps_per_second": 66.996, + "step": 3230 + }, + { + "epoch": 0.3451949712337524, + "grad_norm": 0.012699414044618607, + "learning_rate": 1.3096100575324952e-05, + "loss": 0.0003, + "step": 3240 + }, + { + "epoch": 0.3451949712337524, + "eval_loss": 0.0037444639019668102, + "eval_runtime": 34.9976, + "eval_samples_per_second": 4290.975, + "eval_steps_per_second": 67.062, + "step": 3240 + }, + { + "epoch": 0.3462603878116344, + "grad_norm": 0.06426554918289185, + "learning_rate": 1.3074792243767314e-05, + "loss": 0.0038, + "step": 3250 + }, + { + "epoch": 0.3462603878116344, + "eval_loss": 0.0037567310500890017, + "eval_runtime": 35.0155, + "eval_samples_per_second": 4288.782, + "eval_steps_per_second": 67.027, + "step": 3250 + }, + { + "epoch": 0.3473258043895163, + "grad_norm": 0.002508602337911725, + "learning_rate": 1.3053483912209674e-05, + "loss": 0.0071, + "step": 3260 + }, + { + "epoch": 0.3473258043895163, + "eval_loss": 0.004010562784969807, + "eval_runtime": 35.0264, + "eval_samples_per_second": 4287.447, + "eval_steps_per_second": 67.007, + "step": 3260 + }, + { + "epoch": 0.3483912209673983, + "grad_norm": 0.11929357796907425, + "learning_rate": 1.3032175580652036e-05, + "loss": 0.0016, + "step": 3270 + }, + { + "epoch": 0.3483912209673983, + "eval_loss": 0.004778635688126087, + "eval_runtime": 35.0281, + "eval_samples_per_second": 4287.247, + "eval_steps_per_second": 67.003, + "step": 3270 + }, + { + "epoch": 0.3494566375452802, + "grad_norm": 2.577575922012329, + "learning_rate": 1.3010867249094396e-05, + "loss": 0.0043, + "step": 3280 + }, + { + "epoch": 0.3494566375452802, + "eval_loss": 0.004186397884041071, + "eval_runtime": 35.0663, + "eval_samples_per_second": 4282.569, + "eval_steps_per_second": 66.93, + "step": 3280 + }, + { + "epoch": 0.3505220541231622, + "grad_norm": 0.003140628570690751, + "learning_rate": 1.2989558917536758e-05, + "loss": 0.0002, + "step": 3290 + }, + { + "epoch": 0.3505220541231622, + "eval_loss": 0.004019039683043957, + "eval_runtime": 35.0117, + "eval_samples_per_second": 4289.253, + "eval_steps_per_second": 67.035, + "step": 3290 + }, + { + "epoch": 0.3515874707010441, + "grad_norm": 3.4825243949890137, + "learning_rate": 1.2968250585979118e-05, + "loss": 0.0222, + "step": 3300 + }, + { + "epoch": 0.3515874707010441, + "eval_loss": 0.003909524530172348, + "eval_runtime": 35.026, + "eval_samples_per_second": 4287.501, + "eval_steps_per_second": 67.007, + "step": 3300 + }, + { + "epoch": 0.3526528872789261, + "grad_norm": 0.0026517182122915983, + "learning_rate": 1.294694225442148e-05, + "loss": 0.0015, + "step": 3310 + }, + { + "epoch": 0.3526528872789261, + "eval_loss": 0.003752995515242219, + "eval_runtime": 35.0277, + "eval_samples_per_second": 4287.298, + "eval_steps_per_second": 67.004, + "step": 3310 + }, + { + "epoch": 0.353718303856808, + "grad_norm": 0.011982251890003681, + "learning_rate": 1.292563392286384e-05, + "loss": 0.0014, + "step": 3320 + }, + { + "epoch": 0.353718303856808, + "eval_loss": 0.003981877584010363, + "eval_runtime": 35.0094, + "eval_samples_per_second": 4289.53, + "eval_steps_per_second": 67.039, + "step": 3320 + }, + { + "epoch": 0.35478372043469, + "grad_norm": 0.009307941421866417, + "learning_rate": 1.2904325591306202e-05, + "loss": 0.0018, + "step": 3330 + }, + { + "epoch": 0.35478372043469, + "eval_loss": 0.004219081252813339, + "eval_runtime": 35.037, + "eval_samples_per_second": 4286.157, + "eval_steps_per_second": 66.986, + "step": 3330 + }, + { + "epoch": 0.3558491370125719, + "grad_norm": 0.003159622196108103, + "learning_rate": 1.2883017259748562e-05, + "loss": 0.0006, + "step": 3340 + }, + { + "epoch": 0.3558491370125719, + "eval_loss": 0.004214595537632704, + "eval_runtime": 35.0219, + "eval_samples_per_second": 4288.002, + "eval_steps_per_second": 67.015, + "step": 3340 + }, + { + "epoch": 0.3569145535904539, + "grad_norm": 0.008063999935984612, + "learning_rate": 1.2861708928190924e-05, + "loss": 0.0009, + "step": 3350 + }, + { + "epoch": 0.3569145535904539, + "eval_loss": 0.0040070428512990475, + "eval_runtime": 35.0574, + "eval_samples_per_second": 4283.666, + "eval_steps_per_second": 66.947, + "step": 3350 + }, + { + "epoch": 0.3579799701683358, + "grad_norm": 0.005213271360844374, + "learning_rate": 1.2840400596633284e-05, + "loss": 0.0003, + "step": 3360 + }, + { + "epoch": 0.3579799701683358, + "eval_loss": 0.00408785417675972, + "eval_runtime": 35.0643, + "eval_samples_per_second": 4282.822, + "eval_steps_per_second": 66.934, + "step": 3360 + }, + { + "epoch": 0.3590453867462178, + "grad_norm": 0.09602358192205429, + "learning_rate": 1.2819092265075646e-05, + "loss": 0.0004, + "step": 3370 + }, + { + "epoch": 0.3590453867462178, + "eval_loss": 0.004157669842243195, + "eval_runtime": 35.041, + "eval_samples_per_second": 4285.661, + "eval_steps_per_second": 66.979, + "step": 3370 + }, + { + "epoch": 0.3601108033240997, + "grad_norm": 0.00831978116184473, + "learning_rate": 1.2797783933518006e-05, + "loss": 0.0002, + "step": 3380 + }, + { + "epoch": 0.3601108033240997, + "eval_loss": 0.0041765193454921246, + "eval_runtime": 35.0633, + "eval_samples_per_second": 4282.934, + "eval_steps_per_second": 66.936, + "step": 3380 + }, + { + "epoch": 0.3611762199019817, + "grad_norm": 0.011790602467954159, + "learning_rate": 1.2776475601960368e-05, + "loss": 0.0003, + "step": 3390 + }, + { + "epoch": 0.3611762199019817, + "eval_loss": 0.004201879724860191, + "eval_runtime": 35.0369, + "eval_samples_per_second": 4286.161, + "eval_steps_per_second": 66.986, + "step": 3390 + }, + { + "epoch": 0.3622416364798636, + "grad_norm": 0.009733389131724834, + "learning_rate": 1.2755167270402728e-05, + "loss": 0.0018, + "step": 3400 + }, + { + "epoch": 0.3622416364798636, + "eval_loss": 0.00434511061757803, + "eval_runtime": 35.0659, + "eval_samples_per_second": 4282.623, + "eval_steps_per_second": 66.931, + "step": 3400 + }, + { + "epoch": 0.3633070530577456, + "grad_norm": 0.02603778801858425, + "learning_rate": 1.273385893884509e-05, + "loss": 0.0006, + "step": 3410 + }, + { + "epoch": 0.3633070530577456, + "eval_loss": 0.004456702154129744, + "eval_runtime": 35.0386, + "eval_samples_per_second": 4285.953, + "eval_steps_per_second": 66.983, + "step": 3410 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 0.0025256723165512085, + "learning_rate": 1.271255060728745e-05, + "loss": 0.0003, + "step": 3420 + }, + { + "epoch": 0.3643724696356275, + "eval_loss": 0.004565515089780092, + "eval_runtime": 35.0713, + "eval_samples_per_second": 4281.965, + "eval_steps_per_second": 66.921, + "step": 3420 + }, + { + "epoch": 0.3654378862135095, + "grad_norm": 0.0844619870185852, + "learning_rate": 1.2691242275729812e-05, + "loss": 0.0005, + "step": 3430 + }, + { + "epoch": 0.3654378862135095, + "eval_loss": 0.004752400331199169, + "eval_runtime": 35.0162, + "eval_samples_per_second": 4288.697, + "eval_steps_per_second": 67.026, + "step": 3430 + }, + { + "epoch": 0.3665033027913914, + "grad_norm": 0.0021026332397013903, + "learning_rate": 1.2669933944172172e-05, + "loss": 0.008, + "step": 3440 + }, + { + "epoch": 0.3665033027913914, + "eval_loss": 0.005648768972605467, + "eval_runtime": 35.0423, + "eval_samples_per_second": 4285.511, + "eval_steps_per_second": 66.976, + "step": 3440 + }, + { + "epoch": 0.3675687193692734, + "grad_norm": 0.0023316419683396816, + "learning_rate": 1.2648625612614534e-05, + "loss": 0.0228, + "step": 3450 + }, + { + "epoch": 0.3675687193692734, + "eval_loss": 0.004643257707357407, + "eval_runtime": 35.0494, + "eval_samples_per_second": 4284.635, + "eval_steps_per_second": 66.963, + "step": 3450 + }, + { + "epoch": 0.36863413594715533, + "grad_norm": 0.07622463256120682, + "learning_rate": 1.2627317281056894e-05, + "loss": 0.0007, + "step": 3460 + }, + { + "epoch": 0.36863413594715533, + "eval_loss": 0.0040694731287658215, + "eval_runtime": 35.0492, + "eval_samples_per_second": 4284.666, + "eval_steps_per_second": 66.963, + "step": 3460 + }, + { + "epoch": 0.3696995525250373, + "grad_norm": 0.047097232192754745, + "learning_rate": 1.2606008949499256e-05, + "loss": 0.0038, + "step": 3470 + }, + { + "epoch": 0.3696995525250373, + "eval_loss": 0.004037928301841021, + "eval_runtime": 35.0666, + "eval_samples_per_second": 4282.532, + "eval_steps_per_second": 66.93, + "step": 3470 + }, + { + "epoch": 0.37076496910291923, + "grad_norm": 0.0033278772607445717, + "learning_rate": 1.2584700617941616e-05, + "loss": 0.0003, + "step": 3480 + }, + { + "epoch": 0.37076496910291923, + "eval_loss": 0.003961279056966305, + "eval_runtime": 35.017, + "eval_samples_per_second": 4288.604, + "eval_steps_per_second": 67.025, + "step": 3480 + }, + { + "epoch": 0.3718303856808012, + "grad_norm": 0.0027841716073453426, + "learning_rate": 1.2563392286383978e-05, + "loss": 0.0034, + "step": 3490 + }, + { + "epoch": 0.3718303856808012, + "eval_loss": 0.0038393058348447084, + "eval_runtime": 35.0566, + "eval_samples_per_second": 4283.76, + "eval_steps_per_second": 66.949, + "step": 3490 + }, + { + "epoch": 0.37289580225868313, + "grad_norm": 0.01204370055347681, + "learning_rate": 1.2542083954826338e-05, + "loss": 0.0003, + "step": 3500 + }, + { + "epoch": 0.37289580225868313, + "eval_loss": 0.0038628741167485714, + "eval_runtime": 35.0746, + "eval_samples_per_second": 4281.557, + "eval_steps_per_second": 66.914, + "step": 3500 + }, + { + "epoch": 0.3739612188365651, + "grad_norm": 0.0020387719850987196, + "learning_rate": 1.25207756232687e-05, + "loss": 0.0005, + "step": 3510 + }, + { + "epoch": 0.3739612188365651, + "eval_loss": 0.0038964590057730675, + "eval_runtime": 35.0528, + "eval_samples_per_second": 4284.217, + "eval_steps_per_second": 66.956, + "step": 3510 + }, + { + "epoch": 0.37502663541444703, + "grad_norm": 0.0024983766488730907, + "learning_rate": 1.249946729171106e-05, + "loss": 0.0002, + "step": 3520 + }, + { + "epoch": 0.37502663541444703, + "eval_loss": 0.003918655216693878, + "eval_runtime": 35.0622, + "eval_samples_per_second": 4283.07, + "eval_steps_per_second": 66.938, + "step": 3520 + }, + { + "epoch": 0.376092051992329, + "grad_norm": 0.008725779131054878, + "learning_rate": 1.2478158960153422e-05, + "loss": 0.0002, + "step": 3530 + }, + { + "epoch": 0.376092051992329, + "eval_loss": 0.003952388651669025, + "eval_runtime": 35.0543, + "eval_samples_per_second": 4284.036, + "eval_steps_per_second": 66.953, + "step": 3530 + }, + { + "epoch": 0.37715746857021093, + "grad_norm": 0.02480531670153141, + "learning_rate": 1.2456850628595782e-05, + "loss": 0.0009, + "step": 3540 + }, + { + "epoch": 0.37715746857021093, + "eval_loss": 0.004151148721575737, + "eval_runtime": 35.0767, + "eval_samples_per_second": 4281.306, + "eval_steps_per_second": 66.911, + "step": 3540 + }, + { + "epoch": 0.3782228851480929, + "grad_norm": 0.0019680445548146963, + "learning_rate": 1.2435542297038144e-05, + "loss": 0.0005, + "step": 3550 + }, + { + "epoch": 0.3782228851480929, + "eval_loss": 0.004354014992713928, + "eval_runtime": 35.0834, + "eval_samples_per_second": 4280.481, + "eval_steps_per_second": 66.898, + "step": 3550 + }, + { + "epoch": 0.37928830172597483, + "grad_norm": 0.002003498375415802, + "learning_rate": 1.2414233965480504e-05, + "loss": 0.0101, + "step": 3560 + }, + { + "epoch": 0.37928830172597483, + "eval_loss": 0.003934256266802549, + "eval_runtime": 35.06, + "eval_samples_per_second": 4283.344, + "eval_steps_per_second": 66.942, + "step": 3560 + }, + { + "epoch": 0.3803537183038568, + "grad_norm": 0.12127909809350967, + "learning_rate": 1.2392925633922866e-05, + "loss": 0.0007, + "step": 3570 + }, + { + "epoch": 0.3803537183038568, + "eval_loss": 0.0039041999261826277, + "eval_runtime": 35.0747, + "eval_samples_per_second": 4281.545, + "eval_steps_per_second": 66.914, + "step": 3570 + }, + { + "epoch": 0.3814191348817388, + "grad_norm": 0.0020610857754945755, + "learning_rate": 1.2371617302365226e-05, + "loss": 0.011, + "step": 3580 + }, + { + "epoch": 0.3814191348817388, + "eval_loss": 0.0038424658123403788, + "eval_runtime": 35.1049, + "eval_samples_per_second": 4277.859, + "eval_steps_per_second": 66.857, + "step": 3580 + }, + { + "epoch": 0.38248455145962074, + "grad_norm": 0.002351719420403242, + "learning_rate": 1.2350308970807588e-05, + "loss": 0.0021, + "step": 3590 + }, + { + "epoch": 0.38248455145962074, + "eval_loss": 0.003790972288697958, + "eval_runtime": 35.104, + "eval_samples_per_second": 4277.973, + "eval_steps_per_second": 66.858, + "step": 3590 + }, + { + "epoch": 0.3835499680375027, + "grad_norm": 0.002572182798758149, + "learning_rate": 1.2329000639249948e-05, + "loss": 0.0002, + "step": 3600 + }, + { + "epoch": 0.3835499680375027, + "eval_loss": 0.00375328934751451, + "eval_runtime": 35.0611, + "eval_samples_per_second": 4283.213, + "eval_steps_per_second": 66.94, + "step": 3600 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.11054070293903351, + "learning_rate": 1.230769230769231e-05, + "loss": 0.0045, + "step": 3610 + }, + { + "epoch": 0.38461538461538464, + "eval_loss": 0.00388448778539896, + "eval_runtime": 35.06, + "eval_samples_per_second": 4283.349, + "eval_steps_per_second": 66.942, + "step": 3610 + }, + { + "epoch": 0.3856808011932666, + "grad_norm": 0.0022837959695607424, + "learning_rate": 1.228638397613467e-05, + "loss": 0.0001, + "step": 3620 + }, + { + "epoch": 0.3856808011932666, + "eval_loss": 0.004068476613610983, + "eval_runtime": 35.0813, + "eval_samples_per_second": 4280.748, + "eval_steps_per_second": 66.902, + "step": 3620 + }, + { + "epoch": 0.38674621777114854, + "grad_norm": 0.15774469077587128, + "learning_rate": 1.2265075644577032e-05, + "loss": 0.0141, + "step": 3630 + }, + { + "epoch": 0.38674621777114854, + "eval_loss": 0.0036877018865197897, + "eval_runtime": 35.0629, + "eval_samples_per_second": 4282.984, + "eval_steps_per_second": 66.937, + "step": 3630 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 0.0019772418309003115, + "learning_rate": 1.2243767313019392e-05, + "loss": 0.0084, + "step": 3640 + }, + { + "epoch": 0.3878116343490305, + "eval_loss": 0.0037544872611761093, + "eval_runtime": 35.0694, + "eval_samples_per_second": 4282.201, + "eval_steps_per_second": 66.925, + "step": 3640 + }, + { + "epoch": 0.38887705092691244, + "grad_norm": 0.2123226374387741, + "learning_rate": 1.2222458981461754e-05, + "loss": 0.0056, + "step": 3650 + }, + { + "epoch": 0.38887705092691244, + "eval_loss": 0.0039087338373064995, + "eval_runtime": 35.0526, + "eval_samples_per_second": 4284.253, + "eval_steps_per_second": 66.957, + "step": 3650 + }, + { + "epoch": 0.3899424675047944, + "grad_norm": 0.0640728697180748, + "learning_rate": 1.2201150649904114e-05, + "loss": 0.0004, + "step": 3660 + }, + { + "epoch": 0.3899424675047944, + "eval_loss": 0.004001296125352383, + "eval_runtime": 35.07, + "eval_samples_per_second": 4282.117, + "eval_steps_per_second": 66.923, + "step": 3660 + }, + { + "epoch": 0.39100788408267634, + "grad_norm": 0.0062736626714468, + "learning_rate": 1.2179842318346476e-05, + "loss": 0.0097, + "step": 3670 + }, + { + "epoch": 0.39100788408267634, + "eval_loss": 0.0038057903293520212, + "eval_runtime": 35.1173, + "eval_samples_per_second": 4276.355, + "eval_steps_per_second": 66.833, + "step": 3670 + }, + { + "epoch": 0.3920733006605583, + "grad_norm": 0.00930896308273077, + "learning_rate": 1.2158533986788836e-05, + "loss": 0.0023, + "step": 3680 + }, + { + "epoch": 0.3920733006605583, + "eval_loss": 0.0034806670155376196, + "eval_runtime": 35.0473, + "eval_samples_per_second": 4284.891, + "eval_steps_per_second": 66.967, + "step": 3680 + }, + { + "epoch": 0.39313871723844024, + "grad_norm": 0.002325017936527729, + "learning_rate": 1.2137225655231198e-05, + "loss": 0.0025, + "step": 3690 + }, + { + "epoch": 0.39313871723844024, + "eval_loss": 0.003411883721128106, + "eval_runtime": 35.0649, + "eval_samples_per_second": 4282.74, + "eval_steps_per_second": 66.933, + "step": 3690 + }, + { + "epoch": 0.3942041338163222, + "grad_norm": 0.005867179948836565, + "learning_rate": 1.2115917323673558e-05, + "loss": 0.0032, + "step": 3700 + }, + { + "epoch": 0.3942041338163222, + "eval_loss": 0.0034860384184867144, + "eval_runtime": 35.0541, + "eval_samples_per_second": 4284.061, + "eval_steps_per_second": 66.954, + "step": 3700 + }, + { + "epoch": 0.39526955039420414, + "grad_norm": 0.00318440911360085, + "learning_rate": 1.209460899211592e-05, + "loss": 0.0023, + "step": 3710 + }, + { + "epoch": 0.39526955039420414, + "eval_loss": 0.003476213663816452, + "eval_runtime": 35.0529, + "eval_samples_per_second": 4284.216, + "eval_steps_per_second": 66.956, + "step": 3710 + }, + { + "epoch": 0.3963349669720861, + "grad_norm": 0.04958747327327728, + "learning_rate": 1.207330066055828e-05, + "loss": 0.0015, + "step": 3720 + }, + { + "epoch": 0.3963349669720861, + "eval_loss": 0.003567621810361743, + "eval_runtime": 35.0515, + "eval_samples_per_second": 4284.38, + "eval_steps_per_second": 66.959, + "step": 3720 + }, + { + "epoch": 0.39740038354996804, + "grad_norm": 0.0030473291408270597, + "learning_rate": 1.2051992329000642e-05, + "loss": 0.0002, + "step": 3730 + }, + { + "epoch": 0.39740038354996804, + "eval_loss": 0.0036918912082910538, + "eval_runtime": 35.0549, + "eval_samples_per_second": 4283.967, + "eval_steps_per_second": 66.952, + "step": 3730 + }, + { + "epoch": 0.39846580012785, + "grad_norm": 0.028261132538318634, + "learning_rate": 1.2030683997443002e-05, + "loss": 0.0038, + "step": 3740 + }, + { + "epoch": 0.39846580012785, + "eval_loss": 0.0036734293680638075, + "eval_runtime": 35.05, + "eval_samples_per_second": 4284.562, + "eval_steps_per_second": 66.961, + "step": 3740 + }, + { + "epoch": 0.39953121670573194, + "grad_norm": 0.0021167814265936613, + "learning_rate": 1.2009375665885362e-05, + "loss": 0.0101, + "step": 3750 + }, + { + "epoch": 0.39953121670573194, + "eval_loss": 0.0038234253879636526, + "eval_runtime": 35.0671, + "eval_samples_per_second": 4282.476, + "eval_steps_per_second": 66.929, + "step": 3750 + }, + { + "epoch": 0.4005966332836139, + "grad_norm": 0.00868785660713911, + "learning_rate": 1.1988067334327722e-05, + "loss": 0.0021, + "step": 3760 + }, + { + "epoch": 0.4005966332836139, + "eval_loss": 0.0038062850944697857, + "eval_runtime": 35.0761, + "eval_samples_per_second": 4281.374, + "eval_steps_per_second": 66.912, + "step": 3760 + }, + { + "epoch": 0.40166204986149584, + "grad_norm": 0.010587544180452824, + "learning_rate": 1.1966759002770084e-05, + "loss": 0.0008, + "step": 3770 + }, + { + "epoch": 0.40166204986149584, + "eval_loss": 0.0039014420472085476, + "eval_runtime": 35.0702, + "eval_samples_per_second": 4282.097, + "eval_steps_per_second": 66.923, + "step": 3770 + }, + { + "epoch": 0.4027274664393778, + "grad_norm": 0.001994876191020012, + "learning_rate": 1.1945450671212444e-05, + "loss": 0.0044, + "step": 3780 + }, + { + "epoch": 0.4027274664393778, + "eval_loss": 0.0038195240776985884, + "eval_runtime": 35.0371, + "eval_samples_per_second": 4286.142, + "eval_steps_per_second": 66.986, + "step": 3780 + }, + { + "epoch": 0.40379288301725974, + "grad_norm": 1.4522329568862915, + "learning_rate": 1.1924142339654806e-05, + "loss": 0.0052, + "step": 3790 + }, + { + "epoch": 0.40379288301725974, + "eval_loss": 0.003863760968670249, + "eval_runtime": 35.0456, + "eval_samples_per_second": 4285.108, + "eval_steps_per_second": 66.97, + "step": 3790 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 0.22452107071876526, + "learning_rate": 1.1902834008097166e-05, + "loss": 0.0019, + "step": 3800 + }, + { + "epoch": 0.4048582995951417, + "eval_loss": 0.003808986861258745, + "eval_runtime": 35.0644, + "eval_samples_per_second": 4282.803, + "eval_steps_per_second": 66.934, + "step": 3800 + }, + { + "epoch": 0.40592371617302364, + "grad_norm": 0.0063810450956225395, + "learning_rate": 1.1881525676539528e-05, + "loss": 0.0045, + "step": 3810 + }, + { + "epoch": 0.40592371617302364, + "eval_loss": 0.003760164137929678, + "eval_runtime": 35.0902, + "eval_samples_per_second": 4279.652, + "eval_steps_per_second": 66.885, + "step": 3810 + }, + { + "epoch": 0.4069891327509056, + "grad_norm": 0.004251557867974043, + "learning_rate": 1.1860217344981888e-05, + "loss": 0.003, + "step": 3820 + }, + { + "epoch": 0.4069891327509056, + "eval_loss": 0.003843538695946336, + "eval_runtime": 35.04, + "eval_samples_per_second": 4285.79, + "eval_steps_per_second": 66.981, + "step": 3820 + }, + { + "epoch": 0.40805454932878754, + "grad_norm": 0.03963892534375191, + "learning_rate": 1.183890901342425e-05, + "loss": 0.001, + "step": 3830 + }, + { + "epoch": 0.40805454932878754, + "eval_loss": 0.0038712327368557453, + "eval_runtime": 35.0958, + "eval_samples_per_second": 4278.975, + "eval_steps_per_second": 66.874, + "step": 3830 + }, + { + "epoch": 0.4091199659066695, + "grad_norm": 0.4857088625431061, + "learning_rate": 1.181760068186661e-05, + "loss": 0.0014, + "step": 3840 + }, + { + "epoch": 0.4091199659066695, + "eval_loss": 0.003814863506704569, + "eval_runtime": 35.1055, + "eval_samples_per_second": 4277.786, + "eval_steps_per_second": 66.856, + "step": 3840 + }, + { + "epoch": 0.41018538248455144, + "grad_norm": 1.0623544454574585, + "learning_rate": 1.1796292350308972e-05, + "loss": 0.0041, + "step": 3850 + }, + { + "epoch": 0.41018538248455144, + "eval_loss": 0.00388675881549716, + "eval_runtime": 35.0714, + "eval_samples_per_second": 4281.952, + "eval_steps_per_second": 66.921, + "step": 3850 + }, + { + "epoch": 0.4112507990624334, + "grad_norm": 0.25712525844573975, + "learning_rate": 1.1774984018751332e-05, + "loss": 0.0095, + "step": 3860 + }, + { + "epoch": 0.4112507990624334, + "eval_loss": 0.0036819500382989645, + "eval_runtime": 35.0976, + "eval_samples_per_second": 4278.758, + "eval_steps_per_second": 66.871, + "step": 3860 + }, + { + "epoch": 0.41231621564031534, + "grad_norm": 0.1655515432357788, + "learning_rate": 1.1753675687193694e-05, + "loss": 0.0005, + "step": 3870 + }, + { + "epoch": 0.41231621564031534, + "eval_loss": 0.003723361063748598, + "eval_runtime": 35.055, + "eval_samples_per_second": 4283.959, + "eval_steps_per_second": 66.952, + "step": 3870 + }, + { + "epoch": 0.4133816322181973, + "grad_norm": 0.0033157425932586193, + "learning_rate": 1.1732367355636054e-05, + "loss": 0.0011, + "step": 3880 + }, + { + "epoch": 0.4133816322181973, + "eval_loss": 0.003979133442044258, + "eval_runtime": 35.0914, + "eval_samples_per_second": 4279.505, + "eval_steps_per_second": 66.882, + "step": 3880 + }, + { + "epoch": 0.41444704879607924, + "grad_norm": 0.0025206347927451134, + "learning_rate": 1.1711059024078416e-05, + "loss": 0.0009, + "step": 3890 + }, + { + "epoch": 0.41444704879607924, + "eval_loss": 0.0041635469533503056, + "eval_runtime": 35.0535, + "eval_samples_per_second": 4284.141, + "eval_steps_per_second": 66.955, + "step": 3890 + }, + { + "epoch": 0.4155124653739612, + "grad_norm": 0.034843962639570236, + "learning_rate": 1.1689750692520776e-05, + "loss": 0.0003, + "step": 3900 + }, + { + "epoch": 0.4155124653739612, + "eval_loss": 0.00432234350591898, + "eval_runtime": 35.0686, + "eval_samples_per_second": 4282.288, + "eval_steps_per_second": 66.926, + "step": 3900 + }, + { + "epoch": 0.41657788195184314, + "grad_norm": 0.009492074139416218, + "learning_rate": 1.1668442360963138e-05, + "loss": 0.0048, + "step": 3910 + }, + { + "epoch": 0.41657788195184314, + "eval_loss": 0.004398560617119074, + "eval_runtime": 35.073, + "eval_samples_per_second": 4281.752, + "eval_steps_per_second": 66.918, + "step": 3910 + }, + { + "epoch": 0.41764329852972515, + "grad_norm": 0.0028107059188187122, + "learning_rate": 1.1647134029405498e-05, + "loss": 0.0003, + "step": 3920 + }, + { + "epoch": 0.41764329852972515, + "eval_loss": 0.004599866457283497, + "eval_runtime": 35.0365, + "eval_samples_per_second": 4286.217, + "eval_steps_per_second": 66.987, + "step": 3920 + }, + { + "epoch": 0.4187087151076071, + "grad_norm": 0.024568969383835793, + "learning_rate": 1.162582569784786e-05, + "loss": 0.0005, + "step": 3930 + }, + { + "epoch": 0.4187087151076071, + "eval_loss": 0.004718529060482979, + "eval_runtime": 35.0643, + "eval_samples_per_second": 4282.822, + "eval_steps_per_second": 66.934, + "step": 3930 + }, + { + "epoch": 0.41977413168548905, + "grad_norm": 0.0036480259150266647, + "learning_rate": 1.160451736629022e-05, + "loss": 0.0018, + "step": 3940 + }, + { + "epoch": 0.41977413168548905, + "eval_loss": 0.004873115103691816, + "eval_runtime": 35.1052, + "eval_samples_per_second": 4277.824, + "eval_steps_per_second": 66.856, + "step": 3940 + }, + { + "epoch": 0.420839548263371, + "grad_norm": 0.017611248418688774, + "learning_rate": 1.1583209034732582e-05, + "loss": 0.0002, + "step": 3950 + }, + { + "epoch": 0.420839548263371, + "eval_loss": 0.005019678734242916, + "eval_runtime": 35.0986, + "eval_samples_per_second": 4278.636, + "eval_steps_per_second": 66.869, + "step": 3950 + }, + { + "epoch": 0.42190496484125295, + "grad_norm": 0.0019490675767883658, + "learning_rate": 1.1561900703174942e-05, + "loss": 0.001, + "step": 3960 + }, + { + "epoch": 0.42190496484125295, + "eval_loss": 0.005005026701837778, + "eval_runtime": 35.1168, + "eval_samples_per_second": 4276.412, + "eval_steps_per_second": 66.834, + "step": 3960 + }, + { + "epoch": 0.4229703814191349, + "grad_norm": 0.0025157982017844915, + "learning_rate": 1.1540592371617304e-05, + "loss": 0.0002, + "step": 3970 + }, + { + "epoch": 0.4229703814191349, + "eval_loss": 0.0050827213563025, + "eval_runtime": 35.0907, + "eval_samples_per_second": 4279.602, + "eval_steps_per_second": 66.884, + "step": 3970 + }, + { + "epoch": 0.42403579799701685, + "grad_norm": 0.002603873610496521, + "learning_rate": 1.1519284040059664e-05, + "loss": 0.0001, + "step": 3980 + }, + { + "epoch": 0.42403579799701685, + "eval_loss": 0.0051656016148626804, + "eval_runtime": 35.0682, + "eval_samples_per_second": 4282.34, + "eval_steps_per_second": 66.927, + "step": 3980 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 0.0055221510119736195, + "learning_rate": 1.1497975708502026e-05, + "loss": 0.0005, + "step": 3990 + }, + { + "epoch": 0.4251012145748988, + "eval_loss": 0.005205425899475813, + "eval_runtime": 35.0712, + "eval_samples_per_second": 4281.979, + "eval_steps_per_second": 66.921, + "step": 3990 + }, + { + "epoch": 0.42616663115278075, + "grad_norm": 0.0023050708696246147, + "learning_rate": 1.1476667376944386e-05, + "loss": 0.0001, + "step": 4000 + }, + { + "epoch": 0.42616663115278075, + "eval_loss": 0.005290038418024778, + "eval_runtime": 35.045, + "eval_samples_per_second": 4285.177, + "eval_steps_per_second": 66.971, + "step": 4000 + }, + { + "epoch": 0.4272320477306627, + "grad_norm": 0.11549913138151169, + "learning_rate": 1.1455359045386748e-05, + "loss": 0.0003, + "step": 4010 + }, + { + "epoch": 0.4272320477306627, + "eval_loss": 0.00536829000338912, + "eval_runtime": 35.094, + "eval_samples_per_second": 4279.197, + "eval_steps_per_second": 66.878, + "step": 4010 + }, + { + "epoch": 0.42829746430854465, + "grad_norm": 0.017903966829180717, + "learning_rate": 1.1434050713829108e-05, + "loss": 0.0001, + "step": 4020 + }, + { + "epoch": 0.42829746430854465, + "eval_loss": 0.005488130263984203, + "eval_runtime": 35.0917, + "eval_samples_per_second": 4279.475, + "eval_steps_per_second": 66.882, + "step": 4020 + }, + { + "epoch": 0.4293628808864266, + "grad_norm": 0.0017556482926011086, + "learning_rate": 1.141274238227147e-05, + "loss": 0.017, + "step": 4030 + }, + { + "epoch": 0.4293628808864266, + "eval_loss": 0.005172598175704479, + "eval_runtime": 35.0759, + "eval_samples_per_second": 4281.396, + "eval_steps_per_second": 66.912, + "step": 4030 + }, + { + "epoch": 0.43042829746430855, + "grad_norm": 0.005225505214184523, + "learning_rate": 1.139143405071383e-05, + "loss": 0.0001, + "step": 4040 + }, + { + "epoch": 0.43042829746430855, + "eval_loss": 0.004632376134395599, + "eval_runtime": 35.0926, + "eval_samples_per_second": 4279.37, + "eval_steps_per_second": 66.88, + "step": 4040 + }, + { + "epoch": 0.4314937140421905, + "grad_norm": 0.001705207396298647, + "learning_rate": 1.1370125719156192e-05, + "loss": 0.0005, + "step": 4050 + }, + { + "epoch": 0.4314937140421905, + "eval_loss": 0.0044433241710066795, + "eval_runtime": 35.0773, + "eval_samples_per_second": 4281.231, + "eval_steps_per_second": 66.909, + "step": 4050 + }, + { + "epoch": 0.43255913062007245, + "grad_norm": 0.0016584375407546759, + "learning_rate": 1.1348817387598552e-05, + "loss": 0.0001, + "step": 4060 + }, + { + "epoch": 0.43255913062007245, + "eval_loss": 0.004409145098179579, + "eval_runtime": 35.0654, + "eval_samples_per_second": 4282.689, + "eval_steps_per_second": 66.932, + "step": 4060 + }, + { + "epoch": 0.4336245471979544, + "grad_norm": 0.0026151298079639673, + "learning_rate": 1.1327509056040914e-05, + "loss": 0.0001, + "step": 4070 + }, + { + "epoch": 0.4336245471979544, + "eval_loss": 0.004420367535203695, + "eval_runtime": 35.1047, + "eval_samples_per_second": 4277.889, + "eval_steps_per_second": 66.857, + "step": 4070 + }, + { + "epoch": 0.43468996377583635, + "grad_norm": 0.0077970316633582115, + "learning_rate": 1.1306200724483274e-05, + "loss": 0.0002, + "step": 4080 + }, + { + "epoch": 0.43468996377583635, + "eval_loss": 0.004452695604413748, + "eval_runtime": 35.193, + "eval_samples_per_second": 4267.161, + "eval_steps_per_second": 66.689, + "step": 4080 + }, + { + "epoch": 0.4357553803537183, + "grad_norm": 0.03631202504038811, + "learning_rate": 1.1284892392925636e-05, + "loss": 0.0007, + "step": 4090 + }, + { + "epoch": 0.4357553803537183, + "eval_loss": 0.004691319074481726, + "eval_runtime": 35.0695, + "eval_samples_per_second": 4282.187, + "eval_steps_per_second": 66.924, + "step": 4090 + }, + { + "epoch": 0.43682079693160025, + "grad_norm": 1.5857198238372803, + "learning_rate": 1.1263584061367996e-05, + "loss": 0.0085, + "step": 4100 + }, + { + "epoch": 0.43682079693160025, + "eval_loss": 0.004954234231263399, + "eval_runtime": 35.0751, + "eval_samples_per_second": 4281.502, + "eval_steps_per_second": 66.914, + "step": 4100 + }, + { + "epoch": 0.4378862135094822, + "grad_norm": 0.001573398825712502, + "learning_rate": 1.1242275729810358e-05, + "loss": 0.0047, + "step": 4110 + }, + { + "epoch": 0.4378862135094822, + "eval_loss": 0.004456042777746916, + "eval_runtime": 35.0967, + "eval_samples_per_second": 4278.862, + "eval_steps_per_second": 66.872, + "step": 4110 + }, + { + "epoch": 0.43895163008736415, + "grad_norm": 0.00567322364076972, + "learning_rate": 1.1220967398252718e-05, + "loss": 0.0208, + "step": 4120 + }, + { + "epoch": 0.43895163008736415, + "eval_loss": 0.004521696828305721, + "eval_runtime": 35.0786, + "eval_samples_per_second": 4281.069, + "eval_steps_per_second": 66.907, + "step": 4120 + }, + { + "epoch": 0.4400170466652461, + "grad_norm": 0.23694893717765808, + "learning_rate": 1.119965906669508e-05, + "loss": 0.0006, + "step": 4130 + }, + { + "epoch": 0.4400170466652461, + "eval_loss": 0.00439961813390255, + "eval_runtime": 35.0751, + "eval_samples_per_second": 4281.495, + "eval_steps_per_second": 66.914, + "step": 4130 + }, + { + "epoch": 0.44108246324312805, + "grad_norm": 0.0016371961683034897, + "learning_rate": 1.117835073513744e-05, + "loss": 0.0004, + "step": 4140 + }, + { + "epoch": 0.44108246324312805, + "eval_loss": 0.004129570908844471, + "eval_runtime": 35.0666, + "eval_samples_per_second": 4282.542, + "eval_steps_per_second": 66.93, + "step": 4140 + }, + { + "epoch": 0.44214787982101, + "grad_norm": 0.001793770119547844, + "learning_rate": 1.1157042403579802e-05, + "loss": 0.0005, + "step": 4150 + }, + { + "epoch": 0.44214787982101, + "eval_loss": 0.0041097295470535755, + "eval_runtime": 35.0581, + "eval_samples_per_second": 4283.58, + "eval_steps_per_second": 66.946, + "step": 4150 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 0.025650380179286003, + "learning_rate": 1.1135734072022162e-05, + "loss": 0.0115, + "step": 4160 + }, + { + "epoch": 0.44321329639889195, + "eval_loss": 0.004528433550149202, + "eval_runtime": 35.0417, + "eval_samples_per_second": 4285.579, + "eval_steps_per_second": 66.977, + "step": 4160 + }, + { + "epoch": 0.4442787129767739, + "grad_norm": 0.001788902678526938, + "learning_rate": 1.1114425740464523e-05, + "loss": 0.0054, + "step": 4170 + }, + { + "epoch": 0.4442787129767739, + "eval_loss": 0.004935861565172672, + "eval_runtime": 35.0797, + "eval_samples_per_second": 4280.932, + "eval_steps_per_second": 66.905, + "step": 4170 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 0.07157997041940689, + "learning_rate": 1.1093117408906884e-05, + "loss": 0.0044, + "step": 4180 + }, + { + "epoch": 0.44534412955465585, + "eval_loss": 0.005132563877850771, + "eval_runtime": 35.0861, + "eval_samples_per_second": 4280.157, + "eval_steps_per_second": 66.893, + "step": 4180 + }, + { + "epoch": 0.4464095461325378, + "grad_norm": 0.0018303110264241695, + "learning_rate": 1.1071809077349245e-05, + "loss": 0.001, + "step": 4190 + }, + { + "epoch": 0.4464095461325378, + "eval_loss": 0.005342422518879175, + "eval_runtime": 35.0513, + "eval_samples_per_second": 4284.412, + "eval_steps_per_second": 66.959, + "step": 4190 + }, + { + "epoch": 0.44747496271041975, + "grad_norm": 0.0073822783306241035, + "learning_rate": 1.1050500745791606e-05, + "loss": 0.0018, + "step": 4200 + }, + { + "epoch": 0.44747496271041975, + "eval_loss": 0.0050468165427446365, + "eval_runtime": 35.0561, + "eval_samples_per_second": 4283.816, + "eval_steps_per_second": 66.95, + "step": 4200 + }, + { + "epoch": 0.4485403792883017, + "grad_norm": 0.06104118749499321, + "learning_rate": 1.1029192414233967e-05, + "loss": 0.0005, + "step": 4210 + }, + { + "epoch": 0.4485403792883017, + "eval_loss": 0.004778716247528791, + "eval_runtime": 35.0462, + "eval_samples_per_second": 4285.024, + "eval_steps_per_second": 66.969, + "step": 4210 + }, + { + "epoch": 0.44960579586618366, + "grad_norm": 0.07653524726629257, + "learning_rate": 1.1007884082676328e-05, + "loss": 0.0009, + "step": 4220 + }, + { + "epoch": 0.44960579586618366, + "eval_loss": 0.0042607756331563, + "eval_runtime": 35.0848, + "eval_samples_per_second": 4280.319, + "eval_steps_per_second": 66.895, + "step": 4220 + }, + { + "epoch": 0.4506712124440656, + "grad_norm": 0.0015622730134055018, + "learning_rate": 1.098657575111869e-05, + "loss": 0.0007, + "step": 4230 + }, + { + "epoch": 0.4506712124440656, + "eval_loss": 0.004101978614926338, + "eval_runtime": 35.0595, + "eval_samples_per_second": 4283.398, + "eval_steps_per_second": 66.943, + "step": 4230 + }, + { + "epoch": 0.45173662902194756, + "grad_norm": 0.16386055946350098, + "learning_rate": 1.096526741956105e-05, + "loss": 0.0008, + "step": 4240 + }, + { + "epoch": 0.45173662902194756, + "eval_loss": 0.004037812352180481, + "eval_runtime": 35.0852, + "eval_samples_per_second": 4280.269, + "eval_steps_per_second": 66.894, + "step": 4240 + }, + { + "epoch": 0.4528020455998295, + "grad_norm": 0.05913758650422096, + "learning_rate": 1.0943959088003411e-05, + "loss": 0.0029, + "step": 4250 + }, + { + "epoch": 0.4528020455998295, + "eval_loss": 0.003882251214236021, + "eval_runtime": 35.0103, + "eval_samples_per_second": 4289.42, + "eval_steps_per_second": 67.037, + "step": 4250 + }, + { + "epoch": 0.4538674621777115, + "grad_norm": 0.0016470799455419183, + "learning_rate": 1.0922650756445772e-05, + "loss": 0.0008, + "step": 4260 + }, + { + "epoch": 0.4538674621777115, + "eval_loss": 0.003832570044323802, + "eval_runtime": 35.0235, + "eval_samples_per_second": 4287.813, + "eval_steps_per_second": 67.012, + "step": 4260 + }, + { + "epoch": 0.45493287875559346, + "grad_norm": 0.0037195871118456125, + "learning_rate": 1.0901342424888133e-05, + "loss": 0.0002, + "step": 4270 + }, + { + "epoch": 0.45493287875559346, + "eval_loss": 0.0038891404401510954, + "eval_runtime": 35.053, + "eval_samples_per_second": 4284.196, + "eval_steps_per_second": 66.956, + "step": 4270 + }, + { + "epoch": 0.4559982953334754, + "grad_norm": 0.001607783604413271, + "learning_rate": 1.0880034093330494e-05, + "loss": 0.0012, + "step": 4280 + }, + { + "epoch": 0.4559982953334754, + "eval_loss": 0.0040294453501701355, + "eval_runtime": 35.0398, + "eval_samples_per_second": 4285.807, + "eval_steps_per_second": 66.981, + "step": 4280 + }, + { + "epoch": 0.45706371191135736, + "grad_norm": 0.005903988610953093, + "learning_rate": 1.0858725761772855e-05, + "loss": 0.0016, + "step": 4290 + }, + { + "epoch": 0.45706371191135736, + "eval_loss": 0.003837657393887639, + "eval_runtime": 35.0606, + "eval_samples_per_second": 4283.274, + "eval_steps_per_second": 66.941, + "step": 4290 + }, + { + "epoch": 0.4581291284892393, + "grad_norm": 0.017637008801102638, + "learning_rate": 1.0837417430215216e-05, + "loss": 0.0003, + "step": 4300 + }, + { + "epoch": 0.4581291284892393, + "eval_loss": 0.003823323640972376, + "eval_runtime": 35.0483, + "eval_samples_per_second": 4284.777, + "eval_steps_per_second": 66.965, + "step": 4300 + }, + { + "epoch": 0.45919454506712126, + "grad_norm": 0.0024709682911634445, + "learning_rate": 1.0816109098657577e-05, + "loss": 0.0002, + "step": 4310 + }, + { + "epoch": 0.45919454506712126, + "eval_loss": 0.003842473030090332, + "eval_runtime": 35.0497, + "eval_samples_per_second": 4284.604, + "eval_steps_per_second": 66.962, + "step": 4310 + }, + { + "epoch": 0.4602599616450032, + "grad_norm": 0.0015808714088052511, + "learning_rate": 1.0794800767099937e-05, + "loss": 0.0067, + "step": 4320 + }, + { + "epoch": 0.4602599616450032, + "eval_loss": 0.0036013866774737835, + "eval_runtime": 35.0348, + "eval_samples_per_second": 4286.421, + "eval_steps_per_second": 66.99, + "step": 4320 + }, + { + "epoch": 0.46132537822288516, + "grad_norm": 0.0048879231326282024, + "learning_rate": 1.07734924355423e-05, + "loss": 0.0008, + "step": 4330 + }, + { + "epoch": 0.46132537822288516, + "eval_loss": 0.0035459273494780064, + "eval_runtime": 35.0239, + "eval_samples_per_second": 4287.753, + "eval_steps_per_second": 67.011, + "step": 4330 + }, + { + "epoch": 0.4623907948007671, + "grad_norm": 2.5852835178375244, + "learning_rate": 1.075218410398466e-05, + "loss": 0.0059, + "step": 4340 + }, + { + "epoch": 0.4623907948007671, + "eval_loss": 0.003548369277268648, + "eval_runtime": 35.0675, + "eval_samples_per_second": 4282.425, + "eval_steps_per_second": 66.928, + "step": 4340 + }, + { + "epoch": 0.46345621137864906, + "grad_norm": 0.017402660101652145, + "learning_rate": 1.0730875772427021e-05, + "loss": 0.0001, + "step": 4350 + }, + { + "epoch": 0.46345621137864906, + "eval_loss": 0.0034816220868378878, + "eval_runtime": 35.0635, + "eval_samples_per_second": 4282.911, + "eval_steps_per_second": 66.936, + "step": 4350 + }, + { + "epoch": 0.464521627956531, + "grad_norm": 0.001518838806077838, + "learning_rate": 1.0709567440869381e-05, + "loss": 0.0032, + "step": 4360 + }, + { + "epoch": 0.464521627956531, + "eval_loss": 0.0034797603730112314, + "eval_runtime": 35.0071, + "eval_samples_per_second": 4289.821, + "eval_steps_per_second": 67.044, + "step": 4360 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 0.0015628690598532557, + "learning_rate": 1.0688259109311743e-05, + "loss": 0.0054, + "step": 4370 + }, + { + "epoch": 0.46558704453441296, + "eval_loss": 0.0033893610816448927, + "eval_runtime": 35.0177, + "eval_samples_per_second": 4288.516, + "eval_steps_per_second": 67.023, + "step": 4370 + }, + { + "epoch": 0.4666524611122949, + "grad_norm": 0.015727238729596138, + "learning_rate": 1.0666950777754103e-05, + "loss": 0.001, + "step": 4380 + }, + { + "epoch": 0.4666524611122949, + "eval_loss": 0.003377847606316209, + "eval_runtime": 35.0481, + "eval_samples_per_second": 4284.803, + "eval_steps_per_second": 66.965, + "step": 4380 + }, + { + "epoch": 0.46771787769017686, + "grad_norm": 0.001751308562234044, + "learning_rate": 1.0645642446196465e-05, + "loss": 0.0095, + "step": 4390 + }, + { + "epoch": 0.46771787769017686, + "eval_loss": 0.003439757041633129, + "eval_runtime": 35.0358, + "eval_samples_per_second": 4286.298, + "eval_steps_per_second": 66.989, + "step": 4390 + }, + { + "epoch": 0.4687832942680588, + "grad_norm": 0.003558347700163722, + "learning_rate": 1.0624334114638825e-05, + "loss": 0.0014, + "step": 4400 + }, + { + "epoch": 0.4687832942680588, + "eval_loss": 0.0034394925460219383, + "eval_runtime": 35.0439, + "eval_samples_per_second": 4285.308, + "eval_steps_per_second": 66.973, + "step": 4400 + }, + { + "epoch": 0.46984871084594076, + "grad_norm": 0.025436507537961006, + "learning_rate": 1.0603025783081187e-05, + "loss": 0.0011, + "step": 4410 + }, + { + "epoch": 0.46984871084594076, + "eval_loss": 0.003332258900627494, + "eval_runtime": 35.0777, + "eval_samples_per_second": 4281.18, + "eval_steps_per_second": 66.909, + "step": 4410 + }, + { + "epoch": 0.4709141274238227, + "grad_norm": 0.0017181203002110124, + "learning_rate": 1.0581717451523547e-05, + "loss": 0.0051, + "step": 4420 + }, + { + "epoch": 0.4709141274238227, + "eval_loss": 0.0032833644654601812, + "eval_runtime": 35.0364, + "eval_samples_per_second": 4286.225, + "eval_steps_per_second": 66.987, + "step": 4420 + }, + { + "epoch": 0.47197954400170467, + "grad_norm": 0.3103368282318115, + "learning_rate": 1.0560409119965906e-05, + "loss": 0.0009, + "step": 4430 + }, + { + "epoch": 0.47197954400170467, + "eval_loss": 0.003310458268970251, + "eval_runtime": 35.0137, + "eval_samples_per_second": 4289.005, + "eval_steps_per_second": 67.031, + "step": 4430 + }, + { + "epoch": 0.4730449605795866, + "grad_norm": 0.0034083034843206406, + "learning_rate": 1.0539100788408268e-05, + "loss": 0.0016, + "step": 4440 + }, + { + "epoch": 0.4730449605795866, + "eval_loss": 0.00335130887106061, + "eval_runtime": 35.028, + "eval_samples_per_second": 4287.254, + "eval_steps_per_second": 67.004, + "step": 4440 + }, + { + "epoch": 0.47411037715746857, + "grad_norm": 0.0014370749704539776, + "learning_rate": 1.0517792456850628e-05, + "loss": 0.0026, + "step": 4450 + }, + { + "epoch": 0.47411037715746857, + "eval_loss": 0.0034298617392778397, + "eval_runtime": 35.0892, + "eval_samples_per_second": 4279.782, + "eval_steps_per_second": 66.887, + "step": 4450 + }, + { + "epoch": 0.4751757937353505, + "grad_norm": 0.004712993744760752, + "learning_rate": 1.049648412529299e-05, + "loss": 0.0001, + "step": 4460 + }, + { + "epoch": 0.4751757937353505, + "eval_loss": 0.003493973519653082, + "eval_runtime": 35.0547, + "eval_samples_per_second": 4283.994, + "eval_steps_per_second": 66.953, + "step": 4460 + }, + { + "epoch": 0.47624121031323247, + "grad_norm": 0.0023907856084406376, + "learning_rate": 1.047517579373535e-05, + "loss": 0.0092, + "step": 4470 + }, + { + "epoch": 0.47624121031323247, + "eval_loss": 0.003423650749027729, + "eval_runtime": 35.0747, + "eval_samples_per_second": 4281.543, + "eval_steps_per_second": 66.914, + "step": 4470 + }, + { + "epoch": 0.4773066268911144, + "grad_norm": 0.6058014035224915, + "learning_rate": 1.0453867462177712e-05, + "loss": 0.0027, + "step": 4480 + }, + { + "epoch": 0.4773066268911144, + "eval_loss": 0.0034181708469986916, + "eval_runtime": 35.0269, + "eval_samples_per_second": 4287.387, + "eval_steps_per_second": 67.006, + "step": 4480 + }, + { + "epoch": 0.47837204346899637, + "grad_norm": 0.002385197440162301, + "learning_rate": 1.0432559130620072e-05, + "loss": 0.0003, + "step": 4490 + }, + { + "epoch": 0.47837204346899637, + "eval_loss": 0.003402228932827711, + "eval_runtime": 35.0808, + "eval_samples_per_second": 4280.801, + "eval_steps_per_second": 66.903, + "step": 4490 + }, + { + "epoch": 0.4794374600468783, + "grad_norm": 0.004214168526232243, + "learning_rate": 1.0411250799062434e-05, + "loss": 0.0093, + "step": 4500 + }, + { + "epoch": 0.4794374600468783, + "eval_loss": 0.0033664952497929335, + "eval_runtime": 35.004, + "eval_samples_per_second": 4290.198, + "eval_steps_per_second": 67.05, + "step": 4500 + }, + { + "epoch": 0.48050287662476027, + "grad_norm": 0.0015161953633651137, + "learning_rate": 1.0389942467504794e-05, + "loss": 0.0001, + "step": 4510 + }, + { + "epoch": 0.48050287662476027, + "eval_loss": 0.003395343665033579, + "eval_runtime": 35.0498, + "eval_samples_per_second": 4284.584, + "eval_steps_per_second": 66.962, + "step": 4510 + }, + { + "epoch": 0.4815682932026422, + "grad_norm": 0.0015271385200321674, + "learning_rate": 1.0368634135947156e-05, + "loss": 0.0096, + "step": 4520 + }, + { + "epoch": 0.4815682932026422, + "eval_loss": 0.0033128561917692423, + "eval_runtime": 35.0436, + "eval_samples_per_second": 4285.349, + "eval_steps_per_second": 66.974, + "step": 4520 + }, + { + "epoch": 0.48263370978052417, + "grad_norm": 0.004237685352563858, + "learning_rate": 1.0347325804389516e-05, + "loss": 0.0003, + "step": 4530 + }, + { + "epoch": 0.48263370978052417, + "eval_loss": 0.0032125210855156183, + "eval_runtime": 35.0305, + "eval_samples_per_second": 4286.947, + "eval_steps_per_second": 66.999, + "step": 4530 + }, + { + "epoch": 0.4836991263584061, + "grad_norm": 0.0016559308860450983, + "learning_rate": 1.0326017472831878e-05, + "loss": 0.0141, + "step": 4540 + }, + { + "epoch": 0.4836991263584061, + "eval_loss": 0.003183180931955576, + "eval_runtime": 35.0073, + "eval_samples_per_second": 4289.791, + "eval_steps_per_second": 67.043, + "step": 4540 + }, + { + "epoch": 0.48476454293628807, + "grad_norm": 0.0016300799325108528, + "learning_rate": 1.0304709141274238e-05, + "loss": 0.0042, + "step": 4550 + }, + { + "epoch": 0.48476454293628807, + "eval_loss": 0.0034217729698866606, + "eval_runtime": 35.0329, + "eval_samples_per_second": 4286.656, + "eval_steps_per_second": 66.994, + "step": 4550 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 0.39741653203964233, + "learning_rate": 1.02834008097166e-05, + "loss": 0.0005, + "step": 4560 + }, + { + "epoch": 0.48582995951417, + "eval_loss": 0.003642290597781539, + "eval_runtime": 35.09, + "eval_samples_per_second": 4279.682, + "eval_steps_per_second": 66.885, + "step": 4560 + }, + { + "epoch": 0.48689537609205197, + "grad_norm": 0.18953844904899597, + "learning_rate": 1.026209247815896e-05, + "loss": 0.0007, + "step": 4570 + }, + { + "epoch": 0.48689537609205197, + "eval_loss": 0.003431662917137146, + "eval_runtime": 35.0419, + "eval_samples_per_second": 4285.553, + "eval_steps_per_second": 66.977, + "step": 4570 + }, + { + "epoch": 0.4879607926699339, + "grad_norm": 0.0029646658804267645, + "learning_rate": 1.0240784146601322e-05, + "loss": 0.0013, + "step": 4580 + }, + { + "epoch": 0.4879607926699339, + "eval_loss": 0.0033544725738465786, + "eval_runtime": 35.0086, + "eval_samples_per_second": 4289.63, + "eval_steps_per_second": 67.041, + "step": 4580 + }, + { + "epoch": 0.48902620924781587, + "grad_norm": 0.008971764706075191, + "learning_rate": 1.0219475815043682e-05, + "loss": 0.001, + "step": 4590 + }, + { + "epoch": 0.48902620924781587, + "eval_loss": 0.003337480593472719, + "eval_runtime": 35.0018, + "eval_samples_per_second": 4290.465, + "eval_steps_per_second": 67.054, + "step": 4590 + }, + { + "epoch": 0.4900916258256979, + "grad_norm": 0.0016449299873784184, + "learning_rate": 1.0198167483486044e-05, + "loss": 0.0011, + "step": 4600 + }, + { + "epoch": 0.4900916258256979, + "eval_loss": 0.0033237505704164505, + "eval_runtime": 35.0433, + "eval_samples_per_second": 4285.388, + "eval_steps_per_second": 66.974, + "step": 4600 + }, + { + "epoch": 0.4911570424035798, + "grad_norm": 0.013473814353346825, + "learning_rate": 1.0176859151928404e-05, + "loss": 0.0017, + "step": 4610 + }, + { + "epoch": 0.4911570424035798, + "eval_loss": 0.003368969541043043, + "eval_runtime": 35.0263, + "eval_samples_per_second": 4287.467, + "eval_steps_per_second": 67.007, + "step": 4610 + }, + { + "epoch": 0.4922224589814618, + "grad_norm": 0.0029099630191922188, + "learning_rate": 1.0155550820370766e-05, + "loss": 0.0007, + "step": 4620 + }, + { + "epoch": 0.4922224589814618, + "eval_loss": 0.003540628356859088, + "eval_runtime": 35.0376, + "eval_samples_per_second": 4286.083, + "eval_steps_per_second": 66.985, + "step": 4620 + }, + { + "epoch": 0.4932878755593437, + "grad_norm": 0.020577091723680496, + "learning_rate": 1.0134242488813126e-05, + "loss": 0.0005, + "step": 4630 + }, + { + "epoch": 0.4932878755593437, + "eval_loss": 0.0037534397561103106, + "eval_runtime": 35.0156, + "eval_samples_per_second": 4288.773, + "eval_steps_per_second": 67.027, + "step": 4630 + }, + { + "epoch": 0.4943532921372257, + "grad_norm": 0.07291168719530106, + "learning_rate": 1.0112934157255487e-05, + "loss": 0.0003, + "step": 4640 + }, + { + "epoch": 0.4943532921372257, + "eval_loss": 0.003838547272607684, + "eval_runtime": 35.0423, + "eval_samples_per_second": 4285.511, + "eval_steps_per_second": 66.976, + "step": 4640 + }, + { + "epoch": 0.4954187087151076, + "grad_norm": 0.02392764948308468, + "learning_rate": 1.0091625825697848e-05, + "loss": 0.0059, + "step": 4650 + }, + { + "epoch": 0.4954187087151076, + "eval_loss": 0.0035981247201561928, + "eval_runtime": 35.0354, + "eval_samples_per_second": 4286.356, + "eval_steps_per_second": 66.989, + "step": 4650 + }, + { + "epoch": 0.4964841252929896, + "grad_norm": 0.001953831873834133, + "learning_rate": 1.007031749414021e-05, + "loss": 0.0002, + "step": 4660 + }, + { + "epoch": 0.4964841252929896, + "eval_loss": 0.003541785990819335, + "eval_runtime": 35.0691, + "eval_samples_per_second": 4282.23, + "eval_steps_per_second": 66.925, + "step": 4660 + }, + { + "epoch": 0.4975495418708715, + "grad_norm": 0.05205778032541275, + "learning_rate": 1.004900916258257e-05, + "loss": 0.0008, + "step": 4670 + }, + { + "epoch": 0.4975495418708715, + "eval_loss": 0.0035847588442265987, + "eval_runtime": 35.0465, + "eval_samples_per_second": 4284.988, + "eval_steps_per_second": 66.968, + "step": 4670 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 0.0016077999025583267, + "learning_rate": 1.0027700831024931e-05, + "loss": 0.0001, + "step": 4680 + }, + { + "epoch": 0.4986149584487535, + "eval_loss": 0.0036463753785938025, + "eval_runtime": 35.0411, + "eval_samples_per_second": 4285.648, + "eval_steps_per_second": 66.978, + "step": 4680 + }, + { + "epoch": 0.4996803750266354, + "grad_norm": 0.1424156129360199, + "learning_rate": 1.0006392499467292e-05, + "loss": 0.0035, + "step": 4690 + }, + { + "epoch": 0.4996803750266354, + "eval_loss": 0.003566704923287034, + "eval_runtime": 35.0012, + "eval_samples_per_second": 4290.537, + "eval_steps_per_second": 67.055, + "step": 4690 + }, + { + "epoch": 0.5007457916045174, + "grad_norm": 0.0017005006084218621, + "learning_rate": 9.985084167909653e-06, + "loss": 0.0003, + "step": 4700 + }, + { + "epoch": 0.5007457916045174, + "eval_loss": 0.0034857653081417084, + "eval_runtime": 35.004, + "eval_samples_per_second": 4290.198, + "eval_steps_per_second": 67.05, + "step": 4700 + }, + { + "epoch": 0.5018112081823993, + "grad_norm": 0.11611221730709076, + "learning_rate": 9.963775836352014e-06, + "loss": 0.0007, + "step": 4710 + }, + { + "epoch": 0.5018112081823993, + "eval_loss": 0.0035124989226460457, + "eval_runtime": 35.059, + "eval_samples_per_second": 4283.467, + "eval_steps_per_second": 66.944, + "step": 4710 + }, + { + "epoch": 0.5028766247602813, + "grad_norm": 0.023515425622463226, + "learning_rate": 9.942467504794375e-06, + "loss": 0.0065, + "step": 4720 + }, + { + "epoch": 0.5028766247602813, + "eval_loss": 0.0037029124796390533, + "eval_runtime": 35.0494, + "eval_samples_per_second": 4284.644, + "eval_steps_per_second": 66.963, + "step": 4720 + }, + { + "epoch": 0.5039420413381632, + "grad_norm": 0.022330928593873978, + "learning_rate": 9.921159173236736e-06, + "loss": 0.0003, + "step": 4730 + }, + { + "epoch": 0.5039420413381632, + "eval_loss": 0.003991840872913599, + "eval_runtime": 35.0528, + "eval_samples_per_second": 4284.226, + "eval_steps_per_second": 66.956, + "step": 4730 + }, + { + "epoch": 0.5050074579160452, + "grad_norm": 0.0015543290646746755, + "learning_rate": 9.899850841679097e-06, + "loss": 0.0005, + "step": 4740 + }, + { + "epoch": 0.5050074579160452, + "eval_loss": 0.004205774050205946, + "eval_runtime": 35.0589, + "eval_samples_per_second": 4283.481, + "eval_steps_per_second": 66.945, + "step": 4740 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 0.0047645787708461285, + "learning_rate": 9.878542510121458e-06, + "loss": 0.0097, + "step": 4750 + }, + { + "epoch": 0.5060728744939271, + "eval_loss": 0.004034877754747868, + "eval_runtime": 35.029, + "eval_samples_per_second": 4287.128, + "eval_steps_per_second": 67.002, + "step": 4750 + }, + { + "epoch": 0.5071382910718091, + "grad_norm": 0.018391378223896027, + "learning_rate": 9.85723417856382e-06, + "loss": 0.0033, + "step": 4760 + }, + { + "epoch": 0.5071382910718091, + "eval_loss": 0.0034893695265054703, + "eval_runtime": 35.0565, + "eval_samples_per_second": 4283.77, + "eval_steps_per_second": 66.949, + "step": 4760 + }, + { + "epoch": 0.508203707649691, + "grad_norm": 0.09393607079982758, + "learning_rate": 9.83592584700618e-06, + "loss": 0.0061, + "step": 4770 + }, + { + "epoch": 0.508203707649691, + "eval_loss": 0.003367075929418206, + "eval_runtime": 35.0294, + "eval_samples_per_second": 4287.079, + "eval_steps_per_second": 67.001, + "step": 4770 + }, + { + "epoch": 0.509269124227573, + "grad_norm": 0.05805261433124542, + "learning_rate": 9.814617515448541e-06, + "loss": 0.0029, + "step": 4780 + }, + { + "epoch": 0.509269124227573, + "eval_loss": 0.0032794128637760878, + "eval_runtime": 35.0347, + "eval_samples_per_second": 4286.435, + "eval_steps_per_second": 66.991, + "step": 4780 + }, + { + "epoch": 0.5103345408054549, + "grad_norm": 0.218561589717865, + "learning_rate": 9.793309183890901e-06, + "loss": 0.0011, + "step": 4790 + }, + { + "epoch": 0.5103345408054549, + "eval_loss": 0.0032986998558044434, + "eval_runtime": 35.042, + "eval_samples_per_second": 4285.545, + "eval_steps_per_second": 66.977, + "step": 4790 + }, + { + "epoch": 0.5113999573833369, + "grad_norm": 0.0016841794131323695, + "learning_rate": 9.772000852333263e-06, + "loss": 0.0005, + "step": 4800 + }, + { + "epoch": 0.5113999573833369, + "eval_loss": 0.003459086874499917, + "eval_runtime": 35.0677, + "eval_samples_per_second": 4282.407, + "eval_steps_per_second": 66.928, + "step": 4800 + }, + { + "epoch": 0.5124653739612188, + "grad_norm": 0.16237737238407135, + "learning_rate": 9.750692520775623e-06, + "loss": 0.0029, + "step": 4810 + }, + { + "epoch": 0.5124653739612188, + "eval_loss": 0.0034708159510046244, + "eval_runtime": 35.1028, + "eval_samples_per_second": 4278.124, + "eval_steps_per_second": 66.861, + "step": 4810 + }, + { + "epoch": 0.5135307905391008, + "grad_norm": 0.0015488864155486226, + "learning_rate": 9.729384189217985e-06, + "loss": 0.0133, + "step": 4820 + }, + { + "epoch": 0.5135307905391008, + "eval_loss": 0.0035250100772827864, + "eval_runtime": 35.0425, + "eval_samples_per_second": 4285.48, + "eval_steps_per_second": 66.976, + "step": 4820 + }, + { + "epoch": 0.5145962071169827, + "grad_norm": 0.0018307908903807402, + "learning_rate": 9.708075857660345e-06, + "loss": 0.0003, + "step": 4830 + }, + { + "epoch": 0.5145962071169827, + "eval_loss": 0.0035982467234134674, + "eval_runtime": 35.0382, + "eval_samples_per_second": 4286.011, + "eval_steps_per_second": 66.984, + "step": 4830 + }, + { + "epoch": 0.5156616236948647, + "grad_norm": 0.001923054805956781, + "learning_rate": 9.686767526102707e-06, + "loss": 0.0067, + "step": 4840 + }, + { + "epoch": 0.5156616236948647, + "eval_loss": 0.003679609391838312, + "eval_runtime": 35.0048, + "eval_samples_per_second": 4290.095, + "eval_steps_per_second": 67.048, + "step": 4840 + }, + { + "epoch": 0.5167270402727466, + "grad_norm": 0.004570928402245045, + "learning_rate": 9.665459194545067e-06, + "loss": 0.0002, + "step": 4850 + }, + { + "epoch": 0.5167270402727466, + "eval_loss": 0.003768416354432702, + "eval_runtime": 35.0717, + "eval_samples_per_second": 4281.911, + "eval_steps_per_second": 66.92, + "step": 4850 + }, + { + "epoch": 0.5177924568506286, + "grad_norm": 0.002467310754582286, + "learning_rate": 9.64415086298743e-06, + "loss": 0.0002, + "step": 4860 + }, + { + "epoch": 0.5177924568506286, + "eval_loss": 0.0038299639709293842, + "eval_runtime": 35.0546, + "eval_samples_per_second": 4283.999, + "eval_steps_per_second": 66.953, + "step": 4860 + }, + { + "epoch": 0.5188578734285105, + "grad_norm": 0.007724090479314327, + "learning_rate": 9.62284253142979e-06, + "loss": 0.0002, + "step": 4870 + }, + { + "epoch": 0.5188578734285105, + "eval_loss": 0.0038635297678411007, + "eval_runtime": 35.0595, + "eval_samples_per_second": 4283.406, + "eval_steps_per_second": 66.943, + "step": 4870 + }, + { + "epoch": 0.5199232900063925, + "grad_norm": 0.0018088623182848096, + "learning_rate": 9.601534199872151e-06, + "loss": 0.0021, + "step": 4880 + }, + { + "epoch": 0.5199232900063925, + "eval_loss": 0.00368054979480803, + "eval_runtime": 35.0441, + "eval_samples_per_second": 4285.285, + "eval_steps_per_second": 66.973, + "step": 4880 + }, + { + "epoch": 0.5209887065842744, + "grad_norm": 0.001608138787560165, + "learning_rate": 9.580225868314511e-06, + "loss": 0.0003, + "step": 4890 + }, + { + "epoch": 0.5209887065842744, + "eval_loss": 0.0037215733900666237, + "eval_runtime": 35.0462, + "eval_samples_per_second": 4285.034, + "eval_steps_per_second": 66.969, + "step": 4890 + }, + { + "epoch": 0.5220541231621564, + "grad_norm": 0.006339292973279953, + "learning_rate": 9.558917536756873e-06, + "loss": 0.0002, + "step": 4900 + }, + { + "epoch": 0.5220541231621564, + "eval_loss": 0.003766452893614769, + "eval_runtime": 35.0484, + "eval_samples_per_second": 4284.756, + "eval_steps_per_second": 66.964, + "step": 4900 + }, + { + "epoch": 0.5231195397400383, + "grad_norm": 0.0025099278427660465, + "learning_rate": 9.537609205199233e-06, + "loss": 0.0002, + "step": 4910 + }, + { + "epoch": 0.5231195397400383, + "eval_loss": 0.003810285124927759, + "eval_runtime": 35.0708, + "eval_samples_per_second": 4282.026, + "eval_steps_per_second": 66.922, + "step": 4910 + }, + { + "epoch": 0.5241849563179203, + "grad_norm": 0.013173098675906658, + "learning_rate": 9.516300873641595e-06, + "loss": 0.0146, + "step": 4920 + }, + { + "epoch": 0.5241849563179203, + "eval_loss": 0.0033065176103264093, + "eval_runtime": 35.0434, + "eval_samples_per_second": 4285.372, + "eval_steps_per_second": 66.974, + "step": 4920 + }, + { + "epoch": 0.5252503728958022, + "grad_norm": 0.010503698140382767, + "learning_rate": 9.494992542083955e-06, + "loss": 0.0005, + "step": 4930 + }, + { + "epoch": 0.5252503728958022, + "eval_loss": 0.003174431389197707, + "eval_runtime": 35.0738, + "eval_samples_per_second": 4281.663, + "eval_steps_per_second": 66.916, + "step": 4930 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.001948002027347684, + "learning_rate": 9.473684210526315e-06, + "loss": 0.0067, + "step": 4940 + }, + { + "epoch": 0.5263157894736842, + "eval_loss": 0.0031368620693683624, + "eval_runtime": 35.0438, + "eval_samples_per_second": 4285.322, + "eval_steps_per_second": 66.973, + "step": 4940 + }, + { + "epoch": 0.5273812060515661, + "grad_norm": 0.2971310019493103, + "learning_rate": 9.452375878968677e-06, + "loss": 0.0068, + "step": 4950 + }, + { + "epoch": 0.5273812060515661, + "eval_loss": 0.003268307074904442, + "eval_runtime": 35.0314, + "eval_samples_per_second": 4286.844, + "eval_steps_per_second": 66.997, + "step": 4950 + }, + { + "epoch": 0.5284466226294481, + "grad_norm": 0.0015663893427699804, + "learning_rate": 9.431067547411037e-06, + "loss": 0.0024, + "step": 4960 + }, + { + "epoch": 0.5284466226294481, + "eval_loss": 0.003325843717902899, + "eval_runtime": 35.0579, + "eval_samples_per_second": 4283.604, + "eval_steps_per_second": 66.946, + "step": 4960 + }, + { + "epoch": 0.52951203920733, + "grad_norm": 0.0014791989233344793, + "learning_rate": 9.4097592158534e-06, + "loss": 0.0008, + "step": 4970 + }, + { + "epoch": 0.52951203920733, + "eval_loss": 0.003260023193433881, + "eval_runtime": 35.0543, + "eval_samples_per_second": 4284.038, + "eval_steps_per_second": 66.953, + "step": 4970 + }, + { + "epoch": 0.530577455785212, + "grad_norm": 0.0014208897482603788, + "learning_rate": 9.38845088429576e-06, + "loss": 0.0007, + "step": 4980 + }, + { + "epoch": 0.530577455785212, + "eval_loss": 0.0032248280476778746, + "eval_runtime": 35.0876, + "eval_samples_per_second": 4279.972, + "eval_steps_per_second": 66.89, + "step": 4980 + }, + { + "epoch": 0.5316428723630939, + "grad_norm": 2.7022790908813477, + "learning_rate": 9.367142552738121e-06, + "loss": 0.0094, + "step": 4990 + }, + { + "epoch": 0.5316428723630939, + "eval_loss": 0.0041928887367248535, + "eval_runtime": 35.0333, + "eval_samples_per_second": 4286.607, + "eval_steps_per_second": 66.993, + "step": 4990 + }, + { + "epoch": 0.5327082889409759, + "grad_norm": 0.0015129174571484327, + "learning_rate": 9.345834221180481e-06, + "loss": 0.0006, + "step": 5000 + }, + { + "epoch": 0.5327082889409759, + "eval_loss": 0.004537190776318312, + "eval_runtime": 35.0351, + "eval_samples_per_second": 4286.385, + "eval_steps_per_second": 66.99, + "step": 5000 + }, + { + "epoch": 0.5337737055188578, + "grad_norm": 0.20955073833465576, + "learning_rate": 9.324525889622843e-06, + "loss": 0.0027, + "step": 5010 + }, + { + "epoch": 0.5337737055188578, + "eval_loss": 0.0031157478224486113, + "eval_runtime": 35.053, + "eval_samples_per_second": 4284.197, + "eval_steps_per_second": 66.956, + "step": 5010 + }, + { + "epoch": 0.5348391220967398, + "grad_norm": 0.0013660400873050094, + "learning_rate": 9.303217558065203e-06, + "loss": 0.0002, + "step": 5020 + }, + { + "epoch": 0.5348391220967398, + "eval_loss": 0.003070124424993992, + "eval_runtime": 35.0725, + "eval_samples_per_second": 4281.816, + "eval_steps_per_second": 66.919, + "step": 5020 + }, + { + "epoch": 0.5359045386746217, + "grad_norm": 0.9235541224479675, + "learning_rate": 9.281909226507565e-06, + "loss": 0.0087, + "step": 5030 + }, + { + "epoch": 0.5359045386746217, + "eval_loss": 0.0031868915539234877, + "eval_runtime": 35.0389, + "eval_samples_per_second": 4285.924, + "eval_steps_per_second": 66.983, + "step": 5030 + }, + { + "epoch": 0.5369699552525037, + "grad_norm": 0.003905409947037697, + "learning_rate": 9.260600894949925e-06, + "loss": 0.0065, + "step": 5040 + }, + { + "epoch": 0.5369699552525037, + "eval_loss": 0.0031315067317336798, + "eval_runtime": 35.0444, + "eval_samples_per_second": 4285.252, + "eval_steps_per_second": 66.972, + "step": 5040 + }, + { + "epoch": 0.5380353718303856, + "grad_norm": 0.14299072325229645, + "learning_rate": 9.239292563392287e-06, + "loss": 0.0014, + "step": 5050 + }, + { + "epoch": 0.5380353718303856, + "eval_loss": 0.0031951293349266052, + "eval_runtime": 35.0455, + "eval_samples_per_second": 4285.118, + "eval_steps_per_second": 66.97, + "step": 5050 + }, + { + "epoch": 0.5391007884082676, + "grad_norm": 0.18088282644748688, + "learning_rate": 9.217984231834647e-06, + "loss": 0.001, + "step": 5060 + }, + { + "epoch": 0.5391007884082676, + "eval_loss": 0.003341434756293893, + "eval_runtime": 35.0797, + "eval_samples_per_second": 4280.938, + "eval_steps_per_second": 66.905, + "step": 5060 + }, + { + "epoch": 0.5401662049861495, + "grad_norm": 0.10255859047174454, + "learning_rate": 9.19667590027701e-06, + "loss": 0.0042, + "step": 5070 + }, + { + "epoch": 0.5401662049861495, + "eval_loss": 0.0033879380207508802, + "eval_runtime": 35.0607, + "eval_samples_per_second": 4283.26, + "eval_steps_per_second": 66.941, + "step": 5070 + }, + { + "epoch": 0.5412316215640315, + "grad_norm": 0.6156185865402222, + "learning_rate": 9.17536756871937e-06, + "loss": 0.0037, + "step": 5080 + }, + { + "epoch": 0.5412316215640315, + "eval_loss": 0.0033736974000930786, + "eval_runtime": 35.0571, + "eval_samples_per_second": 4283.693, + "eval_steps_per_second": 66.948, + "step": 5080 + }, + { + "epoch": 0.5422970381419134, + "grad_norm": 0.001436607213690877, + "learning_rate": 9.154059237161731e-06, + "loss": 0.0008, + "step": 5090 + }, + { + "epoch": 0.5422970381419134, + "eval_loss": 0.0032905188854783773, + "eval_runtime": 35.0708, + "eval_samples_per_second": 4282.026, + "eval_steps_per_second": 66.922, + "step": 5090 + }, + { + "epoch": 0.5433624547197954, + "grad_norm": 0.013623624108731747, + "learning_rate": 9.132750905604091e-06, + "loss": 0.0002, + "step": 5100 + }, + { + "epoch": 0.5433624547197954, + "eval_loss": 0.0032300897873938084, + "eval_runtime": 35.1794, + "eval_samples_per_second": 4268.799, + "eval_steps_per_second": 66.715, + "step": 5100 + }, + { + "epoch": 0.5444278712976774, + "grad_norm": 0.0022984424140304327, + "learning_rate": 9.111442574046453e-06, + "loss": 0.0021, + "step": 5110 + }, + { + "epoch": 0.5444278712976774, + "eval_loss": 0.0031744264997541904, + "eval_runtime": 35.0537, + "eval_samples_per_second": 4284.114, + "eval_steps_per_second": 66.954, + "step": 5110 + }, + { + "epoch": 0.5454932878755594, + "grad_norm": 0.0016335392137989402, + "learning_rate": 9.090134242488813e-06, + "loss": 0.0037, + "step": 5120 + }, + { + "epoch": 0.5454932878755594, + "eval_loss": 0.0030962612945586443, + "eval_runtime": 35.0514, + "eval_samples_per_second": 4284.394, + "eval_steps_per_second": 66.959, + "step": 5120 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 0.001437659957446158, + "learning_rate": 9.068825910931175e-06, + "loss": 0.0003, + "step": 5130 + }, + { + "epoch": 0.5465587044534413, + "eval_loss": 0.003103644121438265, + "eval_runtime": 35.083, + "eval_samples_per_second": 4280.534, + "eval_steps_per_second": 66.898, + "step": 5130 + }, + { + "epoch": 0.5476241210313233, + "grad_norm": 0.0013070203131064773, + "learning_rate": 9.047517579373535e-06, + "loss": 0.0118, + "step": 5140 + }, + { + "epoch": 0.5476241210313233, + "eval_loss": 0.0030982240568846464, + "eval_runtime": 35.0299, + "eval_samples_per_second": 4287.028, + "eval_steps_per_second": 67.0, + "step": 5140 + }, + { + "epoch": 0.5486895376092052, + "grad_norm": 0.963262677192688, + "learning_rate": 9.026209247815897e-06, + "loss": 0.0018, + "step": 5150 + }, + { + "epoch": 0.5486895376092052, + "eval_loss": 0.003294318215921521, + "eval_runtime": 35.0551, + "eval_samples_per_second": 4283.947, + "eval_steps_per_second": 66.952, + "step": 5150 + }, + { + "epoch": 0.5497549541870872, + "grad_norm": 0.01332628633826971, + "learning_rate": 9.004900916258257e-06, + "loss": 0.0002, + "step": 5160 + }, + { + "epoch": 0.5497549541870872, + "eval_loss": 0.004062108229845762, + "eval_runtime": 35.073, + "eval_samples_per_second": 4281.76, + "eval_steps_per_second": 66.918, + "step": 5160 + }, + { + "epoch": 0.5508203707649691, + "grad_norm": 0.022138891741633415, + "learning_rate": 8.983592584700619e-06, + "loss": 0.0013, + "step": 5170 + }, + { + "epoch": 0.5508203707649691, + "eval_loss": 0.0037723940331488848, + "eval_runtime": 35.0486, + "eval_samples_per_second": 4284.742, + "eval_steps_per_second": 66.964, + "step": 5170 + }, + { + "epoch": 0.5518857873428511, + "grad_norm": 0.5359131693840027, + "learning_rate": 8.96228425314298e-06, + "loss": 0.0077, + "step": 5180 + }, + { + "epoch": 0.5518857873428511, + "eval_loss": 0.004283850081264973, + "eval_runtime": 35.0668, + "eval_samples_per_second": 4282.51, + "eval_steps_per_second": 66.929, + "step": 5180 + }, + { + "epoch": 0.552951203920733, + "grad_norm": 0.0019443683559074998, + "learning_rate": 8.940975921585341e-06, + "loss": 0.0016, + "step": 5190 + }, + { + "epoch": 0.552951203920733, + "eval_loss": 0.004361128434538841, + "eval_runtime": 35.0534, + "eval_samples_per_second": 4284.152, + "eval_steps_per_second": 66.955, + "step": 5190 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 0.07410170882940292, + "learning_rate": 8.919667590027701e-06, + "loss": 0.0074, + "step": 5200 + }, + { + "epoch": 0.554016620498615, + "eval_loss": 0.0039838762022554874, + "eval_runtime": 35.0317, + "eval_samples_per_second": 4286.807, + "eval_steps_per_second": 66.997, + "step": 5200 + }, + { + "epoch": 0.555082037076497, + "grad_norm": 0.0014316923916339874, + "learning_rate": 8.898359258470063e-06, + "loss": 0.0032, + "step": 5210 + }, + { + "epoch": 0.555082037076497, + "eval_loss": 0.0032119122333824635, + "eval_runtime": 35.0299, + "eval_samples_per_second": 4287.025, + "eval_steps_per_second": 67.0, + "step": 5210 + }, + { + "epoch": 0.5561474536543789, + "grad_norm": 0.0034566791728138924, + "learning_rate": 8.877050926912423e-06, + "loss": 0.0013, + "step": 5220 + }, + { + "epoch": 0.5561474536543789, + "eval_loss": 0.0030109714716672897, + "eval_runtime": 35.0631, + "eval_samples_per_second": 4282.969, + "eval_steps_per_second": 66.937, + "step": 5220 + }, + { + "epoch": 0.5572128702322608, + "grad_norm": 0.00445709191262722, + "learning_rate": 8.855742595354785e-06, + "loss": 0.0028, + "step": 5230 + }, + { + "epoch": 0.5572128702322608, + "eval_loss": 0.0030201044864952564, + "eval_runtime": 35.0621, + "eval_samples_per_second": 4283.091, + "eval_steps_per_second": 66.938, + "step": 5230 + }, + { + "epoch": 0.5582782868101428, + "grad_norm": 0.22698596119880676, + "learning_rate": 8.834434263797145e-06, + "loss": 0.0003, + "step": 5240 + }, + { + "epoch": 0.5582782868101428, + "eval_loss": 0.0030886309687048197, + "eval_runtime": 35.0687, + "eval_samples_per_second": 4282.284, + "eval_steps_per_second": 66.926, + "step": 5240 + }, + { + "epoch": 0.5593437033880247, + "grad_norm": 0.08581502735614777, + "learning_rate": 8.813125932239507e-06, + "loss": 0.001, + "step": 5250 + }, + { + "epoch": 0.5593437033880247, + "eval_loss": 0.003185087814927101, + "eval_runtime": 35.025, + "eval_samples_per_second": 4287.618, + "eval_steps_per_second": 67.009, + "step": 5250 + }, + { + "epoch": 0.5604091199659067, + "grad_norm": 0.002484232885763049, + "learning_rate": 8.791817600681867e-06, + "loss": 0.0128, + "step": 5260 + }, + { + "epoch": 0.5604091199659067, + "eval_loss": 0.0033555706031620502, + "eval_runtime": 35.0536, + "eval_samples_per_second": 4284.12, + "eval_steps_per_second": 66.955, + "step": 5260 + }, + { + "epoch": 0.5614745365437886, + "grad_norm": 5.450259208679199, + "learning_rate": 8.770509269124229e-06, + "loss": 0.0125, + "step": 5270 + }, + { + "epoch": 0.5614745365437886, + "eval_loss": 0.0033903010189533234, + "eval_runtime": 35.0949, + "eval_samples_per_second": 4279.088, + "eval_steps_per_second": 66.876, + "step": 5270 + }, + { + "epoch": 0.5625399531216706, + "grad_norm": 0.05760002136230469, + "learning_rate": 8.74920093756659e-06, + "loss": 0.0049, + "step": 5280 + }, + { + "epoch": 0.5625399531216706, + "eval_loss": 0.00321973511017859, + "eval_runtime": 35.0435, + "eval_samples_per_second": 4285.361, + "eval_steps_per_second": 66.974, + "step": 5280 + }, + { + "epoch": 0.5636053696995525, + "grad_norm": 0.005817291792482138, + "learning_rate": 8.72789260600895e-06, + "loss": 0.0016, + "step": 5290 + }, + { + "epoch": 0.5636053696995525, + "eval_loss": 0.0032233393285423517, + "eval_runtime": 35.0468, + "eval_samples_per_second": 4284.959, + "eval_steps_per_second": 66.968, + "step": 5290 + }, + { + "epoch": 0.5646707862774345, + "grad_norm": 0.009052244946360588, + "learning_rate": 8.706584274451311e-06, + "loss": 0.0013, + "step": 5300 + }, + { + "epoch": 0.5646707862774345, + "eval_loss": 0.0031393333338201046, + "eval_runtime": 35.0484, + "eval_samples_per_second": 4284.761, + "eval_steps_per_second": 66.965, + "step": 5300 + }, + { + "epoch": 0.5657362028553165, + "grad_norm": 0.022343887016177177, + "learning_rate": 8.685275942893671e-06, + "loss": 0.0048, + "step": 5310 + }, + { + "epoch": 0.5657362028553165, + "eval_loss": 0.0030866351444274187, + "eval_runtime": 35.0511, + "eval_samples_per_second": 4284.433, + "eval_steps_per_second": 66.959, + "step": 5310 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 0.001582018448971212, + "learning_rate": 8.663967611336033e-06, + "loss": 0.0005, + "step": 5320 + }, + { + "epoch": 0.5668016194331984, + "eval_loss": 0.0030833673663437366, + "eval_runtime": 35.0498, + "eval_samples_per_second": 4284.586, + "eval_steps_per_second": 66.962, + "step": 5320 + }, + { + "epoch": 0.5678670360110804, + "grad_norm": 0.005051956046372652, + "learning_rate": 8.642659279778393e-06, + "loss": 0.0006, + "step": 5330 + }, + { + "epoch": 0.5678670360110804, + "eval_loss": 0.003096715547144413, + "eval_runtime": 35.0295, + "eval_samples_per_second": 4287.074, + "eval_steps_per_second": 67.001, + "step": 5330 + }, + { + "epoch": 0.5689324525889623, + "grad_norm": 0.4084688425064087, + "learning_rate": 8.621350948220755e-06, + "loss": 0.0027, + "step": 5340 + }, + { + "epoch": 0.5689324525889623, + "eval_loss": 0.0030978054273873568, + "eval_runtime": 35.0254, + "eval_samples_per_second": 4287.573, + "eval_steps_per_second": 67.008, + "step": 5340 + }, + { + "epoch": 0.5699978691668443, + "grad_norm": 0.0028698795940726995, + "learning_rate": 8.600042616663115e-06, + "loss": 0.0002, + "step": 5350 + }, + { + "epoch": 0.5699978691668443, + "eval_loss": 0.003147188574075699, + "eval_runtime": 35.0568, + "eval_samples_per_second": 4283.732, + "eval_steps_per_second": 66.948, + "step": 5350 + }, + { + "epoch": 0.5710632857447262, + "grad_norm": 0.0017005919944494963, + "learning_rate": 8.578734285105477e-06, + "loss": 0.0005, + "step": 5360 + }, + { + "epoch": 0.5710632857447262, + "eval_loss": 0.003185020759701729, + "eval_runtime": 35.0384, + "eval_samples_per_second": 4285.988, + "eval_steps_per_second": 66.984, + "step": 5360 + }, + { + "epoch": 0.5721287023226082, + "grad_norm": 0.005401854868978262, + "learning_rate": 8.557425953547837e-06, + "loss": 0.0001, + "step": 5370 + }, + { + "epoch": 0.5721287023226082, + "eval_loss": 0.0032068644650280476, + "eval_runtime": 35.0844, + "eval_samples_per_second": 4280.363, + "eval_steps_per_second": 66.896, + "step": 5370 + }, + { + "epoch": 0.5731941189004901, + "grad_norm": 0.0929129421710968, + "learning_rate": 8.536117621990199e-06, + "loss": 0.0025, + "step": 5380 + }, + { + "epoch": 0.5731941189004901, + "eval_loss": 0.0031966594979166985, + "eval_runtime": 35.0084, + "eval_samples_per_second": 4289.653, + "eval_steps_per_second": 67.041, + "step": 5380 + }, + { + "epoch": 0.574259535478372, + "grad_norm": 0.0012115921126678586, + "learning_rate": 8.51480929043256e-06, + "loss": 0.0014, + "step": 5390 + }, + { + "epoch": 0.574259535478372, + "eval_loss": 0.0032315885182470083, + "eval_runtime": 35.0764, + "eval_samples_per_second": 4281.346, + "eval_steps_per_second": 66.911, + "step": 5390 + }, + { + "epoch": 0.575324952056254, + "grad_norm": 0.00769865233451128, + "learning_rate": 8.493500958874921e-06, + "loss": 0.0001, + "step": 5400 + }, + { + "epoch": 0.575324952056254, + "eval_loss": 0.0032829763367772102, + "eval_runtime": 35.0689, + "eval_samples_per_second": 4282.254, + "eval_steps_per_second": 66.925, + "step": 5400 + }, + { + "epoch": 0.576390368634136, + "grad_norm": 1.7847949266433716, + "learning_rate": 8.472192627317281e-06, + "loss": 0.0047, + "step": 5410 + }, + { + "epoch": 0.576390368634136, + "eval_loss": 0.0033038435503840446, + "eval_runtime": 35.0485, + "eval_samples_per_second": 4284.75, + "eval_steps_per_second": 66.964, + "step": 5410 + }, + { + "epoch": 0.5774557852120179, + "grad_norm": 0.0020609069615602493, + "learning_rate": 8.450884295759643e-06, + "loss": 0.0007, + "step": 5420 + }, + { + "epoch": 0.5774557852120179, + "eval_loss": 0.003206141060218215, + "eval_runtime": 35.0791, + "eval_samples_per_second": 4281.014, + "eval_steps_per_second": 66.906, + "step": 5420 + }, + { + "epoch": 0.5785212017898999, + "grad_norm": 0.08441135287284851, + "learning_rate": 8.429575964202003e-06, + "loss": 0.0105, + "step": 5430 + }, + { + "epoch": 0.5785212017898999, + "eval_loss": 0.0032010802533477545, + "eval_runtime": 35.0223, + "eval_samples_per_second": 4287.952, + "eval_steps_per_second": 67.014, + "step": 5430 + }, + { + "epoch": 0.5795866183677818, + "grad_norm": 0.03874306008219719, + "learning_rate": 8.408267632644365e-06, + "loss": 0.0007, + "step": 5440 + }, + { + "epoch": 0.5795866183677818, + "eval_loss": 0.003258783370256424, + "eval_runtime": 35.062, + "eval_samples_per_second": 4283.099, + "eval_steps_per_second": 66.939, + "step": 5440 + }, + { + "epoch": 0.5806520349456638, + "grad_norm": 0.07270818948745728, + "learning_rate": 8.386959301086725e-06, + "loss": 0.0157, + "step": 5450 + }, + { + "epoch": 0.5806520349456638, + "eval_loss": 0.0032654814422130585, + "eval_runtime": 35.0557, + "eval_samples_per_second": 4283.864, + "eval_steps_per_second": 66.951, + "step": 5450 + }, + { + "epoch": 0.5817174515235457, + "grad_norm": 0.006695209536701441, + "learning_rate": 8.365650969529087e-06, + "loss": 0.0081, + "step": 5460 + }, + { + "epoch": 0.5817174515235457, + "eval_loss": 0.0033340235240757465, + "eval_runtime": 35.0804, + "eval_samples_per_second": 4280.854, + "eval_steps_per_second": 66.903, + "step": 5460 + }, + { + "epoch": 0.5827828681014277, + "grad_norm": 0.02671169675886631, + "learning_rate": 8.344342637971447e-06, + "loss": 0.0002, + "step": 5470 + }, + { + "epoch": 0.5827828681014277, + "eval_loss": 0.0034461417235434055, + "eval_runtime": 35.0658, + "eval_samples_per_second": 4282.633, + "eval_steps_per_second": 66.931, + "step": 5470 + }, + { + "epoch": 0.5838482846793096, + "grad_norm": 0.012659654952585697, + "learning_rate": 8.323034306413809e-06, + "loss": 0.0002, + "step": 5480 + }, + { + "epoch": 0.5838482846793096, + "eval_loss": 0.0034948259126394987, + "eval_runtime": 35.0375, + "eval_samples_per_second": 4286.093, + "eval_steps_per_second": 66.985, + "step": 5480 + }, + { + "epoch": 0.5849137012571916, + "grad_norm": 0.005894053727388382, + "learning_rate": 8.301725974856169e-06, + "loss": 0.0055, + "step": 5490 + }, + { + "epoch": 0.5849137012571916, + "eval_loss": 0.0036419378593564034, + "eval_runtime": 35.0948, + "eval_samples_per_second": 4279.095, + "eval_steps_per_second": 66.876, + "step": 5490 + }, + { + "epoch": 0.5859791178350735, + "grad_norm": 2.166231155395508, + "learning_rate": 8.280417643298531e-06, + "loss": 0.0048, + "step": 5500 + }, + { + "epoch": 0.5859791178350735, + "eval_loss": 0.0034717011731117964, + "eval_runtime": 35.0707, + "eval_samples_per_second": 4282.038, + "eval_steps_per_second": 66.922, + "step": 5500 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 0.006031760014593601, + "learning_rate": 8.259109311740891e-06, + "loss": 0.0007, + "step": 5510 + }, + { + "epoch": 0.5870445344129555, + "eval_loss": 0.0032065894920378923, + "eval_runtime": 35.0352, + "eval_samples_per_second": 4286.372, + "eval_steps_per_second": 66.99, + "step": 5510 + }, + { + "epoch": 0.5881099509908374, + "grad_norm": 0.023564601317048073, + "learning_rate": 8.237800980183253e-06, + "loss": 0.0009, + "step": 5520 + }, + { + "epoch": 0.5881099509908374, + "eval_loss": 0.003217566292732954, + "eval_runtime": 35.048, + "eval_samples_per_second": 4284.815, + "eval_steps_per_second": 66.965, + "step": 5520 + }, + { + "epoch": 0.5891753675687194, + "grad_norm": 0.04442958906292915, + "learning_rate": 8.216492648625613e-06, + "loss": 0.0004, + "step": 5530 + }, + { + "epoch": 0.5891753675687194, + "eval_loss": 0.0032989357132464647, + "eval_runtime": 35.0666, + "eval_samples_per_second": 4282.534, + "eval_steps_per_second": 66.93, + "step": 5530 + }, + { + "epoch": 0.5902407841466013, + "grad_norm": 0.004105782601982355, + "learning_rate": 8.195184317067975e-06, + "loss": 0.0068, + "step": 5540 + }, + { + "epoch": 0.5902407841466013, + "eval_loss": 0.0033785353880375624, + "eval_runtime": 35.067, + "eval_samples_per_second": 4282.491, + "eval_steps_per_second": 66.929, + "step": 5540 + }, + { + "epoch": 0.5913062007244833, + "grad_norm": 1.0616731643676758, + "learning_rate": 8.173875985510335e-06, + "loss": 0.0134, + "step": 5550 + }, + { + "epoch": 0.5913062007244833, + "eval_loss": 0.0031998585909605026, + "eval_runtime": 35.01, + "eval_samples_per_second": 4289.463, + "eval_steps_per_second": 67.038, + "step": 5550 + }, + { + "epoch": 0.5923716173023652, + "grad_norm": 0.0035948033910244703, + "learning_rate": 8.152567653952697e-06, + "loss": 0.0048, + "step": 5560 + }, + { + "epoch": 0.5923716173023652, + "eval_loss": 0.0032753869891166687, + "eval_runtime": 35.0539, + "eval_samples_per_second": 4284.083, + "eval_steps_per_second": 66.954, + "step": 5560 + }, + { + "epoch": 0.5934370338802472, + "grad_norm": 0.00209414167329669, + "learning_rate": 8.131259322395057e-06, + "loss": 0.0006, + "step": 5570 + }, + { + "epoch": 0.5934370338802472, + "eval_loss": 0.0032072330359369516, + "eval_runtime": 35.0701, + "eval_samples_per_second": 4282.11, + "eval_steps_per_second": 66.923, + "step": 5570 + }, + { + "epoch": 0.5945024504581291, + "grad_norm": 0.04995543509721756, + "learning_rate": 8.109950990837419e-06, + "loss": 0.0029, + "step": 5580 + }, + { + "epoch": 0.5945024504581291, + "eval_loss": 0.003245977219194174, + "eval_runtime": 35.0719, + "eval_samples_per_second": 4281.884, + "eval_steps_per_second": 66.92, + "step": 5580 + }, + { + "epoch": 0.5955678670360111, + "grad_norm": 0.001491761882789433, + "learning_rate": 8.088642659279779e-06, + "loss": 0.0054, + "step": 5590 + }, + { + "epoch": 0.5955678670360111, + "eval_loss": 0.0031598976347595453, + "eval_runtime": 35.0642, + "eval_samples_per_second": 4282.827, + "eval_steps_per_second": 66.934, + "step": 5590 + }, + { + "epoch": 0.596633283613893, + "grad_norm": 0.003034034511074424, + "learning_rate": 8.06733432772214e-06, + "loss": 0.0027, + "step": 5600 + }, + { + "epoch": 0.596633283613893, + "eval_loss": 0.0031364411115646362, + "eval_runtime": 35.0383, + "eval_samples_per_second": 4285.992, + "eval_steps_per_second": 66.984, + "step": 5600 + }, + { + "epoch": 0.597698700191775, + "grad_norm": 0.004649047274142504, + "learning_rate": 8.046025996164501e-06, + "loss": 0.0003, + "step": 5610 + }, + { + "epoch": 0.597698700191775, + "eval_loss": 0.0031330641359090805, + "eval_runtime": 35.0411, + "eval_samples_per_second": 4285.656, + "eval_steps_per_second": 66.979, + "step": 5610 + }, + { + "epoch": 0.5987641167696569, + "grad_norm": 0.06239793077111244, + "learning_rate": 8.024717664606861e-06, + "loss": 0.0098, + "step": 5620 + }, + { + "epoch": 0.5987641167696569, + "eval_loss": 0.003176827682182193, + "eval_runtime": 35.0911, + "eval_samples_per_second": 4279.541, + "eval_steps_per_second": 66.883, + "step": 5620 + }, + { + "epoch": 0.5998295333475389, + "grad_norm": 5.838839530944824, + "learning_rate": 8.003409333049223e-06, + "loss": 0.0072, + "step": 5630 + }, + { + "epoch": 0.5998295333475389, + "eval_loss": 0.003169504227116704, + "eval_runtime": 35.0513, + "eval_samples_per_second": 4284.4, + "eval_steps_per_second": 66.959, + "step": 5630 + }, + { + "epoch": 0.6008949499254208, + "grad_norm": 0.13171178102493286, + "learning_rate": 7.982101001491583e-06, + "loss": 0.0007, + "step": 5640 + }, + { + "epoch": 0.6008949499254208, + "eval_loss": 0.003053726628422737, + "eval_runtime": 35.0276, + "eval_samples_per_second": 4287.3, + "eval_steps_per_second": 67.004, + "step": 5640 + }, + { + "epoch": 0.6019603665033028, + "grad_norm": 0.0015700625954195857, + "learning_rate": 7.960792669933945e-06, + "loss": 0.0065, + "step": 5650 + }, + { + "epoch": 0.6019603665033028, + "eval_loss": 0.0035806247033178806, + "eval_runtime": 35.0609, + "eval_samples_per_second": 4283.23, + "eval_steps_per_second": 66.941, + "step": 5650 + }, + { + "epoch": 0.6030257830811847, + "grad_norm": 0.005014845635741949, + "learning_rate": 7.939484338376305e-06, + "loss": 0.0002, + "step": 5660 + }, + { + "epoch": 0.6030257830811847, + "eval_loss": 0.004452229011803865, + "eval_runtime": 35.0085, + "eval_samples_per_second": 4289.644, + "eval_steps_per_second": 67.041, + "step": 5660 + }, + { + "epoch": 0.6040911996590667, + "grad_norm": 0.05154247581958771, + "learning_rate": 7.918176006818667e-06, + "loss": 0.0005, + "step": 5670 + }, + { + "epoch": 0.6040911996590667, + "eval_loss": 0.004699897486716509, + "eval_runtime": 35.0347, + "eval_samples_per_second": 4286.437, + "eval_steps_per_second": 66.991, + "step": 5670 + }, + { + "epoch": 0.6051566162369486, + "grad_norm": 0.0041319397278130054, + "learning_rate": 7.896867675261027e-06, + "loss": 0.0028, + "step": 5680 + }, + { + "epoch": 0.6051566162369486, + "eval_loss": 0.0034233941696584225, + "eval_runtime": 35.0512, + "eval_samples_per_second": 4284.419, + "eval_steps_per_second": 66.959, + "step": 5680 + }, + { + "epoch": 0.6062220328148306, + "grad_norm": 0.0012949644587934017, + "learning_rate": 7.875559343703389e-06, + "loss": 0.0001, + "step": 5690 + }, + { + "epoch": 0.6062220328148306, + "eval_loss": 0.0033080654684454203, + "eval_runtime": 35.0534, + "eval_samples_per_second": 4284.15, + "eval_steps_per_second": 66.955, + "step": 5690 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 0.0013649500906467438, + "learning_rate": 7.854251012145749e-06, + "loss": 0.0069, + "step": 5700 + }, + { + "epoch": 0.6072874493927125, + "eval_loss": 0.00345489289611578, + "eval_runtime": 35.0169, + "eval_samples_per_second": 4288.615, + "eval_steps_per_second": 67.025, + "step": 5700 + }, + { + "epoch": 0.6083528659705945, + "grad_norm": 0.29954442381858826, + "learning_rate": 7.832942680588111e-06, + "loss": 0.0013, + "step": 5710 + }, + { + "epoch": 0.6083528659705945, + "eval_loss": 0.003461030311882496, + "eval_runtime": 35.0232, + "eval_samples_per_second": 4287.846, + "eval_steps_per_second": 67.013, + "step": 5710 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 0.001273061498068273, + "learning_rate": 7.811634349030471e-06, + "loss": 0.0007, + "step": 5720 + }, + { + "epoch": 0.6094182825484764, + "eval_loss": 0.0033160303719341755, + "eval_runtime": 34.994, + "eval_samples_per_second": 4291.424, + "eval_steps_per_second": 67.069, + "step": 5720 + }, + { + "epoch": 0.6104836991263584, + "grad_norm": 0.0023903066758066416, + "learning_rate": 7.790326017472833e-06, + "loss": 0.0007, + "step": 5730 + }, + { + "epoch": 0.6104836991263584, + "eval_loss": 0.0033325697295367718, + "eval_runtime": 35.0304, + "eval_samples_per_second": 4286.968, + "eval_steps_per_second": 66.999, + "step": 5730 + }, + { + "epoch": 0.6115491157042403, + "grad_norm": 0.0014730616239830852, + "learning_rate": 7.769017685915193e-06, + "loss": 0.0084, + "step": 5740 + }, + { + "epoch": 0.6115491157042403, + "eval_loss": 0.00339673925191164, + "eval_runtime": 35.0363, + "eval_samples_per_second": 4286.245, + "eval_steps_per_second": 66.988, + "step": 5740 + }, + { + "epoch": 0.6126145322821223, + "grad_norm": 0.19337864220142365, + "learning_rate": 7.747709354357555e-06, + "loss": 0.001, + "step": 5750 + }, + { + "epoch": 0.6126145322821223, + "eval_loss": 0.003394161816686392, + "eval_runtime": 35.0308, + "eval_samples_per_second": 4286.914, + "eval_steps_per_second": 66.998, + "step": 5750 + }, + { + "epoch": 0.6136799488600042, + "grad_norm": 0.001471309456974268, + "learning_rate": 7.726401022799915e-06, + "loss": 0.0036, + "step": 5760 + }, + { + "epoch": 0.6136799488600042, + "eval_loss": 0.003426865441724658, + "eval_runtime": 35.0019, + "eval_samples_per_second": 4290.458, + "eval_steps_per_second": 67.054, + "step": 5760 + }, + { + "epoch": 0.6147453654378862, + "grad_norm": 0.0012775680515915155, + "learning_rate": 7.705092691242277e-06, + "loss": 0.0003, + "step": 5770 + }, + { + "epoch": 0.6147453654378862, + "eval_loss": 0.003422880545258522, + "eval_runtime": 35.0286, + "eval_samples_per_second": 4287.183, + "eval_steps_per_second": 67.002, + "step": 5770 + }, + { + "epoch": 0.6158107820157681, + "grad_norm": 0.0013159505324438214, + "learning_rate": 7.683784359684637e-06, + "loss": 0.0084, + "step": 5780 + }, + { + "epoch": 0.6158107820157681, + "eval_loss": 0.0034935129806399345, + "eval_runtime": 35.0203, + "eval_samples_per_second": 4288.193, + "eval_steps_per_second": 67.018, + "step": 5780 + }, + { + "epoch": 0.6168761985936502, + "grad_norm": 0.0015752206090837717, + "learning_rate": 7.662476028126999e-06, + "loss": 0.0003, + "step": 5790 + }, + { + "epoch": 0.6168761985936502, + "eval_loss": 0.003633267944678664, + "eval_runtime": 35.0278, + "eval_samples_per_second": 4287.283, + "eval_steps_per_second": 67.004, + "step": 5790 + }, + { + "epoch": 0.6179416151715321, + "grad_norm": 0.010617982596158981, + "learning_rate": 7.641167696569359e-06, + "loss": 0.005, + "step": 5800 + }, + { + "epoch": 0.6179416151715321, + "eval_loss": 0.003393057268112898, + "eval_runtime": 35.0376, + "eval_samples_per_second": 4286.076, + "eval_steps_per_second": 66.985, + "step": 5800 + }, + { + "epoch": 0.6190070317494141, + "grad_norm": 2.5578744411468506, + "learning_rate": 7.61985936501172e-06, + "loss": 0.0063, + "step": 5810 + }, + { + "epoch": 0.6190070317494141, + "eval_loss": 0.003425801871344447, + "eval_runtime": 35.0481, + "eval_samples_per_second": 4284.799, + "eval_steps_per_second": 66.965, + "step": 5810 + }, + { + "epoch": 0.620072448327296, + "grad_norm": 0.010255936533212662, + "learning_rate": 7.598551033454081e-06, + "loss": 0.0001, + "step": 5820 + }, + { + "epoch": 0.620072448327296, + "eval_loss": 0.003427485004067421, + "eval_runtime": 35.0171, + "eval_samples_per_second": 4288.594, + "eval_steps_per_second": 67.024, + "step": 5820 + }, + { + "epoch": 0.621137864905178, + "grad_norm": 0.1109393909573555, + "learning_rate": 7.577242701896442e-06, + "loss": 0.0003, + "step": 5830 + }, + { + "epoch": 0.621137864905178, + "eval_loss": 0.0034398355055600405, + "eval_runtime": 35.0582, + "eval_samples_per_second": 4283.569, + "eval_steps_per_second": 66.946, + "step": 5830 + }, + { + "epoch": 0.6222032814830599, + "grad_norm": 0.12083720415830612, + "learning_rate": 7.555934370338803e-06, + "loss": 0.0006, + "step": 5840 + }, + { + "epoch": 0.6222032814830599, + "eval_loss": 0.003469038987532258, + "eval_runtime": 35.0472, + "eval_samples_per_second": 4284.904, + "eval_steps_per_second": 66.967, + "step": 5840 + }, + { + "epoch": 0.6232686980609419, + "grad_norm": 0.0032793928403407335, + "learning_rate": 7.534626038781164e-06, + "loss": 0.0025, + "step": 5850 + }, + { + "epoch": 0.6232686980609419, + "eval_loss": 0.0036529472563415766, + "eval_runtime": 35.0157, + "eval_samples_per_second": 4288.764, + "eval_steps_per_second": 67.027, + "step": 5850 + }, + { + "epoch": 0.6243341146388238, + "grad_norm": 0.02544957958161831, + "learning_rate": 7.513317707223525e-06, + "loss": 0.0011, + "step": 5860 + }, + { + "epoch": 0.6243341146388238, + "eval_loss": 0.0036848068702965975, + "eval_runtime": 35.0117, + "eval_samples_per_second": 4289.249, + "eval_steps_per_second": 67.035, + "step": 5860 + }, + { + "epoch": 0.6253995312167058, + "grad_norm": 0.005889591760933399, + "learning_rate": 7.492009375665886e-06, + "loss": 0.0052, + "step": 5870 + }, + { + "epoch": 0.6253995312167058, + "eval_loss": 0.003564575221389532, + "eval_runtime": 35.044, + "eval_samples_per_second": 4285.3, + "eval_steps_per_second": 66.973, + "step": 5870 + }, + { + "epoch": 0.6264649477945877, + "grad_norm": 0.5814864635467529, + "learning_rate": 7.470701044108247e-06, + "loss": 0.0052, + "step": 5880 + }, + { + "epoch": 0.6264649477945877, + "eval_loss": 0.0036007657181471586, + "eval_runtime": 35.0287, + "eval_samples_per_second": 4287.168, + "eval_steps_per_second": 67.002, + "step": 5880 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 0.009390910156071186, + "learning_rate": 7.449392712550608e-06, + "loss": 0.0018, + "step": 5890 + }, + { + "epoch": 0.6275303643724697, + "eval_loss": 0.0035891227889806032, + "eval_runtime": 35.0066, + "eval_samples_per_second": 4289.876, + "eval_steps_per_second": 67.044, + "step": 5890 + }, + { + "epoch": 0.6285957809503516, + "grad_norm": 0.020240269601345062, + "learning_rate": 7.428084380992969e-06, + "loss": 0.0046, + "step": 5900 + }, + { + "epoch": 0.6285957809503516, + "eval_loss": 0.0035373272839933634, + "eval_runtime": 35.0097, + "eval_samples_per_second": 4289.498, + "eval_steps_per_second": 67.039, + "step": 5900 + }, + { + "epoch": 0.6296611975282336, + "grad_norm": 0.10366514325141907, + "learning_rate": 7.40677604943533e-06, + "loss": 0.0004, + "step": 5910 + }, + { + "epoch": 0.6296611975282336, + "eval_loss": 0.003493980038911104, + "eval_runtime": 35.0399, + "eval_samples_per_second": 4285.803, + "eval_steps_per_second": 66.981, + "step": 5910 + }, + { + "epoch": 0.6307266141061155, + "grad_norm": 0.03924533352255821, + "learning_rate": 7.385467717877691e-06, + "loss": 0.0001, + "step": 5920 + }, + { + "epoch": 0.6307266141061155, + "eval_loss": 0.0034797810949385166, + "eval_runtime": 35.024, + "eval_samples_per_second": 4287.747, + "eval_steps_per_second": 67.011, + "step": 5920 + }, + { + "epoch": 0.6317920306839975, + "grad_norm": 0.011868029832839966, + "learning_rate": 7.364159386320052e-06, + "loss": 0.0109, + "step": 5930 + }, + { + "epoch": 0.6317920306839975, + "eval_loss": 0.0033822518307715654, + "eval_runtime": 35.0226, + "eval_samples_per_second": 4287.919, + "eval_steps_per_second": 67.014, + "step": 5930 + }, + { + "epoch": 0.6328574472618794, + "grad_norm": 1.3012027740478516, + "learning_rate": 7.342851054762413e-06, + "loss": 0.0049, + "step": 5940 + }, + { + "epoch": 0.6328574472618794, + "eval_loss": 0.0033151600509881973, + "eval_runtime": 35.0547, + "eval_samples_per_second": 4283.989, + "eval_steps_per_second": 66.952, + "step": 5940 + }, + { + "epoch": 0.6339228638397614, + "grad_norm": 0.0013931491412222385, + "learning_rate": 7.321542723204774e-06, + "loss": 0.0052, + "step": 5950 + }, + { + "epoch": 0.6339228638397614, + "eval_loss": 0.00324883870780468, + "eval_runtime": 35.0289, + "eval_samples_per_second": 4287.151, + "eval_steps_per_second": 67.002, + "step": 5950 + }, + { + "epoch": 0.6349882804176433, + "grad_norm": 0.05665739253163338, + "learning_rate": 7.300234391647134e-06, + "loss": 0.0003, + "step": 5960 + }, + { + "epoch": 0.6349882804176433, + "eval_loss": 0.003316541202366352, + "eval_runtime": 35.0598, + "eval_samples_per_second": 4283.368, + "eval_steps_per_second": 66.943, + "step": 5960 + }, + { + "epoch": 0.6360536969955253, + "grad_norm": 0.014257961884140968, + "learning_rate": 7.278926060089495e-06, + "loss": 0.0061, + "step": 5970 + }, + { + "epoch": 0.6360536969955253, + "eval_loss": 0.003145574824884534, + "eval_runtime": 35.0387, + "eval_samples_per_second": 4285.951, + "eval_steps_per_second": 66.983, + "step": 5970 + }, + { + "epoch": 0.6371191135734072, + "grad_norm": 0.019166210666298866, + "learning_rate": 7.257617728531856e-06, + "loss": 0.0049, + "step": 5980 + }, + { + "epoch": 0.6371191135734072, + "eval_loss": 0.003021866548806429, + "eval_runtime": 35.044, + "eval_samples_per_second": 4285.301, + "eval_steps_per_second": 66.973, + "step": 5980 + }, + { + "epoch": 0.6381845301512892, + "grad_norm": 0.0279945507645607, + "learning_rate": 7.236309396974217e-06, + "loss": 0.0067, + "step": 5990 + }, + { + "epoch": 0.6381845301512892, + "eval_loss": 0.002979971468448639, + "eval_runtime": 34.9997, + "eval_samples_per_second": 4290.723, + "eval_steps_per_second": 67.058, + "step": 5990 + }, + { + "epoch": 0.6392499467291711, + "grad_norm": 0.005042492412030697, + "learning_rate": 7.215001065416578e-06, + "loss": 0.0007, + "step": 6000 + }, + { + "epoch": 0.6392499467291711, + "eval_loss": 0.0029915031045675278, + "eval_runtime": 35.0149, + "eval_samples_per_second": 4288.857, + "eval_steps_per_second": 67.029, + "step": 6000 + }, + { + "epoch": 0.6403153633070531, + "grad_norm": 0.0013033768627792597, + "learning_rate": 7.193692733858939e-06, + "loss": 0.0006, + "step": 6010 + }, + { + "epoch": 0.6403153633070531, + "eval_loss": 0.002996724331751466, + "eval_runtime": 35.0561, + "eval_samples_per_second": 4283.823, + "eval_steps_per_second": 66.95, + "step": 6010 + }, + { + "epoch": 0.641380779884935, + "grad_norm": 0.0022245654836297035, + "learning_rate": 7.1723844023013e-06, + "loss": 0.0005, + "step": 6020 + }, + { + "epoch": 0.641380779884935, + "eval_loss": 0.002998237032443285, + "eval_runtime": 35.0276, + "eval_samples_per_second": 4287.302, + "eval_steps_per_second": 67.004, + "step": 6020 + }, + { + "epoch": 0.642446196462817, + "grad_norm": 4.448103427886963, + "learning_rate": 7.151076070743661e-06, + "loss": 0.0092, + "step": 6030 + }, + { + "epoch": 0.642446196462817, + "eval_loss": 0.0030165978241711855, + "eval_runtime": 35.0225, + "eval_samples_per_second": 4287.935, + "eval_steps_per_second": 67.014, + "step": 6030 + }, + { + "epoch": 0.6435116130406989, + "grad_norm": 0.0021644230000674725, + "learning_rate": 7.129767739186022e-06, + "loss": 0.0017, + "step": 6040 + }, + { + "epoch": 0.6435116130406989, + "eval_loss": 0.0030744208488613367, + "eval_runtime": 35.0525, + "eval_samples_per_second": 4284.256, + "eval_steps_per_second": 66.957, + "step": 6040 + }, + { + "epoch": 0.6445770296185809, + "grad_norm": 0.0013590834569185972, + "learning_rate": 7.108459407628383e-06, + "loss": 0.0061, + "step": 6050 + }, + { + "epoch": 0.6445770296185809, + "eval_loss": 0.003088417463004589, + "eval_runtime": 35.0506, + "eval_samples_per_second": 4284.487, + "eval_steps_per_second": 66.96, + "step": 6050 + }, + { + "epoch": 0.6456424461964628, + "grad_norm": 0.15340279042720795, + "learning_rate": 7.087151076070744e-06, + "loss": 0.0017, + "step": 6060 + }, + { + "epoch": 0.6456424461964628, + "eval_loss": 0.0031277111265808344, + "eval_runtime": 35.0518, + "eval_samples_per_second": 4284.343, + "eval_steps_per_second": 66.958, + "step": 6060 + }, + { + "epoch": 0.6467078627743448, + "grad_norm": 0.03221344202756882, + "learning_rate": 7.065842744513105e-06, + "loss": 0.0027, + "step": 6070 + }, + { + "epoch": 0.6467078627743448, + "eval_loss": 0.0032699485309422016, + "eval_runtime": 35.0636, + "eval_samples_per_second": 4282.901, + "eval_steps_per_second": 66.935, + "step": 6070 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 0.0018749163718894124, + "learning_rate": 7.044534412955466e-06, + "loss": 0.0008, + "step": 6080 + }, + { + "epoch": 0.6477732793522267, + "eval_loss": 0.00332645233720541, + "eval_runtime": 35.0515, + "eval_samples_per_second": 4284.379, + "eval_steps_per_second": 66.959, + "step": 6080 + }, + { + "epoch": 0.6488386959301087, + "grad_norm": 0.0626567080616951, + "learning_rate": 7.023226081397827e-06, + "loss": 0.0001, + "step": 6090 + }, + { + "epoch": 0.6488386959301087, + "eval_loss": 0.003338114358484745, + "eval_runtime": 35.0324, + "eval_samples_per_second": 4286.713, + "eval_steps_per_second": 66.995, + "step": 6090 + }, + { + "epoch": 0.6499041125079906, + "grad_norm": 0.0010921815410256386, + "learning_rate": 7.001917749840188e-06, + "loss": 0.0007, + "step": 6100 + }, + { + "epoch": 0.6499041125079906, + "eval_loss": 0.0033058812841773033, + "eval_runtime": 35.0347, + "eval_samples_per_second": 4286.435, + "eval_steps_per_second": 66.991, + "step": 6100 + }, + { + "epoch": 0.6509695290858726, + "grad_norm": 0.0011606470216065645, + "learning_rate": 6.980609418282549e-06, + "loss": 0.0006, + "step": 6110 + }, + { + "epoch": 0.6509695290858726, + "eval_loss": 0.0032852557487785816, + "eval_runtime": 35.1398, + "eval_samples_per_second": 4273.614, + "eval_steps_per_second": 66.79, + "step": 6110 + }, + { + "epoch": 0.6520349456637545, + "grad_norm": 0.0011158619308844209, + "learning_rate": 6.95930108672491e-06, + "loss": 0.0038, + "step": 6120 + }, + { + "epoch": 0.6520349456637545, + "eval_loss": 0.0032803104259073734, + "eval_runtime": 35.166, + "eval_samples_per_second": 4270.431, + "eval_steps_per_second": 66.741, + "step": 6120 + }, + { + "epoch": 0.6531003622416365, + "grad_norm": 0.3906470537185669, + "learning_rate": 6.937992755167271e-06, + "loss": 0.0012, + "step": 6130 + }, + { + "epoch": 0.6531003622416365, + "eval_loss": 0.003150229575112462, + "eval_runtime": 35.0766, + "eval_samples_per_second": 4281.314, + "eval_steps_per_second": 66.911, + "step": 6130 + }, + { + "epoch": 0.6541657788195184, + "grad_norm": 0.022889362648129463, + "learning_rate": 6.916684423609632e-06, + "loss": 0.001, + "step": 6140 + }, + { + "epoch": 0.6541657788195184, + "eval_loss": 0.0030826658476144075, + "eval_runtime": 35.0315, + "eval_samples_per_second": 4286.834, + "eval_steps_per_second": 66.997, + "step": 6140 + }, + { + "epoch": 0.6552311953974004, + "grad_norm": 0.0011571752838790417, + "learning_rate": 6.895376092051993e-06, + "loss": 0.0002, + "step": 6150 + }, + { + "epoch": 0.6552311953974004, + "eval_loss": 0.003100884146988392, + "eval_runtime": 35.065, + "eval_samples_per_second": 4282.728, + "eval_steps_per_second": 66.933, + "step": 6150 + }, + { + "epoch": 0.6562966119752823, + "grad_norm": 0.0019666426815092564, + "learning_rate": 6.874067760494354e-06, + "loss": 0.0002, + "step": 6160 + }, + { + "epoch": 0.6562966119752823, + "eval_loss": 0.0031208472792059183, + "eval_runtime": 35.064, + "eval_samples_per_second": 4282.858, + "eval_steps_per_second": 66.935, + "step": 6160 + }, + { + "epoch": 0.6573620285531643, + "grad_norm": 0.0021635943558067083, + "learning_rate": 6.852759428936715e-06, + "loss": 0.0143, + "step": 6170 + }, + { + "epoch": 0.6573620285531643, + "eval_loss": 0.0030296596232801676, + "eval_runtime": 35.056, + "eval_samples_per_second": 4283.836, + "eval_steps_per_second": 66.95, + "step": 6170 + }, + { + "epoch": 0.6584274451310462, + "grad_norm": 0.001522368867881596, + "learning_rate": 6.831451097379076e-06, + "loss": 0.0004, + "step": 6180 + }, + { + "epoch": 0.6584274451310462, + "eval_loss": 0.003006124868988991, + "eval_runtime": 35.0506, + "eval_samples_per_second": 4284.497, + "eval_steps_per_second": 66.96, + "step": 6180 + }, + { + "epoch": 0.6594928617089282, + "grad_norm": 0.001939168432727456, + "learning_rate": 6.810142765821437e-06, + "loss": 0.0007, + "step": 6190 + }, + { + "epoch": 0.6594928617089282, + "eval_loss": 0.003031767439097166, + "eval_runtime": 35.07, + "eval_samples_per_second": 4282.121, + "eval_steps_per_second": 66.923, + "step": 6190 + }, + { + "epoch": 0.6605582782868101, + "grad_norm": 0.0015014013042673469, + "learning_rate": 6.788834434263798e-06, + "loss": 0.0003, + "step": 6200 + }, + { + "epoch": 0.6605582782868101, + "eval_loss": 0.003057195106521249, + "eval_runtime": 35.0456, + "eval_samples_per_second": 4285.1, + "eval_steps_per_second": 66.97, + "step": 6200 + }, + { + "epoch": 0.6616236948646921, + "grad_norm": 0.01135373953729868, + "learning_rate": 6.767526102706159e-06, + "loss": 0.0022, + "step": 6210 + }, + { + "epoch": 0.6616236948646921, + "eval_loss": 0.0030318093486130238, + "eval_runtime": 35.0502, + "eval_samples_per_second": 4284.546, + "eval_steps_per_second": 66.961, + "step": 6210 + }, + { + "epoch": 0.662689111442574, + "grad_norm": 0.002891425509005785, + "learning_rate": 6.74621777114852e-06, + "loss": 0.0076, + "step": 6220 + }, + { + "epoch": 0.662689111442574, + "eval_loss": 0.0030160024762153625, + "eval_runtime": 35.0854, + "eval_samples_per_second": 4280.246, + "eval_steps_per_second": 66.894, + "step": 6220 + }, + { + "epoch": 0.663754528020456, + "grad_norm": 0.004777186084538698, + "learning_rate": 6.724909439590881e-06, + "loss": 0.0025, + "step": 6230 + }, + { + "epoch": 0.663754528020456, + "eval_loss": 0.0030639716424047947, + "eval_runtime": 35.0034, + "eval_samples_per_second": 4290.265, + "eval_steps_per_second": 67.051, + "step": 6230 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 0.19623669981956482, + "learning_rate": 6.703601108033242e-06, + "loss": 0.0231, + "step": 6240 + }, + { + "epoch": 0.6648199445983379, + "eval_loss": 0.003115487052127719, + "eval_runtime": 35.0302, + "eval_samples_per_second": 4286.99, + "eval_steps_per_second": 66.999, + "step": 6240 + }, + { + "epoch": 0.6658853611762199, + "grad_norm": 0.0012964721536263824, + "learning_rate": 6.682292776475603e-06, + "loss": 0.0032, + "step": 6250 + }, + { + "epoch": 0.6658853611762199, + "eval_loss": 0.003056860063225031, + "eval_runtime": 35.0191, + "eval_samples_per_second": 4288.342, + "eval_steps_per_second": 67.021, + "step": 6250 + }, + { + "epoch": 0.6669507777541018, + "grad_norm": 0.001261876430362463, + "learning_rate": 6.660984444917964e-06, + "loss": 0.0061, + "step": 6260 + }, + { + "epoch": 0.6669507777541018, + "eval_loss": 0.0029731402173638344, + "eval_runtime": 35.0181, + "eval_samples_per_second": 4288.465, + "eval_steps_per_second": 67.022, + "step": 6260 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 0.05022572726011276, + "learning_rate": 6.639676113360325e-06, + "loss": 0.0005, + "step": 6270 + }, + { + "epoch": 0.6680161943319838, + "eval_loss": 0.0029431069269776344, + "eval_runtime": 35.033, + "eval_samples_per_second": 4286.645, + "eval_steps_per_second": 66.994, + "step": 6270 + }, + { + "epoch": 0.6690816109098657, + "grad_norm": 0.0013886064989492297, + "learning_rate": 6.618367781802686e-06, + "loss": 0.0012, + "step": 6280 + }, + { + "epoch": 0.6690816109098657, + "eval_loss": 0.0029925217386335135, + "eval_runtime": 35.0227, + "eval_samples_per_second": 4287.908, + "eval_steps_per_second": 67.014, + "step": 6280 + }, + { + "epoch": 0.6701470274877477, + "grad_norm": 0.0013931123539805412, + "learning_rate": 6.597059450245047e-06, + "loss": 0.0012, + "step": 6290 + }, + { + "epoch": 0.6701470274877477, + "eval_loss": 0.003136566374450922, + "eval_runtime": 35.002, + "eval_samples_per_second": 4290.435, + "eval_steps_per_second": 67.053, + "step": 6290 + }, + { + "epoch": 0.6712124440656296, + "grad_norm": 0.13292770087718964, + "learning_rate": 6.575751118687407e-06, + "loss": 0.0034, + "step": 6300 + }, + { + "epoch": 0.6712124440656296, + "eval_loss": 0.0031445687636733055, + "eval_runtime": 35.0137, + "eval_samples_per_second": 4289.007, + "eval_steps_per_second": 67.031, + "step": 6300 + }, + { + "epoch": 0.6722778606435116, + "grad_norm": 0.24445843696594238, + "learning_rate": 6.554442787129768e-06, + "loss": 0.0052, + "step": 6310 + }, + { + "epoch": 0.6722778606435116, + "eval_loss": 0.003103400580585003, + "eval_runtime": 35.0057, + "eval_samples_per_second": 4289.981, + "eval_steps_per_second": 67.046, + "step": 6310 + }, + { + "epoch": 0.6733432772213935, + "grad_norm": 0.0012035582913085818, + "learning_rate": 6.533134455572129e-06, + "loss": 0.0006, + "step": 6320 + }, + { + "epoch": 0.6733432772213935, + "eval_loss": 0.003125393996015191, + "eval_runtime": 35.0617, + "eval_samples_per_second": 4283.134, + "eval_steps_per_second": 66.939, + "step": 6320 + }, + { + "epoch": 0.6744086937992755, + "grad_norm": 0.0018411766504868865, + "learning_rate": 6.51182612401449e-06, + "loss": 0.0003, + "step": 6330 + }, + { + "epoch": 0.6744086937992755, + "eval_loss": 0.0031454197596758604, + "eval_runtime": 35.0566, + "eval_samples_per_second": 4283.761, + "eval_steps_per_second": 66.949, + "step": 6330 + }, + { + "epoch": 0.6754741103771574, + "grad_norm": 0.0026676368433982134, + "learning_rate": 6.490517792456851e-06, + "loss": 0.0005, + "step": 6340 + }, + { + "epoch": 0.6754741103771574, + "eval_loss": 0.0031759522389620543, + "eval_runtime": 35.0199, + "eval_samples_per_second": 4288.251, + "eval_steps_per_second": 67.019, + "step": 6340 + }, + { + "epoch": 0.6765395269550394, + "grad_norm": 0.11416032165288925, + "learning_rate": 6.469209460899212e-06, + "loss": 0.0005, + "step": 6350 + }, + { + "epoch": 0.6765395269550394, + "eval_loss": 0.0031935395672917366, + "eval_runtime": 35.0086, + "eval_samples_per_second": 4289.63, + "eval_steps_per_second": 67.041, + "step": 6350 + }, + { + "epoch": 0.6776049435329213, + "grad_norm": 0.08742302656173706, + "learning_rate": 6.447901129341573e-06, + "loss": 0.0056, + "step": 6360 + }, + { + "epoch": 0.6776049435329213, + "eval_loss": 0.0031826442573219538, + "eval_runtime": 35.021, + "eval_samples_per_second": 4288.113, + "eval_steps_per_second": 67.017, + "step": 6360 + }, + { + "epoch": 0.6786703601108033, + "grad_norm": 0.04388425499200821, + "learning_rate": 6.426592797783934e-06, + "loss": 0.0002, + "step": 6370 + }, + { + "epoch": 0.6786703601108033, + "eval_loss": 0.003147592768073082, + "eval_runtime": 34.9947, + "eval_samples_per_second": 4291.33, + "eval_steps_per_second": 67.067, + "step": 6370 + }, + { + "epoch": 0.6797357766886852, + "grad_norm": 0.37059757113456726, + "learning_rate": 6.405284466226295e-06, + "loss": 0.001, + "step": 6380 + }, + { + "epoch": 0.6797357766886852, + "eval_loss": 0.003108437405899167, + "eval_runtime": 35.0218, + "eval_samples_per_second": 4288.015, + "eval_steps_per_second": 67.015, + "step": 6380 + }, + { + "epoch": 0.6808011932665672, + "grad_norm": 0.03148869425058365, + "learning_rate": 6.383976134668656e-06, + "loss": 0.0043, + "step": 6390 + }, + { + "epoch": 0.6808011932665672, + "eval_loss": 0.0030745782423764467, + "eval_runtime": 35.0146, + "eval_samples_per_second": 4288.898, + "eval_steps_per_second": 67.029, + "step": 6390 + }, + { + "epoch": 0.6818666098444491, + "grad_norm": 0.0011743833310902119, + "learning_rate": 6.362667803111017e-06, + "loss": 0.0003, + "step": 6400 + }, + { + "epoch": 0.6818666098444491, + "eval_loss": 0.0030515496619045734, + "eval_runtime": 34.9735, + "eval_samples_per_second": 4293.942, + "eval_steps_per_second": 67.108, + "step": 6400 + }, + { + "epoch": 0.6829320264223311, + "grad_norm": 5.519503116607666, + "learning_rate": 6.341359471553378e-06, + "loss": 0.0112, + "step": 6410 + }, + { + "epoch": 0.6829320264223311, + "eval_loss": 0.0030856935773044825, + "eval_runtime": 35.0092, + "eval_samples_per_second": 4289.562, + "eval_steps_per_second": 67.04, + "step": 6410 + }, + { + "epoch": 0.683997443000213, + "grad_norm": 0.0011778927873820066, + "learning_rate": 6.320051139995739e-06, + "loss": 0.0028, + "step": 6420 + }, + { + "epoch": 0.683997443000213, + "eval_loss": 0.0030590456444770098, + "eval_runtime": 35.0267, + "eval_samples_per_second": 4287.421, + "eval_steps_per_second": 67.006, + "step": 6420 + }, + { + "epoch": 0.685062859578095, + "grad_norm": 0.19133904576301575, + "learning_rate": 6.2987428084381e-06, + "loss": 0.0007, + "step": 6430 + }, + { + "epoch": 0.685062859578095, + "eval_loss": 0.0030688135884702206, + "eval_runtime": 35.0199, + "eval_samples_per_second": 4288.245, + "eval_steps_per_second": 67.019, + "step": 6430 + }, + { + "epoch": 0.6861282761559769, + "grad_norm": 4.050024509429932, + "learning_rate": 6.277434476880461e-06, + "loss": 0.013, + "step": 6440 + }, + { + "epoch": 0.6861282761559769, + "eval_loss": 0.0031101179774850607, + "eval_runtime": 35.0365, + "eval_samples_per_second": 4286.212, + "eval_steps_per_second": 66.987, + "step": 6440 + }, + { + "epoch": 0.6871936927338589, + "grad_norm": 0.0026636181864887476, + "learning_rate": 6.256126145322822e-06, + "loss": 0.0181, + "step": 6450 + }, + { + "epoch": 0.6871936927338589, + "eval_loss": 0.0030249811243265867, + "eval_runtime": 35.0288, + "eval_samples_per_second": 4287.161, + "eval_steps_per_second": 67.002, + "step": 6450 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 0.0036579566076397896, + "learning_rate": 6.234817813765183e-06, + "loss": 0.0005, + "step": 6460 + }, + { + "epoch": 0.6882591093117408, + "eval_loss": 0.0030106704216450453, + "eval_runtime": 35.0327, + "eval_samples_per_second": 4286.684, + "eval_steps_per_second": 66.995, + "step": 6460 + }, + { + "epoch": 0.6893245258896229, + "grad_norm": 0.003752629505470395, + "learning_rate": 6.213509482207544e-06, + "loss": 0.0006, + "step": 6470 + }, + { + "epoch": 0.6893245258896229, + "eval_loss": 0.0030355704948306084, + "eval_runtime": 35.0648, + "eval_samples_per_second": 4282.758, + "eval_steps_per_second": 66.933, + "step": 6470 + }, + { + "epoch": 0.6903899424675048, + "grad_norm": 0.06187931075692177, + "learning_rate": 6.192201150649905e-06, + "loss": 0.0014, + "step": 6480 + }, + { + "epoch": 0.6903899424675048, + "eval_loss": 0.003116002306342125, + "eval_runtime": 35.0665, + "eval_samples_per_second": 4282.551, + "eval_steps_per_second": 66.93, + "step": 6480 + }, + { + "epoch": 0.6914553590453868, + "grad_norm": 0.03547167405486107, + "learning_rate": 6.1708928190922656e-06, + "loss": 0.0002, + "step": 6490 + }, + { + "epoch": 0.6914553590453868, + "eval_loss": 0.003167262999340892, + "eval_runtime": 35.0077, + "eval_samples_per_second": 4289.742, + "eval_steps_per_second": 67.042, + "step": 6490 + }, + { + "epoch": 0.6925207756232687, + "grad_norm": 0.04050152748823166, + "learning_rate": 6.1495844875346266e-06, + "loss": 0.0003, + "step": 6500 + }, + { + "epoch": 0.6925207756232687, + "eval_loss": 0.0032066998537629843, + "eval_runtime": 35.0207, + "eval_samples_per_second": 4288.147, + "eval_steps_per_second": 67.017, + "step": 6500 + }, + { + "epoch": 0.6935861922011507, + "grad_norm": 0.14706210792064667, + "learning_rate": 6.1282761559769876e-06, + "loss": 0.0001, + "step": 6510 + }, + { + "epoch": 0.6935861922011507, + "eval_loss": 0.0032550478354096413, + "eval_runtime": 35.0392, + "eval_samples_per_second": 4285.881, + "eval_steps_per_second": 66.982, + "step": 6510 + }, + { + "epoch": 0.6946516087790326, + "grad_norm": 1.0719351768493652, + "learning_rate": 6.1069678244193485e-06, + "loss": 0.0039, + "step": 6520 + }, + { + "epoch": 0.6946516087790326, + "eval_loss": 0.003266693092882633, + "eval_runtime": 35.0566, + "eval_samples_per_second": 4283.762, + "eval_steps_per_second": 66.949, + "step": 6520 + }, + { + "epoch": 0.6957170253569146, + "grad_norm": 0.0011848441790789366, + "learning_rate": 6.0856594928617095e-06, + "loss": 0.0002, + "step": 6530 + }, + { + "epoch": 0.6957170253569146, + "eval_loss": 0.0031671386677771807, + "eval_runtime": 35.0738, + "eval_samples_per_second": 4281.652, + "eval_steps_per_second": 66.916, + "step": 6530 + }, + { + "epoch": 0.6967824419347965, + "grad_norm": 0.042776867747306824, + "learning_rate": 6.0643511613040705e-06, + "loss": 0.0004, + "step": 6540 + }, + { + "epoch": 0.6967824419347965, + "eval_loss": 0.003157460829243064, + "eval_runtime": 35.0847, + "eval_samples_per_second": 4280.324, + "eval_steps_per_second": 66.895, + "step": 6540 + }, + { + "epoch": 0.6978478585126785, + "grad_norm": 0.1637280434370041, + "learning_rate": 6.0430428297464315e-06, + "loss": 0.0006, + "step": 6550 + }, + { + "epoch": 0.6978478585126785, + "eval_loss": 0.00318445498123765, + "eval_runtime": 35.0605, + "eval_samples_per_second": 4283.276, + "eval_steps_per_second": 66.941, + "step": 6550 + }, + { + "epoch": 0.6989132750905604, + "grad_norm": 0.04782974347472191, + "learning_rate": 6.0217344981887925e-06, + "loss": 0.0002, + "step": 6560 + }, + { + "epoch": 0.6989132750905604, + "eval_loss": 0.0032346732914447784, + "eval_runtime": 35.05, + "eval_samples_per_second": 4284.569, + "eval_steps_per_second": 66.962, + "step": 6560 + }, + { + "epoch": 0.6999786916684424, + "grad_norm": 0.003285630140453577, + "learning_rate": 6.0004261666311535e-06, + "loss": 0.0067, + "step": 6570 + }, + { + "epoch": 0.6999786916684424, + "eval_loss": 0.003163369372487068, + "eval_runtime": 35.0327, + "eval_samples_per_second": 4286.681, + "eval_steps_per_second": 66.995, + "step": 6570 + }, + { + "epoch": 0.7010441082463243, + "grad_norm": 0.0016075136372819543, + "learning_rate": 5.9791178350735145e-06, + "loss": 0.0004, + "step": 6580 + }, + { + "epoch": 0.7010441082463243, + "eval_loss": 0.003068899270147085, + "eval_runtime": 35.0951, + "eval_samples_per_second": 4279.057, + "eval_steps_per_second": 66.875, + "step": 6580 + }, + { + "epoch": 0.7021095248242063, + "grad_norm": 0.0011133512016385794, + "learning_rate": 5.9578095035158755e-06, + "loss": 0.0014, + "step": 6590 + }, + { + "epoch": 0.7021095248242063, + "eval_loss": 0.003099815221503377, + "eval_runtime": 35.0502, + "eval_samples_per_second": 4284.536, + "eval_steps_per_second": 66.961, + "step": 6590 + }, + { + "epoch": 0.7031749414020882, + "grad_norm": 0.002385763917118311, + "learning_rate": 5.9365011719582365e-06, + "loss": 0.0002, + "step": 6600 + }, + { + "epoch": 0.7031749414020882, + "eval_loss": 0.003167761955410242, + "eval_runtime": 35.0594, + "eval_samples_per_second": 4283.418, + "eval_steps_per_second": 66.944, + "step": 6600 + }, + { + "epoch": 0.7042403579799702, + "grad_norm": 0.0011592097580432892, + "learning_rate": 5.9151928404005975e-06, + "loss": 0.0031, + "step": 6610 + }, + { + "epoch": 0.7042403579799702, + "eval_loss": 0.0031276061199605465, + "eval_runtime": 35.0211, + "eval_samples_per_second": 4288.1, + "eval_steps_per_second": 67.017, + "step": 6610 + }, + { + "epoch": 0.7053057745578521, + "grad_norm": 0.0014141725841909647, + "learning_rate": 5.8938845088429584e-06, + "loss": 0.0002, + "step": 6620 + }, + { + "epoch": 0.7053057745578521, + "eval_loss": 0.0030569627415388823, + "eval_runtime": 35.0162, + "eval_samples_per_second": 4288.698, + "eval_steps_per_second": 67.026, + "step": 6620 + }, + { + "epoch": 0.7063711911357341, + "grad_norm": 0.0018372322665527463, + "learning_rate": 5.8725761772853194e-06, + "loss": 0.008, + "step": 6630 + }, + { + "epoch": 0.7063711911357341, + "eval_loss": 0.003044996177777648, + "eval_runtime": 35.0321, + "eval_samples_per_second": 4286.754, + "eval_steps_per_second": 66.996, + "step": 6630 + }, + { + "epoch": 0.707436607713616, + "grad_norm": 0.0027874810621142387, + "learning_rate": 5.8512678457276796e-06, + "loss": 0.0012, + "step": 6640 + }, + { + "epoch": 0.707436607713616, + "eval_loss": 0.0030959330033510923, + "eval_runtime": 35.0844, + "eval_samples_per_second": 4280.361, + "eval_steps_per_second": 66.896, + "step": 6640 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 2.3545823097229004, + "learning_rate": 5.8299595141700406e-06, + "loss": 0.0009, + "step": 6650 + }, + { + "epoch": 0.708502024291498, + "eval_loss": 0.003153095720335841, + "eval_runtime": 34.9933, + "eval_samples_per_second": 4291.507, + "eval_steps_per_second": 67.07, + "step": 6650 + }, + { + "epoch": 0.70956744086938, + "grad_norm": 0.0011235169367864728, + "learning_rate": 5.8086511826124016e-06, + "loss": 0.0005, + "step": 6660 + }, + { + "epoch": 0.70956744086938, + "eval_loss": 0.0032801416236907244, + "eval_runtime": 35.0278, + "eval_samples_per_second": 4287.279, + "eval_steps_per_second": 67.004, + "step": 6660 + }, + { + "epoch": 0.7106328574472619, + "grad_norm": 0.023665864020586014, + "learning_rate": 5.7873428510547625e-06, + "loss": 0.0004, + "step": 6670 + }, + { + "epoch": 0.7106328574472619, + "eval_loss": 0.0033080640714615583, + "eval_runtime": 35.0146, + "eval_samples_per_second": 4288.902, + "eval_steps_per_second": 67.029, + "step": 6670 + }, + { + "epoch": 0.7116982740251439, + "grad_norm": 0.00826460961252451, + "learning_rate": 5.7660345194971235e-06, + "loss": 0.0006, + "step": 6680 + }, + { + "epoch": 0.7116982740251439, + "eval_loss": 0.003354353830218315, + "eval_runtime": 35.1018, + "eval_samples_per_second": 4278.238, + "eval_steps_per_second": 66.863, + "step": 6680 + }, + { + "epoch": 0.7127636906030258, + "grad_norm": 0.2588113248348236, + "learning_rate": 5.7447261879394845e-06, + "loss": 0.001, + "step": 6690 + }, + { + "epoch": 0.7127636906030258, + "eval_loss": 0.00345269194804132, + "eval_runtime": 35.0432, + "eval_samples_per_second": 4285.398, + "eval_steps_per_second": 66.975, + "step": 6690 + }, + { + "epoch": 0.7138291071809078, + "grad_norm": 0.0016366565832868218, + "learning_rate": 5.7234178563818455e-06, + "loss": 0.0001, + "step": 6700 + }, + { + "epoch": 0.7138291071809078, + "eval_loss": 0.0035686830524355173, + "eval_runtime": 35.0532, + "eval_samples_per_second": 4284.173, + "eval_steps_per_second": 66.955, + "step": 6700 + }, + { + "epoch": 0.7148945237587897, + "grad_norm": 0.0024288988206535578, + "learning_rate": 5.7021095248242065e-06, + "loss": 0.0051, + "step": 6710 + }, + { + "epoch": 0.7148945237587897, + "eval_loss": 0.0036059534177184105, + "eval_runtime": 35.0346, + "eval_samples_per_second": 4286.443, + "eval_steps_per_second": 66.991, + "step": 6710 + }, + { + "epoch": 0.7159599403366717, + "grad_norm": 0.0010271297069266438, + "learning_rate": 5.6808011932665675e-06, + "loss": 0.0032, + "step": 6720 + }, + { + "epoch": 0.7159599403366717, + "eval_loss": 0.0035558068193495274, + "eval_runtime": 35.0227, + "eval_samples_per_second": 4287.901, + "eval_steps_per_second": 67.014, + "step": 6720 + }, + { + "epoch": 0.7170253569145536, + "grad_norm": 0.0013646967709064484, + "learning_rate": 5.6594928617089285e-06, + "loss": 0.0001, + "step": 6730 + }, + { + "epoch": 0.7170253569145536, + "eval_loss": 0.003483639331534505, + "eval_runtime": 35.0068, + "eval_samples_per_second": 4289.851, + "eval_steps_per_second": 67.044, + "step": 6730 + }, + { + "epoch": 0.7180907734924356, + "grad_norm": 0.0010676413075998425, + "learning_rate": 5.6381845301512895e-06, + "loss": 0.0003, + "step": 6740 + }, + { + "epoch": 0.7180907734924356, + "eval_loss": 0.0034696413204073906, + "eval_runtime": 35.0092, + "eval_samples_per_second": 4289.554, + "eval_steps_per_second": 67.039, + "step": 6740 + }, + { + "epoch": 0.7191561900703175, + "grad_norm": 0.0070797838270664215, + "learning_rate": 5.6168761985936505e-06, + "loss": 0.0002, + "step": 6750 + }, + { + "epoch": 0.7191561900703175, + "eval_loss": 0.003489007707685232, + "eval_runtime": 35.0141, + "eval_samples_per_second": 4288.963, + "eval_steps_per_second": 67.03, + "step": 6750 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 0.0010801940225064754, + "learning_rate": 5.5955678670360115e-06, + "loss": 0.0055, + "step": 6760 + }, + { + "epoch": 0.7202216066481995, + "eval_loss": 0.003356917528435588, + "eval_runtime": 35.0544, + "eval_samples_per_second": 4284.03, + "eval_steps_per_second": 66.953, + "step": 6760 + }, + { + "epoch": 0.7212870232260814, + "grad_norm": 0.0018471528310328722, + "learning_rate": 5.5742595354783724e-06, + "loss": 0.0002, + "step": 6770 + }, + { + "epoch": 0.7212870232260814, + "eval_loss": 0.00331767532043159, + "eval_runtime": 35.06, + "eval_samples_per_second": 4283.344, + "eval_steps_per_second": 66.942, + "step": 6770 + }, + { + "epoch": 0.7223524398039634, + "grad_norm": 0.0017673600232228637, + "learning_rate": 5.5529512039207334e-06, + "loss": 0.0056, + "step": 6780 + }, + { + "epoch": 0.7223524398039634, + "eval_loss": 0.0031172942835837603, + "eval_runtime": 35.0612, + "eval_samples_per_second": 4283.197, + "eval_steps_per_second": 66.94, + "step": 6780 + }, + { + "epoch": 0.7234178563818453, + "grad_norm": 0.0015435615787282586, + "learning_rate": 5.5316428723630944e-06, + "loss": 0.0007, + "step": 6790 + }, + { + "epoch": 0.7234178563818453, + "eval_loss": 0.0030721002258360386, + "eval_runtime": 35.0224, + "eval_samples_per_second": 4287.935, + "eval_steps_per_second": 67.014, + "step": 6790 + }, + { + "epoch": 0.7244832729597273, + "grad_norm": 0.04698014259338379, + "learning_rate": 5.510334540805455e-06, + "loss": 0.001, + "step": 6800 + }, + { + "epoch": 0.7244832729597273, + "eval_loss": 0.00307706487365067, + "eval_runtime": 35.0244, + "eval_samples_per_second": 4287.692, + "eval_steps_per_second": 67.01, + "step": 6800 + }, + { + "epoch": 0.7255486895376092, + "grad_norm": 0.002133553382009268, + "learning_rate": 5.489026209247816e-06, + "loss": 0.0001, + "step": 6810 + }, + { + "epoch": 0.7255486895376092, + "eval_loss": 0.003107481636106968, + "eval_runtime": 35.0466, + "eval_samples_per_second": 4284.985, + "eval_steps_per_second": 66.968, + "step": 6810 + }, + { + "epoch": 0.7266141061154912, + "grad_norm": 0.0030837086960673332, + "learning_rate": 5.467717877690177e-06, + "loss": 0.0012, + "step": 6820 + }, + { + "epoch": 0.7266141061154912, + "eval_loss": 0.003099891124293208, + "eval_runtime": 35.0508, + "eval_samples_per_second": 4284.463, + "eval_steps_per_second": 66.96, + "step": 6820 + }, + { + "epoch": 0.7276795226933731, + "grad_norm": 0.0010295656975358725, + "learning_rate": 5.446409546132538e-06, + "loss": 0.017, + "step": 6830 + }, + { + "epoch": 0.7276795226933731, + "eval_loss": 0.0031501969788223505, + "eval_runtime": 35.0735, + "eval_samples_per_second": 4281.691, + "eval_steps_per_second": 66.917, + "step": 6830 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 0.0011918977834284306, + "learning_rate": 5.425101214574899e-06, + "loss": 0.0015, + "step": 6840 + }, + { + "epoch": 0.728744939271255, + "eval_loss": 0.0031602659728378057, + "eval_runtime": 35.0557, + "eval_samples_per_second": 4283.869, + "eval_steps_per_second": 66.951, + "step": 6840 + }, + { + "epoch": 0.729810355849137, + "grad_norm": 0.002359379781410098, + "learning_rate": 5.40379288301726e-06, + "loss": 0.0017, + "step": 6850 + }, + { + "epoch": 0.729810355849137, + "eval_loss": 0.0032126172445714474, + "eval_runtime": 35.0649, + "eval_samples_per_second": 4282.74, + "eval_steps_per_second": 66.933, + "step": 6850 + }, + { + "epoch": 0.730875772427019, + "grad_norm": 0.002211513929069042, + "learning_rate": 5.382484551459621e-06, + "loss": 0.0001, + "step": 6860 + }, + { + "epoch": 0.730875772427019, + "eval_loss": 0.0033024682197719812, + "eval_runtime": 35.0584, + "eval_samples_per_second": 4283.538, + "eval_steps_per_second": 66.945, + "step": 6860 + }, + { + "epoch": 0.7319411890049009, + "grad_norm": 0.0362793393433094, + "learning_rate": 5.361176219901982e-06, + "loss": 0.0138, + "step": 6870 + }, + { + "epoch": 0.7319411890049009, + "eval_loss": 0.003259913297370076, + "eval_runtime": 35.0459, + "eval_samples_per_second": 4285.062, + "eval_steps_per_second": 66.969, + "step": 6870 + }, + { + "epoch": 0.7330066055827829, + "grad_norm": 0.0012098865117877722, + "learning_rate": 5.339867888344343e-06, + "loss": 0.0001, + "step": 6880 + }, + { + "epoch": 0.7330066055827829, + "eval_loss": 0.0032443315722048283, + "eval_runtime": 35.0439, + "eval_samples_per_second": 4285.305, + "eval_steps_per_second": 66.973, + "step": 6880 + }, + { + "epoch": 0.7340720221606648, + "grad_norm": 0.0010898082982748747, + "learning_rate": 5.318559556786704e-06, + "loss": 0.0007, + "step": 6890 + }, + { + "epoch": 0.7340720221606648, + "eval_loss": 0.0032403902150690556, + "eval_runtime": 35.0508, + "eval_samples_per_second": 4284.466, + "eval_steps_per_second": 66.96, + "step": 6890 + }, + { + "epoch": 0.7351374387385468, + "grad_norm": 0.016424862667918205, + "learning_rate": 5.297251225229065e-06, + "loss": 0.0163, + "step": 6900 + }, + { + "epoch": 0.7351374387385468, + "eval_loss": 0.003254901384934783, + "eval_runtime": 35.042, + "eval_samples_per_second": 4285.543, + "eval_steps_per_second": 66.977, + "step": 6900 + }, + { + "epoch": 0.7362028553164287, + "grad_norm": 0.0012151696719229221, + "learning_rate": 5.275942893671426e-06, + "loss": 0.0002, + "step": 6910 + }, + { + "epoch": 0.7362028553164287, + "eval_loss": 0.003262386191636324, + "eval_runtime": 35.0134, + "eval_samples_per_second": 4289.044, + "eval_steps_per_second": 67.031, + "step": 6910 + }, + { + "epoch": 0.7372682718943107, + "grad_norm": 0.3996301293373108, + "learning_rate": 5.254634562113787e-06, + "loss": 0.0087, + "step": 6920 + }, + { + "epoch": 0.7372682718943107, + "eval_loss": 0.003233132418245077, + "eval_runtime": 35.0817, + "eval_samples_per_second": 4280.688, + "eval_steps_per_second": 66.901, + "step": 6920 + }, + { + "epoch": 0.7383336884721926, + "grad_norm": 0.0016513338778167963, + "learning_rate": 5.233326230556148e-06, + "loss": 0.0003, + "step": 6930 + }, + { + "epoch": 0.7383336884721926, + "eval_loss": 0.003179131541401148, + "eval_runtime": 35.056, + "eval_samples_per_second": 4283.832, + "eval_steps_per_second": 66.95, + "step": 6930 + }, + { + "epoch": 0.7393991050500746, + "grad_norm": 0.0020407168194651604, + "learning_rate": 5.212017898998509e-06, + "loss": 0.0001, + "step": 6940 + }, + { + "epoch": 0.7393991050500746, + "eval_loss": 0.003184954635798931, + "eval_runtime": 35.0381, + "eval_samples_per_second": 4286.014, + "eval_steps_per_second": 66.984, + "step": 6940 + }, + { + "epoch": 0.7404645216279565, + "grad_norm": 0.0016329142963513732, + "learning_rate": 5.19070956744087e-06, + "loss": 0.0001, + "step": 6950 + }, + { + "epoch": 0.7404645216279565, + "eval_loss": 0.0031928608659654856, + "eval_runtime": 35.0685, + "eval_samples_per_second": 4282.299, + "eval_steps_per_second": 66.926, + "step": 6950 + }, + { + "epoch": 0.7415299382058385, + "grad_norm": 0.001757573802024126, + "learning_rate": 5.169401235883231e-06, + "loss": 0.0008, + "step": 6960 + }, + { + "epoch": 0.7415299382058385, + "eval_loss": 0.0032357927411794662, + "eval_runtime": 35.0463, + "eval_samples_per_second": 4285.021, + "eval_steps_per_second": 66.969, + "step": 6960 + }, + { + "epoch": 0.7425953547837204, + "grad_norm": 0.0012253515888005495, + "learning_rate": 5.148092904325592e-06, + "loss": 0.0123, + "step": 6970 + }, + { + "epoch": 0.7425953547837204, + "eval_loss": 0.003153954865410924, + "eval_runtime": 35.056, + "eval_samples_per_second": 4283.835, + "eval_steps_per_second": 66.95, + "step": 6970 + }, + { + "epoch": 0.7436607713616024, + "grad_norm": 0.011123016476631165, + "learning_rate": 5.126784572767952e-06, + "loss": 0.0002, + "step": 6980 + }, + { + "epoch": 0.7436607713616024, + "eval_loss": 0.003115166211500764, + "eval_runtime": 35.0253, + "eval_samples_per_second": 4287.583, + "eval_steps_per_second": 67.009, + "step": 6980 + }, + { + "epoch": 0.7447261879394843, + "grad_norm": 0.0014360809000208974, + "learning_rate": 5.105476241210313e-06, + "loss": 0.0025, + "step": 6990 + }, + { + "epoch": 0.7447261879394843, + "eval_loss": 0.0031400981824845076, + "eval_runtime": 35.0588, + "eval_samples_per_second": 4283.494, + "eval_steps_per_second": 66.945, + "step": 6990 + }, + { + "epoch": 0.7457916045173663, + "grad_norm": 0.11274624615907669, + "learning_rate": 5.084167909652674e-06, + "loss": 0.0034, + "step": 7000 + }, + { + "epoch": 0.7457916045173663, + "eval_loss": 0.00313013419508934, + "eval_runtime": 35.1033, + "eval_samples_per_second": 4278.065, + "eval_steps_per_second": 66.86, + "step": 7000 + }, + { + "epoch": 0.7468570210952482, + "grad_norm": 0.0017726977821439505, + "learning_rate": 5.062859578095035e-06, + "loss": 0.0028, + "step": 7010 + }, + { + "epoch": 0.7468570210952482, + "eval_loss": 0.003186985617503524, + "eval_runtime": 35.0513, + "eval_samples_per_second": 4284.409, + "eval_steps_per_second": 66.959, + "step": 7010 + }, + { + "epoch": 0.7479224376731302, + "grad_norm": 0.001665642368607223, + "learning_rate": 5.041551246537396e-06, + "loss": 0.0023, + "step": 7020 + }, + { + "epoch": 0.7479224376731302, + "eval_loss": 0.003349791280925274, + "eval_runtime": 35.0399, + "eval_samples_per_second": 4285.805, + "eval_steps_per_second": 66.981, + "step": 7020 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 0.015697909519076347, + "learning_rate": 5.020242914979757e-06, + "loss": 0.0007, + "step": 7030 + }, + { + "epoch": 0.7489878542510121, + "eval_loss": 0.003393676597625017, + "eval_runtime": 35.0379, + "eval_samples_per_second": 4286.046, + "eval_steps_per_second": 66.985, + "step": 7030 + }, + { + "epoch": 0.7500532708288941, + "grad_norm": 0.0011734378058463335, + "learning_rate": 4.998934583422118e-06, + "loss": 0.0013, + "step": 7040 + }, + { + "epoch": 0.7500532708288941, + "eval_loss": 0.003516310593113303, + "eval_runtime": 35.0437, + "eval_samples_per_second": 4285.337, + "eval_steps_per_second": 66.974, + "step": 7040 + }, + { + "epoch": 0.751118687406776, + "grad_norm": 0.001946283970028162, + "learning_rate": 4.977626251864479e-06, + "loss": 0.0021, + "step": 7050 + }, + { + "epoch": 0.751118687406776, + "eval_loss": 0.003597394796088338, + "eval_runtime": 35.0327, + "eval_samples_per_second": 4286.677, + "eval_steps_per_second": 66.994, + "step": 7050 + }, + { + "epoch": 0.752184103984658, + "grad_norm": 0.0019929111003875732, + "learning_rate": 4.95631792030684e-06, + "loss": 0.0004, + "step": 7060 + }, + { + "epoch": 0.752184103984658, + "eval_loss": 0.003620902309194207, + "eval_runtime": 35.0599, + "eval_samples_per_second": 4283.349, + "eval_steps_per_second": 66.942, + "step": 7060 + }, + { + "epoch": 0.7532495205625399, + "grad_norm": 0.0011990427738055587, + "learning_rate": 4.935009588749201e-06, + "loss": 0.0018, + "step": 7070 + }, + { + "epoch": 0.7532495205625399, + "eval_loss": 0.0034169661812484264, + "eval_runtime": 35.0447, + "eval_samples_per_second": 4285.211, + "eval_steps_per_second": 66.972, + "step": 7070 + }, + { + "epoch": 0.7543149371404219, + "grad_norm": 0.10688398033380508, + "learning_rate": 4.913701257191562e-06, + "loss": 0.0024, + "step": 7080 + }, + { + "epoch": 0.7543149371404219, + "eval_loss": 0.0034140669740736485, + "eval_runtime": 35.0525, + "eval_samples_per_second": 4284.26, + "eval_steps_per_second": 66.957, + "step": 7080 + }, + { + "epoch": 0.7553803537183038, + "grad_norm": 0.005744527094066143, + "learning_rate": 4.892392925633923e-06, + "loss": 0.0007, + "step": 7090 + }, + { + "epoch": 0.7553803537183038, + "eval_loss": 0.0033137863501906395, + "eval_runtime": 35.0391, + "eval_samples_per_second": 4285.902, + "eval_steps_per_second": 66.982, + "step": 7090 + }, + { + "epoch": 0.7564457702961858, + "grad_norm": 0.0011864439584314823, + "learning_rate": 4.871084594076284e-06, + "loss": 0.0004, + "step": 7100 + }, + { + "epoch": 0.7564457702961858, + "eval_loss": 0.0032741157338023186, + "eval_runtime": 35.0598, + "eval_samples_per_second": 4283.366, + "eval_steps_per_second": 66.943, + "step": 7100 + }, + { + "epoch": 0.7575111868740677, + "grad_norm": 0.003718329593539238, + "learning_rate": 4.849776262518645e-06, + "loss": 0.0005, + "step": 7110 + }, + { + "epoch": 0.7575111868740677, + "eval_loss": 0.0032938900403678417, + "eval_runtime": 34.9977, + "eval_samples_per_second": 4290.962, + "eval_steps_per_second": 67.061, + "step": 7110 + }, + { + "epoch": 0.7585766034519497, + "grad_norm": 0.0011979677947238088, + "learning_rate": 4.828467930961006e-06, + "loss": 0.0045, + "step": 7120 + }, + { + "epoch": 0.7585766034519497, + "eval_loss": 0.0033634670544415712, + "eval_runtime": 35.0131, + "eval_samples_per_second": 4289.081, + "eval_steps_per_second": 67.032, + "step": 7120 + }, + { + "epoch": 0.7596420200298316, + "grad_norm": 0.0033819531090557575, + "learning_rate": 4.807159599403367e-06, + "loss": 0.0209, + "step": 7130 + }, + { + "epoch": 0.7596420200298316, + "eval_loss": 0.0031614580657333136, + "eval_runtime": 35.028, + "eval_samples_per_second": 4287.261, + "eval_steps_per_second": 67.004, + "step": 7130 + }, + { + "epoch": 0.7607074366077136, + "grad_norm": 0.0051054502837359905, + "learning_rate": 4.785851267845728e-06, + "loss": 0.0107, + "step": 7140 + }, + { + "epoch": 0.7607074366077136, + "eval_loss": 0.003080246038734913, + "eval_runtime": 35.1422, + "eval_samples_per_second": 4273.324, + "eval_steps_per_second": 66.786, + "step": 7140 + }, + { + "epoch": 0.7617728531855956, + "grad_norm": 0.13544993102550507, + "learning_rate": 4.764542936288089e-06, + "loss": 0.0006, + "step": 7150 + }, + { + "epoch": 0.7617728531855956, + "eval_loss": 0.0030894039664417505, + "eval_runtime": 35.0199, + "eval_samples_per_second": 4288.25, + "eval_steps_per_second": 67.019, + "step": 7150 + }, + { + "epoch": 0.7628382697634776, + "grad_norm": 0.0017130186315625906, + "learning_rate": 4.74323460473045e-06, + "loss": 0.0001, + "step": 7160 + }, + { + "epoch": 0.7628382697634776, + "eval_loss": 0.003118880558758974, + "eval_runtime": 35.0319, + "eval_samples_per_second": 4286.777, + "eval_steps_per_second": 66.996, + "step": 7160 + }, + { + "epoch": 0.7639036863413595, + "grad_norm": 1.1553536653518677, + "learning_rate": 4.721926273172811e-06, + "loss": 0.0155, + "step": 7170 + }, + { + "epoch": 0.7639036863413595, + "eval_loss": 0.0030046890024095774, + "eval_runtime": 35.0791, + "eval_samples_per_second": 4281.007, + "eval_steps_per_second": 66.906, + "step": 7170 + }, + { + "epoch": 0.7649691029192415, + "grad_norm": 0.0015282640233635902, + "learning_rate": 4.700617941615172e-06, + "loss": 0.0071, + "step": 7180 + }, + { + "epoch": 0.7649691029192415, + "eval_loss": 0.0028774854727089405, + "eval_runtime": 35.0386, + "eval_samples_per_second": 4285.954, + "eval_steps_per_second": 66.983, + "step": 7180 + }, + { + "epoch": 0.7660345194971234, + "grad_norm": 0.001786403707228601, + "learning_rate": 4.679309610057533e-06, + "loss": 0.0042, + "step": 7190 + }, + { + "epoch": 0.7660345194971234, + "eval_loss": 0.002872324315831065, + "eval_runtime": 35.0037, + "eval_samples_per_second": 4290.235, + "eval_steps_per_second": 67.05, + "step": 7190 + }, + { + "epoch": 0.7670999360750054, + "grad_norm": 0.002205133670940995, + "learning_rate": 4.658001278499894e-06, + "loss": 0.0006, + "step": 7200 + }, + { + "epoch": 0.7670999360750054, + "eval_loss": 0.0029324537608772516, + "eval_runtime": 35.0174, + "eval_samples_per_second": 4288.558, + "eval_steps_per_second": 67.024, + "step": 7200 + }, + { + "epoch": 0.7681653526528873, + "grad_norm": 0.002565717324614525, + "learning_rate": 4.636692946942255e-06, + "loss": 0.0005, + "step": 7210 + }, + { + "epoch": 0.7681653526528873, + "eval_loss": 0.0029697574209421873, + "eval_runtime": 35.0534, + "eval_samples_per_second": 4284.145, + "eval_steps_per_second": 66.955, + "step": 7210 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.1513327360153198, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0162, + "step": 7220 + }, + { + "epoch": 0.7692307692307693, + "eval_loss": 0.002979603363201022, + "eval_runtime": 35.0461, + "eval_samples_per_second": 4285.043, + "eval_steps_per_second": 66.969, + "step": 7220 + }, + { + "epoch": 0.7702961858086512, + "grad_norm": 0.002426127204671502, + "learning_rate": 4.594076283826976e-06, + "loss": 0.0009, + "step": 7230 + }, + { + "epoch": 0.7702961858086512, + "eval_loss": 0.002993279369547963, + "eval_runtime": 35.0161, + "eval_samples_per_second": 4288.714, + "eval_steps_per_second": 67.026, + "step": 7230 + }, + { + "epoch": 0.7713616023865332, + "grad_norm": 0.0015931341331452131, + "learning_rate": 4.572767952269337e-06, + "loss": 0.0004, + "step": 7240 + }, + { + "epoch": 0.7713616023865332, + "eval_loss": 0.0029843186493963003, + "eval_runtime": 35.0478, + "eval_samples_per_second": 4284.837, + "eval_steps_per_second": 66.966, + "step": 7240 + }, + { + "epoch": 0.7724270189644151, + "grad_norm": 0.0018893532687798142, + "learning_rate": 4.551459620711698e-06, + "loss": 0.0078, + "step": 7250 + }, + { + "epoch": 0.7724270189644151, + "eval_loss": 0.0029693315736949444, + "eval_runtime": 35.0331, + "eval_samples_per_second": 4286.632, + "eval_steps_per_second": 66.994, + "step": 7250 + }, + { + "epoch": 0.7734924355422971, + "grad_norm": 0.009522825479507446, + "learning_rate": 4.530151289154059e-06, + "loss": 0.02, + "step": 7260 + }, + { + "epoch": 0.7734924355422971, + "eval_loss": 0.0028855737764388323, + "eval_runtime": 35.057, + "eval_samples_per_second": 4283.708, + "eval_steps_per_second": 66.948, + "step": 7260 + }, + { + "epoch": 0.774557852120179, + "grad_norm": 0.00656323553994298, + "learning_rate": 4.50884295759642e-06, + "loss": 0.0004, + "step": 7270 + }, + { + "epoch": 0.774557852120179, + "eval_loss": 0.002848101779818535, + "eval_runtime": 35.0368, + "eval_samples_per_second": 4286.18, + "eval_steps_per_second": 66.987, + "step": 7270 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 0.07277275621891022, + "learning_rate": 4.487534626038781e-06, + "loss": 0.0022, + "step": 7280 + }, + { + "epoch": 0.775623268698061, + "eval_loss": 0.00286454102024436, + "eval_runtime": 35.0012, + "eval_samples_per_second": 4290.544, + "eval_steps_per_second": 67.055, + "step": 7280 + }, + { + "epoch": 0.7766886852759429, + "grad_norm": 0.0020550009794533253, + "learning_rate": 4.466226294481142e-06, + "loss": 0.0009, + "step": 7290 + }, + { + "epoch": 0.7766886852759429, + "eval_loss": 0.002871564356610179, + "eval_runtime": 35.0429, + "eval_samples_per_second": 4285.428, + "eval_steps_per_second": 66.975, + "step": 7290 + }, + { + "epoch": 0.7777541018538249, + "grad_norm": 0.0024138211738318205, + "learning_rate": 4.444917962923503e-06, + "loss": 0.0017, + "step": 7300 + }, + { + "epoch": 0.7777541018538249, + "eval_loss": 0.0029342793859541416, + "eval_runtime": 35.0165, + "eval_samples_per_second": 4288.667, + "eval_steps_per_second": 67.026, + "step": 7300 + }, + { + "epoch": 0.7788195184317068, + "grad_norm": 0.002006649738177657, + "learning_rate": 4.423609631365864e-06, + "loss": 0.0001, + "step": 7310 + }, + { + "epoch": 0.7788195184317068, + "eval_loss": 0.0029674111865460873, + "eval_runtime": 35.0269, + "eval_samples_per_second": 4287.391, + "eval_steps_per_second": 67.006, + "step": 7310 + }, + { + "epoch": 0.7798849350095888, + "grad_norm": 0.23964039981365204, + "learning_rate": 4.402301299808225e-06, + "loss": 0.0005, + "step": 7320 + }, + { + "epoch": 0.7798849350095888, + "eval_loss": 0.0029816378373652697, + "eval_runtime": 35.0273, + "eval_samples_per_second": 4287.336, + "eval_steps_per_second": 67.005, + "step": 7320 + }, + { + "epoch": 0.7809503515874707, + "grad_norm": 0.07510890811681747, + "learning_rate": 4.380992968250586e-06, + "loss": 0.0171, + "step": 7330 + }, + { + "epoch": 0.7809503515874707, + "eval_loss": 0.002983283717185259, + "eval_runtime": 35.0339, + "eval_samples_per_second": 4286.533, + "eval_steps_per_second": 66.992, + "step": 7330 + }, + { + "epoch": 0.7820157681653527, + "grad_norm": 0.026817040517926216, + "learning_rate": 4.359684636692947e-06, + "loss": 0.0002, + "step": 7340 + }, + { + "epoch": 0.7820157681653527, + "eval_loss": 0.0029749777168035507, + "eval_runtime": 35.0352, + "eval_samples_per_second": 4286.376, + "eval_steps_per_second": 66.99, + "step": 7340 + }, + { + "epoch": 0.7830811847432346, + "grad_norm": 0.002611766569316387, + "learning_rate": 4.338376305135308e-06, + "loss": 0.002, + "step": 7350 + }, + { + "epoch": 0.7830811847432346, + "eval_loss": 0.003011771710589528, + "eval_runtime": 35.0182, + "eval_samples_per_second": 4288.458, + "eval_steps_per_second": 67.022, + "step": 7350 + }, + { + "epoch": 0.7841466013211166, + "grad_norm": 0.0021272755693644285, + "learning_rate": 4.317067973577669e-06, + "loss": 0.0019, + "step": 7360 + }, + { + "epoch": 0.7841466013211166, + "eval_loss": 0.0032490803860127926, + "eval_runtime": 35.0078, + "eval_samples_per_second": 4289.734, + "eval_steps_per_second": 67.042, + "step": 7360 + }, + { + "epoch": 0.7852120178989985, + "grad_norm": 0.0024452470242977142, + "learning_rate": 4.29575964202003e-06, + "loss": 0.0066, + "step": 7370 + }, + { + "epoch": 0.7852120178989985, + "eval_loss": 0.0033675709273666143, + "eval_runtime": 35.0296, + "eval_samples_per_second": 4287.054, + "eval_steps_per_second": 67.0, + "step": 7370 + }, + { + "epoch": 0.7862774344768805, + "grad_norm": 0.31848591566085815, + "learning_rate": 4.274451310462391e-06, + "loss": 0.0024, + "step": 7380 + }, + { + "epoch": 0.7862774344768805, + "eval_loss": 0.0031276163645088673, + "eval_runtime": 35.0063, + "eval_samples_per_second": 4289.91, + "eval_steps_per_second": 67.045, + "step": 7380 + }, + { + "epoch": 0.7873428510547624, + "grad_norm": 0.018683720380067825, + "learning_rate": 4.253142978904752e-06, + "loss": 0.0012, + "step": 7390 + }, + { + "epoch": 0.7873428510547624, + "eval_loss": 0.003029879881069064, + "eval_runtime": 35.0335, + "eval_samples_per_second": 4286.585, + "eval_steps_per_second": 66.993, + "step": 7390 + }, + { + "epoch": 0.7884082676326444, + "grad_norm": 0.39703598618507385, + "learning_rate": 4.231834647347113e-06, + "loss": 0.0031, + "step": 7400 + }, + { + "epoch": 0.7884082676326444, + "eval_loss": 0.0030737167689949274, + "eval_runtime": 35.0217, + "eval_samples_per_second": 4288.024, + "eval_steps_per_second": 67.016, + "step": 7400 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.003899503033608198, + "learning_rate": 4.210526315789474e-06, + "loss": 0.0039, + "step": 7410 + }, + { + "epoch": 0.7894736842105263, + "eval_loss": 0.0030558835715055466, + "eval_runtime": 35.0174, + "eval_samples_per_second": 4288.552, + "eval_steps_per_second": 67.024, + "step": 7410 + }, + { + "epoch": 0.7905391007884083, + "grad_norm": 0.0023945241700857878, + "learning_rate": 4.189217984231835e-06, + "loss": 0.0009, + "step": 7420 + }, + { + "epoch": 0.7905391007884083, + "eval_loss": 0.003028090111911297, + "eval_runtime": 35.0152, + "eval_samples_per_second": 4288.821, + "eval_steps_per_second": 67.028, + "step": 7420 + }, + { + "epoch": 0.7916045173662902, + "grad_norm": 0.007479314226657152, + "learning_rate": 4.167909652674196e-06, + "loss": 0.0027, + "step": 7430 + }, + { + "epoch": 0.7916045173662902, + "eval_loss": 0.0029746410436928272, + "eval_runtime": 35.0281, + "eval_samples_per_second": 4287.249, + "eval_steps_per_second": 67.003, + "step": 7430 + }, + { + "epoch": 0.7926699339441722, + "grad_norm": 0.0067185997031629086, + "learning_rate": 4.146601321116557e-06, + "loss": 0.0106, + "step": 7440 + }, + { + "epoch": 0.7926699339441722, + "eval_loss": 0.0029829549603164196, + "eval_runtime": 35.033, + "eval_samples_per_second": 4286.641, + "eval_steps_per_second": 66.994, + "step": 7440 + }, + { + "epoch": 0.7937353505220541, + "grad_norm": 0.13891682028770447, + "learning_rate": 4.125292989558918e-06, + "loss": 0.0004, + "step": 7450 + }, + { + "epoch": 0.7937353505220541, + "eval_loss": 0.002984261605888605, + "eval_runtime": 35.0187, + "eval_samples_per_second": 4288.393, + "eval_steps_per_second": 67.021, + "step": 7450 + }, + { + "epoch": 0.7948007670999361, + "grad_norm": 0.13743676245212555, + "learning_rate": 4.103984658001279e-06, + "loss": 0.0002, + "step": 7460 + }, + { + "epoch": 0.7948007670999361, + "eval_loss": 0.0029855200555175543, + "eval_runtime": 35.0451, + "eval_samples_per_second": 4285.16, + "eval_steps_per_second": 66.971, + "step": 7460 + }, + { + "epoch": 0.795866183677818, + "grad_norm": 0.2898567020893097, + "learning_rate": 4.08267632644364e-06, + "loss": 0.0004, + "step": 7470 + }, + { + "epoch": 0.795866183677818, + "eval_loss": 0.002985232975333929, + "eval_runtime": 35.0329, + "eval_samples_per_second": 4286.662, + "eval_steps_per_second": 66.994, + "step": 7470 + }, + { + "epoch": 0.7969316002557, + "grad_norm": 0.0027324198745191097, + "learning_rate": 4.061367994886001e-06, + "loss": 0.0088, + "step": 7480 + }, + { + "epoch": 0.7969316002557, + "eval_loss": 0.002921548206359148, + "eval_runtime": 35.0082, + "eval_samples_per_second": 4289.676, + "eval_steps_per_second": 67.041, + "step": 7480 + }, + { + "epoch": 0.7979970168335819, + "grad_norm": 0.6680575609207153, + "learning_rate": 4.040059663328362e-06, + "loss": 0.0046, + "step": 7490 + }, + { + "epoch": 0.7979970168335819, + "eval_loss": 0.00286526489071548, + "eval_runtime": 35.0436, + "eval_samples_per_second": 4285.344, + "eval_steps_per_second": 66.974, + "step": 7490 + }, + { + "epoch": 0.7990624334114639, + "grad_norm": 0.25453507900238037, + "learning_rate": 4.018751331770723e-06, + "loss": 0.001, + "step": 7500 + }, + { + "epoch": 0.7990624334114639, + "eval_loss": 0.0029128112364560366, + "eval_runtime": 35.0244, + "eval_samples_per_second": 4287.697, + "eval_steps_per_second": 67.01, + "step": 7500 + }, + { + "epoch": 0.8001278499893458, + "grad_norm": 0.29362809658050537, + "learning_rate": 3.997443000213084e-06, + "loss": 0.0062, + "step": 7510 + }, + { + "epoch": 0.8001278499893458, + "eval_loss": 0.002864515408873558, + "eval_runtime": 35.0496, + "eval_samples_per_second": 4284.613, + "eval_steps_per_second": 66.962, + "step": 7510 + }, + { + "epoch": 0.8011932665672278, + "grad_norm": 0.0018628902034834027, + "learning_rate": 3.976134668655445e-06, + "loss": 0.0013, + "step": 7520 + }, + { + "epoch": 0.8011932665672278, + "eval_loss": 0.002836798317730427, + "eval_runtime": 35.0704, + "eval_samples_per_second": 4282.074, + "eval_steps_per_second": 66.923, + "step": 7520 + }, + { + "epoch": 0.8022586831451097, + "grad_norm": 0.0024648455437272787, + "learning_rate": 3.954826337097806e-06, + "loss": 0.0083, + "step": 7530 + }, + { + "epoch": 0.8022586831451097, + "eval_loss": 0.0029008083511143923, + "eval_runtime": 35.0548, + "eval_samples_per_second": 4283.981, + "eval_steps_per_second": 66.952, + "step": 7530 + }, + { + "epoch": 0.8033240997229917, + "grad_norm": 0.002339346567168832, + "learning_rate": 3.933518005540167e-06, + "loss": 0.0002, + "step": 7540 + }, + { + "epoch": 0.8033240997229917, + "eval_loss": 0.0029889014549553394, + "eval_runtime": 35.0496, + "eval_samples_per_second": 4284.61, + "eval_steps_per_second": 66.962, + "step": 7540 + }, + { + "epoch": 0.8043895163008736, + "grad_norm": 0.04572073370218277, + "learning_rate": 3.912209673982528e-06, + "loss": 0.0003, + "step": 7550 + }, + { + "epoch": 0.8043895163008736, + "eval_loss": 0.0030233801808208227, + "eval_runtime": 35.0316, + "eval_samples_per_second": 4286.82, + "eval_steps_per_second": 66.997, + "step": 7550 + }, + { + "epoch": 0.8054549328787556, + "grad_norm": 0.09433967620134354, + "learning_rate": 3.890901342424889e-06, + "loss": 0.0003, + "step": 7560 + }, + { + "epoch": 0.8054549328787556, + "eval_loss": 0.0030390520114451647, + "eval_runtime": 35.0528, + "eval_samples_per_second": 4284.227, + "eval_steps_per_second": 66.956, + "step": 7560 + }, + { + "epoch": 0.8065203494566375, + "grad_norm": 0.004556519910693169, + "learning_rate": 3.869593010867249e-06, + "loss": 0.0119, + "step": 7570 + }, + { + "epoch": 0.8065203494566375, + "eval_loss": 0.0029982631094753742, + "eval_runtime": 35.0447, + "eval_samples_per_second": 4285.217, + "eval_steps_per_second": 66.972, + "step": 7570 + }, + { + "epoch": 0.8075857660345195, + "grad_norm": 0.004369661677628756, + "learning_rate": 3.84828467930961e-06, + "loss": 0.0074, + "step": 7580 + }, + { + "epoch": 0.8075857660345195, + "eval_loss": 0.0028656297363340855, + "eval_runtime": 35.0204, + "eval_samples_per_second": 4288.185, + "eval_steps_per_second": 67.018, + "step": 7580 + }, + { + "epoch": 0.8086511826124014, + "grad_norm": 0.37288787961006165, + "learning_rate": 3.826976347751971e-06, + "loss": 0.0027, + "step": 7590 + }, + { + "epoch": 0.8086511826124014, + "eval_loss": 0.0028651338070631027, + "eval_runtime": 35.0362, + "eval_samples_per_second": 4286.255, + "eval_steps_per_second": 66.988, + "step": 7590 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 0.007273674942553043, + "learning_rate": 3.805668016194332e-06, + "loss": 0.0075, + "step": 7600 + }, + { + "epoch": 0.8097165991902834, + "eval_loss": 0.002874514786526561, + "eval_runtime": 35.0364, + "eval_samples_per_second": 4286.227, + "eval_steps_per_second": 66.987, + "step": 7600 + }, + { + "epoch": 0.8107820157681653, + "grad_norm": 0.003154418431222439, + "learning_rate": 3.784359684636693e-06, + "loss": 0.0016, + "step": 7610 + }, + { + "epoch": 0.8107820157681653, + "eval_loss": 0.0028334720991551876, + "eval_runtime": 35.0032, + "eval_samples_per_second": 4290.298, + "eval_steps_per_second": 67.051, + "step": 7610 + }, + { + "epoch": 0.8118474323460473, + "grad_norm": 0.16729117929935455, + "learning_rate": 3.763051353079054e-06, + "loss": 0.0003, + "step": 7620 + }, + { + "epoch": 0.8118474323460473, + "eval_loss": 0.0028151795268058777, + "eval_runtime": 35.1147, + "eval_samples_per_second": 4276.674, + "eval_steps_per_second": 66.838, + "step": 7620 + }, + { + "epoch": 0.8129128489239292, + "grad_norm": 0.11129946261644363, + "learning_rate": 3.741743021521415e-06, + "loss": 0.0036, + "step": 7630 + }, + { + "epoch": 0.8129128489239292, + "eval_loss": 0.0028255251236259937, + "eval_runtime": 35.0946, + "eval_samples_per_second": 4279.114, + "eval_steps_per_second": 66.876, + "step": 7630 + }, + { + "epoch": 0.8139782655018112, + "grad_norm": 0.006738661322742701, + "learning_rate": 3.720434689963776e-06, + "loss": 0.0038, + "step": 7640 + }, + { + "epoch": 0.8139782655018112, + "eval_loss": 0.002836094470694661, + "eval_runtime": 35.0731, + "eval_samples_per_second": 4281.737, + "eval_steps_per_second": 66.917, + "step": 7640 + }, + { + "epoch": 0.8150436820796931, + "grad_norm": 0.008290871046483517, + "learning_rate": 3.699126358406137e-06, + "loss": 0.0003, + "step": 7650 + }, + { + "epoch": 0.8150436820796931, + "eval_loss": 0.002835857914760709, + "eval_runtime": 35.0577, + "eval_samples_per_second": 4283.622, + "eval_steps_per_second": 66.947, + "step": 7650 + }, + { + "epoch": 0.8161090986575751, + "grad_norm": 0.0021515628322958946, + "learning_rate": 3.677818026848498e-06, + "loss": 0.0003, + "step": 7660 + }, + { + "epoch": 0.8161090986575751, + "eval_loss": 0.0028307398315519094, + "eval_runtime": 35.0504, + "eval_samples_per_second": 4284.522, + "eval_steps_per_second": 66.961, + "step": 7660 + }, + { + "epoch": 0.817174515235457, + "grad_norm": 0.0018256115727126598, + "learning_rate": 3.656509695290859e-06, + "loss": 0.0024, + "step": 7670 + }, + { + "epoch": 0.817174515235457, + "eval_loss": 0.002814466366544366, + "eval_runtime": 35.0893, + "eval_samples_per_second": 4279.763, + "eval_steps_per_second": 66.886, + "step": 7670 + }, + { + "epoch": 0.818239931813339, + "grad_norm": 0.6402817368507385, + "learning_rate": 3.63520136373322e-06, + "loss": 0.0021, + "step": 7680 + }, + { + "epoch": 0.818239931813339, + "eval_loss": 0.0028463115449994802, + "eval_runtime": 35.067, + "eval_samples_per_second": 4282.489, + "eval_steps_per_second": 66.929, + "step": 7680 + }, + { + "epoch": 0.8193053483912209, + "grad_norm": 0.0017836468759924173, + "learning_rate": 3.613893032175581e-06, + "loss": 0.0006, + "step": 7690 + }, + { + "epoch": 0.8193053483912209, + "eval_loss": 0.0028544815722852945, + "eval_runtime": 35.0337, + "eval_samples_per_second": 4286.553, + "eval_steps_per_second": 66.993, + "step": 7690 + }, + { + "epoch": 0.8203707649691029, + "grad_norm": 0.005954293999820948, + "learning_rate": 3.592584700617942e-06, + "loss": 0.0004, + "step": 7700 + }, + { + "epoch": 0.8203707649691029, + "eval_loss": 0.0028285484295338392, + "eval_runtime": 35.055, + "eval_samples_per_second": 4283.948, + "eval_steps_per_second": 66.952, + "step": 7700 + }, + { + "epoch": 0.8214361815469848, + "grad_norm": 0.002660792786628008, + "learning_rate": 3.571276369060303e-06, + "loss": 0.0053, + "step": 7710 + }, + { + "epoch": 0.8214361815469848, + "eval_loss": 0.0027588389348238707, + "eval_runtime": 35.0245, + "eval_samples_per_second": 4287.679, + "eval_steps_per_second": 67.01, + "step": 7710 + }, + { + "epoch": 0.8225015981248668, + "grad_norm": 0.006484444718807936, + "learning_rate": 3.549968037502664e-06, + "loss": 0.0029, + "step": 7720 + }, + { + "epoch": 0.8225015981248668, + "eval_loss": 0.0027692620642483234, + "eval_runtime": 35.0538, + "eval_samples_per_second": 4284.096, + "eval_steps_per_second": 66.954, + "step": 7720 + }, + { + "epoch": 0.8235670147027487, + "grad_norm": 0.003297739662230015, + "learning_rate": 3.528659705945025e-06, + "loss": 0.0017, + "step": 7730 + }, + { + "epoch": 0.8235670147027487, + "eval_loss": 0.0028382448945194483, + "eval_runtime": 35.0157, + "eval_samples_per_second": 4288.766, + "eval_steps_per_second": 67.027, + "step": 7730 + }, + { + "epoch": 0.8246324312806307, + "grad_norm": 0.001944978255778551, + "learning_rate": 3.5073513743873855e-06, + "loss": 0.0025, + "step": 7740 + }, + { + "epoch": 0.8246324312806307, + "eval_loss": 0.0028590108267962933, + "eval_runtime": 35.0189, + "eval_samples_per_second": 4288.367, + "eval_steps_per_second": 67.021, + "step": 7740 + }, + { + "epoch": 0.8256978478585126, + "grad_norm": 0.0017903875559568405, + "learning_rate": 3.4860430428297465e-06, + "loss": 0.0133, + "step": 7750 + }, + { + "epoch": 0.8256978478585126, + "eval_loss": 0.0028446416836231947, + "eval_runtime": 35.0345, + "eval_samples_per_second": 4286.465, + "eval_steps_per_second": 66.991, + "step": 7750 + }, + { + "epoch": 0.8267632644363946, + "grad_norm": 0.0015631518326699734, + "learning_rate": 3.4647347112721075e-06, + "loss": 0.0064, + "step": 7760 + }, + { + "epoch": 0.8267632644363946, + "eval_loss": 0.0028156498447060585, + "eval_runtime": 35.0207, + "eval_samples_per_second": 4288.149, + "eval_steps_per_second": 67.017, + "step": 7760 + }, + { + "epoch": 0.8278286810142765, + "grad_norm": 0.0022290684282779694, + "learning_rate": 3.4434263797144685e-06, + "loss": 0.0021, + "step": 7770 + }, + { + "epoch": 0.8278286810142765, + "eval_loss": 0.0028021347243338823, + "eval_runtime": 35.0059, + "eval_samples_per_second": 4289.966, + "eval_steps_per_second": 67.046, + "step": 7770 + }, + { + "epoch": 0.8288940975921585, + "grad_norm": 0.001918564666993916, + "learning_rate": 3.4221180481568295e-06, + "loss": 0.0002, + "step": 7780 + }, + { + "epoch": 0.8288940975921585, + "eval_loss": 0.0027851953636854887, + "eval_runtime": 35.0363, + "eval_samples_per_second": 4286.241, + "eval_steps_per_second": 66.988, + "step": 7780 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 0.027464309707283974, + "learning_rate": 3.4008097165991905e-06, + "loss": 0.0016, + "step": 7790 + }, + { + "epoch": 0.8299595141700404, + "eval_loss": 0.002777885412797332, + "eval_runtime": 35.0127, + "eval_samples_per_second": 4289.128, + "eval_steps_per_second": 67.033, + "step": 7790 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 0.004105029162019491, + "learning_rate": 3.3795013850415515e-06, + "loss": 0.0035, + "step": 7800 + }, + { + "epoch": 0.8310249307479224, + "eval_loss": 0.0027947339694947004, + "eval_runtime": 35.0353, + "eval_samples_per_second": 4286.367, + "eval_steps_per_second": 66.99, + "step": 7800 + }, + { + "epoch": 0.8320903473258043, + "grad_norm": 0.5708588361740112, + "learning_rate": 3.3581930534839125e-06, + "loss": 0.0043, + "step": 7810 + }, + { + "epoch": 0.8320903473258043, + "eval_loss": 0.0027894387021660805, + "eval_runtime": 35.0245, + "eval_samples_per_second": 4287.678, + "eval_steps_per_second": 67.01, + "step": 7810 + }, + { + "epoch": 0.8331557639036863, + "grad_norm": 0.002082349034026265, + "learning_rate": 3.3368847219262734e-06, + "loss": 0.0073, + "step": 7820 + }, + { + "epoch": 0.8331557639036863, + "eval_loss": 0.002818479435518384, + "eval_runtime": 35.0308, + "eval_samples_per_second": 4286.913, + "eval_steps_per_second": 66.998, + "step": 7820 + }, + { + "epoch": 0.8342211804815683, + "grad_norm": 0.0014790042769163847, + "learning_rate": 3.3155763903686344e-06, + "loss": 0.001, + "step": 7830 + }, + { + "epoch": 0.8342211804815683, + "eval_loss": 0.002822867361828685, + "eval_runtime": 35.0817, + "eval_samples_per_second": 4280.692, + "eval_steps_per_second": 66.901, + "step": 7830 + }, + { + "epoch": 0.8352865970594503, + "grad_norm": 0.025291219353675842, + "learning_rate": 3.2942680588109954e-06, + "loss": 0.0011, + "step": 7840 + }, + { + "epoch": 0.8352865970594503, + "eval_loss": 0.0028085343074053526, + "eval_runtime": 35.0213, + "eval_samples_per_second": 4288.076, + "eval_steps_per_second": 67.016, + "step": 7840 + }, + { + "epoch": 0.8363520136373322, + "grad_norm": 0.0024894457310438156, + "learning_rate": 3.2729597272533564e-06, + "loss": 0.0004, + "step": 7850 + }, + { + "epoch": 0.8363520136373322, + "eval_loss": 0.0028004287742078304, + "eval_runtime": 34.9977, + "eval_samples_per_second": 4290.973, + "eval_steps_per_second": 67.062, + "step": 7850 + }, + { + "epoch": 0.8374174302152142, + "grad_norm": 0.001692480524070561, + "learning_rate": 3.2516513956957174e-06, + "loss": 0.0002, + "step": 7860 + }, + { + "epoch": 0.8374174302152142, + "eval_loss": 0.0027998967561870813, + "eval_runtime": 35.0112, + "eval_samples_per_second": 4289.317, + "eval_steps_per_second": 67.036, + "step": 7860 + }, + { + "epoch": 0.8384828467930961, + "grad_norm": 0.0016112946905195713, + "learning_rate": 3.2303430641380784e-06, + "loss": 0.0061, + "step": 7870 + }, + { + "epoch": 0.8384828467930961, + "eval_loss": 0.0027837178204208612, + "eval_runtime": 35.0425, + "eval_samples_per_second": 4285.484, + "eval_steps_per_second": 66.976, + "step": 7870 + }, + { + "epoch": 0.8395482633709781, + "grad_norm": 0.047582581639289856, + "learning_rate": 3.2090347325804394e-06, + "loss": 0.0168, + "step": 7880 + }, + { + "epoch": 0.8395482633709781, + "eval_loss": 0.002785380929708481, + "eval_runtime": 35.0378, + "eval_samples_per_second": 4286.056, + "eval_steps_per_second": 66.985, + "step": 7880 + }, + { + "epoch": 0.84061367994886, + "grad_norm": 0.0018558768788352609, + "learning_rate": 3.1877264010228004e-06, + "loss": 0.0011, + "step": 7890 + }, + { + "epoch": 0.84061367994886, + "eval_loss": 0.002787909237667918, + "eval_runtime": 35.0402, + "eval_samples_per_second": 4285.761, + "eval_steps_per_second": 66.98, + "step": 7890 + }, + { + "epoch": 0.841679096526742, + "grad_norm": 0.0328022725880146, + "learning_rate": 3.1664180694651614e-06, + "loss": 0.0006, + "step": 7900 + }, + { + "epoch": 0.841679096526742, + "eval_loss": 0.0027848321478813887, + "eval_runtime": 35.0488, + "eval_samples_per_second": 4284.716, + "eval_steps_per_second": 66.964, + "step": 7900 + }, + { + "epoch": 0.842744513104624, + "grad_norm": 0.0023002829402685165, + "learning_rate": 3.145109737907522e-06, + "loss": 0.0015, + "step": 7910 + }, + { + "epoch": 0.842744513104624, + "eval_loss": 0.00278343609534204, + "eval_runtime": 35.0285, + "eval_samples_per_second": 4287.2, + "eval_steps_per_second": 67.003, + "step": 7910 + }, + { + "epoch": 0.8438099296825059, + "grad_norm": 0.001770269824191928, + "learning_rate": 3.123801406349883e-06, + "loss": 0.0035, + "step": 7920 + }, + { + "epoch": 0.8438099296825059, + "eval_loss": 0.0027938741259276867, + "eval_runtime": 35.0162, + "eval_samples_per_second": 4288.704, + "eval_steps_per_second": 67.026, + "step": 7920 + }, + { + "epoch": 0.8448753462603878, + "grad_norm": 0.0020561525598168373, + "learning_rate": 3.102493074792244e-06, + "loss": 0.0004, + "step": 7930 + }, + { + "epoch": 0.8448753462603878, + "eval_loss": 0.0028133615851402283, + "eval_runtime": 35.0294, + "eval_samples_per_second": 4287.089, + "eval_steps_per_second": 67.001, + "step": 7930 + }, + { + "epoch": 0.8459407628382698, + "grad_norm": 0.023834535852074623, + "learning_rate": 3.081184743234605e-06, + "loss": 0.0002, + "step": 7940 + }, + { + "epoch": 0.8459407628382698, + "eval_loss": 0.002823204966261983, + "eval_runtime": 34.9882, + "eval_samples_per_second": 4292.129, + "eval_steps_per_second": 67.08, + "step": 7940 + }, + { + "epoch": 0.8470061794161517, + "grad_norm": 0.0046548559330403805, + "learning_rate": 3.059876411676966e-06, + "loss": 0.0004, + "step": 7950 + }, + { + "epoch": 0.8470061794161517, + "eval_loss": 0.0028295184019953012, + "eval_runtime": 34.9818, + "eval_samples_per_second": 4292.921, + "eval_steps_per_second": 67.092, + "step": 7950 + }, + { + "epoch": 0.8480715959940337, + "grad_norm": 0.007586074061691761, + "learning_rate": 3.038568080119327e-06, + "loss": 0.0004, + "step": 7960 + }, + { + "epoch": 0.8480715959940337, + "eval_loss": 0.0028241388499736786, + "eval_runtime": 34.9874, + "eval_samples_per_second": 4292.226, + "eval_steps_per_second": 67.081, + "step": 7960 + }, + { + "epoch": 0.8491370125719157, + "grad_norm": 0.0014697522856295109, + "learning_rate": 3.017259748561688e-06, + "loss": 0.0023, + "step": 7970 + }, + { + "epoch": 0.8491370125719157, + "eval_loss": 0.002786256605759263, + "eval_runtime": 35.0455, + "eval_samples_per_second": 4285.115, + "eval_steps_per_second": 66.97, + "step": 7970 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 0.006472844164818525, + "learning_rate": 2.995951417004049e-06, + "loss": 0.0005, + "step": 7980 + }, + { + "epoch": 0.8502024291497976, + "eval_loss": 0.0027799701783806086, + "eval_runtime": 35.0498, + "eval_samples_per_second": 4284.593, + "eval_steps_per_second": 66.962, + "step": 7980 + }, + { + "epoch": 0.8512678457276796, + "grad_norm": 0.16366152465343475, + "learning_rate": 2.97464308544641e-06, + "loss": 0.0025, + "step": 7990 + }, + { + "epoch": 0.8512678457276796, + "eval_loss": 0.002784137846902013, + "eval_runtime": 35.0048, + "eval_samples_per_second": 4290.101, + "eval_steps_per_second": 67.048, + "step": 7990 + }, + { + "epoch": 0.8523332623055615, + "grad_norm": 0.00848406832665205, + "learning_rate": 2.953334753888771e-06, + "loss": 0.0001, + "step": 8000 + }, + { + "epoch": 0.8523332623055615, + "eval_loss": 0.002791937440633774, + "eval_runtime": 35.0313, + "eval_samples_per_second": 4286.847, + "eval_steps_per_second": 66.997, + "step": 8000 + }, + { + "epoch": 0.8533986788834435, + "grad_norm": 0.0016626849537715316, + "learning_rate": 2.932026422331132e-06, + "loss": 0.0016, + "step": 8010 + }, + { + "epoch": 0.8533986788834435, + "eval_loss": 0.0027943544555455446, + "eval_runtime": 35.0983, + "eval_samples_per_second": 4278.668, + "eval_steps_per_second": 66.869, + "step": 8010 + }, + { + "epoch": 0.8544640954613254, + "grad_norm": 0.0065400260500609875, + "learning_rate": 2.910718090773493e-06, + "loss": 0.0024, + "step": 8020 + }, + { + "epoch": 0.8544640954613254, + "eval_loss": 0.0027912973891943693, + "eval_runtime": 35.0664, + "eval_samples_per_second": 4282.557, + "eval_steps_per_second": 66.93, + "step": 8020 + }, + { + "epoch": 0.8555295120392074, + "grad_norm": 0.002638779580593109, + "learning_rate": 2.889409759215854e-06, + "loss": 0.007, + "step": 8030 + }, + { + "epoch": 0.8555295120392074, + "eval_loss": 0.002751028398051858, + "eval_runtime": 35.0158, + "eval_samples_per_second": 4288.751, + "eval_steps_per_second": 67.027, + "step": 8030 + }, + { + "epoch": 0.8565949286170893, + "grad_norm": 0.1178533062338829, + "learning_rate": 2.868101427658215e-06, + "loss": 0.001, + "step": 8040 + }, + { + "epoch": 0.8565949286170893, + "eval_loss": 0.0027414588257670403, + "eval_runtime": 35.0552, + "eval_samples_per_second": 4283.934, + "eval_steps_per_second": 66.952, + "step": 8040 + }, + { + "epoch": 0.8576603451949713, + "grad_norm": 0.008728962391614914, + "learning_rate": 2.846793096100576e-06, + "loss": 0.0006, + "step": 8050 + }, + { + "epoch": 0.8576603451949713, + "eval_loss": 0.00275249220430851, + "eval_runtime": 35.0312, + "eval_samples_per_second": 4286.86, + "eval_steps_per_second": 66.997, + "step": 8050 + }, + { + "epoch": 0.8587257617728532, + "grad_norm": 0.004858131520450115, + "learning_rate": 2.8254847645429368e-06, + "loss": 0.0002, + "step": 8060 + }, + { + "epoch": 0.8587257617728532, + "eval_loss": 0.002762093674391508, + "eval_runtime": 35.0276, + "eval_samples_per_second": 4287.304, + "eval_steps_per_second": 67.004, + "step": 8060 + }, + { + "epoch": 0.8597911783507352, + "grad_norm": 0.0031513080466538668, + "learning_rate": 2.8041764329852978e-06, + "loss": 0.0019, + "step": 8070 + }, + { + "epoch": 0.8597911783507352, + "eval_loss": 0.0027698467019945383, + "eval_runtime": 35.0124, + "eval_samples_per_second": 4289.171, + "eval_steps_per_second": 67.033, + "step": 8070 + }, + { + "epoch": 0.8608565949286171, + "grad_norm": 0.0038100427482277155, + "learning_rate": 2.7828681014276583e-06, + "loss": 0.0007, + "step": 8080 + }, + { + "epoch": 0.8608565949286171, + "eval_loss": 0.0027781969401985407, + "eval_runtime": 35.0691, + "eval_samples_per_second": 4282.23, + "eval_steps_per_second": 66.925, + "step": 8080 + }, + { + "epoch": 0.861922011506499, + "grad_norm": 0.003881295910105109, + "learning_rate": 2.7615597698700193e-06, + "loss": 0.0011, + "step": 8090 + }, + { + "epoch": 0.861922011506499, + "eval_loss": 0.0027934699319303036, + "eval_runtime": 35.0896, + "eval_samples_per_second": 4279.729, + "eval_steps_per_second": 66.886, + "step": 8090 + }, + { + "epoch": 0.862987428084381, + "grad_norm": 0.0016517649637535214, + "learning_rate": 2.7402514383123803e-06, + "loss": 0.0007, + "step": 8100 + }, + { + "epoch": 0.862987428084381, + "eval_loss": 0.0028144221287220716, + "eval_runtime": 35.0148, + "eval_samples_per_second": 4288.867, + "eval_steps_per_second": 67.029, + "step": 8100 + }, + { + "epoch": 0.864052844662263, + "grad_norm": 1.7808645963668823, + "learning_rate": 2.7189431067547413e-06, + "loss": 0.0011, + "step": 8110 + }, + { + "epoch": 0.864052844662263, + "eval_loss": 0.0028445336502045393, + "eval_runtime": 35.0679, + "eval_samples_per_second": 4282.38, + "eval_steps_per_second": 66.927, + "step": 8110 + }, + { + "epoch": 0.8651182612401449, + "grad_norm": 3.285395383834839, + "learning_rate": 2.6976347751971023e-06, + "loss": 0.004, + "step": 8120 + }, + { + "epoch": 0.8651182612401449, + "eval_loss": 0.0028635459020733833, + "eval_runtime": 35.0252, + "eval_samples_per_second": 4287.6, + "eval_steps_per_second": 67.009, + "step": 8120 + }, + { + "epoch": 0.8661836778180269, + "grad_norm": 0.033276911824941635, + "learning_rate": 2.6763264436394633e-06, + "loss": 0.0161, + "step": 8130 + }, + { + "epoch": 0.8661836778180269, + "eval_loss": 0.002783233532682061, + "eval_runtime": 35.0088, + "eval_samples_per_second": 4289.605, + "eval_steps_per_second": 67.04, + "step": 8130 + }, + { + "epoch": 0.8672490943959088, + "grad_norm": 0.015310313552618027, + "learning_rate": 2.6550181120818243e-06, + "loss": 0.0001, + "step": 8140 + }, + { + "epoch": 0.8672490943959088, + "eval_loss": 0.002757697133347392, + "eval_runtime": 35.0601, + "eval_samples_per_second": 4283.329, + "eval_steps_per_second": 66.942, + "step": 8140 + }, + { + "epoch": 0.8683145109737908, + "grad_norm": 0.012751123867928982, + "learning_rate": 2.6337097805241853e-06, + "loss": 0.0011, + "step": 8150 + }, + { + "epoch": 0.8683145109737908, + "eval_loss": 0.002762366319075227, + "eval_runtime": 35.047, + "eval_samples_per_second": 4284.926, + "eval_steps_per_second": 66.967, + "step": 8150 + }, + { + "epoch": 0.8693799275516727, + "grad_norm": 0.05020337924361229, + "learning_rate": 2.6124014489665463e-06, + "loss": 0.0173, + "step": 8160 + }, + { + "epoch": 0.8693799275516727, + "eval_loss": 0.0027494090609252453, + "eval_runtime": 35.1731, + "eval_samples_per_second": 4269.564, + "eval_steps_per_second": 66.727, + "step": 8160 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 0.029232144355773926, + "learning_rate": 2.5910931174089072e-06, + "loss": 0.0003, + "step": 8170 + }, + { + "epoch": 0.8704453441295547, + "eval_loss": 0.0027434728108346462, + "eval_runtime": 35.0502, + "eval_samples_per_second": 4284.538, + "eval_steps_per_second": 66.961, + "step": 8170 + }, + { + "epoch": 0.8715107607074366, + "grad_norm": 0.07336370646953583, + "learning_rate": 2.5697847858512682e-06, + "loss": 0.0007, + "step": 8180 + }, + { + "epoch": 0.8715107607074366, + "eval_loss": 0.002746333135291934, + "eval_runtime": 35.0401, + "eval_samples_per_second": 4285.776, + "eval_steps_per_second": 66.98, + "step": 8180 + }, + { + "epoch": 0.8725761772853186, + "grad_norm": 0.009558520279824734, + "learning_rate": 2.5484764542936292e-06, + "loss": 0.0039, + "step": 8190 + }, + { + "epoch": 0.8725761772853186, + "eval_loss": 0.002741629723459482, + "eval_runtime": 35.0868, + "eval_samples_per_second": 4280.076, + "eval_steps_per_second": 66.891, + "step": 8190 + }, + { + "epoch": 0.8736415938632005, + "grad_norm": 0.030061665922403336, + "learning_rate": 2.5271681227359902e-06, + "loss": 0.0003, + "step": 8200 + }, + { + "epoch": 0.8736415938632005, + "eval_loss": 0.0027512703090906143, + "eval_runtime": 35.0542, + "eval_samples_per_second": 4284.054, + "eval_steps_per_second": 66.953, + "step": 8200 + }, + { + "epoch": 0.8747070104410825, + "grad_norm": 0.0030335835181176662, + "learning_rate": 2.505859791178351e-06, + "loss": 0.0004, + "step": 8210 + }, + { + "epoch": 0.8747070104410825, + "eval_loss": 0.0027557830326259136, + "eval_runtime": 35.0459, + "eval_samples_per_second": 4285.067, + "eval_steps_per_second": 66.969, + "step": 8210 + }, + { + "epoch": 0.8757724270189644, + "grad_norm": 0.005516626872122288, + "learning_rate": 2.4845514596207118e-06, + "loss": 0.0015, + "step": 8220 + }, + { + "epoch": 0.8757724270189644, + "eval_loss": 0.00276589160785079, + "eval_runtime": 35.0691, + "eval_samples_per_second": 4282.235, + "eval_steps_per_second": 66.925, + "step": 8220 + }, + { + "epoch": 0.8768378435968464, + "grad_norm": 2.0706310272216797, + "learning_rate": 2.4632431280630728e-06, + "loss": 0.0165, + "step": 8230 + }, + { + "epoch": 0.8768378435968464, + "eval_loss": 0.002770791994407773, + "eval_runtime": 35.0091, + "eval_samples_per_second": 4289.571, + "eval_steps_per_second": 67.04, + "step": 8230 + }, + { + "epoch": 0.8779032601747283, + "grad_norm": 0.862779974937439, + "learning_rate": 2.4419347965054338e-06, + "loss": 0.0037, + "step": 8240 + }, + { + "epoch": 0.8779032601747283, + "eval_loss": 0.0027604245115071535, + "eval_runtime": 35.002, + "eval_samples_per_second": 4290.441, + "eval_steps_per_second": 67.053, + "step": 8240 + }, + { + "epoch": 0.8789686767526103, + "grad_norm": 0.07593127340078354, + "learning_rate": 2.4206264649477947e-06, + "loss": 0.0013, + "step": 8250 + }, + { + "epoch": 0.8789686767526103, + "eval_loss": 0.0027596699073910713, + "eval_runtime": 35.0447, + "eval_samples_per_second": 4285.216, + "eval_steps_per_second": 66.972, + "step": 8250 + }, + { + "epoch": 0.8800340933304922, + "grad_norm": 0.004259423352777958, + "learning_rate": 2.3993181333901557e-06, + "loss": 0.0007, + "step": 8260 + }, + { + "epoch": 0.8800340933304922, + "eval_loss": 0.002764316974207759, + "eval_runtime": 35.0623, + "eval_samples_per_second": 4283.062, + "eval_steps_per_second": 66.938, + "step": 8260 + }, + { + "epoch": 0.8810995099083742, + "grad_norm": 0.0013831878313794732, + "learning_rate": 2.3780098018325167e-06, + "loss": 0.0007, + "step": 8270 + }, + { + "epoch": 0.8810995099083742, + "eval_loss": 0.002769648330286145, + "eval_runtime": 35.0422, + "eval_samples_per_second": 4285.516, + "eval_steps_per_second": 66.976, + "step": 8270 + }, + { + "epoch": 0.8821649264862561, + "grad_norm": 0.002447050530463457, + "learning_rate": 2.3567014702748777e-06, + "loss": 0.0035, + "step": 8280 + }, + { + "epoch": 0.8821649264862561, + "eval_loss": 0.002767772413790226, + "eval_runtime": 35.0206, + "eval_samples_per_second": 4288.167, + "eval_steps_per_second": 67.018, + "step": 8280 + }, + { + "epoch": 0.8832303430641381, + "grad_norm": 0.0015266514383256435, + "learning_rate": 2.3353931387172387e-06, + "loss": 0.0047, + "step": 8290 + }, + { + "epoch": 0.8832303430641381, + "eval_loss": 0.002763263415545225, + "eval_runtime": 35.0695, + "eval_samples_per_second": 4282.187, + "eval_steps_per_second": 66.924, + "step": 8290 + }, + { + "epoch": 0.88429575964202, + "grad_norm": 0.08378314226865768, + "learning_rate": 2.3140848071595997e-06, + "loss": 0.0028, + "step": 8300 + }, + { + "epoch": 0.88429575964202, + "eval_loss": 0.0027693863958120346, + "eval_runtime": 35.0809, + "eval_samples_per_second": 4280.795, + "eval_steps_per_second": 66.903, + "step": 8300 + }, + { + "epoch": 0.885361176219902, + "grad_norm": 0.002748900791630149, + "learning_rate": 2.2927764756019607e-06, + "loss": 0.0034, + "step": 8310 + }, + { + "epoch": 0.885361176219902, + "eval_loss": 0.002776265610009432, + "eval_runtime": 35.022, + "eval_samples_per_second": 4287.987, + "eval_steps_per_second": 67.015, + "step": 8310 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 0.025571728125214577, + "learning_rate": 2.2714681440443217e-06, + "loss": 0.0004, + "step": 8320 + }, + { + "epoch": 0.8864265927977839, + "eval_loss": 0.0027706564869731665, + "eval_runtime": 35.0191, + "eval_samples_per_second": 4288.346, + "eval_steps_per_second": 67.021, + "step": 8320 + }, + { + "epoch": 0.8874920093756659, + "grad_norm": 0.007666856050491333, + "learning_rate": 2.2501598124866827e-06, + "loss": 0.0123, + "step": 8330 + }, + { + "epoch": 0.8874920093756659, + "eval_loss": 0.002764170989394188, + "eval_runtime": 35.008, + "eval_samples_per_second": 4289.7, + "eval_steps_per_second": 67.042, + "step": 8330 + }, + { + "epoch": 0.8885574259535478, + "grad_norm": 0.0209694541990757, + "learning_rate": 2.2288514809290437e-06, + "loss": 0.0004, + "step": 8340 + }, + { + "epoch": 0.8885574259535478, + "eval_loss": 0.002746229525655508, + "eval_runtime": 35.0307, + "eval_samples_per_second": 4286.93, + "eval_steps_per_second": 66.998, + "step": 8340 + }, + { + "epoch": 0.8896228425314298, + "grad_norm": 0.0023976133670657873, + "learning_rate": 2.2075431493714046e-06, + "loss": 0.001, + "step": 8350 + }, + { + "epoch": 0.8896228425314298, + "eval_loss": 0.002741154283285141, + "eval_runtime": 35.0325, + "eval_samples_per_second": 4286.707, + "eval_steps_per_second": 66.995, + "step": 8350 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.24398835003376007, + "learning_rate": 2.1862348178137656e-06, + "loss": 0.0009, + "step": 8360 + }, + { + "epoch": 0.8906882591093117, + "eval_loss": 0.002744528232142329, + "eval_runtime": 35.0342, + "eval_samples_per_second": 4286.492, + "eval_steps_per_second": 66.992, + "step": 8360 + }, + { + "epoch": 0.8917536756871937, + "grad_norm": 0.03572320565581322, + "learning_rate": 2.1649264862561266e-06, + "loss": 0.0004, + "step": 8370 + }, + { + "epoch": 0.8917536756871937, + "eval_loss": 0.0027611658442765474, + "eval_runtime": 35.0407, + "eval_samples_per_second": 4285.701, + "eval_steps_per_second": 66.979, + "step": 8370 + }, + { + "epoch": 0.8928190922650756, + "grad_norm": 0.6422826647758484, + "learning_rate": 2.143618154698487e-06, + "loss": 0.0024, + "step": 8380 + }, + { + "epoch": 0.8928190922650756, + "eval_loss": 0.0027707030531018972, + "eval_runtime": 35.062, + "eval_samples_per_second": 4283.101, + "eval_steps_per_second": 66.939, + "step": 8380 + }, + { + "epoch": 0.8938845088429576, + "grad_norm": 0.0015818601241335273, + "learning_rate": 2.122309823140848e-06, + "loss": 0.0045, + "step": 8390 + }, + { + "epoch": 0.8938845088429576, + "eval_loss": 0.0027499543502926826, + "eval_runtime": 35.0496, + "eval_samples_per_second": 4284.608, + "eval_steps_per_second": 66.962, + "step": 8390 + }, + { + "epoch": 0.8949499254208395, + "grad_norm": 0.0156484916806221, + "learning_rate": 2.101001491583209e-06, + "loss": 0.0084, + "step": 8400 + }, + { + "epoch": 0.8949499254208395, + "eval_loss": 0.0027432774659246206, + "eval_runtime": 35.0349, + "eval_samples_per_second": 4286.407, + "eval_steps_per_second": 66.99, + "step": 8400 + }, + { + "epoch": 0.8960153419987215, + "grad_norm": 0.0045946515165269375, + "learning_rate": 2.07969316002557e-06, + "loss": 0.0011, + "step": 8410 + }, + { + "epoch": 0.8960153419987215, + "eval_loss": 0.002739608520641923, + "eval_runtime": 35.0416, + "eval_samples_per_second": 4285.598, + "eval_steps_per_second": 66.978, + "step": 8410 + }, + { + "epoch": 0.8970807585766034, + "grad_norm": 0.0017706300131976604, + "learning_rate": 2.058384828467931e-06, + "loss": 0.0034, + "step": 8420 + }, + { + "epoch": 0.8970807585766034, + "eval_loss": 0.0027431268244981766, + "eval_runtime": 35.0275, + "eval_samples_per_second": 4287.323, + "eval_steps_per_second": 67.005, + "step": 8420 + }, + { + "epoch": 0.8981461751544854, + "grad_norm": 0.002228514524176717, + "learning_rate": 2.037076496910292e-06, + "loss": 0.0015, + "step": 8430 + }, + { + "epoch": 0.8981461751544854, + "eval_loss": 0.0027508740313351154, + "eval_runtime": 35.0118, + "eval_samples_per_second": 4289.244, + "eval_steps_per_second": 67.035, + "step": 8430 + }, + { + "epoch": 0.8992115917323673, + "grad_norm": 0.0016633226769044995, + "learning_rate": 2.015768165352653e-06, + "loss": 0.0007, + "step": 8440 + }, + { + "epoch": 0.8992115917323673, + "eval_loss": 0.0027630003169178963, + "eval_runtime": 35.0398, + "eval_samples_per_second": 4285.81, + "eval_steps_per_second": 66.981, + "step": 8440 + }, + { + "epoch": 0.9002770083102493, + "grad_norm": 0.0019834586419165134, + "learning_rate": 1.994459833795014e-06, + "loss": 0.0059, + "step": 8450 + }, + { + "epoch": 0.9002770083102493, + "eval_loss": 0.002752769272774458, + "eval_runtime": 35.0758, + "eval_samples_per_second": 4281.418, + "eval_steps_per_second": 66.912, + "step": 8450 + }, + { + "epoch": 0.9013424248881312, + "grad_norm": 0.17420539259910583, + "learning_rate": 1.9731515022373747e-06, + "loss": 0.0006, + "step": 8460 + }, + { + "epoch": 0.9013424248881312, + "eval_loss": 0.002748524770140648, + "eval_runtime": 35.0817, + "eval_samples_per_second": 4280.698, + "eval_steps_per_second": 66.901, + "step": 8460 + }, + { + "epoch": 0.9024078414660132, + "grad_norm": 0.003549454268068075, + "learning_rate": 1.9518431706797357e-06, + "loss": 0.0001, + "step": 8470 + }, + { + "epoch": 0.9024078414660132, + "eval_loss": 0.0027478199917823076, + "eval_runtime": 35.0417, + "eval_samples_per_second": 4285.583, + "eval_steps_per_second": 66.977, + "step": 8470 + }, + { + "epoch": 0.9034732580438951, + "grad_norm": 0.0015697539784014225, + "learning_rate": 1.9305348391220967e-06, + "loss": 0.0011, + "step": 8480 + }, + { + "epoch": 0.9034732580438951, + "eval_loss": 0.0027480670250952244, + "eval_runtime": 35.0442, + "eval_samples_per_second": 4285.279, + "eval_steps_per_second": 66.973, + "step": 8480 + }, + { + "epoch": 0.9045386746217771, + "grad_norm": 0.002247209195047617, + "learning_rate": 1.9092265075644577e-06, + "loss": 0.0001, + "step": 8490 + }, + { + "epoch": 0.9045386746217771, + "eval_loss": 0.0027491068467497826, + "eval_runtime": 35.0476, + "eval_samples_per_second": 4284.857, + "eval_steps_per_second": 66.966, + "step": 8490 + }, + { + "epoch": 0.905604091199659, + "grad_norm": 0.009732640348374844, + "learning_rate": 1.8879181760068189e-06, + "loss": 0.0006, + "step": 8500 + }, + { + "epoch": 0.905604091199659, + "eval_loss": 0.002738188486546278, + "eval_runtime": 35.0452, + "eval_samples_per_second": 4285.154, + "eval_steps_per_second": 66.971, + "step": 8500 + }, + { + "epoch": 0.9066695077775411, + "grad_norm": 0.0018489729845896363, + "learning_rate": 1.8666098444491799e-06, + "loss": 0.0006, + "step": 8510 + }, + { + "epoch": 0.9066695077775411, + "eval_loss": 0.002733604284003377, + "eval_runtime": 35.0616, + "eval_samples_per_second": 4283.15, + "eval_steps_per_second": 66.939, + "step": 8510 + }, + { + "epoch": 0.907734924355423, + "grad_norm": 0.0026071134489029646, + "learning_rate": 1.8453015128915408e-06, + "loss": 0.0117, + "step": 8520 + }, + { + "epoch": 0.907734924355423, + "eval_loss": 0.0027230686973780394, + "eval_runtime": 35.0647, + "eval_samples_per_second": 4282.773, + "eval_steps_per_second": 66.933, + "step": 8520 + }, + { + "epoch": 0.908800340933305, + "grad_norm": 3.456970453262329, + "learning_rate": 1.8239931813339018e-06, + "loss": 0.0195, + "step": 8530 + }, + { + "epoch": 0.908800340933305, + "eval_loss": 0.002711121691390872, + "eval_runtime": 35.0266, + "eval_samples_per_second": 4287.425, + "eval_steps_per_second": 67.006, + "step": 8530 + }, + { + "epoch": 0.9098657575111869, + "grad_norm": 0.0017407455015927553, + "learning_rate": 1.8026848497762628e-06, + "loss": 0.0015, + "step": 8540 + }, + { + "epoch": 0.9098657575111869, + "eval_loss": 0.002703184960409999, + "eval_runtime": 35.0438, + "eval_samples_per_second": 4285.323, + "eval_steps_per_second": 66.973, + "step": 8540 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 0.05513337254524231, + "learning_rate": 1.7813765182186236e-06, + "loss": 0.009, + "step": 8550 + }, + { + "epoch": 0.9109311740890689, + "eval_loss": 0.0027215650770813227, + "eval_runtime": 35.0904, + "eval_samples_per_second": 4279.629, + "eval_steps_per_second": 66.884, + "step": 8550 + }, + { + "epoch": 0.9119965906669508, + "grad_norm": 0.014360551722347736, + "learning_rate": 1.7600681866609846e-06, + "loss": 0.0044, + "step": 8560 + }, + { + "epoch": 0.9119965906669508, + "eval_loss": 0.0027318003121763468, + "eval_runtime": 35.0623, + "eval_samples_per_second": 4283.058, + "eval_steps_per_second": 66.938, + "step": 8560 + }, + { + "epoch": 0.9130620072448328, + "grad_norm": 0.02252795174717903, + "learning_rate": 1.7387598551033456e-06, + "loss": 0.0012, + "step": 8570 + }, + { + "epoch": 0.9130620072448328, + "eval_loss": 0.0027427198365330696, + "eval_runtime": 35.086, + "eval_samples_per_second": 4280.172, + "eval_steps_per_second": 66.893, + "step": 8570 + }, + { + "epoch": 0.9141274238227147, + "grad_norm": 0.016549358144402504, + "learning_rate": 1.7174515235457066e-06, + "loss": 0.0009, + "step": 8580 + }, + { + "epoch": 0.9141274238227147, + "eval_loss": 0.0027572920080274343, + "eval_runtime": 35.0789, + "eval_samples_per_second": 4281.031, + "eval_steps_per_second": 66.906, + "step": 8580 + }, + { + "epoch": 0.9151928404005967, + "grad_norm": 0.0019249654142186046, + "learning_rate": 1.6961431919880676e-06, + "loss": 0.0002, + "step": 8590 + }, + { + "epoch": 0.9151928404005967, + "eval_loss": 0.0027666096575558186, + "eval_runtime": 35.0568, + "eval_samples_per_second": 4283.731, + "eval_steps_per_second": 66.948, + "step": 8590 + }, + { + "epoch": 0.9162582569784786, + "grad_norm": 0.003874736838042736, + "learning_rate": 1.6748348604304286e-06, + "loss": 0.0003, + "step": 8600 + }, + { + "epoch": 0.9162582569784786, + "eval_loss": 0.002771862084046006, + "eval_runtime": 35.0593, + "eval_samples_per_second": 4283.426, + "eval_steps_per_second": 66.944, + "step": 8600 + }, + { + "epoch": 0.9173236735563606, + "grad_norm": 0.0042558941058814526, + "learning_rate": 1.6535265288727895e-06, + "loss": 0.0003, + "step": 8610 + }, + { + "epoch": 0.9173236735563606, + "eval_loss": 0.002779304748401046, + "eval_runtime": 35.0443, + "eval_samples_per_second": 4285.264, + "eval_steps_per_second": 66.972, + "step": 8610 + }, + { + "epoch": 0.9183890901342425, + "grad_norm": 0.0014816632028669119, + "learning_rate": 1.6322181973151505e-06, + "loss": 0.0003, + "step": 8620 + }, + { + "epoch": 0.9183890901342425, + "eval_loss": 0.0027969153597950935, + "eval_runtime": 35.0602, + "eval_samples_per_second": 4283.317, + "eval_steps_per_second": 66.942, + "step": 8620 + }, + { + "epoch": 0.9194545067121245, + "grad_norm": 0.005812318064272404, + "learning_rate": 1.610909865757511e-06, + "loss": 0.0032, + "step": 8630 + }, + { + "epoch": 0.9194545067121245, + "eval_loss": 0.002787941135466099, + "eval_runtime": 35.0357, + "eval_samples_per_second": 4286.314, + "eval_steps_per_second": 66.989, + "step": 8630 + }, + { + "epoch": 0.9205199232900064, + "grad_norm": 0.006544212810695171, + "learning_rate": 1.589601534199872e-06, + "loss": 0.0013, + "step": 8640 + }, + { + "epoch": 0.9205199232900064, + "eval_loss": 0.0028067566454410553, + "eval_runtime": 35.0446, + "eval_samples_per_second": 4285.22, + "eval_steps_per_second": 66.972, + "step": 8640 + }, + { + "epoch": 0.9215853398678884, + "grad_norm": 0.00216415012255311, + "learning_rate": 1.568293202642233e-06, + "loss": 0.0011, + "step": 8650 + }, + { + "epoch": 0.9215853398678884, + "eval_loss": 0.0028414882253855467, + "eval_runtime": 35.0524, + "eval_samples_per_second": 4284.268, + "eval_steps_per_second": 66.957, + "step": 8650 + }, + { + "epoch": 0.9226507564457703, + "grad_norm": 0.0019254203652963042, + "learning_rate": 1.546984871084594e-06, + "loss": 0.0011, + "step": 8660 + }, + { + "epoch": 0.9226507564457703, + "eval_loss": 0.0028817523270845413, + "eval_runtime": 35.0563, + "eval_samples_per_second": 4283.796, + "eval_steps_per_second": 66.949, + "step": 8660 + }, + { + "epoch": 0.9237161730236523, + "grad_norm": 0.4283369183540344, + "learning_rate": 1.525676539526955e-06, + "loss": 0.0017, + "step": 8670 + }, + { + "epoch": 0.9237161730236523, + "eval_loss": 0.002892641816288233, + "eval_runtime": 35.0339, + "eval_samples_per_second": 4286.532, + "eval_steps_per_second": 66.992, + "step": 8670 + }, + { + "epoch": 0.9247815896015342, + "grad_norm": 0.002897687954828143, + "learning_rate": 1.504368207969316e-06, + "loss": 0.0003, + "step": 8680 + }, + { + "epoch": 0.9247815896015342, + "eval_loss": 0.0029174918308854103, + "eval_runtime": 35.0612, + "eval_samples_per_second": 4283.199, + "eval_steps_per_second": 66.94, + "step": 8680 + }, + { + "epoch": 0.9258470061794162, + "grad_norm": 0.8872772455215454, + "learning_rate": 1.4830598764116772e-06, + "loss": 0.0053, + "step": 8690 + }, + { + "epoch": 0.9258470061794162, + "eval_loss": 0.0028924746438860893, + "eval_runtime": 35.0912, + "eval_samples_per_second": 4279.538, + "eval_steps_per_second": 66.883, + "step": 8690 + }, + { + "epoch": 0.9269124227572981, + "grad_norm": 0.004901398438960314, + "learning_rate": 1.4617515448540382e-06, + "loss": 0.0001, + "step": 8700 + }, + { + "epoch": 0.9269124227572981, + "eval_loss": 0.002856872510164976, + "eval_runtime": 35.0855, + "eval_samples_per_second": 4280.234, + "eval_steps_per_second": 66.894, + "step": 8700 + }, + { + "epoch": 0.9279778393351801, + "grad_norm": 0.0021194189321249723, + "learning_rate": 1.4404432132963992e-06, + "loss": 0.0153, + "step": 8710 + }, + { + "epoch": 0.9279778393351801, + "eval_loss": 0.002821860834956169, + "eval_runtime": 35.0497, + "eval_samples_per_second": 4284.598, + "eval_steps_per_second": 66.962, + "step": 8710 + }, + { + "epoch": 0.929043255913062, + "grad_norm": 0.10005082935094833, + "learning_rate": 1.4191348817387598e-06, + "loss": 0.0002, + "step": 8720 + }, + { + "epoch": 0.929043255913062, + "eval_loss": 0.0028019933961331844, + "eval_runtime": 35.0643, + "eval_samples_per_second": 4282.821, + "eval_steps_per_second": 66.934, + "step": 8720 + }, + { + "epoch": 0.930108672490944, + "grad_norm": 0.002553171245381236, + "learning_rate": 1.3978265501811208e-06, + "loss": 0.0007, + "step": 8730 + }, + { + "epoch": 0.930108672490944, + "eval_loss": 0.0027997682336717844, + "eval_runtime": 35.0344, + "eval_samples_per_second": 4286.473, + "eval_steps_per_second": 66.991, + "step": 8730 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 0.002466765232384205, + "learning_rate": 1.3765182186234818e-06, + "loss": 0.0003, + "step": 8740 + }, + { + "epoch": 0.9311740890688259, + "eval_loss": 0.002808566903695464, + "eval_runtime": 34.9819, + "eval_samples_per_second": 4292.906, + "eval_steps_per_second": 67.092, + "step": 8740 + }, + { + "epoch": 0.9322395056467079, + "grad_norm": 0.0318465530872345, + "learning_rate": 1.3552098870658428e-06, + "loss": 0.0035, + "step": 8750 + }, + { + "epoch": 0.9322395056467079, + "eval_loss": 0.0027872417122125626, + "eval_runtime": 35.0627, + "eval_samples_per_second": 4283.009, + "eval_steps_per_second": 66.937, + "step": 8750 + }, + { + "epoch": 0.9333049222245898, + "grad_norm": 0.0013958633644506335, + "learning_rate": 1.3339015555082038e-06, + "loss": 0.0002, + "step": 8760 + }, + { + "epoch": 0.9333049222245898, + "eval_loss": 0.002777666551992297, + "eval_runtime": 35.0407, + "eval_samples_per_second": 4285.707, + "eval_steps_per_second": 66.979, + "step": 8760 + }, + { + "epoch": 0.9343703388024718, + "grad_norm": 0.00333898956887424, + "learning_rate": 1.3125932239505647e-06, + "loss": 0.0079, + "step": 8770 + }, + { + "epoch": 0.9343703388024718, + "eval_loss": 0.0027819545939564705, + "eval_runtime": 35.0694, + "eval_samples_per_second": 4282.194, + "eval_steps_per_second": 66.924, + "step": 8770 + }, + { + "epoch": 0.9354357553803537, + "grad_norm": 0.30577364563941956, + "learning_rate": 1.2912848923929257e-06, + "loss": 0.0013, + "step": 8780 + }, + { + "epoch": 0.9354357553803537, + "eval_loss": 0.0027924508322030306, + "eval_runtime": 35.0496, + "eval_samples_per_second": 4284.612, + "eval_steps_per_second": 66.962, + "step": 8780 + }, + { + "epoch": 0.9365011719582357, + "grad_norm": 0.040551621466875076, + "learning_rate": 1.2699765608352867e-06, + "loss": 0.0008, + "step": 8790 + }, + { + "epoch": 0.9365011719582357, + "eval_loss": 0.0028144929092377424, + "eval_runtime": 35.0432, + "eval_samples_per_second": 4285.396, + "eval_steps_per_second": 66.974, + "step": 8790 + }, + { + "epoch": 0.9375665885361176, + "grad_norm": 0.265171080827713, + "learning_rate": 1.2486682292776477e-06, + "loss": 0.0008, + "step": 8800 + }, + { + "epoch": 0.9375665885361176, + "eval_loss": 0.002830737503245473, + "eval_runtime": 35.0513, + "eval_samples_per_second": 4284.411, + "eval_steps_per_second": 66.959, + "step": 8800 + }, + { + "epoch": 0.9386320051139996, + "grad_norm": 0.02604043483734131, + "learning_rate": 1.2273598977200087e-06, + "loss": 0.0001, + "step": 8810 + }, + { + "epoch": 0.9386320051139996, + "eval_loss": 0.0028454142156988382, + "eval_runtime": 35.0453, + "eval_samples_per_second": 4285.141, + "eval_steps_per_second": 66.97, + "step": 8810 + }, + { + "epoch": 0.9396974216918815, + "grad_norm": 0.0020725736394524574, + "learning_rate": 1.2060515661623697e-06, + "loss": 0.0006, + "step": 8820 + }, + { + "epoch": 0.9396974216918815, + "eval_loss": 0.002855469472706318, + "eval_runtime": 35.0566, + "eval_samples_per_second": 4283.757, + "eval_steps_per_second": 66.949, + "step": 8820 + }, + { + "epoch": 0.9407628382697635, + "grad_norm": 0.09126020967960358, + "learning_rate": 1.1847432346047305e-06, + "loss": 0.0004, + "step": 8830 + }, + { + "epoch": 0.9407628382697635, + "eval_loss": 0.0028640632517635822, + "eval_runtime": 35.0432, + "eval_samples_per_second": 4285.401, + "eval_steps_per_second": 66.975, + "step": 8830 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 0.0015713806496933103, + "learning_rate": 1.1634349030470915e-06, + "loss": 0.0008, + "step": 8840 + }, + { + "epoch": 0.9418282548476454, + "eval_loss": 0.0028736621607095003, + "eval_runtime": 35.0791, + "eval_samples_per_second": 4281.007, + "eval_steps_per_second": 66.906, + "step": 8840 + }, + { + "epoch": 0.9428936714255274, + "grad_norm": 0.001434053760021925, + "learning_rate": 1.1421265714894525e-06, + "loss": 0.0066, + "step": 8850 + }, + { + "epoch": 0.9428936714255274, + "eval_loss": 0.0028831621166318655, + "eval_runtime": 35.0581, + "eval_samples_per_second": 4283.57, + "eval_steps_per_second": 66.946, + "step": 8850 + }, + { + "epoch": 0.9439590880034093, + "grad_norm": 0.0015935949049890041, + "learning_rate": 1.1208182399318134e-06, + "loss": 0.0049, + "step": 8860 + }, + { + "epoch": 0.9439590880034093, + "eval_loss": 0.0028416782151907682, + "eval_runtime": 35.0629, + "eval_samples_per_second": 4282.99, + "eval_steps_per_second": 66.937, + "step": 8860 + }, + { + "epoch": 0.9450245045812913, + "grad_norm": 0.0035525760613381863, + "learning_rate": 1.0995099083741744e-06, + "loss": 0.0024, + "step": 8870 + }, + { + "epoch": 0.9450245045812913, + "eval_loss": 0.002823463175445795, + "eval_runtime": 35.0712, + "eval_samples_per_second": 4281.969, + "eval_steps_per_second": 66.921, + "step": 8870 + }, + { + "epoch": 0.9460899211591732, + "grad_norm": 0.003755433950573206, + "learning_rate": 1.0782015768165354e-06, + "loss": 0.0005, + "step": 8880 + }, + { + "epoch": 0.9460899211591732, + "eval_loss": 0.002827939111739397, + "eval_runtime": 35.0628, + "eval_samples_per_second": 4282.996, + "eval_steps_per_second": 66.937, + "step": 8880 + }, + { + "epoch": 0.9471553377370552, + "grad_norm": 0.005914956331253052, + "learning_rate": 1.0568932452588964e-06, + "loss": 0.001, + "step": 8890 + }, + { + "epoch": 0.9471553377370552, + "eval_loss": 0.0028441580943763256, + "eval_runtime": 35.0361, + "eval_samples_per_second": 4286.265, + "eval_steps_per_second": 66.988, + "step": 8890 + }, + { + "epoch": 0.9482207543149371, + "grad_norm": 0.007786376867443323, + "learning_rate": 1.0355849137012574e-06, + "loss": 0.0025, + "step": 8900 + }, + { + "epoch": 0.9482207543149371, + "eval_loss": 0.0028372537344694138, + "eval_runtime": 35.0638, + "eval_samples_per_second": 4282.882, + "eval_steps_per_second": 66.935, + "step": 8900 + }, + { + "epoch": 0.9492861708928191, + "grad_norm": 0.01083774771541357, + "learning_rate": 1.0142765821436182e-06, + "loss": 0.0007, + "step": 8910 + }, + { + "epoch": 0.9492861708928191, + "eval_loss": 0.0028390076477080584, + "eval_runtime": 35.0497, + "eval_samples_per_second": 4284.598, + "eval_steps_per_second": 66.962, + "step": 8910 + }, + { + "epoch": 0.950351587470701, + "grad_norm": 0.0015453147934749722, + "learning_rate": 9.929682505859792e-07, + "loss": 0.0024, + "step": 8920 + }, + { + "epoch": 0.950351587470701, + "eval_loss": 0.002836124738678336, + "eval_runtime": 35.0833, + "eval_samples_per_second": 4280.504, + "eval_steps_per_second": 66.898, + "step": 8920 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 0.003299474949017167, + "learning_rate": 9.716599190283402e-07, + "loss": 0.0002, + "step": 8930 + }, + { + "epoch": 0.951417004048583, + "eval_loss": 0.0028344527818262577, + "eval_runtime": 35.0755, + "eval_samples_per_second": 4281.445, + "eval_steps_per_second": 66.913, + "step": 8930 + }, + { + "epoch": 0.9524824206264649, + "grad_norm": 0.001612770720385015, + "learning_rate": 9.503515874707012e-07, + "loss": 0.0002, + "step": 8940 + }, + { + "epoch": 0.9524824206264649, + "eval_loss": 0.0028346776962280273, + "eval_runtime": 35.0638, + "eval_samples_per_second": 4282.881, + "eval_steps_per_second": 66.935, + "step": 8940 + }, + { + "epoch": 0.9535478372043469, + "grad_norm": 0.00152910640463233, + "learning_rate": 9.29043255913062e-07, + "loss": 0.0001, + "step": 8950 + }, + { + "epoch": 0.9535478372043469, + "eval_loss": 0.0028353093657642603, + "eval_runtime": 35.047, + "eval_samples_per_second": 4284.935, + "eval_steps_per_second": 66.967, + "step": 8950 + }, + { + "epoch": 0.9546132537822288, + "grad_norm": 0.0018316495697945356, + "learning_rate": 9.07734924355423e-07, + "loss": 0.0001, + "step": 8960 + }, + { + "epoch": 0.9546132537822288, + "eval_loss": 0.0028363701421767473, + "eval_runtime": 35.0263, + "eval_samples_per_second": 4287.461, + "eval_steps_per_second": 67.007, + "step": 8960 + }, + { + "epoch": 0.9556786703601108, + "grad_norm": 0.0018473445670679212, + "learning_rate": 8.86426592797784e-07, + "loss": 0.0048, + "step": 8970 + }, + { + "epoch": 0.9556786703601108, + "eval_loss": 0.002827234333381057, + "eval_runtime": 35.0517, + "eval_samples_per_second": 4284.351, + "eval_steps_per_second": 66.958, + "step": 8970 + }, + { + "epoch": 0.9567440869379927, + "grad_norm": 0.0031582904048264027, + "learning_rate": 8.65118261240145e-07, + "loss": 0.0006, + "step": 8980 + }, + { + "epoch": 0.9567440869379927, + "eval_loss": 0.0028260373510420322, + "eval_runtime": 35.0605, + "eval_samples_per_second": 4283.28, + "eval_steps_per_second": 66.941, + "step": 8980 + }, + { + "epoch": 0.9578095035158747, + "grad_norm": 0.001939519657753408, + "learning_rate": 8.43809929682506e-07, + "loss": 0.0007, + "step": 8990 + }, + { + "epoch": 0.9578095035158747, + "eval_loss": 0.0028355128597468138, + "eval_runtime": 35.1082, + "eval_samples_per_second": 4277.462, + "eval_steps_per_second": 66.85, + "step": 8990 + }, + { + "epoch": 0.9588749200937566, + "grad_norm": 0.0012444235617294908, + "learning_rate": 8.225015981248669e-07, + "loss": 0.0005, + "step": 9000 + }, + { + "epoch": 0.9588749200937566, + "eval_loss": 0.002848832868039608, + "eval_runtime": 35.0665, + "eval_samples_per_second": 4282.551, + "eval_steps_per_second": 66.93, + "step": 9000 + }, + { + "epoch": 0.9599403366716386, + "grad_norm": 0.028200862929224968, + "learning_rate": 8.011932665672279e-07, + "loss": 0.0089, + "step": 9010 + }, + { + "epoch": 0.9599403366716386, + "eval_loss": 0.0028532061260193586, + "eval_runtime": 35.0678, + "eval_samples_per_second": 4282.385, + "eval_steps_per_second": 66.927, + "step": 9010 + }, + { + "epoch": 0.9610057532495205, + "grad_norm": 0.07253487408161163, + "learning_rate": 7.798849350095889e-07, + "loss": 0.0003, + "step": 9020 + }, + { + "epoch": 0.9610057532495205, + "eval_loss": 0.0028550319839268923, + "eval_runtime": 35.0826, + "eval_samples_per_second": 4280.586, + "eval_steps_per_second": 66.899, + "step": 9020 + }, + { + "epoch": 0.9620711698274025, + "grad_norm": 0.0023238463327288628, + "learning_rate": 7.585766034519499e-07, + "loss": 0.0026, + "step": 9030 + }, + { + "epoch": 0.9620711698274025, + "eval_loss": 0.002850407036021352, + "eval_runtime": 35.0397, + "eval_samples_per_second": 4285.823, + "eval_steps_per_second": 66.981, + "step": 9030 + }, + { + "epoch": 0.9631365864052844, + "grad_norm": 0.0029777686577290297, + "learning_rate": 7.372682718943107e-07, + "loss": 0.0151, + "step": 9040 + }, + { + "epoch": 0.9631365864052844, + "eval_loss": 0.002831405494362116, + "eval_runtime": 35.0457, + "eval_samples_per_second": 4285.097, + "eval_steps_per_second": 66.97, + "step": 9040 + }, + { + "epoch": 0.9642020029831664, + "grad_norm": 0.0014590666396543384, + "learning_rate": 7.159599403366717e-07, + "loss": 0.0007, + "step": 9050 + }, + { + "epoch": 0.9642020029831664, + "eval_loss": 0.002821285743266344, + "eval_runtime": 35.0131, + "eval_samples_per_second": 4289.085, + "eval_steps_per_second": 67.032, + "step": 9050 + }, + { + "epoch": 0.9652674195610483, + "grad_norm": 0.01177013386040926, + "learning_rate": 6.946516087790327e-07, + "loss": 0.0005, + "step": 9060 + }, + { + "epoch": 0.9652674195610483, + "eval_loss": 0.002820658963173628, + "eval_runtime": 35.0093, + "eval_samples_per_second": 4289.544, + "eval_steps_per_second": 67.039, + "step": 9060 + }, + { + "epoch": 0.9663328361389303, + "grad_norm": 0.016290990635752678, + "learning_rate": 6.733432772213937e-07, + "loss": 0.0005, + "step": 9070 + }, + { + "epoch": 0.9663328361389303, + "eval_loss": 0.002823216374963522, + "eval_runtime": 35.0349, + "eval_samples_per_second": 4286.409, + "eval_steps_per_second": 66.99, + "step": 9070 + }, + { + "epoch": 0.9673982527168122, + "grad_norm": 0.03478045016527176, + "learning_rate": 6.520349456637545e-07, + "loss": 0.0027, + "step": 9080 + }, + { + "epoch": 0.9673982527168122, + "eval_loss": 0.0028139299247413874, + "eval_runtime": 35.0138, + "eval_samples_per_second": 4288.995, + "eval_steps_per_second": 67.031, + "step": 9080 + }, + { + "epoch": 0.9684636692946942, + "grad_norm": 0.001305173384025693, + "learning_rate": 6.307266141061155e-07, + "loss": 0.0035, + "step": 9090 + }, + { + "epoch": 0.9684636692946942, + "eval_loss": 0.0028059857431799173, + "eval_runtime": 35.0574, + "eval_samples_per_second": 4283.659, + "eval_steps_per_second": 66.947, + "step": 9090 + }, + { + "epoch": 0.9695290858725761, + "grad_norm": 0.017361849546432495, + "learning_rate": 6.094182825484765e-07, + "loss": 0.0022, + "step": 9100 + }, + { + "epoch": 0.9695290858725761, + "eval_loss": 0.002785086864605546, + "eval_runtime": 35.026, + "eval_samples_per_second": 4287.499, + "eval_steps_per_second": 67.007, + "step": 9100 + }, + { + "epoch": 0.9705945024504581, + "grad_norm": 0.0025778792332857847, + "learning_rate": 5.881099509908375e-07, + "loss": 0.0006, + "step": 9110 + }, + { + "epoch": 0.9705945024504581, + "eval_loss": 0.002780887531116605, + "eval_runtime": 35.0435, + "eval_samples_per_second": 4285.355, + "eval_steps_per_second": 66.974, + "step": 9110 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 0.7138797044754028, + "learning_rate": 5.668016194331984e-07, + "loss": 0.0061, + "step": 9120 + }, + { + "epoch": 0.97165991902834, + "eval_loss": 0.002781209535896778, + "eval_runtime": 34.9997, + "eval_samples_per_second": 4290.726, + "eval_steps_per_second": 67.058, + "step": 9120 + }, + { + "epoch": 0.972725335606222, + "grad_norm": 0.11075238883495331, + "learning_rate": 5.454932878755593e-07, + "loss": 0.0007, + "step": 9130 + }, + { + "epoch": 0.972725335606222, + "eval_loss": 0.0027841285336762667, + "eval_runtime": 35.0754, + "eval_samples_per_second": 4281.462, + "eval_steps_per_second": 66.913, + "step": 9130 + }, + { + "epoch": 0.9737907521841039, + "grad_norm": 0.0027794514317065477, + "learning_rate": 5.241849563179203e-07, + "loss": 0.0002, + "step": 9140 + }, + { + "epoch": 0.9737907521841039, + "eval_loss": 0.0027846924494951963, + "eval_runtime": 35.0165, + "eval_samples_per_second": 4288.667, + "eval_steps_per_second": 67.026, + "step": 9140 + }, + { + "epoch": 0.9748561687619859, + "grad_norm": 0.00787454191595316, + "learning_rate": 5.028766247602813e-07, + "loss": 0.006, + "step": 9150 + }, + { + "epoch": 0.9748561687619859, + "eval_loss": 0.0027796956710517406, + "eval_runtime": 35.0181, + "eval_samples_per_second": 4288.464, + "eval_steps_per_second": 67.022, + "step": 9150 + }, + { + "epoch": 0.9759215853398678, + "grad_norm": 0.04696900025010109, + "learning_rate": 4.815682932026423e-07, + "loss": 0.0103, + "step": 9160 + }, + { + "epoch": 0.9759215853398678, + "eval_loss": 0.002768127480521798, + "eval_runtime": 35.0038, + "eval_samples_per_second": 4290.223, + "eval_steps_per_second": 67.05, + "step": 9160 + }, + { + "epoch": 0.9769870019177498, + "grad_norm": 0.0021065620239824057, + "learning_rate": 4.6025996164500324e-07, + "loss": 0.0017, + "step": 9170 + }, + { + "epoch": 0.9769870019177498, + "eval_loss": 0.0027594445273280144, + "eval_runtime": 35.1389, + "eval_samples_per_second": 4273.724, + "eval_steps_per_second": 66.792, + "step": 9170 + }, + { + "epoch": 0.9780524184956317, + "grad_norm": 0.01714223064482212, + "learning_rate": 4.389516300873642e-07, + "loss": 0.0008, + "step": 9180 + }, + { + "epoch": 0.9780524184956317, + "eval_loss": 0.00275549478828907, + "eval_runtime": 35.0542, + "eval_samples_per_second": 4284.05, + "eval_steps_per_second": 66.953, + "step": 9180 + }, + { + "epoch": 0.9791178350735138, + "grad_norm": 0.0016872499836608768, + "learning_rate": 4.1764329852972517e-07, + "loss": 0.0007, + "step": 9190 + }, + { + "epoch": 0.9791178350735138, + "eval_loss": 0.0027567828074097633, + "eval_runtime": 35.0654, + "eval_samples_per_second": 4282.682, + "eval_steps_per_second": 66.932, + "step": 9190 + }, + { + "epoch": 0.9801832516513957, + "grad_norm": 0.06704606115818024, + "learning_rate": 3.963349669720861e-07, + "loss": 0.0025, + "step": 9200 + }, + { + "epoch": 0.9801832516513957, + "eval_loss": 0.00275617279112339, + "eval_runtime": 35.0662, + "eval_samples_per_second": 4282.591, + "eval_steps_per_second": 66.931, + "step": 9200 + }, + { + "epoch": 0.9812486682292777, + "grad_norm": 0.0024131489917635918, + "learning_rate": 3.750266354144471e-07, + "loss": 0.001, + "step": 9210 + }, + { + "epoch": 0.9812486682292777, + "eval_loss": 0.0027577125001698732, + "eval_runtime": 35.0511, + "eval_samples_per_second": 4284.426, + "eval_steps_per_second": 66.959, + "step": 9210 + }, + { + "epoch": 0.9823140848071596, + "grad_norm": 0.0015416039386764169, + "learning_rate": 3.5371830385680803e-07, + "loss": 0.002, + "step": 9220 + }, + { + "epoch": 0.9823140848071596, + "eval_loss": 0.0027562561444938183, + "eval_runtime": 35.0509, + "eval_samples_per_second": 4284.459, + "eval_steps_per_second": 66.96, + "step": 9220 + }, + { + "epoch": 0.9833795013850416, + "grad_norm": 0.6929004192352295, + "learning_rate": 3.32409972299169e-07, + "loss": 0.0022, + "step": 9230 + }, + { + "epoch": 0.9833795013850416, + "eval_loss": 0.0027580568566918373, + "eval_runtime": 35.0336, + "eval_samples_per_second": 4286.575, + "eval_steps_per_second": 66.993, + "step": 9230 + }, + { + "epoch": 0.9844449179629235, + "grad_norm": 0.0025643545668572187, + "learning_rate": 3.1110164074152996e-07, + "loss": 0.0008, + "step": 9240 + }, + { + "epoch": 0.9844449179629235, + "eval_loss": 0.002759452909231186, + "eval_runtime": 35.0344, + "eval_samples_per_second": 4286.471, + "eval_steps_per_second": 66.991, + "step": 9240 + }, + { + "epoch": 0.9855103345408055, + "grad_norm": 0.002499173628166318, + "learning_rate": 2.8979330918389095e-07, + "loss": 0.0002, + "step": 9250 + }, + { + "epoch": 0.9855103345408055, + "eval_loss": 0.0027610480319708586, + "eval_runtime": 35.0262, + "eval_samples_per_second": 4287.473, + "eval_steps_per_second": 67.007, + "step": 9250 + }, + { + "epoch": 0.9865757511186874, + "grad_norm": 0.0013231937773525715, + "learning_rate": 2.684849776262519e-07, + "loss": 0.0014, + "step": 9260 + }, + { + "epoch": 0.9865757511186874, + "eval_loss": 0.002763622673228383, + "eval_runtime": 35.0291, + "eval_samples_per_second": 4287.125, + "eval_steps_per_second": 67.001, + "step": 9260 + }, + { + "epoch": 0.9876411676965694, + "grad_norm": 0.001755521516315639, + "learning_rate": 2.471766460686129e-07, + "loss": 0.0029, + "step": 9270 + }, + { + "epoch": 0.9876411676965694, + "eval_loss": 0.002763139782473445, + "eval_runtime": 35.0386, + "eval_samples_per_second": 4285.957, + "eval_steps_per_second": 66.983, + "step": 9270 + }, + { + "epoch": 0.9887065842744513, + "grad_norm": 0.20544728636741638, + "learning_rate": 2.258683145109738e-07, + "loss": 0.0009, + "step": 9280 + }, + { + "epoch": 0.9887065842744513, + "eval_loss": 0.0027635886799544096, + "eval_runtime": 35.0717, + "eval_samples_per_second": 4281.912, + "eval_steps_per_second": 66.92, + "step": 9280 + }, + { + "epoch": 0.9897720008523333, + "grad_norm": 0.06573604047298431, + "learning_rate": 2.0455998295333478e-07, + "loss": 0.0002, + "step": 9290 + }, + { + "epoch": 0.9897720008523333, + "eval_loss": 0.0027646832168102264, + "eval_runtime": 34.9956, + "eval_samples_per_second": 4291.22, + "eval_steps_per_second": 67.065, + "step": 9290 + }, + { + "epoch": 0.9908374174302153, + "grad_norm": 0.002524553332477808, + "learning_rate": 1.8325165139569574e-07, + "loss": 0.0004, + "step": 9300 + }, + { + "epoch": 0.9908374174302153, + "eval_loss": 0.0027653006836771965, + "eval_runtime": 35.0245, + "eval_samples_per_second": 4287.681, + "eval_steps_per_second": 67.01, + "step": 9300 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 0.0033023718278855085, + "learning_rate": 1.619433198380567e-07, + "loss": 0.0058, + "step": 9310 + }, + { + "epoch": 0.9919028340080972, + "eval_loss": 0.0027657628525048494, + "eval_runtime": 35.0047, + "eval_samples_per_second": 4290.116, + "eval_steps_per_second": 67.048, + "step": 9310 + }, + { + "epoch": 0.9929682505859792, + "grad_norm": 0.058614350855350494, + "learning_rate": 1.4063498828041767e-07, + "loss": 0.0003, + "step": 9320 + }, + { + "epoch": 0.9929682505859792, + "eval_loss": 0.002766667865216732, + "eval_runtime": 34.9979, + "eval_samples_per_second": 4290.938, + "eval_steps_per_second": 67.061, + "step": 9320 + }, + { + "epoch": 0.9940336671638611, + "grad_norm": 0.13814635574817657, + "learning_rate": 1.193266567227786e-07, + "loss": 0.0087, + "step": 9330 + }, + { + "epoch": 0.9940336671638611, + "eval_loss": 0.0027665847446769476, + "eval_runtime": 35.0199, + "eval_samples_per_second": 4288.253, + "eval_steps_per_second": 67.019, + "step": 9330 + }, + { + "epoch": 0.995099083741743, + "grad_norm": 0.007654257118701935, + "learning_rate": 9.801832516513957e-08, + "loss": 0.0071, + "step": 9340 + }, + { + "epoch": 0.995099083741743, + "eval_loss": 0.002764417789876461, + "eval_runtime": 35.033, + "eval_samples_per_second": 4286.649, + "eval_steps_per_second": 66.994, + "step": 9340 + }, + { + "epoch": 0.996164500319625, + "grad_norm": 0.0012485783081501722, + "learning_rate": 7.670999360750054e-08, + "loss": 0.0159, + "step": 9350 + }, + { + "epoch": 0.996164500319625, + "eval_loss": 0.0027642108034342527, + "eval_runtime": 35.05, + "eval_samples_per_second": 4284.566, + "eval_steps_per_second": 66.962, + "step": 9350 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 0.0015925171319395304, + "learning_rate": 5.54016620498615e-08, + "loss": 0.0003, + "step": 9360 + }, + { + "epoch": 0.997229916897507, + "eval_loss": 0.002764443401247263, + "eval_runtime": 35.0619, + "eval_samples_per_second": 4283.109, + "eval_steps_per_second": 66.939, + "step": 9360 + }, + { + "epoch": 0.9982953334753889, + "grad_norm": 0.004035618621855974, + "learning_rate": 3.409333049222246e-08, + "loss": 0.0108, + "step": 9370 + }, + { + "epoch": 0.9982953334753889, + "eval_loss": 0.002764328382909298, + "eval_runtime": 34.9973, + "eval_samples_per_second": 4291.017, + "eval_steps_per_second": 67.062, + "step": 9370 + }, + { + "epoch": 0.9993607500532709, + "grad_norm": 0.0013912487775087357, + "learning_rate": 1.2784998934583423e-08, + "loss": 0.0002, + "step": 9380 + }, + { + "epoch": 0.9993607500532709, + "eval_loss": 0.0027642655186355114, + "eval_runtime": 35.0448, + "eval_samples_per_second": 4285.205, + "eval_steps_per_second": 66.971, + "step": 9380 } ], "logging_steps": 10, - "max_steps": 3118, + "max_steps": 9386, "num_input_tokens_seen": 0, - "num_train_epochs": 2, + "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -4691,8 +14096,8 @@ "attributes": {} } }, - "total_flos": 50206333455360.0, - "train_batch_size": 32, + "total_flos": 302396856261120.0, + "train_batch_size": 64, "trial_name": null, "trial_params": null }