diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4698 @@ +{ + "best_metric": 0.0036811623722314835, + "best_model_checkpoint": "./results/checkpoint-1830", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 3118, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006414368184733804, + "grad_norm": 4.846098899841309, + "learning_rate": 1.9935856318152666e-05, + "loss": 0.5328, + "step": 10 + }, + { + "epoch": 0.006414368184733804, + "eval_loss": 0.4104921221733093, + "eval_runtime": 2.8279, + "eval_samples_per_second": 4408.52, + "eval_steps_per_second": 137.91, + "step": 10 + }, + { + "epoch": 0.012828736369467608, + "grad_norm": 2.26491641998291, + "learning_rate": 1.9871712636305324e-05, + "loss": 0.3602, + "step": 20 + }, + { + "epoch": 0.012828736369467608, + "eval_loss": 0.29217612743377686, + "eval_runtime": 2.7561, + "eval_samples_per_second": 4523.399, + "eval_steps_per_second": 141.504, + "step": 20 + }, + { + "epoch": 0.01924310455420141, + "grad_norm": 1.9946489334106445, + "learning_rate": 1.9807568954457988e-05, + "loss": 0.2535, + "step": 30 + }, + { + "epoch": 0.01924310455420141, + "eval_loss": 0.20272396504878998, + "eval_runtime": 2.7868, + "eval_samples_per_second": 4473.656, + "eval_steps_per_second": 139.948, + "step": 30 + }, + { + "epoch": 0.025657472738935216, + "grad_norm": 1.5090171098709106, + "learning_rate": 1.974342527261065e-05, + "loss": 0.1843, + "step": 40 + }, + { + "epoch": 0.025657472738935216, + "eval_loss": 0.13557858765125275, + "eval_runtime": 2.7501, + "eval_samples_per_second": 4533.352, + "eval_steps_per_second": 141.815, + "step": 40 + }, + { + "epoch": 0.03207184092366902, + "grad_norm": 1.2906427383422852, + "learning_rate": 1.967928159076331e-05, + "loss": 0.1196, + "step": 50 + }, + { + "epoch": 0.03207184092366902, + "eval_loss": 0.09848916530609131, + "eval_runtime": 2.8508, + "eval_samples_per_second": 4373.15, + "eval_steps_per_second": 136.803, + "step": 50 + }, + { + "epoch": 0.03848620910840282, + "grad_norm": 1.8510209321975708, + "learning_rate": 1.9615137908915974e-05, + "loss": 0.077, + "step": 60 + }, + { + "epoch": 0.03848620910840282, + "eval_loss": 0.0683838352560997, + "eval_runtime": 2.7704, + "eval_samples_per_second": 4500.147, + "eval_steps_per_second": 140.776, + "step": 60 + }, + { + "epoch": 0.04490057729313662, + "grad_norm": 0.5595826506614685, + "learning_rate": 1.9550994227068635e-05, + "loss": 0.0694, + "step": 70 + }, + { + "epoch": 0.04490057729313662, + "eval_loss": 0.05297030881047249, + "eval_runtime": 2.754, + "eval_samples_per_second": 4526.939, + "eval_steps_per_second": 141.614, + "step": 70 + }, + { + "epoch": 0.05131494547787043, + "grad_norm": 0.836297869682312, + "learning_rate": 1.9486850545221296e-05, + "loss": 0.0452, + "step": 80 + }, + { + "epoch": 0.05131494547787043, + "eval_loss": 0.04719825088977814, + "eval_runtime": 2.7488, + "eval_samples_per_second": 4535.467, + "eval_steps_per_second": 141.881, + "step": 80 + }, + { + "epoch": 0.05772931366260423, + "grad_norm": 3.999357223510742, + "learning_rate": 1.942270686337396e-05, + "loss": 0.0533, + "step": 90 + }, + { + "epoch": 0.05772931366260423, + "eval_loss": 0.035153161734342575, + "eval_runtime": 2.7687, + "eval_samples_per_second": 4502.755, + "eval_steps_per_second": 140.858, + "step": 90 + }, + { + "epoch": 0.06414368184733804, + "grad_norm": 0.37607356905937195, + "learning_rate": 1.935856318152662e-05, + "loss": 0.0376, + "step": 100 + }, + { + "epoch": 0.06414368184733804, + "eval_loss": 0.02929052524268627, + "eval_runtime": 2.7416, + "eval_samples_per_second": 4547.36, + "eval_steps_per_second": 142.253, + "step": 100 + }, + { + "epoch": 0.07055805003207184, + "grad_norm": 0.2907819151878357, + "learning_rate": 1.9294419499679282e-05, + "loss": 0.0199, + "step": 110 + }, + { + "epoch": 0.07055805003207184, + "eval_loss": 0.024354618042707443, + "eval_runtime": 2.7981, + "eval_samples_per_second": 4455.483, + "eval_steps_per_second": 139.379, + "step": 110 + }, + { + "epoch": 0.07697241821680564, + "grad_norm": 0.2949336767196655, + "learning_rate": 1.9230275817831943e-05, + "loss": 0.0175, + "step": 120 + }, + { + "epoch": 0.07697241821680564, + "eval_loss": 0.021745959296822548, + "eval_runtime": 2.7494, + "eval_samples_per_second": 4534.52, + "eval_steps_per_second": 141.852, + "step": 120 + }, + { + "epoch": 0.08338678640153944, + "grad_norm": 0.2311653196811676, + "learning_rate": 1.9166132135984608e-05, + "loss": 0.0203, + "step": 130 + }, + { + "epoch": 0.08338678640153944, + "eval_loss": 0.0190635584294796, + "eval_runtime": 2.7863, + "eval_samples_per_second": 4474.414, + "eval_steps_per_second": 139.971, + "step": 130 + }, + { + "epoch": 0.08980115458627325, + "grad_norm": 0.2001882642507553, + "learning_rate": 1.910198845413727e-05, + "loss": 0.0168, + "step": 140 + }, + { + "epoch": 0.08980115458627325, + "eval_loss": 0.017704442143440247, + "eval_runtime": 2.7325, + "eval_samples_per_second": 4562.549, + "eval_steps_per_second": 142.728, + "step": 140 + }, + { + "epoch": 0.09621552277100706, + "grad_norm": 0.21124917268753052, + "learning_rate": 1.903784477228993e-05, + "loss": 0.019, + "step": 150 + }, + { + "epoch": 0.09621552277100706, + "eval_loss": 0.01564132049679756, + "eval_runtime": 2.795, + "eval_samples_per_second": 4460.445, + "eval_steps_per_second": 139.534, + "step": 150 + }, + { + "epoch": 0.10262989095574086, + "grad_norm": 0.1619035303592682, + "learning_rate": 1.8973701090442594e-05, + "loss": 0.0116, + "step": 160 + }, + { + "epoch": 0.10262989095574086, + "eval_loss": 0.014719261787831783, + "eval_runtime": 2.7377, + "eval_samples_per_second": 4553.795, + "eval_steps_per_second": 142.454, + "step": 160 + }, + { + "epoch": 0.10904425914047466, + "grad_norm": 5.417097091674805, + "learning_rate": 1.8909557408595255e-05, + "loss": 0.0126, + "step": 170 + }, + { + "epoch": 0.10904425914047466, + "eval_loss": 0.014089061878621578, + "eval_runtime": 2.7909, + "eval_samples_per_second": 4467.003, + "eval_steps_per_second": 139.739, + "step": 170 + }, + { + "epoch": 0.11545862732520847, + "grad_norm": 0.23231153190135956, + "learning_rate": 1.8845413726747916e-05, + "loss": 0.0095, + "step": 180 + }, + { + "epoch": 0.11545862732520847, + "eval_loss": 0.01361811999231577, + "eval_runtime": 2.678, + "eval_samples_per_second": 4655.296, + "eval_steps_per_second": 145.63, + "step": 180 + }, + { + "epoch": 0.12187299550994227, + "grad_norm": 0.12273150682449341, + "learning_rate": 1.878127004490058e-05, + "loss": 0.0077, + "step": 190 + }, + { + "epoch": 0.12187299550994227, + "eval_loss": 0.012749651446938515, + "eval_runtime": 2.4162, + "eval_samples_per_second": 5159.82, + "eval_steps_per_second": 161.413, + "step": 190 + }, + { + "epoch": 0.12828736369467608, + "grad_norm": 0.1562490612268448, + "learning_rate": 1.871712636305324e-05, + "loss": 0.013, + "step": 200 + }, + { + "epoch": 0.12828736369467608, + "eval_loss": 0.01220065075904131, + "eval_runtime": 2.4215, + "eval_samples_per_second": 5148.566, + "eval_steps_per_second": 161.06, + "step": 200 + }, + { + "epoch": 0.13470173187940987, + "grad_norm": 0.10928566008806229, + "learning_rate": 1.8652982681205902e-05, + "loss": 0.0067, + "step": 210 + }, + { + "epoch": 0.13470173187940987, + "eval_loss": 0.011816415004432201, + "eval_runtime": 2.4392, + "eval_samples_per_second": 5111.048, + "eval_steps_per_second": 159.887, + "step": 210 + }, + { + "epoch": 0.14111610006414368, + "grad_norm": 0.14731262624263763, + "learning_rate": 1.8588838999358566e-05, + "loss": 0.0058, + "step": 220 + }, + { + "epoch": 0.14111610006414368, + "eval_loss": 0.011285252869129181, + "eval_runtime": 2.592, + "eval_samples_per_second": 4809.839, + "eval_steps_per_second": 150.464, + "step": 220 + }, + { + "epoch": 0.14753046824887747, + "grad_norm": 0.20062246918678284, + "learning_rate": 1.8524695317511227e-05, + "loss": 0.0046, + "step": 230 + }, + { + "epoch": 0.14753046824887747, + "eval_loss": 0.011024047620594501, + "eval_runtime": 2.7581, + "eval_samples_per_second": 4520.102, + "eval_steps_per_second": 141.4, + "step": 230 + }, + { + "epoch": 0.1539448364336113, + "grad_norm": 0.08004695922136307, + "learning_rate": 1.8460551635663888e-05, + "loss": 0.0068, + "step": 240 + }, + { + "epoch": 0.1539448364336113, + "eval_loss": 0.01156590785831213, + "eval_runtime": 2.781, + "eval_samples_per_second": 4482.909, + "eval_steps_per_second": 140.237, + "step": 240 + }, + { + "epoch": 0.1603592046183451, + "grad_norm": 0.10583271086215973, + "learning_rate": 1.839640795381655e-05, + "loss": 0.0185, + "step": 250 + }, + { + "epoch": 0.1603592046183451, + "eval_loss": 0.010944708250463009, + "eval_runtime": 2.751, + "eval_samples_per_second": 4531.758, + "eval_steps_per_second": 141.765, + "step": 250 + }, + { + "epoch": 0.1667735728030789, + "grad_norm": 0.08356133848428726, + "learning_rate": 1.8332264271969214e-05, + "loss": 0.0047, + "step": 260 + }, + { + "epoch": 0.1667735728030789, + "eval_loss": 0.010701462626457214, + "eval_runtime": 2.7856, + "eval_samples_per_second": 4475.594, + "eval_steps_per_second": 140.008, + "step": 260 + }, + { + "epoch": 0.1731879409878127, + "grad_norm": 0.07829653471708298, + "learning_rate": 1.8268120590121874e-05, + "loss": 0.0063, + "step": 270 + }, + { + "epoch": 0.1731879409878127, + "eval_loss": 0.010307610966265202, + "eval_runtime": 2.7448, + "eval_samples_per_second": 4541.989, + "eval_steps_per_second": 142.085, + "step": 270 + }, + { + "epoch": 0.1796023091725465, + "grad_norm": 0.09023799002170563, + "learning_rate": 1.8203976908274535e-05, + "loss": 0.012, + "step": 280 + }, + { + "epoch": 0.1796023091725465, + "eval_loss": 0.010143229737877846, + "eval_runtime": 2.7579, + "eval_samples_per_second": 4520.523, + "eval_steps_per_second": 141.414, + "step": 280 + }, + { + "epoch": 0.1860166773572803, + "grad_norm": 0.09888483583927155, + "learning_rate": 1.81398332264272e-05, + "loss": 0.0061, + "step": 290 + }, + { + "epoch": 0.1860166773572803, + "eval_loss": 0.010016077198088169, + "eval_runtime": 2.7506, + "eval_samples_per_second": 4532.456, + "eval_steps_per_second": 141.787, + "step": 290 + }, + { + "epoch": 0.19243104554201412, + "grad_norm": 0.19275423884391785, + "learning_rate": 1.807568954457986e-05, + "loss": 0.004, + "step": 300 + }, + { + "epoch": 0.19243104554201412, + "eval_loss": 0.010140118189156055, + "eval_runtime": 2.7433, + "eval_samples_per_second": 4544.529, + "eval_steps_per_second": 142.165, + "step": 300 + }, + { + "epoch": 0.1988454137267479, + "grad_norm": 0.6617489457130432, + "learning_rate": 1.801154586273252e-05, + "loss": 0.004, + "step": 310 + }, + { + "epoch": 0.1988454137267479, + "eval_loss": 0.00996321253478527, + "eval_runtime": 2.7928, + "eval_samples_per_second": 4464.0, + "eval_steps_per_second": 139.645, + "step": 310 + }, + { + "epoch": 0.20525978191148173, + "grad_norm": 0.08185740560293198, + "learning_rate": 1.7947402180885186e-05, + "loss": 0.0033, + "step": 320 + }, + { + "epoch": 0.20525978191148173, + "eval_loss": 0.009846839122474194, + "eval_runtime": 2.7484, + "eval_samples_per_second": 4536.056, + "eval_steps_per_second": 141.9, + "step": 320 + }, + { + "epoch": 0.2116741500962155, + "grad_norm": 0.07048241049051285, + "learning_rate": 1.7883258499037847e-05, + "loss": 0.0035, + "step": 330 + }, + { + "epoch": 0.2116741500962155, + "eval_loss": 0.009670664556324482, + "eval_runtime": 2.8146, + "eval_samples_per_second": 4429.476, + "eval_steps_per_second": 138.565, + "step": 330 + }, + { + "epoch": 0.21808851828094933, + "grad_norm": 0.05294159799814224, + "learning_rate": 1.7819114817190508e-05, + "loss": 0.0063, + "step": 340 + }, + { + "epoch": 0.21808851828094933, + "eval_loss": 0.00945183914154768, + "eval_runtime": 2.7473, + "eval_samples_per_second": 4537.971, + "eval_steps_per_second": 141.959, + "step": 340 + }, + { + "epoch": 0.22450288646568314, + "grad_norm": 0.08298569917678833, + "learning_rate": 1.7754971135343172e-05, + "loss": 0.0029, + "step": 350 + }, + { + "epoch": 0.22450288646568314, + "eval_loss": 0.00936791580170393, + "eval_runtime": 2.7685, + "eval_samples_per_second": 4503.121, + "eval_steps_per_second": 140.869, + "step": 350 + }, + { + "epoch": 0.23091725465041693, + "grad_norm": 0.05115268751978874, + "learning_rate": 1.769082745349583e-05, + "loss": 0.0033, + "step": 360 + }, + { + "epoch": 0.23091725465041693, + "eval_loss": 0.009066049940884113, + "eval_runtime": 2.7558, + "eval_samples_per_second": 4523.868, + "eval_steps_per_second": 141.518, + "step": 360 + }, + { + "epoch": 0.23733162283515075, + "grad_norm": 0.05516692250967026, + "learning_rate": 1.7626683771648494e-05, + "loss": 0.0028, + "step": 370 + }, + { + "epoch": 0.23733162283515075, + "eval_loss": 0.008907307870686054, + "eval_runtime": 2.7915, + "eval_samples_per_second": 4466.028, + "eval_steps_per_second": 139.709, + "step": 370 + }, + { + "epoch": 0.24374599101988453, + "grad_norm": 0.04760660603642464, + "learning_rate": 1.7562540089801155e-05, + "loss": 0.0196, + "step": 380 + }, + { + "epoch": 0.24374599101988453, + "eval_loss": 0.008900969289243221, + "eval_runtime": 2.746, + "eval_samples_per_second": 4540.006, + "eval_steps_per_second": 142.023, + "step": 380 + }, + { + "epoch": 0.2501603592046183, + "grad_norm": 0.08314153552055359, + "learning_rate": 1.7498396407953816e-05, + "loss": 0.0031, + "step": 390 + }, + { + "epoch": 0.2501603592046183, + "eval_loss": 0.009380661882460117, + "eval_runtime": 2.7831, + "eval_samples_per_second": 4479.473, + "eval_steps_per_second": 140.129, + "step": 390 + }, + { + "epoch": 0.25657472738935216, + "grad_norm": 0.06383411586284637, + "learning_rate": 1.743425272610648e-05, + "loss": 0.0097, + "step": 400 + }, + { + "epoch": 0.25657472738935216, + "eval_loss": 0.007914945483207703, + "eval_runtime": 2.4093, + "eval_samples_per_second": 5174.549, + "eval_steps_per_second": 161.873, + "step": 400 + }, + { + "epoch": 0.26298909557408595, + "grad_norm": 0.05818384885787964, + "learning_rate": 1.737010904425914e-05, + "loss": 0.0284, + "step": 410 + }, + { + "epoch": 0.26298909557408595, + "eval_loss": 0.0076097771525382996, + "eval_runtime": 2.4156, + "eval_samples_per_second": 5160.954, + "eval_steps_per_second": 161.448, + "step": 410 + }, + { + "epoch": 0.26940346375881974, + "grad_norm": 0.04634961113333702, + "learning_rate": 1.7305965362411802e-05, + "loss": 0.0028, + "step": 420 + }, + { + "epoch": 0.26940346375881974, + "eval_loss": 0.007537364028394222, + "eval_runtime": 2.4317, + "eval_samples_per_second": 5126.827, + "eval_steps_per_second": 160.38, + "step": 420 + }, + { + "epoch": 0.2758178319435536, + "grad_norm": 0.05479798465967178, + "learning_rate": 1.7241821680564467e-05, + "loss": 0.0267, + "step": 430 + }, + { + "epoch": 0.2758178319435536, + "eval_loss": 0.007072314620018005, + "eval_runtime": 2.4166, + "eval_samples_per_second": 5158.936, + "eval_steps_per_second": 161.385, + "step": 430 + }, + { + "epoch": 0.28223220012828737, + "grad_norm": 0.05063086375594139, + "learning_rate": 1.7177677998717128e-05, + "loss": 0.0053, + "step": 440 + }, + { + "epoch": 0.28223220012828737, + "eval_loss": 0.007745321840047836, + "eval_runtime": 2.4878, + "eval_samples_per_second": 5011.194, + "eval_steps_per_second": 156.763, + "step": 440 + }, + { + "epoch": 0.28864656831302116, + "grad_norm": 0.05067560821771622, + "learning_rate": 1.711353431686979e-05, + "loss": 0.0154, + "step": 450 + }, + { + "epoch": 0.28864656831302116, + "eval_loss": 0.007782284170389175, + "eval_runtime": 2.7555, + "eval_samples_per_second": 4524.446, + "eval_steps_per_second": 141.536, + "step": 450 + }, + { + "epoch": 0.29506093649775494, + "grad_norm": 0.0449526272714138, + "learning_rate": 1.7049390635022453e-05, + "loss": 0.0024, + "step": 460 + }, + { + "epoch": 0.29506093649775494, + "eval_loss": 0.007555678952485323, + "eval_runtime": 2.7641, + "eval_samples_per_second": 4510.352, + "eval_steps_per_second": 141.095, + "step": 460 + }, + { + "epoch": 0.3014753046824888, + "grad_norm": 0.051270872354507446, + "learning_rate": 1.6985246953175114e-05, + "loss": 0.0026, + "step": 470 + }, + { + "epoch": 0.3014753046824888, + "eval_loss": 0.007208110298961401, + "eval_runtime": 2.7392, + "eval_samples_per_second": 4551.376, + "eval_steps_per_second": 142.379, + "step": 470 + }, + { + "epoch": 0.3078896728672226, + "grad_norm": 0.03896138072013855, + "learning_rate": 1.6921103271327775e-05, + "loss": 0.008, + "step": 480 + }, + { + "epoch": 0.3078896728672226, + "eval_loss": 0.007339117117226124, + "eval_runtime": 2.7424, + "eval_samples_per_second": 4546.013, + "eval_steps_per_second": 142.211, + "step": 480 + }, + { + "epoch": 0.31430404105195636, + "grad_norm": 0.16523483395576477, + "learning_rate": 1.6856959589480436e-05, + "loss": 0.0088, + "step": 490 + }, + { + "epoch": 0.31430404105195636, + "eval_loss": 0.010751989670097828, + "eval_runtime": 2.7719, + "eval_samples_per_second": 4497.586, + "eval_steps_per_second": 140.696, + "step": 490 + }, + { + "epoch": 0.3207184092366902, + "grad_norm": 0.037684451788663864, + "learning_rate": 1.67928159076331e-05, + "loss": 0.0031, + "step": 500 + }, + { + "epoch": 0.3207184092366902, + "eval_loss": 0.007628251798450947, + "eval_runtime": 2.7331, + "eval_samples_per_second": 4561.511, + "eval_steps_per_second": 142.696, + "step": 500 + }, + { + "epoch": 0.327132777421424, + "grad_norm": 0.06561500579118729, + "learning_rate": 1.672867222578576e-05, + "loss": 0.0022, + "step": 510 + }, + { + "epoch": 0.327132777421424, + "eval_loss": 0.007246845401823521, + "eval_runtime": 2.7607, + "eval_samples_per_second": 4515.928, + "eval_steps_per_second": 141.27, + "step": 510 + }, + { + "epoch": 0.3335471456061578, + "grad_norm": 0.041504278779029846, + "learning_rate": 1.6664528543938422e-05, + "loss": 0.0019, + "step": 520 + }, + { + "epoch": 0.3335471456061578, + "eval_loss": 0.0074407560750842094, + "eval_runtime": 2.7513, + "eval_samples_per_second": 4531.364, + "eval_steps_per_second": 141.753, + "step": 520 + }, + { + "epoch": 0.3399615137908916, + "grad_norm": 0.03790243715047836, + "learning_rate": 1.6600384862091086e-05, + "loss": 0.0019, + "step": 530 + }, + { + "epoch": 0.3399615137908916, + "eval_loss": 0.007536882068961859, + "eval_runtime": 2.769, + "eval_samples_per_second": 4502.413, + "eval_steps_per_second": 140.847, + "step": 530 + }, + { + "epoch": 0.3463758819756254, + "grad_norm": 0.0355646014213562, + "learning_rate": 1.6536241180243747e-05, + "loss": 0.0019, + "step": 540 + }, + { + "epoch": 0.3463758819756254, + "eval_loss": 0.0075461240485310555, + "eval_runtime": 2.413, + "eval_samples_per_second": 5166.639, + "eval_steps_per_second": 161.626, + "step": 540 + }, + { + "epoch": 0.3527902501603592, + "grad_norm": 0.04195011779665947, + "learning_rate": 1.6472097498396408e-05, + "loss": 0.0018, + "step": 550 + }, + { + "epoch": 0.3527902501603592, + "eval_loss": 0.007501190062612295, + "eval_runtime": 2.4208, + "eval_samples_per_second": 5150.019, + "eval_steps_per_second": 161.106, + "step": 550 + }, + { + "epoch": 0.359204618345093, + "grad_norm": 0.28111106157302856, + "learning_rate": 1.6407953816549073e-05, + "loss": 0.0132, + "step": 560 + }, + { + "epoch": 0.359204618345093, + "eval_loss": 0.006708688568323851, + "eval_runtime": 2.4465, + "eval_samples_per_second": 5095.942, + "eval_steps_per_second": 159.414, + "step": 560 + }, + { + "epoch": 0.36561898652982683, + "grad_norm": 0.030719993636012077, + "learning_rate": 1.6343810134701734e-05, + "loss": 0.0018, + "step": 570 + }, + { + "epoch": 0.36561898652982683, + "eval_loss": 0.00691966200247407, + "eval_runtime": 2.4163, + "eval_samples_per_second": 5159.519, + "eval_steps_per_second": 161.403, + "step": 570 + }, + { + "epoch": 0.3720333547145606, + "grad_norm": 0.028909320011734962, + "learning_rate": 1.6279666452854395e-05, + "loss": 0.0018, + "step": 580 + }, + { + "epoch": 0.3720333547145606, + "eval_loss": 0.006832743063569069, + "eval_runtime": 2.4073, + "eval_samples_per_second": 5178.73, + "eval_steps_per_second": 162.004, + "step": 580 + }, + { + "epoch": 0.3784477228992944, + "grad_norm": 0.5586157441139221, + "learning_rate": 1.621552277100706e-05, + "loss": 0.0017, + "step": 590 + }, + { + "epoch": 0.3784477228992944, + "eval_loss": 0.0066373394802212715, + "eval_runtime": 2.4151, + "eval_samples_per_second": 5162.11, + "eval_steps_per_second": 161.484, + "step": 590 + }, + { + "epoch": 0.38486209108402825, + "grad_norm": 0.029421871528029442, + "learning_rate": 1.615137908915972e-05, + "loss": 0.0015, + "step": 600 + }, + { + "epoch": 0.38486209108402825, + "eval_loss": 0.0064350636675953865, + "eval_runtime": 2.4015, + "eval_samples_per_second": 5191.282, + "eval_steps_per_second": 162.397, + "step": 600 + }, + { + "epoch": 0.39127645926876203, + "grad_norm": 0.028947781771421432, + "learning_rate": 1.608723540731238e-05, + "loss": 0.0022, + "step": 610 + }, + { + "epoch": 0.39127645926876203, + "eval_loss": 0.00638926774263382, + "eval_runtime": 2.414, + "eval_samples_per_second": 5164.477, + "eval_steps_per_second": 161.558, + "step": 610 + }, + { + "epoch": 0.3976908274534958, + "grad_norm": 0.06888407468795776, + "learning_rate": 1.6023091725465042e-05, + "loss": 0.0016, + "step": 620 + }, + { + "epoch": 0.3976908274534958, + "eval_loss": 0.006634836550801992, + "eval_runtime": 2.4135, + "eval_samples_per_second": 5165.479, + "eval_steps_per_second": 161.59, + "step": 620 + }, + { + "epoch": 0.4041051956382296, + "grad_norm": 0.04158307611942291, + "learning_rate": 1.5958948043617706e-05, + "loss": 0.0187, + "step": 630 + }, + { + "epoch": 0.4041051956382296, + "eval_loss": 0.006342691835016012, + "eval_runtime": 2.4083, + "eval_samples_per_second": 5176.599, + "eval_steps_per_second": 161.937, + "step": 630 + }, + { + "epoch": 0.41051956382296345, + "grad_norm": 0.02934635430574417, + "learning_rate": 1.5894804361770367e-05, + "loss": 0.0015, + "step": 640 + }, + { + "epoch": 0.41051956382296345, + "eval_loss": 0.00618335185572505, + "eval_runtime": 2.4101, + "eval_samples_per_second": 5172.867, + "eval_steps_per_second": 161.821, + "step": 640 + }, + { + "epoch": 0.41693393200769724, + "grad_norm": 0.030286366119980812, + "learning_rate": 1.5830660679923028e-05, + "loss": 0.0015, + "step": 650 + }, + { + "epoch": 0.41693393200769724, + "eval_loss": 0.006189233157783747, + "eval_runtime": 2.4471, + "eval_samples_per_second": 5094.534, + "eval_steps_per_second": 159.37, + "step": 650 + }, + { + "epoch": 0.423348300192431, + "grad_norm": 0.030619481578469276, + "learning_rate": 1.5766516998075692e-05, + "loss": 0.0016, + "step": 660 + }, + { + "epoch": 0.423348300192431, + "eval_loss": 0.0063001601956784725, + "eval_runtime": 2.417, + "eval_samples_per_second": 5157.96, + "eval_steps_per_second": 161.354, + "step": 660 + }, + { + "epoch": 0.42976266837716487, + "grad_norm": 11.238213539123535, + "learning_rate": 1.5702373316228353e-05, + "loss": 0.0194, + "step": 670 + }, + { + "epoch": 0.42976266837716487, + "eval_loss": 0.006720089819282293, + "eval_runtime": 2.6201, + "eval_samples_per_second": 4758.272, + "eval_steps_per_second": 148.851, + "step": 670 + }, + { + "epoch": 0.43617703656189866, + "grad_norm": 0.02961154095828533, + "learning_rate": 1.5638229634381014e-05, + "loss": 0.0014, + "step": 680 + }, + { + "epoch": 0.43617703656189866, + "eval_loss": 0.006322733126580715, + "eval_runtime": 2.7635, + "eval_samples_per_second": 4511.389, + "eval_steps_per_second": 141.128, + "step": 680 + }, + { + "epoch": 0.44259140474663244, + "grad_norm": 0.03423071652650833, + "learning_rate": 1.557408595253368e-05, + "loss": 0.013, + "step": 690 + }, + { + "epoch": 0.44259140474663244, + "eval_loss": 0.005984561517834663, + "eval_runtime": 2.7417, + "eval_samples_per_second": 4547.152, + "eval_steps_per_second": 142.247, + "step": 690 + }, + { + "epoch": 0.4490057729313663, + "grad_norm": 0.04294706881046295, + "learning_rate": 1.550994227068634e-05, + "loss": 0.0226, + "step": 700 + }, + { + "epoch": 0.4490057729313663, + "eval_loss": 0.006303212605416775, + "eval_runtime": 2.783, + "eval_samples_per_second": 4479.762, + "eval_steps_per_second": 140.139, + "step": 700 + }, + { + "epoch": 0.4554201411161001, + "grad_norm": 0.04019453376531601, + "learning_rate": 1.5445798588839e-05, + "loss": 0.0016, + "step": 710 + }, + { + "epoch": 0.4554201411161001, + "eval_loss": 0.0058699618093669415, + "eval_runtime": 2.7406, + "eval_samples_per_second": 4549.084, + "eval_steps_per_second": 142.307, + "step": 710 + }, + { + "epoch": 0.46183450930083386, + "grad_norm": 0.0327487550675869, + "learning_rate": 1.5381654906991665e-05, + "loss": 0.0018, + "step": 720 + }, + { + "epoch": 0.46183450930083386, + "eval_loss": 0.0058822124265134335, + "eval_runtime": 2.7933, + "eval_samples_per_second": 4463.192, + "eval_steps_per_second": 139.62, + "step": 720 + }, + { + "epoch": 0.46824887748556765, + "grad_norm": 0.025891833007335663, + "learning_rate": 1.5317511225144322e-05, + "loss": 0.0013, + "step": 730 + }, + { + "epoch": 0.46824887748556765, + "eval_loss": 0.00716716842725873, + "eval_runtime": 2.7388, + "eval_samples_per_second": 4551.919, + "eval_steps_per_second": 142.396, + "step": 730 + }, + { + "epoch": 0.4746632456703015, + "grad_norm": 0.03438916057348251, + "learning_rate": 1.5253367543296987e-05, + "loss": 0.0015, + "step": 740 + }, + { + "epoch": 0.4746632456703015, + "eval_loss": 0.007040859200060368, + "eval_runtime": 2.7963, + "eval_samples_per_second": 4458.353, + "eval_steps_per_second": 139.469, + "step": 740 + }, + { + "epoch": 0.4810776138550353, + "grad_norm": 0.02543172985315323, + "learning_rate": 1.518922386144965e-05, + "loss": 0.0025, + "step": 750 + }, + { + "epoch": 0.4810776138550353, + "eval_loss": 0.006040054839104414, + "eval_runtime": 2.7444, + "eval_samples_per_second": 4542.754, + "eval_steps_per_second": 142.109, + "step": 750 + }, + { + "epoch": 0.48749198203976907, + "grad_norm": 0.02564307488501072, + "learning_rate": 1.5125080179602309e-05, + "loss": 0.0012, + "step": 760 + }, + { + "epoch": 0.48749198203976907, + "eval_loss": 0.006054045632481575, + "eval_runtime": 2.7961, + "eval_samples_per_second": 4458.733, + "eval_steps_per_second": 139.481, + "step": 760 + }, + { + "epoch": 0.4939063502245029, + "grad_norm": 0.026773959398269653, + "learning_rate": 1.5060936497754971e-05, + "loss": 0.0084, + "step": 770 + }, + { + "epoch": 0.4939063502245029, + "eval_loss": 0.006040018983185291, + "eval_runtime": 2.7423, + "eval_samples_per_second": 4546.213, + "eval_steps_per_second": 142.217, + "step": 770 + }, + { + "epoch": 0.5003207184092366, + "grad_norm": 0.02138395607471466, + "learning_rate": 1.4996792815907636e-05, + "loss": 0.0022, + "step": 780 + }, + { + "epoch": 0.5003207184092366, + "eval_loss": 0.006401837337762117, + "eval_runtime": 2.7962, + "eval_samples_per_second": 4458.477, + "eval_steps_per_second": 139.473, + "step": 780 + }, + { + "epoch": 0.5067350865939705, + "grad_norm": 0.021893974393606186, + "learning_rate": 1.4932649134060295e-05, + "loss": 0.0032, + "step": 790 + }, + { + "epoch": 0.5067350865939705, + "eval_loss": 0.006350088398903608, + "eval_runtime": 2.7416, + "eval_samples_per_second": 4547.347, + "eval_steps_per_second": 142.253, + "step": 790 + }, + { + "epoch": 0.5131494547787043, + "grad_norm": 0.026742270216345787, + "learning_rate": 1.4868505452212958e-05, + "loss": 0.0492, + "step": 800 + }, + { + "epoch": 0.5131494547787043, + "eval_loss": 0.005826563574373722, + "eval_runtime": 2.7898, + "eval_samples_per_second": 4468.753, + "eval_steps_per_second": 139.794, + "step": 800 + }, + { + "epoch": 0.5195638229634381, + "grad_norm": 0.03277752548456192, + "learning_rate": 1.480436177036562e-05, + "loss": 0.0014, + "step": 810 + }, + { + "epoch": 0.5195638229634381, + "eval_loss": 0.005615294445306063, + "eval_runtime": 2.743, + "eval_samples_per_second": 4545.047, + "eval_steps_per_second": 142.181, + "step": 810 + }, + { + "epoch": 0.5259781911481719, + "grad_norm": 0.03740512579679489, + "learning_rate": 1.4740218088518281e-05, + "loss": 0.0014, + "step": 820 + }, + { + "epoch": 0.5259781911481719, + "eval_loss": 0.0055384160950779915, + "eval_runtime": 2.7995, + "eval_samples_per_second": 4453.26, + "eval_steps_per_second": 139.309, + "step": 820 + }, + { + "epoch": 0.5323925593329057, + "grad_norm": 0.02818211168050766, + "learning_rate": 1.4676074406670944e-05, + "loss": 0.0016, + "step": 830 + }, + { + "epoch": 0.5323925593329057, + "eval_loss": 0.006212199572473764, + "eval_runtime": 2.747, + "eval_samples_per_second": 4538.341, + "eval_steps_per_second": 141.971, + "step": 830 + }, + { + "epoch": 0.5388069275176395, + "grad_norm": 0.02874821238219738, + "learning_rate": 1.4611930724823606e-05, + "loss": 0.0023, + "step": 840 + }, + { + "epoch": 0.5388069275176395, + "eval_loss": 0.0062833670526742935, + "eval_runtime": 2.7804, + "eval_samples_per_second": 4483.918, + "eval_steps_per_second": 140.269, + "step": 840 + }, + { + "epoch": 0.5452212957023733, + "grad_norm": 15.998806953430176, + "learning_rate": 1.4547787042976269e-05, + "loss": 0.0069, + "step": 850 + }, + { + "epoch": 0.5452212957023733, + "eval_loss": 0.005439049564301968, + "eval_runtime": 2.7374, + "eval_samples_per_second": 4554.31, + "eval_steps_per_second": 142.471, + "step": 850 + }, + { + "epoch": 0.5516356638871072, + "grad_norm": 0.13971057534217834, + "learning_rate": 1.448364336112893e-05, + "loss": 0.0104, + "step": 860 + }, + { + "epoch": 0.5516356638871072, + "eval_loss": 0.007052511442452669, + "eval_runtime": 2.7895, + "eval_samples_per_second": 4469.333, + "eval_steps_per_second": 139.812, + "step": 860 + }, + { + "epoch": 0.5580500320718409, + "grad_norm": 1.057011604309082, + "learning_rate": 1.4419499679281593e-05, + "loss": 0.0205, + "step": 870 + }, + { + "epoch": 0.5580500320718409, + "eval_loss": 0.006652463227510452, + "eval_runtime": 2.4224, + "eval_samples_per_second": 5146.536, + "eval_steps_per_second": 160.997, + "step": 870 + }, + { + "epoch": 0.5644644002565747, + "grad_norm": 0.02542971819639206, + "learning_rate": 1.4355355997434255e-05, + "loss": 0.0033, + "step": 880 + }, + { + "epoch": 0.5644644002565747, + "eval_loss": 0.005239278543740511, + "eval_runtime": 2.4966, + "eval_samples_per_second": 4993.593, + "eval_steps_per_second": 156.213, + "step": 880 + }, + { + "epoch": 0.5708787684413086, + "grad_norm": 0.024873876944184303, + "learning_rate": 1.4291212315586915e-05, + "loss": 0.0011, + "step": 890 + }, + { + "epoch": 0.5708787684413086, + "eval_loss": 0.006826899945735931, + "eval_runtime": 2.4524, + "eval_samples_per_second": 5083.67, + "eval_steps_per_second": 159.03, + "step": 890 + }, + { + "epoch": 0.5772931366260423, + "grad_norm": 0.022238241508603096, + "learning_rate": 1.4227068633739577e-05, + "loss": 0.0012, + "step": 900 + }, + { + "epoch": 0.5772931366260423, + "eval_loss": 0.0071626221761107445, + "eval_runtime": 2.4183, + "eval_samples_per_second": 5155.357, + "eval_steps_per_second": 161.273, + "step": 900 + }, + { + "epoch": 0.5837075048107762, + "grad_norm": 0.029160836711525917, + "learning_rate": 1.416292495189224e-05, + "loss": 0.0012, + "step": 910 + }, + { + "epoch": 0.5837075048107762, + "eval_loss": 0.007146148942410946, + "eval_runtime": 2.4316, + "eval_samples_per_second": 5127.12, + "eval_steps_per_second": 160.39, + "step": 910 + }, + { + "epoch": 0.5901218729955099, + "grad_norm": 0.020332586020231247, + "learning_rate": 1.40987812700449e-05, + "loss": 0.001, + "step": 920 + }, + { + "epoch": 0.5901218729955099, + "eval_loss": 0.007053141016513109, + "eval_runtime": 2.4123, + "eval_samples_per_second": 5168.073, + "eval_steps_per_second": 161.671, + "step": 920 + }, + { + "epoch": 0.5965362411802437, + "grad_norm": 0.019170017912983894, + "learning_rate": 1.4034637588197563e-05, + "loss": 0.001, + "step": 930 + }, + { + "epoch": 0.5965362411802437, + "eval_loss": 0.006995479576289654, + "eval_runtime": 2.4138, + "eval_samples_per_second": 5164.946, + "eval_steps_per_second": 161.573, + "step": 930 + }, + { + "epoch": 0.6029506093649776, + "grad_norm": 0.01940876618027687, + "learning_rate": 1.3970493906350226e-05, + "loss": 0.001, + "step": 940 + }, + { + "epoch": 0.6029506093649776, + "eval_loss": 0.006961450912058353, + "eval_runtime": 2.4248, + "eval_samples_per_second": 5141.427, + "eval_steps_per_second": 160.837, + "step": 940 + }, + { + "epoch": 0.6093649775497113, + "grad_norm": 0.023615067824721336, + "learning_rate": 1.3906350224502887e-05, + "loss": 0.0205, + "step": 950 + }, + { + "epoch": 0.6093649775497113, + "eval_loss": 0.006625923793762922, + "eval_runtime": 2.4163, + "eval_samples_per_second": 5159.503, + "eval_steps_per_second": 161.403, + "step": 950 + }, + { + "epoch": 0.6157793457344451, + "grad_norm": 0.35211464762687683, + "learning_rate": 1.384220654265555e-05, + "loss": 0.0025, + "step": 960 + }, + { + "epoch": 0.6157793457344451, + "eval_loss": 0.005356335546821356, + "eval_runtime": 2.4192, + "eval_samples_per_second": 5153.285, + "eval_steps_per_second": 161.208, + "step": 960 + }, + { + "epoch": 0.622193713919179, + "grad_norm": 0.028275813907384872, + "learning_rate": 1.3778062860808212e-05, + "loss": 0.0127, + "step": 970 + }, + { + "epoch": 0.622193713919179, + "eval_loss": 0.0050421105697751045, + "eval_runtime": 2.4233, + "eval_samples_per_second": 5144.725, + "eval_steps_per_second": 160.94, + "step": 970 + }, + { + "epoch": 0.6286080821039127, + "grad_norm": 0.026007099077105522, + "learning_rate": 1.3713919178960873e-05, + "loss": 0.001, + "step": 980 + }, + { + "epoch": 0.6286080821039127, + "eval_loss": 0.004812104627490044, + "eval_runtime": 2.4411, + "eval_samples_per_second": 5107.113, + "eval_steps_per_second": 159.764, + "step": 980 + }, + { + "epoch": 0.6350224502886466, + "grad_norm": 0.030545761808753014, + "learning_rate": 1.3649775497113536e-05, + "loss": 0.001, + "step": 990 + }, + { + "epoch": 0.6350224502886466, + "eval_loss": 0.004728221334517002, + "eval_runtime": 2.4142, + "eval_samples_per_second": 5164.069, + "eval_steps_per_second": 161.545, + "step": 990 + }, + { + "epoch": 0.6414368184733804, + "grad_norm": 0.018988870084285736, + "learning_rate": 1.3585631815266199e-05, + "loss": 0.0018, + "step": 1000 + }, + { + "epoch": 0.6414368184733804, + "eval_loss": 0.004783857148140669, + "eval_runtime": 2.4228, + "eval_samples_per_second": 5145.609, + "eval_steps_per_second": 160.968, + "step": 1000 + }, + { + "epoch": 0.6478511866581141, + "grad_norm": 0.022379985079169273, + "learning_rate": 1.3521488133418858e-05, + "loss": 0.0009, + "step": 1010 + }, + { + "epoch": 0.6478511866581141, + "eval_loss": 0.005252769682556391, + "eval_runtime": 2.6978, + "eval_samples_per_second": 4621.196, + "eval_steps_per_second": 144.563, + "step": 1010 + }, + { + "epoch": 0.654265554842848, + "grad_norm": 0.04967594891786575, + "learning_rate": 1.345734445157152e-05, + "loss": 0.001, + "step": 1020 + }, + { + "epoch": 0.654265554842848, + "eval_loss": 0.005410985555499792, + "eval_runtime": 2.7624, + "eval_samples_per_second": 4513.09, + "eval_steps_per_second": 141.181, + "step": 1020 + }, + { + "epoch": 0.6606799230275818, + "grad_norm": 0.021347397938370705, + "learning_rate": 1.3393200769724183e-05, + "loss": 0.001, + "step": 1030 + }, + { + "epoch": 0.6606799230275818, + "eval_loss": 0.005168409552425146, + "eval_runtime": 2.811, + "eval_samples_per_second": 4435.142, + "eval_steps_per_second": 138.743, + "step": 1030 + }, + { + "epoch": 0.6670942912123156, + "grad_norm": 0.038585614413022995, + "learning_rate": 1.3329057087876844e-05, + "loss": 0.0082, + "step": 1040 + }, + { + "epoch": 0.6670942912123156, + "eval_loss": 0.006633167155086994, + "eval_runtime": 2.7415, + "eval_samples_per_second": 4547.528, + "eval_steps_per_second": 142.258, + "step": 1040 + }, + { + "epoch": 0.6735086593970494, + "grad_norm": 0.02415415085852146, + "learning_rate": 1.3264913406029507e-05, + "loss": 0.0009, + "step": 1050 + }, + { + "epoch": 0.6735086593970494, + "eval_loss": 0.00695871701464057, + "eval_runtime": 2.7897, + "eval_samples_per_second": 4468.88, + "eval_steps_per_second": 139.798, + "step": 1050 + }, + { + "epoch": 0.6799230275817832, + "grad_norm": 0.03703638166189194, + "learning_rate": 1.320076972418217e-05, + "loss": 0.0033, + "step": 1060 + }, + { + "epoch": 0.6799230275817832, + "eval_loss": 0.004926735535264015, + "eval_runtime": 2.747, + "eval_samples_per_second": 4538.411, + "eval_steps_per_second": 141.973, + "step": 1060 + }, + { + "epoch": 0.686337395766517, + "grad_norm": 0.019467538222670555, + "learning_rate": 1.313662604233483e-05, + "loss": 0.0009, + "step": 1070 + }, + { + "epoch": 0.686337395766517, + "eval_loss": 0.004651096649467945, + "eval_runtime": 2.8053, + "eval_samples_per_second": 4444.041, + "eval_steps_per_second": 139.021, + "step": 1070 + }, + { + "epoch": 0.6927517639512508, + "grad_norm": 0.019495300948619843, + "learning_rate": 1.3072482360487493e-05, + "loss": 0.0008, + "step": 1080 + }, + { + "epoch": 0.6927517639512508, + "eval_loss": 0.004747034516185522, + "eval_runtime": 2.7655, + "eval_samples_per_second": 4508.104, + "eval_steps_per_second": 141.025, + "step": 1080 + }, + { + "epoch": 0.6991661321359846, + "grad_norm": 0.017888143658638, + "learning_rate": 1.3008338678640156e-05, + "loss": 0.0025, + "step": 1090 + }, + { + "epoch": 0.6991661321359846, + "eval_loss": 0.004640842322260141, + "eval_runtime": 2.764, + "eval_samples_per_second": 4510.484, + "eval_steps_per_second": 141.1, + "step": 1090 + }, + { + "epoch": 0.7055805003207184, + "grad_norm": 0.016517043113708496, + "learning_rate": 1.2944194996792817e-05, + "loss": 0.0028, + "step": 1100 + }, + { + "epoch": 0.7055805003207184, + "eval_loss": 0.004688124172389507, + "eval_runtime": 2.7296, + "eval_samples_per_second": 4567.403, + "eval_steps_per_second": 142.88, + "step": 1100 + }, + { + "epoch": 0.7119948685054522, + "grad_norm": 0.021053675562143326, + "learning_rate": 1.288005131494548e-05, + "loss": 0.0208, + "step": 1110 + }, + { + "epoch": 0.7119948685054522, + "eval_loss": 0.005344197154045105, + "eval_runtime": 2.7735, + "eval_samples_per_second": 4494.962, + "eval_steps_per_second": 140.614, + "step": 1110 + }, + { + "epoch": 0.718409236690186, + "grad_norm": 0.01877407915890217, + "learning_rate": 1.2815907633098142e-05, + "loss": 0.0008, + "step": 1120 + }, + { + "epoch": 0.718409236690186, + "eval_loss": 0.005765383131802082, + "eval_runtime": 2.7453, + "eval_samples_per_second": 4541.15, + "eval_steps_per_second": 142.059, + "step": 1120 + }, + { + "epoch": 0.7248236048749198, + "grad_norm": 0.018694570288062096, + "learning_rate": 1.2751763951250801e-05, + "loss": 0.0118, + "step": 1130 + }, + { + "epoch": 0.7248236048749198, + "eval_loss": 0.005010578315705061, + "eval_runtime": 2.7586, + "eval_samples_per_second": 4519.321, + "eval_steps_per_second": 141.376, + "step": 1130 + }, + { + "epoch": 0.7312379730596537, + "grad_norm": 0.024368854239583015, + "learning_rate": 1.2687620269403464e-05, + "loss": 0.0012, + "step": 1140 + }, + { + "epoch": 0.7312379730596537, + "eval_loss": 0.0046744393184781075, + "eval_runtime": 2.7454, + "eval_samples_per_second": 4541.114, + "eval_steps_per_second": 142.058, + "step": 1140 + }, + { + "epoch": 0.7376523412443874, + "grad_norm": 0.01632387563586235, + "learning_rate": 1.2623476587556126e-05, + "loss": 0.0008, + "step": 1150 + }, + { + "epoch": 0.7376523412443874, + "eval_loss": 0.005472252145409584, + "eval_runtime": 2.7352, + "eval_samples_per_second": 4557.999, + "eval_steps_per_second": 142.586, + "step": 1150 + }, + { + "epoch": 0.7440667094291212, + "grad_norm": 0.01977686956524849, + "learning_rate": 1.2559332905708787e-05, + "loss": 0.0113, + "step": 1160 + }, + { + "epoch": 0.7440667094291212, + "eval_loss": 0.004667737055569887, + "eval_runtime": 2.7817, + "eval_samples_per_second": 4481.825, + "eval_steps_per_second": 140.203, + "step": 1160 + }, + { + "epoch": 0.7504810776138551, + "grad_norm": 0.01677733100950718, + "learning_rate": 1.249518922386145e-05, + "loss": 0.0008, + "step": 1170 + }, + { + "epoch": 0.7504810776138551, + "eval_loss": 0.004439252428710461, + "eval_runtime": 2.7489, + "eval_samples_per_second": 4535.208, + "eval_steps_per_second": 141.873, + "step": 1170 + }, + { + "epoch": 0.7568954457985888, + "grad_norm": 0.01814711093902588, + "learning_rate": 1.2431045542014113e-05, + "loss": 0.0008, + "step": 1180 + }, + { + "epoch": 0.7568954457985888, + "eval_loss": 0.0044731213711202145, + "eval_runtime": 2.7808, + "eval_samples_per_second": 4483.222, + "eval_steps_per_second": 140.247, + "step": 1180 + }, + { + "epoch": 0.7633098139833226, + "grad_norm": 0.01768704131245613, + "learning_rate": 1.2366901860166775e-05, + "loss": 0.0008, + "step": 1190 + }, + { + "epoch": 0.7633098139833226, + "eval_loss": 0.00443003186956048, + "eval_runtime": 2.7322, + "eval_samples_per_second": 4562.948, + "eval_steps_per_second": 142.741, + "step": 1190 + }, + { + "epoch": 0.7697241821680565, + "grad_norm": 0.018905559554696083, + "learning_rate": 1.2302758178319436e-05, + "loss": 0.0009, + "step": 1200 + }, + { + "epoch": 0.7697241821680565, + "eval_loss": 0.0043729268945753574, + "eval_runtime": 2.7422, + "eval_samples_per_second": 4546.366, + "eval_steps_per_second": 142.222, + "step": 1200 + }, + { + "epoch": 0.7761385503527902, + "grad_norm": 0.016700776293873787, + "learning_rate": 1.2238614496472099e-05, + "loss": 0.0007, + "step": 1210 + }, + { + "epoch": 0.7761385503527902, + "eval_loss": 0.004335304256528616, + "eval_runtime": 2.7415, + "eval_samples_per_second": 4547.513, + "eval_steps_per_second": 142.258, + "step": 1210 + }, + { + "epoch": 0.7825529185375241, + "grad_norm": 0.017289504408836365, + "learning_rate": 1.2174470814624762e-05, + "loss": 0.0007, + "step": 1220 + }, + { + "epoch": 0.7825529185375241, + "eval_loss": 0.004327945411205292, + "eval_runtime": 2.7495, + "eval_samples_per_second": 4534.227, + "eval_steps_per_second": 141.842, + "step": 1220 + }, + { + "epoch": 0.7889672867222579, + "grad_norm": 0.01972338743507862, + "learning_rate": 1.2110327132777423e-05, + "loss": 0.0008, + "step": 1230 + }, + { + "epoch": 0.7889672867222579, + "eval_loss": 0.004332894925028086, + "eval_runtime": 2.7828, + "eval_samples_per_second": 4480.07, + "eval_steps_per_second": 140.148, + "step": 1230 + }, + { + "epoch": 0.7953816549069916, + "grad_norm": 0.7810164093971252, + "learning_rate": 1.2046183450930085e-05, + "loss": 0.001, + "step": 1240 + }, + { + "epoch": 0.7953816549069916, + "eval_loss": 0.0043570250272750854, + "eval_runtime": 2.7393, + "eval_samples_per_second": 4551.141, + "eval_steps_per_second": 142.371, + "step": 1240 + }, + { + "epoch": 0.8017960230917255, + "grad_norm": 0.013027627021074295, + "learning_rate": 1.1982039769082748e-05, + "loss": 0.001, + "step": 1250 + }, + { + "epoch": 0.8017960230917255, + "eval_loss": 0.00467626703903079, + "eval_runtime": 2.7636, + "eval_samples_per_second": 4511.123, + "eval_steps_per_second": 141.12, + "step": 1250 + }, + { + "epoch": 0.8082103912764592, + "grad_norm": 0.019782286137342453, + "learning_rate": 1.1917896087235407e-05, + "loss": 0.0009, + "step": 1260 + }, + { + "epoch": 0.8082103912764592, + "eval_loss": 0.004564644303172827, + "eval_runtime": 2.7335, + "eval_samples_per_second": 4560.869, + "eval_steps_per_second": 142.676, + "step": 1260 + }, + { + "epoch": 0.8146247594611931, + "grad_norm": 0.019999559968709946, + "learning_rate": 1.185375240538807e-05, + "loss": 0.0007, + "step": 1270 + }, + { + "epoch": 0.8146247594611931, + "eval_loss": 0.004904980771243572, + "eval_runtime": 2.732, + "eval_samples_per_second": 4563.357, + "eval_steps_per_second": 142.754, + "step": 1270 + }, + { + "epoch": 0.8210391276459269, + "grad_norm": 0.02157980017364025, + "learning_rate": 1.1789608723540732e-05, + "loss": 0.0213, + "step": 1280 + }, + { + "epoch": 0.8210391276459269, + "eval_loss": 0.005200786981731653, + "eval_runtime": 2.7722, + "eval_samples_per_second": 4497.167, + "eval_steps_per_second": 140.683, + "step": 1280 + }, + { + "epoch": 0.8274534958306606, + "grad_norm": 0.01738838665187359, + "learning_rate": 1.1725465041693393e-05, + "loss": 0.0008, + "step": 1290 + }, + { + "epoch": 0.8274534958306606, + "eval_loss": 0.005184350069612265, + "eval_runtime": 2.7378, + "eval_samples_per_second": 4553.624, + "eval_steps_per_second": 142.449, + "step": 1290 + }, + { + "epoch": 0.8338678640153945, + "grad_norm": 0.014976155944168568, + "learning_rate": 1.1661321359846056e-05, + "loss": 0.0186, + "step": 1300 + }, + { + "epoch": 0.8338678640153945, + "eval_loss": 0.005122625734657049, + "eval_runtime": 2.7545, + "eval_samples_per_second": 4526.014, + "eval_steps_per_second": 141.585, + "step": 1300 + }, + { + "epoch": 0.8402822322001283, + "grad_norm": 0.022448979318141937, + "learning_rate": 1.1597177677998719e-05, + "loss": 0.0009, + "step": 1310 + }, + { + "epoch": 0.8402822322001283, + "eval_loss": 0.005329828709363937, + "eval_runtime": 2.757, + "eval_samples_per_second": 4521.993, + "eval_steps_per_second": 141.46, + "step": 1310 + }, + { + "epoch": 0.846696600384862, + "grad_norm": 0.022593596950173378, + "learning_rate": 1.153303399615138e-05, + "loss": 0.0009, + "step": 1320 + }, + { + "epoch": 0.846696600384862, + "eval_loss": 0.005285405088216066, + "eval_runtime": 2.7287, + "eval_samples_per_second": 4568.789, + "eval_steps_per_second": 142.924, + "step": 1320 + }, + { + "epoch": 0.8531109685695959, + "grad_norm": 0.013608959503471851, + "learning_rate": 1.1468890314304042e-05, + "loss": 0.0009, + "step": 1330 + }, + { + "epoch": 0.8531109685695959, + "eval_loss": 0.0051283277571201324, + "eval_runtime": 2.4129, + "eval_samples_per_second": 5166.744, + "eval_steps_per_second": 161.629, + "step": 1330 + }, + { + "epoch": 0.8595253367543297, + "grad_norm": 1.3526451587677002, + "learning_rate": 1.1404746632456705e-05, + "loss": 0.0196, + "step": 1340 + }, + { + "epoch": 0.8595253367543297, + "eval_loss": 0.004866322036832571, + "eval_runtime": 2.3974, + "eval_samples_per_second": 5200.213, + "eval_steps_per_second": 162.676, + "step": 1340 + }, + { + "epoch": 0.8659397049390635, + "grad_norm": 0.025603530928492546, + "learning_rate": 1.1340602950609366e-05, + "loss": 0.0008, + "step": 1350 + }, + { + "epoch": 0.8659397049390635, + "eval_loss": 0.004412362352013588, + "eval_runtime": 2.3987, + "eval_samples_per_second": 5197.483, + "eval_steps_per_second": 162.591, + "step": 1350 + }, + { + "epoch": 0.8723540731237973, + "grad_norm": 0.02262856625020504, + "learning_rate": 1.1276459268762029e-05, + "loss": 0.0008, + "step": 1360 + }, + { + "epoch": 0.8723540731237973, + "eval_loss": 0.0042611462995409966, + "eval_runtime": 2.3923, + "eval_samples_per_second": 5211.293, + "eval_steps_per_second": 163.023, + "step": 1360 + }, + { + "epoch": 0.8787684413085312, + "grad_norm": 0.018682410940527916, + "learning_rate": 1.1212315586914691e-05, + "loss": 0.0008, + "step": 1370 + }, + { + "epoch": 0.8787684413085312, + "eval_loss": 0.004274127539247274, + "eval_runtime": 2.4094, + "eval_samples_per_second": 5174.395, + "eval_steps_per_second": 161.868, + "step": 1370 + }, + { + "epoch": 0.8851828094932649, + "grad_norm": 0.02168123796582222, + "learning_rate": 1.114817190506735e-05, + "loss": 0.0008, + "step": 1380 + }, + { + "epoch": 0.8851828094932649, + "eval_loss": 0.004351937212049961, + "eval_runtime": 2.4398, + "eval_samples_per_second": 5109.791, + "eval_steps_per_second": 159.847, + "step": 1380 + }, + { + "epoch": 0.8915971776779987, + "grad_norm": 0.01584063470363617, + "learning_rate": 1.1084028223220013e-05, + "loss": 0.0008, + "step": 1390 + }, + { + "epoch": 0.8915971776779987, + "eval_loss": 0.004328163340687752, + "eval_runtime": 2.4187, + "eval_samples_per_second": 5154.385, + "eval_steps_per_second": 161.242, + "step": 1390 + }, + { + "epoch": 0.8980115458627326, + "grad_norm": 0.02418622002005577, + "learning_rate": 1.1019884541372676e-05, + "loss": 0.0179, + "step": 1400 + }, + { + "epoch": 0.8980115458627326, + "eval_loss": 0.004465954378247261, + "eval_runtime": 2.4217, + "eval_samples_per_second": 5148.09, + "eval_steps_per_second": 161.046, + "step": 1400 + }, + { + "epoch": 0.9044259140474663, + "grad_norm": 0.03038007766008377, + "learning_rate": 1.0955740859525337e-05, + "loss": 0.0008, + "step": 1410 + }, + { + "epoch": 0.9044259140474663, + "eval_loss": 0.004557340405881405, + "eval_runtime": 2.4249, + "eval_samples_per_second": 5141.298, + "eval_steps_per_second": 160.833, + "step": 1410 + }, + { + "epoch": 0.9108402822322001, + "grad_norm": 0.022822652012109756, + "learning_rate": 1.0891597177678e-05, + "loss": 0.001, + "step": 1420 + }, + { + "epoch": 0.9108402822322001, + "eval_loss": 0.0044282288290560246, + "eval_runtime": 2.4459, + "eval_samples_per_second": 5097.083, + "eval_steps_per_second": 159.45, + "step": 1420 + }, + { + "epoch": 0.9172546504169339, + "grad_norm": 0.019525406882166862, + "learning_rate": 1.0827453495830662e-05, + "loss": 0.0009, + "step": 1430 + }, + { + "epoch": 0.9172546504169339, + "eval_loss": 0.004077858291566372, + "eval_runtime": 2.4255, + "eval_samples_per_second": 5139.973, + "eval_steps_per_second": 160.792, + "step": 1430 + }, + { + "epoch": 0.9236690186016677, + "grad_norm": 0.02353576384484768, + "learning_rate": 1.0763309813983323e-05, + "loss": 0.033, + "step": 1440 + }, + { + "epoch": 0.9236690186016677, + "eval_loss": 0.0042343405075371265, + "eval_runtime": 2.4218, + "eval_samples_per_second": 5147.784, + "eval_steps_per_second": 161.036, + "step": 1440 + }, + { + "epoch": 0.9300833867864016, + "grad_norm": 0.019818825647234917, + "learning_rate": 1.0699166132135986e-05, + "loss": 0.001, + "step": 1450 + }, + { + "epoch": 0.9300833867864016, + "eval_loss": 0.004364923574030399, + "eval_runtime": 2.419, + "eval_samples_per_second": 5153.833, + "eval_steps_per_second": 161.225, + "step": 1450 + }, + { + "epoch": 0.9364977549711353, + "grad_norm": 0.08867379277944565, + "learning_rate": 1.0635022450288648e-05, + "loss": 0.001, + "step": 1460 + }, + { + "epoch": 0.9364977549711353, + "eval_loss": 0.004264220129698515, + "eval_runtime": 2.4136, + "eval_samples_per_second": 5165.211, + "eval_steps_per_second": 161.581, + "step": 1460 + }, + { + "epoch": 0.9429121231558691, + "grad_norm": 0.014765486121177673, + "learning_rate": 1.0570878768441307e-05, + "loss": 0.0008, + "step": 1470 + }, + { + "epoch": 0.9429121231558691, + "eval_loss": 0.004075978416949511, + "eval_runtime": 2.4088, + "eval_samples_per_second": 5175.684, + "eval_steps_per_second": 161.909, + "step": 1470 + }, + { + "epoch": 0.949326491340603, + "grad_norm": 0.06277275830507278, + "learning_rate": 1.0506735086593972e-05, + "loss": 0.0008, + "step": 1480 + }, + { + "epoch": 0.949326491340603, + "eval_loss": 0.003993914928287268, + "eval_runtime": 2.4044, + "eval_samples_per_second": 5185.179, + "eval_steps_per_second": 162.206, + "step": 1480 + }, + { + "epoch": 0.9557408595253367, + "grad_norm": 0.015658292919397354, + "learning_rate": 1.0442591404746634e-05, + "loss": 0.0007, + "step": 1490 + }, + { + "epoch": 0.9557408595253367, + "eval_loss": 0.003960499074310064, + "eval_runtime": 2.4068, + "eval_samples_per_second": 5179.913, + "eval_steps_per_second": 162.041, + "step": 1490 + }, + { + "epoch": 0.9621552277100706, + "grad_norm": 0.01588534004986286, + "learning_rate": 1.0378447722899297e-05, + "loss": 0.0006, + "step": 1500 + }, + { + "epoch": 0.9621552277100706, + "eval_loss": 0.003956847358494997, + "eval_runtime": 2.4027, + "eval_samples_per_second": 5188.731, + "eval_steps_per_second": 162.317, + "step": 1500 + }, + { + "epoch": 0.9685695958948044, + "grad_norm": 0.02051517553627491, + "learning_rate": 1.0314304041051956e-05, + "loss": 0.0007, + "step": 1510 + }, + { + "epoch": 0.9685695958948044, + "eval_loss": 0.003954595420509577, + "eval_runtime": 2.4096, + "eval_samples_per_second": 5173.928, + "eval_steps_per_second": 161.854, + "step": 1510 + }, + { + "epoch": 0.9749839640795381, + "grad_norm": 0.014356785453855991, + "learning_rate": 1.0250160359204619e-05, + "loss": 0.0008, + "step": 1520 + }, + { + "epoch": 0.9749839640795381, + "eval_loss": 0.003977358806878328, + "eval_runtime": 2.4108, + "eval_samples_per_second": 5171.403, + "eval_steps_per_second": 161.775, + "step": 1520 + }, + { + "epoch": 0.981398332264272, + "grad_norm": 0.017719434574246407, + "learning_rate": 1.0186016677357282e-05, + "loss": 0.0041, + "step": 1530 + }, + { + "epoch": 0.981398332264272, + "eval_loss": 0.0038570731412619352, + "eval_runtime": 2.4041, + "eval_samples_per_second": 5185.718, + "eval_steps_per_second": 162.223, + "step": 1530 + }, + { + "epoch": 0.9878127004490058, + "grad_norm": 0.013949839398264885, + "learning_rate": 1.0121872995509943e-05, + "loss": 0.0006, + "step": 1540 + }, + { + "epoch": 0.9878127004490058, + "eval_loss": 0.003924272954463959, + "eval_runtime": 2.4092, + "eval_samples_per_second": 5174.835, + "eval_steps_per_second": 161.882, + "step": 1540 + }, + { + "epoch": 0.9942270686337396, + "grad_norm": 0.014598234556615353, + "learning_rate": 1.0057729313662605e-05, + "loss": 0.0006, + "step": 1550 + }, + { + "epoch": 0.9942270686337396, + "eval_loss": 0.003942632116377354, + "eval_runtime": 2.4412, + "eval_samples_per_second": 5106.919, + "eval_steps_per_second": 159.758, + "step": 1550 + }, + { + "epoch": 1.0006414368184733, + "grad_norm": 0.011457240208983421, + "learning_rate": 9.993585631815266e-06, + "loss": 0.004, + "step": 1560 + }, + { + "epoch": 1.0006414368184733, + "eval_loss": 0.004661895334720612, + "eval_runtime": 2.4112, + "eval_samples_per_second": 5170.457, + "eval_steps_per_second": 161.745, + "step": 1560 + }, + { + "epoch": 1.0070558050032072, + "grad_norm": 0.01190961617976427, + "learning_rate": 9.929441949967929e-06, + "loss": 0.0007, + "step": 1570 + }, + { + "epoch": 1.0070558050032072, + "eval_loss": 0.004823943134397268, + "eval_runtime": 2.4047, + "eval_samples_per_second": 5184.371, + "eval_steps_per_second": 162.181, + "step": 1570 + }, + { + "epoch": 1.013470173187941, + "grad_norm": 0.33191996812820435, + "learning_rate": 9.865298268120592e-06, + "loss": 0.0007, + "step": 1580 + }, + { + "epoch": 1.013470173187941, + "eval_loss": 0.004680118057876825, + "eval_runtime": 2.4166, + "eval_samples_per_second": 5159.007, + "eval_steps_per_second": 161.387, + "step": 1580 + }, + { + "epoch": 1.0198845413726747, + "grad_norm": 0.011999332346022129, + "learning_rate": 9.801154586273252e-06, + "loss": 0.0006, + "step": 1590 + }, + { + "epoch": 1.0198845413726747, + "eval_loss": 0.004358895123004913, + "eval_runtime": 2.4225, + "eval_samples_per_second": 5146.254, + "eval_steps_per_second": 160.988, + "step": 1590 + }, + { + "epoch": 1.0262989095574087, + "grad_norm": 0.012122240848839283, + "learning_rate": 9.737010904425915e-06, + "loss": 0.0006, + "step": 1600 + }, + { + "epoch": 1.0262989095574087, + "eval_loss": 0.0042406474240124226, + "eval_runtime": 2.4348, + "eval_samples_per_second": 5120.284, + "eval_steps_per_second": 160.176, + "step": 1600 + }, + { + "epoch": 1.0327132777421424, + "grad_norm": 0.011430976912379265, + "learning_rate": 9.672867222578576e-06, + "loss": 0.0006, + "step": 1610 + }, + { + "epoch": 1.0327132777421424, + "eval_loss": 0.0041888197883963585, + "eval_runtime": 2.4379, + "eval_samples_per_second": 5113.784, + "eval_steps_per_second": 159.972, + "step": 1610 + }, + { + "epoch": 1.0391276459268761, + "grad_norm": 0.013485722243785858, + "learning_rate": 9.608723540731239e-06, + "loss": 0.0006, + "step": 1620 + }, + { + "epoch": 1.0391276459268761, + "eval_loss": 0.004160948097705841, + "eval_runtime": 2.432, + "eval_samples_per_second": 5126.259, + "eval_steps_per_second": 160.363, + "step": 1620 + }, + { + "epoch": 1.04554201411161, + "grad_norm": 0.012653365731239319, + "learning_rate": 9.544579858883901e-06, + "loss": 0.0006, + "step": 1630 + }, + { + "epoch": 1.04554201411161, + "eval_loss": 0.004139183554798365, + "eval_runtime": 2.4264, + "eval_samples_per_second": 5137.96, + "eval_steps_per_second": 160.729, + "step": 1630 + }, + { + "epoch": 1.0519563822963438, + "grad_norm": 0.011114208027720451, + "learning_rate": 9.480436177036562e-06, + "loss": 0.0221, + "step": 1640 + }, + { + "epoch": 1.0519563822963438, + "eval_loss": 0.0040536741726100445, + "eval_runtime": 2.4288, + "eval_samples_per_second": 5132.976, + "eval_steps_per_second": 160.573, + "step": 1640 + }, + { + "epoch": 1.0583707504810775, + "grad_norm": 0.012626360170543194, + "learning_rate": 9.416292495189225e-06, + "loss": 0.0006, + "step": 1650 + }, + { + "epoch": 1.0583707504810775, + "eval_loss": 0.0038833727594465017, + "eval_runtime": 2.4227, + "eval_samples_per_second": 5145.84, + "eval_steps_per_second": 160.975, + "step": 1650 + }, + { + "epoch": 1.0647851186658115, + "grad_norm": 0.014159155078232288, + "learning_rate": 9.352148813341888e-06, + "loss": 0.0006, + "step": 1660 + }, + { + "epoch": 1.0647851186658115, + "eval_loss": 0.0038352308329194784, + "eval_runtime": 2.4498, + "eval_samples_per_second": 5088.937, + "eval_steps_per_second": 159.195, + "step": 1660 + }, + { + "epoch": 1.0711994868505452, + "grad_norm": 0.012477426789700985, + "learning_rate": 9.288005131494549e-06, + "loss": 0.0006, + "step": 1670 + }, + { + "epoch": 1.0711994868505452, + "eval_loss": 0.003819151548668742, + "eval_runtime": 2.4787, + "eval_samples_per_second": 5029.751, + "eval_steps_per_second": 157.344, + "step": 1670 + }, + { + "epoch": 1.077613855035279, + "grad_norm": 0.015881817787885666, + "learning_rate": 9.22386144964721e-06, + "loss": 0.0006, + "step": 1680 + }, + { + "epoch": 1.077613855035279, + "eval_loss": 0.0038124537095427513, + "eval_runtime": 2.7882, + "eval_samples_per_second": 4471.31, + "eval_steps_per_second": 139.874, + "step": 1680 + }, + { + "epoch": 1.084028223220013, + "grad_norm": 0.012099322862923145, + "learning_rate": 9.159717767799872e-06, + "loss": 0.0006, + "step": 1690 + }, + { + "epoch": 1.084028223220013, + "eval_loss": 0.003807657863944769, + "eval_runtime": 2.7594, + "eval_samples_per_second": 4518.067, + "eval_steps_per_second": 141.337, + "step": 1690 + }, + { + "epoch": 1.0904425914047466, + "grad_norm": 0.013776600360870361, + "learning_rate": 9.095574085952535e-06, + "loss": 0.0006, + "step": 1700 + }, + { + "epoch": 1.0904425914047466, + "eval_loss": 0.003805548418313265, + "eval_runtime": 2.7956, + "eval_samples_per_second": 4459.47, + "eval_steps_per_second": 139.504, + "step": 1700 + }, + { + "epoch": 1.0968569595894804, + "grad_norm": 0.011048905551433563, + "learning_rate": 9.031430404105196e-06, + "loss": 0.0005, + "step": 1710 + }, + { + "epoch": 1.0968569595894804, + "eval_loss": 0.0038041335064917803, + "eval_runtime": 2.7701, + "eval_samples_per_second": 4500.54, + "eval_steps_per_second": 140.789, + "step": 1710 + }, + { + "epoch": 1.1032713277742143, + "grad_norm": 0.015901437029242516, + "learning_rate": 8.967286722257858e-06, + "loss": 0.0005, + "step": 1720 + }, + { + "epoch": 1.1032713277742143, + "eval_loss": 0.0038053819444030523, + "eval_runtime": 2.8155, + "eval_samples_per_second": 4428.021, + "eval_steps_per_second": 138.52, + "step": 1720 + }, + { + "epoch": 1.109685695958948, + "grad_norm": 0.01117046270519495, + "learning_rate": 8.90314304041052e-06, + "loss": 0.0006, + "step": 1730 + }, + { + "epoch": 1.109685695958948, + "eval_loss": 0.0038178153336048126, + "eval_runtime": 2.7607, + "eval_samples_per_second": 4515.815, + "eval_steps_per_second": 141.266, + "step": 1730 + }, + { + "epoch": 1.1161000641436818, + "grad_norm": 0.0106582622975111, + "learning_rate": 8.838999358563182e-06, + "loss": 0.0005, + "step": 1740 + }, + { + "epoch": 1.1161000641436818, + "eval_loss": 0.0038231906946748495, + "eval_runtime": 2.7615, + "eval_samples_per_second": 4514.606, + "eval_steps_per_second": 141.229, + "step": 1740 + }, + { + "epoch": 1.1225144323284157, + "grad_norm": 0.011909229680895805, + "learning_rate": 8.774855676715845e-06, + "loss": 0.0005, + "step": 1750 + }, + { + "epoch": 1.1225144323284157, + "eval_loss": 0.0038271723315119743, + "eval_runtime": 2.7512, + "eval_samples_per_second": 4531.54, + "eval_steps_per_second": 141.758, + "step": 1750 + }, + { + "epoch": 1.1289288005131495, + "grad_norm": 0.011566118337213993, + "learning_rate": 8.710711994868506e-06, + "loss": 0.0006, + "step": 1760 + }, + { + "epoch": 1.1289288005131495, + "eval_loss": 0.0038472446613013744, + "eval_runtime": 2.7699, + "eval_samples_per_second": 4500.962, + "eval_steps_per_second": 140.802, + "step": 1760 + }, + { + "epoch": 1.1353431686978832, + "grad_norm": 0.011672618798911572, + "learning_rate": 8.646568313021168e-06, + "loss": 0.0006, + "step": 1770 + }, + { + "epoch": 1.1353431686978832, + "eval_loss": 0.0039085946045815945, + "eval_runtime": 2.7447, + "eval_samples_per_second": 4542.214, + "eval_steps_per_second": 142.092, + "step": 1770 + }, + { + "epoch": 1.1417575368826172, + "grad_norm": 0.011934245936572552, + "learning_rate": 8.582424631173831e-06, + "loss": 0.0027, + "step": 1780 + }, + { + "epoch": 1.1417575368826172, + "eval_loss": 0.003827283624559641, + "eval_runtime": 2.7992, + "eval_samples_per_second": 4453.718, + "eval_steps_per_second": 139.324, + "step": 1780 + }, + { + "epoch": 1.148171905067351, + "grad_norm": 0.011731648817658424, + "learning_rate": 8.518280949326492e-06, + "loss": 0.021, + "step": 1790 + }, + { + "epoch": 1.148171905067351, + "eval_loss": 0.00375727703794837, + "eval_runtime": 2.7483, + "eval_samples_per_second": 4536.229, + "eval_steps_per_second": 141.905, + "step": 1790 + }, + { + "epoch": 1.1545862732520846, + "grad_norm": 0.015154370106756687, + "learning_rate": 8.454137267479155e-06, + "loss": 0.0006, + "step": 1800 + }, + { + "epoch": 1.1545862732520846, + "eval_loss": 0.003766178386285901, + "eval_runtime": 2.7826, + "eval_samples_per_second": 4480.277, + "eval_steps_per_second": 140.155, + "step": 1800 + }, + { + "epoch": 1.1610006414368184, + "grad_norm": 0.017620213329792023, + "learning_rate": 8.389993585631815e-06, + "loss": 0.0169, + "step": 1810 + }, + { + "epoch": 1.1610006414368184, + "eval_loss": 0.0037458380684256554, + "eval_runtime": 2.7342, + "eval_samples_per_second": 4559.694, + "eval_steps_per_second": 142.639, + "step": 1810 + }, + { + "epoch": 1.1674150096215523, + "grad_norm": 0.014909962192177773, + "learning_rate": 8.325849903784478e-06, + "loss": 0.0006, + "step": 1820 + }, + { + "epoch": 1.1674150096215523, + "eval_loss": 0.0036850275937467813, + "eval_runtime": 2.7781, + "eval_samples_per_second": 4487.627, + "eval_steps_per_second": 140.385, + "step": 1820 + }, + { + "epoch": 1.173829377806286, + "grad_norm": 0.03852078691124916, + "learning_rate": 8.26170622193714e-06, + "loss": 0.0006, + "step": 1830 + }, + { + "epoch": 1.173829377806286, + "eval_loss": 0.0036811623722314835, + "eval_runtime": 2.7264, + "eval_samples_per_second": 4572.666, + "eval_steps_per_second": 143.045, + "step": 1830 + }, + { + "epoch": 1.18024374599102, + "grad_norm": 0.013608761131763458, + "learning_rate": 8.197562540089802e-06, + "loss": 0.0006, + "step": 1840 + }, + { + "epoch": 1.18024374599102, + "eval_loss": 0.0036842951085418463, + "eval_runtime": 2.7401, + "eval_samples_per_second": 4549.854, + "eval_steps_per_second": 142.331, + "step": 1840 + }, + { + "epoch": 1.1866581141757537, + "grad_norm": 0.01463257521390915, + "learning_rate": 8.133418858242463e-06, + "loss": 0.0006, + "step": 1850 + }, + { + "epoch": 1.1866581141757537, + "eval_loss": 0.0036834382917732, + "eval_runtime": 2.7684, + "eval_samples_per_second": 4503.347, + "eval_steps_per_second": 140.876, + "step": 1850 + }, + { + "epoch": 1.1930724823604875, + "grad_norm": 0.015022198669612408, + "learning_rate": 8.069275176395125e-06, + "loss": 0.0006, + "step": 1860 + }, + { + "epoch": 1.1930724823604875, + "eval_loss": 0.003694625571370125, + "eval_runtime": 2.7501, + "eval_samples_per_second": 4533.313, + "eval_steps_per_second": 141.814, + "step": 1860 + }, + { + "epoch": 1.1994868505452212, + "grad_norm": 0.011660662479698658, + "learning_rate": 8.005131494547788e-06, + "loss": 0.0006, + "step": 1870 + }, + { + "epoch": 1.1994868505452212, + "eval_loss": 0.003708133939653635, + "eval_runtime": 2.7645, + "eval_samples_per_second": 4509.744, + "eval_steps_per_second": 141.076, + "step": 1870 + }, + { + "epoch": 1.2059012187299551, + "grad_norm": 0.01334298774600029, + "learning_rate": 7.940987812700449e-06, + "loss": 0.0005, + "step": 1880 + }, + { + "epoch": 1.2059012187299551, + "eval_loss": 0.003715616650879383, + "eval_runtime": 2.7401, + "eval_samples_per_second": 4549.905, + "eval_steps_per_second": 142.333, + "step": 1880 + }, + { + "epoch": 1.2123155869146889, + "grad_norm": 0.012951839715242386, + "learning_rate": 7.876844130853112e-06, + "loss": 0.0006, + "step": 1890 + }, + { + "epoch": 1.2123155869146889, + "eval_loss": 0.0037382924929261208, + "eval_runtime": 2.7682, + "eval_samples_per_second": 4503.721, + "eval_steps_per_second": 140.888, + "step": 1890 + }, + { + "epoch": 1.2187299550994226, + "grad_norm": 0.009976466186344624, + "learning_rate": 7.812700449005774e-06, + "loss": 0.0005, + "step": 1900 + }, + { + "epoch": 1.2187299550994226, + "eval_loss": 0.0037695877254009247, + "eval_runtime": 2.7309, + "eval_samples_per_second": 4565.132, + "eval_steps_per_second": 142.809, + "step": 1900 + }, + { + "epoch": 1.2251443232841566, + "grad_norm": 0.012881353497505188, + "learning_rate": 7.748556767158437e-06, + "loss": 0.0005, + "step": 1910 + }, + { + "epoch": 1.2251443232841566, + "eval_loss": 0.003783053020015359, + "eval_runtime": 2.5865, + "eval_samples_per_second": 4820.049, + "eval_steps_per_second": 150.784, + "step": 1910 + }, + { + "epoch": 1.2315586914688903, + "grad_norm": 0.01235857605934143, + "learning_rate": 7.684413085311098e-06, + "loss": 0.0005, + "step": 1920 + }, + { + "epoch": 1.2315586914688903, + "eval_loss": 0.0037915727589279413, + "eval_runtime": 2.4029, + "eval_samples_per_second": 5188.243, + "eval_steps_per_second": 162.302, + "step": 1920 + }, + { + "epoch": 1.237973059653624, + "grad_norm": 0.01356592122465372, + "learning_rate": 7.620269403463759e-06, + "loss": 0.0006, + "step": 1930 + }, + { + "epoch": 1.237973059653624, + "eval_loss": 0.003795850556343794, + "eval_runtime": 2.4046, + "eval_samples_per_second": 5184.593, + "eval_steps_per_second": 162.187, + "step": 1930 + }, + { + "epoch": 1.244387427838358, + "grad_norm": 0.01199817843735218, + "learning_rate": 7.556125721616422e-06, + "loss": 0.0005, + "step": 1940 + }, + { + "epoch": 1.244387427838358, + "eval_loss": 0.003800132079049945, + "eval_runtime": 2.4031, + "eval_samples_per_second": 5187.901, + "eval_steps_per_second": 162.291, + "step": 1940 + }, + { + "epoch": 1.2508017960230917, + "grad_norm": 0.00958781223744154, + "learning_rate": 7.491982039769083e-06, + "loss": 0.0005, + "step": 1950 + }, + { + "epoch": 1.2508017960230917, + "eval_loss": 0.0038033805321902037, + "eval_runtime": 2.4027, + "eval_samples_per_second": 5188.694, + "eval_steps_per_second": 162.316, + "step": 1950 + }, + { + "epoch": 1.2572161642078254, + "grad_norm": 0.011594674549996853, + "learning_rate": 7.427838357921745e-06, + "loss": 0.0005, + "step": 1960 + }, + { + "epoch": 1.2572161642078254, + "eval_loss": 0.003806302323937416, + "eval_runtime": 2.39, + "eval_samples_per_second": 5216.258, + "eval_steps_per_second": 163.178, + "step": 1960 + }, + { + "epoch": 1.2636305323925594, + "grad_norm": 0.012927345000207424, + "learning_rate": 7.363694676074408e-06, + "loss": 0.0005, + "step": 1970 + }, + { + "epoch": 1.2636305323925594, + "eval_loss": 0.0038109412416815758, + "eval_runtime": 2.3902, + "eval_samples_per_second": 5215.912, + "eval_steps_per_second": 163.167, + "step": 1970 + }, + { + "epoch": 1.2700449005772931, + "grad_norm": 0.010677590034902096, + "learning_rate": 7.2995509942270695e-06, + "loss": 0.0005, + "step": 1980 + }, + { + "epoch": 1.2700449005772931, + "eval_loss": 0.0038150884211063385, + "eval_runtime": 2.4019, + "eval_samples_per_second": 5190.418, + "eval_steps_per_second": 162.37, + "step": 1980 + }, + { + "epoch": 1.2764592687620269, + "grad_norm": 0.009424679912626743, + "learning_rate": 7.2354073123797304e-06, + "loss": 0.0209, + "step": 1990 + }, + { + "epoch": 1.2764592687620269, + "eval_loss": 0.0037959227338433266, + "eval_runtime": 2.4118, + "eval_samples_per_second": 5169.261, + "eval_steps_per_second": 161.708, + "step": 1990 + }, + { + "epoch": 1.2828736369467608, + "grad_norm": 0.014829086139798164, + "learning_rate": 7.171263630532393e-06, + "loss": 0.0006, + "step": 2000 + }, + { + "epoch": 1.2828736369467608, + "eval_loss": 0.00383403105661273, + "eval_runtime": 2.4131, + "eval_samples_per_second": 5166.39, + "eval_steps_per_second": 161.618, + "step": 2000 + }, + { + "epoch": 1.2892880051314946, + "grad_norm": 0.01788959838449955, + "learning_rate": 7.107119948685055e-06, + "loss": 0.0005, + "step": 2010 + }, + { + "epoch": 1.2892880051314946, + "eval_loss": 0.0038744392804801464, + "eval_runtime": 2.4295, + "eval_samples_per_second": 5131.522, + "eval_steps_per_second": 160.527, + "step": 2010 + }, + { + "epoch": 1.2957023733162283, + "grad_norm": 0.013126869685947895, + "learning_rate": 7.042976266837717e-06, + "loss": 0.0006, + "step": 2020 + }, + { + "epoch": 1.2957023733162283, + "eval_loss": 0.0038970729801803827, + "eval_runtime": 2.4135, + "eval_samples_per_second": 5165.481, + "eval_steps_per_second": 161.59, + "step": 2020 + }, + { + "epoch": 1.3021167415009622, + "grad_norm": 0.010365926660597324, + "learning_rate": 6.978832584990379e-06, + "loss": 0.0005, + "step": 2030 + }, + { + "epoch": 1.3021167415009622, + "eval_loss": 0.00391175365075469, + "eval_runtime": 2.4159, + "eval_samples_per_second": 5160.436, + "eval_steps_per_second": 161.432, + "step": 2030 + }, + { + "epoch": 1.308531109685696, + "grad_norm": 0.014084501191973686, + "learning_rate": 6.914688903143041e-06, + "loss": 0.0005, + "step": 2040 + }, + { + "epoch": 1.308531109685696, + "eval_loss": 0.003916487097740173, + "eval_runtime": 2.412, + "eval_samples_per_second": 5168.686, + "eval_steps_per_second": 161.69, + "step": 2040 + }, + { + "epoch": 1.3149454778704297, + "grad_norm": 0.009805840440094471, + "learning_rate": 6.850545221295702e-06, + "loss": 0.0005, + "step": 2050 + }, + { + "epoch": 1.3149454778704297, + "eval_loss": 0.003910783212631941, + "eval_runtime": 2.5001, + "eval_samples_per_second": 4986.515, + "eval_steps_per_second": 155.991, + "step": 2050 + }, + { + "epoch": 1.3213598460551634, + "grad_norm": 0.0099485469982028, + "learning_rate": 6.786401539448365e-06, + "loss": 0.0005, + "step": 2060 + }, + { + "epoch": 1.3213598460551634, + "eval_loss": 0.003903586184605956, + "eval_runtime": 2.7919, + "eval_samples_per_second": 4465.368, + "eval_steps_per_second": 139.688, + "step": 2060 + }, + { + "epoch": 1.3277742142398974, + "grad_norm": 0.01362858060747385, + "learning_rate": 6.7222578576010265e-06, + "loss": 0.0221, + "step": 2070 + }, + { + "epoch": 1.3277742142398974, + "eval_loss": 0.003886124351993203, + "eval_runtime": 2.7347, + "eval_samples_per_second": 4558.826, + "eval_steps_per_second": 142.612, + "step": 2070 + }, + { + "epoch": 1.3341885824246311, + "grad_norm": 0.010772572830319405, + "learning_rate": 6.658114175753689e-06, + "loss": 0.0005, + "step": 2080 + }, + { + "epoch": 1.3341885824246311, + "eval_loss": 0.003885190933942795, + "eval_runtime": 2.775, + "eval_samples_per_second": 4492.608, + "eval_steps_per_second": 140.54, + "step": 2080 + }, + { + "epoch": 1.340602950609365, + "grad_norm": 0.0132959159091115, + "learning_rate": 6.593970493906351e-06, + "loss": 0.0006, + "step": 2090 + }, + { + "epoch": 1.340602950609365, + "eval_loss": 0.0038879828061908484, + "eval_runtime": 2.7484, + "eval_samples_per_second": 4536.089, + "eval_steps_per_second": 141.901, + "step": 2090 + }, + { + "epoch": 1.3470173187940988, + "grad_norm": 0.028292661532759666, + "learning_rate": 6.529826812059013e-06, + "loss": 0.0006, + "step": 2100 + }, + { + "epoch": 1.3470173187940988, + "eval_loss": 0.0038887602277100086, + "eval_runtime": 2.7721, + "eval_samples_per_second": 4497.362, + "eval_steps_per_second": 140.689, + "step": 2100 + }, + { + "epoch": 1.3534316869788325, + "grad_norm": 0.014790890738368034, + "learning_rate": 6.4656831302116754e-06, + "loss": 0.0006, + "step": 2110 + }, + { + "epoch": 1.3534316869788325, + "eval_loss": 0.003893056884407997, + "eval_runtime": 2.7258, + "eval_samples_per_second": 4573.656, + "eval_steps_per_second": 143.076, + "step": 2110 + }, + { + "epoch": 1.3598460551635663, + "grad_norm": 0.018104661256074905, + "learning_rate": 6.401539448364336e-06, + "loss": 0.0005, + "step": 2120 + }, + { + "epoch": 1.3598460551635663, + "eval_loss": 0.0038974089547991753, + "eval_runtime": 2.7467, + "eval_samples_per_second": 4538.939, + "eval_steps_per_second": 141.99, + "step": 2120 + }, + { + "epoch": 1.3662604233483002, + "grad_norm": 0.011067189276218414, + "learning_rate": 6.337395766516998e-06, + "loss": 0.0005, + "step": 2130 + }, + { + "epoch": 1.3662604233483002, + "eval_loss": 0.00390303460881114, + "eval_runtime": 2.7776, + "eval_samples_per_second": 4488.473, + "eval_steps_per_second": 140.411, + "step": 2130 + }, + { + "epoch": 1.372674791533034, + "grad_norm": 0.022579031065106392, + "learning_rate": 6.273252084669661e-06, + "loss": 0.0005, + "step": 2140 + }, + { + "epoch": 1.372674791533034, + "eval_loss": 0.003907787147909403, + "eval_runtime": 2.7255, + "eval_samples_per_second": 4574.164, + "eval_steps_per_second": 143.092, + "step": 2140 + }, + { + "epoch": 1.379089159717768, + "grad_norm": 0.009309990331530571, + "learning_rate": 6.209108402822323e-06, + "loss": 0.0005, + "step": 2150 + }, + { + "epoch": 1.379089159717768, + "eval_loss": 0.003909664694219828, + "eval_runtime": 2.7365, + "eval_samples_per_second": 4555.883, + "eval_steps_per_second": 142.52, + "step": 2150 + }, + { + "epoch": 1.3855035279025016, + "grad_norm": 0.019414927810430527, + "learning_rate": 6.1449647209749844e-06, + "loss": 0.018, + "step": 2160 + }, + { + "epoch": 1.3855035279025016, + "eval_loss": 0.003788945497944951, + "eval_runtime": 2.5986, + "eval_samples_per_second": 4797.494, + "eval_steps_per_second": 150.078, + "step": 2160 + }, + { + "epoch": 1.3919178960872354, + "grad_norm": 0.01467384584248066, + "learning_rate": 6.080821039127647e-06, + "loss": 0.0006, + "step": 2170 + }, + { + "epoch": 1.3919178960872354, + "eval_loss": 0.003753120545297861, + "eval_runtime": 2.483, + "eval_samples_per_second": 5020.994, + "eval_steps_per_second": 157.07, + "step": 2170 + }, + { + "epoch": 1.398332264271969, + "grad_norm": 0.012767287902534008, + "learning_rate": 6.016677357280308e-06, + "loss": 0.0006, + "step": 2180 + }, + { + "epoch": 1.398332264271969, + "eval_loss": 0.0037547799292951822, + "eval_runtime": 2.7563, + "eval_samples_per_second": 4523.089, + "eval_steps_per_second": 141.494, + "step": 2180 + }, + { + "epoch": 1.404746632456703, + "grad_norm": 0.015485835261642933, + "learning_rate": 5.95253367543297e-06, + "loss": 0.0005, + "step": 2190 + }, + { + "epoch": 1.404746632456703, + "eval_loss": 0.0037627057172358036, + "eval_runtime": 2.7383, + "eval_samples_per_second": 4552.902, + "eval_steps_per_second": 142.427, + "step": 2190 + }, + { + "epoch": 1.4111610006414368, + "grad_norm": 0.01081483718007803, + "learning_rate": 5.8883899935856325e-06, + "loss": 0.0005, + "step": 2200 + }, + { + "epoch": 1.4111610006414368, + "eval_loss": 0.0037731672637164593, + "eval_runtime": 2.7497, + "eval_samples_per_second": 4533.983, + "eval_steps_per_second": 141.835, + "step": 2200 + }, + { + "epoch": 1.4175753688261707, + "grad_norm": 0.009181569330394268, + "learning_rate": 5.824246311738294e-06, + "loss": 0.0213, + "step": 2210 + }, + { + "epoch": 1.4175753688261707, + "eval_loss": 0.003773350967094302, + "eval_runtime": 2.7961, + "eval_samples_per_second": 4458.641, + "eval_steps_per_second": 139.478, + "step": 2210 + }, + { + "epoch": 1.4239897370109045, + "grad_norm": 0.01457177009433508, + "learning_rate": 5.760102629890956e-06, + "loss": 0.0006, + "step": 2220 + }, + { + "epoch": 1.4239897370109045, + "eval_loss": 0.0037766669411212206, + "eval_runtime": 2.7373, + "eval_samples_per_second": 4554.509, + "eval_steps_per_second": 142.477, + "step": 2220 + }, + { + "epoch": 1.4304041051956382, + "grad_norm": 0.017023414373397827, + "learning_rate": 5.695958948043619e-06, + "loss": 0.0005, + "step": 2230 + }, + { + "epoch": 1.4304041051956382, + "eval_loss": 0.003780537284910679, + "eval_runtime": 2.7984, + "eval_samples_per_second": 4455.052, + "eval_steps_per_second": 139.366, + "step": 2230 + }, + { + "epoch": 1.436818473380372, + "grad_norm": 0.024216625839471817, + "learning_rate": 5.63181526619628e-06, + "loss": 0.0006, + "step": 2240 + }, + { + "epoch": 1.436818473380372, + "eval_loss": 0.0037873839028179646, + "eval_runtime": 2.7322, + "eval_samples_per_second": 4562.914, + "eval_steps_per_second": 142.74, + "step": 2240 + }, + { + "epoch": 1.443232841565106, + "grad_norm": 0.013345190323889256, + "learning_rate": 5.567671584348942e-06, + "loss": 0.0005, + "step": 2250 + }, + { + "epoch": 1.443232841565106, + "eval_loss": 0.0037942323833703995, + "eval_runtime": 2.7887, + "eval_samples_per_second": 4470.565, + "eval_steps_per_second": 139.851, + "step": 2250 + }, + { + "epoch": 1.4496472097498396, + "grad_norm": 0.015292412601411343, + "learning_rate": 5.503527902501604e-06, + "loss": 0.0006, + "step": 2260 + }, + { + "epoch": 1.4496472097498396, + "eval_loss": 0.0038015488535165787, + "eval_runtime": 2.726, + "eval_samples_per_second": 4573.445, + "eval_steps_per_second": 143.069, + "step": 2260 + }, + { + "epoch": 1.4560615779345736, + "grad_norm": 0.008000009693205357, + "learning_rate": 5.439384220654266e-06, + "loss": 0.0005, + "step": 2270 + }, + { + "epoch": 1.4560615779345736, + "eval_loss": 0.003808696521446109, + "eval_runtime": 2.7534, + "eval_samples_per_second": 4527.914, + "eval_steps_per_second": 141.645, + "step": 2270 + }, + { + "epoch": 1.4624759461193073, + "grad_norm": 0.011347589083015919, + "learning_rate": 5.375240538806929e-06, + "loss": 0.0308, + "step": 2280 + }, + { + "epoch": 1.4624759461193073, + "eval_loss": 0.003851969027891755, + "eval_runtime": 2.5855, + "eval_samples_per_second": 4821.875, + "eval_steps_per_second": 150.841, + "step": 2280 + }, + { + "epoch": 1.468890314304041, + "grad_norm": 0.017860205844044685, + "learning_rate": 5.31109685695959e-06, + "loss": 0.0006, + "step": 2290 + }, + { + "epoch": 1.468890314304041, + "eval_loss": 0.00419242400676012, + "eval_runtime": 2.4698, + "eval_samples_per_second": 5047.81, + "eval_steps_per_second": 157.909, + "step": 2290 + }, + { + "epoch": 1.4753046824887748, + "grad_norm": 0.014923288486897945, + "learning_rate": 5.246953175112251e-06, + "loss": 0.0005, + "step": 2300 + }, + { + "epoch": 1.4753046824887748, + "eval_loss": 0.004501288756728172, + "eval_runtime": 2.7337, + "eval_samples_per_second": 4560.461, + "eval_steps_per_second": 142.663, + "step": 2300 + }, + { + "epoch": 1.4817190506735087, + "grad_norm": 0.05106737092137337, + "learning_rate": 5.182809493264914e-06, + "loss": 0.0006, + "step": 2310 + }, + { + "epoch": 1.4817190506735087, + "eval_loss": 0.0046123480424284935, + "eval_runtime": 2.7714, + "eval_samples_per_second": 4498.443, + "eval_steps_per_second": 140.723, + "step": 2310 + }, + { + "epoch": 1.4881334188582425, + "grad_norm": 0.018469417467713356, + "learning_rate": 5.118665811417576e-06, + "loss": 0.0006, + "step": 2320 + }, + { + "epoch": 1.4881334188582425, + "eval_loss": 0.004673714749515057, + "eval_runtime": 2.7182, + "eval_samples_per_second": 4586.408, + "eval_steps_per_second": 143.475, + "step": 2320 + }, + { + "epoch": 1.4945477870429762, + "grad_norm": 0.010447741486132145, + "learning_rate": 5.054522129570238e-06, + "loss": 0.0007, + "step": 2330 + }, + { + "epoch": 1.4945477870429762, + "eval_loss": 0.004349403083324432, + "eval_runtime": 2.7507, + "eval_samples_per_second": 4532.368, + "eval_steps_per_second": 141.784, + "step": 2330 + }, + { + "epoch": 1.5009621552277101, + "grad_norm": 0.012541081756353378, + "learning_rate": 4.990378447722899e-06, + "loss": 0.0005, + "step": 2340 + }, + { + "epoch": 1.5009621552277101, + "eval_loss": 0.004171546548604965, + "eval_runtime": 2.7531, + "eval_samples_per_second": 4528.382, + "eval_steps_per_second": 141.659, + "step": 2340 + }, + { + "epoch": 1.5073765234124439, + "grad_norm": 2.703684091567993, + "learning_rate": 4.926234765875561e-06, + "loss": 0.0142, + "step": 2350 + }, + { + "epoch": 1.5073765234124439, + "eval_loss": 0.004026424139738083, + "eval_runtime": 2.7277, + "eval_samples_per_second": 4570.578, + "eval_steps_per_second": 142.98, + "step": 2350 + }, + { + "epoch": 1.5137908915971776, + "grad_norm": 0.02144004963338375, + "learning_rate": 4.862091084028224e-06, + "loss": 0.0005, + "step": 2360 + }, + { + "epoch": 1.5137908915971776, + "eval_loss": 0.0038558936212211847, + "eval_runtime": 2.7816, + "eval_samples_per_second": 4481.995, + "eval_steps_per_second": 140.208, + "step": 2360 + }, + { + "epoch": 1.5202052597819113, + "grad_norm": 0.0292587261646986, + "learning_rate": 4.797947402180886e-06, + "loss": 0.0005, + "step": 2370 + }, + { + "epoch": 1.5202052597819113, + "eval_loss": 0.0038230891805142164, + "eval_runtime": 2.7173, + "eval_samples_per_second": 4588.073, + "eval_steps_per_second": 143.527, + "step": 2370 + }, + { + "epoch": 1.5266196279666453, + "grad_norm": 0.007578797172755003, + "learning_rate": 4.7338037203335474e-06, + "loss": 0.0005, + "step": 2380 + }, + { + "epoch": 1.5266196279666453, + "eval_loss": 0.003819518955424428, + "eval_runtime": 2.7184, + "eval_samples_per_second": 4586.149, + "eval_steps_per_second": 143.467, + "step": 2380 + }, + { + "epoch": 1.5330339961513793, + "grad_norm": 0.013671874068677425, + "learning_rate": 4.669660038486209e-06, + "loss": 0.0005, + "step": 2390 + }, + { + "epoch": 1.5330339961513793, + "eval_loss": 0.003823925508186221, + "eval_runtime": 2.7743, + "eval_samples_per_second": 4493.785, + "eval_steps_per_second": 140.577, + "step": 2390 + }, + { + "epoch": 1.539448364336113, + "grad_norm": 0.01323388610035181, + "learning_rate": 4.605516356638872e-06, + "loss": 0.0005, + "step": 2400 + }, + { + "epoch": 1.539448364336113, + "eval_loss": 0.003829265246167779, + "eval_runtime": 2.7303, + "eval_samples_per_second": 4566.097, + "eval_steps_per_second": 142.839, + "step": 2400 + }, + { + "epoch": 1.5458627325208467, + "grad_norm": 0.015381712466478348, + "learning_rate": 4.541372674791533e-06, + "loss": 0.0005, + "step": 2410 + }, + { + "epoch": 1.5458627325208467, + "eval_loss": 0.003835107199847698, + "eval_runtime": 2.6429, + "eval_samples_per_second": 4717.103, + "eval_steps_per_second": 147.563, + "step": 2410 + }, + { + "epoch": 1.5522771007055804, + "grad_norm": 0.010174254886806011, + "learning_rate": 4.4772289929441955e-06, + "loss": 0.0005, + "step": 2420 + }, + { + "epoch": 1.5522771007055804, + "eval_loss": 0.0038460749201476574, + "eval_runtime": 2.634, + "eval_samples_per_second": 4733.166, + "eval_steps_per_second": 148.066, + "step": 2420 + }, + { + "epoch": 1.5586914688903142, + "grad_norm": 0.008752675727009773, + "learning_rate": 4.413085311096857e-06, + "loss": 0.0004, + "step": 2430 + }, + { + "epoch": 1.5586914688903142, + "eval_loss": 0.003855367423966527, + "eval_runtime": 2.7267, + "eval_samples_per_second": 4572.183, + "eval_steps_per_second": 143.03, + "step": 2430 + }, + { + "epoch": 1.5651058370750481, + "grad_norm": 0.010081687942147255, + "learning_rate": 4.348941629249519e-06, + "loss": 0.0005, + "step": 2440 + }, + { + "epoch": 1.5651058370750481, + "eval_loss": 0.003865085309371352, + "eval_runtime": 2.7222, + "eval_samples_per_second": 4579.785, + "eval_steps_per_second": 143.268, + "step": 2440 + }, + { + "epoch": 1.5715202052597819, + "grad_norm": 0.010632511228322983, + "learning_rate": 4.284797947402181e-06, + "loss": 0.0005, + "step": 2450 + }, + { + "epoch": 1.5715202052597819, + "eval_loss": 0.003874831600114703, + "eval_runtime": 2.7831, + "eval_samples_per_second": 4479.547, + "eval_steps_per_second": 140.132, + "step": 2450 + }, + { + "epoch": 1.5779345734445158, + "grad_norm": 0.015775036066770554, + "learning_rate": 4.2206542655548435e-06, + "loss": 0.0005, + "step": 2460 + }, + { + "epoch": 1.5779345734445158, + "eval_loss": 0.00388737628236413, + "eval_runtime": 2.7237, + "eval_samples_per_second": 4577.207, + "eval_steps_per_second": 143.187, + "step": 2460 + }, + { + "epoch": 1.5843489416292496, + "grad_norm": 0.011216863058507442, + "learning_rate": 4.156510583707505e-06, + "loss": 0.0005, + "step": 2470 + }, + { + "epoch": 1.5843489416292496, + "eval_loss": 0.0038962597027420998, + "eval_runtime": 2.7328, + "eval_samples_per_second": 4561.93, + "eval_steps_per_second": 142.709, + "step": 2470 + }, + { + "epoch": 1.5907633098139833, + "grad_norm": 0.009197092615067959, + "learning_rate": 4.092366901860167e-06, + "loss": 0.0004, + "step": 2480 + }, + { + "epoch": 1.5907633098139833, + "eval_loss": 0.0039031601045280695, + "eval_runtime": 2.7575, + "eval_samples_per_second": 4521.111, + "eval_steps_per_second": 141.432, + "step": 2480 + }, + { + "epoch": 1.597177677998717, + "grad_norm": 0.007920138537883759, + "learning_rate": 4.028223220012829e-06, + "loss": 0.0004, + "step": 2490 + }, + { + "epoch": 1.597177677998717, + "eval_loss": 0.003908549435436726, + "eval_runtime": 2.7164, + "eval_samples_per_second": 4589.533, + "eval_steps_per_second": 143.572, + "step": 2490 + }, + { + "epoch": 1.603592046183451, + "grad_norm": 0.01130605023354292, + "learning_rate": 3.964079538165492e-06, + "loss": 0.0202, + "step": 2500 + }, + { + "epoch": 1.603592046183451, + "eval_loss": 0.003887481288984418, + "eval_runtime": 2.7368, + "eval_samples_per_second": 4555.355, + "eval_steps_per_second": 142.503, + "step": 2500 + }, + { + "epoch": 1.6100064143681847, + "grad_norm": 0.010281969793140888, + "learning_rate": 3.8999358563181525e-06, + "loss": 0.0004, + "step": 2510 + }, + { + "epoch": 1.6100064143681847, + "eval_loss": 0.0038782022893428802, + "eval_runtime": 2.7779, + "eval_samples_per_second": 4487.867, + "eval_steps_per_second": 140.392, + "step": 2510 + }, + { + "epoch": 1.6164207825529187, + "grad_norm": 0.009838576428592205, + "learning_rate": 3.835792174470815e-06, + "loss": 0.0145, + "step": 2520 + }, + { + "epoch": 1.6164207825529187, + "eval_loss": 0.0037884835619479418, + "eval_runtime": 2.7139, + "eval_samples_per_second": 4593.753, + "eval_steps_per_second": 143.704, + "step": 2520 + }, + { + "epoch": 1.6228351507376524, + "grad_norm": 0.010711363516747952, + "learning_rate": 3.771648492623477e-06, + "loss": 0.0005, + "step": 2530 + }, + { + "epoch": 1.6228351507376524, + "eval_loss": 0.003752094926312566, + "eval_runtime": 2.3995, + "eval_samples_per_second": 5195.686, + "eval_steps_per_second": 162.534, + "step": 2530 + }, + { + "epoch": 1.6292495189223861, + "grad_norm": 0.010661286301910877, + "learning_rate": 3.707504810776139e-06, + "loss": 0.0005, + "step": 2540 + }, + { + "epoch": 1.6292495189223861, + "eval_loss": 0.0037525563966482878, + "eval_runtime": 2.4005, + "eval_samples_per_second": 5193.403, + "eval_steps_per_second": 162.463, + "step": 2540 + }, + { + "epoch": 1.6356638871071199, + "grad_norm": 0.010547863319516182, + "learning_rate": 3.6433611289288006e-06, + "loss": 0.0005, + "step": 2550 + }, + { + "epoch": 1.6356638871071199, + "eval_loss": 0.0037555524613708258, + "eval_runtime": 2.3918, + "eval_samples_per_second": 5212.477, + "eval_steps_per_second": 163.06, + "step": 2550 + }, + { + "epoch": 1.6420782552918538, + "grad_norm": 0.009853300638496876, + "learning_rate": 3.579217447081463e-06, + "loss": 0.0005, + "step": 2560 + }, + { + "epoch": 1.6420782552918538, + "eval_loss": 0.0037585473619401455, + "eval_runtime": 2.4095, + "eval_samples_per_second": 5174.209, + "eval_steps_per_second": 161.863, + "step": 2560 + }, + { + "epoch": 1.6484926234765875, + "grad_norm": 0.015307929366827011, + "learning_rate": 3.515073765234125e-06, + "loss": 0.0004, + "step": 2570 + }, + { + "epoch": 1.6484926234765875, + "eval_loss": 0.0037612884771078825, + "eval_runtime": 2.3983, + "eval_samples_per_second": 5198.286, + "eval_steps_per_second": 162.616, + "step": 2570 + }, + { + "epoch": 1.6549069916613215, + "grad_norm": 0.011234630830585957, + "learning_rate": 3.4509300833867864e-06, + "loss": 0.0005, + "step": 2580 + }, + { + "epoch": 1.6549069916613215, + "eval_loss": 0.0037648940924555063, + "eval_runtime": 2.3961, + "eval_samples_per_second": 5202.987, + "eval_steps_per_second": 162.763, + "step": 2580 + }, + { + "epoch": 1.6613213598460552, + "grad_norm": 0.09665284305810928, + "learning_rate": 3.3867864015394486e-06, + "loss": 0.0006, + "step": 2590 + }, + { + "epoch": 1.6613213598460552, + "eval_loss": 0.003771902294829488, + "eval_runtime": 2.396, + "eval_samples_per_second": 5203.268, + "eval_steps_per_second": 162.772, + "step": 2590 + }, + { + "epoch": 1.667735728030789, + "grad_norm": 0.009439531713724136, + "learning_rate": 3.322642719692111e-06, + "loss": 0.0005, + "step": 2600 + }, + { + "epoch": 1.667735728030789, + "eval_loss": 0.0037818914279341698, + "eval_runtime": 2.4026, + "eval_samples_per_second": 5189.052, + "eval_steps_per_second": 162.327, + "step": 2600 + }, + { + "epoch": 1.6741500962155227, + "grad_norm": 0.011505583301186562, + "learning_rate": 3.2584990378447722e-06, + "loss": 0.0004, + "step": 2610 + }, + { + "epoch": 1.6741500962155227, + "eval_loss": 0.0037876018323004246, + "eval_runtime": 2.4217, + "eval_samples_per_second": 5148.034, + "eval_steps_per_second": 161.044, + "step": 2610 + }, + { + "epoch": 1.6805644644002564, + "grad_norm": 0.01173364743590355, + "learning_rate": 3.1943553559974345e-06, + "loss": 0.0004, + "step": 2620 + }, + { + "epoch": 1.6805644644002564, + "eval_loss": 0.003792904084548354, + "eval_runtime": 2.4025, + "eval_samples_per_second": 5189.132, + "eval_steps_per_second": 162.329, + "step": 2620 + }, + { + "epoch": 1.6869788325849904, + "grad_norm": 0.009778267703950405, + "learning_rate": 3.1302116741500967e-06, + "loss": 0.0005, + "step": 2630 + }, + { + "epoch": 1.6869788325849904, + "eval_loss": 0.0037972936406731606, + "eval_runtime": 2.4016, + "eval_samples_per_second": 5191.166, + "eval_steps_per_second": 162.393, + "step": 2630 + }, + { + "epoch": 1.6933932007697243, + "grad_norm": 0.012008159421384335, + "learning_rate": 3.0660679923027585e-06, + "loss": 0.0004, + "step": 2640 + }, + { + "epoch": 1.6933932007697243, + "eval_loss": 0.0038023728411644697, + "eval_runtime": 2.4092, + "eval_samples_per_second": 5174.654, + "eval_steps_per_second": 161.877, + "step": 2640 + }, + { + "epoch": 1.699807568954458, + "grad_norm": 0.008908640593290329, + "learning_rate": 3.0019243104554203e-06, + "loss": 0.0005, + "step": 2650 + }, + { + "epoch": 1.699807568954458, + "eval_loss": 0.00380841176956892, + "eval_runtime": 2.4007, + "eval_samples_per_second": 5193.082, + "eval_steps_per_second": 162.453, + "step": 2650 + }, + { + "epoch": 1.7062219371391918, + "grad_norm": 0.01192167866975069, + "learning_rate": 2.9377806286080825e-06, + "loss": 0.0004, + "step": 2660 + }, + { + "epoch": 1.7062219371391918, + "eval_loss": 0.003814821597188711, + "eval_runtime": 2.4035, + "eval_samples_per_second": 5187.046, + "eval_steps_per_second": 162.264, + "step": 2660 + }, + { + "epoch": 1.7126363053239255, + "grad_norm": 0.008039736188948154, + "learning_rate": 2.8736369467607443e-06, + "loss": 0.0004, + "step": 2670 + }, + { + "epoch": 1.7126363053239255, + "eval_loss": 0.0038202591240406036, + "eval_runtime": 2.4055, + "eval_samples_per_second": 5182.655, + "eval_steps_per_second": 162.127, + "step": 2670 + }, + { + "epoch": 1.7190506735086593, + "grad_norm": 0.013746132142841816, + "learning_rate": 2.809493264913406e-06, + "loss": 0.0004, + "step": 2680 + }, + { + "epoch": 1.7190506735086593, + "eval_loss": 0.003824560670182109, + "eval_runtime": 2.4046, + "eval_samples_per_second": 5184.652, + "eval_steps_per_second": 162.189, + "step": 2680 + }, + { + "epoch": 1.7254650416933932, + "grad_norm": 0.011107255704700947, + "learning_rate": 2.7453495830660683e-06, + "loss": 0.0004, + "step": 2690 + }, + { + "epoch": 1.7254650416933932, + "eval_loss": 0.0038276948034763336, + "eval_runtime": 2.446, + "eval_samples_per_second": 5096.936, + "eval_steps_per_second": 159.445, + "step": 2690 + }, + { + "epoch": 1.7318794098781272, + "grad_norm": 0.009707199409604073, + "learning_rate": 2.68120590121873e-06, + "loss": 0.0005, + "step": 2700 + }, + { + "epoch": 1.7318794098781272, + "eval_loss": 0.003818488446995616, + "eval_runtime": 2.7582, + "eval_samples_per_second": 4520.011, + "eval_steps_per_second": 141.398, + "step": 2700 + }, + { + "epoch": 1.738293778062861, + "grad_norm": 0.007962013594806194, + "learning_rate": 2.6170622193713924e-06, + "loss": 0.0004, + "step": 2710 + }, + { + "epoch": 1.738293778062861, + "eval_loss": 0.003793991869315505, + "eval_runtime": 2.7217, + "eval_samples_per_second": 4580.669, + "eval_steps_per_second": 143.295, + "step": 2710 + }, + { + "epoch": 1.7447081462475946, + "grad_norm": 0.013016624376177788, + "learning_rate": 2.5529185375240537e-06, + "loss": 0.0005, + "step": 2720 + }, + { + "epoch": 1.7447081462475946, + "eval_loss": 0.003789684269577265, + "eval_runtime": 2.7327, + "eval_samples_per_second": 4562.147, + "eval_steps_per_second": 142.716, + "step": 2720 + }, + { + "epoch": 1.7511225144323284, + "grad_norm": 0.007025664672255516, + "learning_rate": 2.488774855676716e-06, + "loss": 0.0154, + "step": 2730 + }, + { + "epoch": 1.7511225144323284, + "eval_loss": 0.0038092422764748335, + "eval_runtime": 2.744, + "eval_samples_per_second": 4543.305, + "eval_steps_per_second": 142.126, + "step": 2730 + }, + { + "epoch": 1.757536882617062, + "grad_norm": 0.0099189393222332, + "learning_rate": 2.4246311738293778e-06, + "loss": 0.0004, + "step": 2740 + }, + { + "epoch": 1.757536882617062, + "eval_loss": 0.0038577597588300705, + "eval_runtime": 2.7325, + "eval_samples_per_second": 4562.439, + "eval_steps_per_second": 142.725, + "step": 2740 + }, + { + "epoch": 1.763951250801796, + "grad_norm": 0.011446350254118443, + "learning_rate": 2.36048749198204e-06, + "loss": 0.0004, + "step": 2750 + }, + { + "epoch": 1.763951250801796, + "eval_loss": 0.003888155333697796, + "eval_runtime": 2.7408, + "eval_samples_per_second": 4548.595, + "eval_steps_per_second": 142.292, + "step": 2750 + }, + { + "epoch": 1.7703656189865298, + "grad_norm": 0.006927170790731907, + "learning_rate": 2.2963438101347018e-06, + "loss": 0.0004, + "step": 2760 + }, + { + "epoch": 1.7703656189865298, + "eval_loss": 0.0039006902370601892, + "eval_runtime": 2.7693, + "eval_samples_per_second": 4501.888, + "eval_steps_per_second": 140.831, + "step": 2760 + }, + { + "epoch": 1.7767799871712637, + "grad_norm": 0.010873212479054928, + "learning_rate": 2.232200128287364e-06, + "loss": 0.0005, + "step": 2770 + }, + { + "epoch": 1.7767799871712637, + "eval_loss": 0.003896691370755434, + "eval_runtime": 2.7253, + "eval_samples_per_second": 4574.621, + "eval_steps_per_second": 143.106, + "step": 2770 + }, + { + "epoch": 1.7831943553559975, + "grad_norm": 0.008327585645020008, + "learning_rate": 2.168056446440026e-06, + "loss": 0.0004, + "step": 2780 + }, + { + "epoch": 1.7831943553559975, + "eval_loss": 0.003881297539919615, + "eval_runtime": 2.6677, + "eval_samples_per_second": 4673.4, + "eval_steps_per_second": 146.196, + "step": 2780 + }, + { + "epoch": 1.7896087235407312, + "grad_norm": 0.009809192270040512, + "learning_rate": 2.1039127645926876e-06, + "loss": 0.0004, + "step": 2790 + }, + { + "epoch": 1.7896087235407312, + "eval_loss": 0.0038780542090535164, + "eval_runtime": 2.3819, + "eval_samples_per_second": 5234.003, + "eval_steps_per_second": 163.733, + "step": 2790 + }, + { + "epoch": 1.796023091725465, + "grad_norm": 0.008661613799631596, + "learning_rate": 2.03976908274535e-06, + "loss": 0.0004, + "step": 2800 + }, + { + "epoch": 1.796023091725465, + "eval_loss": 0.0038783461786806583, + "eval_runtime": 2.3989, + "eval_samples_per_second": 5196.882, + "eval_steps_per_second": 162.572, + "step": 2800 + }, + { + "epoch": 1.8024374599101989, + "grad_norm": 0.008230826817452908, + "learning_rate": 1.9756254008980116e-06, + "loss": 0.0004, + "step": 2810 + }, + { + "epoch": 1.8024374599101989, + "eval_loss": 0.0038756639696657658, + "eval_runtime": 2.3947, + "eval_samples_per_second": 5206.034, + "eval_steps_per_second": 162.858, + "step": 2810 + }, + { + "epoch": 1.8088518280949326, + "grad_norm": 0.010815752670168877, + "learning_rate": 1.911481719050674e-06, + "loss": 0.001, + "step": 2820 + }, + { + "epoch": 1.8088518280949326, + "eval_loss": 0.0038313877303153276, + "eval_runtime": 2.3985, + "eval_samples_per_second": 5197.74, + "eval_steps_per_second": 162.599, + "step": 2820 + }, + { + "epoch": 1.8152661962796666, + "grad_norm": 0.008790099062025547, + "learning_rate": 1.8473380372033357e-06, + "loss": 0.0004, + "step": 2830 + }, + { + "epoch": 1.8152661962796666, + "eval_loss": 0.0038214183878153563, + "eval_runtime": 2.4048, + "eval_samples_per_second": 5184.222, + "eval_steps_per_second": 162.176, + "step": 2830 + }, + { + "epoch": 1.8216805644644003, + "grad_norm": 0.008266700431704521, + "learning_rate": 1.7831943553559975e-06, + "loss": 0.0124, + "step": 2840 + }, + { + "epoch": 1.8216805644644003, + "eval_loss": 0.0038049728609621525, + "eval_runtime": 2.4016, + "eval_samples_per_second": 5191.199, + "eval_steps_per_second": 162.394, + "step": 2840 + }, + { + "epoch": 1.828094932649134, + "grad_norm": 0.007836179807782173, + "learning_rate": 1.7190506735086595e-06, + "loss": 0.0004, + "step": 2850 + }, + { + "epoch": 1.828094932649134, + "eval_loss": 0.0037862639874219894, + "eval_runtime": 2.4046, + "eval_samples_per_second": 5184.681, + "eval_steps_per_second": 162.19, + "step": 2850 + }, + { + "epoch": 1.8345093008338678, + "grad_norm": 0.010235415771603584, + "learning_rate": 1.6549069916613215e-06, + "loss": 0.0003, + "step": 2860 + }, + { + "epoch": 1.8345093008338678, + "eval_loss": 0.003780810162425041, + "eval_runtime": 2.4032, + "eval_samples_per_second": 5187.659, + "eval_steps_per_second": 162.283, + "step": 2860 + }, + { + "epoch": 1.8409236690186017, + "grad_norm": 0.008026237599551678, + "learning_rate": 1.5907633098139835e-06, + "loss": 0.0004, + "step": 2870 + }, + { + "epoch": 1.8409236690186017, + "eval_loss": 0.003780545899644494, + "eval_runtime": 2.4055, + "eval_samples_per_second": 5182.623, + "eval_steps_per_second": 162.126, + "step": 2870 + }, + { + "epoch": 1.8473380372033354, + "grad_norm": 0.0085107097402215, + "learning_rate": 1.5266196279666453e-06, + "loss": 0.0004, + "step": 2880 + }, + { + "epoch": 1.8473380372033354, + "eval_loss": 0.003781872568652034, + "eval_runtime": 2.4133, + "eval_samples_per_second": 5166.047, + "eval_steps_per_second": 161.607, + "step": 2880 + }, + { + "epoch": 1.8537524053880694, + "grad_norm": 0.012739639729261398, + "learning_rate": 1.4624759461193075e-06, + "loss": 0.0004, + "step": 2890 + }, + { + "epoch": 1.8537524053880694, + "eval_loss": 0.003783388528972864, + "eval_runtime": 2.3985, + "eval_samples_per_second": 5197.939, + "eval_steps_per_second": 162.605, + "step": 2890 + }, + { + "epoch": 1.8601667735728031, + "grad_norm": 0.007168211042881012, + "learning_rate": 1.3983322642719693e-06, + "loss": 0.0173, + "step": 2900 + }, + { + "epoch": 1.8601667735728031, + "eval_loss": 0.0037781535647809505, + "eval_runtime": 2.4089, + "eval_samples_per_second": 5175.415, + "eval_steps_per_second": 161.9, + "step": 2900 + }, + { + "epoch": 1.8665811417575369, + "grad_norm": 0.008200396783649921, + "learning_rate": 1.3341885824246311e-06, + "loss": 0.0004, + "step": 2910 + }, + { + "epoch": 1.8665811417575369, + "eval_loss": 0.003767445683479309, + "eval_runtime": 2.405, + "eval_samples_per_second": 5183.787, + "eval_steps_per_second": 162.162, + "step": 2910 + }, + { + "epoch": 1.8729955099422706, + "grad_norm": 0.007957600988447666, + "learning_rate": 1.2700449005772933e-06, + "loss": 0.0156, + "step": 2920 + }, + { + "epoch": 1.8729955099422706, + "eval_loss": 0.0037655681371688843, + "eval_runtime": 2.4106, + "eval_samples_per_second": 5171.768, + "eval_steps_per_second": 161.786, + "step": 2920 + }, + { + "epoch": 1.8794098781270043, + "grad_norm": 0.011560726910829544, + "learning_rate": 1.2059012187299551e-06, + "loss": 0.0004, + "step": 2930 + }, + { + "epoch": 1.8794098781270043, + "eval_loss": 0.003768111579120159, + "eval_runtime": 2.4108, + "eval_samples_per_second": 5171.259, + "eval_steps_per_second": 161.77, + "step": 2930 + }, + { + "epoch": 1.8858242463117383, + "grad_norm": 0.041454609483480453, + "learning_rate": 1.1417575368826172e-06, + "loss": 0.0004, + "step": 2940 + }, + { + "epoch": 1.8858242463117383, + "eval_loss": 0.0037701409310102463, + "eval_runtime": 2.4098, + "eval_samples_per_second": 5173.398, + "eval_steps_per_second": 161.837, + "step": 2940 + }, + { + "epoch": 1.8922386144964722, + "grad_norm": 0.013258632272481918, + "learning_rate": 1.0776138550352792e-06, + "loss": 0.0004, + "step": 2950 + }, + { + "epoch": 1.8922386144964722, + "eval_loss": 0.003771682735532522, + "eval_runtime": 2.3911, + "eval_samples_per_second": 5213.929, + "eval_steps_per_second": 163.105, + "step": 2950 + }, + { + "epoch": 1.898652982681206, + "grad_norm": 0.007926654070615768, + "learning_rate": 1.0134701731879412e-06, + "loss": 0.0151, + "step": 2960 + }, + { + "epoch": 1.898652982681206, + "eval_loss": 0.0037694782949984074, + "eval_runtime": 2.3961, + "eval_samples_per_second": 5203.0, + "eval_steps_per_second": 162.763, + "step": 2960 + }, + { + "epoch": 1.9050673508659397, + "grad_norm": 0.009811542928218842, + "learning_rate": 9.493264913406031e-07, + "loss": 0.0004, + "step": 2970 + }, + { + "epoch": 1.9050673508659397, + "eval_loss": 0.00376922101713717, + "eval_runtime": 2.3914, + "eval_samples_per_second": 5213.272, + "eval_steps_per_second": 163.085, + "step": 2970 + }, + { + "epoch": 1.9114817190506734, + "grad_norm": 0.010268312878906727, + "learning_rate": 8.85182809493265e-07, + "loss": 0.0004, + "step": 2980 + }, + { + "epoch": 1.9114817190506734, + "eval_loss": 0.0037699865642935038, + "eval_runtime": 2.3966, + "eval_samples_per_second": 5201.847, + "eval_steps_per_second": 162.727, + "step": 2980 + }, + { + "epoch": 1.9178960872354072, + "grad_norm": 0.009732135571539402, + "learning_rate": 8.210391276459269e-07, + "loss": 0.0004, + "step": 2990 + }, + { + "epoch": 1.9178960872354072, + "eval_loss": 0.0037714012432843447, + "eval_runtime": 2.4004, + "eval_samples_per_second": 5193.805, + "eval_steps_per_second": 162.476, + "step": 2990 + }, + { + "epoch": 1.9243104554201411, + "grad_norm": 0.00821003783494234, + "learning_rate": 7.568954457985889e-07, + "loss": 0.0004, + "step": 3000 + }, + { + "epoch": 1.9243104554201411, + "eval_loss": 0.0037729167379438877, + "eval_runtime": 2.4092, + "eval_samples_per_second": 5174.836, + "eval_steps_per_second": 161.882, + "step": 3000 + }, + { + "epoch": 1.930724823604875, + "grad_norm": 0.008221003226935863, + "learning_rate": 6.927517639512508e-07, + "loss": 0.0004, + "step": 3010 + }, + { + "epoch": 1.930724823604875, + "eval_loss": 0.0037741470150649548, + "eval_runtime": 2.4854, + "eval_samples_per_second": 5016.031, + "eval_steps_per_second": 156.914, + "step": 3010 + }, + { + "epoch": 1.9371391917896088, + "grad_norm": 0.0076448554173111916, + "learning_rate": 6.286080821039128e-07, + "loss": 0.0004, + "step": 3020 + }, + { + "epoch": 1.9371391917896088, + "eval_loss": 0.0037749563343822956, + "eval_runtime": 2.7299, + "eval_samples_per_second": 4566.865, + "eval_steps_per_second": 142.863, + "step": 3020 + }, + { + "epoch": 1.9435535599743425, + "grad_norm": 0.011786909773945808, + "learning_rate": 5.644644002565747e-07, + "loss": 0.0096, + "step": 3030 + }, + { + "epoch": 1.9435535599743425, + "eval_loss": 0.0037560511846095324, + "eval_runtime": 2.7211, + "eval_samples_per_second": 4581.635, + "eval_steps_per_second": 143.325, + "step": 3030 + }, + { + "epoch": 1.9499679281590763, + "grad_norm": 0.009966257959604263, + "learning_rate": 5.003207184092367e-07, + "loss": 0.0004, + "step": 3040 + }, + { + "epoch": 1.9499679281590763, + "eval_loss": 0.0037503966595977545, + "eval_runtime": 2.7541, + "eval_samples_per_second": 4526.77, + "eval_steps_per_second": 141.609, + "step": 3040 + }, + { + "epoch": 1.95638229634381, + "grad_norm": 0.008138866163790226, + "learning_rate": 4.361770365618987e-07, + "loss": 0.0004, + "step": 3050 + }, + { + "epoch": 1.95638229634381, + "eval_loss": 0.0037492290139198303, + "eval_runtime": 2.7213, + "eval_samples_per_second": 4581.226, + "eval_steps_per_second": 143.313, + "step": 3050 + }, + { + "epoch": 1.962796664528544, + "grad_norm": 0.010643853805959225, + "learning_rate": 3.720333547145606e-07, + "loss": 0.0004, + "step": 3060 + }, + { + "epoch": 1.962796664528544, + "eval_loss": 0.00374912703409791, + "eval_runtime": 2.7173, + "eval_samples_per_second": 4587.941, + "eval_steps_per_second": 143.523, + "step": 3060 + }, + { + "epoch": 1.9692110327132777, + "grad_norm": 0.010715777054429054, + "learning_rate": 3.078896728672226e-07, + "loss": 0.0004, + "step": 3070 + }, + { + "epoch": 1.9692110327132777, + "eval_loss": 0.0037495270371437073, + "eval_runtime": 2.7194, + "eval_samples_per_second": 4584.515, + "eval_steps_per_second": 143.415, + "step": 3070 + }, + { + "epoch": 1.9756254008980116, + "grad_norm": 0.0092789800837636, + "learning_rate": 2.4374599101988453e-07, + "loss": 0.0011, + "step": 3080 + }, + { + "epoch": 1.9756254008980116, + "eval_loss": 0.003745446214452386, + "eval_runtime": 2.7756, + "eval_samples_per_second": 4491.701, + "eval_steps_per_second": 140.512, + "step": 3080 + }, + { + "epoch": 1.9820397690827454, + "grad_norm": 0.008841835893690586, + "learning_rate": 1.7960230917254652e-07, + "loss": 0.0004, + "step": 3090 + }, + { + "epoch": 1.9820397690827454, + "eval_loss": 0.0037423004396259785, + "eval_runtime": 2.7196, + "eval_samples_per_second": 4584.057, + "eval_steps_per_second": 143.401, + "step": 3090 + }, + { + "epoch": 1.988454137267479, + "grad_norm": 0.010190588422119617, + "learning_rate": 1.1545862732520848e-07, + "loss": 0.0004, + "step": 3100 + }, + { + "epoch": 1.988454137267479, + "eval_loss": 0.003741658991202712, + "eval_runtime": 2.6701, + "eval_samples_per_second": 4669.199, + "eval_steps_per_second": 146.065, + "step": 3100 + }, + { + "epoch": 1.9948685054522128, + "grad_norm": 0.008077690377831459, + "learning_rate": 5.131494547787043e-08, + "loss": 0.0004, + "step": 3110 + }, + { + "epoch": 1.9948685054522128, + "eval_loss": 0.003741599852219224, + "eval_runtime": 2.4727, + "eval_samples_per_second": 5041.765, + "eval_steps_per_second": 157.719, + "step": 3110 + } + ], + "logging_steps": 10, + "max_steps": 3118, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 50206333455360.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}