diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8440 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999459527627146, + "eval_steps": 100, + "global_step": 3469, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008647557965661989, + "grad_norm": 10.80162525177002, + "learning_rate": 1.7241379310344828e-07, + "loss": 2.2563, + "step": 3 + }, + { + "epoch": 0.0017295115931323978, + "grad_norm": 9.61453914642334, + "learning_rate": 3.4482758620689656e-07, + "loss": 2.0897, + "step": 6 + }, + { + "epoch": 0.0025942673896985967, + "grad_norm": 9.470385551452637, + "learning_rate": 5.172413793103449e-07, + "loss": 1.9261, + "step": 9 + }, + { + "epoch": 0.0034590231862647956, + "grad_norm": 9.873086929321289, + "learning_rate": 6.896551724137931e-07, + "loss": 2.0147, + "step": 12 + }, + { + "epoch": 0.004323778982830994, + "grad_norm": 10.191353797912598, + "learning_rate": 8.620689655172415e-07, + "loss": 1.858, + "step": 15 + }, + { + "epoch": 0.005188534779397193, + "grad_norm": 10.490758895874023, + "learning_rate": 1.0344827586206898e-06, + "loss": 2.2373, + "step": 18 + }, + { + "epoch": 0.006053290575963392, + "grad_norm": 10.7379789352417, + "learning_rate": 1.2068965517241381e-06, + "loss": 2.2257, + "step": 21 + }, + { + "epoch": 0.006918046372529591, + "grad_norm": 9.31098747253418, + "learning_rate": 1.3793103448275862e-06, + "loss": 1.9876, + "step": 24 + }, + { + "epoch": 0.00778280216909579, + "grad_norm": 9.79057502746582, + "learning_rate": 1.5517241379310346e-06, + "loss": 1.9015, + "step": 27 + }, + { + "epoch": 0.008647557965661988, + "grad_norm": 9.3207368850708, + "learning_rate": 1.724137931034483e-06, + "loss": 1.8092, + "step": 30 + }, + { + "epoch": 0.009512313762228188, + "grad_norm": 8.36523151397705, + "learning_rate": 1.896551724137931e-06, + "loss": 1.7048, + "step": 33 + }, + { + "epoch": 0.010377069558794387, + "grad_norm": 7.840041160583496, + "learning_rate": 2.0689655172413796e-06, + "loss": 1.4938, + "step": 36 + }, + { + "epoch": 0.011241825355360585, + "grad_norm": 8.646334648132324, + "learning_rate": 2.241379310344828e-06, + "loss": 1.4769, + "step": 39 + }, + { + "epoch": 0.012106581151926784, + "grad_norm": 8.217284202575684, + "learning_rate": 2.4137931034482762e-06, + "loss": 1.4442, + "step": 42 + }, + { + "epoch": 0.012971336948492982, + "grad_norm": 8.16273307800293, + "learning_rate": 2.5862068965517246e-06, + "loss": 1.6091, + "step": 45 + }, + { + "epoch": 0.013836092745059182, + "grad_norm": 7.606343746185303, + "learning_rate": 2.7586206896551725e-06, + "loss": 1.565, + "step": 48 + }, + { + "epoch": 0.01470084854162538, + "grad_norm": 5.831145763397217, + "learning_rate": 2.931034482758621e-06, + "loss": 1.4604, + "step": 51 + }, + { + "epoch": 0.01556560433819158, + "grad_norm": 5.626949310302734, + "learning_rate": 3.103448275862069e-06, + "loss": 1.2422, + "step": 54 + }, + { + "epoch": 0.01643036013475778, + "grad_norm": 8.196585655212402, + "learning_rate": 3.2758620689655175e-06, + "loss": 1.178, + "step": 57 + }, + { + "epoch": 0.017295115931323976, + "grad_norm": 3.9342713356018066, + "learning_rate": 3.448275862068966e-06, + "loss": 1.1149, + "step": 60 + }, + { + "epoch": 0.018159871727890176, + "grad_norm": 5.660026550292969, + "learning_rate": 3.620689655172414e-06, + "loss": 1.2666, + "step": 63 + }, + { + "epoch": 0.019024627524456376, + "grad_norm": 3.853914737701416, + "learning_rate": 3.793103448275862e-06, + "loss": 0.9817, + "step": 66 + }, + { + "epoch": 0.019889383321022573, + "grad_norm": 4.146341323852539, + "learning_rate": 3.96551724137931e-06, + "loss": 1.0925, + "step": 69 + }, + { + "epoch": 0.020754139117588773, + "grad_norm": 3.1577234268188477, + "learning_rate": 4.137931034482759e-06, + "loss": 0.8252, + "step": 72 + }, + { + "epoch": 0.02161889491415497, + "grad_norm": 3.2642972469329834, + "learning_rate": 4.310344827586207e-06, + "loss": 0.7533, + "step": 75 + }, + { + "epoch": 0.02248365071072117, + "grad_norm": 3.2303547859191895, + "learning_rate": 4.482758620689656e-06, + "loss": 0.7326, + "step": 78 + }, + { + "epoch": 0.02334840650728737, + "grad_norm": 2.59647536277771, + "learning_rate": 4.655172413793104e-06, + "loss": 0.7174, + "step": 81 + }, + { + "epoch": 0.024213162303853567, + "grad_norm": 2.8148062229156494, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.7488, + "step": 84 + }, + { + "epoch": 0.025077918100419767, + "grad_norm": 1.758013129234314, + "learning_rate": 5e-06, + "loss": 0.7332, + "step": 87 + }, + { + "epoch": 0.025942673896985964, + "grad_norm": 2.0942482948303223, + "learning_rate": 5.172413793103449e-06, + "loss": 0.8884, + "step": 90 + }, + { + "epoch": 0.026807429693552164, + "grad_norm": 0.4857475757598877, + "learning_rate": 5.344827586206896e-06, + "loss": 0.497, + "step": 93 + }, + { + "epoch": 0.027672185490118364, + "grad_norm": 1.5475839376449585, + "learning_rate": 5.517241379310345e-06, + "loss": 0.669, + "step": 96 + }, + { + "epoch": 0.02853694128668456, + "grad_norm": 1.0966241359710693, + "learning_rate": 5.689655172413794e-06, + "loss": 0.5887, + "step": 99 + }, + { + "epoch": 0.028825193218873297, + "eval_loss": 0.6418702006340027, + "eval_mse": 0.6418701782226562, + "eval_runtime": 6.7842, + "eval_samples_per_second": 147.401, + "eval_steps_per_second": 18.425, + "step": 100 + }, + { + "epoch": 0.02940169708325076, + "grad_norm": 0.6968008875846863, + "learning_rate": 5.862068965517242e-06, + "loss": 0.7585, + "step": 102 + }, + { + "epoch": 0.03026645287981696, + "grad_norm": 1.3219984769821167, + "learning_rate": 6.03448275862069e-06, + "loss": 0.7782, + "step": 105 + }, + { + "epoch": 0.03113120867638316, + "grad_norm": 1.0195666551589966, + "learning_rate": 6.206896551724138e-06, + "loss": 0.6689, + "step": 108 + }, + { + "epoch": 0.031995964472949355, + "grad_norm": 1.4114892482757568, + "learning_rate": 6.379310344827587e-06, + "loss": 0.6057, + "step": 111 + }, + { + "epoch": 0.03286072026951556, + "grad_norm": 1.173322319984436, + "learning_rate": 6.551724137931035e-06, + "loss": 0.7234, + "step": 114 + }, + { + "epoch": 0.033725476066081755, + "grad_norm": 1.520411729812622, + "learning_rate": 6.724137931034484e-06, + "loss": 0.7145, + "step": 117 + }, + { + "epoch": 0.03459023186264795, + "grad_norm": 2.086540937423706, + "learning_rate": 6.896551724137932e-06, + "loss": 0.6265, + "step": 120 + }, + { + "epoch": 0.035454987659214156, + "grad_norm": 1.582171082496643, + "learning_rate": 7.0689655172413796e-06, + "loss": 0.6202, + "step": 123 + }, + { + "epoch": 0.03631974345578035, + "grad_norm": 1.1396665573120117, + "learning_rate": 7.241379310344828e-06, + "loss": 0.5784, + "step": 126 + }, + { + "epoch": 0.03718449925234655, + "grad_norm": 1.427815318107605, + "learning_rate": 7.413793103448277e-06, + "loss": 0.4815, + "step": 129 + }, + { + "epoch": 0.03804925504891275, + "grad_norm": 4.718282699584961, + "learning_rate": 7.586206896551724e-06, + "loss": 0.6151, + "step": 132 + }, + { + "epoch": 0.03891401084547895, + "grad_norm": 1.1084308624267578, + "learning_rate": 7.758620689655173e-06, + "loss": 0.548, + "step": 135 + }, + { + "epoch": 0.039778766642045146, + "grad_norm": 2.411567211151123, + "learning_rate": 7.93103448275862e-06, + "loss": 0.4664, + "step": 138 + }, + { + "epoch": 0.04064352243861134, + "grad_norm": 1.9377732276916504, + "learning_rate": 8.103448275862069e-06, + "loss": 0.4761, + "step": 141 + }, + { + "epoch": 0.04150827823517755, + "grad_norm": 1.6360706090927124, + "learning_rate": 8.275862068965518e-06, + "loss": 0.5101, + "step": 144 + }, + { + "epoch": 0.04237303403174374, + "grad_norm": 1.6306743621826172, + "learning_rate": 8.448275862068966e-06, + "loss": 0.4356, + "step": 147 + }, + { + "epoch": 0.04323778982830994, + "grad_norm": 2.038235664367676, + "learning_rate": 8.620689655172414e-06, + "loss": 0.4647, + "step": 150 + }, + { + "epoch": 0.044102545624876144, + "grad_norm": 2.968766450881958, + "learning_rate": 8.793103448275862e-06, + "loss": 0.5035, + "step": 153 + }, + { + "epoch": 0.04496730142144234, + "grad_norm": 1.407654047012329, + "learning_rate": 8.965517241379312e-06, + "loss": 0.4639, + "step": 156 + }, + { + "epoch": 0.04583205721800854, + "grad_norm": 2.141657590866089, + "learning_rate": 9.13793103448276e-06, + "loss": 0.4255, + "step": 159 + }, + { + "epoch": 0.04669681301457474, + "grad_norm": 2.2702648639678955, + "learning_rate": 9.310344827586207e-06, + "loss": 0.49, + "step": 162 + }, + { + "epoch": 0.04756156881114094, + "grad_norm": 1.1990652084350586, + "learning_rate": 9.482758620689655e-06, + "loss": 0.4145, + "step": 165 + }, + { + "epoch": 0.048426324607707134, + "grad_norm": 2.7780394554138184, + "learning_rate": 9.655172413793105e-06, + "loss": 0.4001, + "step": 168 + }, + { + "epoch": 0.04929108040427334, + "grad_norm": 1.6007318496704102, + "learning_rate": 9.827586206896553e-06, + "loss": 0.3719, + "step": 171 + }, + { + "epoch": 0.050155836200839535, + "grad_norm": 1.820497751235962, + "learning_rate": 1e-05, + "loss": 0.4183, + "step": 174 + }, + { + "epoch": 0.05102059199740573, + "grad_norm": 2.3770830631256104, + "learning_rate": 9.990895295902884e-06, + "loss": 0.4273, + "step": 177 + }, + { + "epoch": 0.05188534779397193, + "grad_norm": 1.9047311544418335, + "learning_rate": 9.981790591805767e-06, + "loss": 0.4622, + "step": 180 + }, + { + "epoch": 0.05275010359053813, + "grad_norm": 1.876373052597046, + "learning_rate": 9.972685887708651e-06, + "loss": 0.3132, + "step": 183 + }, + { + "epoch": 0.05361485938710433, + "grad_norm": 1.6469354629516602, + "learning_rate": 9.963581183611534e-06, + "loss": 0.3647, + "step": 186 + }, + { + "epoch": 0.054479615183670525, + "grad_norm": 1.3573085069656372, + "learning_rate": 9.954476479514417e-06, + "loss": 0.3267, + "step": 189 + }, + { + "epoch": 0.05534437098023673, + "grad_norm": 6.931013107299805, + "learning_rate": 9.9453717754173e-06, + "loss": 0.3981, + "step": 192 + }, + { + "epoch": 0.056209126776802926, + "grad_norm": 3.3258678913116455, + "learning_rate": 9.936267071320182e-06, + "loss": 0.3855, + "step": 195 + }, + { + "epoch": 0.05707388257336912, + "grad_norm": 4.0184550285339355, + "learning_rate": 9.927162367223067e-06, + "loss": 0.371, + "step": 198 + }, + { + "epoch": 0.057650386437746594, + "eval_loss": 0.34387460350990295, + "eval_mse": 0.3438746197223663, + "eval_runtime": 6.7538, + "eval_samples_per_second": 148.065, + "eval_steps_per_second": 18.508, + "step": 200 + }, + { + "epoch": 0.057938638369935326, + "grad_norm": 1.7570185661315918, + "learning_rate": 9.91805766312595e-06, + "loss": 0.4019, + "step": 201 + }, + { + "epoch": 0.05880339416650152, + "grad_norm": 3.925257444381714, + "learning_rate": 9.908952959028833e-06, + "loss": 0.3352, + "step": 204 + }, + { + "epoch": 0.05966814996306772, + "grad_norm": 1.5225563049316406, + "learning_rate": 9.899848254931715e-06, + "loss": 0.3081, + "step": 207 + }, + { + "epoch": 0.06053290575963392, + "grad_norm": 1.902543067932129, + "learning_rate": 9.890743550834598e-06, + "loss": 0.3908, + "step": 210 + }, + { + "epoch": 0.06139766155620012, + "grad_norm": 1.2444404363632202, + "learning_rate": 9.881638846737481e-06, + "loss": 0.31, + "step": 213 + }, + { + "epoch": 0.06226241735276632, + "grad_norm": 1.8852041959762573, + "learning_rate": 9.872534142640366e-06, + "loss": 0.297, + "step": 216 + }, + { + "epoch": 0.06312717314933251, + "grad_norm": 5.955360412597656, + "learning_rate": 9.863429438543249e-06, + "loss": 0.3639, + "step": 219 + }, + { + "epoch": 0.06399192894589871, + "grad_norm": 1.8612481355667114, + "learning_rate": 9.854324734446131e-06, + "loss": 0.3808, + "step": 222 + }, + { + "epoch": 0.06485668474246492, + "grad_norm": 1.4975637197494507, + "learning_rate": 9.845220030349014e-06, + "loss": 0.3694, + "step": 225 + }, + { + "epoch": 0.06572144053903112, + "grad_norm": 1.5896477699279785, + "learning_rate": 9.836115326251897e-06, + "loss": 0.3056, + "step": 228 + }, + { + "epoch": 0.06658619633559731, + "grad_norm": 3.6155972480773926, + "learning_rate": 9.827010622154782e-06, + "loss": 0.3084, + "step": 231 + }, + { + "epoch": 0.06745095213216351, + "grad_norm": 4.709763526916504, + "learning_rate": 9.817905918057664e-06, + "loss": 0.3949, + "step": 234 + }, + { + "epoch": 0.06831570792872971, + "grad_norm": 3.7941088676452637, + "learning_rate": 9.808801213960547e-06, + "loss": 0.3213, + "step": 237 + }, + { + "epoch": 0.0691804637252959, + "grad_norm": 2.548584461212158, + "learning_rate": 9.79969650986343e-06, + "loss": 0.3335, + "step": 240 + }, + { + "epoch": 0.0700452195218621, + "grad_norm": 1.5992470979690552, + "learning_rate": 9.790591805766313e-06, + "loss": 0.2688, + "step": 243 + }, + { + "epoch": 0.07090997531842831, + "grad_norm": 2.99310564994812, + "learning_rate": 9.781487101669198e-06, + "loss": 0.3488, + "step": 246 + }, + { + "epoch": 0.07177473111499451, + "grad_norm": 2.987074613571167, + "learning_rate": 9.77238239757208e-06, + "loss": 0.2703, + "step": 249 + }, + { + "epoch": 0.0726394869115607, + "grad_norm": 5.408164978027344, + "learning_rate": 9.763277693474963e-06, + "loss": 0.2854, + "step": 252 + }, + { + "epoch": 0.0735042427081269, + "grad_norm": 1.6425719261169434, + "learning_rate": 9.754172989377846e-06, + "loss": 0.2824, + "step": 255 + }, + { + "epoch": 0.0743689985046931, + "grad_norm": 1.8134018182754517, + "learning_rate": 9.745068285280729e-06, + "loss": 0.3286, + "step": 258 + }, + { + "epoch": 0.0752337543012593, + "grad_norm": 5.619387626647949, + "learning_rate": 9.735963581183613e-06, + "loss": 0.3226, + "step": 261 + }, + { + "epoch": 0.0760985100978255, + "grad_norm": 2.6264755725860596, + "learning_rate": 9.726858877086496e-06, + "loss": 0.3442, + "step": 264 + }, + { + "epoch": 0.0769632658943917, + "grad_norm": 3.537142515182495, + "learning_rate": 9.717754172989379e-06, + "loss": 0.2687, + "step": 267 + }, + { + "epoch": 0.0778280216909579, + "grad_norm": 2.1801705360412598, + "learning_rate": 9.708649468892262e-06, + "loss": 0.3705, + "step": 270 + }, + { + "epoch": 0.0786927774875241, + "grad_norm": 6.869683742523193, + "learning_rate": 9.699544764795145e-06, + "loss": 0.3777, + "step": 273 + }, + { + "epoch": 0.07955753328409029, + "grad_norm": 3.4050021171569824, + "learning_rate": 9.690440060698028e-06, + "loss": 0.3265, + "step": 276 + }, + { + "epoch": 0.08042228908065649, + "grad_norm": 2.8107333183288574, + "learning_rate": 9.681335356600912e-06, + "loss": 0.3286, + "step": 279 + }, + { + "epoch": 0.08128704487722269, + "grad_norm": 1.5242034196853638, + "learning_rate": 9.672230652503795e-06, + "loss": 0.2786, + "step": 282 + }, + { + "epoch": 0.0821518006737889, + "grad_norm": 22.8824462890625, + "learning_rate": 9.663125948406678e-06, + "loss": 0.3706, + "step": 285 + }, + { + "epoch": 0.0830165564703551, + "grad_norm": 4.904447555541992, + "learning_rate": 9.65402124430956e-06, + "loss": 0.298, + "step": 288 + }, + { + "epoch": 0.08388131226692129, + "grad_norm": 4.405270576477051, + "learning_rate": 9.644916540212444e-06, + "loss": 0.3625, + "step": 291 + }, + { + "epoch": 0.08474606806348749, + "grad_norm": 6.424873352050781, + "learning_rate": 9.635811836115328e-06, + "loss": 0.3246, + "step": 294 + }, + { + "epoch": 0.08561082386005368, + "grad_norm": 1.9719147682189941, + "learning_rate": 9.626707132018211e-06, + "loss": 0.3014, + "step": 297 + }, + { + "epoch": 0.08647557965661988, + "grad_norm": 5.9254021644592285, + "learning_rate": 9.617602427921094e-06, + "loss": 0.3607, + "step": 300 + }, + { + "epoch": 0.08647557965661988, + "eval_loss": 0.28438636660575867, + "eval_mse": 0.2843863361030817, + "eval_runtime": 6.6145, + "eval_samples_per_second": 151.183, + "eval_steps_per_second": 18.898, + "step": 300 + }, + { + "epoch": 0.08734033545318609, + "grad_norm": 3.4420008659362793, + "learning_rate": 9.608497723823977e-06, + "loss": 0.2631, + "step": 303 + }, + { + "epoch": 0.08820509124975229, + "grad_norm": 2.4832332134246826, + "learning_rate": 9.59939301972686e-06, + "loss": 0.2985, + "step": 306 + }, + { + "epoch": 0.08906984704631848, + "grad_norm": 3.3724935054779053, + "learning_rate": 9.590288315629744e-06, + "loss": 0.2703, + "step": 309 + }, + { + "epoch": 0.08993460284288468, + "grad_norm": 8.569412231445312, + "learning_rate": 9.581183611532627e-06, + "loss": 0.3068, + "step": 312 + }, + { + "epoch": 0.09079935863945088, + "grad_norm": 1.9817373752593994, + "learning_rate": 9.57207890743551e-06, + "loss": 0.3271, + "step": 315 + }, + { + "epoch": 0.09166411443601707, + "grad_norm": 2.6405210494995117, + "learning_rate": 9.562974203338393e-06, + "loss": 0.2454, + "step": 318 + }, + { + "epoch": 0.09252887023258327, + "grad_norm": 1.6005452871322632, + "learning_rate": 9.553869499241275e-06, + "loss": 0.2506, + "step": 321 + }, + { + "epoch": 0.09339362602914948, + "grad_norm": 1.67816162109375, + "learning_rate": 9.54476479514416e-06, + "loss": 0.2503, + "step": 324 + }, + { + "epoch": 0.09425838182571568, + "grad_norm": 1.4401655197143555, + "learning_rate": 9.535660091047043e-06, + "loss": 0.2322, + "step": 327 + }, + { + "epoch": 0.09512313762228188, + "grad_norm": 2.919785261154175, + "learning_rate": 9.526555386949926e-06, + "loss": 0.2746, + "step": 330 + }, + { + "epoch": 0.09598789341884807, + "grad_norm": 4.454317569732666, + "learning_rate": 9.517450682852808e-06, + "loss": 0.2907, + "step": 333 + }, + { + "epoch": 0.09685264921541427, + "grad_norm": 4.585294246673584, + "learning_rate": 9.508345978755691e-06, + "loss": 0.2873, + "step": 336 + }, + { + "epoch": 0.09771740501198047, + "grad_norm": 2.247422218322754, + "learning_rate": 9.499241274658574e-06, + "loss": 0.2798, + "step": 339 + }, + { + "epoch": 0.09858216080854668, + "grad_norm": 3.6044836044311523, + "learning_rate": 9.490136570561459e-06, + "loss": 0.3284, + "step": 342 + }, + { + "epoch": 0.09944691660511287, + "grad_norm": 4.609151363372803, + "learning_rate": 9.481031866464341e-06, + "loss": 0.2274, + "step": 345 + }, + { + "epoch": 0.10031167240167907, + "grad_norm": 4.793229103088379, + "learning_rate": 9.471927162367224e-06, + "loss": 0.2734, + "step": 348 + }, + { + "epoch": 0.10117642819824527, + "grad_norm": 4.507264614105225, + "learning_rate": 9.462822458270107e-06, + "loss": 0.3082, + "step": 351 + }, + { + "epoch": 0.10204118399481146, + "grad_norm": 6.129451274871826, + "learning_rate": 9.45371775417299e-06, + "loss": 0.2617, + "step": 354 + }, + { + "epoch": 0.10290593979137766, + "grad_norm": 2.631593704223633, + "learning_rate": 9.444613050075875e-06, + "loss": 0.2565, + "step": 357 + }, + { + "epoch": 0.10377069558794386, + "grad_norm": 4.379823684692383, + "learning_rate": 9.435508345978757e-06, + "loss": 0.3235, + "step": 360 + }, + { + "epoch": 0.10463545138451007, + "grad_norm": 2.077354907989502, + "learning_rate": 9.42640364188164e-06, + "loss": 0.2938, + "step": 363 + }, + { + "epoch": 0.10550020718107626, + "grad_norm": 5.108116626739502, + "learning_rate": 9.417298937784523e-06, + "loss": 0.2698, + "step": 366 + }, + { + "epoch": 0.10636496297764246, + "grad_norm": 7.464448928833008, + "learning_rate": 9.408194233687406e-06, + "loss": 0.2703, + "step": 369 + }, + { + "epoch": 0.10722971877420866, + "grad_norm": 1.855411410331726, + "learning_rate": 9.399089529590289e-06, + "loss": 0.2958, + "step": 372 + }, + { + "epoch": 0.10809447457077485, + "grad_norm": 5.284719944000244, + "learning_rate": 9.389984825493173e-06, + "loss": 0.2703, + "step": 375 + }, + { + "epoch": 0.10895923036734105, + "grad_norm": 2.494473457336426, + "learning_rate": 9.380880121396056e-06, + "loss": 0.2698, + "step": 378 + }, + { + "epoch": 0.10982398616390726, + "grad_norm": 2.0765345096588135, + "learning_rate": 9.371775417298939e-06, + "loss": 0.2726, + "step": 381 + }, + { + "epoch": 0.11068874196047346, + "grad_norm": 1.893574595451355, + "learning_rate": 9.362670713201822e-06, + "loss": 0.2739, + "step": 384 + }, + { + "epoch": 0.11155349775703965, + "grad_norm": 2.0016255378723145, + "learning_rate": 9.353566009104705e-06, + "loss": 0.2624, + "step": 387 + }, + { + "epoch": 0.11241825355360585, + "grad_norm": 2.99924898147583, + "learning_rate": 9.344461305007587e-06, + "loss": 0.3098, + "step": 390 + }, + { + "epoch": 0.11328300935017205, + "grad_norm": 1.666891098022461, + "learning_rate": 9.335356600910472e-06, + "loss": 0.2451, + "step": 393 + }, + { + "epoch": 0.11414776514673824, + "grad_norm": 2.8993024826049805, + "learning_rate": 9.326251896813355e-06, + "loss": 0.271, + "step": 396 + }, + { + "epoch": 0.11501252094330444, + "grad_norm": 5.040359973907471, + "learning_rate": 9.317147192716238e-06, + "loss": 0.2576, + "step": 399 + }, + { + "epoch": 0.11530077287549319, + "eval_loss": 0.2588765025138855, + "eval_mse": 0.25887649209996744, + "eval_runtime": 6.6586, + "eval_samples_per_second": 150.181, + "eval_steps_per_second": 18.773, + "step": 400 + }, + { + "epoch": 0.11587727673987065, + "grad_norm": 3.234560251235962, + "learning_rate": 9.30804248861912e-06, + "loss": 0.2702, + "step": 402 + }, + { + "epoch": 0.11674203253643685, + "grad_norm": 2.8497729301452637, + "learning_rate": 9.298937784522003e-06, + "loss": 0.2371, + "step": 405 + }, + { + "epoch": 0.11760678833300305, + "grad_norm": 1.3214294910430908, + "learning_rate": 9.289833080424886e-06, + "loss": 0.2262, + "step": 408 + }, + { + "epoch": 0.11847154412956924, + "grad_norm": 3.5736958980560303, + "learning_rate": 9.28072837632777e-06, + "loss": 0.2619, + "step": 411 + }, + { + "epoch": 0.11933629992613544, + "grad_norm": 3.478178024291992, + "learning_rate": 9.271623672230654e-06, + "loss": 0.3061, + "step": 414 + }, + { + "epoch": 0.12020105572270164, + "grad_norm": 2.523387908935547, + "learning_rate": 9.262518968133536e-06, + "loss": 0.3232, + "step": 417 + }, + { + "epoch": 0.12106581151926785, + "grad_norm": 2.1046786308288574, + "learning_rate": 9.25341426403642e-06, + "loss": 0.2815, + "step": 420 + }, + { + "epoch": 0.12193056731583404, + "grad_norm": 4.513411045074463, + "learning_rate": 9.244309559939302e-06, + "loss": 0.3053, + "step": 423 + }, + { + "epoch": 0.12279532311240024, + "grad_norm": 2.628030776977539, + "learning_rate": 9.235204855842187e-06, + "loss": 0.2714, + "step": 426 + }, + { + "epoch": 0.12366007890896644, + "grad_norm": 6.008927345275879, + "learning_rate": 9.22610015174507e-06, + "loss": 0.3243, + "step": 429 + }, + { + "epoch": 0.12452483470553263, + "grad_norm": 6.236274242401123, + "learning_rate": 9.216995447647952e-06, + "loss": 0.2916, + "step": 432 + }, + { + "epoch": 0.12538959050209883, + "grad_norm": 2.30412220954895, + "learning_rate": 9.207890743550835e-06, + "loss": 0.2731, + "step": 435 + }, + { + "epoch": 0.12625434629866503, + "grad_norm": 3.3161492347717285, + "learning_rate": 9.198786039453718e-06, + "loss": 0.2508, + "step": 438 + }, + { + "epoch": 0.12711910209523122, + "grad_norm": 4.023074626922607, + "learning_rate": 9.189681335356601e-06, + "loss": 0.2973, + "step": 441 + }, + { + "epoch": 0.12798385789179742, + "grad_norm": 2.486236333847046, + "learning_rate": 9.180576631259485e-06, + "loss": 0.235, + "step": 444 + }, + { + "epoch": 0.12884861368836362, + "grad_norm": 4.445496082305908, + "learning_rate": 9.171471927162368e-06, + "loss": 0.2519, + "step": 447 + }, + { + "epoch": 0.12971336948492984, + "grad_norm": 1.629809021949768, + "learning_rate": 9.162367223065251e-06, + "loss": 0.2664, + "step": 450 + }, + { + "epoch": 0.13057812528149604, + "grad_norm": 3.0351674556732178, + "learning_rate": 9.153262518968134e-06, + "loss": 0.2433, + "step": 453 + }, + { + "epoch": 0.13144288107806223, + "grad_norm": 1.9163283109664917, + "learning_rate": 9.144157814871017e-06, + "loss": 0.2997, + "step": 456 + }, + { + "epoch": 0.13230763687462843, + "grad_norm": 3.978429079055786, + "learning_rate": 9.1350531107739e-06, + "loss": 0.2969, + "step": 459 + }, + { + "epoch": 0.13317239267119463, + "grad_norm": 1.7428011894226074, + "learning_rate": 9.125948406676784e-06, + "loss": 0.2721, + "step": 462 + }, + { + "epoch": 0.13403714846776082, + "grad_norm": 1.462106704711914, + "learning_rate": 9.116843702579667e-06, + "loss": 0.3179, + "step": 465 + }, + { + "epoch": 0.13490190426432702, + "grad_norm": 4.226785659790039, + "learning_rate": 9.10773899848255e-06, + "loss": 0.262, + "step": 468 + }, + { + "epoch": 0.13576666006089322, + "grad_norm": 3.227842330932617, + "learning_rate": 9.098634294385433e-06, + "loss": 0.2602, + "step": 471 + }, + { + "epoch": 0.13663141585745942, + "grad_norm": 4.708644866943359, + "learning_rate": 9.089529590288316e-06, + "loss": 0.2502, + "step": 474 + }, + { + "epoch": 0.1374961716540256, + "grad_norm": 3.478773832321167, + "learning_rate": 9.080424886191198e-06, + "loss": 0.2799, + "step": 477 + }, + { + "epoch": 0.1383609274505918, + "grad_norm": 5.991547107696533, + "learning_rate": 9.071320182094083e-06, + "loss": 0.3103, + "step": 480 + }, + { + "epoch": 0.139225683247158, + "grad_norm": 5.446369647979736, + "learning_rate": 9.062215477996966e-06, + "loss": 0.2834, + "step": 483 + }, + { + "epoch": 0.1400904390437242, + "grad_norm": 4.723580360412598, + "learning_rate": 9.053110773899849e-06, + "loss": 0.244, + "step": 486 + }, + { + "epoch": 0.14095519484029043, + "grad_norm": 2.518148183822632, + "learning_rate": 9.044006069802731e-06, + "loss": 0.3102, + "step": 489 + }, + { + "epoch": 0.14181995063685662, + "grad_norm": 4.1146039962768555, + "learning_rate": 9.034901365705614e-06, + "loss": 0.2913, + "step": 492 + }, + { + "epoch": 0.14268470643342282, + "grad_norm": 2.6333212852478027, + "learning_rate": 9.025796661608497e-06, + "loss": 0.2195, + "step": 495 + }, + { + "epoch": 0.14354946222998902, + "grad_norm": 3.344228506088257, + "learning_rate": 9.016691957511382e-06, + "loss": 0.2822, + "step": 498 + }, + { + "epoch": 0.14412596609436648, + "eval_loss": 0.2707275450229645, + "eval_mse": 0.27072753977278263, + "eval_runtime": 6.638, + "eval_samples_per_second": 150.647, + "eval_steps_per_second": 18.831, + "step": 500 + }, + { + "epoch": 0.1444142180265552, + "grad_norm": 2.839517593383789, + "learning_rate": 9.007587253414265e-06, + "loss": 0.2469, + "step": 501 + }, + { + "epoch": 0.1452789738231214, + "grad_norm": 3.71785569190979, + "learning_rate": 8.998482549317147e-06, + "loss": 0.2695, + "step": 504 + }, + { + "epoch": 0.1461437296196876, + "grad_norm": 2.9626924991607666, + "learning_rate": 8.98937784522003e-06, + "loss": 0.233, + "step": 507 + }, + { + "epoch": 0.1470084854162538, + "grad_norm": 2.0274481773376465, + "learning_rate": 8.980273141122913e-06, + "loss": 0.2854, + "step": 510 + }, + { + "epoch": 0.14787324121282, + "grad_norm": 2.5144805908203125, + "learning_rate": 8.971168437025798e-06, + "loss": 0.2545, + "step": 513 + }, + { + "epoch": 0.1487379970093862, + "grad_norm": 3.106039047241211, + "learning_rate": 8.96206373292868e-06, + "loss": 0.2305, + "step": 516 + }, + { + "epoch": 0.1496027528059524, + "grad_norm": 3.3986117839813232, + "learning_rate": 8.952959028831563e-06, + "loss": 0.2564, + "step": 519 + }, + { + "epoch": 0.1504675086025186, + "grad_norm": 2.877206325531006, + "learning_rate": 8.943854324734446e-06, + "loss": 0.2781, + "step": 522 + }, + { + "epoch": 0.1513322643990848, + "grad_norm": 2.561119556427002, + "learning_rate": 8.934749620637329e-06, + "loss": 0.2666, + "step": 525 + }, + { + "epoch": 0.152197020195651, + "grad_norm": 2.566633939743042, + "learning_rate": 8.925644916540213e-06, + "loss": 0.2791, + "step": 528 + }, + { + "epoch": 0.1530617759922172, + "grad_norm": 2.1075491905212402, + "learning_rate": 8.916540212443096e-06, + "loss": 0.2276, + "step": 531 + }, + { + "epoch": 0.1539265317887834, + "grad_norm": 3.239712715148926, + "learning_rate": 8.90743550834598e-06, + "loss": 0.2749, + "step": 534 + }, + { + "epoch": 0.1547912875853496, + "grad_norm": 6.005987167358398, + "learning_rate": 8.898330804248862e-06, + "loss": 0.2395, + "step": 537 + }, + { + "epoch": 0.1556560433819158, + "grad_norm": 1.6621935367584229, + "learning_rate": 8.889226100151745e-06, + "loss": 0.2616, + "step": 540 + }, + { + "epoch": 0.156520799178482, + "grad_norm": 3.1361093521118164, + "learning_rate": 8.880121396054628e-06, + "loss": 0.2712, + "step": 543 + }, + { + "epoch": 0.1573855549750482, + "grad_norm": 1.8823013305664062, + "learning_rate": 8.871016691957512e-06, + "loss": 0.2571, + "step": 546 + }, + { + "epoch": 0.1582503107716144, + "grad_norm": 3.6857988834381104, + "learning_rate": 8.861911987860395e-06, + "loss": 0.2552, + "step": 549 + }, + { + "epoch": 0.15911506656818059, + "grad_norm": 2.26597261428833, + "learning_rate": 8.852807283763278e-06, + "loss": 0.2239, + "step": 552 + }, + { + "epoch": 0.15997982236474678, + "grad_norm": 5.791572570800781, + "learning_rate": 8.84370257966616e-06, + "loss": 0.2859, + "step": 555 + }, + { + "epoch": 0.16084457816131298, + "grad_norm": 3.4344265460968018, + "learning_rate": 8.834597875569044e-06, + "loss": 0.3055, + "step": 558 + }, + { + "epoch": 0.16170933395787918, + "grad_norm": 3.98288631439209, + "learning_rate": 8.825493171471928e-06, + "loss": 0.2698, + "step": 561 + }, + { + "epoch": 0.16257408975444537, + "grad_norm": 7.429836273193359, + "learning_rate": 8.816388467374811e-06, + "loss": 0.3207, + "step": 564 + }, + { + "epoch": 0.1634388455510116, + "grad_norm": 4.021480560302734, + "learning_rate": 8.807283763277694e-06, + "loss": 0.2477, + "step": 567 + }, + { + "epoch": 0.1643036013475778, + "grad_norm": 2.619497537612915, + "learning_rate": 8.798179059180577e-06, + "loss": 0.2152, + "step": 570 + }, + { + "epoch": 0.165168357144144, + "grad_norm": 2.073925018310547, + "learning_rate": 8.78907435508346e-06, + "loss": 0.2557, + "step": 573 + }, + { + "epoch": 0.1660331129407102, + "grad_norm": 2.250293493270874, + "learning_rate": 8.779969650986344e-06, + "loss": 0.2294, + "step": 576 + }, + { + "epoch": 0.16689786873727638, + "grad_norm": 2.9747421741485596, + "learning_rate": 8.770864946889227e-06, + "loss": 0.2461, + "step": 579 + }, + { + "epoch": 0.16776262453384258, + "grad_norm": 1.8815991878509521, + "learning_rate": 8.76176024279211e-06, + "loss": 0.2698, + "step": 582 + }, + { + "epoch": 0.16862738033040878, + "grad_norm": 2.1905224323272705, + "learning_rate": 8.752655538694993e-06, + "loss": 0.2631, + "step": 585 + }, + { + "epoch": 0.16949213612697497, + "grad_norm": 3.6808903217315674, + "learning_rate": 8.743550834597875e-06, + "loss": 0.2341, + "step": 588 + }, + { + "epoch": 0.17035689192354117, + "grad_norm": 2.905963897705078, + "learning_rate": 8.73444613050076e-06, + "loss": 0.2295, + "step": 591 + }, + { + "epoch": 0.17122164772010737, + "grad_norm": 5.486540794372559, + "learning_rate": 8.725341426403643e-06, + "loss": 0.2816, + "step": 594 + }, + { + "epoch": 0.17208640351667356, + "grad_norm": 1.8525919914245605, + "learning_rate": 8.716236722306526e-06, + "loss": 0.2197, + "step": 597 + }, + { + "epoch": 0.17295115931323976, + "grad_norm": 5.908591270446777, + "learning_rate": 8.707132018209408e-06, + "loss": 0.2908, + "step": 600 + }, + { + "epoch": 0.17295115931323976, + "eval_loss": 0.23815499246120453, + "eval_mse": 0.23815500601008535, + "eval_runtime": 6.6897, + "eval_samples_per_second": 149.484, + "eval_steps_per_second": 18.685, + "step": 600 + }, + { + "epoch": 0.17381591510980596, + "grad_norm": 3.1334948539733887, + "learning_rate": 8.698027314112291e-06, + "loss": 0.253, + "step": 603 + }, + { + "epoch": 0.17468067090637218, + "grad_norm": 2.314412832260132, + "learning_rate": 8.688922610015174e-06, + "loss": 0.2631, + "step": 606 + }, + { + "epoch": 0.17554542670293838, + "grad_norm": 3.483959436416626, + "learning_rate": 8.679817905918059e-06, + "loss": 0.2452, + "step": 609 + }, + { + "epoch": 0.17641018249950458, + "grad_norm": 2.2747623920440674, + "learning_rate": 8.670713201820942e-06, + "loss": 0.2462, + "step": 612 + }, + { + "epoch": 0.17727493829607077, + "grad_norm": 5.517392635345459, + "learning_rate": 8.661608497723824e-06, + "loss": 0.2707, + "step": 615 + }, + { + "epoch": 0.17813969409263697, + "grad_norm": 2.7062125205993652, + "learning_rate": 8.652503793626707e-06, + "loss": 0.2324, + "step": 618 + }, + { + "epoch": 0.17900444988920317, + "grad_norm": 2.712933301925659, + "learning_rate": 8.64339908952959e-06, + "loss": 0.2914, + "step": 621 + }, + { + "epoch": 0.17986920568576936, + "grad_norm": 3.0349957942962646, + "learning_rate": 8.634294385432475e-06, + "loss": 0.245, + "step": 624 + }, + { + "epoch": 0.18073396148233556, + "grad_norm": 1.533530592918396, + "learning_rate": 8.625189681335357e-06, + "loss": 0.245, + "step": 627 + }, + { + "epoch": 0.18159871727890176, + "grad_norm": 1.5070098638534546, + "learning_rate": 8.61608497723824e-06, + "loss": 0.252, + "step": 630 + }, + { + "epoch": 0.18246347307546795, + "grad_norm": 1.803514003753662, + "learning_rate": 8.606980273141123e-06, + "loss": 0.2395, + "step": 633 + }, + { + "epoch": 0.18332822887203415, + "grad_norm": 1.8047112226486206, + "learning_rate": 8.597875569044006e-06, + "loss": 0.2731, + "step": 636 + }, + { + "epoch": 0.18419298466860035, + "grad_norm": 3.090371608734131, + "learning_rate": 8.58877086494689e-06, + "loss": 0.2314, + "step": 639 + }, + { + "epoch": 0.18505774046516654, + "grad_norm": 2.2020678520202637, + "learning_rate": 8.579666160849773e-06, + "loss": 0.2828, + "step": 642 + }, + { + "epoch": 0.18592249626173277, + "grad_norm": 1.8086636066436768, + "learning_rate": 8.570561456752656e-06, + "loss": 0.2736, + "step": 645 + }, + { + "epoch": 0.18678725205829896, + "grad_norm": 1.9154764413833618, + "learning_rate": 8.561456752655539e-06, + "loss": 0.2422, + "step": 648 + }, + { + "epoch": 0.18765200785486516, + "grad_norm": 2.6091620922088623, + "learning_rate": 8.552352048558422e-06, + "loss": 0.278, + "step": 651 + }, + { + "epoch": 0.18851676365143136, + "grad_norm": 4.057301998138428, + "learning_rate": 8.543247344461306e-06, + "loss": 0.2268, + "step": 654 + }, + { + "epoch": 0.18938151944799755, + "grad_norm": 1.6707180738449097, + "learning_rate": 8.53414264036419e-06, + "loss": 0.2657, + "step": 657 + }, + { + "epoch": 0.19024627524456375, + "grad_norm": 4.327409744262695, + "learning_rate": 8.525037936267072e-06, + "loss": 0.2235, + "step": 660 + }, + { + "epoch": 0.19111103104112995, + "grad_norm": 3.991241931915283, + "learning_rate": 8.515933232169955e-06, + "loss": 0.2429, + "step": 663 + }, + { + "epoch": 0.19197578683769614, + "grad_norm": 1.742564082145691, + "learning_rate": 8.506828528072838e-06, + "loss": 0.2245, + "step": 666 + }, + { + "epoch": 0.19284054263426234, + "grad_norm": 2.362626791000366, + "learning_rate": 8.49772382397572e-06, + "loss": 0.2689, + "step": 669 + }, + { + "epoch": 0.19370529843082854, + "grad_norm": 1.4896934032440186, + "learning_rate": 8.488619119878605e-06, + "loss": 0.2669, + "step": 672 + }, + { + "epoch": 0.19457005422739473, + "grad_norm": 1.8312315940856934, + "learning_rate": 8.479514415781488e-06, + "loss": 0.2255, + "step": 675 + }, + { + "epoch": 0.19543481002396093, + "grad_norm": 3.9799551963806152, + "learning_rate": 8.470409711684371e-06, + "loss": 0.2152, + "step": 678 + }, + { + "epoch": 0.19629956582052713, + "grad_norm": 1.9584782123565674, + "learning_rate": 8.461305007587254e-06, + "loss": 0.2721, + "step": 681 + }, + { + "epoch": 0.19716432161709335, + "grad_norm": 3.361952781677246, + "learning_rate": 8.452200303490137e-06, + "loss": 0.265, + "step": 684 + }, + { + "epoch": 0.19802907741365955, + "grad_norm": 2.125466823577881, + "learning_rate": 8.443095599393021e-06, + "loss": 0.2473, + "step": 687 + }, + { + "epoch": 0.19889383321022575, + "grad_norm": 4.599373817443848, + "learning_rate": 8.433990895295904e-06, + "loss": 0.2201, + "step": 690 + }, + { + "epoch": 0.19975858900679194, + "grad_norm": 2.139647960662842, + "learning_rate": 8.424886191198787e-06, + "loss": 0.2647, + "step": 693 + }, + { + "epoch": 0.20062334480335814, + "grad_norm": 1.7437653541564941, + "learning_rate": 8.41578148710167e-06, + "loss": 0.2242, + "step": 696 + }, + { + "epoch": 0.20148810059992434, + "grad_norm": 4.154083728790283, + "learning_rate": 8.406676783004552e-06, + "loss": 0.2258, + "step": 699 + }, + { + "epoch": 0.20177635253211307, + "eval_loss": 0.2404525727033615, + "eval_mse": 0.24045259381830691, + "eval_runtime": 6.5158, + "eval_samples_per_second": 153.474, + "eval_steps_per_second": 19.184, + "step": 700 + }, + { + "epoch": 0.20235285639649053, + "grad_norm": 1.7175828218460083, + "learning_rate": 8.397572078907437e-06, + "loss": 0.2734, + "step": 702 + }, + { + "epoch": 0.20321761219305673, + "grad_norm": 1.8725277185440063, + "learning_rate": 8.38846737481032e-06, + "loss": 0.2309, + "step": 705 + }, + { + "epoch": 0.20408236798962293, + "grad_norm": 1.7434577941894531, + "learning_rate": 8.379362670713203e-06, + "loss": 0.2395, + "step": 708 + }, + { + "epoch": 0.20494712378618912, + "grad_norm": 2.8038480281829834, + "learning_rate": 8.370257966616086e-06, + "loss": 0.2787, + "step": 711 + }, + { + "epoch": 0.20581187958275532, + "grad_norm": 1.701920509338379, + "learning_rate": 8.361153262518968e-06, + "loss": 0.2267, + "step": 714 + }, + { + "epoch": 0.20667663537932152, + "grad_norm": 4.903564453125, + "learning_rate": 8.352048558421853e-06, + "loss": 0.2716, + "step": 717 + }, + { + "epoch": 0.2075413911758877, + "grad_norm": 3.723651647567749, + "learning_rate": 8.342943854324736e-06, + "loss": 0.2674, + "step": 720 + }, + { + "epoch": 0.20840614697245394, + "grad_norm": 1.7158082723617554, + "learning_rate": 8.333839150227619e-06, + "loss": 0.2296, + "step": 723 + }, + { + "epoch": 0.20927090276902013, + "grad_norm": 2.1699960231781006, + "learning_rate": 8.324734446130501e-06, + "loss": 0.2843, + "step": 726 + }, + { + "epoch": 0.21013565856558633, + "grad_norm": 4.244576454162598, + "learning_rate": 8.315629742033384e-06, + "loss": 0.2678, + "step": 729 + }, + { + "epoch": 0.21100041436215253, + "grad_norm": 1.4597645998001099, + "learning_rate": 8.306525037936269e-06, + "loss": 0.248, + "step": 732 + }, + { + "epoch": 0.21186517015871872, + "grad_norm": 2.9361813068389893, + "learning_rate": 8.297420333839152e-06, + "loss": 0.241, + "step": 735 + }, + { + "epoch": 0.21272992595528492, + "grad_norm": 4.552188396453857, + "learning_rate": 8.288315629742034e-06, + "loss": 0.2935, + "step": 738 + }, + { + "epoch": 0.21359468175185112, + "grad_norm": 4.201780796051025, + "learning_rate": 8.279210925644917e-06, + "loss": 0.2935, + "step": 741 + }, + { + "epoch": 0.21445943754841731, + "grad_norm": 7.236446380615234, + "learning_rate": 8.2701062215478e-06, + "loss": 0.2584, + "step": 744 + }, + { + "epoch": 0.2153241933449835, + "grad_norm": 1.7462209463119507, + "learning_rate": 8.261001517450683e-06, + "loss": 0.2457, + "step": 747 + }, + { + "epoch": 0.2161889491415497, + "grad_norm": 3.9612677097320557, + "learning_rate": 8.251896813353568e-06, + "loss": 0.2472, + "step": 750 + }, + { + "epoch": 0.2170537049381159, + "grad_norm": 3.581313371658325, + "learning_rate": 8.24279210925645e-06, + "loss": 0.2833, + "step": 753 + }, + { + "epoch": 0.2179184607346821, + "grad_norm": 3.473001003265381, + "learning_rate": 8.233687405159333e-06, + "loss": 0.2474, + "step": 756 + }, + { + "epoch": 0.2187832165312483, + "grad_norm": 5.877528667449951, + "learning_rate": 8.224582701062216e-06, + "loss": 0.2469, + "step": 759 + }, + { + "epoch": 0.21964797232781452, + "grad_norm": 4.294084072113037, + "learning_rate": 8.215477996965099e-06, + "loss": 0.2569, + "step": 762 + }, + { + "epoch": 0.22051272812438072, + "grad_norm": 1.9812819957733154, + "learning_rate": 8.206373292867983e-06, + "loss": 0.2592, + "step": 765 + }, + { + "epoch": 0.22137748392094692, + "grad_norm": 1.6627389192581177, + "learning_rate": 8.197268588770866e-06, + "loss": 0.2597, + "step": 768 + }, + { + "epoch": 0.2222422397175131, + "grad_norm": 2.557081699371338, + "learning_rate": 8.188163884673749e-06, + "loss": 0.2624, + "step": 771 + }, + { + "epoch": 0.2231069955140793, + "grad_norm": 3.3301448822021484, + "learning_rate": 8.179059180576632e-06, + "loss": 0.2458, + "step": 774 + }, + { + "epoch": 0.2239717513106455, + "grad_norm": 3.717036247253418, + "learning_rate": 8.169954476479515e-06, + "loss": 0.2403, + "step": 777 + }, + { + "epoch": 0.2248365071072117, + "grad_norm": 1.9032166004180908, + "learning_rate": 8.1608497723824e-06, + "loss": 0.2194, + "step": 780 + }, + { + "epoch": 0.2257012629037779, + "grad_norm": 11.293305397033691, + "learning_rate": 8.151745068285282e-06, + "loss": 0.276, + "step": 783 + }, + { + "epoch": 0.2265660187003441, + "grad_norm": 2.2903361320495605, + "learning_rate": 8.142640364188165e-06, + "loss": 0.2089, + "step": 786 + }, + { + "epoch": 0.2274307744969103, + "grad_norm": 1.6450647115707397, + "learning_rate": 8.133535660091048e-06, + "loss": 0.239, + "step": 789 + }, + { + "epoch": 0.2282955302934765, + "grad_norm": 2.290724277496338, + "learning_rate": 8.12443095599393e-06, + "loss": 0.2509, + "step": 792 + }, + { + "epoch": 0.2291602860900427, + "grad_norm": 2.6252450942993164, + "learning_rate": 8.115326251896815e-06, + "loss": 0.2578, + "step": 795 + }, + { + "epoch": 0.23002504188660888, + "grad_norm": 4.217925071716309, + "learning_rate": 8.106221547799698e-06, + "loss": 0.2604, + "step": 798 + }, + { + "epoch": 0.23060154575098638, + "eval_loss": 0.23178604245185852, + "eval_mse": 0.2317860303344205, + "eval_runtime": 6.6317, + "eval_samples_per_second": 150.79, + "eval_steps_per_second": 18.849, + "step": 800 + }, + { + "epoch": 0.2308897976831751, + "grad_norm": 2.014478921890259, + "learning_rate": 8.097116843702581e-06, + "loss": 0.238, + "step": 801 + }, + { + "epoch": 0.2317545534797413, + "grad_norm": 3.1536102294921875, + "learning_rate": 8.088012139605464e-06, + "loss": 0.2236, + "step": 804 + }, + { + "epoch": 0.2326193092763075, + "grad_norm": 2.094320297241211, + "learning_rate": 8.078907435508347e-06, + "loss": 0.2536, + "step": 807 + }, + { + "epoch": 0.2334840650728737, + "grad_norm": 1.7041524648666382, + "learning_rate": 8.06980273141123e-06, + "loss": 0.2243, + "step": 810 + }, + { + "epoch": 0.2343488208694399, + "grad_norm": 4.586849689483643, + "learning_rate": 8.060698027314114e-06, + "loss": 0.2276, + "step": 813 + }, + { + "epoch": 0.2352135766660061, + "grad_norm": 1.8360718488693237, + "learning_rate": 8.051593323216997e-06, + "loss": 0.2299, + "step": 816 + }, + { + "epoch": 0.2360783324625723, + "grad_norm": 2.283907651901245, + "learning_rate": 8.04248861911988e-06, + "loss": 0.2996, + "step": 819 + }, + { + "epoch": 0.23694308825913848, + "grad_norm": 2.762160301208496, + "learning_rate": 8.033383915022763e-06, + "loss": 0.2474, + "step": 822 + }, + { + "epoch": 0.23780784405570468, + "grad_norm": 2.776780366897583, + "learning_rate": 8.024279210925645e-06, + "loss": 0.2394, + "step": 825 + }, + { + "epoch": 0.23867259985227088, + "grad_norm": 1.8678719997406006, + "learning_rate": 8.01517450682853e-06, + "loss": 0.2732, + "step": 828 + }, + { + "epoch": 0.23953735564883707, + "grad_norm": 1.6664822101593018, + "learning_rate": 8.006069802731413e-06, + "loss": 0.245, + "step": 831 + }, + { + "epoch": 0.24040211144540327, + "grad_norm": 2.244666814804077, + "learning_rate": 7.996965098634296e-06, + "loss": 0.2587, + "step": 834 + }, + { + "epoch": 0.24126686724196947, + "grad_norm": 1.8958755731582642, + "learning_rate": 7.987860394537178e-06, + "loss": 0.2632, + "step": 837 + }, + { + "epoch": 0.2421316230385357, + "grad_norm": 1.7408092021942139, + "learning_rate": 7.978755690440061e-06, + "loss": 0.2475, + "step": 840 + }, + { + "epoch": 0.2429963788351019, + "grad_norm": 4.280789375305176, + "learning_rate": 7.969650986342944e-06, + "loss": 0.2478, + "step": 843 + }, + { + "epoch": 0.2438611346316681, + "grad_norm": 3.1445536613464355, + "learning_rate": 7.960546282245829e-06, + "loss": 0.2412, + "step": 846 + }, + { + "epoch": 0.24472589042823428, + "grad_norm": 1.806154727935791, + "learning_rate": 7.951441578148712e-06, + "loss": 0.2772, + "step": 849 + }, + { + "epoch": 0.24559064622480048, + "grad_norm": 2.123932123184204, + "learning_rate": 7.942336874051594e-06, + "loss": 0.2535, + "step": 852 + }, + { + "epoch": 0.24645540202136668, + "grad_norm": 1.488558292388916, + "learning_rate": 7.933232169954477e-06, + "loss": 0.2342, + "step": 855 + }, + { + "epoch": 0.24732015781793287, + "grad_norm": 3.2414135932922363, + "learning_rate": 7.92412746585736e-06, + "loss": 0.2536, + "step": 858 + }, + { + "epoch": 0.24818491361449907, + "grad_norm": 1.952009916305542, + "learning_rate": 7.915022761760245e-06, + "loss": 0.2028, + "step": 861 + }, + { + "epoch": 0.24904966941106527, + "grad_norm": 1.7001383304595947, + "learning_rate": 7.905918057663127e-06, + "loss": 0.2791, + "step": 864 + }, + { + "epoch": 0.24991442520763146, + "grad_norm": 2.1715829372406006, + "learning_rate": 7.89681335356601e-06, + "loss": 0.2211, + "step": 867 + }, + { + "epoch": 0.25077918100419766, + "grad_norm": 2.4868202209472656, + "learning_rate": 7.887708649468893e-06, + "loss": 0.2599, + "step": 870 + }, + { + "epoch": 0.25164393680076386, + "grad_norm": 1.4740101099014282, + "learning_rate": 7.878603945371776e-06, + "loss": 0.2076, + "step": 873 + }, + { + "epoch": 0.25250869259733005, + "grad_norm": 2.241642951965332, + "learning_rate": 7.869499241274659e-06, + "loss": 0.2413, + "step": 876 + }, + { + "epoch": 0.25337344839389625, + "grad_norm": 4.3130784034729, + "learning_rate": 7.860394537177543e-06, + "loss": 0.2495, + "step": 879 + }, + { + "epoch": 0.25423820419046245, + "grad_norm": 6.674787998199463, + "learning_rate": 7.851289833080426e-06, + "loss": 0.2396, + "step": 882 + }, + { + "epoch": 0.25510295998702864, + "grad_norm": 2.463395118713379, + "learning_rate": 7.842185128983309e-06, + "loss": 0.2304, + "step": 885 + }, + { + "epoch": 0.25596771578359484, + "grad_norm": 4.519803047180176, + "learning_rate": 7.833080424886192e-06, + "loss": 0.2603, + "step": 888 + }, + { + "epoch": 0.25683247158016104, + "grad_norm": 1.3876014947891235, + "learning_rate": 7.823975720789075e-06, + "loss": 0.2282, + "step": 891 + }, + { + "epoch": 0.25769722737672723, + "grad_norm": 2.0575900077819824, + "learning_rate": 7.814871016691958e-06, + "loss": 0.2286, + "step": 894 + }, + { + "epoch": 0.2585619831732935, + "grad_norm": 4.538529872894287, + "learning_rate": 7.805766312594842e-06, + "loss": 0.2606, + "step": 897 + }, + { + "epoch": 0.2594267389698597, + "grad_norm": 3.036686658859253, + "learning_rate": 7.796661608497725e-06, + "loss": 0.2961, + "step": 900 + }, + { + "epoch": 0.2594267389698597, + "eval_loss": 0.2185884416103363, + "eval_mse": 0.21858844196051358, + "eval_runtime": 6.596, + "eval_samples_per_second": 151.607, + "eval_steps_per_second": 18.951, + "step": 900 + }, + { + "epoch": 0.2602914947664259, + "grad_norm": 2.319840669631958, + "learning_rate": 7.787556904400608e-06, + "loss": 0.2493, + "step": 903 + }, + { + "epoch": 0.2611562505629921, + "grad_norm": 4.723567485809326, + "learning_rate": 7.77845220030349e-06, + "loss": 0.2236, + "step": 906 + }, + { + "epoch": 0.2620210063595583, + "grad_norm": 2.231250286102295, + "learning_rate": 7.769347496206373e-06, + "loss": 0.2766, + "step": 909 + }, + { + "epoch": 0.26288576215612447, + "grad_norm": 2.3697762489318848, + "learning_rate": 7.760242792109256e-06, + "loss": 0.2319, + "step": 912 + }, + { + "epoch": 0.26375051795269067, + "grad_norm": 2.0771355628967285, + "learning_rate": 7.75113808801214e-06, + "loss": 0.246, + "step": 915 + }, + { + "epoch": 0.26461527374925686, + "grad_norm": 2.4788358211517334, + "learning_rate": 7.742033383915024e-06, + "loss": 0.2163, + "step": 918 + }, + { + "epoch": 0.26548002954582306, + "grad_norm": 1.5741719007492065, + "learning_rate": 7.732928679817907e-06, + "loss": 0.2308, + "step": 921 + }, + { + "epoch": 0.26634478534238926, + "grad_norm": 2.12677001953125, + "learning_rate": 7.72382397572079e-06, + "loss": 0.2304, + "step": 924 + }, + { + "epoch": 0.26720954113895545, + "grad_norm": 1.9022995233535767, + "learning_rate": 7.714719271623672e-06, + "loss": 0.2454, + "step": 927 + }, + { + "epoch": 0.26807429693552165, + "grad_norm": 3.3529253005981445, + "learning_rate": 7.705614567526557e-06, + "loss": 0.2137, + "step": 930 + }, + { + "epoch": 0.26893905273208785, + "grad_norm": 1.935281753540039, + "learning_rate": 7.69650986342944e-06, + "loss": 0.2581, + "step": 933 + }, + { + "epoch": 0.26980380852865404, + "grad_norm": 2.14315128326416, + "learning_rate": 7.687405159332322e-06, + "loss": 0.2317, + "step": 936 + }, + { + "epoch": 0.27066856432522024, + "grad_norm": 2.028090238571167, + "learning_rate": 7.678300455235205e-06, + "loss": 0.2379, + "step": 939 + }, + { + "epoch": 0.27153332012178644, + "grad_norm": 2.918959379196167, + "learning_rate": 7.669195751138088e-06, + "loss": 0.2708, + "step": 942 + }, + { + "epoch": 0.27239807591835263, + "grad_norm": 4.042644500732422, + "learning_rate": 7.660091047040971e-06, + "loss": 0.3112, + "step": 945 + }, + { + "epoch": 0.27326283171491883, + "grad_norm": 1.8345041275024414, + "learning_rate": 7.650986342943855e-06, + "loss": 0.267, + "step": 948 + }, + { + "epoch": 0.274127587511485, + "grad_norm": 5.901050567626953, + "learning_rate": 7.641881638846738e-06, + "loss": 0.272, + "step": 951 + }, + { + "epoch": 0.2749923433080512, + "grad_norm": 2.413675546646118, + "learning_rate": 7.632776934749621e-06, + "loss": 0.2906, + "step": 954 + }, + { + "epoch": 0.2758570991046174, + "grad_norm": 2.699126958847046, + "learning_rate": 7.623672230652505e-06, + "loss": 0.2201, + "step": 957 + }, + { + "epoch": 0.2767218549011836, + "grad_norm": 1.3186564445495605, + "learning_rate": 7.614567526555388e-06, + "loss": 0.227, + "step": 960 + }, + { + "epoch": 0.2775866106977498, + "grad_norm": 7.576478958129883, + "learning_rate": 7.6054628224582705e-06, + "loss": 0.2569, + "step": 963 + }, + { + "epoch": 0.278451366494316, + "grad_norm": 1.853145718574524, + "learning_rate": 7.596358118361153e-06, + "loss": 0.2586, + "step": 966 + }, + { + "epoch": 0.2793161222908822, + "grad_norm": 3.2667322158813477, + "learning_rate": 7.587253414264037e-06, + "loss": 0.2483, + "step": 969 + }, + { + "epoch": 0.2801808780874484, + "grad_norm": 2.7756762504577637, + "learning_rate": 7.578148710166921e-06, + "loss": 0.2724, + "step": 972 + }, + { + "epoch": 0.28104563388401466, + "grad_norm": 2.138936758041382, + "learning_rate": 7.569044006069804e-06, + "loss": 0.2517, + "step": 975 + }, + { + "epoch": 0.28191038968058085, + "grad_norm": 1.9090849161148071, + "learning_rate": 7.5599393019726864e-06, + "loss": 0.2608, + "step": 978 + }, + { + "epoch": 0.28277514547714705, + "grad_norm": 2.6184728145599365, + "learning_rate": 7.550834597875569e-06, + "loss": 0.2728, + "step": 981 + }, + { + "epoch": 0.28363990127371325, + "grad_norm": 1.5910395383834839, + "learning_rate": 7.541729893778453e-06, + "loss": 0.2144, + "step": 984 + }, + { + "epoch": 0.28450465707027944, + "grad_norm": 2.3524558544158936, + "learning_rate": 7.532625189681337e-06, + "loss": 0.23, + "step": 987 + }, + { + "epoch": 0.28536941286684564, + "grad_norm": 3.056361675262451, + "learning_rate": 7.5235204855842195e-06, + "loss": 0.2397, + "step": 990 + }, + { + "epoch": 0.28623416866341184, + "grad_norm": 3.4158847332000732, + "learning_rate": 7.514415781487102e-06, + "loss": 0.2712, + "step": 993 + }, + { + "epoch": 0.28709892445997803, + "grad_norm": 3.3620333671569824, + "learning_rate": 7.505311077389985e-06, + "loss": 0.2107, + "step": 996 + }, + { + "epoch": 0.28796368025654423, + "grad_norm": 3.508890390396118, + "learning_rate": 7.496206373292868e-06, + "loss": 0.2453, + "step": 999 + }, + { + "epoch": 0.28825193218873296, + "eval_loss": 0.21684841811656952, + "eval_mse": 0.21684841979760677, + "eval_runtime": 6.6234, + "eval_samples_per_second": 150.98, + "eval_steps_per_second": 18.872, + "step": 1000 + }, + { + "epoch": 0.2888284360531104, + "grad_norm": 2.658740997314453, + "learning_rate": 7.487101669195752e-06, + "loss": 0.2319, + "step": 1002 + }, + { + "epoch": 0.2896931918496766, + "grad_norm": 1.370540976524353, + "learning_rate": 7.477996965098635e-06, + "loss": 0.2443, + "step": 1005 + }, + { + "epoch": 0.2905579476462428, + "grad_norm": 8.996585845947266, + "learning_rate": 7.468892261001518e-06, + "loss": 0.2859, + "step": 1008 + }, + { + "epoch": 0.291422703442809, + "grad_norm": 3.880053758621216, + "learning_rate": 7.459787556904401e-06, + "loss": 0.2913, + "step": 1011 + }, + { + "epoch": 0.2922874592393752, + "grad_norm": 4.735537052154541, + "learning_rate": 7.450682852807284e-06, + "loss": 0.258, + "step": 1014 + }, + { + "epoch": 0.2931522150359414, + "grad_norm": 1.7119396924972534, + "learning_rate": 7.441578148710168e-06, + "loss": 0.2397, + "step": 1017 + }, + { + "epoch": 0.2940169708325076, + "grad_norm": 3.4769861698150635, + "learning_rate": 7.4324734446130505e-06, + "loss": 0.209, + "step": 1020 + }, + { + "epoch": 0.2948817266290738, + "grad_norm": 2.3741278648376465, + "learning_rate": 7.423368740515934e-06, + "loss": 0.2627, + "step": 1023 + }, + { + "epoch": 0.29574648242564, + "grad_norm": 1.5303798913955688, + "learning_rate": 7.414264036418817e-06, + "loss": 0.2352, + "step": 1026 + }, + { + "epoch": 0.2966112382222062, + "grad_norm": 1.63661789894104, + "learning_rate": 7.4051593323217e-06, + "loss": 0.2448, + "step": 1029 + }, + { + "epoch": 0.2974759940187724, + "grad_norm": 1.531538724899292, + "learning_rate": 7.3960546282245835e-06, + "loss": 0.23, + "step": 1032 + }, + { + "epoch": 0.2983407498153386, + "grad_norm": 1.3936281204223633, + "learning_rate": 7.386949924127466e-06, + "loss": 0.2367, + "step": 1035 + }, + { + "epoch": 0.2992055056119048, + "grad_norm": 4.795119762420654, + "learning_rate": 7.377845220030349e-06, + "loss": 0.2621, + "step": 1038 + }, + { + "epoch": 0.300070261408471, + "grad_norm": 3.725170135498047, + "learning_rate": 7.368740515933233e-06, + "loss": 0.2708, + "step": 1041 + }, + { + "epoch": 0.3009350172050372, + "grad_norm": 1.5772522687911987, + "learning_rate": 7.359635811836116e-06, + "loss": 0.1897, + "step": 1044 + }, + { + "epoch": 0.3017997730016034, + "grad_norm": 4.716231822967529, + "learning_rate": 7.3505311077389994e-06, + "loss": 0.2297, + "step": 1047 + }, + { + "epoch": 0.3026645287981696, + "grad_norm": 3.9254777431488037, + "learning_rate": 7.341426403641882e-06, + "loss": 0.2285, + "step": 1050 + }, + { + "epoch": 0.3035292845947358, + "grad_norm": 2.7238216400146484, + "learning_rate": 7.332321699544765e-06, + "loss": 0.2488, + "step": 1053 + }, + { + "epoch": 0.304394040391302, + "grad_norm": 7.061728477478027, + "learning_rate": 7.323216995447649e-06, + "loss": 0.2329, + "step": 1056 + }, + { + "epoch": 0.3052587961878682, + "grad_norm": 2.2323367595672607, + "learning_rate": 7.314112291350532e-06, + "loss": 0.2511, + "step": 1059 + }, + { + "epoch": 0.3061235519844344, + "grad_norm": 3.432279586791992, + "learning_rate": 7.305007587253415e-06, + "loss": 0.251, + "step": 1062 + }, + { + "epoch": 0.3069883077810006, + "grad_norm": 4.372462272644043, + "learning_rate": 7.295902883156298e-06, + "loss": 0.2306, + "step": 1065 + }, + { + "epoch": 0.3078530635775668, + "grad_norm": 3.428677558898926, + "learning_rate": 7.286798179059181e-06, + "loss": 0.2389, + "step": 1068 + }, + { + "epoch": 0.308717819374133, + "grad_norm": 1.7803095579147339, + "learning_rate": 7.277693474962064e-06, + "loss": 0.2374, + "step": 1071 + }, + { + "epoch": 0.3095825751706992, + "grad_norm": 1.7453327178955078, + "learning_rate": 7.2685887708649476e-06, + "loss": 0.2175, + "step": 1074 + }, + { + "epoch": 0.3104473309672654, + "grad_norm": 3.4764058589935303, + "learning_rate": 7.25948406676783e-06, + "loss": 0.2649, + "step": 1077 + }, + { + "epoch": 0.3113120867638316, + "grad_norm": 2.5065643787384033, + "learning_rate": 7.250379362670714e-06, + "loss": 0.2436, + "step": 1080 + }, + { + "epoch": 0.3121768425603978, + "grad_norm": 2.4735498428344727, + "learning_rate": 7.241274658573597e-06, + "loss": 0.2565, + "step": 1083 + }, + { + "epoch": 0.313041598356964, + "grad_norm": 7.107683181762695, + "learning_rate": 7.23216995447648e-06, + "loss": 0.2691, + "step": 1086 + }, + { + "epoch": 0.3139063541535302, + "grad_norm": 1.9517550468444824, + "learning_rate": 7.223065250379363e-06, + "loss": 0.2869, + "step": 1089 + }, + { + "epoch": 0.3147711099500964, + "grad_norm": 3.1179702281951904, + "learning_rate": 7.213960546282246e-06, + "loss": 0.2434, + "step": 1092 + }, + { + "epoch": 0.3156358657466626, + "grad_norm": 4.619999885559082, + "learning_rate": 7.20485584218513e-06, + "loss": 0.2469, + "step": 1095 + }, + { + "epoch": 0.3165006215432288, + "grad_norm": 6.7724528312683105, + "learning_rate": 7.195751138088013e-06, + "loss": 0.278, + "step": 1098 + }, + { + "epoch": 0.31707712540760624, + "eval_loss": 0.2247343808412552, + "eval_mse": 0.22473438137583435, + "eval_runtime": 6.5256, + "eval_samples_per_second": 153.242, + "eval_steps_per_second": 19.155, + "step": 1100 + }, + { + "epoch": 0.317365377339795, + "grad_norm": 1.6179205179214478, + "learning_rate": 7.186646433990896e-06, + "loss": 0.252, + "step": 1101 + }, + { + "epoch": 0.31823013313636117, + "grad_norm": 1.8803197145462036, + "learning_rate": 7.1775417298937785e-06, + "loss": 0.2227, + "step": 1104 + }, + { + "epoch": 0.31909488893292737, + "grad_norm": 2.2427573204040527, + "learning_rate": 7.168437025796661e-06, + "loss": 0.2399, + "step": 1107 + }, + { + "epoch": 0.31995964472949356, + "grad_norm": 1.907244086265564, + "learning_rate": 7.159332321699546e-06, + "loss": 0.2162, + "step": 1110 + }, + { + "epoch": 0.32082440052605976, + "grad_norm": 3.7878000736236572, + "learning_rate": 7.150227617602429e-06, + "loss": 0.246, + "step": 1113 + }, + { + "epoch": 0.32168915632262596, + "grad_norm": 1.9053196907043457, + "learning_rate": 7.141122913505312e-06, + "loss": 0.2684, + "step": 1116 + }, + { + "epoch": 0.32255391211919215, + "grad_norm": 5.108983039855957, + "learning_rate": 7.1320182094081944e-06, + "loss": 0.2237, + "step": 1119 + }, + { + "epoch": 0.32341866791575835, + "grad_norm": 2.2469422817230225, + "learning_rate": 7.122913505311077e-06, + "loss": 0.2462, + "step": 1122 + }, + { + "epoch": 0.32428342371232455, + "grad_norm": 5.127351760864258, + "learning_rate": 7.113808801213962e-06, + "loss": 0.2543, + "step": 1125 + }, + { + "epoch": 0.32514817950889074, + "grad_norm": 4.980170249938965, + "learning_rate": 7.104704097116845e-06, + "loss": 0.2775, + "step": 1128 + }, + { + "epoch": 0.326012935305457, + "grad_norm": 3.701903820037842, + "learning_rate": 7.0955993930197275e-06, + "loss": 0.2548, + "step": 1131 + }, + { + "epoch": 0.3268776911020232, + "grad_norm": 3.780144214630127, + "learning_rate": 7.08649468892261e-06, + "loss": 0.2687, + "step": 1134 + }, + { + "epoch": 0.3277424468985894, + "grad_norm": 2.2161099910736084, + "learning_rate": 7.077389984825493e-06, + "loss": 0.2351, + "step": 1137 + }, + { + "epoch": 0.3286072026951556, + "grad_norm": 4.7017998695373535, + "learning_rate": 7.068285280728376e-06, + "loss": 0.2431, + "step": 1140 + }, + { + "epoch": 0.3294719584917218, + "grad_norm": 2.053750991821289, + "learning_rate": 7.0591805766312606e-06, + "loss": 0.254, + "step": 1143 + }, + { + "epoch": 0.330336714288288, + "grad_norm": 2.8078341484069824, + "learning_rate": 7.050075872534143e-06, + "loss": 0.2646, + "step": 1146 + }, + { + "epoch": 0.3312014700848542, + "grad_norm": 2.585087776184082, + "learning_rate": 7.040971168437026e-06, + "loss": 0.2536, + "step": 1149 + }, + { + "epoch": 0.3320662258814204, + "grad_norm": 4.2963104248046875, + "learning_rate": 7.031866464339909e-06, + "loss": 0.2276, + "step": 1152 + }, + { + "epoch": 0.33293098167798657, + "grad_norm": 4.205751419067383, + "learning_rate": 7.022761760242792e-06, + "loss": 0.2404, + "step": 1155 + }, + { + "epoch": 0.33379573747455277, + "grad_norm": 3.460796356201172, + "learning_rate": 7.0136570561456765e-06, + "loss": 0.2473, + "step": 1158 + }, + { + "epoch": 0.33466049327111896, + "grad_norm": 3.529181957244873, + "learning_rate": 7.004552352048559e-06, + "loss": 0.24, + "step": 1161 + }, + { + "epoch": 0.33552524906768516, + "grad_norm": 3.203437328338623, + "learning_rate": 6.995447647951442e-06, + "loss": 0.2547, + "step": 1164 + }, + { + "epoch": 0.33639000486425136, + "grad_norm": 4.1535491943359375, + "learning_rate": 6.986342943854325e-06, + "loss": 0.2532, + "step": 1167 + }, + { + "epoch": 0.33725476066081755, + "grad_norm": 3.7933478355407715, + "learning_rate": 6.977238239757208e-06, + "loss": 0.2435, + "step": 1170 + }, + { + "epoch": 0.33811951645738375, + "grad_norm": 2.6703147888183594, + "learning_rate": 6.968133535660092e-06, + "loss": 0.2293, + "step": 1173 + }, + { + "epoch": 0.33898427225394995, + "grad_norm": 2.8900182247161865, + "learning_rate": 6.959028831562975e-06, + "loss": 0.2163, + "step": 1176 + }, + { + "epoch": 0.33984902805051614, + "grad_norm": 3.9563350677490234, + "learning_rate": 6.949924127465858e-06, + "loss": 0.2682, + "step": 1179 + }, + { + "epoch": 0.34071378384708234, + "grad_norm": 1.8461941480636597, + "learning_rate": 6.940819423368741e-06, + "loss": 0.2293, + "step": 1182 + }, + { + "epoch": 0.34157853964364854, + "grad_norm": 3.313368082046509, + "learning_rate": 6.931714719271624e-06, + "loss": 0.2436, + "step": 1185 + }, + { + "epoch": 0.34244329544021473, + "grad_norm": 1.7820873260498047, + "learning_rate": 6.922610015174508e-06, + "loss": 0.212, + "step": 1188 + }, + { + "epoch": 0.34330805123678093, + "grad_norm": 1.995291829109192, + "learning_rate": 6.913505311077391e-06, + "loss": 0.2243, + "step": 1191 + }, + { + "epoch": 0.34417280703334713, + "grad_norm": 2.928727626800537, + "learning_rate": 6.904400606980274e-06, + "loss": 0.2296, + "step": 1194 + }, + { + "epoch": 0.3450375628299133, + "grad_norm": 2.5598068237304688, + "learning_rate": 6.895295902883157e-06, + "loss": 0.2135, + "step": 1197 + }, + { + "epoch": 0.3459023186264795, + "grad_norm": 2.4700326919555664, + "learning_rate": 6.88619119878604e-06, + "loss": 0.2319, + "step": 1200 + }, + { + "epoch": 0.3459023186264795, + "eval_loss": 0.21415139734745026, + "eval_mse": 0.2141513990436215, + "eval_runtime": 6.5018, + "eval_samples_per_second": 153.803, + "eval_steps_per_second": 19.225, + "step": 1200 + }, + { + "epoch": 0.3467670744230457, + "grad_norm": 3.6825876235961914, + "learning_rate": 6.8770864946889225e-06, + "loss": 0.2409, + "step": 1203 + }, + { + "epoch": 0.3476318302196119, + "grad_norm": 2.8780245780944824, + "learning_rate": 6.867981790591807e-06, + "loss": 0.2219, + "step": 1206 + }, + { + "epoch": 0.34849658601617817, + "grad_norm": 2.641505479812622, + "learning_rate": 6.85887708649469e-06, + "loss": 0.2315, + "step": 1209 + }, + { + "epoch": 0.34936134181274436, + "grad_norm": 1.660909652709961, + "learning_rate": 6.849772382397573e-06, + "loss": 0.224, + "step": 1212 + }, + { + "epoch": 0.35022609760931056, + "grad_norm": 5.104085445404053, + "learning_rate": 6.8406676783004556e-06, + "loss": 0.2184, + "step": 1215 + }, + { + "epoch": 0.35109085340587676, + "grad_norm": 1.870938777923584, + "learning_rate": 6.831562974203338e-06, + "loss": 0.234, + "step": 1218 + }, + { + "epoch": 0.35195560920244295, + "grad_norm": 4.542322635650635, + "learning_rate": 6.822458270106223e-06, + "loss": 0.2247, + "step": 1221 + }, + { + "epoch": 0.35282036499900915, + "grad_norm": 1.4236701726913452, + "learning_rate": 6.813353566009106e-06, + "loss": 0.2168, + "step": 1224 + }, + { + "epoch": 0.35368512079557535, + "grad_norm": 2.0418410301208496, + "learning_rate": 6.804248861911989e-06, + "loss": 0.2425, + "step": 1227 + }, + { + "epoch": 0.35454987659214154, + "grad_norm": 2.868399143218994, + "learning_rate": 6.7951441578148715e-06, + "loss": 0.2354, + "step": 1230 + }, + { + "epoch": 0.35541463238870774, + "grad_norm": 2.453361749649048, + "learning_rate": 6.786039453717754e-06, + "loss": 0.2266, + "step": 1233 + }, + { + "epoch": 0.35627938818527394, + "grad_norm": 2.0826542377471924, + "learning_rate": 6.776934749620638e-06, + "loss": 0.2194, + "step": 1236 + }, + { + "epoch": 0.35714414398184013, + "grad_norm": 1.511440634727478, + "learning_rate": 6.767830045523522e-06, + "loss": 0.2264, + "step": 1239 + }, + { + "epoch": 0.35800889977840633, + "grad_norm": 2.456897020339966, + "learning_rate": 6.7587253414264045e-06, + "loss": 0.2864, + "step": 1242 + }, + { + "epoch": 0.3588736555749725, + "grad_norm": 2.873429298400879, + "learning_rate": 6.749620637329287e-06, + "loss": 0.2055, + "step": 1245 + }, + { + "epoch": 0.3597384113715387, + "grad_norm": 1.7501113414764404, + "learning_rate": 6.74051593323217e-06, + "loss": 0.2205, + "step": 1248 + }, + { + "epoch": 0.3606031671681049, + "grad_norm": 1.615105390548706, + "learning_rate": 6.731411229135054e-06, + "loss": 0.2337, + "step": 1251 + }, + { + "epoch": 0.3614679229646711, + "grad_norm": 1.6872748136520386, + "learning_rate": 6.722306525037937e-06, + "loss": 0.2164, + "step": 1254 + }, + { + "epoch": 0.3623326787612373, + "grad_norm": 3.1001312732696533, + "learning_rate": 6.7132018209408204e-06, + "loss": 0.2154, + "step": 1257 + }, + { + "epoch": 0.3631974345578035, + "grad_norm": 2.1165292263031006, + "learning_rate": 6.704097116843703e-06, + "loss": 0.2512, + "step": 1260 + }, + { + "epoch": 0.3640621903543697, + "grad_norm": 4.326318740844727, + "learning_rate": 6.694992412746586e-06, + "loss": 0.2471, + "step": 1263 + }, + { + "epoch": 0.3649269461509359, + "grad_norm": 5.773290634155273, + "learning_rate": 6.685887708649469e-06, + "loss": 0.2407, + "step": 1266 + }, + { + "epoch": 0.3657917019475021, + "grad_norm": 3.2723119258880615, + "learning_rate": 6.676783004552353e-06, + "loss": 0.3129, + "step": 1269 + }, + { + "epoch": 0.3666564577440683, + "grad_norm": 2.927086114883423, + "learning_rate": 6.6676783004552355e-06, + "loss": 0.231, + "step": 1272 + }, + { + "epoch": 0.3675212135406345, + "grad_norm": 1.7322252988815308, + "learning_rate": 6.658573596358119e-06, + "loss": 0.2504, + "step": 1275 + }, + { + "epoch": 0.3683859693372007, + "grad_norm": 2.5904715061187744, + "learning_rate": 6.649468892261002e-06, + "loss": 0.2079, + "step": 1278 + }, + { + "epoch": 0.3692507251337669, + "grad_norm": 2.6561062335968018, + "learning_rate": 6.640364188163885e-06, + "loss": 0.2423, + "step": 1281 + }, + { + "epoch": 0.3701154809303331, + "grad_norm": 3.3299241065979004, + "learning_rate": 6.6312594840667686e-06, + "loss": 0.2386, + "step": 1284 + }, + { + "epoch": 0.37098023672689934, + "grad_norm": 1.731477975845337, + "learning_rate": 6.622154779969651e-06, + "loss": 0.2017, + "step": 1287 + }, + { + "epoch": 0.37184499252346553, + "grad_norm": 2.5077965259552, + "learning_rate": 6.613050075872534e-06, + "loss": 0.2446, + "step": 1290 + }, + { + "epoch": 0.37270974832003173, + "grad_norm": 2.36556077003479, + "learning_rate": 6.603945371775418e-06, + "loss": 0.2544, + "step": 1293 + }, + { + "epoch": 0.3735745041165979, + "grad_norm": 1.9789232015609741, + "learning_rate": 6.594840667678301e-06, + "loss": 0.2247, + "step": 1296 + }, + { + "epoch": 0.3744392599131641, + "grad_norm": 1.7310489416122437, + "learning_rate": 6.5857359635811845e-06, + "loss": 0.1983, + "step": 1299 + }, + { + "epoch": 0.37472751184535286, + "eval_loss": 0.2175411880016327, + "eval_mse": 0.21754117820138344, + "eval_runtime": 7.1272, + "eval_samples_per_second": 140.308, + "eval_steps_per_second": 17.538, + "step": 1300 + }, + { + "epoch": 0.3753040157097303, + "grad_norm": 4.152148723602295, + "learning_rate": 6.576631259484067e-06, + "loss": 0.2899, + "step": 1302 + }, + { + "epoch": 0.3761687715062965, + "grad_norm": 2.1581296920776367, + "learning_rate": 6.56752655538695e-06, + "loss": 0.2448, + "step": 1305 + }, + { + "epoch": 0.3770335273028627, + "grad_norm": 2.8600330352783203, + "learning_rate": 6.558421851289834e-06, + "loss": 0.2084, + "step": 1308 + }, + { + "epoch": 0.3778982830994289, + "grad_norm": 1.5723686218261719, + "learning_rate": 6.549317147192717e-06, + "loss": 0.2441, + "step": 1311 + }, + { + "epoch": 0.3787630388959951, + "grad_norm": 3.4149153232574463, + "learning_rate": 6.5402124430956e-06, + "loss": 0.247, + "step": 1314 + }, + { + "epoch": 0.3796277946925613, + "grad_norm": 16.35414695739746, + "learning_rate": 6.531107738998483e-06, + "loss": 0.2478, + "step": 1317 + }, + { + "epoch": 0.3804925504891275, + "grad_norm": 6.227761268615723, + "learning_rate": 6.522003034901366e-06, + "loss": 0.2316, + "step": 1320 + }, + { + "epoch": 0.3813573062856937, + "grad_norm": 2.2669637203216553, + "learning_rate": 6.512898330804249e-06, + "loss": 0.2836, + "step": 1323 + }, + { + "epoch": 0.3822220620822599, + "grad_norm": 1.4385027885437012, + "learning_rate": 6.503793626707133e-06, + "loss": 0.2122, + "step": 1326 + }, + { + "epoch": 0.3830868178788261, + "grad_norm": 2.3909130096435547, + "learning_rate": 6.4946889226100154e-06, + "loss": 0.2357, + "step": 1329 + }, + { + "epoch": 0.3839515736753923, + "grad_norm": 1.7610464096069336, + "learning_rate": 6.485584218512899e-06, + "loss": 0.2746, + "step": 1332 + }, + { + "epoch": 0.3848163294719585, + "grad_norm": 2.8983278274536133, + "learning_rate": 6.476479514415782e-06, + "loss": 0.2176, + "step": 1335 + }, + { + "epoch": 0.3856810852685247, + "grad_norm": 1.7231597900390625, + "learning_rate": 6.467374810318665e-06, + "loss": 0.2065, + "step": 1338 + }, + { + "epoch": 0.3865458410650909, + "grad_norm": 1.6913188695907593, + "learning_rate": 6.458270106221548e-06, + "loss": 0.2541, + "step": 1341 + }, + { + "epoch": 0.3874105968616571, + "grad_norm": 2.1574645042419434, + "learning_rate": 6.449165402124431e-06, + "loss": 0.2393, + "step": 1344 + }, + { + "epoch": 0.38827535265822327, + "grad_norm": 3.8315231800079346, + "learning_rate": 6.440060698027315e-06, + "loss": 0.2353, + "step": 1347 + }, + { + "epoch": 0.38914010845478947, + "grad_norm": 2.655545711517334, + "learning_rate": 6.430955993930198e-06, + "loss": 0.2222, + "step": 1350 + }, + { + "epoch": 0.39000486425135567, + "grad_norm": 2.0346477031707764, + "learning_rate": 6.421851289833081e-06, + "loss": 0.2289, + "step": 1353 + }, + { + "epoch": 0.39086962004792186, + "grad_norm": 4.726449489593506, + "learning_rate": 6.4127465857359636e-06, + "loss": 0.2748, + "step": 1356 + }, + { + "epoch": 0.39173437584448806, + "grad_norm": 3.0121731758117676, + "learning_rate": 6.403641881638846e-06, + "loss": 0.2244, + "step": 1359 + }, + { + "epoch": 0.39259913164105426, + "grad_norm": 6.562178611755371, + "learning_rate": 6.394537177541731e-06, + "loss": 0.264, + "step": 1362 + }, + { + "epoch": 0.3934638874376205, + "grad_norm": 3.1092755794525146, + "learning_rate": 6.385432473444614e-06, + "loss": 0.228, + "step": 1365 + }, + { + "epoch": 0.3943286432341867, + "grad_norm": 4.431830406188965, + "learning_rate": 6.376327769347497e-06, + "loss": 0.2544, + "step": 1368 + }, + { + "epoch": 0.3951933990307529, + "grad_norm": 2.545694351196289, + "learning_rate": 6.3672230652503795e-06, + "loss": 0.2618, + "step": 1371 + }, + { + "epoch": 0.3960581548273191, + "grad_norm": 1.5483859777450562, + "learning_rate": 6.358118361153262e-06, + "loss": 0.282, + "step": 1374 + }, + { + "epoch": 0.3969229106238853, + "grad_norm": 1.750784993171692, + "learning_rate": 6.349013657056147e-06, + "loss": 0.2447, + "step": 1377 + }, + { + "epoch": 0.3977876664204515, + "grad_norm": 2.440020799636841, + "learning_rate": 6.33990895295903e-06, + "loss": 0.2472, + "step": 1380 + }, + { + "epoch": 0.3986524222170177, + "grad_norm": 1.4584132432937622, + "learning_rate": 6.3308042488619125e-06, + "loss": 0.2469, + "step": 1383 + }, + { + "epoch": 0.3995171780135839, + "grad_norm": 1.3845562934875488, + "learning_rate": 6.321699544764795e-06, + "loss": 0.2111, + "step": 1386 + }, + { + "epoch": 0.4003819338101501, + "grad_norm": 1.736345648765564, + "learning_rate": 6.312594840667678e-06, + "loss": 0.2203, + "step": 1389 + }, + { + "epoch": 0.4012466896067163, + "grad_norm": 2.5879809856414795, + "learning_rate": 6.303490136570563e-06, + "loss": 0.2776, + "step": 1392 + }, + { + "epoch": 0.4021114454032825, + "grad_norm": 1.3224149942398071, + "learning_rate": 6.294385432473446e-06, + "loss": 0.2016, + "step": 1395 + }, + { + "epoch": 0.40297620119984867, + "grad_norm": 2.7825825214385986, + "learning_rate": 6.2852807283763284e-06, + "loss": 0.2264, + "step": 1398 + }, + { + "epoch": 0.40355270506422614, + "eval_loss": 0.23064111173152924, + "eval_mse": 0.2306411211611703, + "eval_runtime": 6.653, + "eval_samples_per_second": 150.308, + "eval_steps_per_second": 18.789, + "step": 1400 + }, + { + "epoch": 0.40384095699641487, + "grad_norm": 3.059859275817871, + "learning_rate": 6.276176024279211e-06, + "loss": 0.1952, + "step": 1401 + }, + { + "epoch": 0.40470571279298106, + "grad_norm": 2.1431009769439697, + "learning_rate": 6.267071320182094e-06, + "loss": 0.2681, + "step": 1404 + }, + { + "epoch": 0.40557046858954726, + "grad_norm": 1.6716617345809937, + "learning_rate": 6.257966616084977e-06, + "loss": 0.221, + "step": 1407 + }, + { + "epoch": 0.40643522438611346, + "grad_norm": 1.9646525382995605, + "learning_rate": 6.2488619119878615e-06, + "loss": 0.2218, + "step": 1410 + }, + { + "epoch": 0.40729998018267966, + "grad_norm": 1.2912189960479736, + "learning_rate": 6.239757207890744e-06, + "loss": 0.2077, + "step": 1413 + }, + { + "epoch": 0.40816473597924585, + "grad_norm": 2.4723434448242188, + "learning_rate": 6.230652503793627e-06, + "loss": 0.2629, + "step": 1416 + }, + { + "epoch": 0.40902949177581205, + "grad_norm": 2.1053199768066406, + "learning_rate": 6.22154779969651e-06, + "loss": 0.2524, + "step": 1419 + }, + { + "epoch": 0.40989424757237825, + "grad_norm": 2.039580821990967, + "learning_rate": 6.212443095599393e-06, + "loss": 0.2035, + "step": 1422 + }, + { + "epoch": 0.41075900336894444, + "grad_norm": 1.499022364616394, + "learning_rate": 6.203338391502277e-06, + "loss": 0.2263, + "step": 1425 + }, + { + "epoch": 0.41162375916551064, + "grad_norm": 2.090580701828003, + "learning_rate": 6.19423368740516e-06, + "loss": 0.2834, + "step": 1428 + }, + { + "epoch": 0.41248851496207684, + "grad_norm": 2.0547232627868652, + "learning_rate": 6.185128983308043e-06, + "loss": 0.2725, + "step": 1431 + }, + { + "epoch": 0.41335327075864303, + "grad_norm": 1.8254441022872925, + "learning_rate": 6.176024279210926e-06, + "loss": 0.2234, + "step": 1434 + }, + { + "epoch": 0.41421802655520923, + "grad_norm": 1.860533595085144, + "learning_rate": 6.166919575113809e-06, + "loss": 0.2554, + "step": 1437 + }, + { + "epoch": 0.4150827823517754, + "grad_norm": 3.225929021835327, + "learning_rate": 6.157814871016693e-06, + "loss": 0.2784, + "step": 1440 + }, + { + "epoch": 0.4159475381483417, + "grad_norm": 2.2769436836242676, + "learning_rate": 6.148710166919576e-06, + "loss": 0.2261, + "step": 1443 + }, + { + "epoch": 0.4168122939449079, + "grad_norm": 1.6467565298080444, + "learning_rate": 6.139605462822459e-06, + "loss": 0.2702, + "step": 1446 + }, + { + "epoch": 0.41767704974147407, + "grad_norm": 3.0362329483032227, + "learning_rate": 6.130500758725342e-06, + "loss": 0.2348, + "step": 1449 + }, + { + "epoch": 0.41854180553804027, + "grad_norm": 1.8852200508117676, + "learning_rate": 6.121396054628225e-06, + "loss": 0.2222, + "step": 1452 + }, + { + "epoch": 0.41940656133460646, + "grad_norm": 2.119568109512329, + "learning_rate": 6.112291350531108e-06, + "loss": 0.226, + "step": 1455 + }, + { + "epoch": 0.42027131713117266, + "grad_norm": 2.534950017929077, + "learning_rate": 6.103186646433992e-06, + "loss": 0.2353, + "step": 1458 + }, + { + "epoch": 0.42113607292773886, + "grad_norm": 3.6363894939422607, + "learning_rate": 6.094081942336875e-06, + "loss": 0.2724, + "step": 1461 + }, + { + "epoch": 0.42200082872430505, + "grad_norm": 1.8480486869812012, + "learning_rate": 6.084977238239758e-06, + "loss": 0.2417, + "step": 1464 + }, + { + "epoch": 0.42286558452087125, + "grad_norm": 4.110941410064697, + "learning_rate": 6.075872534142641e-06, + "loss": 0.2061, + "step": 1467 + }, + { + "epoch": 0.42373034031743745, + "grad_norm": 2.7998435497283936, + "learning_rate": 6.0667678300455234e-06, + "loss": 0.231, + "step": 1470 + }, + { + "epoch": 0.42459509611400365, + "grad_norm": 1.7628705501556396, + "learning_rate": 6.057663125948408e-06, + "loss": 0.2193, + "step": 1473 + }, + { + "epoch": 0.42545985191056984, + "grad_norm": 2.7976937294006348, + "learning_rate": 6.048558421851291e-06, + "loss": 0.2361, + "step": 1476 + }, + { + "epoch": 0.42632460770713604, + "grad_norm": 2.4593019485473633, + "learning_rate": 6.039453717754174e-06, + "loss": 0.2494, + "step": 1479 + }, + { + "epoch": 0.42718936350370224, + "grad_norm": 2.5946741104125977, + "learning_rate": 6.0303490136570565e-06, + "loss": 0.2279, + "step": 1482 + }, + { + "epoch": 0.42805411930026843, + "grad_norm": 1.6827466487884521, + "learning_rate": 6.021244309559939e-06, + "loss": 0.2458, + "step": 1485 + }, + { + "epoch": 0.42891887509683463, + "grad_norm": 4.625283241271973, + "learning_rate": 6.012139605462823e-06, + "loss": 0.2707, + "step": 1488 + }, + { + "epoch": 0.4297836308934008, + "grad_norm": 2.733687400817871, + "learning_rate": 6.003034901365707e-06, + "loss": 0.2294, + "step": 1491 + }, + { + "epoch": 0.430648386689967, + "grad_norm": 1.38575279712677, + "learning_rate": 5.9939301972685896e-06, + "loss": 0.1992, + "step": 1494 + }, + { + "epoch": 0.4315131424865332, + "grad_norm": 1.9684631824493408, + "learning_rate": 5.984825493171472e-06, + "loss": 0.2118, + "step": 1497 + }, + { + "epoch": 0.4323778982830994, + "grad_norm": 3.40984845161438, + "learning_rate": 5.975720789074355e-06, + "loss": 0.2175, + "step": 1500 + }, + { + "epoch": 0.4323778982830994, + "eval_loss": 0.2375136762857437, + "eval_mse": 0.23751368772797288, + "eval_runtime": 6.7445, + "eval_samples_per_second": 148.27, + "eval_steps_per_second": 18.534, + "step": 1500 + }, + { + "epoch": 0.4332426540796656, + "grad_norm": 2.3788678646087646, + "learning_rate": 5.966616084977239e-06, + "loss": 0.2311, + "step": 1503 + }, + { + "epoch": 0.4341074098762318, + "grad_norm": 4.027227401733398, + "learning_rate": 5.957511380880122e-06, + "loss": 0.2456, + "step": 1506 + }, + { + "epoch": 0.434972165672798, + "grad_norm": 5.0818586349487305, + "learning_rate": 5.9484066767830055e-06, + "loss": 0.2631, + "step": 1509 + }, + { + "epoch": 0.4358369214693642, + "grad_norm": 4.373122215270996, + "learning_rate": 5.939301972685888e-06, + "loss": 0.243, + "step": 1512 + }, + { + "epoch": 0.4367016772659304, + "grad_norm": 3.31792950630188, + "learning_rate": 5.930197268588771e-06, + "loss": 0.2278, + "step": 1515 + }, + { + "epoch": 0.4375664330624966, + "grad_norm": 1.9427984952926636, + "learning_rate": 5.921092564491655e-06, + "loss": 0.2578, + "step": 1518 + }, + { + "epoch": 0.43843118885906285, + "grad_norm": 2.5355935096740723, + "learning_rate": 5.911987860394538e-06, + "loss": 0.2125, + "step": 1521 + }, + { + "epoch": 0.43929594465562904, + "grad_norm": 5.661628723144531, + "learning_rate": 5.9028831562974205e-06, + "loss": 0.2591, + "step": 1524 + }, + { + "epoch": 0.44016070045219524, + "grad_norm": 2.133945941925049, + "learning_rate": 5.893778452200304e-06, + "loss": 0.2577, + "step": 1527 + }, + { + "epoch": 0.44102545624876144, + "grad_norm": 2.8841874599456787, + "learning_rate": 5.884673748103187e-06, + "loss": 0.2344, + "step": 1530 + }, + { + "epoch": 0.44189021204532763, + "grad_norm": 1.6562261581420898, + "learning_rate": 5.87556904400607e-06, + "loss": 0.2133, + "step": 1533 + }, + { + "epoch": 0.44275496784189383, + "grad_norm": 3.133864164352417, + "learning_rate": 5.866464339908954e-06, + "loss": 0.2154, + "step": 1536 + }, + { + "epoch": 0.44361972363846003, + "grad_norm": 1.9966986179351807, + "learning_rate": 5.8573596358118364e-06, + "loss": 0.1964, + "step": 1539 + }, + { + "epoch": 0.4444844794350262, + "grad_norm": 1.9703294038772583, + "learning_rate": 5.848254931714719e-06, + "loss": 0.2238, + "step": 1542 + }, + { + "epoch": 0.4453492352315924, + "grad_norm": 3.2984211444854736, + "learning_rate": 5.839150227617603e-06, + "loss": 0.2628, + "step": 1545 + }, + { + "epoch": 0.4462139910281586, + "grad_norm": 3.6368560791015625, + "learning_rate": 5.830045523520486e-06, + "loss": 0.2567, + "step": 1548 + }, + { + "epoch": 0.4470787468247248, + "grad_norm": 2.366480827331543, + "learning_rate": 5.8209408194233695e-06, + "loss": 0.2493, + "step": 1551 + }, + { + "epoch": 0.447943502621291, + "grad_norm": 3.3239293098449707, + "learning_rate": 5.811836115326252e-06, + "loss": 0.2268, + "step": 1554 + }, + { + "epoch": 0.4488082584178572, + "grad_norm": 4.618416786193848, + "learning_rate": 5.802731411229135e-06, + "loss": 0.2456, + "step": 1557 + }, + { + "epoch": 0.4496730142144234, + "grad_norm": 2.826070785522461, + "learning_rate": 5.793626707132019e-06, + "loss": 0.2206, + "step": 1560 + }, + { + "epoch": 0.4505377700109896, + "grad_norm": 4.012238025665283, + "learning_rate": 5.784522003034902e-06, + "loss": 0.2157, + "step": 1563 + }, + { + "epoch": 0.4514025258075558, + "grad_norm": 2.0067975521087646, + "learning_rate": 5.775417298937785e-06, + "loss": 0.2489, + "step": 1566 + }, + { + "epoch": 0.452267281604122, + "grad_norm": 1.500907301902771, + "learning_rate": 5.766312594840668e-06, + "loss": 0.2348, + "step": 1569 + }, + { + "epoch": 0.4531320374006882, + "grad_norm": 1.8496533632278442, + "learning_rate": 5.757207890743551e-06, + "loss": 0.2019, + "step": 1572 + }, + { + "epoch": 0.4539967931972544, + "grad_norm": 3.222740650177002, + "learning_rate": 5.748103186646434e-06, + "loss": 0.2587, + "step": 1575 + }, + { + "epoch": 0.4548615489938206, + "grad_norm": 1.572925329208374, + "learning_rate": 5.738998482549318e-06, + "loss": 0.2265, + "step": 1578 + }, + { + "epoch": 0.4557263047903868, + "grad_norm": 1.603624701499939, + "learning_rate": 5.729893778452201e-06, + "loss": 0.2433, + "step": 1581 + }, + { + "epoch": 0.456591060586953, + "grad_norm": 3.0979552268981934, + "learning_rate": 5.720789074355084e-06, + "loss": 0.221, + "step": 1584 + }, + { + "epoch": 0.4574558163835192, + "grad_norm": 1.8691914081573486, + "learning_rate": 5.711684370257967e-06, + "loss": 0.2083, + "step": 1587 + }, + { + "epoch": 0.4583205721800854, + "grad_norm": 1.6643935441970825, + "learning_rate": 5.70257966616085e-06, + "loss": 0.2173, + "step": 1590 + }, + { + "epoch": 0.45918532797665157, + "grad_norm": 3.626629590988159, + "learning_rate": 5.693474962063733e-06, + "loss": 0.2256, + "step": 1593 + }, + { + "epoch": 0.46005008377321777, + "grad_norm": 1.6160587072372437, + "learning_rate": 5.684370257966616e-06, + "loss": 0.2197, + "step": 1596 + }, + { + "epoch": 0.460914839569784, + "grad_norm": 3.206094264984131, + "learning_rate": 5.6752655538695e-06, + "loss": 0.2461, + "step": 1599 + }, + { + "epoch": 0.46120309150197275, + "eval_loss": 0.24926815927028656, + "eval_mse": 0.24926817585621028, + "eval_runtime": 6.5749, + "eval_samples_per_second": 152.094, + "eval_steps_per_second": 19.012, + "step": 1600 + }, + { + "epoch": 0.4617795953663502, + "grad_norm": 4.698122024536133, + "learning_rate": 5.666160849772383e-06, + "loss": 0.2546, + "step": 1602 + }, + { + "epoch": 0.4626443511629164, + "grad_norm": 1.5181471109390259, + "learning_rate": 5.657056145675266e-06, + "loss": 0.2167, + "step": 1605 + }, + { + "epoch": 0.4635091069594826, + "grad_norm": 7.337409496307373, + "learning_rate": 5.647951441578149e-06, + "loss": 0.2435, + "step": 1608 + }, + { + "epoch": 0.4643738627560488, + "grad_norm": 1.6400500535964966, + "learning_rate": 5.6388467374810314e-06, + "loss": 0.2336, + "step": 1611 + }, + { + "epoch": 0.465238618552615, + "grad_norm": 1.7364429235458374, + "learning_rate": 5.629742033383916e-06, + "loss": 0.2397, + "step": 1614 + }, + { + "epoch": 0.4661033743491812, + "grad_norm": 2.7070679664611816, + "learning_rate": 5.620637329286799e-06, + "loss": 0.2389, + "step": 1617 + }, + { + "epoch": 0.4669681301457474, + "grad_norm": 1.28640878200531, + "learning_rate": 5.611532625189682e-06, + "loss": 0.202, + "step": 1620 + }, + { + "epoch": 0.4678328859423136, + "grad_norm": 2.6867973804473877, + "learning_rate": 5.6024279210925645e-06, + "loss": 0.2501, + "step": 1623 + }, + { + "epoch": 0.4686976417388798, + "grad_norm": 2.1441292762756348, + "learning_rate": 5.593323216995447e-06, + "loss": 0.2364, + "step": 1626 + }, + { + "epoch": 0.469562397535446, + "grad_norm": 1.6109544038772583, + "learning_rate": 5.584218512898332e-06, + "loss": 0.244, + "step": 1629 + }, + { + "epoch": 0.4704271533320122, + "grad_norm": 2.0842268466949463, + "learning_rate": 5.575113808801215e-06, + "loss": 0.2434, + "step": 1632 + }, + { + "epoch": 0.4712919091285784, + "grad_norm": 1.4527335166931152, + "learning_rate": 5.5660091047040976e-06, + "loss": 0.2113, + "step": 1635 + }, + { + "epoch": 0.4721566649251446, + "grad_norm": 2.0434927940368652, + "learning_rate": 5.55690440060698e-06, + "loss": 0.2404, + "step": 1638 + }, + { + "epoch": 0.4730214207217108, + "grad_norm": 3.0256736278533936, + "learning_rate": 5.547799696509863e-06, + "loss": 0.2652, + "step": 1641 + }, + { + "epoch": 0.47388617651827697, + "grad_norm": 2.3856120109558105, + "learning_rate": 5.538694992412748e-06, + "loss": 0.2156, + "step": 1644 + }, + { + "epoch": 0.47475093231484317, + "grad_norm": 2.0948779582977295, + "learning_rate": 5.529590288315631e-06, + "loss": 0.2245, + "step": 1647 + }, + { + "epoch": 0.47561568811140936, + "grad_norm": 1.7975860834121704, + "learning_rate": 5.5204855842185135e-06, + "loss": 0.2542, + "step": 1650 + }, + { + "epoch": 0.47648044390797556, + "grad_norm": 3.510812997817993, + "learning_rate": 5.511380880121396e-06, + "loss": 0.2288, + "step": 1653 + }, + { + "epoch": 0.47734519970454176, + "grad_norm": 3.4858286380767822, + "learning_rate": 5.502276176024279e-06, + "loss": 0.2497, + "step": 1656 + }, + { + "epoch": 0.47820995550110795, + "grad_norm": 2.605661630630493, + "learning_rate": 5.493171471927162e-06, + "loss": 0.2062, + "step": 1659 + }, + { + "epoch": 0.47907471129767415, + "grad_norm": 2.162203788757324, + "learning_rate": 5.4840667678300465e-06, + "loss": 0.2167, + "step": 1662 + }, + { + "epoch": 0.47993946709424035, + "grad_norm": 4.7574262619018555, + "learning_rate": 5.474962063732929e-06, + "loss": 0.2088, + "step": 1665 + }, + { + "epoch": 0.48080422289080654, + "grad_norm": 2.1833505630493164, + "learning_rate": 5.465857359635812e-06, + "loss": 0.2622, + "step": 1668 + }, + { + "epoch": 0.48166897868737274, + "grad_norm": 1.645045280456543, + "learning_rate": 5.456752655538695e-06, + "loss": 0.2164, + "step": 1671 + }, + { + "epoch": 0.48253373448393894, + "grad_norm": 2.270944356918335, + "learning_rate": 5.447647951441578e-06, + "loss": 0.2441, + "step": 1674 + }, + { + "epoch": 0.4833984902805052, + "grad_norm": 2.0133309364318848, + "learning_rate": 5.4385432473444624e-06, + "loss": 0.2032, + "step": 1677 + }, + { + "epoch": 0.4842632460770714, + "grad_norm": 2.5585310459136963, + "learning_rate": 5.429438543247345e-06, + "loss": 0.2385, + "step": 1680 + }, + { + "epoch": 0.4851280018736376, + "grad_norm": 1.7377713918685913, + "learning_rate": 5.420333839150228e-06, + "loss": 0.216, + "step": 1683 + }, + { + "epoch": 0.4859927576702038, + "grad_norm": 1.8322491645812988, + "learning_rate": 5.411229135053111e-06, + "loss": 0.2102, + "step": 1686 + }, + { + "epoch": 0.48685751346677, + "grad_norm": 1.8956815004348755, + "learning_rate": 5.402124430955994e-06, + "loss": 0.2561, + "step": 1689 + }, + { + "epoch": 0.4877222692633362, + "grad_norm": 1.5535213947296143, + "learning_rate": 5.393019726858878e-06, + "loss": 0.2099, + "step": 1692 + }, + { + "epoch": 0.48858702505990237, + "grad_norm": 2.924278974533081, + "learning_rate": 5.383915022761761e-06, + "loss": 0.2662, + "step": 1695 + }, + { + "epoch": 0.48945178085646857, + "grad_norm": 2.1653637886047363, + "learning_rate": 5.374810318664644e-06, + "loss": 0.2419, + "step": 1698 + }, + { + "epoch": 0.49002828472084603, + "eval_loss": 0.22344937920570374, + "eval_mse": 0.22344938813522458, + "eval_runtime": 6.5992, + "eval_samples_per_second": 151.533, + "eval_steps_per_second": 18.942, + "step": 1700 + }, + { + "epoch": 0.49031653665303476, + "grad_norm": 2.360328197479248, + "learning_rate": 5.365705614567527e-06, + "loss": 0.2661, + "step": 1701 + }, + { + "epoch": 0.49118129244960096, + "grad_norm": 2.328495502471924, + "learning_rate": 5.35660091047041e-06, + "loss": 0.2347, + "step": 1704 + }, + { + "epoch": 0.49204604824616716, + "grad_norm": 1.6670514345169067, + "learning_rate": 5.347496206373293e-06, + "loss": 0.2269, + "step": 1707 + }, + { + "epoch": 0.49291080404273335, + "grad_norm": 2.426805257797241, + "learning_rate": 5.338391502276177e-06, + "loss": 0.1946, + "step": 1710 + }, + { + "epoch": 0.49377555983929955, + "grad_norm": 1.7583879232406616, + "learning_rate": 5.32928679817906e-06, + "loss": 0.2235, + "step": 1713 + }, + { + "epoch": 0.49464031563586575, + "grad_norm": 1.7235326766967773, + "learning_rate": 5.320182094081943e-06, + "loss": 0.2395, + "step": 1716 + }, + { + "epoch": 0.49550507143243194, + "grad_norm": 1.4216803312301636, + "learning_rate": 5.311077389984826e-06, + "loss": 0.2634, + "step": 1719 + }, + { + "epoch": 0.49636982722899814, + "grad_norm": 1.2892686128616333, + "learning_rate": 5.301972685887709e-06, + "loss": 0.2083, + "step": 1722 + }, + { + "epoch": 0.49723458302556434, + "grad_norm": 2.5210540294647217, + "learning_rate": 5.292867981790593e-06, + "loss": 0.2017, + "step": 1725 + }, + { + "epoch": 0.49809933882213053, + "grad_norm": 5.790046691894531, + "learning_rate": 5.283763277693476e-06, + "loss": 0.2306, + "step": 1728 + }, + { + "epoch": 0.49896409461869673, + "grad_norm": 1.4158023595809937, + "learning_rate": 5.274658573596359e-06, + "loss": 0.266, + "step": 1731 + }, + { + "epoch": 0.4998288504152629, + "grad_norm": 3.0490024089813232, + "learning_rate": 5.2655538694992415e-06, + "loss": 0.2506, + "step": 1734 + }, + { + "epoch": 0.5006936062118291, + "grad_norm": 2.33208966255188, + "learning_rate": 5.256449165402124e-06, + "loss": 0.2088, + "step": 1737 + }, + { + "epoch": 0.5015583620083953, + "grad_norm": 1.8087997436523438, + "learning_rate": 5.247344461305008e-06, + "loss": 0.2095, + "step": 1740 + }, + { + "epoch": 0.5024231178049615, + "grad_norm": 1.5517979860305786, + "learning_rate": 5.238239757207892e-06, + "loss": 0.2342, + "step": 1743 + }, + { + "epoch": 0.5032878736015277, + "grad_norm": 1.841036319732666, + "learning_rate": 5.229135053110775e-06, + "loss": 0.2385, + "step": 1746 + }, + { + "epoch": 0.5041526293980939, + "grad_norm": 1.8034095764160156, + "learning_rate": 5.2200303490136574e-06, + "loss": 0.2537, + "step": 1749 + }, + { + "epoch": 0.5050173851946601, + "grad_norm": 3.617159366607666, + "learning_rate": 5.21092564491654e-06, + "loss": 0.2378, + "step": 1752 + }, + { + "epoch": 0.5058821409912263, + "grad_norm": 2.903215169906616, + "learning_rate": 5.201820940819424e-06, + "loss": 0.2356, + "step": 1755 + }, + { + "epoch": 0.5067468967877925, + "grad_norm": 2.084693193435669, + "learning_rate": 5.192716236722307e-06, + "loss": 0.2631, + "step": 1758 + }, + { + "epoch": 0.5076116525843587, + "grad_norm": 1.8994488716125488, + "learning_rate": 5.1836115326251905e-06, + "loss": 0.2742, + "step": 1761 + }, + { + "epoch": 0.5084764083809249, + "grad_norm": 2.651257276535034, + "learning_rate": 5.174506828528073e-06, + "loss": 0.2332, + "step": 1764 + }, + { + "epoch": 0.5093411641774911, + "grad_norm": 4.182311534881592, + "learning_rate": 5.165402124430956e-06, + "loss": 0.2467, + "step": 1767 + }, + { + "epoch": 0.5102059199740573, + "grad_norm": 27.990720748901367, + "learning_rate": 5.15629742033384e-06, + "loss": 0.2135, + "step": 1770 + }, + { + "epoch": 0.5110706757706235, + "grad_norm": 1.942474126815796, + "learning_rate": 5.147192716236723e-06, + "loss": 0.2443, + "step": 1773 + }, + { + "epoch": 0.5119354315671897, + "grad_norm": 2.768105983734131, + "learning_rate": 5.1380880121396055e-06, + "loss": 0.2566, + "step": 1776 + }, + { + "epoch": 0.5128001873637559, + "grad_norm": 2.423797607421875, + "learning_rate": 5.128983308042489e-06, + "loss": 0.2667, + "step": 1779 + }, + { + "epoch": 0.5136649431603221, + "grad_norm": 2.395047426223755, + "learning_rate": 5.119878603945372e-06, + "loss": 0.2495, + "step": 1782 + }, + { + "epoch": 0.5145296989568883, + "grad_norm": 2.577787160873413, + "learning_rate": 5.110773899848256e-06, + "loss": 0.2454, + "step": 1785 + }, + { + "epoch": 0.5153944547534545, + "grad_norm": 3.106776714324951, + "learning_rate": 5.101669195751139e-06, + "loss": 0.2162, + "step": 1788 + }, + { + "epoch": 0.5162592105500207, + "grad_norm": 1.5912446975708008, + "learning_rate": 5.0925644916540215e-06, + "loss": 0.2201, + "step": 1791 + }, + { + "epoch": 0.517123966346587, + "grad_norm": 2.427795171737671, + "learning_rate": 5.083459787556905e-06, + "loss": 0.236, + "step": 1794 + }, + { + "epoch": 0.5179887221431532, + "grad_norm": 2.5363399982452393, + "learning_rate": 5.074355083459788e-06, + "loss": 0.2551, + "step": 1797 + }, + { + "epoch": 0.5188534779397194, + "grad_norm": 1.8077950477600098, + "learning_rate": 5.065250379362671e-06, + "loss": 0.2411, + "step": 1800 + }, + { + "epoch": 0.5188534779397194, + "eval_loss": 0.21374772489070892, + "eval_mse": 0.21374773593991994, + "eval_runtime": 6.6116, + "eval_samples_per_second": 151.249, + "eval_steps_per_second": 18.906, + "step": 1800 + }, + { + "epoch": 0.5197182337362856, + "grad_norm": 5.243107795715332, + "learning_rate": 5.0561456752655545e-06, + "loss": 0.243, + "step": 1803 + }, + { + "epoch": 0.5205829895328518, + "grad_norm": 2.7711331844329834, + "learning_rate": 5.047040971168437e-06, + "loss": 0.2037, + "step": 1806 + }, + { + "epoch": 0.521447745329418, + "grad_norm": 2.8418238162994385, + "learning_rate": 5.03793626707132e-06, + "loss": 0.2331, + "step": 1809 + }, + { + "epoch": 0.5223125011259842, + "grad_norm": 3.0842299461364746, + "learning_rate": 5.028831562974204e-06, + "loss": 0.2422, + "step": 1812 + }, + { + "epoch": 0.5231772569225503, + "grad_norm": 2.5956835746765137, + "learning_rate": 5.019726858877087e-06, + "loss": 0.2057, + "step": 1815 + }, + { + "epoch": 0.5240420127191165, + "grad_norm": 1.3121715784072876, + "learning_rate": 5.0106221547799704e-06, + "loss": 0.2387, + "step": 1818 + }, + { + "epoch": 0.5249067685156827, + "grad_norm": 2.2341432571411133, + "learning_rate": 5.001517450682853e-06, + "loss": 0.2611, + "step": 1821 + }, + { + "epoch": 0.5257715243122489, + "grad_norm": 2.0494587421417236, + "learning_rate": 4.992412746585736e-06, + "loss": 0.2238, + "step": 1824 + }, + { + "epoch": 0.5266362801088151, + "grad_norm": 2.2597897052764893, + "learning_rate": 4.983308042488619e-06, + "loss": 0.2322, + "step": 1827 + }, + { + "epoch": 0.5275010359053813, + "grad_norm": 3.5993051528930664, + "learning_rate": 4.974203338391503e-06, + "loss": 0.234, + "step": 1830 + }, + { + "epoch": 0.5283657917019475, + "grad_norm": 3.769505262374878, + "learning_rate": 4.9650986342943855e-06, + "loss": 0.2454, + "step": 1833 + }, + { + "epoch": 0.5292305474985137, + "grad_norm": 1.4726693630218506, + "learning_rate": 4.955993930197269e-06, + "loss": 0.2261, + "step": 1836 + }, + { + "epoch": 0.5300953032950799, + "grad_norm": 2.2910659313201904, + "learning_rate": 4.946889226100152e-06, + "loss": 0.2551, + "step": 1839 + }, + { + "epoch": 0.5309600590916461, + "grad_norm": 1.8401825428009033, + "learning_rate": 4.937784522003035e-06, + "loss": 0.1891, + "step": 1842 + }, + { + "epoch": 0.5318248148882123, + "grad_norm": 2.0589776039123535, + "learning_rate": 4.9286798179059185e-06, + "loss": 0.2475, + "step": 1845 + }, + { + "epoch": 0.5326895706847785, + "grad_norm": 1.8461434841156006, + "learning_rate": 4.919575113808801e-06, + "loss": 0.2516, + "step": 1848 + }, + { + "epoch": 0.5335543264813447, + "grad_norm": 1.8950605392456055, + "learning_rate": 4.910470409711684e-06, + "loss": 0.246, + "step": 1851 + }, + { + "epoch": 0.5344190822779109, + "grad_norm": 1.7544567584991455, + "learning_rate": 4.901365705614568e-06, + "loss": 0.2159, + "step": 1854 + }, + { + "epoch": 0.5352838380744771, + "grad_norm": 2.04953932762146, + "learning_rate": 4.892261001517451e-06, + "loss": 0.2745, + "step": 1857 + }, + { + "epoch": 0.5361485938710433, + "grad_norm": 2.173112154006958, + "learning_rate": 4.8831562974203345e-06, + "loss": 0.2222, + "step": 1860 + }, + { + "epoch": 0.5370133496676095, + "grad_norm": 1.4713711738586426, + "learning_rate": 4.874051593323217e-06, + "loss": 0.2182, + "step": 1863 + }, + { + "epoch": 0.5378781054641757, + "grad_norm": 2.421405792236328, + "learning_rate": 4.8649468892261e-06, + "loss": 0.2472, + "step": 1866 + }, + { + "epoch": 0.5387428612607419, + "grad_norm": 2.6306304931640625, + "learning_rate": 4.855842185128984e-06, + "loss": 0.2148, + "step": 1869 + }, + { + "epoch": 0.5396076170573081, + "grad_norm": 3.4957375526428223, + "learning_rate": 4.846737481031867e-06, + "loss": 0.2321, + "step": 1872 + }, + { + "epoch": 0.5404723728538743, + "grad_norm": 4.008154392242432, + "learning_rate": 4.8376327769347495e-06, + "loss": 0.216, + "step": 1875 + }, + { + "epoch": 0.5413371286504405, + "grad_norm": 2.417433977127075, + "learning_rate": 4.828528072837633e-06, + "loss": 0.2261, + "step": 1878 + }, + { + "epoch": 0.5422018844470067, + "grad_norm": 1.7908028364181519, + "learning_rate": 4.819423368740516e-06, + "loss": 0.2445, + "step": 1881 + }, + { + "epoch": 0.5430666402435729, + "grad_norm": 1.3756656646728516, + "learning_rate": 4.8103186646434e-06, + "loss": 0.201, + "step": 1884 + }, + { + "epoch": 0.5439313960401391, + "grad_norm": 1.686787724494934, + "learning_rate": 4.801213960546283e-06, + "loss": 0.217, + "step": 1887 + }, + { + "epoch": 0.5447961518367053, + "grad_norm": 3.6207942962646484, + "learning_rate": 4.792109256449165e-06, + "loss": 0.2746, + "step": 1890 + }, + { + "epoch": 0.5456609076332715, + "grad_norm": 3.838956117630005, + "learning_rate": 4.783004552352049e-06, + "loss": 0.2313, + "step": 1893 + }, + { + "epoch": 0.5465256634298377, + "grad_norm": 2.059926748275757, + "learning_rate": 4.773899848254932e-06, + "loss": 0.2302, + "step": 1896 + }, + { + "epoch": 0.5473904192264039, + "grad_norm": 1.9524738788604736, + "learning_rate": 4.764795144157816e-06, + "loss": 0.2473, + "step": 1899 + }, + { + "epoch": 0.5476786711585926, + "eval_loss": 0.21403329074382782, + "eval_mse": 0.2140333094932139, + "eval_runtime": 6.5394, + "eval_samples_per_second": 152.918, + "eval_steps_per_second": 19.115, + "step": 1900 + }, + { + "epoch": 0.54825517502297, + "grad_norm": 1.8144432306289673, + "learning_rate": 4.7556904400606985e-06, + "loss": 0.2082, + "step": 1902 + }, + { + "epoch": 0.5491199308195363, + "grad_norm": 2.4071717262268066, + "learning_rate": 4.746585735963581e-06, + "loss": 0.2364, + "step": 1905 + }, + { + "epoch": 0.5499846866161024, + "grad_norm": 1.7162179946899414, + "learning_rate": 4.737481031866465e-06, + "loss": 0.2119, + "step": 1908 + }, + { + "epoch": 0.5508494424126686, + "grad_norm": 2.368528366088867, + "learning_rate": 4.728376327769348e-06, + "loss": 0.2314, + "step": 1911 + }, + { + "epoch": 0.5517141982092348, + "grad_norm": 3.422670602798462, + "learning_rate": 4.719271623672231e-06, + "loss": 0.2355, + "step": 1914 + }, + { + "epoch": 0.552578954005801, + "grad_norm": 2.324976682662964, + "learning_rate": 4.710166919575114e-06, + "loss": 0.2483, + "step": 1917 + }, + { + "epoch": 0.5534437098023672, + "grad_norm": 2.3686418533325195, + "learning_rate": 4.701062215477997e-06, + "loss": 0.204, + "step": 1920 + }, + { + "epoch": 0.5543084655989334, + "grad_norm": 2.1361286640167236, + "learning_rate": 4.691957511380881e-06, + "loss": 0.195, + "step": 1923 + }, + { + "epoch": 0.5551732213954996, + "grad_norm": 1.5527316331863403, + "learning_rate": 4.682852807283764e-06, + "loss": 0.2232, + "step": 1926 + }, + { + "epoch": 0.5560379771920658, + "grad_norm": 3.792592763900757, + "learning_rate": 4.673748103186647e-06, + "loss": 0.2157, + "step": 1929 + }, + { + "epoch": 0.556902732988632, + "grad_norm": 1.8878562450408936, + "learning_rate": 4.66464339908953e-06, + "loss": 0.21, + "step": 1932 + }, + { + "epoch": 0.5577674887851982, + "grad_norm": 1.686164140701294, + "learning_rate": 4.655538694992413e-06, + "loss": 0.2177, + "step": 1935 + }, + { + "epoch": 0.5586322445817644, + "grad_norm": 2.310054302215576, + "learning_rate": 4.646433990895296e-06, + "loss": 0.2039, + "step": 1938 + }, + { + "epoch": 0.5594970003783306, + "grad_norm": 1.8163293600082397, + "learning_rate": 4.63732928679818e-06, + "loss": 0.2117, + "step": 1941 + }, + { + "epoch": 0.5603617561748968, + "grad_norm": 1.608209490776062, + "learning_rate": 4.6282245827010625e-06, + "loss": 0.2361, + "step": 1944 + }, + { + "epoch": 0.561226511971463, + "grad_norm": 1.4159106016159058, + "learning_rate": 4.619119878603946e-06, + "loss": 0.2042, + "step": 1947 + }, + { + "epoch": 0.5620912677680293, + "grad_norm": 2.920888900756836, + "learning_rate": 4.610015174506829e-06, + "loss": 0.2715, + "step": 1950 + }, + { + "epoch": 0.5629560235645955, + "grad_norm": 1.1805585622787476, + "learning_rate": 4.600910470409712e-06, + "loss": 0.2329, + "step": 1953 + }, + { + "epoch": 0.5638207793611617, + "grad_norm": 3.1957271099090576, + "learning_rate": 4.591805766312596e-06, + "loss": 0.253, + "step": 1956 + }, + { + "epoch": 0.5646855351577279, + "grad_norm": 2.3281495571136475, + "learning_rate": 4.582701062215478e-06, + "loss": 0.2508, + "step": 1959 + }, + { + "epoch": 0.5655502909542941, + "grad_norm": 1.9826480150222778, + "learning_rate": 4.573596358118362e-06, + "loss": 0.2107, + "step": 1962 + }, + { + "epoch": 0.5664150467508603, + "grad_norm": 5.98641300201416, + "learning_rate": 4.564491654021245e-06, + "loss": 0.2343, + "step": 1965 + }, + { + "epoch": 0.5672798025474265, + "grad_norm": 4.082788944244385, + "learning_rate": 4.555386949924128e-06, + "loss": 0.2493, + "step": 1968 + }, + { + "epoch": 0.5681445583439927, + "grad_norm": 1.6683293581008911, + "learning_rate": 4.5462822458270115e-06, + "loss": 0.1892, + "step": 1971 + }, + { + "epoch": 0.5690093141405589, + "grad_norm": 4.348227024078369, + "learning_rate": 4.537177541729894e-06, + "loss": 0.2277, + "step": 1974 + }, + { + "epoch": 0.5698740699371251, + "grad_norm": 1.2774113416671753, + "learning_rate": 4.528072837632777e-06, + "loss": 0.2077, + "step": 1977 + }, + { + "epoch": 0.5707388257336913, + "grad_norm": 1.5915591716766357, + "learning_rate": 4.518968133535661e-06, + "loss": 0.2284, + "step": 1980 + }, + { + "epoch": 0.5716035815302575, + "grad_norm": 1.3817569017410278, + "learning_rate": 4.509863429438544e-06, + "loss": 0.2159, + "step": 1983 + }, + { + "epoch": 0.5724683373268237, + "grad_norm": 1.3606081008911133, + "learning_rate": 4.500758725341427e-06, + "loss": 0.2055, + "step": 1986 + }, + { + "epoch": 0.5733330931233899, + "grad_norm": 2.6602344512939453, + "learning_rate": 4.49165402124431e-06, + "loss": 0.2611, + "step": 1989 + }, + { + "epoch": 0.5741978489199561, + "grad_norm": 1.796570897102356, + "learning_rate": 4.482549317147193e-06, + "loss": 0.1925, + "step": 1992 + }, + { + "epoch": 0.5750626047165223, + "grad_norm": 2.4218592643737793, + "learning_rate": 4.473444613050077e-06, + "loss": 0.1992, + "step": 1995 + }, + { + "epoch": 0.5759273605130885, + "grad_norm": 1.9690345525741577, + "learning_rate": 4.46433990895296e-06, + "loss": 0.237, + "step": 1998 + }, + { + "epoch": 0.5765038643774659, + "eval_loss": 0.2176571637392044, + "eval_mse": 0.2176571699755732, + "eval_runtime": 6.5589, + "eval_samples_per_second": 152.465, + "eval_steps_per_second": 19.058, + "step": 2000 + }, + { + "epoch": 0.5767921163096547, + "grad_norm": 2.168555498123169, + "learning_rate": 4.4552352048558425e-06, + "loss": 0.2009, + "step": 2001 + }, + { + "epoch": 0.5776568721062209, + "grad_norm": 3.1508069038391113, + "learning_rate": 4.446130500758726e-06, + "loss": 0.2316, + "step": 2004 + }, + { + "epoch": 0.578521627902787, + "grad_norm": 1.686283826828003, + "learning_rate": 4.437025796661609e-06, + "loss": 0.2313, + "step": 2007 + }, + { + "epoch": 0.5793863836993532, + "grad_norm": 1.3212209939956665, + "learning_rate": 4.427921092564492e-06, + "loss": 0.2076, + "step": 2010 + }, + { + "epoch": 0.5802511394959194, + "grad_norm": 3.2884185314178467, + "learning_rate": 4.4188163884673755e-06, + "loss": 0.2345, + "step": 2013 + }, + { + "epoch": 0.5811158952924856, + "grad_norm": 1.9892568588256836, + "learning_rate": 4.409711684370258e-06, + "loss": 0.2428, + "step": 2016 + }, + { + "epoch": 0.5819806510890518, + "grad_norm": 1.5294564962387085, + "learning_rate": 4.400606980273141e-06, + "loss": 0.2252, + "step": 2019 + }, + { + "epoch": 0.582845406885618, + "grad_norm": 2.2944376468658447, + "learning_rate": 4.391502276176025e-06, + "loss": 0.2429, + "step": 2022 + }, + { + "epoch": 0.5837101626821842, + "grad_norm": 1.584145426750183, + "learning_rate": 4.382397572078908e-06, + "loss": 0.2548, + "step": 2025 + }, + { + "epoch": 0.5845749184787504, + "grad_norm": 1.604771375656128, + "learning_rate": 4.3732928679817906e-06, + "loss": 0.2211, + "step": 2028 + }, + { + "epoch": 0.5854396742753166, + "grad_norm": 1.7536587715148926, + "learning_rate": 4.364188163884674e-06, + "loss": 0.2165, + "step": 2031 + }, + { + "epoch": 0.5863044300718828, + "grad_norm": 1.6281161308288574, + "learning_rate": 4.355083459787557e-06, + "loss": 0.2609, + "step": 2034 + }, + { + "epoch": 0.587169185868449, + "grad_norm": 1.77180016040802, + "learning_rate": 4.34597875569044e-06, + "loss": 0.2333, + "step": 2037 + }, + { + "epoch": 0.5880339416650152, + "grad_norm": 3.2408559322357178, + "learning_rate": 4.336874051593324e-06, + "loss": 0.2454, + "step": 2040 + }, + { + "epoch": 0.5888986974615814, + "grad_norm": 3.4444427490234375, + "learning_rate": 4.3277693474962065e-06, + "loss": 0.2549, + "step": 2043 + }, + { + "epoch": 0.5897634532581476, + "grad_norm": 4.399412155151367, + "learning_rate": 4.31866464339909e-06, + "loss": 0.2258, + "step": 2046 + }, + { + "epoch": 0.5906282090547138, + "grad_norm": 1.646681785583496, + "learning_rate": 4.309559939301973e-06, + "loss": 0.1858, + "step": 2049 + }, + { + "epoch": 0.59149296485128, + "grad_norm": 2.786576986312866, + "learning_rate": 4.300455235204856e-06, + "loss": 0.2085, + "step": 2052 + }, + { + "epoch": 0.5923577206478462, + "grad_norm": 3.0838379859924316, + "learning_rate": 4.2913505311077395e-06, + "loss": 0.2428, + "step": 2055 + }, + { + "epoch": 0.5932224764444124, + "grad_norm": 1.518548846244812, + "learning_rate": 4.282245827010622e-06, + "loss": 0.2159, + "step": 2058 + }, + { + "epoch": 0.5940872322409786, + "grad_norm": 2.2088887691497803, + "learning_rate": 4.273141122913505e-06, + "loss": 0.2107, + "step": 2061 + }, + { + "epoch": 0.5949519880375448, + "grad_norm": 4.359911918640137, + "learning_rate": 4.264036418816389e-06, + "loss": 0.2461, + "step": 2064 + }, + { + "epoch": 0.595816743834111, + "grad_norm": 1.796669840812683, + "learning_rate": 4.254931714719272e-06, + "loss": 0.1836, + "step": 2067 + }, + { + "epoch": 0.5966814996306772, + "grad_norm": 2.0835959911346436, + "learning_rate": 4.245827010622155e-06, + "loss": 0.2192, + "step": 2070 + }, + { + "epoch": 0.5975462554272434, + "grad_norm": 2.772815227508545, + "learning_rate": 4.236722306525038e-06, + "loss": 0.1974, + "step": 2073 + }, + { + "epoch": 0.5984110112238096, + "grad_norm": 1.8529541492462158, + "learning_rate": 4.227617602427921e-06, + "loss": 0.1946, + "step": 2076 + }, + { + "epoch": 0.5992757670203758, + "grad_norm": 2.0415849685668945, + "learning_rate": 4.218512898330804e-06, + "loss": 0.2182, + "step": 2079 + }, + { + "epoch": 0.600140522816942, + "grad_norm": 2.6295053958892822, + "learning_rate": 4.209408194233688e-06, + "loss": 0.2438, + "step": 2082 + }, + { + "epoch": 0.6010052786135082, + "grad_norm": 1.9082621335983276, + "learning_rate": 4.2003034901365705e-06, + "loss": 0.1834, + "step": 2085 + }, + { + "epoch": 0.6018700344100744, + "grad_norm": 1.692436933517456, + "learning_rate": 4.191198786039454e-06, + "loss": 0.2104, + "step": 2088 + }, + { + "epoch": 0.6027347902066406, + "grad_norm": 2.0836997032165527, + "learning_rate": 4.182094081942337e-06, + "loss": 0.2323, + "step": 2091 + }, + { + "epoch": 0.6035995460032068, + "grad_norm": 1.354612946510315, + "learning_rate": 4.17298937784522e-06, + "loss": 0.1975, + "step": 2094 + }, + { + "epoch": 0.604464301799773, + "grad_norm": 3.683278799057007, + "learning_rate": 4.1638846737481036e-06, + "loss": 0.2256, + "step": 2097 + }, + { + "epoch": 0.6053290575963391, + "grad_norm": 2.232513189315796, + "learning_rate": 4.154779969650986e-06, + "loss": 0.1972, + "step": 2100 + }, + { + "epoch": 0.6053290575963391, + "eval_loss": 0.21862919628620148, + "eval_mse": 0.218629194105044, + "eval_runtime": 6.7266, + "eval_samples_per_second": 148.664, + "eval_steps_per_second": 18.583, + "step": 2100 + }, + { + "epoch": 0.6061938133929053, + "grad_norm": 1.5396499633789062, + "learning_rate": 4.145675265553869e-06, + "loss": 0.2151, + "step": 2103 + }, + { + "epoch": 0.6070585691894717, + "grad_norm": 1.482253909111023, + "learning_rate": 4.136570561456753e-06, + "loss": 0.2222, + "step": 2106 + }, + { + "epoch": 0.6079233249860378, + "grad_norm": 4.880987644195557, + "learning_rate": 4.127465857359636e-06, + "loss": 0.2535, + "step": 2109 + }, + { + "epoch": 0.608788080782604, + "grad_norm": 2.1557230949401855, + "learning_rate": 4.1183611532625195e-06, + "loss": 0.2213, + "step": 2112 + }, + { + "epoch": 0.6096528365791702, + "grad_norm": 2.417856454849243, + "learning_rate": 4.109256449165402e-06, + "loss": 0.2481, + "step": 2115 + }, + { + "epoch": 0.6105175923757364, + "grad_norm": 2.211514949798584, + "learning_rate": 4.100151745068285e-06, + "loss": 0.2369, + "step": 2118 + }, + { + "epoch": 0.6113823481723026, + "grad_norm": 2.2844600677490234, + "learning_rate": 4.091047040971169e-06, + "loss": 0.2174, + "step": 2121 + }, + { + "epoch": 0.6122471039688688, + "grad_norm": 2.7534289360046387, + "learning_rate": 4.081942336874052e-06, + "loss": 0.22, + "step": 2124 + }, + { + "epoch": 0.613111859765435, + "grad_norm": 1.5547044277191162, + "learning_rate": 4.072837632776935e-06, + "loss": 0.227, + "step": 2127 + }, + { + "epoch": 0.6139766155620012, + "grad_norm": 1.6965092420578003, + "learning_rate": 4.063732928679818e-06, + "loss": 0.219, + "step": 2130 + }, + { + "epoch": 0.6148413713585674, + "grad_norm": 1.8000000715255737, + "learning_rate": 4.054628224582701e-06, + "loss": 0.2303, + "step": 2133 + }, + { + "epoch": 0.6157061271551336, + "grad_norm": 1.5764316320419312, + "learning_rate": 4.045523520485585e-06, + "loss": 0.191, + "step": 2136 + }, + { + "epoch": 0.6165708829516998, + "grad_norm": 2.0041658878326416, + "learning_rate": 4.036418816388468e-06, + "loss": 0.2025, + "step": 2139 + }, + { + "epoch": 0.617435638748266, + "grad_norm": 1.9013463258743286, + "learning_rate": 4.0273141122913504e-06, + "loss": 0.225, + "step": 2142 + }, + { + "epoch": 0.6183003945448322, + "grad_norm": 2.1815786361694336, + "learning_rate": 4.018209408194234e-06, + "loss": 0.2409, + "step": 2145 + }, + { + "epoch": 0.6191651503413984, + "grad_norm": 1.6740418672561646, + "learning_rate": 4.009104704097117e-06, + "loss": 0.2143, + "step": 2148 + }, + { + "epoch": 0.6200299061379646, + "grad_norm": 2.1056814193725586, + "learning_rate": 4.000000000000001e-06, + "loss": 0.2365, + "step": 2151 + }, + { + "epoch": 0.6208946619345308, + "grad_norm": 2.629563808441162, + "learning_rate": 3.9908952959028835e-06, + "loss": 0.1932, + "step": 2154 + }, + { + "epoch": 0.621759417731097, + "grad_norm": 1.7547650337219238, + "learning_rate": 3.981790591805766e-06, + "loss": 0.246, + "step": 2157 + }, + { + "epoch": 0.6226241735276632, + "grad_norm": 2.2451794147491455, + "learning_rate": 3.97268588770865e-06, + "loss": 0.2405, + "step": 2160 + }, + { + "epoch": 0.6234889293242294, + "grad_norm": 2.8820624351501465, + "learning_rate": 3.963581183611533e-06, + "loss": 0.253, + "step": 2163 + }, + { + "epoch": 0.6243536851207956, + "grad_norm": 2.9832215309143066, + "learning_rate": 3.9544764795144166e-06, + "loss": 0.223, + "step": 2166 + }, + { + "epoch": 0.6252184409173618, + "grad_norm": 2.911879539489746, + "learning_rate": 3.945371775417299e-06, + "loss": 0.2177, + "step": 2169 + }, + { + "epoch": 0.626083196713928, + "grad_norm": 2.266767740249634, + "learning_rate": 3.936267071320182e-06, + "loss": 0.2288, + "step": 2172 + }, + { + "epoch": 0.6269479525104942, + "grad_norm": 1.401633858680725, + "learning_rate": 3.927162367223066e-06, + "loss": 0.2556, + "step": 2175 + }, + { + "epoch": 0.6278127083070604, + "grad_norm": 2.7354822158813477, + "learning_rate": 3.918057663125949e-06, + "loss": 0.2439, + "step": 2178 + }, + { + "epoch": 0.6286774641036266, + "grad_norm": 1.6652506589889526, + "learning_rate": 3.908952959028832e-06, + "loss": 0.2041, + "step": 2181 + }, + { + "epoch": 0.6295422199001928, + "grad_norm": 3.3072733879089355, + "learning_rate": 3.899848254931715e-06, + "loss": 0.2207, + "step": 2184 + }, + { + "epoch": 0.630406975696759, + "grad_norm": 2.254608392715454, + "learning_rate": 3.890743550834598e-06, + "loss": 0.2352, + "step": 2187 + }, + { + "epoch": 0.6312717314933252, + "grad_norm": 1.765981674194336, + "learning_rate": 3.881638846737482e-06, + "loss": 0.2923, + "step": 2190 + }, + { + "epoch": 0.6321364872898914, + "grad_norm": 1.7792342901229858, + "learning_rate": 3.872534142640365e-06, + "loss": 0.2428, + "step": 2193 + }, + { + "epoch": 0.6330012430864576, + "grad_norm": 5.084781646728516, + "learning_rate": 3.8634294385432475e-06, + "loss": 0.2616, + "step": 2196 + }, + { + "epoch": 0.6338659988830238, + "grad_norm": 2.0845305919647217, + "learning_rate": 3.854324734446131e-06, + "loss": 0.2556, + "step": 2199 + }, + { + "epoch": 0.6341542508152125, + "eval_loss": 0.2416454404592514, + "eval_mse": 0.2416454482518602, + "eval_runtime": 6.5442, + "eval_samples_per_second": 152.808, + "eval_steps_per_second": 19.101, + "step": 2200 + }, + { + "epoch": 0.63473075467959, + "grad_norm": 3.8279173374176025, + "learning_rate": 3.845220030349014e-06, + "loss": 0.2245, + "step": 2202 + }, + { + "epoch": 0.6355955104761561, + "grad_norm": 3.564417839050293, + "learning_rate": 3.836115326251897e-06, + "loss": 0.2233, + "step": 2205 + }, + { + "epoch": 0.6364602662727223, + "grad_norm": 2.4322614669799805, + "learning_rate": 3.827010622154781e-06, + "loss": 0.2227, + "step": 2208 + }, + { + "epoch": 0.6373250220692885, + "grad_norm": 1.6439640522003174, + "learning_rate": 3.8179059180576634e-06, + "loss": 0.2135, + "step": 2211 + }, + { + "epoch": 0.6381897778658547, + "grad_norm": 2.605598211288452, + "learning_rate": 3.8088012139605467e-06, + "loss": 0.2203, + "step": 2214 + }, + { + "epoch": 0.6390545336624209, + "grad_norm": 1.939488410949707, + "learning_rate": 3.7996965098634296e-06, + "loss": 0.2377, + "step": 2217 + }, + { + "epoch": 0.6399192894589871, + "grad_norm": 1.851778507232666, + "learning_rate": 3.790591805766313e-06, + "loss": 0.2199, + "step": 2220 + }, + { + "epoch": 0.6407840452555533, + "grad_norm": 2.077923059463501, + "learning_rate": 3.781487101669196e-06, + "loss": 0.223, + "step": 2223 + }, + { + "epoch": 0.6416488010521195, + "grad_norm": 5.192010402679443, + "learning_rate": 3.772382397572079e-06, + "loss": 0.2412, + "step": 2226 + }, + { + "epoch": 0.6425135568486857, + "grad_norm": 1.6057194471359253, + "learning_rate": 3.7632776934749626e-06, + "loss": 0.2354, + "step": 2229 + }, + { + "epoch": 0.6433783126452519, + "grad_norm": 3.0130653381347656, + "learning_rate": 3.7541729893778455e-06, + "loss": 0.2307, + "step": 2232 + }, + { + "epoch": 0.6442430684418181, + "grad_norm": 2.835080623626709, + "learning_rate": 3.7450682852807287e-06, + "loss": 0.1929, + "step": 2235 + }, + { + "epoch": 0.6451078242383843, + "grad_norm": 1.800140619277954, + "learning_rate": 3.735963581183612e-06, + "loss": 0.2158, + "step": 2238 + }, + { + "epoch": 0.6459725800349505, + "grad_norm": 1.8859021663665771, + "learning_rate": 3.726858877086495e-06, + "loss": 0.2469, + "step": 2241 + }, + { + "epoch": 0.6468373358315167, + "grad_norm": 1.8524531126022339, + "learning_rate": 3.717754172989378e-06, + "loss": 0.2257, + "step": 2244 + }, + { + "epoch": 0.6477020916280829, + "grad_norm": 2.759021520614624, + "learning_rate": 3.7086494688922614e-06, + "loss": 0.2187, + "step": 2247 + }, + { + "epoch": 0.6485668474246491, + "grad_norm": 1.885272741317749, + "learning_rate": 3.699544764795144e-06, + "loss": 0.2416, + "step": 2250 + }, + { + "epoch": 0.6494316032212153, + "grad_norm": 2.244595527648926, + "learning_rate": 3.690440060698028e-06, + "loss": 0.2002, + "step": 2253 + }, + { + "epoch": 0.6502963590177815, + "grad_norm": 2.533815622329712, + "learning_rate": 3.6813353566009107e-06, + "loss": 0.2065, + "step": 2256 + }, + { + "epoch": 0.6511611148143477, + "grad_norm": 1.883478045463562, + "learning_rate": 3.6722306525037936e-06, + "loss": 0.2014, + "step": 2259 + }, + { + "epoch": 0.652025870610914, + "grad_norm": 4.37358283996582, + "learning_rate": 3.6631259484066773e-06, + "loss": 0.2347, + "step": 2262 + }, + { + "epoch": 0.6528906264074802, + "grad_norm": 1.9453434944152832, + "learning_rate": 3.65402124430956e-06, + "loss": 0.2414, + "step": 2265 + }, + { + "epoch": 0.6537553822040464, + "grad_norm": 1.5430552959442139, + "learning_rate": 3.644916540212443e-06, + "loss": 0.2012, + "step": 2268 + }, + { + "epoch": 0.6546201380006126, + "grad_norm": 2.023857593536377, + "learning_rate": 3.6358118361153266e-06, + "loss": 0.216, + "step": 2271 + }, + { + "epoch": 0.6554848937971788, + "grad_norm": 2.0380475521087646, + "learning_rate": 3.6267071320182095e-06, + "loss": 0.1851, + "step": 2274 + }, + { + "epoch": 0.656349649593745, + "grad_norm": 1.9703435897827148, + "learning_rate": 3.617602427921093e-06, + "loss": 0.2152, + "step": 2277 + }, + { + "epoch": 0.6572144053903112, + "grad_norm": 2.0153567790985107, + "learning_rate": 3.608497723823976e-06, + "loss": 0.2248, + "step": 2280 + }, + { + "epoch": 0.6580791611868774, + "grad_norm": 1.6611104011535645, + "learning_rate": 3.599393019726859e-06, + "loss": 0.2416, + "step": 2283 + }, + { + "epoch": 0.6589439169834436, + "grad_norm": 1.8866307735443115, + "learning_rate": 3.5902883156297426e-06, + "loss": 0.2319, + "step": 2286 + }, + { + "epoch": 0.6598086727800098, + "grad_norm": 2.9478352069854736, + "learning_rate": 3.5811836115326254e-06, + "loss": 0.2614, + "step": 2289 + }, + { + "epoch": 0.660673428576576, + "grad_norm": 1.578539252281189, + "learning_rate": 3.572078907435509e-06, + "loss": 0.2097, + "step": 2292 + }, + { + "epoch": 0.6615381843731422, + "grad_norm": 3.096663236618042, + "learning_rate": 3.562974203338392e-06, + "loss": 0.1977, + "step": 2295 + }, + { + "epoch": 0.6624029401697084, + "grad_norm": 1.81285560131073, + "learning_rate": 3.5538694992412748e-06, + "loss": 0.2273, + "step": 2298 + }, + { + "epoch": 0.6629794440340858, + "eval_loss": 0.2196592092514038, + "eval_mse": 0.2196592075770641, + "eval_runtime": 6.5661, + "eval_samples_per_second": 152.298, + "eval_steps_per_second": 19.037, + "step": 2300 + }, + { + "epoch": 0.6632676959662746, + "grad_norm": 1.6824992895126343, + "learning_rate": 3.5447647951441585e-06, + "loss": 0.1899, + "step": 2301 + }, + { + "epoch": 0.6641324517628407, + "grad_norm": 2.2244341373443604, + "learning_rate": 3.5356600910470413e-06, + "loss": 0.2113, + "step": 2304 + }, + { + "epoch": 0.6649972075594069, + "grad_norm": 2.0596795082092285, + "learning_rate": 3.526555386949924e-06, + "loss": 0.2264, + "step": 2307 + }, + { + "epoch": 0.6658619633559731, + "grad_norm": 2.2340962886810303, + "learning_rate": 3.517450682852808e-06, + "loss": 0.2168, + "step": 2310 + }, + { + "epoch": 0.6667267191525393, + "grad_norm": 1.9558560848236084, + "learning_rate": 3.5083459787556907e-06, + "loss": 0.2088, + "step": 2313 + }, + { + "epoch": 0.6675914749491055, + "grad_norm": 1.9379972219467163, + "learning_rate": 3.499241274658574e-06, + "loss": 0.2284, + "step": 2316 + }, + { + "epoch": 0.6684562307456717, + "grad_norm": 2.3833818435668945, + "learning_rate": 3.490136570561457e-06, + "loss": 0.2526, + "step": 2319 + }, + { + "epoch": 0.6693209865422379, + "grad_norm": 2.1912760734558105, + "learning_rate": 3.48103186646434e-06, + "loss": 0.2579, + "step": 2322 + }, + { + "epoch": 0.6701857423388041, + "grad_norm": 1.4502041339874268, + "learning_rate": 3.4719271623672233e-06, + "loss": 0.2118, + "step": 2325 + }, + { + "epoch": 0.6710504981353703, + "grad_norm": 1.6936084032058716, + "learning_rate": 3.4628224582701066e-06, + "loss": 0.2417, + "step": 2328 + }, + { + "epoch": 0.6719152539319365, + "grad_norm": 2.5630977153778076, + "learning_rate": 3.45371775417299e-06, + "loss": 0.255, + "step": 2331 + }, + { + "epoch": 0.6727800097285027, + "grad_norm": 1.7571030855178833, + "learning_rate": 3.4446130500758727e-06, + "loss": 0.2122, + "step": 2334 + }, + { + "epoch": 0.6736447655250689, + "grad_norm": 2.4220032691955566, + "learning_rate": 3.435508345978756e-06, + "loss": 0.2383, + "step": 2337 + }, + { + "epoch": 0.6745095213216351, + "grad_norm": 2.3608193397521973, + "learning_rate": 3.4264036418816392e-06, + "loss": 0.241, + "step": 2340 + }, + { + "epoch": 0.6753742771182013, + "grad_norm": 2.0964229106903076, + "learning_rate": 3.417298937784522e-06, + "loss": 0.2178, + "step": 2343 + }, + { + "epoch": 0.6762390329147675, + "grad_norm": 3.038722038269043, + "learning_rate": 3.4081942336874053e-06, + "loss": 0.2276, + "step": 2346 + }, + { + "epoch": 0.6771037887113337, + "grad_norm": 3.0551295280456543, + "learning_rate": 3.3990895295902886e-06, + "loss": 0.1959, + "step": 2349 + }, + { + "epoch": 0.6779685445078999, + "grad_norm": 1.4905811548233032, + "learning_rate": 3.3899848254931714e-06, + "loss": 0.1863, + "step": 2352 + }, + { + "epoch": 0.6788333003044661, + "grad_norm": 3.0637025833129883, + "learning_rate": 3.380880121396055e-06, + "loss": 0.2341, + "step": 2355 + }, + { + "epoch": 0.6796980561010323, + "grad_norm": 2.9851162433624268, + "learning_rate": 3.371775417298938e-06, + "loss": 0.2586, + "step": 2358 + }, + { + "epoch": 0.6805628118975985, + "grad_norm": 2.0650391578674316, + "learning_rate": 3.3626707132018212e-06, + "loss": 0.2181, + "step": 2361 + }, + { + "epoch": 0.6814275676941647, + "grad_norm": 3.1792430877685547, + "learning_rate": 3.3535660091047045e-06, + "loss": 0.234, + "step": 2364 + }, + { + "epoch": 0.6822923234907309, + "grad_norm": 2.171764612197876, + "learning_rate": 3.3444613050075873e-06, + "loss": 0.2232, + "step": 2367 + }, + { + "epoch": 0.6831570792872971, + "grad_norm": 1.8832968473434448, + "learning_rate": 3.3353566009104706e-06, + "loss": 0.2386, + "step": 2370 + }, + { + "epoch": 0.6840218350838633, + "grad_norm": 2.217407703399658, + "learning_rate": 3.326251896813354e-06, + "loss": 0.2575, + "step": 2373 + }, + { + "epoch": 0.6848865908804295, + "grad_norm": 1.3866760730743408, + "learning_rate": 3.3171471927162367e-06, + "loss": 0.2321, + "step": 2376 + }, + { + "epoch": 0.6857513466769957, + "grad_norm": 2.836749315261841, + "learning_rate": 3.3080424886191204e-06, + "loss": 0.2082, + "step": 2379 + }, + { + "epoch": 0.6866161024735619, + "grad_norm": 4.798961162567139, + "learning_rate": 3.2989377845220033e-06, + "loss": 0.2397, + "step": 2382 + }, + { + "epoch": 0.6874808582701281, + "grad_norm": 1.8099883794784546, + "learning_rate": 3.289833080424886e-06, + "loss": 0.2468, + "step": 2385 + }, + { + "epoch": 0.6883456140666943, + "grad_norm": 3.0821380615234375, + "learning_rate": 3.2807283763277698e-06, + "loss": 0.2013, + "step": 2388 + }, + { + "epoch": 0.6892103698632605, + "grad_norm": 1.6952015161514282, + "learning_rate": 3.2716236722306526e-06, + "loss": 0.2156, + "step": 2391 + }, + { + "epoch": 0.6900751256598266, + "grad_norm": 2.4413681030273438, + "learning_rate": 3.2625189681335363e-06, + "loss": 0.2307, + "step": 2394 + }, + { + "epoch": 0.6909398814563928, + "grad_norm": 1.9589879512786865, + "learning_rate": 3.253414264036419e-06, + "loss": 0.2948, + "step": 2397 + }, + { + "epoch": 0.691804637252959, + "grad_norm": 2.4465548992156982, + "learning_rate": 3.244309559939302e-06, + "loss": 0.223, + "step": 2400 + }, + { + "epoch": 0.691804637252959, + "eval_loss": 0.22531487047672272, + "eval_mse": 0.2253148703626357, + "eval_runtime": 6.6334, + "eval_samples_per_second": 150.751, + "eval_steps_per_second": 18.844, + "step": 2400 + }, + { + "epoch": 0.6926693930495252, + "grad_norm": 2.3037302494049072, + "learning_rate": 3.2352048558421857e-06, + "loss": 0.2199, + "step": 2403 + }, + { + "epoch": 0.6935341488460914, + "grad_norm": 1.6721099615097046, + "learning_rate": 3.2261001517450685e-06, + "loss": 0.2286, + "step": 2406 + }, + { + "epoch": 0.6943989046426576, + "grad_norm": 3.3806381225585938, + "learning_rate": 3.2169954476479514e-06, + "loss": 0.1988, + "step": 2409 + }, + { + "epoch": 0.6952636604392238, + "grad_norm": 2.1515021324157715, + "learning_rate": 3.207890743550835e-06, + "loss": 0.2412, + "step": 2412 + }, + { + "epoch": 0.69612841623579, + "grad_norm": 3.980482816696167, + "learning_rate": 3.198786039453718e-06, + "loss": 0.2088, + "step": 2415 + }, + { + "epoch": 0.6969931720323563, + "grad_norm": 2.2418131828308105, + "learning_rate": 3.1896813353566016e-06, + "loss": 0.2107, + "step": 2418 + }, + { + "epoch": 0.6978579278289225, + "grad_norm": 1.819807767868042, + "learning_rate": 3.1805766312594844e-06, + "loss": 0.2435, + "step": 2421 + }, + { + "epoch": 0.6987226836254887, + "grad_norm": 3.8227691650390625, + "learning_rate": 3.1714719271623673e-06, + "loss": 0.2278, + "step": 2424 + }, + { + "epoch": 0.6995874394220549, + "grad_norm": 2.207240104675293, + "learning_rate": 3.162367223065251e-06, + "loss": 0.2362, + "step": 2427 + }, + { + "epoch": 0.7004521952186211, + "grad_norm": 1.796724796295166, + "learning_rate": 3.153262518968134e-06, + "loss": 0.2383, + "step": 2430 + }, + { + "epoch": 0.7013169510151873, + "grad_norm": 2.7628397941589355, + "learning_rate": 3.1441578148710167e-06, + "loss": 0.2326, + "step": 2433 + }, + { + "epoch": 0.7021817068117535, + "grad_norm": 1.3642479181289673, + "learning_rate": 3.1350531107739003e-06, + "loss": 0.2059, + "step": 2436 + }, + { + "epoch": 0.7030464626083197, + "grad_norm": 1.5554901361465454, + "learning_rate": 3.125948406676783e-06, + "loss": 0.2162, + "step": 2439 + }, + { + "epoch": 0.7039112184048859, + "grad_norm": 2.5311179161071777, + "learning_rate": 3.1168437025796665e-06, + "loss": 0.237, + "step": 2442 + }, + { + "epoch": 0.7047759742014521, + "grad_norm": 1.805991768836975, + "learning_rate": 3.1077389984825497e-06, + "loss": 0.2426, + "step": 2445 + }, + { + "epoch": 0.7056407299980183, + "grad_norm": 2.029891014099121, + "learning_rate": 3.0986342943854326e-06, + "loss": 0.2139, + "step": 2448 + }, + { + "epoch": 0.7065054857945845, + "grad_norm": 2.5495572090148926, + "learning_rate": 3.089529590288316e-06, + "loss": 0.2279, + "step": 2451 + }, + { + "epoch": 0.7073702415911507, + "grad_norm": 1.5610768795013428, + "learning_rate": 3.080424886191199e-06, + "loss": 0.2175, + "step": 2454 + }, + { + "epoch": 0.7082349973877169, + "grad_norm": 3.5896153450012207, + "learning_rate": 3.0713201820940824e-06, + "loss": 0.2459, + "step": 2457 + }, + { + "epoch": 0.7090997531842831, + "grad_norm": 2.1120688915252686, + "learning_rate": 3.062215477996965e-06, + "loss": 0.2565, + "step": 2460 + }, + { + "epoch": 0.7099645089808493, + "grad_norm": 2.7099833488464355, + "learning_rate": 3.0531107738998485e-06, + "loss": 0.2088, + "step": 2463 + }, + { + "epoch": 0.7108292647774155, + "grad_norm": 1.7153905630111694, + "learning_rate": 3.0440060698027317e-06, + "loss": 0.2114, + "step": 2466 + }, + { + "epoch": 0.7116940205739817, + "grad_norm": 1.9465105533599854, + "learning_rate": 3.0349013657056146e-06, + "loss": 0.1932, + "step": 2469 + }, + { + "epoch": 0.7125587763705479, + "grad_norm": 1.6483453512191772, + "learning_rate": 3.025796661608498e-06, + "loss": 0.232, + "step": 2472 + }, + { + "epoch": 0.7134235321671141, + "grad_norm": 2.3854711055755615, + "learning_rate": 3.016691957511381e-06, + "loss": 0.2397, + "step": 2475 + }, + { + "epoch": 0.7142882879636803, + "grad_norm": 1.565765380859375, + "learning_rate": 3.0075872534142644e-06, + "loss": 0.2055, + "step": 2478 + }, + { + "epoch": 0.7151530437602465, + "grad_norm": 1.5985909700393677, + "learning_rate": 2.9984825493171476e-06, + "loss": 0.2077, + "step": 2481 + }, + { + "epoch": 0.7160177995568127, + "grad_norm": 2.9907102584838867, + "learning_rate": 2.9893778452200305e-06, + "loss": 0.2125, + "step": 2484 + }, + { + "epoch": 0.7168825553533789, + "grad_norm": 3.0764994621276855, + "learning_rate": 2.9802731411229137e-06, + "loss": 0.2342, + "step": 2487 + }, + { + "epoch": 0.717747311149945, + "grad_norm": 2.954237461090088, + "learning_rate": 2.971168437025797e-06, + "loss": 0.232, + "step": 2490 + }, + { + "epoch": 0.7186120669465113, + "grad_norm": 1.5421547889709473, + "learning_rate": 2.96206373292868e-06, + "loss": 0.2019, + "step": 2493 + }, + { + "epoch": 0.7194768227430774, + "grad_norm": 5.054042816162109, + "learning_rate": 2.9529590288315635e-06, + "loss": 0.2359, + "step": 2496 + }, + { + "epoch": 0.7203415785396436, + "grad_norm": 1.5477067232131958, + "learning_rate": 2.9438543247344464e-06, + "loss": 0.2028, + "step": 2499 + }, + { + "epoch": 0.7206298304718324, + "eval_loss": 0.22387926280498505, + "eval_mse": 0.22387926151184365, + "eval_runtime": 6.5037, + "eval_samples_per_second": 153.759, + "eval_steps_per_second": 19.22, + "step": 2500 + }, + { + "epoch": 0.7212063343362098, + "grad_norm": 1.912636160850525, + "learning_rate": 2.9347496206373292e-06, + "loss": 0.2167, + "step": 2502 + }, + { + "epoch": 0.722071090132776, + "grad_norm": 1.644394040107727, + "learning_rate": 2.925644916540213e-06, + "loss": 0.2068, + "step": 2505 + }, + { + "epoch": 0.7229358459293422, + "grad_norm": 3.4328315258026123, + "learning_rate": 2.9165402124430958e-06, + "loss": 0.2417, + "step": 2508 + }, + { + "epoch": 0.7238006017259084, + "grad_norm": 1.918043613433838, + "learning_rate": 2.9074355083459786e-06, + "loss": 0.2308, + "step": 2511 + }, + { + "epoch": 0.7246653575224746, + "grad_norm": 1.94221031665802, + "learning_rate": 2.8983308042488623e-06, + "loss": 0.2957, + "step": 2514 + }, + { + "epoch": 0.7255301133190408, + "grad_norm": 2.877037525177002, + "learning_rate": 2.889226100151745e-06, + "loss": 0.2133, + "step": 2517 + }, + { + "epoch": 0.726394869115607, + "grad_norm": 2.2768120765686035, + "learning_rate": 2.880121396054629e-06, + "loss": 0.2251, + "step": 2520 + }, + { + "epoch": 0.7272596249121732, + "grad_norm": 2.9239742755889893, + "learning_rate": 2.8710166919575117e-06, + "loss": 0.215, + "step": 2523 + }, + { + "epoch": 0.7281243807087394, + "grad_norm": 1.5520339012145996, + "learning_rate": 2.8619119878603945e-06, + "loss": 0.2113, + "step": 2526 + }, + { + "epoch": 0.7289891365053056, + "grad_norm": 3.458822011947632, + "learning_rate": 2.852807283763278e-06, + "loss": 0.2465, + "step": 2529 + }, + { + "epoch": 0.7298538923018718, + "grad_norm": 1.606724500656128, + "learning_rate": 2.843702579666161e-06, + "loss": 0.188, + "step": 2532 + }, + { + "epoch": 0.730718648098438, + "grad_norm": 3.552236318588257, + "learning_rate": 2.834597875569044e-06, + "loss": 0.2178, + "step": 2535 + }, + { + "epoch": 0.7315834038950042, + "grad_norm": 2.940363883972168, + "learning_rate": 2.8254931714719276e-06, + "loss": 0.2149, + "step": 2538 + }, + { + "epoch": 0.7324481596915704, + "grad_norm": 1.3787034749984741, + "learning_rate": 2.8163884673748104e-06, + "loss": 0.2155, + "step": 2541 + }, + { + "epoch": 0.7333129154881366, + "grad_norm": 1.4637516736984253, + "learning_rate": 2.807283763277694e-06, + "loss": 0.1944, + "step": 2544 + }, + { + "epoch": 0.7341776712847028, + "grad_norm": 1.5903189182281494, + "learning_rate": 2.798179059180577e-06, + "loss": 0.214, + "step": 2547 + }, + { + "epoch": 0.735042427081269, + "grad_norm": 1.999664306640625, + "learning_rate": 2.78907435508346e-06, + "loss": 0.2277, + "step": 2550 + }, + { + "epoch": 0.7359071828778352, + "grad_norm": 2.250450849533081, + "learning_rate": 2.7799696509863435e-06, + "loss": 0.1958, + "step": 2553 + }, + { + "epoch": 0.7367719386744014, + "grad_norm": 1.8212629556655884, + "learning_rate": 2.7708649468892263e-06, + "loss": 0.2155, + "step": 2556 + }, + { + "epoch": 0.7376366944709676, + "grad_norm": 1.4670761823654175, + "learning_rate": 2.7617602427921096e-06, + "loss": 0.2251, + "step": 2559 + }, + { + "epoch": 0.7385014502675338, + "grad_norm": 2.9052860736846924, + "learning_rate": 2.752655538694993e-06, + "loss": 0.2279, + "step": 2562 + }, + { + "epoch": 0.7393662060641, + "grad_norm": 1.647455096244812, + "learning_rate": 2.7435508345978757e-06, + "loss": 0.2011, + "step": 2565 + }, + { + "epoch": 0.7402309618606662, + "grad_norm": 2.5667457580566406, + "learning_rate": 2.734446130500759e-06, + "loss": 0.2113, + "step": 2568 + }, + { + "epoch": 0.7410957176572324, + "grad_norm": 2.021571159362793, + "learning_rate": 2.7253414264036422e-06, + "loss": 0.2085, + "step": 2571 + }, + { + "epoch": 0.7419604734537987, + "grad_norm": 3.436924457550049, + "learning_rate": 2.716236722306525e-06, + "loss": 0.2658, + "step": 2574 + }, + { + "epoch": 0.7428252292503649, + "grad_norm": 1.9480434656143188, + "learning_rate": 2.7071320182094083e-06, + "loss": 0.2595, + "step": 2577 + }, + { + "epoch": 0.7436899850469311, + "grad_norm": 1.8556321859359741, + "learning_rate": 2.6980273141122916e-06, + "loss": 0.2308, + "step": 2580 + }, + { + "epoch": 0.7445547408434973, + "grad_norm": 3.174111843109131, + "learning_rate": 2.688922610015175e-06, + "loss": 0.2172, + "step": 2583 + }, + { + "epoch": 0.7454194966400635, + "grad_norm": 1.5629518032073975, + "learning_rate": 2.6798179059180577e-06, + "loss": 0.2417, + "step": 2586 + }, + { + "epoch": 0.7462842524366297, + "grad_norm": 1.8133536577224731, + "learning_rate": 2.670713201820941e-06, + "loss": 0.2188, + "step": 2589 + }, + { + "epoch": 0.7471490082331959, + "grad_norm": 1.5448634624481201, + "learning_rate": 2.6616084977238242e-06, + "loss": 0.2201, + "step": 2592 + }, + { + "epoch": 0.748013764029762, + "grad_norm": 2.376194953918457, + "learning_rate": 2.652503793626707e-06, + "loss": 0.2146, + "step": 2595 + }, + { + "epoch": 0.7488785198263282, + "grad_norm": 1.8988285064697266, + "learning_rate": 2.6433990895295904e-06, + "loss": 0.2322, + "step": 2598 + }, + { + "epoch": 0.7494550236907057, + "eval_loss": 0.21797478199005127, + "eval_mse": 0.21797478066571058, + "eval_runtime": 6.533, + "eval_samples_per_second": 153.07, + "eval_steps_per_second": 19.134, + "step": 2600 + }, + { + "epoch": 0.7497432756228944, + "grad_norm": 3.16768217086792, + "learning_rate": 2.6342943854324736e-06, + "loss": 0.2533, + "step": 2601 + }, + { + "epoch": 0.7506080314194606, + "grad_norm": 1.7228988409042358, + "learning_rate": 2.625189681335357e-06, + "loss": 0.2406, + "step": 2604 + }, + { + "epoch": 0.7514727872160268, + "grad_norm": 2.9629013538360596, + "learning_rate": 2.61608497723824e-06, + "loss": 0.2131, + "step": 2607 + }, + { + "epoch": 0.752337543012593, + "grad_norm": 3.4181559085845947, + "learning_rate": 2.606980273141123e-06, + "loss": 0.2126, + "step": 2610 + }, + { + "epoch": 0.7532022988091592, + "grad_norm": 2.142685890197754, + "learning_rate": 2.5978755690440063e-06, + "loss": 0.2345, + "step": 2613 + }, + { + "epoch": 0.7540670546057254, + "grad_norm": 3.622145175933838, + "learning_rate": 2.5887708649468895e-06, + "loss": 0.2187, + "step": 2616 + }, + { + "epoch": 0.7549318104022916, + "grad_norm": 1.5243175029754639, + "learning_rate": 2.5796661608497724e-06, + "loss": 0.1996, + "step": 2619 + }, + { + "epoch": 0.7557965661988578, + "grad_norm": 2.7075355052948, + "learning_rate": 2.570561456752656e-06, + "loss": 0.2755, + "step": 2622 + }, + { + "epoch": 0.756661321995424, + "grad_norm": 2.6778082847595215, + "learning_rate": 2.561456752655539e-06, + "loss": 0.1959, + "step": 2625 + }, + { + "epoch": 0.7575260777919902, + "grad_norm": 3.8043668270111084, + "learning_rate": 2.5523520485584217e-06, + "loss": 0.2054, + "step": 2628 + }, + { + "epoch": 0.7583908335885564, + "grad_norm": 3.628281354904175, + "learning_rate": 2.5432473444613054e-06, + "loss": 0.2095, + "step": 2631 + }, + { + "epoch": 0.7592555893851226, + "grad_norm": 3.2685089111328125, + "learning_rate": 2.5341426403641883e-06, + "loss": 0.2167, + "step": 2634 + }, + { + "epoch": 0.7601203451816888, + "grad_norm": 1.6048041582107544, + "learning_rate": 2.525037936267071e-06, + "loss": 0.2416, + "step": 2637 + }, + { + "epoch": 0.760985100978255, + "grad_norm": 1.5175867080688477, + "learning_rate": 2.515933232169955e-06, + "loss": 0.2326, + "step": 2640 + }, + { + "epoch": 0.7618498567748212, + "grad_norm": 1.9787158966064453, + "learning_rate": 2.5068285280728376e-06, + "loss": 0.2266, + "step": 2643 + }, + { + "epoch": 0.7627146125713874, + "grad_norm": 4.297834873199463, + "learning_rate": 2.497723823975721e-06, + "loss": 0.2414, + "step": 2646 + }, + { + "epoch": 0.7635793683679536, + "grad_norm": 1.822587251663208, + "learning_rate": 2.488619119878604e-06, + "loss": 0.2386, + "step": 2649 + }, + { + "epoch": 0.7644441241645198, + "grad_norm": 4.966648101806641, + "learning_rate": 2.4795144157814874e-06, + "loss": 0.2268, + "step": 2652 + }, + { + "epoch": 0.765308879961086, + "grad_norm": 2.8137335777282715, + "learning_rate": 2.4704097116843703e-06, + "loss": 0.2349, + "step": 2655 + }, + { + "epoch": 0.7661736357576522, + "grad_norm": 2.5401577949523926, + "learning_rate": 2.4613050075872536e-06, + "loss": 0.1941, + "step": 2658 + }, + { + "epoch": 0.7670383915542184, + "grad_norm": 1.550758957862854, + "learning_rate": 2.452200303490137e-06, + "loss": 0.2151, + "step": 2661 + }, + { + "epoch": 0.7679031473507846, + "grad_norm": 6.889467239379883, + "learning_rate": 2.44309559939302e-06, + "loss": 0.2287, + "step": 2664 + }, + { + "epoch": 0.7687679031473508, + "grad_norm": 1.4833314418792725, + "learning_rate": 2.4339908952959034e-06, + "loss": 0.2017, + "step": 2667 + }, + { + "epoch": 0.769632658943917, + "grad_norm": 1.607751727104187, + "learning_rate": 2.424886191198786e-06, + "loss": 0.2309, + "step": 2670 + }, + { + "epoch": 0.7704974147404832, + "grad_norm": 1.7369478940963745, + "learning_rate": 2.4157814871016695e-06, + "loss": 0.217, + "step": 2673 + }, + { + "epoch": 0.7713621705370494, + "grad_norm": 1.7309290170669556, + "learning_rate": 2.4066767830045527e-06, + "loss": 0.2151, + "step": 2676 + }, + { + "epoch": 0.7722269263336156, + "grad_norm": 2.047727108001709, + "learning_rate": 2.397572078907436e-06, + "loss": 0.2078, + "step": 2679 + }, + { + "epoch": 0.7730916821301818, + "grad_norm": 2.2800142765045166, + "learning_rate": 2.388467374810319e-06, + "loss": 0.2372, + "step": 2682 + }, + { + "epoch": 0.773956437926748, + "grad_norm": 3.920849323272705, + "learning_rate": 2.379362670713202e-06, + "loss": 0.2304, + "step": 2685 + }, + { + "epoch": 0.7748211937233141, + "grad_norm": 3.4216678142547607, + "learning_rate": 2.3702579666160854e-06, + "loss": 0.2382, + "step": 2688 + }, + { + "epoch": 0.7756859495198803, + "grad_norm": 1.5471861362457275, + "learning_rate": 2.3611532625189686e-06, + "loss": 0.2243, + "step": 2691 + }, + { + "epoch": 0.7765507053164465, + "grad_norm": 1.7489866018295288, + "learning_rate": 2.3520485584218515e-06, + "loss": 0.2239, + "step": 2694 + }, + { + "epoch": 0.7774154611130127, + "grad_norm": 4.3836822509765625, + "learning_rate": 2.3429438543247347e-06, + "loss": 0.205, + "step": 2697 + }, + { + "epoch": 0.7782802169095789, + "grad_norm": 1.8707342147827148, + "learning_rate": 2.333839150227618e-06, + "loss": 0.1933, + "step": 2700 + }, + { + "epoch": 0.7782802169095789, + "eval_loss": 0.2158193737268448, + "eval_mse": 0.21581936262454837, + "eval_runtime": 6.5614, + "eval_samples_per_second": 152.407, + "eval_steps_per_second": 19.051, + "step": 2700 + }, + { + "epoch": 0.7791449727061451, + "grad_norm": 1.6554896831512451, + "learning_rate": 2.324734446130501e-06, + "loss": 0.2178, + "step": 2703 + }, + { + "epoch": 0.7800097285027113, + "grad_norm": 3.131352424621582, + "learning_rate": 2.315629742033384e-06, + "loss": 0.2122, + "step": 2706 + }, + { + "epoch": 0.7808744842992775, + "grad_norm": 2.62422776222229, + "learning_rate": 2.3065250379362674e-06, + "loss": 0.2085, + "step": 2709 + }, + { + "epoch": 0.7817392400958437, + "grad_norm": 2.1258456707000732, + "learning_rate": 2.2974203338391502e-06, + "loss": 0.2203, + "step": 2712 + }, + { + "epoch": 0.7826039958924099, + "grad_norm": 3.144688606262207, + "learning_rate": 2.2883156297420335e-06, + "loss": 0.2428, + "step": 2715 + }, + { + "epoch": 0.7834687516889761, + "grad_norm": 4.5740180015563965, + "learning_rate": 2.2792109256449168e-06, + "loss": 0.2196, + "step": 2718 + }, + { + "epoch": 0.7843335074855423, + "grad_norm": 1.7256083488464355, + "learning_rate": 2.2701062215477996e-06, + "loss": 0.2316, + "step": 2721 + }, + { + "epoch": 0.7851982632821085, + "grad_norm": 2.0723230838775635, + "learning_rate": 2.261001517450683e-06, + "loss": 0.1958, + "step": 2724 + }, + { + "epoch": 0.7860630190786747, + "grad_norm": 1.6268962621688843, + "learning_rate": 2.251896813353566e-06, + "loss": 0.2329, + "step": 2727 + }, + { + "epoch": 0.786927774875241, + "grad_norm": 4.054417610168457, + "learning_rate": 2.2427921092564494e-06, + "loss": 0.2091, + "step": 2730 + }, + { + "epoch": 0.7877925306718072, + "grad_norm": 1.7409260272979736, + "learning_rate": 2.2336874051593322e-06, + "loss": 0.2144, + "step": 2733 + }, + { + "epoch": 0.7886572864683734, + "grad_norm": 2.77607798576355, + "learning_rate": 2.2245827010622155e-06, + "loss": 0.2199, + "step": 2736 + }, + { + "epoch": 0.7895220422649396, + "grad_norm": 2.215284585952759, + "learning_rate": 2.2154779969650988e-06, + "loss": 0.2388, + "step": 2739 + }, + { + "epoch": 0.7903867980615058, + "grad_norm": 1.7318382263183594, + "learning_rate": 2.206373292867982e-06, + "loss": 0.2115, + "step": 2742 + }, + { + "epoch": 0.791251553858072, + "grad_norm": 1.5627691745758057, + "learning_rate": 2.197268588770865e-06, + "loss": 0.2054, + "step": 2745 + }, + { + "epoch": 0.7921163096546382, + "grad_norm": 1.810509443283081, + "learning_rate": 2.188163884673748e-06, + "loss": 0.2086, + "step": 2748 + }, + { + "epoch": 0.7929810654512044, + "grad_norm": 2.1531972885131836, + "learning_rate": 2.1790591805766314e-06, + "loss": 0.2252, + "step": 2751 + }, + { + "epoch": 0.7938458212477706, + "grad_norm": 2.0212440490722656, + "learning_rate": 2.1699544764795147e-06, + "loss": 0.2355, + "step": 2754 + }, + { + "epoch": 0.7947105770443368, + "grad_norm": 5.030855178833008, + "learning_rate": 2.1608497723823975e-06, + "loss": 0.2341, + "step": 2757 + }, + { + "epoch": 0.795575332840903, + "grad_norm": 2.213249921798706, + "learning_rate": 2.1517450682852808e-06, + "loss": 0.2133, + "step": 2760 + }, + { + "epoch": 0.7964400886374692, + "grad_norm": 1.8025689125061035, + "learning_rate": 2.142640364188164e-06, + "loss": 0.2251, + "step": 2763 + }, + { + "epoch": 0.7973048444340354, + "grad_norm": 4.14149284362793, + "learning_rate": 2.1335356600910473e-06, + "loss": 0.253, + "step": 2766 + }, + { + "epoch": 0.7981696002306016, + "grad_norm": 2.2051069736480713, + "learning_rate": 2.12443095599393e-06, + "loss": 0.2238, + "step": 2769 + }, + { + "epoch": 0.7990343560271678, + "grad_norm": 2.249032497406006, + "learning_rate": 2.1153262518968134e-06, + "loss": 0.2282, + "step": 2772 + }, + { + "epoch": 0.799899111823734, + "grad_norm": 1.5087867975234985, + "learning_rate": 2.1062215477996967e-06, + "loss": 0.1948, + "step": 2775 + }, + { + "epoch": 0.8007638676203002, + "grad_norm": 1.9934585094451904, + "learning_rate": 2.09711684370258e-06, + "loss": 0.2061, + "step": 2778 + }, + { + "epoch": 0.8016286234168664, + "grad_norm": 2.521526336669922, + "learning_rate": 2.0880121396054632e-06, + "loss": 0.2331, + "step": 2781 + }, + { + "epoch": 0.8024933792134326, + "grad_norm": 4.441010475158691, + "learning_rate": 2.078907435508346e-06, + "loss": 0.2337, + "step": 2784 + }, + { + "epoch": 0.8033581350099988, + "grad_norm": 1.9386543035507202, + "learning_rate": 2.0698027314112293e-06, + "loss": 0.2434, + "step": 2787 + }, + { + "epoch": 0.804222890806565, + "grad_norm": 1.6140722036361694, + "learning_rate": 2.0606980273141126e-06, + "loss": 0.223, + "step": 2790 + }, + { + "epoch": 0.8050876466031311, + "grad_norm": 3.248769998550415, + "learning_rate": 2.051593323216996e-06, + "loss": 0.2352, + "step": 2793 + }, + { + "epoch": 0.8059524023996973, + "grad_norm": 2.259561061859131, + "learning_rate": 2.0424886191198787e-06, + "loss": 0.2428, + "step": 2796 + }, + { + "epoch": 0.8068171581962635, + "grad_norm": 2.289113998413086, + "learning_rate": 2.033383915022762e-06, + "loss": 0.2085, + "step": 2799 + }, + { + "epoch": 0.8071054101284523, + "eval_loss": 0.22976131737232208, + "eval_mse": 0.22976131996285404, + "eval_runtime": 6.5104, + "eval_samples_per_second": 153.6, + "eval_steps_per_second": 19.2, + "step": 2800 + }, + { + "epoch": 0.8076819139928297, + "grad_norm": 1.4588847160339355, + "learning_rate": 2.0242792109256452e-06, + "loss": 0.2004, + "step": 2802 + }, + { + "epoch": 0.8085466697893959, + "grad_norm": 3.1680517196655273, + "learning_rate": 2.0151745068285285e-06, + "loss": 0.2038, + "step": 2805 + }, + { + "epoch": 0.8094114255859621, + "grad_norm": 2.708411693572998, + "learning_rate": 2.0060698027314113e-06, + "loss": 0.2289, + "step": 2808 + }, + { + "epoch": 0.8102761813825283, + "grad_norm": 2.2479403018951416, + "learning_rate": 1.9969650986342946e-06, + "loss": 0.2465, + "step": 2811 + }, + { + "epoch": 0.8111409371790945, + "grad_norm": 3.2582664489746094, + "learning_rate": 1.987860394537178e-06, + "loss": 0.2187, + "step": 2814 + }, + { + "epoch": 0.8120056929756607, + "grad_norm": 2.5267367362976074, + "learning_rate": 1.978755690440061e-06, + "loss": 0.1955, + "step": 2817 + }, + { + "epoch": 0.8128704487722269, + "grad_norm": 2.42645525932312, + "learning_rate": 1.969650986342944e-06, + "loss": 0.233, + "step": 2820 + }, + { + "epoch": 0.8137352045687931, + "grad_norm": 1.6183414459228516, + "learning_rate": 1.9605462822458273e-06, + "loss": 0.2187, + "step": 2823 + }, + { + "epoch": 0.8145999603653593, + "grad_norm": 2.7215240001678467, + "learning_rate": 1.9514415781487105e-06, + "loss": 0.2109, + "step": 2826 + }, + { + "epoch": 0.8154647161619255, + "grad_norm": 2.152639389038086, + "learning_rate": 1.9423368740515934e-06, + "loss": 0.2056, + "step": 2829 + }, + { + "epoch": 0.8163294719584917, + "grad_norm": 2.530045509338379, + "learning_rate": 1.9332321699544766e-06, + "loss": 0.2185, + "step": 2832 + }, + { + "epoch": 0.8171942277550579, + "grad_norm": 1.8189818859100342, + "learning_rate": 1.92412746585736e-06, + "loss": 0.2502, + "step": 2835 + }, + { + "epoch": 0.8180589835516241, + "grad_norm": 1.5473167896270752, + "learning_rate": 1.9150227617602427e-06, + "loss": 0.1923, + "step": 2838 + }, + { + "epoch": 0.8189237393481903, + "grad_norm": 2.99245285987854, + "learning_rate": 1.9059180576631262e-06, + "loss": 0.209, + "step": 2841 + }, + { + "epoch": 0.8197884951447565, + "grad_norm": 1.509892225265503, + "learning_rate": 1.8968133535660093e-06, + "loss": 0.2293, + "step": 2844 + }, + { + "epoch": 0.8206532509413227, + "grad_norm": 3.5667645931243896, + "learning_rate": 1.8877086494688923e-06, + "loss": 0.1935, + "step": 2847 + }, + { + "epoch": 0.8215180067378889, + "grad_norm": 3.1929867267608643, + "learning_rate": 1.8786039453717756e-06, + "loss": 0.201, + "step": 2850 + }, + { + "epoch": 0.8223827625344551, + "grad_norm": 1.5706427097320557, + "learning_rate": 1.8694992412746589e-06, + "loss": 0.2315, + "step": 2853 + }, + { + "epoch": 0.8232475183310213, + "grad_norm": 3.1730329990386963, + "learning_rate": 1.860394537177542e-06, + "loss": 0.2475, + "step": 2856 + }, + { + "epoch": 0.8241122741275875, + "grad_norm": 1.3369548320770264, + "learning_rate": 1.851289833080425e-06, + "loss": 0.2186, + "step": 2859 + }, + { + "epoch": 0.8249770299241537, + "grad_norm": 2.522751569747925, + "learning_rate": 1.8421851289833082e-06, + "loss": 0.2477, + "step": 2862 + }, + { + "epoch": 0.8258417857207199, + "grad_norm": 1.991076111793518, + "learning_rate": 1.8330804248861913e-06, + "loss": 0.2009, + "step": 2865 + }, + { + "epoch": 0.8267065415172861, + "grad_norm": 2.707282781600952, + "learning_rate": 1.8239757207890745e-06, + "loss": 0.2302, + "step": 2868 + }, + { + "epoch": 0.8275712973138523, + "grad_norm": 1.578192949295044, + "learning_rate": 1.8148710166919576e-06, + "loss": 0.2021, + "step": 2871 + }, + { + "epoch": 0.8284360531104185, + "grad_norm": 3.6003148555755615, + "learning_rate": 1.8057663125948407e-06, + "loss": 0.2409, + "step": 2874 + }, + { + "epoch": 0.8293008089069847, + "grad_norm": 3.1442506313323975, + "learning_rate": 1.796661608497724e-06, + "loss": 0.2129, + "step": 2877 + }, + { + "epoch": 0.8301655647035509, + "grad_norm": 1.509333610534668, + "learning_rate": 1.7875569044006072e-06, + "loss": 0.2202, + "step": 2880 + }, + { + "epoch": 0.831030320500117, + "grad_norm": 1.9379024505615234, + "learning_rate": 1.7784522003034905e-06, + "loss": 0.2238, + "step": 2883 + }, + { + "epoch": 0.8318950762966834, + "grad_norm": 1.3617918491363525, + "learning_rate": 1.7693474962063733e-06, + "loss": 0.1972, + "step": 2886 + }, + { + "epoch": 0.8327598320932496, + "grad_norm": 1.4775515794754028, + "learning_rate": 1.7602427921092566e-06, + "loss": 0.1985, + "step": 2889 + }, + { + "epoch": 0.8336245878898157, + "grad_norm": 1.4302202463150024, + "learning_rate": 1.7511380880121398e-06, + "loss": 0.2403, + "step": 2892 + }, + { + "epoch": 0.834489343686382, + "grad_norm": 2.128401041030884, + "learning_rate": 1.742033383915023e-06, + "loss": 0.2204, + "step": 2895 + }, + { + "epoch": 0.8353540994829481, + "grad_norm": 1.9766216278076172, + "learning_rate": 1.732928679817906e-06, + "loss": 0.2038, + "step": 2898 + }, + { + "epoch": 0.8359306033473256, + "eval_loss": 0.21660968661308289, + "eval_mse": 0.21660968138270925, + "eval_runtime": 6.5731, + "eval_samples_per_second": 152.135, + "eval_steps_per_second": 19.017, + "step": 2900 + }, + { + "epoch": 0.8362188552795143, + "grad_norm": 2.260625123977661, + "learning_rate": 1.7238239757207892e-06, + "loss": 0.2206, + "step": 2901 + }, + { + "epoch": 0.8370836110760805, + "grad_norm": 2.605224609375, + "learning_rate": 1.7147192716236725e-06, + "loss": 0.2186, + "step": 2904 + }, + { + "epoch": 0.8379483668726467, + "grad_norm": 2.7619729042053223, + "learning_rate": 1.7056145675265557e-06, + "loss": 0.2195, + "step": 2907 + }, + { + "epoch": 0.8388131226692129, + "grad_norm": 1.9395873546600342, + "learning_rate": 1.6965098634294386e-06, + "loss": 0.2119, + "step": 2910 + }, + { + "epoch": 0.8396778784657791, + "grad_norm": 1.727263331413269, + "learning_rate": 1.6874051593323218e-06, + "loss": 0.2125, + "step": 2913 + }, + { + "epoch": 0.8405426342623453, + "grad_norm": 2.149775981903076, + "learning_rate": 1.6783004552352051e-06, + "loss": 0.2644, + "step": 2916 + }, + { + "epoch": 0.8414073900589115, + "grad_norm": 2.6743104457855225, + "learning_rate": 1.6691957511380882e-06, + "loss": 0.2217, + "step": 2919 + }, + { + "epoch": 0.8422721458554777, + "grad_norm": 2.795736074447632, + "learning_rate": 1.6600910470409712e-06, + "loss": 0.2115, + "step": 2922 + }, + { + "epoch": 0.8431369016520439, + "grad_norm": 1.8719727993011475, + "learning_rate": 1.6509863429438545e-06, + "loss": 0.1768, + "step": 2925 + }, + { + "epoch": 0.8440016574486101, + "grad_norm": 3.4366025924682617, + "learning_rate": 1.6418816388467375e-06, + "loss": 0.2357, + "step": 2928 + }, + { + "epoch": 0.8448664132451763, + "grad_norm": 2.2458267211914062, + "learning_rate": 1.6327769347496208e-06, + "loss": 0.2147, + "step": 2931 + }, + { + "epoch": 0.8457311690417425, + "grad_norm": 1.958115577697754, + "learning_rate": 1.6236722306525039e-06, + "loss": 0.2339, + "step": 2934 + }, + { + "epoch": 0.8465959248383087, + "grad_norm": 1.6470586061477661, + "learning_rate": 1.614567526555387e-06, + "loss": 0.2259, + "step": 2937 + }, + { + "epoch": 0.8474606806348749, + "grad_norm": 1.2936792373657227, + "learning_rate": 1.6054628224582702e-06, + "loss": 0.1981, + "step": 2940 + }, + { + "epoch": 0.8483254364314411, + "grad_norm": 4.565530300140381, + "learning_rate": 1.5963581183611534e-06, + "loss": 0.1993, + "step": 2943 + }, + { + "epoch": 0.8491901922280073, + "grad_norm": 2.8682401180267334, + "learning_rate": 1.5872534142640367e-06, + "loss": 0.2093, + "step": 2946 + }, + { + "epoch": 0.8500549480245735, + "grad_norm": 1.7801469564437866, + "learning_rate": 1.5781487101669196e-06, + "loss": 0.2136, + "step": 2949 + }, + { + "epoch": 0.8509197038211397, + "grad_norm": 2.372549057006836, + "learning_rate": 1.5690440060698028e-06, + "loss": 0.1845, + "step": 2952 + }, + { + "epoch": 0.8517844596177059, + "grad_norm": 2.190469741821289, + "learning_rate": 1.559939301972686e-06, + "loss": 0.2137, + "step": 2955 + }, + { + "epoch": 0.8526492154142721, + "grad_norm": 1.6399952173233032, + "learning_rate": 1.5508345978755694e-06, + "loss": 0.2206, + "step": 2958 + }, + { + "epoch": 0.8535139712108383, + "grad_norm": 1.6555943489074707, + "learning_rate": 1.5417298937784522e-06, + "loss": 0.2403, + "step": 2961 + }, + { + "epoch": 0.8543787270074045, + "grad_norm": 2.6609280109405518, + "learning_rate": 1.5326251896813355e-06, + "loss": 0.2176, + "step": 2964 + }, + { + "epoch": 0.8552434828039707, + "grad_norm": 2.3398261070251465, + "learning_rate": 1.5235204855842187e-06, + "loss": 0.219, + "step": 2967 + }, + { + "epoch": 0.8561082386005369, + "grad_norm": 1.9740712642669678, + "learning_rate": 1.514415781487102e-06, + "loss": 0.2232, + "step": 2970 + }, + { + "epoch": 0.8569729943971031, + "grad_norm": 1.6300252676010132, + "learning_rate": 1.5053110773899848e-06, + "loss": 0.2094, + "step": 2973 + }, + { + "epoch": 0.8578377501936693, + "grad_norm": 2.8211612701416016, + "learning_rate": 1.496206373292868e-06, + "loss": 0.1696, + "step": 2976 + }, + { + "epoch": 0.8587025059902355, + "grad_norm": 2.621321439743042, + "learning_rate": 1.4871016691957514e-06, + "loss": 0.2357, + "step": 2979 + }, + { + "epoch": 0.8595672617868017, + "grad_norm": 1.5020322799682617, + "learning_rate": 1.4779969650986344e-06, + "loss": 0.2456, + "step": 2982 + }, + { + "epoch": 0.8604320175833678, + "grad_norm": 1.474507212638855, + "learning_rate": 1.4688922610015175e-06, + "loss": 0.2074, + "step": 2985 + }, + { + "epoch": 0.861296773379934, + "grad_norm": 2.9856317043304443, + "learning_rate": 1.4597875569044007e-06, + "loss": 0.2241, + "step": 2988 + }, + { + "epoch": 0.8621615291765002, + "grad_norm": 2.0011954307556152, + "learning_rate": 1.4506828528072838e-06, + "loss": 0.2026, + "step": 2991 + }, + { + "epoch": 0.8630262849730664, + "grad_norm": 1.6045671701431274, + "learning_rate": 1.441578148710167e-06, + "loss": 0.186, + "step": 2994 + }, + { + "epoch": 0.8638910407696326, + "grad_norm": 1.5708575248718262, + "learning_rate": 1.4324734446130503e-06, + "loss": 0.2048, + "step": 2997 + }, + { + "epoch": 0.8647557965661988, + "grad_norm": 2.8704543113708496, + "learning_rate": 1.4233687405159332e-06, + "loss": 0.2158, + "step": 3000 + }, + { + "epoch": 0.8647557965661988, + "eval_loss": 0.2083793729543686, + "eval_mse": 0.20837937731285638, + "eval_runtime": 6.6796, + "eval_samples_per_second": 149.709, + "eval_steps_per_second": 18.714, + "step": 3000 + }, + { + "epoch": 0.865620552362765, + "grad_norm": 3.2445404529571533, + "learning_rate": 1.4142640364188164e-06, + "loss": 0.2144, + "step": 3003 + }, + { + "epoch": 0.8664853081593312, + "grad_norm": 1.96418297290802, + "learning_rate": 1.4051593323216997e-06, + "loss": 0.2491, + "step": 3006 + }, + { + "epoch": 0.8673500639558974, + "grad_norm": 1.8195468187332153, + "learning_rate": 1.396054628224583e-06, + "loss": 0.2065, + "step": 3009 + }, + { + "epoch": 0.8682148197524636, + "grad_norm": 1.3888121843338013, + "learning_rate": 1.3869499241274658e-06, + "loss": 0.2049, + "step": 3012 + }, + { + "epoch": 0.8690795755490298, + "grad_norm": 3.738133668899536, + "learning_rate": 1.377845220030349e-06, + "loss": 0.2522, + "step": 3015 + }, + { + "epoch": 0.869944331345596, + "grad_norm": 6.097411632537842, + "learning_rate": 1.3687405159332323e-06, + "loss": 0.2532, + "step": 3018 + }, + { + "epoch": 0.8708090871421622, + "grad_norm": 2.2873427867889404, + "learning_rate": 1.3596358118361156e-06, + "loss": 0.2382, + "step": 3021 + }, + { + "epoch": 0.8716738429387284, + "grad_norm": 1.540143370628357, + "learning_rate": 1.3505311077389985e-06, + "loss": 0.1979, + "step": 3024 + }, + { + "epoch": 0.8725385987352946, + "grad_norm": 1.6231845617294312, + "learning_rate": 1.3414264036418817e-06, + "loss": 0.195, + "step": 3027 + }, + { + "epoch": 0.8734033545318608, + "grad_norm": 2.2970290184020996, + "learning_rate": 1.332321699544765e-06, + "loss": 0.2154, + "step": 3030 + }, + { + "epoch": 0.874268110328427, + "grad_norm": 2.0112340450286865, + "learning_rate": 1.3232169954476482e-06, + "loss": 0.2355, + "step": 3033 + }, + { + "epoch": 0.8751328661249932, + "grad_norm": 1.3887783288955688, + "learning_rate": 1.314112291350531e-06, + "loss": 0.201, + "step": 3036 + }, + { + "epoch": 0.8759976219215594, + "grad_norm": 2.187082529067993, + "learning_rate": 1.3050075872534144e-06, + "loss": 0.1965, + "step": 3039 + }, + { + "epoch": 0.8768623777181257, + "grad_norm": 1.300243616104126, + "learning_rate": 1.2959028831562976e-06, + "loss": 0.2097, + "step": 3042 + }, + { + "epoch": 0.8777271335146919, + "grad_norm": 2.1217234134674072, + "learning_rate": 1.2867981790591807e-06, + "loss": 0.214, + "step": 3045 + }, + { + "epoch": 0.8785918893112581, + "grad_norm": 1.8281973600387573, + "learning_rate": 1.277693474962064e-06, + "loss": 0.2082, + "step": 3048 + }, + { + "epoch": 0.8794566451078243, + "grad_norm": 2.3602306842803955, + "learning_rate": 1.268588770864947e-06, + "loss": 0.2172, + "step": 3051 + }, + { + "epoch": 0.8803214009043905, + "grad_norm": 1.903954267501831, + "learning_rate": 1.25948406676783e-06, + "loss": 0.2323, + "step": 3054 + }, + { + "epoch": 0.8811861567009567, + "grad_norm": 3.514057159423828, + "learning_rate": 1.2503793626707133e-06, + "loss": 0.2323, + "step": 3057 + }, + { + "epoch": 0.8820509124975229, + "grad_norm": 3.3089487552642822, + "learning_rate": 1.2412746585735964e-06, + "loss": 0.2506, + "step": 3060 + }, + { + "epoch": 0.8829156682940891, + "grad_norm": 2.317981004714966, + "learning_rate": 1.2321699544764796e-06, + "loss": 0.2274, + "step": 3063 + }, + { + "epoch": 0.8837804240906553, + "grad_norm": 2.7326478958129883, + "learning_rate": 1.2230652503793627e-06, + "loss": 0.2057, + "step": 3066 + }, + { + "epoch": 0.8846451798872215, + "grad_norm": 4.13656759262085, + "learning_rate": 1.213960546282246e-06, + "loss": 0.2209, + "step": 3069 + }, + { + "epoch": 0.8855099356837877, + "grad_norm": 2.0688633918762207, + "learning_rate": 1.204855842185129e-06, + "loss": 0.2608, + "step": 3072 + }, + { + "epoch": 0.8863746914803539, + "grad_norm": 1.9340734481811523, + "learning_rate": 1.1957511380880123e-06, + "loss": 0.2187, + "step": 3075 + }, + { + "epoch": 0.8872394472769201, + "grad_norm": 2.4431509971618652, + "learning_rate": 1.1866464339908953e-06, + "loss": 0.2578, + "step": 3078 + }, + { + "epoch": 0.8881042030734863, + "grad_norm": 2.9879822731018066, + "learning_rate": 1.1775417298937786e-06, + "loss": 0.2614, + "step": 3081 + }, + { + "epoch": 0.8889689588700525, + "grad_norm": 1.412150502204895, + "learning_rate": 1.1684370257966617e-06, + "loss": 0.1879, + "step": 3084 + }, + { + "epoch": 0.8898337146666186, + "grad_norm": 2.1106693744659424, + "learning_rate": 1.159332321699545e-06, + "loss": 0.2436, + "step": 3087 + }, + { + "epoch": 0.8906984704631848, + "grad_norm": 1.6913747787475586, + "learning_rate": 1.150227617602428e-06, + "loss": 0.2234, + "step": 3090 + }, + { + "epoch": 0.891563226259751, + "grad_norm": 1.6746766567230225, + "learning_rate": 1.1411229135053112e-06, + "loss": 0.2381, + "step": 3093 + }, + { + "epoch": 0.8924279820563172, + "grad_norm": 3.068824291229248, + "learning_rate": 1.1320182094081943e-06, + "loss": 0.223, + "step": 3096 + }, + { + "epoch": 0.8932927378528834, + "grad_norm": 2.9033825397491455, + "learning_rate": 1.1229135053110776e-06, + "loss": 0.2197, + "step": 3099 + }, + { + "epoch": 0.8935809897850722, + "eval_loss": 0.21448279917240143, + "eval_mse": 0.21448280355427415, + "eval_runtime": 6.6727, + "eval_samples_per_second": 149.865, + "eval_steps_per_second": 18.733, + "step": 3100 + }, + { + "epoch": 0.8941574936494496, + "grad_norm": 1.8722037076950073, + "learning_rate": 1.1138088012139606e-06, + "loss": 0.1974, + "step": 3102 + }, + { + "epoch": 0.8950222494460158, + "grad_norm": 1.319684624671936, + "learning_rate": 1.1047040971168439e-06, + "loss": 0.2027, + "step": 3105 + }, + { + "epoch": 0.895887005242582, + "grad_norm": 3.9506242275238037, + "learning_rate": 1.095599393019727e-06, + "loss": 0.2124, + "step": 3108 + }, + { + "epoch": 0.8967517610391482, + "grad_norm": 1.7725896835327148, + "learning_rate": 1.08649468892261e-06, + "loss": 0.2146, + "step": 3111 + }, + { + "epoch": 0.8976165168357144, + "grad_norm": 1.9070608615875244, + "learning_rate": 1.0773899848254933e-06, + "loss": 0.2311, + "step": 3114 + }, + { + "epoch": 0.8984812726322806, + "grad_norm": 2.3098270893096924, + "learning_rate": 1.0682852807283763e-06, + "loss": 0.1847, + "step": 3117 + }, + { + "epoch": 0.8993460284288468, + "grad_norm": 2.392598867416382, + "learning_rate": 1.0591805766312596e-06, + "loss": 0.252, + "step": 3120 + }, + { + "epoch": 0.900210784225413, + "grad_norm": 1.592748761177063, + "learning_rate": 1.0500758725341426e-06, + "loss": 0.2226, + "step": 3123 + }, + { + "epoch": 0.9010755400219792, + "grad_norm": 2.1816020011901855, + "learning_rate": 1.0409711684370259e-06, + "loss": 0.2551, + "step": 3126 + }, + { + "epoch": 0.9019402958185454, + "grad_norm": 2.04571270942688, + "learning_rate": 1.031866464339909e-06, + "loss": 0.2506, + "step": 3129 + }, + { + "epoch": 0.9028050516151116, + "grad_norm": 3.148040771484375, + "learning_rate": 1.0227617602427922e-06, + "loss": 0.2056, + "step": 3132 + }, + { + "epoch": 0.9036698074116778, + "grad_norm": 4.721248626708984, + "learning_rate": 1.0136570561456753e-06, + "loss": 0.2243, + "step": 3135 + }, + { + "epoch": 0.904534563208244, + "grad_norm": 1.5059829950332642, + "learning_rate": 1.0045523520485585e-06, + "loss": 0.1884, + "step": 3138 + }, + { + "epoch": 0.9053993190048102, + "grad_norm": 1.9442839622497559, + "learning_rate": 9.954476479514416e-07, + "loss": 0.1965, + "step": 3141 + }, + { + "epoch": 0.9062640748013764, + "grad_norm": 2.1955509185791016, + "learning_rate": 9.863429438543249e-07, + "loss": 0.204, + "step": 3144 + }, + { + "epoch": 0.9071288305979426, + "grad_norm": 1.5504348278045654, + "learning_rate": 9.77238239757208e-07, + "loss": 0.2248, + "step": 3147 + }, + { + "epoch": 0.9079935863945088, + "grad_norm": 1.6235114336013794, + "learning_rate": 9.681335356600912e-07, + "loss": 0.2276, + "step": 3150 + }, + { + "epoch": 0.908858342191075, + "grad_norm": 2.8215818405151367, + "learning_rate": 9.590288315629742e-07, + "loss": 0.2292, + "step": 3153 + }, + { + "epoch": 0.9097230979876412, + "grad_norm": 1.7022403478622437, + "learning_rate": 9.499241274658574e-07, + "loss": 0.2383, + "step": 3156 + }, + { + "epoch": 0.9105878537842074, + "grad_norm": 2.5963640213012695, + "learning_rate": 9.408194233687407e-07, + "loss": 0.2311, + "step": 3159 + }, + { + "epoch": 0.9114526095807736, + "grad_norm": 1.830937147140503, + "learning_rate": 9.317147192716237e-07, + "loss": 0.2073, + "step": 3162 + }, + { + "epoch": 0.9123173653773398, + "grad_norm": 1.8354508876800537, + "learning_rate": 9.22610015174507e-07, + "loss": 0.2143, + "step": 3165 + }, + { + "epoch": 0.913182121173906, + "grad_norm": 1.5569084882736206, + "learning_rate": 9.1350531107739e-07, + "loss": 0.2102, + "step": 3168 + }, + { + "epoch": 0.9140468769704722, + "grad_norm": 1.5762006044387817, + "learning_rate": 9.044006069802733e-07, + "loss": 0.2335, + "step": 3171 + }, + { + "epoch": 0.9149116327670384, + "grad_norm": 1.4155080318450928, + "learning_rate": 8.952959028831563e-07, + "loss": 0.1993, + "step": 3174 + }, + { + "epoch": 0.9157763885636045, + "grad_norm": 3.324310779571533, + "learning_rate": 8.861911987860396e-07, + "loss": 0.2209, + "step": 3177 + }, + { + "epoch": 0.9166411443601707, + "grad_norm": 2.4857962131500244, + "learning_rate": 8.770864946889227e-07, + "loss": 0.2384, + "step": 3180 + }, + { + "epoch": 0.9175059001567369, + "grad_norm": 1.998383641242981, + "learning_rate": 8.679817905918058e-07, + "loss": 0.2216, + "step": 3183 + }, + { + "epoch": 0.9183706559533031, + "grad_norm": 2.2427866458892822, + "learning_rate": 8.58877086494689e-07, + "loss": 0.2328, + "step": 3186 + }, + { + "epoch": 0.9192354117498693, + "grad_norm": 2.5861330032348633, + "learning_rate": 8.497723823975721e-07, + "loss": 0.2228, + "step": 3189 + }, + { + "epoch": 0.9201001675464355, + "grad_norm": 1.680492639541626, + "learning_rate": 8.406676783004553e-07, + "loss": 0.2241, + "step": 3192 + }, + { + "epoch": 0.9209649233430017, + "grad_norm": 2.616267204284668, + "learning_rate": 8.315629742033385e-07, + "loss": 0.2524, + "step": 3195 + }, + { + "epoch": 0.921829679139568, + "grad_norm": 1.922303318977356, + "learning_rate": 8.224582701062215e-07, + "loss": 0.2397, + "step": 3198 + }, + { + "epoch": 0.9224061830039455, + "eval_loss": 0.21633096039295197, + "eval_mse": 0.2163309759118274, + "eval_runtime": 6.5496, + "eval_samples_per_second": 152.68, + "eval_steps_per_second": 19.085, + "step": 3200 + }, + { + "epoch": 0.9226944349361342, + "grad_norm": 4.39849853515625, + "learning_rate": 8.133535660091048e-07, + "loss": 0.2216, + "step": 3201 + }, + { + "epoch": 0.9235591907327004, + "grad_norm": 2.2769124507904053, + "learning_rate": 8.042488619119878e-07, + "loss": 0.2182, + "step": 3204 + }, + { + "epoch": 0.9244239465292666, + "grad_norm": 2.8028557300567627, + "learning_rate": 7.951441578148711e-07, + "loss": 0.229, + "step": 3207 + }, + { + "epoch": 0.9252887023258328, + "grad_norm": 2.1702733039855957, + "learning_rate": 7.860394537177542e-07, + "loss": 0.2261, + "step": 3210 + }, + { + "epoch": 0.926153458122399, + "grad_norm": 2.6439995765686035, + "learning_rate": 7.769347496206374e-07, + "loss": 0.2246, + "step": 3213 + }, + { + "epoch": 0.9270182139189652, + "grad_norm": 1.576919436454773, + "learning_rate": 7.678300455235206e-07, + "loss": 0.2354, + "step": 3216 + }, + { + "epoch": 0.9278829697155314, + "grad_norm": 1.6755398511886597, + "learning_rate": 7.587253414264036e-07, + "loss": 0.2158, + "step": 3219 + }, + { + "epoch": 0.9287477255120976, + "grad_norm": 2.1890718936920166, + "learning_rate": 7.496206373292869e-07, + "loss": 0.2233, + "step": 3222 + }, + { + "epoch": 0.9296124813086638, + "grad_norm": 1.7316986322402954, + "learning_rate": 7.4051593323217e-07, + "loss": 0.2296, + "step": 3225 + }, + { + "epoch": 0.93047723710523, + "grad_norm": 1.7639137506484985, + "learning_rate": 7.314112291350532e-07, + "loss": 0.2274, + "step": 3228 + }, + { + "epoch": 0.9313419929017962, + "grad_norm": 1.7912460565567017, + "learning_rate": 7.223065250379363e-07, + "loss": 0.1942, + "step": 3231 + }, + { + "epoch": 0.9322067486983624, + "grad_norm": 1.9391751289367676, + "learning_rate": 7.132018209408196e-07, + "loss": 0.2326, + "step": 3234 + }, + { + "epoch": 0.9330715044949286, + "grad_norm": 2.4329934120178223, + "learning_rate": 7.040971168437026e-07, + "loss": 0.2205, + "step": 3237 + }, + { + "epoch": 0.9339362602914948, + "grad_norm": 1.5994445085525513, + "learning_rate": 6.949924127465859e-07, + "loss": 0.2149, + "step": 3240 + }, + { + "epoch": 0.934801016088061, + "grad_norm": 2.992966651916504, + "learning_rate": 6.858877086494689e-07, + "loss": 0.2194, + "step": 3243 + }, + { + "epoch": 0.9356657718846272, + "grad_norm": 1.8795028924942017, + "learning_rate": 6.767830045523521e-07, + "loss": 0.2158, + "step": 3246 + }, + { + "epoch": 0.9365305276811934, + "grad_norm": 3.5229902267456055, + "learning_rate": 6.676783004552352e-07, + "loss": 0.2396, + "step": 3249 + }, + { + "epoch": 0.9373952834777596, + "grad_norm": 1.6539433002471924, + "learning_rate": 6.585735963581184e-07, + "loss": 0.2077, + "step": 3252 + }, + { + "epoch": 0.9382600392743258, + "grad_norm": 2.448824405670166, + "learning_rate": 6.494688922610016e-07, + "loss": 0.2271, + "step": 3255 + }, + { + "epoch": 0.939124795070892, + "grad_norm": 1.3006185293197632, + "learning_rate": 6.403641881638847e-07, + "loss": 0.2432, + "step": 3258 + }, + { + "epoch": 0.9399895508674582, + "grad_norm": 2.689985752105713, + "learning_rate": 6.312594840667678e-07, + "loss": 0.2152, + "step": 3261 + }, + { + "epoch": 0.9408543066640244, + "grad_norm": 1.5370323657989502, + "learning_rate": 6.22154779969651e-07, + "loss": 0.197, + "step": 3264 + }, + { + "epoch": 0.9417190624605906, + "grad_norm": 1.6089318990707397, + "learning_rate": 6.130500758725342e-07, + "loss": 0.2145, + "step": 3267 + }, + { + "epoch": 0.9425838182571568, + "grad_norm": 2.1321656703948975, + "learning_rate": 6.039453717754174e-07, + "loss": 0.1989, + "step": 3270 + }, + { + "epoch": 0.943448574053723, + "grad_norm": 1.990070104598999, + "learning_rate": 5.948406676783005e-07, + "loss": 0.1982, + "step": 3273 + }, + { + "epoch": 0.9443133298502892, + "grad_norm": 3.087510585784912, + "learning_rate": 5.857359635811837e-07, + "loss": 0.206, + "step": 3276 + }, + { + "epoch": 0.9451780856468553, + "grad_norm": 2.3679909706115723, + "learning_rate": 5.766312594840668e-07, + "loss": 0.2292, + "step": 3279 + }, + { + "epoch": 0.9460428414434215, + "grad_norm": 1.546036958694458, + "learning_rate": 5.675265553869499e-07, + "loss": 0.2127, + "step": 3282 + }, + { + "epoch": 0.9469075972399877, + "grad_norm": 1.9500880241394043, + "learning_rate": 5.584218512898331e-07, + "loss": 0.2157, + "step": 3285 + }, + { + "epoch": 0.9477723530365539, + "grad_norm": 1.9931670427322388, + "learning_rate": 5.493171471927162e-07, + "loss": 0.2151, + "step": 3288 + }, + { + "epoch": 0.9486371088331201, + "grad_norm": 1.7962517738342285, + "learning_rate": 5.402124430955994e-07, + "loss": 0.2035, + "step": 3291 + }, + { + "epoch": 0.9495018646296863, + "grad_norm": 1.3835334777832031, + "learning_rate": 5.311077389984825e-07, + "loss": 0.2059, + "step": 3294 + }, + { + "epoch": 0.9503666204262525, + "grad_norm": 3.157975912094116, + "learning_rate": 5.220030349013658e-07, + "loss": 0.2002, + "step": 3297 + }, + { + "epoch": 0.9512313762228187, + "grad_norm": 2.0881123542785645, + "learning_rate": 5.12898330804249e-07, + "loss": 0.2307, + "step": 3300 + }, + { + "epoch": 0.9512313762228187, + "eval_loss": 0.21600358188152313, + "eval_mse": 0.21600358275626785, + "eval_runtime": 6.4833, + "eval_samples_per_second": 154.243, + "eval_steps_per_second": 19.28, + "step": 3300 + }, + { + "epoch": 0.9520961320193849, + "grad_norm": 2.1528842449188232, + "learning_rate": 5.037936267071321e-07, + "loss": 0.2657, + "step": 3303 + }, + { + "epoch": 0.9529608878159511, + "grad_norm": 2.4048590660095215, + "learning_rate": 4.946889226100153e-07, + "loss": 0.2355, + "step": 3306 + }, + { + "epoch": 0.9538256436125173, + "grad_norm": 1.866373896598816, + "learning_rate": 4.855842185128983e-07, + "loss": 0.1932, + "step": 3309 + }, + { + "epoch": 0.9546903994090835, + "grad_norm": 1.543273687362671, + "learning_rate": 4.7647951441578155e-07, + "loss": 0.2088, + "step": 3312 + }, + { + "epoch": 0.9555551552056497, + "grad_norm": 1.585208535194397, + "learning_rate": 4.673748103186647e-07, + "loss": 0.1945, + "step": 3315 + }, + { + "epoch": 0.9564199110022159, + "grad_norm": 1.9510430097579956, + "learning_rate": 4.582701062215478e-07, + "loss": 0.1719, + "step": 3318 + }, + { + "epoch": 0.9572846667987821, + "grad_norm": 1.36229407787323, + "learning_rate": 4.49165402124431e-07, + "loss": 0.1997, + "step": 3321 + }, + { + "epoch": 0.9581494225953483, + "grad_norm": 2.563950777053833, + "learning_rate": 4.4006069802731414e-07, + "loss": 0.2385, + "step": 3324 + }, + { + "epoch": 0.9590141783919145, + "grad_norm": 2.159186363220215, + "learning_rate": 4.309559939301973e-07, + "loss": 0.2319, + "step": 3327 + }, + { + "epoch": 0.9598789341884807, + "grad_norm": 2.4233345985412598, + "learning_rate": 4.2185128983308046e-07, + "loss": 0.2117, + "step": 3330 + }, + { + "epoch": 0.9607436899850469, + "grad_norm": 1.915822982788086, + "learning_rate": 4.127465857359636e-07, + "loss": 0.2244, + "step": 3333 + }, + { + "epoch": 0.9616084457816131, + "grad_norm": 3.731882333755493, + "learning_rate": 4.0364188163884673e-07, + "loss": 0.2222, + "step": 3336 + }, + { + "epoch": 0.9624732015781793, + "grad_norm": 1.4122893810272217, + "learning_rate": 3.945371775417299e-07, + "loss": 0.2052, + "step": 3339 + }, + { + "epoch": 0.9633379573747455, + "grad_norm": 3.098508596420288, + "learning_rate": 3.8543247344461305e-07, + "loss": 0.212, + "step": 3342 + }, + { + "epoch": 0.9642027131713117, + "grad_norm": 1.4726747274398804, + "learning_rate": 3.763277693474962e-07, + "loss": 0.2374, + "step": 3345 + }, + { + "epoch": 0.9650674689678779, + "grad_norm": 1.8609498739242554, + "learning_rate": 3.6722306525037937e-07, + "loss": 0.2173, + "step": 3348 + }, + { + "epoch": 0.9659322247644441, + "grad_norm": 1.8475359678268433, + "learning_rate": 3.581183611532626e-07, + "loss": 0.2032, + "step": 3351 + }, + { + "epoch": 0.9667969805610104, + "grad_norm": 1.7029978036880493, + "learning_rate": 3.4901365705614574e-07, + "loss": 0.1713, + "step": 3354 + }, + { + "epoch": 0.9676617363575766, + "grad_norm": 2.657390832901001, + "learning_rate": 3.399089529590289e-07, + "loss": 0.2216, + "step": 3357 + }, + { + "epoch": 0.9685264921541428, + "grad_norm": 3.23854398727417, + "learning_rate": 3.3080424886191206e-07, + "loss": 0.1776, + "step": 3360 + }, + { + "epoch": 0.969391247950709, + "grad_norm": 2.09384822845459, + "learning_rate": 3.2169954476479517e-07, + "loss": 0.1866, + "step": 3363 + }, + { + "epoch": 0.9702560037472752, + "grad_norm": 1.8957816362380981, + "learning_rate": 3.1259484066767833e-07, + "loss": 0.2317, + "step": 3366 + }, + { + "epoch": 0.9711207595438414, + "grad_norm": 1.5353327989578247, + "learning_rate": 3.034901365705615e-07, + "loss": 0.209, + "step": 3369 + }, + { + "epoch": 0.9719855153404076, + "grad_norm": 1.776352882385254, + "learning_rate": 2.9438543247344465e-07, + "loss": 0.219, + "step": 3372 + }, + { + "epoch": 0.9728502711369738, + "grad_norm": 3.282552480697632, + "learning_rate": 2.852807283763278e-07, + "loss": 0.224, + "step": 3375 + }, + { + "epoch": 0.97371502693354, + "grad_norm": 1.8530248403549194, + "learning_rate": 2.7617602427921097e-07, + "loss": 0.2147, + "step": 3378 + }, + { + "epoch": 0.9745797827301061, + "grad_norm": 2.0258686542510986, + "learning_rate": 2.670713201820941e-07, + "loss": 0.2081, + "step": 3381 + }, + { + "epoch": 0.9754445385266723, + "grad_norm": 1.830100655555725, + "learning_rate": 2.5796661608497724e-07, + "loss": 0.2248, + "step": 3384 + }, + { + "epoch": 0.9763092943232385, + "grad_norm": 3.846280813217163, + "learning_rate": 2.488619119878604e-07, + "loss": 0.2269, + "step": 3387 + }, + { + "epoch": 0.9771740501198047, + "grad_norm": 2.3556368350982666, + "learning_rate": 2.3975720789074356e-07, + "loss": 0.2195, + "step": 3390 + }, + { + "epoch": 0.9780388059163709, + "grad_norm": 1.8791780471801758, + "learning_rate": 2.3065250379362674e-07, + "loss": 0.2297, + "step": 3393 + }, + { + "epoch": 0.9789035617129371, + "grad_norm": 1.6351841688156128, + "learning_rate": 2.215477996965099e-07, + "loss": 0.1925, + "step": 3396 + }, + { + "epoch": 0.9797683175095033, + "grad_norm": 2.4028263092041016, + "learning_rate": 2.1244309559939304e-07, + "loss": 0.2099, + "step": 3399 + }, + { + "epoch": 0.9800565694416921, + "eval_loss": 0.2100730687379837, + "eval_mse": 0.21007308381050824, + "eval_runtime": 6.746, + "eval_samples_per_second": 148.235, + "eval_steps_per_second": 18.529, + "step": 3400 + }, + { + "epoch": 0.9806330733060695, + "grad_norm": 3.0006494522094727, + "learning_rate": 2.033383915022762e-07, + "loss": 0.2095, + "step": 3402 + }, + { + "epoch": 0.9814978291026357, + "grad_norm": 2.7023770809173584, + "learning_rate": 1.9423368740515936e-07, + "loss": 0.2064, + "step": 3405 + }, + { + "epoch": 0.9823625848992019, + "grad_norm": 3.8468315601348877, + "learning_rate": 1.851289833080425e-07, + "loss": 0.2412, + "step": 3408 + }, + { + "epoch": 0.9832273406957681, + "grad_norm": 2.5841078758239746, + "learning_rate": 1.7602427921092565e-07, + "loss": 0.2056, + "step": 3411 + }, + { + "epoch": 0.9840920964923343, + "grad_norm": 2.557180881500244, + "learning_rate": 1.669195751138088e-07, + "loss": 0.2449, + "step": 3414 + }, + { + "epoch": 0.9849568522889005, + "grad_norm": 2.1071228981018066, + "learning_rate": 1.5781487101669194e-07, + "loss": 0.2664, + "step": 3417 + }, + { + "epoch": 0.9858216080854667, + "grad_norm": 2.5804591178894043, + "learning_rate": 1.4871016691957513e-07, + "loss": 0.2252, + "step": 3420 + }, + { + "epoch": 0.9866863638820329, + "grad_norm": 1.557554006576538, + "learning_rate": 1.3960546282245826e-07, + "loss": 0.1979, + "step": 3423 + }, + { + "epoch": 0.9875511196785991, + "grad_norm": 2.0957911014556885, + "learning_rate": 1.3050075872534145e-07, + "loss": 0.21, + "step": 3426 + }, + { + "epoch": 0.9884158754751653, + "grad_norm": 2.239748954772949, + "learning_rate": 1.2139605462822459e-07, + "loss": 0.1995, + "step": 3429 + }, + { + "epoch": 0.9892806312717315, + "grad_norm": 2.46882963180542, + "learning_rate": 1.1229135053110775e-07, + "loss": 0.2319, + "step": 3432 + }, + { + "epoch": 0.9901453870682977, + "grad_norm": 2.6145310401916504, + "learning_rate": 1.031866464339909e-07, + "loss": 0.2245, + "step": 3435 + }, + { + "epoch": 0.9910101428648639, + "grad_norm": 2.32053279876709, + "learning_rate": 9.408194233687405e-08, + "loss": 0.2338, + "step": 3438 + }, + { + "epoch": 0.9918748986614301, + "grad_norm": 1.4136054515838623, + "learning_rate": 8.497723823975723e-08, + "loss": 0.2173, + "step": 3441 + }, + { + "epoch": 0.9927396544579963, + "grad_norm": 1.3618991374969482, + "learning_rate": 7.587253414264037e-08, + "loss": 0.1794, + "step": 3444 + }, + { + "epoch": 0.9936044102545625, + "grad_norm": 2.8345561027526855, + "learning_rate": 6.676783004552352e-08, + "loss": 0.2182, + "step": 3447 + }, + { + "epoch": 0.9944691660511287, + "grad_norm": 1.9578803777694702, + "learning_rate": 5.7663125948406686e-08, + "loss": 0.2333, + "step": 3450 + }, + { + "epoch": 0.9953339218476949, + "grad_norm": 1.6162538528442383, + "learning_rate": 4.855842185128984e-08, + "loss": 0.2005, + "step": 3453 + }, + { + "epoch": 0.9961986776442611, + "grad_norm": 3.075516939163208, + "learning_rate": 3.9453717754172986e-08, + "loss": 0.2249, + "step": 3456 + }, + { + "epoch": 0.9970634334408273, + "grad_norm": 2.544214963912964, + "learning_rate": 3.0349013657056146e-08, + "loss": 0.2147, + "step": 3459 + }, + { + "epoch": 0.9979281892373935, + "grad_norm": 1.9591963291168213, + "learning_rate": 2.1244309559939306e-08, + "loss": 0.2164, + "step": 3462 + }, + { + "epoch": 0.9987929450339597, + "grad_norm": 1.6480742692947388, + "learning_rate": 1.213960546282246e-08, + "loss": 0.1889, + "step": 3465 + }, + { + "epoch": 0.9996577008305259, + "grad_norm": 1.6390196084976196, + "learning_rate": 3.034901365705615e-09, + "loss": 0.2205, + "step": 3468 + }, + { + "epoch": 0.9999459527627146, + "step": 3469, + "total_flos": 1.1682867916662374e+17, + "train_loss": 0.2771636627187094, + "train_runtime": 4603.777, + "train_samples_per_second": 96.454, + "train_steps_per_second": 0.754 + } + ], + "logging_steps": 3, + "max_steps": 3469, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1682867916662374e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}