diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65844 @@ +{ + "best_global_step": 34400, + "best_metric": 0.30035173892974854, + "best_model_checkpoint": "saves/lntuning/gemma-3-1b-it/train_stsb_1745333589/checkpoint-34400", + "epoch": 123.45749613601237, + "eval_steps": 200, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015455950540958269, + "grad_norm": 9.226296424865723, + "learning_rate": 4.999999876629946e-05, + "loss": 9.1651, + "num_input_tokens_seen": 7424, + "step": 5 + }, + { + "epoch": 0.030911901081916538, + "grad_norm": 9.222990036010742, + "learning_rate": 4.999999375439123e-05, + "loss": 9.0102, + "num_input_tokens_seen": 14848, + "step": 10 + }, + { + "epoch": 0.04636785162287481, + "grad_norm": 8.519775390625, + "learning_rate": 4.9999984887169785e-05, + "loss": 8.8724, + "num_input_tokens_seen": 22144, + "step": 15 + }, + { + "epoch": 0.061823802163833076, + "grad_norm": 8.680684089660645, + "learning_rate": 4.9999972164636506e-05, + "loss": 8.7402, + "num_input_tokens_seen": 30112, + "step": 20 + }, + { + "epoch": 0.07727975270479134, + "grad_norm": 7.912083625793457, + "learning_rate": 4.999995558679334e-05, + "loss": 8.3038, + "num_input_tokens_seen": 37728, + "step": 25 + }, + { + "epoch": 0.09273570324574962, + "grad_norm": 8.116209983825684, + "learning_rate": 4.999993515364287e-05, + "loss": 8.2241, + "num_input_tokens_seen": 45280, + "step": 30 + }, + { + "epoch": 0.10819165378670788, + "grad_norm": 8.148954391479492, + "learning_rate": 4.999991086518822e-05, + "loss": 8.0231, + "num_input_tokens_seen": 52768, + "step": 35 + }, + { + "epoch": 0.12364760432766615, + "grad_norm": 7.437188625335693, + "learning_rate": 4.999988272143315e-05, + "loss": 7.5294, + "num_input_tokens_seen": 60384, + "step": 40 + }, + { + "epoch": 0.1391035548686244, + "grad_norm": 7.58148717880249, + "learning_rate": 4.999985072238199e-05, + "loss": 7.2901, + "num_input_tokens_seen": 67872, + "step": 45 + }, + { + "epoch": 0.1545595054095827, + "grad_norm": 7.324509143829346, + "learning_rate": 4.999981486803969e-05, + "loss": 7.5274, + "num_input_tokens_seen": 75296, + "step": 50 + }, + { + "epoch": 0.17001545595054096, + "grad_norm": 6.90251350402832, + "learning_rate": 4.999977515841176e-05, + "loss": 7.1195, + "num_input_tokens_seen": 82816, + "step": 55 + }, + { + "epoch": 0.18547140649149924, + "grad_norm": 7.279465198516846, + "learning_rate": 4.9999731593504344e-05, + "loss": 7.147, + "num_input_tokens_seen": 89920, + "step": 60 + }, + { + "epoch": 0.2009273570324575, + "grad_norm": 6.85942268371582, + "learning_rate": 4.999968417332415e-05, + "loss": 6.7894, + "num_input_tokens_seen": 97888, + "step": 65 + }, + { + "epoch": 0.21638330757341576, + "grad_norm": 6.295539855957031, + "learning_rate": 4.999963289787848e-05, + "loss": 6.7172, + "num_input_tokens_seen": 104992, + "step": 70 + }, + { + "epoch": 0.23183925811437403, + "grad_norm": 6.602344989776611, + "learning_rate": 4.999957776717526e-05, + "loss": 6.6851, + "num_input_tokens_seen": 112992, + "step": 75 + }, + { + "epoch": 0.2472952086553323, + "grad_norm": 6.801296234130859, + "learning_rate": 4.9999518781222984e-05, + "loss": 6.3046, + "num_input_tokens_seen": 121024, + "step": 80 + }, + { + "epoch": 0.26275115919629055, + "grad_norm": 6.376216888427734, + "learning_rate": 4.9999455940030746e-05, + "loss": 6.4236, + "num_input_tokens_seen": 128672, + "step": 85 + }, + { + "epoch": 0.2782071097372488, + "grad_norm": 6.000462055206299, + "learning_rate": 4.999938924360824e-05, + "loss": 5.9794, + "num_input_tokens_seen": 136544, + "step": 90 + }, + { + "epoch": 0.2936630602782071, + "grad_norm": 5.497887134552002, + "learning_rate": 4.999931869196575e-05, + "loss": 5.8589, + "num_input_tokens_seen": 144160, + "step": 95 + }, + { + "epoch": 0.3091190108191654, + "grad_norm": 6.425207138061523, + "learning_rate": 4.999924428511416e-05, + "loss": 5.9299, + "num_input_tokens_seen": 151296, + "step": 100 + }, + { + "epoch": 0.32457496136012365, + "grad_norm": 5.885847091674805, + "learning_rate": 4.999916602306494e-05, + "loss": 5.7206, + "num_input_tokens_seen": 159456, + "step": 105 + }, + { + "epoch": 0.3400309119010819, + "grad_norm": 5.204844951629639, + "learning_rate": 4.999908390583016e-05, + "loss": 5.5123, + "num_input_tokens_seen": 167392, + "step": 110 + }, + { + "epoch": 0.3554868624420402, + "grad_norm": 5.372021198272705, + "learning_rate": 4.999899793342247e-05, + "loss": 5.6137, + "num_input_tokens_seen": 174528, + "step": 115 + }, + { + "epoch": 0.37094281298299847, + "grad_norm": 5.0562567710876465, + "learning_rate": 4.999890810585516e-05, + "loss": 5.2688, + "num_input_tokens_seen": 182464, + "step": 120 + }, + { + "epoch": 0.38639876352395675, + "grad_norm": 5.2072434425354, + "learning_rate": 4.999881442314206e-05, + "loss": 5.1788, + "num_input_tokens_seen": 190240, + "step": 125 + }, + { + "epoch": 0.401854714064915, + "grad_norm": 5.656543254852295, + "learning_rate": 4.9998716885297617e-05, + "loss": 5.0677, + "num_input_tokens_seen": 197984, + "step": 130 + }, + { + "epoch": 0.41731066460587324, + "grad_norm": 5.285520076751709, + "learning_rate": 4.999861549233688e-05, + "loss": 4.9216, + "num_input_tokens_seen": 205536, + "step": 135 + }, + { + "epoch": 0.4327666151468315, + "grad_norm": 4.824496746063232, + "learning_rate": 4.999851024427548e-05, + "loss": 4.8209, + "num_input_tokens_seen": 212640, + "step": 140 + }, + { + "epoch": 0.4482225656877898, + "grad_norm": 5.07562780380249, + "learning_rate": 4.999840114112965e-05, + "loss": 4.7347, + "num_input_tokens_seen": 220480, + "step": 145 + }, + { + "epoch": 0.46367851622874806, + "grad_norm": 4.665829181671143, + "learning_rate": 4.999828818291621e-05, + "loss": 4.5482, + "num_input_tokens_seen": 228224, + "step": 150 + }, + { + "epoch": 0.47913446676970634, + "grad_norm": 4.5057854652404785, + "learning_rate": 4.999817136965259e-05, + "loss": 4.439, + "num_input_tokens_seen": 235392, + "step": 155 + }, + { + "epoch": 0.4945904173106646, + "grad_norm": 4.600275993347168, + "learning_rate": 4.9998050701356794e-05, + "loss": 4.3218, + "num_input_tokens_seen": 243328, + "step": 160 + }, + { + "epoch": 0.5100463678516228, + "grad_norm": 3.9509365558624268, + "learning_rate": 4.999792617804744e-05, + "loss": 4.1134, + "num_input_tokens_seen": 251136, + "step": 165 + }, + { + "epoch": 0.5255023183925811, + "grad_norm": 4.331943988800049, + "learning_rate": 4.9997797799743724e-05, + "loss": 4.0625, + "num_input_tokens_seen": 258944, + "step": 170 + }, + { + "epoch": 0.5409582689335394, + "grad_norm": 4.4769062995910645, + "learning_rate": 4.999766556646545e-05, + "loss": 3.9523, + "num_input_tokens_seen": 266912, + "step": 175 + }, + { + "epoch": 0.5564142194744977, + "grad_norm": 4.205275535583496, + "learning_rate": 4.9997529478232996e-05, + "loss": 3.8876, + "num_input_tokens_seen": 274432, + "step": 180 + }, + { + "epoch": 0.5718701700154559, + "grad_norm": 4.172173976898193, + "learning_rate": 4.9997389535067365e-05, + "loss": 3.7253, + "num_input_tokens_seen": 282400, + "step": 185 + }, + { + "epoch": 0.5873261205564142, + "grad_norm": 3.9711039066314697, + "learning_rate": 4.999724573699012e-05, + "loss": 3.6504, + "num_input_tokens_seen": 289984, + "step": 190 + }, + { + "epoch": 0.6027820710973725, + "grad_norm": 3.6793811321258545, + "learning_rate": 4.9997098084023457e-05, + "loss": 3.6017, + "num_input_tokens_seen": 297984, + "step": 195 + }, + { + "epoch": 0.6182380216383307, + "grad_norm": 3.3676133155822754, + "learning_rate": 4.999694657619013e-05, + "loss": 3.4322, + "num_input_tokens_seen": 305312, + "step": 200 + }, + { + "epoch": 0.6182380216383307, + "eval_loss": 3.398878812789917, + "eval_runtime": 6.3096, + "eval_samples_per_second": 91.132, + "eval_steps_per_second": 22.823, + "num_input_tokens_seen": 305312, + "step": 200 + }, + { + "epoch": 0.633693972179289, + "grad_norm": 3.370088577270508, + "learning_rate": 4.999679121351352e-05, + "loss": 3.3863, + "num_input_tokens_seen": 312608, + "step": 205 + }, + { + "epoch": 0.6491499227202473, + "grad_norm": 3.5878167152404785, + "learning_rate": 4.9996631996017565e-05, + "loss": 3.3621, + "num_input_tokens_seen": 320672, + "step": 210 + }, + { + "epoch": 0.6646058732612056, + "grad_norm": 3.5217669010162354, + "learning_rate": 4.9996468923726835e-05, + "loss": 3.2534, + "num_input_tokens_seen": 328224, + "step": 215 + }, + { + "epoch": 0.6800618238021638, + "grad_norm": 3.328702926635742, + "learning_rate": 4.999630199666647e-05, + "loss": 3.0635, + "num_input_tokens_seen": 335520, + "step": 220 + }, + { + "epoch": 0.6955177743431221, + "grad_norm": 2.9892923831939697, + "learning_rate": 4.999613121486222e-05, + "loss": 2.9943, + "num_input_tokens_seen": 343008, + "step": 225 + }, + { + "epoch": 0.7109737248840804, + "grad_norm": 3.000009536743164, + "learning_rate": 4.999595657834041e-05, + "loss": 3.0, + "num_input_tokens_seen": 350528, + "step": 230 + }, + { + "epoch": 0.7264296754250387, + "grad_norm": 2.5820298194885254, + "learning_rate": 4.999577808712798e-05, + "loss": 2.9361, + "num_input_tokens_seen": 358368, + "step": 235 + }, + { + "epoch": 0.7418856259659969, + "grad_norm": 2.8355607986450195, + "learning_rate": 4.999559574125244e-05, + "loss": 2.8833, + "num_input_tokens_seen": 366080, + "step": 240 + }, + { + "epoch": 0.7573415765069552, + "grad_norm": 2.730891704559326, + "learning_rate": 4.9995409540741934e-05, + "loss": 2.8062, + "num_input_tokens_seen": 373888, + "step": 245 + }, + { + "epoch": 0.7727975270479135, + "grad_norm": 2.529583215713501, + "learning_rate": 4.999521948562516e-05, + "loss": 2.7272, + "num_input_tokens_seen": 381248, + "step": 250 + }, + { + "epoch": 0.7882534775888718, + "grad_norm": 2.4684531688690186, + "learning_rate": 4.999502557593143e-05, + "loss": 2.6576, + "num_input_tokens_seen": 388928, + "step": 255 + }, + { + "epoch": 0.80370942812983, + "grad_norm": 2.2863550186157227, + "learning_rate": 4.999482781169066e-05, + "loss": 2.6469, + "num_input_tokens_seen": 396416, + "step": 260 + }, + { + "epoch": 0.8191653786707882, + "grad_norm": 1.934937834739685, + "learning_rate": 4.9994626192933324e-05, + "loss": 2.5967, + "num_input_tokens_seen": 403968, + "step": 265 + }, + { + "epoch": 0.8346213292117465, + "grad_norm": 1.7734750509262085, + "learning_rate": 4.999442071969054e-05, + "loss": 2.4379, + "num_input_tokens_seen": 412192, + "step": 270 + }, + { + "epoch": 0.8500772797527048, + "grad_norm": 1.6304538249969482, + "learning_rate": 4.999421139199397e-05, + "loss": 2.4429, + "num_input_tokens_seen": 420000, + "step": 275 + }, + { + "epoch": 0.865533230293663, + "grad_norm": 1.7310148477554321, + "learning_rate": 4.999399820987592e-05, + "loss": 2.4931, + "num_input_tokens_seen": 427392, + "step": 280 + }, + { + "epoch": 0.8809891808346213, + "grad_norm": 1.650916576385498, + "learning_rate": 4.999378117336924e-05, + "loss": 2.275, + "num_input_tokens_seen": 435168, + "step": 285 + }, + { + "epoch": 0.8964451313755796, + "grad_norm": 1.5713309049606323, + "learning_rate": 4.9993560282507415e-05, + "loss": 2.4617, + "num_input_tokens_seen": 443424, + "step": 290 + }, + { + "epoch": 0.9119010819165378, + "grad_norm": 1.6697794198989868, + "learning_rate": 4.9993335537324495e-05, + "loss": 2.195, + "num_input_tokens_seen": 451040, + "step": 295 + }, + { + "epoch": 0.9273570324574961, + "grad_norm": 1.8273131847381592, + "learning_rate": 4.999310693785516e-05, + "loss": 2.3458, + "num_input_tokens_seen": 458848, + "step": 300 + }, + { + "epoch": 0.9428129829984544, + "grad_norm": 1.4630181789398193, + "learning_rate": 4.9992874484134653e-05, + "loss": 2.25, + "num_input_tokens_seen": 466560, + "step": 305 + }, + { + "epoch": 0.9582689335394127, + "grad_norm": 1.3963651657104492, + "learning_rate": 4.999263817619882e-05, + "loss": 2.1623, + "num_input_tokens_seen": 474272, + "step": 310 + }, + { + "epoch": 0.973724884080371, + "grad_norm": 1.2382981777191162, + "learning_rate": 4.9992398014084105e-05, + "loss": 2.139, + "num_input_tokens_seen": 482304, + "step": 315 + }, + { + "epoch": 0.9891808346213292, + "grad_norm": 1.584847331047058, + "learning_rate": 4.999215399782754e-05, + "loss": 2.1812, + "num_input_tokens_seen": 489952, + "step": 320 + }, + { + "epoch": 1.0030911901081916, + "grad_norm": 1.293550968170166, + "learning_rate": 4.999190612746675e-05, + "loss": 2.1673, + "num_input_tokens_seen": 496320, + "step": 325 + }, + { + "epoch": 1.01854714064915, + "grad_norm": 1.2016807794570923, + "learning_rate": 4.999165440303998e-05, + "loss": 2.0809, + "num_input_tokens_seen": 504736, + "step": 330 + }, + { + "epoch": 1.0340030911901081, + "grad_norm": 1.1789546012878418, + "learning_rate": 4.999139882458603e-05, + "loss": 2.0121, + "num_input_tokens_seen": 512256, + "step": 335 + }, + { + "epoch": 1.0494590417310665, + "grad_norm": 1.9898319244384766, + "learning_rate": 4.9991139392144314e-05, + "loss": 1.9752, + "num_input_tokens_seen": 519872, + "step": 340 + }, + { + "epoch": 1.0649149922720247, + "grad_norm": 1.177803874015808, + "learning_rate": 4.999087610575485e-05, + "loss": 1.9975, + "num_input_tokens_seen": 527712, + "step": 345 + }, + { + "epoch": 1.080370942812983, + "grad_norm": 1.3393034934997559, + "learning_rate": 4.999060896545824e-05, + "loss": 1.973, + "num_input_tokens_seen": 534592, + "step": 350 + }, + { + "epoch": 1.0958268933539412, + "grad_norm": 1.4243723154067993, + "learning_rate": 4.999033797129568e-05, + "loss": 1.9272, + "num_input_tokens_seen": 542432, + "step": 355 + }, + { + "epoch": 1.1112828438948996, + "grad_norm": 1.5232435464859009, + "learning_rate": 4.999006312330894e-05, + "loss": 2.0194, + "num_input_tokens_seen": 549856, + "step": 360 + }, + { + "epoch": 1.1267387944358578, + "grad_norm": 1.1323976516723633, + "learning_rate": 4.998978442154043e-05, + "loss": 1.9411, + "num_input_tokens_seen": 557792, + "step": 365 + }, + { + "epoch": 1.1421947449768162, + "grad_norm": 1.2476133108139038, + "learning_rate": 4.9989501866033125e-05, + "loss": 1.8829, + "num_input_tokens_seen": 565088, + "step": 370 + }, + { + "epoch": 1.1576506955177743, + "grad_norm": 1.0874273777008057, + "learning_rate": 4.998921545683059e-05, + "loss": 1.8792, + "num_input_tokens_seen": 572384, + "step": 375 + }, + { + "epoch": 1.1731066460587325, + "grad_norm": 1.1873692274093628, + "learning_rate": 4.9988925193976996e-05, + "loss": 1.7916, + "num_input_tokens_seen": 580064, + "step": 380 + }, + { + "epoch": 1.1885625965996909, + "grad_norm": 0.9547975063323975, + "learning_rate": 4.998863107751711e-05, + "loss": 1.7998, + "num_input_tokens_seen": 587424, + "step": 385 + }, + { + "epoch": 1.2040185471406493, + "grad_norm": 1.0285735130310059, + "learning_rate": 4.998833310749629e-05, + "loss": 1.9013, + "num_input_tokens_seen": 594944, + "step": 390 + }, + { + "epoch": 1.2194744976816074, + "grad_norm": 0.9065995812416077, + "learning_rate": 4.998803128396047e-05, + "loss": 1.8302, + "num_input_tokens_seen": 602816, + "step": 395 + }, + { + "epoch": 1.2349304482225656, + "grad_norm": 0.9820312261581421, + "learning_rate": 4.9987725606956215e-05, + "loss": 1.8164, + "num_input_tokens_seen": 610048, + "step": 400 + }, + { + "epoch": 1.2349304482225656, + "eval_loss": 1.7951388359069824, + "eval_runtime": 6.3249, + "eval_samples_per_second": 90.911, + "eval_steps_per_second": 22.767, + "num_input_tokens_seen": 610048, + "step": 400 + }, + { + "epoch": 1.250386398763524, + "grad_norm": 0.9885538816452026, + "learning_rate": 4.998741607653066e-05, + "loss": 1.7677, + "num_input_tokens_seen": 617536, + "step": 405 + }, + { + "epoch": 1.2658423493044824, + "grad_norm": 0.9737051725387573, + "learning_rate": 4.9987102692731523e-05, + "loss": 1.7865, + "num_input_tokens_seen": 625280, + "step": 410 + }, + { + "epoch": 1.2812982998454405, + "grad_norm": 1.1447163820266724, + "learning_rate": 4.9986785455607157e-05, + "loss": 1.7133, + "num_input_tokens_seen": 633120, + "step": 415 + }, + { + "epoch": 1.2967542503863987, + "grad_norm": 1.1590800285339355, + "learning_rate": 4.9986464365206456e-05, + "loss": 1.754, + "num_input_tokens_seen": 640064, + "step": 420 + }, + { + "epoch": 1.312210200927357, + "grad_norm": 0.8255159258842468, + "learning_rate": 4.9986139421578956e-05, + "loss": 1.7298, + "num_input_tokens_seen": 648032, + "step": 425 + }, + { + "epoch": 1.3276661514683152, + "grad_norm": 0.843002438545227, + "learning_rate": 4.998581062477477e-05, + "loss": 1.7572, + "num_input_tokens_seen": 655648, + "step": 430 + }, + { + "epoch": 1.3431221020092736, + "grad_norm": 0.9854825139045715, + "learning_rate": 4.998547797484458e-05, + "loss": 1.6995, + "num_input_tokens_seen": 662848, + "step": 435 + }, + { + "epoch": 1.3585780525502318, + "grad_norm": 0.9934817552566528, + "learning_rate": 4.9985141471839706e-05, + "loss": 1.6669, + "num_input_tokens_seen": 670496, + "step": 440 + }, + { + "epoch": 1.3740340030911902, + "grad_norm": 1.2670109272003174, + "learning_rate": 4.998480111581203e-05, + "loss": 1.6673, + "num_input_tokens_seen": 678240, + "step": 445 + }, + { + "epoch": 1.3894899536321483, + "grad_norm": 1.0868405103683472, + "learning_rate": 4.998445690681405e-05, + "loss": 1.6384, + "num_input_tokens_seen": 686784, + "step": 450 + }, + { + "epoch": 1.4049459041731067, + "grad_norm": 0.8322428464889526, + "learning_rate": 4.9984108844898834e-05, + "loss": 1.6854, + "num_input_tokens_seen": 693888, + "step": 455 + }, + { + "epoch": 1.4204018547140649, + "grad_norm": 0.9425356984138489, + "learning_rate": 4.9983756930120076e-05, + "loss": 1.6608, + "num_input_tokens_seen": 701568, + "step": 460 + }, + { + "epoch": 1.435857805255023, + "grad_norm": 1.2053849697113037, + "learning_rate": 4.9983401162532025e-05, + "loss": 1.6611, + "num_input_tokens_seen": 709344, + "step": 465 + }, + { + "epoch": 1.4513137557959814, + "grad_norm": 0.8954638838768005, + "learning_rate": 4.998304154218955e-05, + "loss": 1.6771, + "num_input_tokens_seen": 717152, + "step": 470 + }, + { + "epoch": 1.4667697063369398, + "grad_norm": 1.0540295839309692, + "learning_rate": 4.998267806914812e-05, + "loss": 1.6836, + "num_input_tokens_seen": 724832, + "step": 475 + }, + { + "epoch": 1.482225656877898, + "grad_norm": 0.8107435703277588, + "learning_rate": 4.998231074346378e-05, + "loss": 1.5497, + "num_input_tokens_seen": 732704, + "step": 480 + }, + { + "epoch": 1.4976816074188561, + "grad_norm": 0.7813143730163574, + "learning_rate": 4.998193956519317e-05, + "loss": 1.5597, + "num_input_tokens_seen": 740256, + "step": 485 + }, + { + "epoch": 1.5131375579598145, + "grad_norm": 0.883548378944397, + "learning_rate": 4.9981564534393545e-05, + "loss": 1.6132, + "num_input_tokens_seen": 747968, + "step": 490 + }, + { + "epoch": 1.528593508500773, + "grad_norm": 0.8526323437690735, + "learning_rate": 4.998118565112272e-05, + "loss": 1.6035, + "num_input_tokens_seen": 756032, + "step": 495 + }, + { + "epoch": 1.544049459041731, + "grad_norm": 0.8508664965629578, + "learning_rate": 4.998080291543914e-05, + "loss": 1.6547, + "num_input_tokens_seen": 763744, + "step": 500 + }, + { + "epoch": 1.5595054095826892, + "grad_norm": 0.9107279777526855, + "learning_rate": 4.9980416327401826e-05, + "loss": 1.5399, + "num_input_tokens_seen": 771648, + "step": 505 + }, + { + "epoch": 1.5749613601236476, + "grad_norm": 0.7748051285743713, + "learning_rate": 4.998002588707038e-05, + "loss": 1.572, + "num_input_tokens_seen": 779808, + "step": 510 + }, + { + "epoch": 1.590417310664606, + "grad_norm": 0.9456562995910645, + "learning_rate": 4.997963159450503e-05, + "loss": 1.5448, + "num_input_tokens_seen": 787296, + "step": 515 + }, + { + "epoch": 1.6058732612055642, + "grad_norm": 1.0259522199630737, + "learning_rate": 4.9979233449766575e-05, + "loss": 1.55, + "num_input_tokens_seen": 794592, + "step": 520 + }, + { + "epoch": 1.6213292117465223, + "grad_norm": 0.8664754033088684, + "learning_rate": 4.997883145291641e-05, + "loss": 1.4951, + "num_input_tokens_seen": 802144, + "step": 525 + }, + { + "epoch": 1.6367851622874807, + "grad_norm": 1.0702552795410156, + "learning_rate": 4.9978425604016536e-05, + "loss": 1.5378, + "num_input_tokens_seen": 809600, + "step": 530 + }, + { + "epoch": 1.652241112828439, + "grad_norm": 1.305601716041565, + "learning_rate": 4.9978015903129536e-05, + "loss": 1.4697, + "num_input_tokens_seen": 816832, + "step": 535 + }, + { + "epoch": 1.6676970633693973, + "grad_norm": 0.7949715852737427, + "learning_rate": 4.997760235031859e-05, + "loss": 1.5593, + "num_input_tokens_seen": 824320, + "step": 540 + }, + { + "epoch": 1.6831530139103554, + "grad_norm": 1.0603950023651123, + "learning_rate": 4.9977184945647473e-05, + "loss": 1.563, + "num_input_tokens_seen": 832960, + "step": 545 + }, + { + "epoch": 1.6986089644513136, + "grad_norm": 0.8376322984695435, + "learning_rate": 4.997676368918055e-05, + "loss": 1.5391, + "num_input_tokens_seen": 840256, + "step": 550 + }, + { + "epoch": 1.714064914992272, + "grad_norm": 0.7568612098693848, + "learning_rate": 4.9976338580982794e-05, + "loss": 1.5185, + "num_input_tokens_seen": 847936, + "step": 555 + }, + { + "epoch": 1.7295208655332304, + "grad_norm": 0.6629490852355957, + "learning_rate": 4.9975909621119755e-05, + "loss": 1.4945, + "num_input_tokens_seen": 855520, + "step": 560 + }, + { + "epoch": 1.7449768160741885, + "grad_norm": 1.1465359926223755, + "learning_rate": 4.997547680965758e-05, + "loss": 1.4469, + "num_input_tokens_seen": 862624, + "step": 565 + }, + { + "epoch": 1.7604327666151467, + "grad_norm": 0.9734804034233093, + "learning_rate": 4.997504014666302e-05, + "loss": 1.4396, + "num_input_tokens_seen": 869952, + "step": 570 + }, + { + "epoch": 1.775888717156105, + "grad_norm": 0.9343405365943909, + "learning_rate": 4.997459963220342e-05, + "loss": 1.538, + "num_input_tokens_seen": 878240, + "step": 575 + }, + { + "epoch": 1.7913446676970635, + "grad_norm": 0.8838407397270203, + "learning_rate": 4.997415526634671e-05, + "loss": 1.4325, + "num_input_tokens_seen": 885472, + "step": 580 + }, + { + "epoch": 1.8068006182380216, + "grad_norm": 0.6326628923416138, + "learning_rate": 4.99737070491614e-05, + "loss": 1.4309, + "num_input_tokens_seen": 892864, + "step": 585 + }, + { + "epoch": 1.8222565687789798, + "grad_norm": 1.136695146560669, + "learning_rate": 4.997325498071663e-05, + "loss": 1.5036, + "num_input_tokens_seen": 901568, + "step": 590 + }, + { + "epoch": 1.8377125193199382, + "grad_norm": 0.5912725329399109, + "learning_rate": 4.997279906108211e-05, + "loss": 1.4186, + "num_input_tokens_seen": 909632, + "step": 595 + }, + { + "epoch": 1.8531684698608966, + "grad_norm": 0.6421916484832764, + "learning_rate": 4.9972339290328155e-05, + "loss": 1.4417, + "num_input_tokens_seen": 917664, + "step": 600 + }, + { + "epoch": 1.8531684698608966, + "eval_loss": 1.41982901096344, + "eval_runtime": 6.2716, + "eval_samples_per_second": 91.683, + "eval_steps_per_second": 22.961, + "num_input_tokens_seen": 917664, + "step": 600 + }, + { + "epoch": 1.8686244204018547, + "grad_norm": 0.9575716853141785, + "learning_rate": 4.9971875668525646e-05, + "loss": 1.4649, + "num_input_tokens_seen": 924992, + "step": 605 + }, + { + "epoch": 1.8840803709428129, + "grad_norm": 1.3434261083602905, + "learning_rate": 4.997140819574609e-05, + "loss": 1.343, + "num_input_tokens_seen": 932480, + "step": 610 + }, + { + "epoch": 1.8995363214837713, + "grad_norm": 0.6652854681015015, + "learning_rate": 4.997093687206159e-05, + "loss": 1.3435, + "num_input_tokens_seen": 940224, + "step": 615 + }, + { + "epoch": 1.9149922720247297, + "grad_norm": 0.6432847380638123, + "learning_rate": 4.997046169754482e-05, + "loss": 1.4195, + "num_input_tokens_seen": 948000, + "step": 620 + }, + { + "epoch": 1.9304482225656878, + "grad_norm": 0.7536585330963135, + "learning_rate": 4.996998267226905e-05, + "loss": 1.3205, + "num_input_tokens_seen": 956064, + "step": 625 + }, + { + "epoch": 1.945904173106646, + "grad_norm": 0.813018262386322, + "learning_rate": 4.996949979630817e-05, + "loss": 1.4783, + "num_input_tokens_seen": 963520, + "step": 630 + }, + { + "epoch": 1.9613601236476044, + "grad_norm": 0.7649021744728088, + "learning_rate": 4.996901306973663e-05, + "loss": 1.3212, + "num_input_tokens_seen": 971200, + "step": 635 + }, + { + "epoch": 1.9768160741885628, + "grad_norm": 0.6755572557449341, + "learning_rate": 4.996852249262949e-05, + "loss": 1.3398, + "num_input_tokens_seen": 979232, + "step": 640 + }, + { + "epoch": 1.992272024729521, + "grad_norm": 0.6340906620025635, + "learning_rate": 4.996802806506241e-05, + "loss": 1.3211, + "num_input_tokens_seen": 986848, + "step": 645 + }, + { + "epoch": 2.006182380216383, + "grad_norm": 0.7200747132301331, + "learning_rate": 4.996752978711164e-05, + "loss": 1.2217, + "num_input_tokens_seen": 992800, + "step": 650 + }, + { + "epoch": 2.021638330757342, + "grad_norm": 1.1105310916900635, + "learning_rate": 4.996702765885401e-05, + "loss": 1.3459, + "num_input_tokens_seen": 1000672, + "step": 655 + }, + { + "epoch": 2.0370942812983, + "grad_norm": 0.8049219250679016, + "learning_rate": 4.9966521680366964e-05, + "loss": 1.4026, + "num_input_tokens_seen": 1008992, + "step": 660 + }, + { + "epoch": 2.052550231839258, + "grad_norm": 0.801409900188446, + "learning_rate": 4.9966011851728524e-05, + "loss": 1.4159, + "num_input_tokens_seen": 1016064, + "step": 665 + }, + { + "epoch": 2.0680061823802163, + "grad_norm": 0.9803038835525513, + "learning_rate": 4.996549817301731e-05, + "loss": 1.3593, + "num_input_tokens_seen": 1024320, + "step": 670 + }, + { + "epoch": 2.0834621329211744, + "grad_norm": 0.8756961822509766, + "learning_rate": 4.9964980644312544e-05, + "loss": 1.3705, + "num_input_tokens_seen": 1031712, + "step": 675 + }, + { + "epoch": 2.098918083462133, + "grad_norm": 0.7279723286628723, + "learning_rate": 4.996445926569403e-05, + "loss": 1.332, + "num_input_tokens_seen": 1039232, + "step": 680 + }, + { + "epoch": 2.114374034003091, + "grad_norm": 0.6435777544975281, + "learning_rate": 4.996393403724218e-05, + "loss": 1.3209, + "num_input_tokens_seen": 1046848, + "step": 685 + }, + { + "epoch": 2.1298299845440494, + "grad_norm": 0.5693140625953674, + "learning_rate": 4.9963404959037985e-05, + "loss": 1.343, + "num_input_tokens_seen": 1054976, + "step": 690 + }, + { + "epoch": 2.1452859350850075, + "grad_norm": 0.5152165293693542, + "learning_rate": 4.996287203116303e-05, + "loss": 1.2214, + "num_input_tokens_seen": 1061920, + "step": 695 + }, + { + "epoch": 2.160741885625966, + "grad_norm": 0.8966767191886902, + "learning_rate": 4.996233525369951e-05, + "loss": 1.3147, + "num_input_tokens_seen": 1069408, + "step": 700 + }, + { + "epoch": 2.1761978361669243, + "grad_norm": 0.6653637886047363, + "learning_rate": 4.99617946267302e-05, + "loss": 1.3505, + "num_input_tokens_seen": 1076864, + "step": 705 + }, + { + "epoch": 2.1916537867078825, + "grad_norm": 0.5810138583183289, + "learning_rate": 4.996125015033846e-05, + "loss": 1.2432, + "num_input_tokens_seen": 1084768, + "step": 710 + }, + { + "epoch": 2.2071097372488406, + "grad_norm": 1.0940895080566406, + "learning_rate": 4.996070182460827e-05, + "loss": 1.2847, + "num_input_tokens_seen": 1092032, + "step": 715 + }, + { + "epoch": 2.2225656877897992, + "grad_norm": 0.8814454078674316, + "learning_rate": 4.996014964962418e-05, + "loss": 1.3464, + "num_input_tokens_seen": 1099648, + "step": 720 + }, + { + "epoch": 2.2380216383307574, + "grad_norm": 0.7026343941688538, + "learning_rate": 4.9959593625471344e-05, + "loss": 1.2957, + "num_input_tokens_seen": 1107648, + "step": 725 + }, + { + "epoch": 2.2534775888717156, + "grad_norm": 0.6694613099098206, + "learning_rate": 4.995903375223552e-05, + "loss": 1.2478, + "num_input_tokens_seen": 1115392, + "step": 730 + }, + { + "epoch": 2.2689335394126737, + "grad_norm": 0.6388654708862305, + "learning_rate": 4.995847003000302e-05, + "loss": 1.2717, + "num_input_tokens_seen": 1122784, + "step": 735 + }, + { + "epoch": 2.2843894899536323, + "grad_norm": 0.7894784808158875, + "learning_rate": 4.9957902458860804e-05, + "loss": 1.3125, + "num_input_tokens_seen": 1130272, + "step": 740 + }, + { + "epoch": 2.2998454404945905, + "grad_norm": 0.5486288070678711, + "learning_rate": 4.995733103889639e-05, + "loss": 1.2427, + "num_input_tokens_seen": 1138784, + "step": 745 + }, + { + "epoch": 2.3153013910355487, + "grad_norm": 0.789227306842804, + "learning_rate": 4.99567557701979e-05, + "loss": 1.2636, + "num_input_tokens_seen": 1146240, + "step": 750 + }, + { + "epoch": 2.330757341576507, + "grad_norm": 0.7120614051818848, + "learning_rate": 4.995617665285403e-05, + "loss": 1.2594, + "num_input_tokens_seen": 1154208, + "step": 755 + }, + { + "epoch": 2.346213292117465, + "grad_norm": 0.8232840299606323, + "learning_rate": 4.99555936869541e-05, + "loss": 1.2632, + "num_input_tokens_seen": 1162112, + "step": 760 + }, + { + "epoch": 2.3616692426584236, + "grad_norm": 0.65553879737854, + "learning_rate": 4.995500687258803e-05, + "loss": 1.2363, + "num_input_tokens_seen": 1169472, + "step": 765 + }, + { + "epoch": 2.3771251931993818, + "grad_norm": 0.8463524580001831, + "learning_rate": 4.995441620984628e-05, + "loss": 1.2596, + "num_input_tokens_seen": 1177024, + "step": 770 + }, + { + "epoch": 2.39258114374034, + "grad_norm": 0.7362673282623291, + "learning_rate": 4.995382169881996e-05, + "loss": 1.1812, + "num_input_tokens_seen": 1184352, + "step": 775 + }, + { + "epoch": 2.4080370942812985, + "grad_norm": 0.7437820434570312, + "learning_rate": 4.9953223339600755e-05, + "loss": 1.2526, + "num_input_tokens_seen": 1192160, + "step": 780 + }, + { + "epoch": 2.4234930448222567, + "grad_norm": 0.7790391445159912, + "learning_rate": 4.995262113228091e-05, + "loss": 1.2776, + "num_input_tokens_seen": 1199744, + "step": 785 + }, + { + "epoch": 2.438948995363215, + "grad_norm": 0.6904948353767395, + "learning_rate": 4.995201507695332e-05, + "loss": 1.2042, + "num_input_tokens_seen": 1207744, + "step": 790 + }, + { + "epoch": 2.454404945904173, + "grad_norm": 0.7213172912597656, + "learning_rate": 4.995140517371144e-05, + "loss": 1.2312, + "num_input_tokens_seen": 1215584, + "step": 795 + }, + { + "epoch": 2.469860896445131, + "grad_norm": 0.930949330329895, + "learning_rate": 4.995079142264932e-05, + "loss": 1.2241, + "num_input_tokens_seen": 1223104, + "step": 800 + }, + { + "epoch": 2.469860896445131, + "eval_loss": 1.2245581150054932, + "eval_runtime": 6.2726, + "eval_samples_per_second": 91.668, + "eval_steps_per_second": 22.957, + "num_input_tokens_seen": 1223104, + "step": 800 + }, + { + "epoch": 2.48531684698609, + "grad_norm": 0.6925356388092041, + "learning_rate": 4.995017382386162e-05, + "loss": 1.1981, + "num_input_tokens_seen": 1230720, + "step": 805 + }, + { + "epoch": 2.500772797527048, + "grad_norm": 0.7306983470916748, + "learning_rate": 4.994955237744356e-05, + "loss": 1.1711, + "num_input_tokens_seen": 1238304, + "step": 810 + }, + { + "epoch": 2.516228748068006, + "grad_norm": 0.7554611563682556, + "learning_rate": 4.994892708349101e-05, + "loss": 1.1596, + "num_input_tokens_seen": 1246208, + "step": 815 + }, + { + "epoch": 2.5316846986089647, + "grad_norm": 0.6808359622955322, + "learning_rate": 4.994829794210035e-05, + "loss": 1.1826, + "num_input_tokens_seen": 1254080, + "step": 820 + }, + { + "epoch": 2.547140649149923, + "grad_norm": 1.303930401802063, + "learning_rate": 4.994766495336864e-05, + "loss": 1.194, + "num_input_tokens_seen": 1261184, + "step": 825 + }, + { + "epoch": 2.562596599690881, + "grad_norm": 0.7494289875030518, + "learning_rate": 4.994702811739348e-05, + "loss": 1.2613, + "num_input_tokens_seen": 1269152, + "step": 830 + }, + { + "epoch": 2.578052550231839, + "grad_norm": 0.561695396900177, + "learning_rate": 4.994638743427308e-05, + "loss": 1.1929, + "num_input_tokens_seen": 1276704, + "step": 835 + }, + { + "epoch": 2.5935085007727974, + "grad_norm": 0.701637864112854, + "learning_rate": 4.994574290410624e-05, + "loss": 1.2094, + "num_input_tokens_seen": 1284128, + "step": 840 + }, + { + "epoch": 2.6089644513137555, + "grad_norm": 0.6511617302894592, + "learning_rate": 4.9945094526992364e-05, + "loss": 1.2048, + "num_input_tokens_seen": 1291968, + "step": 845 + }, + { + "epoch": 2.624420401854714, + "grad_norm": 0.6652617454528809, + "learning_rate": 4.994444230303142e-05, + "loss": 1.1619, + "num_input_tokens_seen": 1299712, + "step": 850 + }, + { + "epoch": 2.6398763523956723, + "grad_norm": 0.8844426870346069, + "learning_rate": 4.994378623232402e-05, + "loss": 1.1997, + "num_input_tokens_seen": 1307072, + "step": 855 + }, + { + "epoch": 2.6553323029366305, + "grad_norm": 0.7097378373146057, + "learning_rate": 4.99431263149713e-05, + "loss": 1.1413, + "num_input_tokens_seen": 1314784, + "step": 860 + }, + { + "epoch": 2.670788253477589, + "grad_norm": 0.6142622828483582, + "learning_rate": 4.9942462551075056e-05, + "loss": 1.0687, + "num_input_tokens_seen": 1322816, + "step": 865 + }, + { + "epoch": 2.6862442040185472, + "grad_norm": 0.5880860686302185, + "learning_rate": 4.994179494073764e-05, + "loss": 1.1217, + "num_input_tokens_seen": 1330720, + "step": 870 + }, + { + "epoch": 2.7017001545595054, + "grad_norm": 0.6646177172660828, + "learning_rate": 4.9941123484062e-05, + "loss": 1.1571, + "num_input_tokens_seen": 1337952, + "step": 875 + }, + { + "epoch": 2.7171561051004636, + "grad_norm": 0.6729475855827332, + "learning_rate": 4.99404481811517e-05, + "loss": 1.1806, + "num_input_tokens_seen": 1346016, + "step": 880 + }, + { + "epoch": 2.7326120556414217, + "grad_norm": 0.9198416471481323, + "learning_rate": 4.9939769032110864e-05, + "loss": 1.2203, + "num_input_tokens_seen": 1353376, + "step": 885 + }, + { + "epoch": 2.7480680061823803, + "grad_norm": 0.8325666189193726, + "learning_rate": 4.993908603704423e-05, + "loss": 1.1477, + "num_input_tokens_seen": 1360992, + "step": 890 + }, + { + "epoch": 2.7635239567233385, + "grad_norm": 0.8375571370124817, + "learning_rate": 4.9938399196057126e-05, + "loss": 1.148, + "num_input_tokens_seen": 1368512, + "step": 895 + }, + { + "epoch": 2.7789799072642967, + "grad_norm": 0.6589453816413879, + "learning_rate": 4.993770850925547e-05, + "loss": 1.2036, + "num_input_tokens_seen": 1376768, + "step": 900 + }, + { + "epoch": 2.7944358578052553, + "grad_norm": 0.6692481637001038, + "learning_rate": 4.993701397674577e-05, + "loss": 1.1693, + "num_input_tokens_seen": 1384864, + "step": 905 + }, + { + "epoch": 2.8098918083462134, + "grad_norm": 0.6885107159614563, + "learning_rate": 4.993631559863515e-05, + "loss": 1.1921, + "num_input_tokens_seen": 1392512, + "step": 910 + }, + { + "epoch": 2.8253477588871716, + "grad_norm": 0.9478461146354675, + "learning_rate": 4.9935613375031283e-05, + "loss": 1.13, + "num_input_tokens_seen": 1400160, + "step": 915 + }, + { + "epoch": 2.8408037094281298, + "grad_norm": 0.7574657201766968, + "learning_rate": 4.993490730604248e-05, + "loss": 1.1946, + "num_input_tokens_seen": 1407456, + "step": 920 + }, + { + "epoch": 2.856259659969088, + "grad_norm": 0.6848726868629456, + "learning_rate": 4.993419739177761e-05, + "loss": 1.1566, + "num_input_tokens_seen": 1415712, + "step": 925 + }, + { + "epoch": 2.871715610510046, + "grad_norm": 0.7456622123718262, + "learning_rate": 4.9933483632346164e-05, + "loss": 1.1617, + "num_input_tokens_seen": 1423584, + "step": 930 + }, + { + "epoch": 2.8871715610510047, + "grad_norm": 0.6260982155799866, + "learning_rate": 4.993276602785821e-05, + "loss": 1.1666, + "num_input_tokens_seen": 1430848, + "step": 935 + }, + { + "epoch": 2.902627511591963, + "grad_norm": 0.5619131326675415, + "learning_rate": 4.993204457842441e-05, + "loss": 1.1045, + "num_input_tokens_seen": 1438496, + "step": 940 + }, + { + "epoch": 2.918083462132921, + "grad_norm": 0.5645473003387451, + "learning_rate": 4.993131928415602e-05, + "loss": 1.1106, + "num_input_tokens_seen": 1445952, + "step": 945 + }, + { + "epoch": 2.9335394126738796, + "grad_norm": 0.7002622485160828, + "learning_rate": 4.993059014516489e-05, + "loss": 1.1379, + "num_input_tokens_seen": 1453184, + "step": 950 + }, + { + "epoch": 2.948995363214838, + "grad_norm": 0.8259776830673218, + "learning_rate": 4.9929857161563464e-05, + "loss": 1.1821, + "num_input_tokens_seen": 1461664, + "step": 955 + }, + { + "epoch": 2.964451313755796, + "grad_norm": 0.5031753182411194, + "learning_rate": 4.992912033346477e-05, + "loss": 1.1216, + "num_input_tokens_seen": 1468736, + "step": 960 + }, + { + "epoch": 2.979907264296754, + "grad_norm": 0.898300051689148, + "learning_rate": 4.992837966098245e-05, + "loss": 1.0682, + "num_input_tokens_seen": 1476288, + "step": 965 + }, + { + "epoch": 2.9953632148377123, + "grad_norm": 0.5490151047706604, + "learning_rate": 4.992763514423071e-05, + "loss": 1.1081, + "num_input_tokens_seen": 1483840, + "step": 970 + }, + { + "epoch": 3.009273570324575, + "grad_norm": 0.726442277431488, + "learning_rate": 4.992688678332437e-05, + "loss": 1.2246, + "num_input_tokens_seen": 1490832, + "step": 975 + }, + { + "epoch": 3.024729520865533, + "grad_norm": 0.6793272495269775, + "learning_rate": 4.992613457837884e-05, + "loss": 1.0972, + "num_input_tokens_seen": 1497840, + "step": 980 + }, + { + "epoch": 3.0401854714064913, + "grad_norm": 0.5567182302474976, + "learning_rate": 4.992537852951011e-05, + "loss": 0.9866, + "num_input_tokens_seen": 1505520, + "step": 985 + }, + { + "epoch": 3.05564142194745, + "grad_norm": 0.5790271162986755, + "learning_rate": 4.9924618636834785e-05, + "loss": 1.0185, + "num_input_tokens_seen": 1513392, + "step": 990 + }, + { + "epoch": 3.071097372488408, + "grad_norm": 0.5449028015136719, + "learning_rate": 4.9923854900470046e-05, + "loss": 1.157, + "num_input_tokens_seen": 1521008, + "step": 995 + }, + { + "epoch": 3.0865533230293662, + "grad_norm": 0.9423348903656006, + "learning_rate": 4.992308732053367e-05, + "loss": 1.0684, + "num_input_tokens_seen": 1528432, + "step": 1000 + }, + { + "epoch": 3.0865533230293662, + "eval_loss": 1.0838674306869507, + "eval_runtime": 6.3154, + "eval_samples_per_second": 91.047, + "eval_steps_per_second": 22.801, + "num_input_tokens_seen": 1528432, + "step": 1000 + }, + { + "epoch": 3.1020092735703244, + "grad_norm": 0.7008082270622253, + "learning_rate": 4.992231589714402e-05, + "loss": 1.0789, + "num_input_tokens_seen": 1535920, + "step": 1005 + }, + { + "epoch": 3.117465224111283, + "grad_norm": 0.7734846472740173, + "learning_rate": 4.992154063042007e-05, + "loss": 1.0399, + "num_input_tokens_seen": 1544496, + "step": 1010 + }, + { + "epoch": 3.132921174652241, + "grad_norm": 0.6289916634559631, + "learning_rate": 4.992076152048136e-05, + "loss": 1.075, + "num_input_tokens_seen": 1552176, + "step": 1015 + }, + { + "epoch": 3.1483771251931993, + "grad_norm": 0.9545561075210571, + "learning_rate": 4.991997856744807e-05, + "loss": 1.0902, + "num_input_tokens_seen": 1559440, + "step": 1020 + }, + { + "epoch": 3.1638330757341575, + "grad_norm": 0.7472225427627563, + "learning_rate": 4.9919191771440905e-05, + "loss": 1.0898, + "num_input_tokens_seen": 1566800, + "step": 1025 + }, + { + "epoch": 3.179289026275116, + "grad_norm": 0.6217725276947021, + "learning_rate": 4.991840113258122e-05, + "loss": 1.0061, + "num_input_tokens_seen": 1574736, + "step": 1030 + }, + { + "epoch": 3.1947449768160743, + "grad_norm": 0.5926253199577332, + "learning_rate": 4.9917606650990933e-05, + "loss": 1.0647, + "num_input_tokens_seen": 1582512, + "step": 1035 + }, + { + "epoch": 3.2102009273570324, + "grad_norm": 0.6413880586624146, + "learning_rate": 4.9916808326792566e-05, + "loss": 1.0982, + "num_input_tokens_seen": 1590160, + "step": 1040 + }, + { + "epoch": 3.2256568778979906, + "grad_norm": 0.6426993012428284, + "learning_rate": 4.9916006160109235e-05, + "loss": 1.1267, + "num_input_tokens_seen": 1597968, + "step": 1045 + }, + { + "epoch": 3.2411128284389488, + "grad_norm": 0.6985977292060852, + "learning_rate": 4.991520015106464e-05, + "loss": 1.0722, + "num_input_tokens_seen": 1606128, + "step": 1050 + }, + { + "epoch": 3.2565687789799074, + "grad_norm": 0.5918837189674377, + "learning_rate": 4.991439029978308e-05, + "loss": 1.0408, + "num_input_tokens_seen": 1613424, + "step": 1055 + }, + { + "epoch": 3.2720247295208655, + "grad_norm": 0.6555137038230896, + "learning_rate": 4.9913576606389434e-05, + "loss": 1.0942, + "num_input_tokens_seen": 1621072, + "step": 1060 + }, + { + "epoch": 3.2874806800618237, + "grad_norm": 0.6201384663581848, + "learning_rate": 4.991275907100919e-05, + "loss": 1.0275, + "num_input_tokens_seen": 1629136, + "step": 1065 + }, + { + "epoch": 3.3029366306027823, + "grad_norm": 0.634002685546875, + "learning_rate": 4.9911937693768434e-05, + "loss": 1.0255, + "num_input_tokens_seen": 1636560, + "step": 1070 + }, + { + "epoch": 3.3183925811437405, + "grad_norm": 0.5446100234985352, + "learning_rate": 4.991111247479382e-05, + "loss": 1.0386, + "num_input_tokens_seen": 1644560, + "step": 1075 + }, + { + "epoch": 3.3338485316846986, + "grad_norm": 0.5539417862892151, + "learning_rate": 4.9910283414212605e-05, + "loss": 1.0754, + "num_input_tokens_seen": 1652144, + "step": 1080 + }, + { + "epoch": 3.349304482225657, + "grad_norm": 0.7709252238273621, + "learning_rate": 4.990945051215265e-05, + "loss": 1.0146, + "num_input_tokens_seen": 1659792, + "step": 1085 + }, + { + "epoch": 3.364760432766615, + "grad_norm": 0.7384884357452393, + "learning_rate": 4.99086137687424e-05, + "loss": 1.0617, + "num_input_tokens_seen": 1667152, + "step": 1090 + }, + { + "epoch": 3.3802163833075736, + "grad_norm": 0.708892822265625, + "learning_rate": 4.9907773184110874e-05, + "loss": 1.0908, + "num_input_tokens_seen": 1675408, + "step": 1095 + }, + { + "epoch": 3.3956723338485317, + "grad_norm": 0.534225583076477, + "learning_rate": 4.9906928758387715e-05, + "loss": 1.0014, + "num_input_tokens_seen": 1683152, + "step": 1100 + }, + { + "epoch": 3.41112828438949, + "grad_norm": 0.5688941478729248, + "learning_rate": 4.9906080491703146e-05, + "loss": 0.9942, + "num_input_tokens_seen": 1690928, + "step": 1105 + }, + { + "epoch": 3.426584234930448, + "grad_norm": 0.6129929423332214, + "learning_rate": 4.990522838418797e-05, + "loss": 1.016, + "num_input_tokens_seen": 1698736, + "step": 1110 + }, + { + "epoch": 3.4420401854714067, + "grad_norm": 0.7378246188163757, + "learning_rate": 4.9904372435973604e-05, + "loss": 1.0386, + "num_input_tokens_seen": 1707120, + "step": 1115 + }, + { + "epoch": 3.457496136012365, + "grad_norm": 0.8215299844741821, + "learning_rate": 4.990351264719203e-05, + "loss": 0.9559, + "num_input_tokens_seen": 1714832, + "step": 1120 + }, + { + "epoch": 3.472952086553323, + "grad_norm": 0.6235546469688416, + "learning_rate": 4.990264901797586e-05, + "loss": 1.0636, + "num_input_tokens_seen": 1722832, + "step": 1125 + }, + { + "epoch": 3.488408037094281, + "grad_norm": 0.6443334817886353, + "learning_rate": 4.990178154845826e-05, + "loss": 0.9963, + "num_input_tokens_seen": 1730992, + "step": 1130 + }, + { + "epoch": 3.5038639876352393, + "grad_norm": 0.708801805973053, + "learning_rate": 4.9900910238773014e-05, + "loss": 0.9683, + "num_input_tokens_seen": 1738672, + "step": 1135 + }, + { + "epoch": 3.519319938176198, + "grad_norm": 0.6079695820808411, + "learning_rate": 4.990003508905448e-05, + "loss": 1.0271, + "num_input_tokens_seen": 1746544, + "step": 1140 + }, + { + "epoch": 3.534775888717156, + "grad_norm": 0.7194398641586304, + "learning_rate": 4.989915609943763e-05, + "loss": 1.0474, + "num_input_tokens_seen": 1753744, + "step": 1145 + }, + { + "epoch": 3.5502318392581143, + "grad_norm": 0.6236816048622131, + "learning_rate": 4.9898273270058e-05, + "loss": 1.03, + "num_input_tokens_seen": 1761552, + "step": 1150 + }, + { + "epoch": 3.565687789799073, + "grad_norm": 0.6094138026237488, + "learning_rate": 4.989738660105174e-05, + "loss": 0.8669, + "num_input_tokens_seen": 1768848, + "step": 1155 + }, + { + "epoch": 3.581143740340031, + "grad_norm": 0.6223769187927246, + "learning_rate": 4.989649609255559e-05, + "loss": 0.8783, + "num_input_tokens_seen": 1776400, + "step": 1160 + }, + { + "epoch": 3.596599690880989, + "grad_norm": 0.6641162633895874, + "learning_rate": 4.989560174470687e-05, + "loss": 0.9802, + "num_input_tokens_seen": 1783984, + "step": 1165 + }, + { + "epoch": 3.6120556414219473, + "grad_norm": 0.5471371412277222, + "learning_rate": 4.989470355764351e-05, + "loss": 0.9973, + "num_input_tokens_seen": 1792240, + "step": 1170 + }, + { + "epoch": 3.6275115919629055, + "grad_norm": 0.6383033394813538, + "learning_rate": 4.9893801531504e-05, + "loss": 0.9875, + "num_input_tokens_seen": 1799344, + "step": 1175 + }, + { + "epoch": 3.642967542503864, + "grad_norm": 0.9796339273452759, + "learning_rate": 4.9892895666427475e-05, + "loss": 0.9917, + "num_input_tokens_seen": 1807216, + "step": 1180 + }, + { + "epoch": 3.6584234930448223, + "grad_norm": 0.8313068747520447, + "learning_rate": 4.9891985962553606e-05, + "loss": 0.9561, + "num_input_tokens_seen": 1814992, + "step": 1185 + }, + { + "epoch": 3.6738794435857804, + "grad_norm": 0.5542605519294739, + "learning_rate": 4.989107242002269e-05, + "loss": 0.881, + "num_input_tokens_seen": 1822480, + "step": 1190 + }, + { + "epoch": 3.689335394126739, + "grad_norm": 0.7559302449226379, + "learning_rate": 4.989015503897561e-05, + "loss": 0.9469, + "num_input_tokens_seen": 1830224, + "step": 1195 + }, + { + "epoch": 3.704791344667697, + "grad_norm": 0.6799903512001038, + "learning_rate": 4.988923381955383e-05, + "loss": 1.0292, + "num_input_tokens_seen": 1837520, + "step": 1200 + }, + { + "epoch": 3.704791344667697, + "eval_loss": 0.9669994711875916, + "eval_runtime": 6.333, + "eval_samples_per_second": 90.794, + "eval_steps_per_second": 22.738, + "num_input_tokens_seen": 1837520, + "step": 1200 + }, + { + "epoch": 3.7202472952086554, + "grad_norm": 0.6588506102561951, + "learning_rate": 4.988830876189942e-05, + "loss": 0.9612, + "num_input_tokens_seen": 1844976, + "step": 1205 + }, + { + "epoch": 3.7357032457496135, + "grad_norm": 0.8796734809875488, + "learning_rate": 4.988737986615503e-05, + "loss": 0.9605, + "num_input_tokens_seen": 1852688, + "step": 1210 + }, + { + "epoch": 3.7511591962905717, + "grad_norm": 0.6363288760185242, + "learning_rate": 4.988644713246391e-05, + "loss": 0.9581, + "num_input_tokens_seen": 1860144, + "step": 1215 + }, + { + "epoch": 3.76661514683153, + "grad_norm": 0.6527089476585388, + "learning_rate": 4.988551056096991e-05, + "loss": 0.9287, + "num_input_tokens_seen": 1867856, + "step": 1220 + }, + { + "epoch": 3.7820710973724885, + "grad_norm": 0.787335991859436, + "learning_rate": 4.988457015181743e-05, + "loss": 0.857, + "num_input_tokens_seen": 1875632, + "step": 1225 + }, + { + "epoch": 3.7975270479134466, + "grad_norm": 0.7233056426048279, + "learning_rate": 4.988362590515153e-05, + "loss": 0.9075, + "num_input_tokens_seen": 1883728, + "step": 1230 + }, + { + "epoch": 3.812982998454405, + "grad_norm": 0.9463915228843689, + "learning_rate": 4.9882677821117805e-05, + "loss": 0.9017, + "num_input_tokens_seen": 1891152, + "step": 1235 + }, + { + "epoch": 3.8284389489953634, + "grad_norm": 0.638397753238678, + "learning_rate": 4.988172589986246e-05, + "loss": 0.9859, + "num_input_tokens_seen": 1898288, + "step": 1240 + }, + { + "epoch": 3.8438948995363216, + "grad_norm": 0.8164963722229004, + "learning_rate": 4.9880770141532304e-05, + "loss": 0.917, + "num_input_tokens_seen": 1905936, + "step": 1245 + }, + { + "epoch": 3.8593508500772797, + "grad_norm": 0.755261242389679, + "learning_rate": 4.987981054627472e-05, + "loss": 0.9437, + "num_input_tokens_seen": 1913392, + "step": 1250 + }, + { + "epoch": 3.874806800618238, + "grad_norm": 0.6247695088386536, + "learning_rate": 4.987884711423769e-05, + "loss": 0.9652, + "num_input_tokens_seen": 1921136, + "step": 1255 + }, + { + "epoch": 3.890262751159196, + "grad_norm": 0.5899882316589355, + "learning_rate": 4.9877879845569784e-05, + "loss": 0.9488, + "num_input_tokens_seen": 1928688, + "step": 1260 + }, + { + "epoch": 3.9057187017001547, + "grad_norm": 0.6696059703826904, + "learning_rate": 4.9876908740420175e-05, + "loss": 0.939, + "num_input_tokens_seen": 1936496, + "step": 1265 + }, + { + "epoch": 3.921174652241113, + "grad_norm": 0.5409762263298035, + "learning_rate": 4.987593379893861e-05, + "loss": 0.9308, + "num_input_tokens_seen": 1944112, + "step": 1270 + }, + { + "epoch": 3.936630602782071, + "grad_norm": 0.4988725185394287, + "learning_rate": 4.987495502127545e-05, + "loss": 0.8969, + "num_input_tokens_seen": 1951888, + "step": 1275 + }, + { + "epoch": 3.9520865533230296, + "grad_norm": 0.737087607383728, + "learning_rate": 4.987397240758162e-05, + "loss": 0.907, + "num_input_tokens_seen": 1959504, + "step": 1280 + }, + { + "epoch": 3.9675425038639878, + "grad_norm": 0.7583689093589783, + "learning_rate": 4.9872985958008664e-05, + "loss": 0.928, + "num_input_tokens_seen": 1967184, + "step": 1285 + }, + { + "epoch": 3.982998454404946, + "grad_norm": 0.6505270004272461, + "learning_rate": 4.987199567270871e-05, + "loss": 0.8162, + "num_input_tokens_seen": 1974416, + "step": 1290 + }, + { + "epoch": 3.998454404945904, + "grad_norm": 0.7423642873764038, + "learning_rate": 4.9871001551834444e-05, + "loss": 0.8612, + "num_input_tokens_seen": 1982352, + "step": 1295 + }, + { + "epoch": 4.012364760432766, + "grad_norm": 0.8109255433082581, + "learning_rate": 4.98700035955392e-05, + "loss": 0.895, + "num_input_tokens_seen": 1989328, + "step": 1300 + }, + { + "epoch": 4.0278207109737245, + "grad_norm": 0.6302828788757324, + "learning_rate": 4.986900180397686e-05, + "loss": 0.8594, + "num_input_tokens_seen": 1996976, + "step": 1305 + }, + { + "epoch": 4.043276661514684, + "grad_norm": 0.7109437584877014, + "learning_rate": 4.9867996177301926e-05, + "loss": 0.9029, + "num_input_tokens_seen": 2004592, + "step": 1310 + }, + { + "epoch": 4.058732612055642, + "grad_norm": 0.9484000205993652, + "learning_rate": 4.9866986715669464e-05, + "loss": 0.968, + "num_input_tokens_seen": 2012368, + "step": 1315 + }, + { + "epoch": 4.0741885625966, + "grad_norm": 0.7501100897789001, + "learning_rate": 4.9865973419235155e-05, + "loss": 0.8508, + "num_input_tokens_seen": 2019952, + "step": 1320 + }, + { + "epoch": 4.089644513137558, + "grad_norm": 0.545806884765625, + "learning_rate": 4.986495628815526e-05, + "loss": 0.9556, + "num_input_tokens_seen": 2027696, + "step": 1325 + }, + { + "epoch": 4.105100463678516, + "grad_norm": 0.6976214647293091, + "learning_rate": 4.986393532258663e-05, + "loss": 0.8294, + "num_input_tokens_seen": 2035120, + "step": 1330 + }, + { + "epoch": 4.120556414219474, + "grad_norm": 0.6060914397239685, + "learning_rate": 4.986291052268671e-05, + "loss": 0.8721, + "num_input_tokens_seen": 2042992, + "step": 1335 + }, + { + "epoch": 4.1360123647604325, + "grad_norm": 0.7804686427116394, + "learning_rate": 4.986188188861355e-05, + "loss": 0.8769, + "num_input_tokens_seen": 2050832, + "step": 1340 + }, + { + "epoch": 4.151468315301391, + "grad_norm": 0.6583150029182434, + "learning_rate": 4.9860849420525766e-05, + "loss": 0.8888, + "num_input_tokens_seen": 2058512, + "step": 1345 + }, + { + "epoch": 4.166924265842349, + "grad_norm": 0.730006754398346, + "learning_rate": 4.9859813118582575e-05, + "loss": 0.8415, + "num_input_tokens_seen": 2066512, + "step": 1350 + }, + { + "epoch": 4.182380216383308, + "grad_norm": 0.7349391579627991, + "learning_rate": 4.98587729829438e-05, + "loss": 0.8927, + "num_input_tokens_seen": 2074608, + "step": 1355 + }, + { + "epoch": 4.197836166924266, + "grad_norm": 0.6353805661201477, + "learning_rate": 4.985772901376983e-05, + "loss": 0.905, + "num_input_tokens_seen": 2082576, + "step": 1360 + }, + { + "epoch": 4.213292117465224, + "grad_norm": 0.6864052414894104, + "learning_rate": 4.9856681211221666e-05, + "loss": 0.8421, + "num_input_tokens_seen": 2090800, + "step": 1365 + }, + { + "epoch": 4.228748068006182, + "grad_norm": 0.6439478993415833, + "learning_rate": 4.985562957546089e-05, + "loss": 0.8405, + "num_input_tokens_seen": 2098128, + "step": 1370 + }, + { + "epoch": 4.244204018547141, + "grad_norm": 0.7152281403541565, + "learning_rate": 4.9854574106649686e-05, + "loss": 0.787, + "num_input_tokens_seen": 2105264, + "step": 1375 + }, + { + "epoch": 4.259659969088099, + "grad_norm": 0.8114583492279053, + "learning_rate": 4.985351480495081e-05, + "loss": 0.8741, + "num_input_tokens_seen": 2112464, + "step": 1380 + }, + { + "epoch": 4.275115919629057, + "grad_norm": 0.6604759693145752, + "learning_rate": 4.985245167052762e-05, + "loss": 0.7739, + "num_input_tokens_seen": 2119952, + "step": 1385 + }, + { + "epoch": 4.290571870170015, + "grad_norm": 0.6790342926979065, + "learning_rate": 4.9851384703544066e-05, + "loss": 0.9557, + "num_input_tokens_seen": 2127728, + "step": 1390 + }, + { + "epoch": 4.306027820710974, + "grad_norm": 0.575843870639801, + "learning_rate": 4.985031390416469e-05, + "loss": 0.7719, + "num_input_tokens_seen": 2136080, + "step": 1395 + }, + { + "epoch": 4.321483771251932, + "grad_norm": 0.6255269050598145, + "learning_rate": 4.984923927255461e-05, + "loss": 0.8334, + "num_input_tokens_seen": 2143216, + "step": 1400 + }, + { + "epoch": 4.321483771251932, + "eval_loss": 0.8560550212860107, + "eval_runtime": 6.309, + "eval_samples_per_second": 91.14, + "eval_steps_per_second": 22.825, + "num_input_tokens_seen": 2143216, + "step": 1400 + }, + { + "epoch": 4.3369397217928904, + "grad_norm": 0.6663662195205688, + "learning_rate": 4.984816080887958e-05, + "loss": 0.8621, + "num_input_tokens_seen": 2151312, + "step": 1405 + }, + { + "epoch": 4.352395672333849, + "grad_norm": 0.653238832950592, + "learning_rate": 4.9847078513305875e-05, + "loss": 0.8399, + "num_input_tokens_seen": 2158512, + "step": 1410 + }, + { + "epoch": 4.367851622874807, + "grad_norm": 0.5841424465179443, + "learning_rate": 4.984599238600043e-05, + "loss": 0.8207, + "num_input_tokens_seen": 2166256, + "step": 1415 + }, + { + "epoch": 4.383307573415765, + "grad_norm": 0.7210288047790527, + "learning_rate": 4.9844902427130716e-05, + "loss": 0.8324, + "num_input_tokens_seen": 2173584, + "step": 1420 + }, + { + "epoch": 4.398763523956723, + "grad_norm": 0.5167375802993774, + "learning_rate": 4.984380863686482e-05, + "loss": 0.8727, + "num_input_tokens_seen": 2181936, + "step": 1425 + }, + { + "epoch": 4.414219474497681, + "grad_norm": 0.7309324741363525, + "learning_rate": 4.984271101537143e-05, + "loss": 0.9149, + "num_input_tokens_seen": 2189776, + "step": 1430 + }, + { + "epoch": 4.42967542503864, + "grad_norm": 0.6534067392349243, + "learning_rate": 4.9841609562819816e-05, + "loss": 0.6675, + "num_input_tokens_seen": 2197104, + "step": 1435 + }, + { + "epoch": 4.4451313755795985, + "grad_norm": 0.9155667424201965, + "learning_rate": 4.984050427937983e-05, + "loss": 0.8519, + "num_input_tokens_seen": 2204816, + "step": 1440 + }, + { + "epoch": 4.460587326120557, + "grad_norm": 0.7735638618469238, + "learning_rate": 4.983939516522191e-05, + "loss": 0.842, + "num_input_tokens_seen": 2213104, + "step": 1445 + }, + { + "epoch": 4.476043276661515, + "grad_norm": 0.6521425247192383, + "learning_rate": 4.983828222051711e-05, + "loss": 0.7268, + "num_input_tokens_seen": 2220816, + "step": 1450 + }, + { + "epoch": 4.491499227202473, + "grad_norm": 0.7868285179138184, + "learning_rate": 4.983716544543705e-05, + "loss": 0.8405, + "num_input_tokens_seen": 2228176, + "step": 1455 + }, + { + "epoch": 4.506955177743431, + "grad_norm": 0.6431261897087097, + "learning_rate": 4.983604484015395e-05, + "loss": 0.8034, + "num_input_tokens_seen": 2235984, + "step": 1460 + }, + { + "epoch": 4.522411128284389, + "grad_norm": 0.6892287135124207, + "learning_rate": 4.983492040484064e-05, + "loss": 0.7457, + "num_input_tokens_seen": 2243472, + "step": 1465 + }, + { + "epoch": 4.5378670788253475, + "grad_norm": 0.8452478051185608, + "learning_rate": 4.98337921396705e-05, + "loss": 0.7907, + "num_input_tokens_seen": 2251280, + "step": 1470 + }, + { + "epoch": 4.553323029366306, + "grad_norm": 0.697215735912323, + "learning_rate": 4.983266004481753e-05, + "loss": 0.7778, + "num_input_tokens_seen": 2258960, + "step": 1475 + }, + { + "epoch": 4.568778979907265, + "grad_norm": 0.5976248383522034, + "learning_rate": 4.9831524120456316e-05, + "loss": 0.7684, + "num_input_tokens_seen": 2265872, + "step": 1480 + }, + { + "epoch": 4.584234930448223, + "grad_norm": 0.5541280508041382, + "learning_rate": 4.9830384366762026e-05, + "loss": 0.8018, + "num_input_tokens_seen": 2274096, + "step": 1485 + }, + { + "epoch": 4.599690880989181, + "grad_norm": 0.7426663041114807, + "learning_rate": 4.9829240783910436e-05, + "loss": 0.8832, + "num_input_tokens_seen": 2282768, + "step": 1490 + }, + { + "epoch": 4.615146831530139, + "grad_norm": 0.5671806335449219, + "learning_rate": 4.982809337207789e-05, + "loss": 0.7434, + "num_input_tokens_seen": 2290192, + "step": 1495 + }, + { + "epoch": 4.630602782071097, + "grad_norm": 0.8463546633720398, + "learning_rate": 4.9826942131441337e-05, + "loss": 0.7966, + "num_input_tokens_seen": 2297904, + "step": 1500 + }, + { + "epoch": 4.6460587326120555, + "grad_norm": 0.7348947525024414, + "learning_rate": 4.9825787062178315e-05, + "loss": 0.7898, + "num_input_tokens_seen": 2305808, + "step": 1505 + }, + { + "epoch": 4.661514683153014, + "grad_norm": 0.8612911105155945, + "learning_rate": 4.9824628164466945e-05, + "loss": 0.7716, + "num_input_tokens_seen": 2312784, + "step": 1510 + }, + { + "epoch": 4.676970633693972, + "grad_norm": 0.602661669254303, + "learning_rate": 4.982346543848595e-05, + "loss": 0.7264, + "num_input_tokens_seen": 2319856, + "step": 1515 + }, + { + "epoch": 4.69242658423493, + "grad_norm": 0.6582660675048828, + "learning_rate": 4.9822298884414626e-05, + "loss": 0.7238, + "num_input_tokens_seen": 2327184, + "step": 1520 + }, + { + "epoch": 4.707882534775889, + "grad_norm": 0.6654687523841858, + "learning_rate": 4.982112850243288e-05, + "loss": 0.7808, + "num_input_tokens_seen": 2334352, + "step": 1525 + }, + { + "epoch": 4.723338485316847, + "grad_norm": 0.6693872213363647, + "learning_rate": 4.98199542927212e-05, + "loss": 0.9103, + "num_input_tokens_seen": 2341648, + "step": 1530 + }, + { + "epoch": 4.738794435857805, + "grad_norm": 0.71971195936203, + "learning_rate": 4.981877625546066e-05, + "loss": 0.7492, + "num_input_tokens_seen": 2349328, + "step": 1535 + }, + { + "epoch": 4.7542503863987635, + "grad_norm": 0.7917207479476929, + "learning_rate": 4.981759439083293e-05, + "loss": 0.8244, + "num_input_tokens_seen": 2356752, + "step": 1540 + }, + { + "epoch": 4.769706336939722, + "grad_norm": 0.8877680897712708, + "learning_rate": 4.981640869902027e-05, + "loss": 0.7279, + "num_input_tokens_seen": 2363792, + "step": 1545 + }, + { + "epoch": 4.78516228748068, + "grad_norm": 0.7114736437797546, + "learning_rate": 4.9815219180205517e-05, + "loss": 0.7164, + "num_input_tokens_seen": 2371120, + "step": 1550 + }, + { + "epoch": 4.800618238021638, + "grad_norm": 0.7424584031105042, + "learning_rate": 4.9814025834572126e-05, + "loss": 0.8474, + "num_input_tokens_seen": 2378672, + "step": 1555 + }, + { + "epoch": 4.816074188562597, + "grad_norm": 0.8733319044113159, + "learning_rate": 4.981282866230411e-05, + "loss": 0.7187, + "num_input_tokens_seen": 2386192, + "step": 1560 + }, + { + "epoch": 4.831530139103555, + "grad_norm": 0.6201744079589844, + "learning_rate": 4.981162766358611e-05, + "loss": 0.7179, + "num_input_tokens_seen": 2393456, + "step": 1565 + }, + { + "epoch": 4.846986089644513, + "grad_norm": 0.6635327935218811, + "learning_rate": 4.9810422838603316e-05, + "loss": 0.7616, + "num_input_tokens_seen": 2401488, + "step": 1570 + }, + { + "epoch": 4.8624420401854715, + "grad_norm": 0.6215106248855591, + "learning_rate": 4.9809214187541533e-05, + "loss": 0.7266, + "num_input_tokens_seen": 2408976, + "step": 1575 + }, + { + "epoch": 4.87789799072643, + "grad_norm": 0.5256595015525818, + "learning_rate": 4.980800171058715e-05, + "loss": 0.6854, + "num_input_tokens_seen": 2416848, + "step": 1580 + }, + { + "epoch": 4.893353941267388, + "grad_norm": 0.75858473777771, + "learning_rate": 4.980678540792715e-05, + "loss": 0.6931, + "num_input_tokens_seen": 2424560, + "step": 1585 + }, + { + "epoch": 4.908809891808346, + "grad_norm": 0.5945013165473938, + "learning_rate": 4.980556527974909e-05, + "loss": 0.7518, + "num_input_tokens_seen": 2432496, + "step": 1590 + }, + { + "epoch": 4.924265842349304, + "grad_norm": 0.49469777941703796, + "learning_rate": 4.980434132624114e-05, + "loss": 0.782, + "num_input_tokens_seen": 2440496, + "step": 1595 + }, + { + "epoch": 4.939721792890262, + "grad_norm": 0.5580058693885803, + "learning_rate": 4.980311354759205e-05, + "loss": 0.7228, + "num_input_tokens_seen": 2448176, + "step": 1600 + }, + { + "epoch": 4.939721792890262, + "eval_loss": 0.7670724391937256, + "eval_runtime": 6.3236, + "eval_samples_per_second": 90.929, + "eval_steps_per_second": 22.772, + "num_input_tokens_seen": 2448176, + "step": 1600 + }, + { + "epoch": 4.955177743431221, + "grad_norm": 0.8737671971321106, + "learning_rate": 4.980188194399116e-05, + "loss": 0.6621, + "num_input_tokens_seen": 2455792, + "step": 1605 + }, + { + "epoch": 4.97063369397218, + "grad_norm": 1.149711012840271, + "learning_rate": 4.9800646515628384e-05, + "loss": 0.7739, + "num_input_tokens_seen": 2463280, + "step": 1610 + }, + { + "epoch": 4.986089644513138, + "grad_norm": 0.5554394125938416, + "learning_rate": 4.979940726269426e-05, + "loss": 0.6622, + "num_input_tokens_seen": 2470704, + "step": 1615 + }, + { + "epoch": 5.0, + "grad_norm": 2.325308084487915, + "learning_rate": 4.979816418537988e-05, + "loss": 0.7924, + "num_input_tokens_seen": 2477696, + "step": 1620 + }, + { + "epoch": 5.015455950540958, + "grad_norm": 0.746472954750061, + "learning_rate": 4.979691728387696e-05, + "loss": 0.7691, + "num_input_tokens_seen": 2485696, + "step": 1625 + }, + { + "epoch": 5.030911901081916, + "grad_norm": 0.7856459021568298, + "learning_rate": 4.979566655837776e-05, + "loss": 0.738, + "num_input_tokens_seen": 2494016, + "step": 1630 + }, + { + "epoch": 5.0463678516228745, + "grad_norm": 0.7444531321525574, + "learning_rate": 4.9794412009075184e-05, + "loss": 0.7137, + "num_input_tokens_seen": 2501504, + "step": 1635 + }, + { + "epoch": 5.061823802163833, + "grad_norm": 0.6342084407806396, + "learning_rate": 4.979315363616269e-05, + "loss": 0.6966, + "num_input_tokens_seen": 2508928, + "step": 1640 + }, + { + "epoch": 5.077279752704792, + "grad_norm": 0.6073787808418274, + "learning_rate": 4.979189143983434e-05, + "loss": 0.6759, + "num_input_tokens_seen": 2516736, + "step": 1645 + }, + { + "epoch": 5.09273570324575, + "grad_norm": 0.5741644501686096, + "learning_rate": 4.979062542028478e-05, + "loss": 0.7243, + "num_input_tokens_seen": 2524320, + "step": 1650 + }, + { + "epoch": 5.108191653786708, + "grad_norm": 0.665347695350647, + "learning_rate": 4.978935557770923e-05, + "loss": 0.5874, + "num_input_tokens_seen": 2531520, + "step": 1655 + }, + { + "epoch": 5.123647604327666, + "grad_norm": 0.6783438920974731, + "learning_rate": 4.978808191230353e-05, + "loss": 0.7098, + "num_input_tokens_seen": 2539200, + "step": 1660 + }, + { + "epoch": 5.139103554868624, + "grad_norm": 0.6016685366630554, + "learning_rate": 4.9786804424264085e-05, + "loss": 0.6276, + "num_input_tokens_seen": 2546464, + "step": 1665 + }, + { + "epoch": 5.1545595054095825, + "grad_norm": 1.0033166408538818, + "learning_rate": 4.978552311378792e-05, + "loss": 0.7651, + "num_input_tokens_seen": 2554016, + "step": 1670 + }, + { + "epoch": 5.170015455950541, + "grad_norm": 0.8491500020027161, + "learning_rate": 4.978423798107261e-05, + "loss": 0.6903, + "num_input_tokens_seen": 2561952, + "step": 1675 + }, + { + "epoch": 5.185471406491499, + "grad_norm": 0.7388545274734497, + "learning_rate": 4.978294902631635e-05, + "loss": 0.7051, + "num_input_tokens_seen": 2569760, + "step": 1680 + }, + { + "epoch": 5.200927357032458, + "grad_norm": 0.6043853759765625, + "learning_rate": 4.9781656249717914e-05, + "loss": 0.6671, + "num_input_tokens_seen": 2576992, + "step": 1685 + }, + { + "epoch": 5.216383307573416, + "grad_norm": 0.8271062970161438, + "learning_rate": 4.9780359651476645e-05, + "loss": 0.6862, + "num_input_tokens_seen": 2584832, + "step": 1690 + }, + { + "epoch": 5.231839258114374, + "grad_norm": 0.7980518937110901, + "learning_rate": 4.977905923179251e-05, + "loss": 0.7292, + "num_input_tokens_seen": 2592960, + "step": 1695 + }, + { + "epoch": 5.247295208655332, + "grad_norm": 0.5164510607719421, + "learning_rate": 4.977775499086606e-05, + "loss": 0.6549, + "num_input_tokens_seen": 2600864, + "step": 1700 + }, + { + "epoch": 5.2627511591962906, + "grad_norm": 0.6835931539535522, + "learning_rate": 4.97764469288984e-05, + "loss": 0.6987, + "num_input_tokens_seen": 2608032, + "step": 1705 + }, + { + "epoch": 5.278207109737249, + "grad_norm": 0.6435307264328003, + "learning_rate": 4.977513504609127e-05, + "loss": 0.5901, + "num_input_tokens_seen": 2615840, + "step": 1710 + }, + { + "epoch": 5.293663060278207, + "grad_norm": 0.6973457336425781, + "learning_rate": 4.9773819342646965e-05, + "loss": 0.6603, + "num_input_tokens_seen": 2623072, + "step": 1715 + }, + { + "epoch": 5.309119010819165, + "grad_norm": 0.9888620972633362, + "learning_rate": 4.97724998187684e-05, + "loss": 0.7051, + "num_input_tokens_seen": 2630464, + "step": 1720 + }, + { + "epoch": 5.324574961360124, + "grad_norm": 1.010613203048706, + "learning_rate": 4.9771176474659045e-05, + "loss": 0.708, + "num_input_tokens_seen": 2637952, + "step": 1725 + }, + { + "epoch": 5.340030911901082, + "grad_norm": 0.6200861930847168, + "learning_rate": 4.976984931052299e-05, + "loss": 0.6602, + "num_input_tokens_seen": 2645504, + "step": 1730 + }, + { + "epoch": 5.35548686244204, + "grad_norm": 0.7521124482154846, + "learning_rate": 4.976851832656489e-05, + "loss": 0.7335, + "num_input_tokens_seen": 2653312, + "step": 1735 + }, + { + "epoch": 5.370942812982999, + "grad_norm": 0.7952502369880676, + "learning_rate": 4.9767183522990004e-05, + "loss": 0.717, + "num_input_tokens_seen": 2660832, + "step": 1740 + }, + { + "epoch": 5.386398763523957, + "grad_norm": 0.7257140874862671, + "learning_rate": 4.9765844900004176e-05, + "loss": 0.6655, + "num_input_tokens_seen": 2668352, + "step": 1745 + }, + { + "epoch": 5.401854714064915, + "grad_norm": 0.55604487657547, + "learning_rate": 4.9764502457813834e-05, + "loss": 0.6821, + "num_input_tokens_seen": 2676160, + "step": 1750 + }, + { + "epoch": 5.417310664605873, + "grad_norm": 0.6746121048927307, + "learning_rate": 4.9763156196626005e-05, + "loss": 0.7147, + "num_input_tokens_seen": 2683488, + "step": 1755 + }, + { + "epoch": 5.432766615146831, + "grad_norm": 0.6349546313285828, + "learning_rate": 4.97618061166483e-05, + "loss": 0.6891, + "num_input_tokens_seen": 2690752, + "step": 1760 + }, + { + "epoch": 5.448222565687789, + "grad_norm": 0.6246626377105713, + "learning_rate": 4.9760452218088915e-05, + "loss": 0.6235, + "num_input_tokens_seen": 2698752, + "step": 1765 + }, + { + "epoch": 5.4636785162287484, + "grad_norm": 0.6767257452011108, + "learning_rate": 4.975909450115663e-05, + "loss": 0.6786, + "num_input_tokens_seen": 2706848, + "step": 1770 + }, + { + "epoch": 5.479134466769707, + "grad_norm": 0.7782546281814575, + "learning_rate": 4.975773296606084e-05, + "loss": 0.6756, + "num_input_tokens_seen": 2714432, + "step": 1775 + }, + { + "epoch": 5.494590417310665, + "grad_norm": 0.7220044732093811, + "learning_rate": 4.97563676130115e-05, + "loss": 0.7615, + "num_input_tokens_seen": 2722304, + "step": 1780 + }, + { + "epoch": 5.510046367851623, + "grad_norm": 0.5918099880218506, + "learning_rate": 4.9754998442219166e-05, + "loss": 0.7196, + "num_input_tokens_seen": 2729888, + "step": 1785 + }, + { + "epoch": 5.525502318392581, + "grad_norm": 0.7263619899749756, + "learning_rate": 4.9753625453894984e-05, + "loss": 0.7128, + "num_input_tokens_seen": 2737696, + "step": 1790 + }, + { + "epoch": 5.540958268933539, + "grad_norm": 0.9207194447517395, + "learning_rate": 4.975224864825068e-05, + "loss": 0.7289, + "num_input_tokens_seen": 2745344, + "step": 1795 + }, + { + "epoch": 5.556414219474497, + "grad_norm": 0.6877893209457397, + "learning_rate": 4.9750868025498576e-05, + "loss": 0.6282, + "num_input_tokens_seen": 2752768, + "step": 1800 + }, + { + "epoch": 5.556414219474497, + "eval_loss": 0.6983717679977417, + "eval_runtime": 6.2789, + "eval_samples_per_second": 91.576, + "eval_steps_per_second": 22.934, + "num_input_tokens_seen": 2752768, + "step": 1800 + }, + { + "epoch": 5.571870170015456, + "grad_norm": 0.6713560223579407, + "learning_rate": 4.974948358585158e-05, + "loss": 0.5805, + "num_input_tokens_seen": 2760256, + "step": 1805 + }, + { + "epoch": 5.587326120556414, + "grad_norm": 0.7022420763969421, + "learning_rate": 4.9748095329523205e-05, + "loss": 0.5648, + "num_input_tokens_seen": 2767296, + "step": 1810 + }, + { + "epoch": 5.602782071097373, + "grad_norm": 0.8896868824958801, + "learning_rate": 4.974670325672752e-05, + "loss": 0.724, + "num_input_tokens_seen": 2775328, + "step": 1815 + }, + { + "epoch": 5.618238021638331, + "grad_norm": 0.8083236217498779, + "learning_rate": 4.974530736767921e-05, + "loss": 0.6131, + "num_input_tokens_seen": 2782560, + "step": 1820 + }, + { + "epoch": 5.633693972179289, + "grad_norm": 0.6804031133651733, + "learning_rate": 4.9743907662593524e-05, + "loss": 0.6172, + "num_input_tokens_seen": 2789952, + "step": 1825 + }, + { + "epoch": 5.649149922720247, + "grad_norm": 0.5791587233543396, + "learning_rate": 4.974250414168633e-05, + "loss": 0.6697, + "num_input_tokens_seen": 2797888, + "step": 1830 + }, + { + "epoch": 5.6646058732612055, + "grad_norm": 1.0821713209152222, + "learning_rate": 4.974109680517407e-05, + "loss": 0.5911, + "num_input_tokens_seen": 2805280, + "step": 1835 + }, + { + "epoch": 5.680061823802164, + "grad_norm": 0.7031528949737549, + "learning_rate": 4.973968565327376e-05, + "loss": 0.556, + "num_input_tokens_seen": 2812736, + "step": 1840 + }, + { + "epoch": 5.695517774343122, + "grad_norm": 0.5370704531669617, + "learning_rate": 4.973827068620303e-05, + "loss": 0.5832, + "num_input_tokens_seen": 2820480, + "step": 1845 + }, + { + "epoch": 5.710973724884081, + "grad_norm": 0.7888002395629883, + "learning_rate": 4.973685190418008e-05, + "loss": 0.6426, + "num_input_tokens_seen": 2828544, + "step": 1850 + }, + { + "epoch": 5.726429675425039, + "grad_norm": 0.6944665908813477, + "learning_rate": 4.97354293074237e-05, + "loss": 0.5355, + "num_input_tokens_seen": 2835808, + "step": 1855 + }, + { + "epoch": 5.741885625965997, + "grad_norm": 0.8017171621322632, + "learning_rate": 4.9734002896153276e-05, + "loss": 0.6299, + "num_input_tokens_seen": 2843072, + "step": 1860 + }, + { + "epoch": 5.757341576506955, + "grad_norm": 0.6602806448936462, + "learning_rate": 4.973257267058877e-05, + "loss": 0.5968, + "num_input_tokens_seen": 2850336, + "step": 1865 + }, + { + "epoch": 5.7727975270479135, + "grad_norm": 1.0233227014541626, + "learning_rate": 4.973113863095076e-05, + "loss": 0.5644, + "num_input_tokens_seen": 2857984, + "step": 1870 + }, + { + "epoch": 5.788253477588872, + "grad_norm": 0.8187620043754578, + "learning_rate": 4.9729700777460384e-05, + "loss": 0.5582, + "num_input_tokens_seen": 2866400, + "step": 1875 + }, + { + "epoch": 5.80370942812983, + "grad_norm": 0.8009936809539795, + "learning_rate": 4.972825911033937e-05, + "loss": 0.7641, + "num_input_tokens_seen": 2874336, + "step": 1880 + }, + { + "epoch": 5.819165378670788, + "grad_norm": 0.6326345801353455, + "learning_rate": 4.9726813629810056e-05, + "loss": 0.7334, + "num_input_tokens_seen": 2882496, + "step": 1885 + }, + { + "epoch": 5.834621329211746, + "grad_norm": 0.8841684460639954, + "learning_rate": 4.9725364336095326e-05, + "loss": 0.6023, + "num_input_tokens_seen": 2890176, + "step": 1890 + }, + { + "epoch": 5.850077279752705, + "grad_norm": 1.3980591297149658, + "learning_rate": 4.972391122941871e-05, + "loss": 0.6256, + "num_input_tokens_seen": 2898784, + "step": 1895 + }, + { + "epoch": 5.865533230293663, + "grad_norm": 0.8527557849884033, + "learning_rate": 4.972245431000428e-05, + "loss": 0.6747, + "num_input_tokens_seen": 2906528, + "step": 1900 + }, + { + "epoch": 5.8809891808346215, + "grad_norm": 0.9478814601898193, + "learning_rate": 4.972099357807671e-05, + "loss": 0.667, + "num_input_tokens_seen": 2914400, + "step": 1905 + }, + { + "epoch": 5.89644513137558, + "grad_norm": 0.6795268654823303, + "learning_rate": 4.971952903386127e-05, + "loss": 0.7493, + "num_input_tokens_seen": 2922528, + "step": 1910 + }, + { + "epoch": 5.911901081916538, + "grad_norm": 1.6141411066055298, + "learning_rate": 4.971806067758381e-05, + "loss": 0.7384, + "num_input_tokens_seen": 2930176, + "step": 1915 + }, + { + "epoch": 5.927357032457496, + "grad_norm": 0.67658531665802, + "learning_rate": 4.971658850947076e-05, + "loss": 0.6037, + "num_input_tokens_seen": 2937856, + "step": 1920 + }, + { + "epoch": 5.942812982998454, + "grad_norm": 0.895926833152771, + "learning_rate": 4.9715112529749165e-05, + "loss": 0.6242, + "num_input_tokens_seen": 2945824, + "step": 1925 + }, + { + "epoch": 5.958268933539412, + "grad_norm": 0.8742234110832214, + "learning_rate": 4.9713632738646624e-05, + "loss": 0.7346, + "num_input_tokens_seen": 2953792, + "step": 1930 + }, + { + "epoch": 5.9737248840803705, + "grad_norm": 0.7517218589782715, + "learning_rate": 4.971214913639134e-05, + "loss": 0.6679, + "num_input_tokens_seen": 2961824, + "step": 1935 + }, + { + "epoch": 5.9891808346213296, + "grad_norm": 0.8994750380516052, + "learning_rate": 4.9710661723212104e-05, + "loss": 0.6897, + "num_input_tokens_seen": 2969312, + "step": 1940 + }, + { + "epoch": 6.003091190108192, + "grad_norm": 0.6868995428085327, + "learning_rate": 4.9709170499338295e-05, + "loss": 0.7105, + "num_input_tokens_seen": 2975760, + "step": 1945 + }, + { + "epoch": 6.01854714064915, + "grad_norm": 0.8491050004959106, + "learning_rate": 4.9707675464999895e-05, + "loss": 0.5895, + "num_input_tokens_seen": 2983152, + "step": 1950 + }, + { + "epoch": 6.034003091190108, + "grad_norm": 0.8359560966491699, + "learning_rate": 4.970617662042743e-05, + "loss": 0.5962, + "num_input_tokens_seen": 2991120, + "step": 1955 + }, + { + "epoch": 6.049459041731066, + "grad_norm": 0.5409308075904846, + "learning_rate": 4.970467396585206e-05, + "loss": 0.6245, + "num_input_tokens_seen": 2998672, + "step": 1960 + }, + { + "epoch": 6.0649149922720245, + "grad_norm": 0.7104329466819763, + "learning_rate": 4.97031675015055e-05, + "loss": 0.547, + "num_input_tokens_seen": 3006160, + "step": 1965 + }, + { + "epoch": 6.080370942812983, + "grad_norm": 0.7167556881904602, + "learning_rate": 4.9701657227620075e-05, + "loss": 0.5449, + "num_input_tokens_seen": 3013456, + "step": 1970 + }, + { + "epoch": 6.095826893353942, + "grad_norm": 0.7389307022094727, + "learning_rate": 4.9700143144428685e-05, + "loss": 0.681, + "num_input_tokens_seen": 3020976, + "step": 1975 + }, + { + "epoch": 6.1112828438949, + "grad_norm": 0.636371374130249, + "learning_rate": 4.969862525216482e-05, + "loss": 0.5603, + "num_input_tokens_seen": 3028592, + "step": 1980 + }, + { + "epoch": 6.126738794435858, + "grad_norm": 0.8574012517929077, + "learning_rate": 4.9697103551062556e-05, + "loss": 0.5518, + "num_input_tokens_seen": 3035984, + "step": 1985 + }, + { + "epoch": 6.142194744976816, + "grad_norm": 0.705716609954834, + "learning_rate": 4.9695578041356565e-05, + "loss": 0.5928, + "num_input_tokens_seen": 3043952, + "step": 1990 + }, + { + "epoch": 6.157650695517774, + "grad_norm": 0.7028347253799438, + "learning_rate": 4.969404872328209e-05, + "loss": 0.5461, + "num_input_tokens_seen": 3052144, + "step": 1995 + }, + { + "epoch": 6.1731066460587325, + "grad_norm": 0.8769800066947937, + "learning_rate": 4.969251559707498e-05, + "loss": 0.639, + "num_input_tokens_seen": 3059504, + "step": 2000 + }, + { + "epoch": 6.1731066460587325, + "eval_loss": 0.6433413028717041, + "eval_runtime": 6.2907, + "eval_samples_per_second": 91.404, + "eval_steps_per_second": 22.891, + "num_input_tokens_seen": 3059504, + "step": 2000 + }, + { + "epoch": 6.188562596599691, + "grad_norm": 0.8283852338790894, + "learning_rate": 4.9690978662971674e-05, + "loss": 0.5891, + "num_input_tokens_seen": 3067792, + "step": 2005 + }, + { + "epoch": 6.204018547140649, + "grad_norm": 0.6060513257980347, + "learning_rate": 4.968943792120916e-05, + "loss": 0.5845, + "num_input_tokens_seen": 3075472, + "step": 2010 + }, + { + "epoch": 6.219474497681608, + "grad_norm": 0.5330929756164551, + "learning_rate": 4.9687893372025046e-05, + "loss": 0.5947, + "num_input_tokens_seen": 3082768, + "step": 2015 + }, + { + "epoch": 6.234930448222566, + "grad_norm": 0.721720814704895, + "learning_rate": 4.9686345015657535e-05, + "loss": 0.5493, + "num_input_tokens_seen": 3090544, + "step": 2020 + }, + { + "epoch": 6.250386398763524, + "grad_norm": 0.8855581283569336, + "learning_rate": 4.968479285234538e-05, + "loss": 0.6462, + "num_input_tokens_seen": 3098256, + "step": 2025 + }, + { + "epoch": 6.265842349304482, + "grad_norm": 0.707988977432251, + "learning_rate": 4.9683236882327974e-05, + "loss": 0.5411, + "num_input_tokens_seen": 3105872, + "step": 2030 + }, + { + "epoch": 6.2812982998454405, + "grad_norm": 0.6746358275413513, + "learning_rate": 4.968167710584526e-05, + "loss": 0.5397, + "num_input_tokens_seen": 3113392, + "step": 2035 + }, + { + "epoch": 6.296754250386399, + "grad_norm": 0.8131060004234314, + "learning_rate": 4.968011352313775e-05, + "loss": 0.5144, + "num_input_tokens_seen": 3121040, + "step": 2040 + }, + { + "epoch": 6.312210200927357, + "grad_norm": 0.7129751443862915, + "learning_rate": 4.967854613444659e-05, + "loss": 0.5621, + "num_input_tokens_seen": 3129008, + "step": 2045 + }, + { + "epoch": 6.327666151468315, + "grad_norm": 0.7945042848587036, + "learning_rate": 4.967697494001349e-05, + "loss": 0.6262, + "num_input_tokens_seen": 3136464, + "step": 2050 + }, + { + "epoch": 6.343122102009273, + "grad_norm": 1.1902741193771362, + "learning_rate": 4.9675399940080736e-05, + "loss": 0.6527, + "num_input_tokens_seen": 3143824, + "step": 2055 + }, + { + "epoch": 6.358578052550232, + "grad_norm": 0.7157202363014221, + "learning_rate": 4.9673821134891226e-05, + "loss": 0.564, + "num_input_tokens_seen": 3151664, + "step": 2060 + }, + { + "epoch": 6.37403400309119, + "grad_norm": 0.7522504925727844, + "learning_rate": 4.967223852468842e-05, + "loss": 0.5769, + "num_input_tokens_seen": 3158864, + "step": 2065 + }, + { + "epoch": 6.3894899536321486, + "grad_norm": 0.6726906895637512, + "learning_rate": 4.967065210971639e-05, + "loss": 0.5674, + "num_input_tokens_seen": 3166480, + "step": 2070 + }, + { + "epoch": 6.404945904173107, + "grad_norm": 0.7374562621116638, + "learning_rate": 4.966906189021977e-05, + "loss": 0.6844, + "num_input_tokens_seen": 3174000, + "step": 2075 + }, + { + "epoch": 6.420401854714065, + "grad_norm": 0.719575047492981, + "learning_rate": 4.966746786644379e-05, + "loss": 0.6147, + "num_input_tokens_seen": 3182480, + "step": 2080 + }, + { + "epoch": 6.435857805255023, + "grad_norm": 0.9669398069381714, + "learning_rate": 4.966587003863429e-05, + "loss": 0.6171, + "num_input_tokens_seen": 3190160, + "step": 2085 + }, + { + "epoch": 6.451313755795981, + "grad_norm": 0.567224383354187, + "learning_rate": 4.966426840703765e-05, + "loss": 0.5895, + "num_input_tokens_seen": 3197680, + "step": 2090 + }, + { + "epoch": 6.466769706336939, + "grad_norm": 0.7215051651000977, + "learning_rate": 4.9662662971900875e-05, + "loss": 0.627, + "num_input_tokens_seen": 3205296, + "step": 2095 + }, + { + "epoch": 6.4822256568778975, + "grad_norm": 0.7404988408088684, + "learning_rate": 4.9661053733471534e-05, + "loss": 0.551, + "num_input_tokens_seen": 3212944, + "step": 2100 + }, + { + "epoch": 6.497681607418857, + "grad_norm": 0.807461142539978, + "learning_rate": 4.965944069199781e-05, + "loss": 0.5291, + "num_input_tokens_seen": 3220656, + "step": 2105 + }, + { + "epoch": 6.513137557959815, + "grad_norm": 0.5641626119613647, + "learning_rate": 4.965782384772842e-05, + "loss": 0.5501, + "num_input_tokens_seen": 3228176, + "step": 2110 + }, + { + "epoch": 6.528593508500773, + "grad_norm": 0.5931814908981323, + "learning_rate": 4.9656203200912734e-05, + "loss": 0.526, + "num_input_tokens_seen": 3235408, + "step": 2115 + }, + { + "epoch": 6.544049459041731, + "grad_norm": 0.817730188369751, + "learning_rate": 4.965457875180067e-05, + "loss": 0.5824, + "num_input_tokens_seen": 3243120, + "step": 2120 + }, + { + "epoch": 6.559505409582689, + "grad_norm": 1.2166928052902222, + "learning_rate": 4.9652950500642724e-05, + "loss": 0.6487, + "num_input_tokens_seen": 3250480, + "step": 2125 + }, + { + "epoch": 6.574961360123647, + "grad_norm": 0.7193739414215088, + "learning_rate": 4.965131844769001e-05, + "loss": 0.5901, + "num_input_tokens_seen": 3258352, + "step": 2130 + }, + { + "epoch": 6.590417310664606, + "grad_norm": 1.3405578136444092, + "learning_rate": 4.96496825931942e-05, + "loss": 0.7378, + "num_input_tokens_seen": 3266032, + "step": 2135 + }, + { + "epoch": 6.605873261205565, + "grad_norm": 0.47201576828956604, + "learning_rate": 4.9648042937407566e-05, + "loss": 0.5656, + "num_input_tokens_seen": 3273744, + "step": 2140 + }, + { + "epoch": 6.621329211746523, + "grad_norm": 0.7545175552368164, + "learning_rate": 4.964639948058297e-05, + "loss": 0.4922, + "num_input_tokens_seen": 3281136, + "step": 2145 + }, + { + "epoch": 6.636785162287481, + "grad_norm": 0.6261192560195923, + "learning_rate": 4.9644752222973846e-05, + "loss": 0.5353, + "num_input_tokens_seen": 3289232, + "step": 2150 + }, + { + "epoch": 6.652241112828439, + "grad_norm": 0.7170491814613342, + "learning_rate": 4.964310116483422e-05, + "loss": 0.538, + "num_input_tokens_seen": 3296720, + "step": 2155 + }, + { + "epoch": 6.667697063369397, + "grad_norm": 0.7170806527137756, + "learning_rate": 4.964144630641872e-05, + "loss": 0.5422, + "num_input_tokens_seen": 3304272, + "step": 2160 + }, + { + "epoch": 6.683153013910355, + "grad_norm": 0.7963667511940002, + "learning_rate": 4.9639787647982525e-05, + "loss": 0.4973, + "num_input_tokens_seen": 3312176, + "step": 2165 + }, + { + "epoch": 6.698608964451314, + "grad_norm": 0.7153801918029785, + "learning_rate": 4.963812518978143e-05, + "loss": 0.5207, + "num_input_tokens_seen": 3319248, + "step": 2170 + }, + { + "epoch": 6.714064914992272, + "grad_norm": 0.48877814412117004, + "learning_rate": 4.963645893207182e-05, + "loss": 0.6153, + "num_input_tokens_seen": 3326864, + "step": 2175 + }, + { + "epoch": 6.72952086553323, + "grad_norm": 1.0044116973876953, + "learning_rate": 4.963478887511063e-05, + "loss": 0.518, + "num_input_tokens_seen": 3334032, + "step": 2180 + }, + { + "epoch": 6.744976816074189, + "grad_norm": 0.963096022605896, + "learning_rate": 4.963311501915542e-05, + "loss": 0.5571, + "num_input_tokens_seen": 3342000, + "step": 2185 + }, + { + "epoch": 6.760432766615147, + "grad_norm": 0.6714496612548828, + "learning_rate": 4.963143736446432e-05, + "loss": 0.5918, + "num_input_tokens_seen": 3349520, + "step": 2190 + }, + { + "epoch": 6.775888717156105, + "grad_norm": 0.5421562790870667, + "learning_rate": 4.962975591129603e-05, + "loss": 0.5077, + "num_input_tokens_seen": 3356912, + "step": 2195 + }, + { + "epoch": 6.7913446676970635, + "grad_norm": 0.6462876796722412, + "learning_rate": 4.962807065990986e-05, + "loss": 0.5502, + "num_input_tokens_seen": 3364688, + "step": 2200 + }, + { + "epoch": 6.7913446676970635, + "eval_loss": 0.5964299440383911, + "eval_runtime": 6.2944, + "eval_samples_per_second": 91.352, + "eval_steps_per_second": 22.878, + "num_input_tokens_seen": 3364688, + "step": 2200 + }, + { + "epoch": 6.806800618238022, + "grad_norm": 0.9408058524131775, + "learning_rate": 4.9626381610565714e-05, + "loss": 0.6417, + "num_input_tokens_seen": 3372912, + "step": 2205 + }, + { + "epoch": 6.82225656877898, + "grad_norm": 0.7491603493690491, + "learning_rate": 4.9624688763524043e-05, + "loss": 0.5909, + "num_input_tokens_seen": 3380496, + "step": 2210 + }, + { + "epoch": 6.837712519319938, + "grad_norm": 0.9920117259025574, + "learning_rate": 4.962299211904591e-05, + "loss": 0.5733, + "num_input_tokens_seen": 3388688, + "step": 2215 + }, + { + "epoch": 6.853168469860896, + "grad_norm": 0.5556808710098267, + "learning_rate": 4.962129167739296e-05, + "loss": 0.5508, + "num_input_tokens_seen": 3396784, + "step": 2220 + }, + { + "epoch": 6.868624420401854, + "grad_norm": 0.7010271549224854, + "learning_rate": 4.961958743882742e-05, + "loss": 0.5128, + "num_input_tokens_seen": 3405296, + "step": 2225 + }, + { + "epoch": 6.884080370942813, + "grad_norm": 0.8907914757728577, + "learning_rate": 4.961787940361211e-05, + "loss": 0.5665, + "num_input_tokens_seen": 3413200, + "step": 2230 + }, + { + "epoch": 6.8995363214837715, + "grad_norm": 0.8187472224235535, + "learning_rate": 4.961616757201043e-05, + "loss": 0.546, + "num_input_tokens_seen": 3420592, + "step": 2235 + }, + { + "epoch": 6.91499227202473, + "grad_norm": 0.5401462912559509, + "learning_rate": 4.961445194428637e-05, + "loss": 0.4799, + "num_input_tokens_seen": 3428368, + "step": 2240 + }, + { + "epoch": 6.930448222565688, + "grad_norm": 0.5530579090118408, + "learning_rate": 4.9612732520704486e-05, + "loss": 0.4754, + "num_input_tokens_seen": 3436592, + "step": 2245 + }, + { + "epoch": 6.945904173106646, + "grad_norm": 0.6471156477928162, + "learning_rate": 4.961100930152994e-05, + "loss": 0.5328, + "num_input_tokens_seen": 3444304, + "step": 2250 + }, + { + "epoch": 6.961360123647604, + "grad_norm": 0.7973666191101074, + "learning_rate": 4.960928228702849e-05, + "loss": 0.5541, + "num_input_tokens_seen": 3451824, + "step": 2255 + }, + { + "epoch": 6.976816074188562, + "grad_norm": 1.0054826736450195, + "learning_rate": 4.960755147746645e-05, + "loss": 0.6015, + "num_input_tokens_seen": 3459408, + "step": 2260 + }, + { + "epoch": 6.992272024729521, + "grad_norm": 0.724249005317688, + "learning_rate": 4.9605816873110736e-05, + "loss": 0.5068, + "num_input_tokens_seen": 3467152, + "step": 2265 + }, + { + "epoch": 7.006182380216384, + "grad_norm": 0.6975013017654419, + "learning_rate": 4.960407847422883e-05, + "loss": 0.5888, + "num_input_tokens_seen": 3473584, + "step": 2270 + }, + { + "epoch": 7.021638330757342, + "grad_norm": 0.8822712898254395, + "learning_rate": 4.960233628108885e-05, + "loss": 0.5251, + "num_input_tokens_seen": 3481136, + "step": 2275 + }, + { + "epoch": 7.0370942812983, + "grad_norm": 0.8891996145248413, + "learning_rate": 4.960059029395942e-05, + "loss": 0.4906, + "num_input_tokens_seen": 3488784, + "step": 2280 + }, + { + "epoch": 7.052550231839258, + "grad_norm": 0.7080563306808472, + "learning_rate": 4.959884051310983e-05, + "loss": 0.6033, + "num_input_tokens_seen": 3495888, + "step": 2285 + }, + { + "epoch": 7.068006182380216, + "grad_norm": 0.8714733719825745, + "learning_rate": 4.959708693880991e-05, + "loss": 0.5203, + "num_input_tokens_seen": 3503824, + "step": 2290 + }, + { + "epoch": 7.083462132921174, + "grad_norm": 0.6442603468894958, + "learning_rate": 4.9595329571330074e-05, + "loss": 0.5155, + "num_input_tokens_seen": 3511408, + "step": 2295 + }, + { + "epoch": 7.098918083462133, + "grad_norm": 0.6727645993232727, + "learning_rate": 4.9593568410941326e-05, + "loss": 0.5232, + "num_input_tokens_seen": 3518864, + "step": 2300 + }, + { + "epoch": 7.114374034003091, + "grad_norm": 0.7541542649269104, + "learning_rate": 4.959180345791528e-05, + "loss": 0.6135, + "num_input_tokens_seen": 3526768, + "step": 2305 + }, + { + "epoch": 7.12982998454405, + "grad_norm": 0.7202556133270264, + "learning_rate": 4.9590034712524086e-05, + "loss": 0.5996, + "num_input_tokens_seen": 3534352, + "step": 2310 + }, + { + "epoch": 7.145285935085008, + "grad_norm": 0.5483867526054382, + "learning_rate": 4.958826217504053e-05, + "loss": 0.4954, + "num_input_tokens_seen": 3541584, + "step": 2315 + }, + { + "epoch": 7.160741885625966, + "grad_norm": 0.7860913276672363, + "learning_rate": 4.958648584573795e-05, + "loss": 0.4355, + "num_input_tokens_seen": 3549008, + "step": 2320 + }, + { + "epoch": 7.176197836166924, + "grad_norm": 0.638876736164093, + "learning_rate": 4.958470572489028e-05, + "loss": 0.5348, + "num_input_tokens_seen": 3556848, + "step": 2325 + }, + { + "epoch": 7.1916537867078825, + "grad_norm": 0.9290891885757446, + "learning_rate": 4.958292181277203e-05, + "loss": 0.5641, + "num_input_tokens_seen": 3565232, + "step": 2330 + }, + { + "epoch": 7.207109737248841, + "grad_norm": 1.0606532096862793, + "learning_rate": 4.958113410965832e-05, + "loss": 0.5889, + "num_input_tokens_seen": 3572592, + "step": 2335 + }, + { + "epoch": 7.222565687789799, + "grad_norm": 0.6668662428855896, + "learning_rate": 4.957934261582481e-05, + "loss": 0.5303, + "num_input_tokens_seen": 3580432, + "step": 2340 + }, + { + "epoch": 7.238021638330757, + "grad_norm": 0.4905584752559662, + "learning_rate": 4.95775473315478e-05, + "loss": 0.4491, + "num_input_tokens_seen": 3587856, + "step": 2345 + }, + { + "epoch": 7.253477588871716, + "grad_norm": 0.53667813539505, + "learning_rate": 4.9575748257104124e-05, + "loss": 0.4702, + "num_input_tokens_seen": 3595440, + "step": 2350 + }, + { + "epoch": 7.268933539412674, + "grad_norm": 0.794960618019104, + "learning_rate": 4.9573945392771224e-05, + "loss": 0.5785, + "num_input_tokens_seen": 3603088, + "step": 2355 + }, + { + "epoch": 7.284389489953632, + "grad_norm": 0.8590783476829529, + "learning_rate": 4.9572138738827134e-05, + "loss": 0.5357, + "num_input_tokens_seen": 3610960, + "step": 2360 + }, + { + "epoch": 7.2998454404945905, + "grad_norm": 0.6469554901123047, + "learning_rate": 4.957032829555046e-05, + "loss": 0.5037, + "num_input_tokens_seen": 3618736, + "step": 2365 + }, + { + "epoch": 7.315301391035549, + "grad_norm": 0.6657320857048035, + "learning_rate": 4.956851406322039e-05, + "loss": 0.4795, + "num_input_tokens_seen": 3627216, + "step": 2370 + }, + { + "epoch": 7.330757341576507, + "grad_norm": 0.7078161835670471, + "learning_rate": 4.9566696042116704e-05, + "loss": 0.4871, + "num_input_tokens_seen": 3634704, + "step": 2375 + }, + { + "epoch": 7.346213292117465, + "grad_norm": 0.49915993213653564, + "learning_rate": 4.9564874232519766e-05, + "loss": 0.4269, + "num_input_tokens_seen": 3641776, + "step": 2380 + }, + { + "epoch": 7.361669242658423, + "grad_norm": 0.6406752467155457, + "learning_rate": 4.9563048634710516e-05, + "loss": 0.5067, + "num_input_tokens_seen": 3649616, + "step": 2385 + }, + { + "epoch": 7.377125193199381, + "grad_norm": 0.8236833214759827, + "learning_rate": 4.956121924897049e-05, + "loss": 0.5374, + "num_input_tokens_seen": 3657360, + "step": 2390 + }, + { + "epoch": 7.39258114374034, + "grad_norm": 0.6752957105636597, + "learning_rate": 4.955938607558181e-05, + "loss": 0.5329, + "num_input_tokens_seen": 3665040, + "step": 2395 + }, + { + "epoch": 7.4080370942812985, + "grad_norm": 0.5223928093910217, + "learning_rate": 4.955754911482715e-05, + "loss": 0.5518, + "num_input_tokens_seen": 3672432, + "step": 2400 + }, + { + "epoch": 7.4080370942812985, + "eval_loss": 0.5614389777183533, + "eval_runtime": 6.2626, + "eval_samples_per_second": 91.815, + "eval_steps_per_second": 22.994, + "num_input_tokens_seen": 3672432, + "step": 2400 + }, + { + "epoch": 7.423493044822257, + "grad_norm": 1.0201690196990967, + "learning_rate": 4.9555708366989804e-05, + "loss": 0.4486, + "num_input_tokens_seen": 3680368, + "step": 2405 + }, + { + "epoch": 7.438948995363215, + "grad_norm": 0.6913936734199524, + "learning_rate": 4.9553863832353655e-05, + "loss": 0.4538, + "num_input_tokens_seen": 3687888, + "step": 2410 + }, + { + "epoch": 7.454404945904173, + "grad_norm": 1.0568920373916626, + "learning_rate": 4.955201551120313e-05, + "loss": 0.6094, + "num_input_tokens_seen": 3696304, + "step": 2415 + }, + { + "epoch": 7.469860896445131, + "grad_norm": 0.4180598556995392, + "learning_rate": 4.955016340382328e-05, + "loss": 0.4919, + "num_input_tokens_seen": 3703888, + "step": 2420 + }, + { + "epoch": 7.485316846986089, + "grad_norm": 0.6638259291648865, + "learning_rate": 4.954830751049972e-05, + "loss": 0.4869, + "num_input_tokens_seen": 3711248, + "step": 2425 + }, + { + "epoch": 7.500772797527048, + "grad_norm": 0.8204613924026489, + "learning_rate": 4.954644783151864e-05, + "loss": 0.552, + "num_input_tokens_seen": 3719248, + "step": 2430 + }, + { + "epoch": 7.516228748068007, + "grad_norm": 0.4565852880477905, + "learning_rate": 4.954458436716684e-05, + "loss": 0.4659, + "num_input_tokens_seen": 3726768, + "step": 2435 + }, + { + "epoch": 7.531684698608965, + "grad_norm": 0.7320737838745117, + "learning_rate": 4.954271711773168e-05, + "loss": 0.4582, + "num_input_tokens_seen": 3734736, + "step": 2440 + }, + { + "epoch": 7.547140649149923, + "grad_norm": 0.5416005253791809, + "learning_rate": 4.9540846083501115e-05, + "loss": 0.5042, + "num_input_tokens_seen": 3742384, + "step": 2445 + }, + { + "epoch": 7.562596599690881, + "grad_norm": 0.5921560525894165, + "learning_rate": 4.953897126476369e-05, + "loss": 0.4077, + "num_input_tokens_seen": 3750640, + "step": 2450 + }, + { + "epoch": 7.578052550231839, + "grad_norm": 0.6853830814361572, + "learning_rate": 4.9537092661808514e-05, + "loss": 0.4911, + "num_input_tokens_seen": 3758096, + "step": 2455 + }, + { + "epoch": 7.593508500772797, + "grad_norm": 0.6138102412223816, + "learning_rate": 4.9535210274925306e-05, + "loss": 0.3935, + "num_input_tokens_seen": 3765328, + "step": 2460 + }, + { + "epoch": 7.6089644513137555, + "grad_norm": 1.1984213590621948, + "learning_rate": 4.953332410440435e-05, + "loss": 0.7447, + "num_input_tokens_seen": 3773712, + "step": 2465 + }, + { + "epoch": 7.624420401854714, + "grad_norm": 0.7852861285209656, + "learning_rate": 4.9531434150536496e-05, + "loss": 0.5227, + "num_input_tokens_seen": 3781520, + "step": 2470 + }, + { + "epoch": 7.639876352395673, + "grad_norm": 0.6755810976028442, + "learning_rate": 4.952954041361322e-05, + "loss": 0.4271, + "num_input_tokens_seen": 3790000, + "step": 2475 + }, + { + "epoch": 7.655332302936631, + "grad_norm": 0.7008607387542725, + "learning_rate": 4.952764289392655e-05, + "loss": 0.4957, + "num_input_tokens_seen": 3797712, + "step": 2480 + }, + { + "epoch": 7.670788253477589, + "grad_norm": 0.7710264921188354, + "learning_rate": 4.952574159176912e-05, + "loss": 0.4513, + "num_input_tokens_seen": 3805136, + "step": 2485 + }, + { + "epoch": 7.686244204018547, + "grad_norm": 0.5803573131561279, + "learning_rate": 4.952383650743413e-05, + "loss": 0.5908, + "num_input_tokens_seen": 3813104, + "step": 2490 + }, + { + "epoch": 7.701700154559505, + "grad_norm": 0.6234481334686279, + "learning_rate": 4.952192764121536e-05, + "loss": 0.4402, + "num_input_tokens_seen": 3820432, + "step": 2495 + }, + { + "epoch": 7.717156105100464, + "grad_norm": 0.7391915917396545, + "learning_rate": 4.9520014993407185e-05, + "loss": 0.5624, + "num_input_tokens_seen": 3828144, + "step": 2500 + }, + { + "epoch": 7.732612055641422, + "grad_norm": 0.727198600769043, + "learning_rate": 4.951809856430456e-05, + "loss": 0.5641, + "num_input_tokens_seen": 3835952, + "step": 2505 + }, + { + "epoch": 7.74806800618238, + "grad_norm": 0.47546160221099854, + "learning_rate": 4.951617835420303e-05, + "loss": 0.6395, + "num_input_tokens_seen": 3843664, + "step": 2510 + }, + { + "epoch": 7.763523956723338, + "grad_norm": 0.5726385116577148, + "learning_rate": 4.951425436339869e-05, + "loss": 0.5616, + "num_input_tokens_seen": 3851312, + "step": 2515 + }, + { + "epoch": 7.778979907264297, + "grad_norm": 0.7807177305221558, + "learning_rate": 4.9512326592188274e-05, + "loss": 0.4831, + "num_input_tokens_seen": 3858288, + "step": 2520 + }, + { + "epoch": 7.794435857805255, + "grad_norm": 0.8516373634338379, + "learning_rate": 4.9510395040869054e-05, + "loss": 0.5956, + "num_input_tokens_seen": 3866160, + "step": 2525 + }, + { + "epoch": 7.809891808346213, + "grad_norm": 0.5698123574256897, + "learning_rate": 4.9508459709738905e-05, + "loss": 0.4346, + "num_input_tokens_seen": 3873552, + "step": 2530 + }, + { + "epoch": 7.825347758887172, + "grad_norm": 0.7805798649787903, + "learning_rate": 4.950652059909627e-05, + "loss": 0.5073, + "num_input_tokens_seen": 3880688, + "step": 2535 + }, + { + "epoch": 7.84080370942813, + "grad_norm": 0.7593126893043518, + "learning_rate": 4.95045777092402e-05, + "loss": 0.5853, + "num_input_tokens_seen": 3887824, + "step": 2540 + }, + { + "epoch": 7.856259659969088, + "grad_norm": 0.5818095207214355, + "learning_rate": 4.950263104047031e-05, + "loss": 0.4882, + "num_input_tokens_seen": 3895344, + "step": 2545 + }, + { + "epoch": 7.871715610510046, + "grad_norm": 0.7603031992912292, + "learning_rate": 4.9500680593086775e-05, + "loss": 0.4174, + "num_input_tokens_seen": 3902896, + "step": 2550 + }, + { + "epoch": 7.887171561051004, + "grad_norm": 0.7667840123176575, + "learning_rate": 4.94987263673904e-05, + "loss": 0.4695, + "num_input_tokens_seen": 3910096, + "step": 2555 + }, + { + "epoch": 7.902627511591962, + "grad_norm": 0.6217713356018066, + "learning_rate": 4.949676836368256e-05, + "loss": 0.5416, + "num_input_tokens_seen": 3918128, + "step": 2560 + }, + { + "epoch": 7.9180834621329215, + "grad_norm": 0.90149986743927, + "learning_rate": 4.949480658226518e-05, + "loss": 0.4161, + "num_input_tokens_seen": 3925648, + "step": 2565 + }, + { + "epoch": 7.93353941267388, + "grad_norm": 0.6228740215301514, + "learning_rate": 4.949284102344082e-05, + "loss": 0.4745, + "num_input_tokens_seen": 3933200, + "step": 2570 + }, + { + "epoch": 7.948995363214838, + "grad_norm": 0.9647697806358337, + "learning_rate": 4.9490871687512565e-05, + "loss": 0.5025, + "num_input_tokens_seen": 3940528, + "step": 2575 + }, + { + "epoch": 7.964451313755796, + "grad_norm": 0.6374658346176147, + "learning_rate": 4.948889857478413e-05, + "loss": 0.4674, + "num_input_tokens_seen": 3947920, + "step": 2580 + }, + { + "epoch": 7.979907264296754, + "grad_norm": 0.606547474861145, + "learning_rate": 4.948692168555978e-05, + "loss": 0.4252, + "num_input_tokens_seen": 3955536, + "step": 2585 + }, + { + "epoch": 7.995363214837712, + "grad_norm": 0.7638858556747437, + "learning_rate": 4.94849410201444e-05, + "loss": 0.4809, + "num_input_tokens_seen": 3963696, + "step": 2590 + }, + { + "epoch": 8.009273570324575, + "grad_norm": 0.7335249185562134, + "learning_rate": 4.948295657884341e-05, + "loss": 0.3915, + "num_input_tokens_seen": 3970240, + "step": 2595 + }, + { + "epoch": 8.024729520865533, + "grad_norm": 0.9086295366287231, + "learning_rate": 4.9480968361962835e-05, + "loss": 0.4631, + "num_input_tokens_seen": 3978272, + "step": 2600 + }, + { + "epoch": 8.024729520865533, + "eval_loss": 0.5373055338859558, + "eval_runtime": 6.2726, + "eval_samples_per_second": 91.668, + "eval_steps_per_second": 22.957, + "num_input_tokens_seen": 3978272, + "step": 2600 + }, + { + "epoch": 8.040185471406492, + "grad_norm": 0.8320391774177551, + "learning_rate": 4.9478976369809305e-05, + "loss": 0.4455, + "num_input_tokens_seen": 3985248, + "step": 2605 + }, + { + "epoch": 8.055641421947449, + "grad_norm": 0.8961117267608643, + "learning_rate": 4.947698060268999e-05, + "loss": 0.5223, + "num_input_tokens_seen": 3993056, + "step": 2610 + }, + { + "epoch": 8.071097372488408, + "grad_norm": 0.8682878017425537, + "learning_rate": 4.9474981060912665e-05, + "loss": 0.4864, + "num_input_tokens_seen": 4000256, + "step": 2615 + }, + { + "epoch": 8.086553323029367, + "grad_norm": 0.8327137231826782, + "learning_rate": 4.94729777447857e-05, + "loss": 0.4474, + "num_input_tokens_seen": 4007712, + "step": 2620 + }, + { + "epoch": 8.102009273570324, + "grad_norm": 0.9982582926750183, + "learning_rate": 4.947097065461801e-05, + "loss": 0.5656, + "num_input_tokens_seen": 4015648, + "step": 2625 + }, + { + "epoch": 8.117465224111283, + "grad_norm": 0.4656861424446106, + "learning_rate": 4.9468959790719125e-05, + "loss": 0.5002, + "num_input_tokens_seen": 4024064, + "step": 2630 + }, + { + "epoch": 8.13292117465224, + "grad_norm": 0.837123692035675, + "learning_rate": 4.9466945153399146e-05, + "loss": 0.6231, + "num_input_tokens_seen": 4031680, + "step": 2635 + }, + { + "epoch": 8.1483771251932, + "grad_norm": 1.1280498504638672, + "learning_rate": 4.9464926742968755e-05, + "loss": 0.4887, + "num_input_tokens_seen": 4039456, + "step": 2640 + }, + { + "epoch": 8.163833075734157, + "grad_norm": 0.4815214276313782, + "learning_rate": 4.946290455973921e-05, + "loss": 0.4427, + "num_input_tokens_seen": 4046752, + "step": 2645 + }, + { + "epoch": 8.179289026275116, + "grad_norm": 0.7195155024528503, + "learning_rate": 4.9460878604022365e-05, + "loss": 0.4031, + "num_input_tokens_seen": 4054368, + "step": 2650 + }, + { + "epoch": 8.194744976816073, + "grad_norm": 0.5902311205863953, + "learning_rate": 4.945884887613065e-05, + "loss": 0.4437, + "num_input_tokens_seen": 4062592, + "step": 2655 + }, + { + "epoch": 8.210200927357032, + "grad_norm": 0.7555180191993713, + "learning_rate": 4.9456815376377055e-05, + "loss": 0.576, + "num_input_tokens_seen": 4069984, + "step": 2660 + }, + { + "epoch": 8.225656877897991, + "grad_norm": 0.5630970597267151, + "learning_rate": 4.9454778105075195e-05, + "loss": 0.4186, + "num_input_tokens_seen": 4078208, + "step": 2665 + }, + { + "epoch": 8.241112828438949, + "grad_norm": 1.2206480503082275, + "learning_rate": 4.945273706253924e-05, + "loss": 0.5552, + "num_input_tokens_seen": 4085984, + "step": 2670 + }, + { + "epoch": 8.256568778979908, + "grad_norm": 0.6231361031532288, + "learning_rate": 4.9450692249083925e-05, + "loss": 0.4494, + "num_input_tokens_seen": 4093760, + "step": 2675 + }, + { + "epoch": 8.272024729520865, + "grad_norm": 0.48145022988319397, + "learning_rate": 4.9448643665024605e-05, + "loss": 0.4544, + "num_input_tokens_seen": 4101152, + "step": 2680 + }, + { + "epoch": 8.287480680061824, + "grad_norm": 1.0135653018951416, + "learning_rate": 4.944659131067719e-05, + "loss": 0.5416, + "num_input_tokens_seen": 4108928, + "step": 2685 + }, + { + "epoch": 8.302936630602781, + "grad_norm": 0.5270571112632751, + "learning_rate": 4.944453518635818e-05, + "loss": 0.5277, + "num_input_tokens_seen": 4116672, + "step": 2690 + }, + { + "epoch": 8.31839258114374, + "grad_norm": 0.7772669792175293, + "learning_rate": 4.944247529238465e-05, + "loss": 0.5359, + "num_input_tokens_seen": 4124000, + "step": 2695 + }, + { + "epoch": 8.333848531684698, + "grad_norm": 0.6262822151184082, + "learning_rate": 4.944041162907427e-05, + "loss": 0.4783, + "num_input_tokens_seen": 4131616, + "step": 2700 + }, + { + "epoch": 8.349304482225657, + "grad_norm": 0.6000958681106567, + "learning_rate": 4.943834419674529e-05, + "loss": 0.4277, + "num_input_tokens_seen": 4139424, + "step": 2705 + }, + { + "epoch": 8.364760432766616, + "grad_norm": 0.43749067187309265, + "learning_rate": 4.9436272995716506e-05, + "loss": 0.4237, + "num_input_tokens_seen": 4147040, + "step": 2710 + }, + { + "epoch": 8.380216383307573, + "grad_norm": 0.628383219242096, + "learning_rate": 4.943419802630735e-05, + "loss": 0.4611, + "num_input_tokens_seen": 4155296, + "step": 2715 + }, + { + "epoch": 8.395672333848532, + "grad_norm": 0.5246738195419312, + "learning_rate": 4.94321192888378e-05, + "loss": 0.4847, + "num_input_tokens_seen": 4162464, + "step": 2720 + }, + { + "epoch": 8.41112828438949, + "grad_norm": 0.6150352358818054, + "learning_rate": 4.943003678362842e-05, + "loss": 0.4855, + "num_input_tokens_seen": 4169984, + "step": 2725 + }, + { + "epoch": 8.426584234930449, + "grad_norm": 0.5502138137817383, + "learning_rate": 4.942795051100036e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4177568, + "step": 2730 + }, + { + "epoch": 8.442040185471406, + "grad_norm": 0.5635000467300415, + "learning_rate": 4.942586047127536e-05, + "loss": 0.4605, + "num_input_tokens_seen": 4185408, + "step": 2735 + }, + { + "epoch": 8.457496136012365, + "grad_norm": 0.9356651902198792, + "learning_rate": 4.942376666477571e-05, + "loss": 0.4896, + "num_input_tokens_seen": 4193152, + "step": 2740 + }, + { + "epoch": 8.472952086553324, + "grad_norm": 0.5444778203964233, + "learning_rate": 4.9421669091824304e-05, + "loss": 0.524, + "num_input_tokens_seen": 4200608, + "step": 2745 + }, + { + "epoch": 8.488408037094281, + "grad_norm": 0.6566770672798157, + "learning_rate": 4.9419567752744634e-05, + "loss": 0.5332, + "num_input_tokens_seen": 4208256, + "step": 2750 + }, + { + "epoch": 8.50386398763524, + "grad_norm": 0.8476648926734924, + "learning_rate": 4.941746264786074e-05, + "loss": 0.528, + "num_input_tokens_seen": 4216288, + "step": 2755 + }, + { + "epoch": 8.519319938176197, + "grad_norm": 0.6035640835762024, + "learning_rate": 4.9415353777497254e-05, + "loss": 0.4708, + "num_input_tokens_seen": 4224288, + "step": 2760 + }, + { + "epoch": 8.534775888717157, + "grad_norm": 0.8335319757461548, + "learning_rate": 4.9413241141979394e-05, + "loss": 0.5101, + "num_input_tokens_seen": 4231648, + "step": 2765 + }, + { + "epoch": 8.550231839258114, + "grad_norm": 1.11898672580719, + "learning_rate": 4.9411124741632956e-05, + "loss": 0.5264, + "num_input_tokens_seen": 4239200, + "step": 2770 + }, + { + "epoch": 8.565687789799073, + "grad_norm": 0.8184955716133118, + "learning_rate": 4.940900457678431e-05, + "loss": 0.4164, + "num_input_tokens_seen": 4246944, + "step": 2775 + }, + { + "epoch": 8.58114374034003, + "grad_norm": 0.7829414010047913, + "learning_rate": 4.9406880647760425e-05, + "loss": 0.4107, + "num_input_tokens_seen": 4254528, + "step": 2780 + }, + { + "epoch": 8.59659969088099, + "grad_norm": 0.6965505480766296, + "learning_rate": 4.9404752954888824e-05, + "loss": 0.3786, + "num_input_tokens_seen": 4262368, + "step": 2785 + }, + { + "epoch": 8.612055641421948, + "grad_norm": 0.8783209919929504, + "learning_rate": 4.940262149849762e-05, + "loss": 0.4438, + "num_input_tokens_seen": 4270560, + "step": 2790 + }, + { + "epoch": 8.627511591962906, + "grad_norm": 0.6555018424987793, + "learning_rate": 4.9400486278915526e-05, + "loss": 0.4822, + "num_input_tokens_seen": 4278368, + "step": 2795 + }, + { + "epoch": 8.642967542503865, + "grad_norm": 0.7295514345169067, + "learning_rate": 4.939834729647181e-05, + "loss": 0.4895, + "num_input_tokens_seen": 4285856, + "step": 2800 + }, + { + "epoch": 8.642967542503865, + "eval_loss": 0.5145593285560608, + "eval_runtime": 6.2961, + "eval_samples_per_second": 91.326, + "eval_steps_per_second": 22.871, + "num_input_tokens_seen": 4285856, + "step": 2800 + }, + { + "epoch": 8.658423493044822, + "grad_norm": 0.6169438362121582, + "learning_rate": 4.9396204551496326e-05, + "loss": 0.5051, + "num_input_tokens_seen": 4293440, + "step": 2805 + }, + { + "epoch": 8.673879443585781, + "grad_norm": 0.8093663454055786, + "learning_rate": 4.939405804431952e-05, + "loss": 0.5196, + "num_input_tokens_seen": 4301312, + "step": 2810 + }, + { + "epoch": 8.689335394126738, + "grad_norm": 0.5951443314552307, + "learning_rate": 4.9391907775272414e-05, + "loss": 0.4177, + "num_input_tokens_seen": 4308640, + "step": 2815 + }, + { + "epoch": 8.704791344667697, + "grad_norm": 0.722361147403717, + "learning_rate": 4.9389753744686604e-05, + "loss": 0.5023, + "num_input_tokens_seen": 4316032, + "step": 2820 + }, + { + "epoch": 8.720247295208654, + "grad_norm": 0.9429264664649963, + "learning_rate": 4.938759595289426e-05, + "loss": 0.5717, + "num_input_tokens_seen": 4323840, + "step": 2825 + }, + { + "epoch": 8.735703245749614, + "grad_norm": 0.9859187006950378, + "learning_rate": 4.938543440022815e-05, + "loss": 0.4276, + "num_input_tokens_seen": 4331200, + "step": 2830 + }, + { + "epoch": 8.751159196290573, + "grad_norm": 0.8896517157554626, + "learning_rate": 4.938326908702161e-05, + "loss": 0.5189, + "num_input_tokens_seen": 4338336, + "step": 2835 + }, + { + "epoch": 8.76661514683153, + "grad_norm": 0.5959006547927856, + "learning_rate": 4.9381100013608554e-05, + "loss": 0.3816, + "num_input_tokens_seen": 4345888, + "step": 2840 + }, + { + "epoch": 8.782071097372489, + "grad_norm": 0.5382099151611328, + "learning_rate": 4.9378927180323485e-05, + "loss": 0.4231, + "num_input_tokens_seen": 4353088, + "step": 2845 + }, + { + "epoch": 8.797527047913446, + "grad_norm": 0.624610960483551, + "learning_rate": 4.937675058750148e-05, + "loss": 0.3244, + "num_input_tokens_seen": 4360544, + "step": 2850 + }, + { + "epoch": 8.812982998454405, + "grad_norm": 0.9415662884712219, + "learning_rate": 4.937457023547819e-05, + "loss": 0.4624, + "num_input_tokens_seen": 4367968, + "step": 2855 + }, + { + "epoch": 8.828438948995363, + "grad_norm": 0.5607941746711731, + "learning_rate": 4.9372386124589876e-05, + "loss": 0.3685, + "num_input_tokens_seen": 4375008, + "step": 2860 + }, + { + "epoch": 8.843894899536322, + "grad_norm": 0.803534209728241, + "learning_rate": 4.937019825517333e-05, + "loss": 0.4362, + "num_input_tokens_seen": 4382528, + "step": 2865 + }, + { + "epoch": 8.85935085007728, + "grad_norm": 0.6634131073951721, + "learning_rate": 4.9368006627565954e-05, + "loss": 0.5566, + "num_input_tokens_seen": 4389984, + "step": 2870 + }, + { + "epoch": 8.874806800618238, + "grad_norm": 0.6094066500663757, + "learning_rate": 4.936581124210573e-05, + "loss": 0.4222, + "num_input_tokens_seen": 4397536, + "step": 2875 + }, + { + "epoch": 8.890262751159197, + "grad_norm": 0.47012898325920105, + "learning_rate": 4.9363612099131216e-05, + "loss": 0.4074, + "num_input_tokens_seen": 4404736, + "step": 2880 + }, + { + "epoch": 8.905718701700154, + "grad_norm": 0.5830479264259338, + "learning_rate": 4.936140919898155e-05, + "loss": 0.4924, + "num_input_tokens_seen": 4412704, + "step": 2885 + }, + { + "epoch": 8.921174652241113, + "grad_norm": 0.6526751518249512, + "learning_rate": 4.9359202541996426e-05, + "loss": 0.4859, + "num_input_tokens_seen": 4420320, + "step": 2890 + }, + { + "epoch": 8.93663060278207, + "grad_norm": 0.591101884841919, + "learning_rate": 4.935699212851616e-05, + "loss": 0.5039, + "num_input_tokens_seen": 4428064, + "step": 2895 + }, + { + "epoch": 8.95208655332303, + "grad_norm": 0.5865139961242676, + "learning_rate": 4.935477795888162e-05, + "loss": 0.3828, + "num_input_tokens_seen": 4435616, + "step": 2900 + }, + { + "epoch": 8.967542503863987, + "grad_norm": 0.6650347113609314, + "learning_rate": 4.935256003343426e-05, + "loss": 0.469, + "num_input_tokens_seen": 4443584, + "step": 2905 + }, + { + "epoch": 8.982998454404946, + "grad_norm": 0.3873775601387024, + "learning_rate": 4.93503383525161e-05, + "loss": 0.3816, + "num_input_tokens_seen": 4451584, + "step": 2910 + }, + { + "epoch": 8.998454404945905, + "grad_norm": 0.5505792498588562, + "learning_rate": 4.934811291646977e-05, + "loss": 0.4528, + "num_input_tokens_seen": 4459680, + "step": 2915 + }, + { + "epoch": 9.012364760432767, + "grad_norm": 0.651421844959259, + "learning_rate": 4.934588372563845e-05, + "loss": 0.502, + "num_input_tokens_seen": 4466720, + "step": 2920 + }, + { + "epoch": 9.027820710973725, + "grad_norm": 0.6168310642242432, + "learning_rate": 4.93436507803659e-05, + "loss": 0.3666, + "num_input_tokens_seen": 4474208, + "step": 2925 + }, + { + "epoch": 9.043276661514684, + "grad_norm": 0.6104885935783386, + "learning_rate": 4.934141408099649e-05, + "loss": 0.4363, + "num_input_tokens_seen": 4482304, + "step": 2930 + }, + { + "epoch": 9.05873261205564, + "grad_norm": 0.443208247423172, + "learning_rate": 4.9339173627875135e-05, + "loss": 0.4735, + "num_input_tokens_seen": 4490368, + "step": 2935 + }, + { + "epoch": 9.0741885625966, + "grad_norm": 0.6859422326087952, + "learning_rate": 4.9336929421347335e-05, + "loss": 0.5452, + "num_input_tokens_seen": 4497888, + "step": 2940 + }, + { + "epoch": 9.089644513137557, + "grad_norm": 0.4261942505836487, + "learning_rate": 4.933468146175918e-05, + "loss": 0.4056, + "num_input_tokens_seen": 4505920, + "step": 2945 + }, + { + "epoch": 9.105100463678516, + "grad_norm": 0.9670826196670532, + "learning_rate": 4.933242974945734e-05, + "loss": 0.5769, + "num_input_tokens_seen": 4513472, + "step": 2950 + }, + { + "epoch": 9.120556414219475, + "grad_norm": 0.8964840769767761, + "learning_rate": 4.933017428478906e-05, + "loss": 0.4988, + "num_input_tokens_seen": 4521088, + "step": 2955 + }, + { + "epoch": 9.136012364760433, + "grad_norm": 0.5952701568603516, + "learning_rate": 4.932791506810214e-05, + "loss": 0.4516, + "num_input_tokens_seen": 4528384, + "step": 2960 + }, + { + "epoch": 9.151468315301392, + "grad_norm": 0.9695752263069153, + "learning_rate": 4.932565209974499e-05, + "loss": 0.4072, + "num_input_tokens_seen": 4535840, + "step": 2965 + }, + { + "epoch": 9.166924265842349, + "grad_norm": 0.5233954191207886, + "learning_rate": 4.93233853800666e-05, + "loss": 0.4036, + "num_input_tokens_seen": 4543392, + "step": 2970 + }, + { + "epoch": 9.182380216383308, + "grad_norm": 0.4182771146297455, + "learning_rate": 4.932111490941651e-05, + "loss": 0.3911, + "num_input_tokens_seen": 4550720, + "step": 2975 + }, + { + "epoch": 9.197836166924265, + "grad_norm": 0.8773304224014282, + "learning_rate": 4.9318840688144876e-05, + "loss": 0.3965, + "num_input_tokens_seen": 4557984, + "step": 2980 + }, + { + "epoch": 9.213292117465224, + "grad_norm": 0.6439784169197083, + "learning_rate": 4.9316562716602387e-05, + "loss": 0.4168, + "num_input_tokens_seen": 4565408, + "step": 2985 + }, + { + "epoch": 9.228748068006182, + "grad_norm": 0.7124764919281006, + "learning_rate": 4.9314280995140346e-05, + "loss": 0.3881, + "num_input_tokens_seen": 4572960, + "step": 2990 + }, + { + "epoch": 9.24420401854714, + "grad_norm": 1.0819069147109985, + "learning_rate": 4.931199552411063e-05, + "loss": 0.4659, + "num_input_tokens_seen": 4580736, + "step": 2995 + }, + { + "epoch": 9.2596599690881, + "grad_norm": 0.6013742685317993, + "learning_rate": 4.930970630386568e-05, + "loss": 0.4762, + "num_input_tokens_seen": 4588608, + "step": 3000 + }, + { + "epoch": 9.2596599690881, + "eval_loss": 0.497619092464447, + "eval_runtime": 6.281, + "eval_samples_per_second": 91.546, + "eval_steps_per_second": 22.926, + "num_input_tokens_seen": 4588608, + "step": 3000 + }, + { + "epoch": 9.275115919629057, + "grad_norm": 0.9025372266769409, + "learning_rate": 4.9307413334758524e-05, + "loss": 0.4917, + "num_input_tokens_seen": 4596000, + "step": 3005 + }, + { + "epoch": 9.290571870170016, + "grad_norm": 0.7342119216918945, + "learning_rate": 4.930511661714276e-05, + "loss": 0.3558, + "num_input_tokens_seen": 4603648, + "step": 3010 + }, + { + "epoch": 9.306027820710973, + "grad_norm": 0.49968400597572327, + "learning_rate": 4.9302816151372576e-05, + "loss": 0.4241, + "num_input_tokens_seen": 4611360, + "step": 3015 + }, + { + "epoch": 9.321483771251932, + "grad_norm": 0.5490708947181702, + "learning_rate": 4.930051193780274e-05, + "loss": 0.4944, + "num_input_tokens_seen": 4618656, + "step": 3020 + }, + { + "epoch": 9.33693972179289, + "grad_norm": 0.6694681644439697, + "learning_rate": 4.929820397678858e-05, + "loss": 0.4445, + "num_input_tokens_seen": 4626336, + "step": 3025 + }, + { + "epoch": 9.352395672333849, + "grad_norm": 0.985956072807312, + "learning_rate": 4.9295892268686015e-05, + "loss": 0.4936, + "num_input_tokens_seen": 4634368, + "step": 3030 + }, + { + "epoch": 9.367851622874808, + "grad_norm": 0.46300846338272095, + "learning_rate": 4.9293576813851536e-05, + "loss": 0.3711, + "num_input_tokens_seen": 4642112, + "step": 3035 + }, + { + "epoch": 9.383307573415765, + "grad_norm": 0.5803442597389221, + "learning_rate": 4.929125761264223e-05, + "loss": 0.4682, + "num_input_tokens_seen": 4649568, + "step": 3040 + }, + { + "epoch": 9.398763523956724, + "grad_norm": 0.5930024981498718, + "learning_rate": 4.928893466541573e-05, + "loss": 0.5131, + "num_input_tokens_seen": 4657376, + "step": 3045 + }, + { + "epoch": 9.414219474497681, + "grad_norm": 0.6702645421028137, + "learning_rate": 4.928660797253027e-05, + "loss": 0.4008, + "num_input_tokens_seen": 4665280, + "step": 3050 + }, + { + "epoch": 9.42967542503864, + "grad_norm": 0.5467454195022583, + "learning_rate": 4.928427753434467e-05, + "loss": 0.4465, + "num_input_tokens_seen": 4673184, + "step": 3055 + }, + { + "epoch": 9.445131375579598, + "grad_norm": 0.610544741153717, + "learning_rate": 4.9281943351218286e-05, + "loss": 0.3864, + "num_input_tokens_seen": 4680576, + "step": 3060 + }, + { + "epoch": 9.460587326120557, + "grad_norm": 0.6584251523017883, + "learning_rate": 4.9279605423511095e-05, + "loss": 0.4199, + "num_input_tokens_seen": 4688768, + "step": 3065 + }, + { + "epoch": 9.476043276661514, + "grad_norm": 0.7417483925819397, + "learning_rate": 4.927726375158363e-05, + "loss": 0.4313, + "num_input_tokens_seen": 4696480, + "step": 3070 + }, + { + "epoch": 9.491499227202473, + "grad_norm": 0.53084397315979, + "learning_rate": 4.9274918335797004e-05, + "loss": 0.4736, + "num_input_tokens_seen": 4703776, + "step": 3075 + }, + { + "epoch": 9.506955177743432, + "grad_norm": 0.8630356192588806, + "learning_rate": 4.927256917651292e-05, + "loss": 0.3628, + "num_input_tokens_seen": 4711424, + "step": 3080 + }, + { + "epoch": 9.52241112828439, + "grad_norm": 0.4162272810935974, + "learning_rate": 4.927021627409364e-05, + "loss": 0.418, + "num_input_tokens_seen": 4718752, + "step": 3085 + }, + { + "epoch": 9.537867078825348, + "grad_norm": 0.6981655955314636, + "learning_rate": 4.9267859628902005e-05, + "loss": 0.4151, + "num_input_tokens_seen": 4726528, + "step": 3090 + }, + { + "epoch": 9.553323029366306, + "grad_norm": 0.4660645127296448, + "learning_rate": 4.9265499241301454e-05, + "loss": 0.377, + "num_input_tokens_seen": 4734048, + "step": 3095 + }, + { + "epoch": 9.568778979907265, + "grad_norm": 0.7534204721450806, + "learning_rate": 4.926313511165598e-05, + "loss": 0.4029, + "num_input_tokens_seen": 4741472, + "step": 3100 + }, + { + "epoch": 9.584234930448222, + "grad_norm": 0.740565836429596, + "learning_rate": 4.926076724033016e-05, + "loss": 0.4133, + "num_input_tokens_seen": 4749376, + "step": 3105 + }, + { + "epoch": 9.599690880989181, + "grad_norm": 0.4975444972515106, + "learning_rate": 4.9258395627689146e-05, + "loss": 0.4826, + "num_input_tokens_seen": 4756480, + "step": 3110 + }, + { + "epoch": 9.615146831530138, + "grad_norm": 0.5461101531982422, + "learning_rate": 4.925602027409868e-05, + "loss": 0.5018, + "num_input_tokens_seen": 4764416, + "step": 3115 + }, + { + "epoch": 9.630602782071097, + "grad_norm": 0.7129260897636414, + "learning_rate": 4.925364117992507e-05, + "loss": 0.4668, + "num_input_tokens_seen": 4772160, + "step": 3120 + }, + { + "epoch": 9.646058732612056, + "grad_norm": 0.6370190382003784, + "learning_rate": 4.92512583455352e-05, + "loss": 0.3886, + "num_input_tokens_seen": 4779616, + "step": 3125 + }, + { + "epoch": 9.661514683153014, + "grad_norm": 0.4388972222805023, + "learning_rate": 4.9248871771296536e-05, + "loss": 0.3622, + "num_input_tokens_seen": 4786944, + "step": 3130 + }, + { + "epoch": 9.676970633693973, + "grad_norm": 0.7102662324905396, + "learning_rate": 4.924648145757711e-05, + "loss": 0.4174, + "num_input_tokens_seen": 4794624, + "step": 3135 + }, + { + "epoch": 9.69242658423493, + "grad_norm": 0.5481711626052856, + "learning_rate": 4.924408740474554e-05, + "loss": 0.4391, + "num_input_tokens_seen": 4802752, + "step": 3140 + }, + { + "epoch": 9.707882534775889, + "grad_norm": 0.8782704472541809, + "learning_rate": 4.924168961317103e-05, + "loss": 0.483, + "num_input_tokens_seen": 4810432, + "step": 3145 + }, + { + "epoch": 9.723338485316846, + "grad_norm": 0.563685417175293, + "learning_rate": 4.9239288083223334e-05, + "loss": 0.4657, + "num_input_tokens_seen": 4818144, + "step": 3150 + }, + { + "epoch": 9.738794435857805, + "grad_norm": 0.5786684155464172, + "learning_rate": 4.9236882815272803e-05, + "loss": 0.379, + "num_input_tokens_seen": 4825280, + "step": 3155 + }, + { + "epoch": 9.754250386398763, + "grad_norm": 0.6950434446334839, + "learning_rate": 4.9234473809690365e-05, + "loss": 0.3419, + "num_input_tokens_seen": 4833536, + "step": 3160 + }, + { + "epoch": 9.769706336939722, + "grad_norm": 0.7877092361450195, + "learning_rate": 4.923206106684752e-05, + "loss": 0.4535, + "num_input_tokens_seen": 4841088, + "step": 3165 + }, + { + "epoch": 9.78516228748068, + "grad_norm": 0.7708442211151123, + "learning_rate": 4.922964458711634e-05, + "loss": 0.6137, + "num_input_tokens_seen": 4848608, + "step": 3170 + }, + { + "epoch": 9.800618238021638, + "grad_norm": 0.6472955942153931, + "learning_rate": 4.9227224370869474e-05, + "loss": 0.4706, + "num_input_tokens_seen": 4856128, + "step": 3175 + }, + { + "epoch": 9.816074188562597, + "grad_norm": 0.8640816807746887, + "learning_rate": 4.9224800418480155e-05, + "loss": 0.466, + "num_input_tokens_seen": 4863488, + "step": 3180 + }, + { + "epoch": 9.831530139103554, + "grad_norm": 0.6959113478660583, + "learning_rate": 4.9222372730322176e-05, + "loss": 0.4782, + "num_input_tokens_seen": 4871392, + "step": 3185 + }, + { + "epoch": 9.846986089644513, + "grad_norm": 0.5501073002815247, + "learning_rate": 4.921994130676993e-05, + "loss": 0.4511, + "num_input_tokens_seen": 4879360, + "step": 3190 + }, + { + "epoch": 9.86244204018547, + "grad_norm": 1.0510506629943848, + "learning_rate": 4.9217506148198366e-05, + "loss": 0.4371, + "num_input_tokens_seen": 4886976, + "step": 3195 + }, + { + "epoch": 9.87789799072643, + "grad_norm": 0.38321369886398315, + "learning_rate": 4.921506725498302e-05, + "loss": 0.4262, + "num_input_tokens_seen": 4894432, + "step": 3200 + }, + { + "epoch": 9.87789799072643, + "eval_loss": 0.48274415731430054, + "eval_runtime": 6.3086, + "eval_samples_per_second": 91.146, + "eval_steps_per_second": 22.826, + "num_input_tokens_seen": 4894432, + "step": 3200 + }, + { + "epoch": 9.893353941267389, + "grad_norm": 1.0592955350875854, + "learning_rate": 4.9212624627499994e-05, + "loss": 0.4495, + "num_input_tokens_seen": 4902336, + "step": 3205 + }, + { + "epoch": 9.908809891808346, + "grad_norm": 0.5129683017730713, + "learning_rate": 4.921017826612597e-05, + "loss": 0.35, + "num_input_tokens_seen": 4910016, + "step": 3210 + }, + { + "epoch": 9.924265842349305, + "grad_norm": 0.42292264103889465, + "learning_rate": 4.9207728171238223e-05, + "loss": 0.4164, + "num_input_tokens_seen": 4917824, + "step": 3215 + }, + { + "epoch": 9.939721792890262, + "grad_norm": 0.5316721796989441, + "learning_rate": 4.920527434321458e-05, + "loss": 0.3528, + "num_input_tokens_seen": 4925472, + "step": 3220 + }, + { + "epoch": 9.955177743431221, + "grad_norm": 0.6407940983772278, + "learning_rate": 4.920281678243345e-05, + "loss": 0.4271, + "num_input_tokens_seen": 4933216, + "step": 3225 + }, + { + "epoch": 9.970633693972179, + "grad_norm": 0.6948913931846619, + "learning_rate": 4.920035548927381e-05, + "loss": 0.4361, + "num_input_tokens_seen": 4940928, + "step": 3230 + }, + { + "epoch": 9.986089644513138, + "grad_norm": 0.584884762763977, + "learning_rate": 4.919789046411525e-05, + "loss": 0.4307, + "num_input_tokens_seen": 4948128, + "step": 3235 + }, + { + "epoch": 10.0, + "grad_norm": 0.6706665754318237, + "learning_rate": 4.919542170733787e-05, + "loss": 0.4584, + "num_input_tokens_seen": 4954832, + "step": 3240 + }, + { + "epoch": 10.015455950540959, + "grad_norm": 0.68323814868927, + "learning_rate": 4.919294921932242e-05, + "loss": 0.4636, + "num_input_tokens_seen": 4962352, + "step": 3245 + }, + { + "epoch": 10.030911901081916, + "grad_norm": 0.6860997676849365, + "learning_rate": 4.919047300045016e-05, + "loss": 0.368, + "num_input_tokens_seen": 4970128, + "step": 3250 + }, + { + "epoch": 10.046367851622875, + "grad_norm": 0.5850483775138855, + "learning_rate": 4.918799305110299e-05, + "loss": 0.371, + "num_input_tokens_seen": 4977776, + "step": 3255 + }, + { + "epoch": 10.061823802163833, + "grad_norm": 0.5947558283805847, + "learning_rate": 4.918550937166331e-05, + "loss": 0.4306, + "num_input_tokens_seen": 4985200, + "step": 3260 + }, + { + "epoch": 10.077279752704792, + "grad_norm": 0.6309699416160583, + "learning_rate": 4.918302196251415e-05, + "loss": 0.358, + "num_input_tokens_seen": 4993360, + "step": 3265 + }, + { + "epoch": 10.092735703245749, + "grad_norm": 0.6469031572341919, + "learning_rate": 4.91805308240391e-05, + "loss": 0.3934, + "num_input_tokens_seen": 5000688, + "step": 3270 + }, + { + "epoch": 10.108191653786708, + "grad_norm": 0.6726474165916443, + "learning_rate": 4.9178035956622326e-05, + "loss": 0.3041, + "num_input_tokens_seen": 5008048, + "step": 3275 + }, + { + "epoch": 10.123647604327665, + "grad_norm": 0.4511040449142456, + "learning_rate": 4.917553736064857e-05, + "loss": 0.3519, + "num_input_tokens_seen": 5015920, + "step": 3280 + }, + { + "epoch": 10.139103554868624, + "grad_norm": 0.6808083653450012, + "learning_rate": 4.917303503650314e-05, + "loss": 0.4529, + "num_input_tokens_seen": 5023824, + "step": 3285 + }, + { + "epoch": 10.154559505409583, + "grad_norm": 0.5196136832237244, + "learning_rate": 4.917052898457194e-05, + "loss": 0.4132, + "num_input_tokens_seen": 5031568, + "step": 3290 + }, + { + "epoch": 10.17001545595054, + "grad_norm": 0.5827124714851379, + "learning_rate": 4.916801920524141e-05, + "loss": 0.4416, + "num_input_tokens_seen": 5039248, + "step": 3295 + }, + { + "epoch": 10.1854714064915, + "grad_norm": 0.681298017501831, + "learning_rate": 4.916550569889862e-05, + "loss": 0.3553, + "num_input_tokens_seen": 5046512, + "step": 3300 + }, + { + "epoch": 10.200927357032457, + "grad_norm": 0.8677992224693298, + "learning_rate": 4.916298846593116e-05, + "loss": 0.5466, + "num_input_tokens_seen": 5054320, + "step": 3305 + }, + { + "epoch": 10.216383307573416, + "grad_norm": 0.6618756651878357, + "learning_rate": 4.916046750672722e-05, + "loss": 0.4457, + "num_input_tokens_seen": 5062800, + "step": 3310 + }, + { + "epoch": 10.231839258114373, + "grad_norm": 1.1299867630004883, + "learning_rate": 4.915794282167559e-05, + "loss": 0.4239, + "num_input_tokens_seen": 5070160, + "step": 3315 + }, + { + "epoch": 10.247295208655332, + "grad_norm": 0.7422680854797363, + "learning_rate": 4.915541441116558e-05, + "loss": 0.5182, + "num_input_tokens_seen": 5077840, + "step": 3320 + }, + { + "epoch": 10.262751159196291, + "grad_norm": 0.5662881135940552, + "learning_rate": 4.915288227558711e-05, + "loss": 0.3835, + "num_input_tokens_seen": 5085968, + "step": 3325 + }, + { + "epoch": 10.278207109737249, + "grad_norm": 0.7844827771186829, + "learning_rate": 4.915034641533066e-05, + "loss": 0.3604, + "num_input_tokens_seen": 5093488, + "step": 3330 + }, + { + "epoch": 10.293663060278208, + "grad_norm": 0.6909022927284241, + "learning_rate": 4.914780683078731e-05, + "loss": 0.4153, + "num_input_tokens_seen": 5100976, + "step": 3335 + }, + { + "epoch": 10.309119010819165, + "grad_norm": 0.6693107485771179, + "learning_rate": 4.9145263522348695e-05, + "loss": 0.4084, + "num_input_tokens_seen": 5108560, + "step": 3340 + }, + { + "epoch": 10.324574961360124, + "grad_norm": 0.4900054335594177, + "learning_rate": 4.9142716490407e-05, + "loss": 0.4295, + "num_input_tokens_seen": 5116336, + "step": 3345 + }, + { + "epoch": 10.340030911901081, + "grad_norm": 0.7537118196487427, + "learning_rate": 4.914016573535504e-05, + "loss": 0.4716, + "num_input_tokens_seen": 5124176, + "step": 3350 + }, + { + "epoch": 10.35548686244204, + "grad_norm": 0.49470898509025574, + "learning_rate": 4.9137611257586154e-05, + "loss": 0.4036, + "num_input_tokens_seen": 5131440, + "step": 3355 + }, + { + "epoch": 10.370942812982998, + "grad_norm": 0.8018416166305542, + "learning_rate": 4.9135053057494274e-05, + "loss": 0.411, + "num_input_tokens_seen": 5139024, + "step": 3360 + }, + { + "epoch": 10.386398763523957, + "grad_norm": 0.4879326820373535, + "learning_rate": 4.913249113547392e-05, + "loss": 0.3361, + "num_input_tokens_seen": 5146960, + "step": 3365 + }, + { + "epoch": 10.401854714064916, + "grad_norm": 0.4659757614135742, + "learning_rate": 4.912992549192016e-05, + "loss": 0.3565, + "num_input_tokens_seen": 5155056, + "step": 3370 + }, + { + "epoch": 10.417310664605873, + "grad_norm": 0.5485870242118835, + "learning_rate": 4.9127356127228665e-05, + "loss": 0.3874, + "num_input_tokens_seen": 5162288, + "step": 3375 + }, + { + "epoch": 10.432766615146832, + "grad_norm": 0.8584051132202148, + "learning_rate": 4.912478304179564e-05, + "loss": 0.4367, + "num_input_tokens_seen": 5170256, + "step": 3380 + }, + { + "epoch": 10.44822256568779, + "grad_norm": 0.42583656311035156, + "learning_rate": 4.9122206236017896e-05, + "loss": 0.4332, + "num_input_tokens_seen": 5178224, + "step": 3385 + }, + { + "epoch": 10.463678516228748, + "grad_norm": 0.4427963197231293, + "learning_rate": 4.911962571029282e-05, + "loss": 0.4349, + "num_input_tokens_seen": 5185456, + "step": 3390 + }, + { + "epoch": 10.479134466769706, + "grad_norm": 0.7411807775497437, + "learning_rate": 4.9117041465018353e-05, + "loss": 0.5243, + "num_input_tokens_seen": 5192976, + "step": 3395 + }, + { + "epoch": 10.494590417310665, + "grad_norm": 0.6595512628555298, + "learning_rate": 4.911445350059302e-05, + "loss": 0.4203, + "num_input_tokens_seen": 5200528, + "step": 3400 + }, + { + "epoch": 10.494590417310665, + "eval_loss": 0.47119367122650146, + "eval_runtime": 6.3, + "eval_samples_per_second": 91.27, + "eval_steps_per_second": 22.857, + "num_input_tokens_seen": 5200528, + "step": 3400 + }, + { + "epoch": 10.510046367851622, + "grad_norm": 0.7810150384902954, + "learning_rate": 4.9111861817415905e-05, + "loss": 0.392, + "num_input_tokens_seen": 5208528, + "step": 3405 + }, + { + "epoch": 10.525502318392581, + "grad_norm": 1.0842161178588867, + "learning_rate": 4.91092664158867e-05, + "loss": 0.4366, + "num_input_tokens_seen": 5215696, + "step": 3410 + }, + { + "epoch": 10.54095826893354, + "grad_norm": 0.8062556385993958, + "learning_rate": 4.910666729640563e-05, + "loss": 0.4629, + "num_input_tokens_seen": 5223024, + "step": 3415 + }, + { + "epoch": 10.556414219474497, + "grad_norm": 0.566119372844696, + "learning_rate": 4.910406445937353e-05, + "loss": 0.5249, + "num_input_tokens_seen": 5230800, + "step": 3420 + }, + { + "epoch": 10.571870170015456, + "grad_norm": 0.8134851455688477, + "learning_rate": 4.9101457905191774e-05, + "loss": 0.3653, + "num_input_tokens_seen": 5238192, + "step": 3425 + }, + { + "epoch": 10.587326120556414, + "grad_norm": 0.9220712780952454, + "learning_rate": 4.909884763426233e-05, + "loss": 0.3894, + "num_input_tokens_seen": 5246032, + "step": 3430 + }, + { + "epoch": 10.602782071097373, + "grad_norm": 0.6193529963493347, + "learning_rate": 4.9096233646987736e-05, + "loss": 0.4247, + "num_input_tokens_seen": 5253264, + "step": 3435 + }, + { + "epoch": 10.61823802163833, + "grad_norm": 0.568263828754425, + "learning_rate": 4.9093615943771104e-05, + "loss": 0.3965, + "num_input_tokens_seen": 5260592, + "step": 3440 + }, + { + "epoch": 10.63369397217929, + "grad_norm": 0.5582303404808044, + "learning_rate": 4.909099452501611e-05, + "loss": 0.3538, + "num_input_tokens_seen": 5268080, + "step": 3445 + }, + { + "epoch": 10.649149922720248, + "grad_norm": 0.6478096842765808, + "learning_rate": 4.908836939112702e-05, + "loss": 0.4124, + "num_input_tokens_seen": 5275760, + "step": 3450 + }, + { + "epoch": 10.664605873261205, + "grad_norm": 0.7921094298362732, + "learning_rate": 4.908574054250865e-05, + "loss": 0.482, + "num_input_tokens_seen": 5282992, + "step": 3455 + }, + { + "epoch": 10.680061823802165, + "grad_norm": 0.6945898532867432, + "learning_rate": 4.9083107979566414e-05, + "loss": 0.3932, + "num_input_tokens_seen": 5290352, + "step": 3460 + }, + { + "epoch": 10.695517774343122, + "grad_norm": 0.8916829228401184, + "learning_rate": 4.908047170270628e-05, + "loss": 0.5047, + "num_input_tokens_seen": 5297584, + "step": 3465 + }, + { + "epoch": 10.71097372488408, + "grad_norm": 0.5181012749671936, + "learning_rate": 4.9077831712334784e-05, + "loss": 0.3324, + "num_input_tokens_seen": 5305200, + "step": 3470 + }, + { + "epoch": 10.726429675425038, + "grad_norm": 0.6405507326126099, + "learning_rate": 4.907518800885907e-05, + "loss": 0.397, + "num_input_tokens_seen": 5312976, + "step": 3475 + }, + { + "epoch": 10.741885625965997, + "grad_norm": 0.4341070353984833, + "learning_rate": 4.907254059268681e-05, + "loss": 0.4071, + "num_input_tokens_seen": 5320656, + "step": 3480 + }, + { + "epoch": 10.757341576506954, + "grad_norm": 0.3879861831665039, + "learning_rate": 4.906988946422628e-05, + "loss": 0.3757, + "num_input_tokens_seen": 5328304, + "step": 3485 + }, + { + "epoch": 10.772797527047913, + "grad_norm": 0.6093811988830566, + "learning_rate": 4.9067234623886315e-05, + "loss": 0.4161, + "num_input_tokens_seen": 5336304, + "step": 3490 + }, + { + "epoch": 10.788253477588873, + "grad_norm": 0.5157049298286438, + "learning_rate": 4.9064576072076316e-05, + "loss": 0.3873, + "num_input_tokens_seen": 5344208, + "step": 3495 + }, + { + "epoch": 10.80370942812983, + "grad_norm": 0.5533719062805176, + "learning_rate": 4.906191380920628e-05, + "loss": 0.4022, + "num_input_tokens_seen": 5352080, + "step": 3500 + }, + { + "epoch": 10.819165378670789, + "grad_norm": 0.7964470982551575, + "learning_rate": 4.905924783568675e-05, + "loss": 0.4479, + "num_input_tokens_seen": 5359696, + "step": 3505 + }, + { + "epoch": 10.834621329211746, + "grad_norm": 0.8753867149353027, + "learning_rate": 4.905657815192886e-05, + "loss": 0.4153, + "num_input_tokens_seen": 5367376, + "step": 3510 + }, + { + "epoch": 10.850077279752705, + "grad_norm": 0.4412286877632141, + "learning_rate": 4.90539047583443e-05, + "loss": 0.3825, + "num_input_tokens_seen": 5375888, + "step": 3515 + }, + { + "epoch": 10.865533230293662, + "grad_norm": 0.4895785450935364, + "learning_rate": 4.905122765534534e-05, + "loss": 0.4233, + "num_input_tokens_seen": 5383632, + "step": 3520 + }, + { + "epoch": 10.880989180834622, + "grad_norm": 0.5734339952468872, + "learning_rate": 4.9048546843344846e-05, + "loss": 0.4269, + "num_input_tokens_seen": 5390672, + "step": 3525 + }, + { + "epoch": 10.896445131375579, + "grad_norm": 0.9011109471321106, + "learning_rate": 4.9045862322756206e-05, + "loss": 0.4544, + "num_input_tokens_seen": 5399312, + "step": 3530 + }, + { + "epoch": 10.911901081916538, + "grad_norm": 0.8534422516822815, + "learning_rate": 4.904317409399342e-05, + "loss": 0.4316, + "num_input_tokens_seen": 5406608, + "step": 3535 + }, + { + "epoch": 10.927357032457497, + "grad_norm": 0.7153432965278625, + "learning_rate": 4.904048215747104e-05, + "loss": 0.4332, + "num_input_tokens_seen": 5414448, + "step": 3540 + }, + { + "epoch": 10.942812982998454, + "grad_norm": 0.691185712814331, + "learning_rate": 4.90377865136042e-05, + "loss": 0.4606, + "num_input_tokens_seen": 5422096, + "step": 3545 + }, + { + "epoch": 10.958268933539413, + "grad_norm": 0.5544580817222595, + "learning_rate": 4.90350871628086e-05, + "loss": 0.4388, + "num_input_tokens_seen": 5429456, + "step": 3550 + }, + { + "epoch": 10.97372488408037, + "grad_norm": 0.5177947878837585, + "learning_rate": 4.903238410550052e-05, + "loss": 0.3475, + "num_input_tokens_seen": 5436976, + "step": 3555 + }, + { + "epoch": 10.98918083462133, + "grad_norm": 0.6900402903556824, + "learning_rate": 4.90296773420968e-05, + "loss": 0.4559, + "num_input_tokens_seen": 5444592, + "step": 3560 + }, + { + "epoch": 11.003091190108192, + "grad_norm": 0.9334075450897217, + "learning_rate": 4.902696687301486e-05, + "loss": 0.4779, + "num_input_tokens_seen": 5451136, + "step": 3565 + }, + { + "epoch": 11.018547140649149, + "grad_norm": 0.7826884388923645, + "learning_rate": 4.902425269867268e-05, + "loss": 0.5073, + "num_input_tokens_seen": 5458848, + "step": 3570 + }, + { + "epoch": 11.034003091190108, + "grad_norm": 0.35748234391212463, + "learning_rate": 4.902153481948883e-05, + "loss": 0.3343, + "num_input_tokens_seen": 5466112, + "step": 3575 + }, + { + "epoch": 11.049459041731067, + "grad_norm": 0.44556519389152527, + "learning_rate": 4.901881323588244e-05, + "loss": 0.4166, + "num_input_tokens_seen": 5474272, + "step": 3580 + }, + { + "epoch": 11.064914992272024, + "grad_norm": 0.7604163885116577, + "learning_rate": 4.90160879482732e-05, + "loss": 0.5134, + "num_input_tokens_seen": 5481952, + "step": 3585 + }, + { + "epoch": 11.080370942812984, + "grad_norm": 0.5273403525352478, + "learning_rate": 4.9013358957081405e-05, + "loss": 0.3976, + "num_input_tokens_seen": 5489632, + "step": 3590 + }, + { + "epoch": 11.09582689335394, + "grad_norm": 0.8595371842384338, + "learning_rate": 4.901062626272789e-05, + "loss": 0.3688, + "num_input_tokens_seen": 5497344, + "step": 3595 + }, + { + "epoch": 11.1112828438949, + "grad_norm": 0.6076413989067078, + "learning_rate": 4.900788986563406e-05, + "loss": 0.3634, + "num_input_tokens_seen": 5504960, + "step": 3600 + }, + { + "epoch": 11.1112828438949, + "eval_loss": 0.46121877431869507, + "eval_runtime": 6.2921, + "eval_samples_per_second": 91.385, + "eval_steps_per_second": 22.886, + "num_input_tokens_seen": 5504960, + "step": 3600 + }, + { + "epoch": 11.126738794435857, + "grad_norm": 0.46772754192352295, + "learning_rate": 4.9005149766221915e-05, + "loss": 0.3911, + "num_input_tokens_seen": 5512416, + "step": 3605 + }, + { + "epoch": 11.142194744976816, + "grad_norm": 0.8573851585388184, + "learning_rate": 4.9002405964914e-05, + "loss": 0.4422, + "num_input_tokens_seen": 5519840, + "step": 3610 + }, + { + "epoch": 11.157650695517773, + "grad_norm": 0.633934497833252, + "learning_rate": 4.899965846213346e-05, + "loss": 0.4273, + "num_input_tokens_seen": 5528128, + "step": 3615 + }, + { + "epoch": 11.173106646058732, + "grad_norm": 0.46305492520332336, + "learning_rate": 4.899690725830399e-05, + "loss": 0.4658, + "num_input_tokens_seen": 5535680, + "step": 3620 + }, + { + "epoch": 11.188562596599692, + "grad_norm": 0.8783971071243286, + "learning_rate": 4.899415235384985e-05, + "loss": 0.4512, + "num_input_tokens_seen": 5543232, + "step": 3625 + }, + { + "epoch": 11.204018547140649, + "grad_norm": 0.565785825252533, + "learning_rate": 4.899139374919589e-05, + "loss": 0.4574, + "num_input_tokens_seen": 5551040, + "step": 3630 + }, + { + "epoch": 11.219474497681608, + "grad_norm": 0.8638449907302856, + "learning_rate": 4.898863144476752e-05, + "loss": 0.4458, + "num_input_tokens_seen": 5558336, + "step": 3635 + }, + { + "epoch": 11.234930448222565, + "grad_norm": 0.4552540183067322, + "learning_rate": 4.898586544099072e-05, + "loss": 0.3689, + "num_input_tokens_seen": 5565440, + "step": 3640 + }, + { + "epoch": 11.250386398763524, + "grad_norm": 0.6131048202514648, + "learning_rate": 4.898309573829204e-05, + "loss": 0.3287, + "num_input_tokens_seen": 5572960, + "step": 3645 + }, + { + "epoch": 11.265842349304481, + "grad_norm": 0.5881274938583374, + "learning_rate": 4.898032233709862e-05, + "loss": 0.3534, + "num_input_tokens_seen": 5580896, + "step": 3650 + }, + { + "epoch": 11.28129829984544, + "grad_norm": 0.5887120366096497, + "learning_rate": 4.8977545237838123e-05, + "loss": 0.3372, + "num_input_tokens_seen": 5588416, + "step": 3655 + }, + { + "epoch": 11.2967542503864, + "grad_norm": 0.5352303981781006, + "learning_rate": 4.8974764440938836e-05, + "loss": 0.3207, + "num_input_tokens_seen": 5595424, + "step": 3660 + }, + { + "epoch": 11.312210200927357, + "grad_norm": 0.5871121883392334, + "learning_rate": 4.897197994682959e-05, + "loss": 0.4179, + "num_input_tokens_seen": 5603264, + "step": 3665 + }, + { + "epoch": 11.327666151468316, + "grad_norm": 0.6706465482711792, + "learning_rate": 4.8969191755939786e-05, + "loss": 0.3688, + "num_input_tokens_seen": 5610816, + "step": 3670 + }, + { + "epoch": 11.343122102009273, + "grad_norm": 0.40392282605171204, + "learning_rate": 4.8966399868699396e-05, + "loss": 0.5029, + "num_input_tokens_seen": 5618208, + "step": 3675 + }, + { + "epoch": 11.358578052550232, + "grad_norm": 0.5846600532531738, + "learning_rate": 4.8963604285538965e-05, + "loss": 0.3888, + "num_input_tokens_seen": 5625312, + "step": 3680 + }, + { + "epoch": 11.37403400309119, + "grad_norm": 0.34512802958488464, + "learning_rate": 4.8960805006889604e-05, + "loss": 0.3579, + "num_input_tokens_seen": 5632896, + "step": 3685 + }, + { + "epoch": 11.389489953632149, + "grad_norm": 0.7547349333763123, + "learning_rate": 4.8958002033183004e-05, + "loss": 0.3385, + "num_input_tokens_seen": 5640992, + "step": 3690 + }, + { + "epoch": 11.404945904173106, + "grad_norm": 0.5673174262046814, + "learning_rate": 4.8955195364851414e-05, + "loss": 0.4204, + "num_input_tokens_seen": 5648768, + "step": 3695 + }, + { + "epoch": 11.420401854714065, + "grad_norm": 0.8104907870292664, + "learning_rate": 4.895238500232766e-05, + "loss": 0.3908, + "num_input_tokens_seen": 5656608, + "step": 3700 + }, + { + "epoch": 11.435857805255024, + "grad_norm": 0.794833779335022, + "learning_rate": 4.8949570946045143e-05, + "loss": 0.3173, + "num_input_tokens_seen": 5663616, + "step": 3705 + }, + { + "epoch": 11.451313755795981, + "grad_norm": 0.6649784445762634, + "learning_rate": 4.89467531964378e-05, + "loss": 0.4348, + "num_input_tokens_seen": 5671232, + "step": 3710 + }, + { + "epoch": 11.46676970633694, + "grad_norm": 0.6149303913116455, + "learning_rate": 4.894393175394019e-05, + "loss": 0.3624, + "num_input_tokens_seen": 5678720, + "step": 3715 + }, + { + "epoch": 11.482225656877898, + "grad_norm": 0.5443590879440308, + "learning_rate": 4.8941106618987406e-05, + "loss": 0.3963, + "num_input_tokens_seen": 5686080, + "step": 3720 + }, + { + "epoch": 11.497681607418857, + "grad_norm": 0.7773472666740417, + "learning_rate": 4.893827779201512e-05, + "loss": 0.4488, + "num_input_tokens_seen": 5693760, + "step": 3725 + }, + { + "epoch": 11.513137557959814, + "grad_norm": 0.7744869589805603, + "learning_rate": 4.893544527345957e-05, + "loss": 0.4507, + "num_input_tokens_seen": 5701088, + "step": 3730 + }, + { + "epoch": 11.528593508500773, + "grad_norm": 0.9120211601257324, + "learning_rate": 4.8932609063757563e-05, + "loss": 0.4751, + "num_input_tokens_seen": 5708576, + "step": 3735 + }, + { + "epoch": 11.54404945904173, + "grad_norm": 0.8515130281448364, + "learning_rate": 4.8929769163346484e-05, + "loss": 0.3971, + "num_input_tokens_seen": 5716960, + "step": 3740 + }, + { + "epoch": 11.55950540958269, + "grad_norm": 0.6055454015731812, + "learning_rate": 4.892692557266429e-05, + "loss": 0.4235, + "num_input_tokens_seen": 5724480, + "step": 3745 + }, + { + "epoch": 11.574961360123648, + "grad_norm": 0.6968094110488892, + "learning_rate": 4.8924078292149464e-05, + "loss": 0.5514, + "num_input_tokens_seen": 5732864, + "step": 3750 + }, + { + "epoch": 11.590417310664606, + "grad_norm": 0.6423795223236084, + "learning_rate": 4.892122732224114e-05, + "loss": 0.4352, + "num_input_tokens_seen": 5740448, + "step": 3755 + }, + { + "epoch": 11.605873261205565, + "grad_norm": 0.6659529805183411, + "learning_rate": 4.8918372663378944e-05, + "loss": 0.3562, + "num_input_tokens_seen": 5748064, + "step": 3760 + }, + { + "epoch": 11.621329211746522, + "grad_norm": 0.5665530562400818, + "learning_rate": 4.89155143160031e-05, + "loss": 0.3413, + "num_input_tokens_seen": 5756096, + "step": 3765 + }, + { + "epoch": 11.636785162287481, + "grad_norm": 0.48105931282043457, + "learning_rate": 4.891265228055441e-05, + "loss": 0.363, + "num_input_tokens_seen": 5763328, + "step": 3770 + }, + { + "epoch": 11.652241112828438, + "grad_norm": 0.8003988862037659, + "learning_rate": 4.890978655747424e-05, + "loss": 0.4025, + "num_input_tokens_seen": 5770976, + "step": 3775 + }, + { + "epoch": 11.667697063369397, + "grad_norm": 0.5620589256286621, + "learning_rate": 4.89069171472045e-05, + "loss": 0.3551, + "num_input_tokens_seen": 5778592, + "step": 3780 + }, + { + "epoch": 11.683153013910356, + "grad_norm": 0.6773874759674072, + "learning_rate": 4.890404405018772e-05, + "loss": 0.4302, + "num_input_tokens_seen": 5785920, + "step": 3785 + }, + { + "epoch": 11.698608964451314, + "grad_norm": 0.4330943822860718, + "learning_rate": 4.8901167266866934e-05, + "loss": 0.3939, + "num_input_tokens_seen": 5793696, + "step": 3790 + }, + { + "epoch": 11.714064914992273, + "grad_norm": 0.5635738968849182, + "learning_rate": 4.88982867976858e-05, + "loss": 0.3459, + "num_input_tokens_seen": 5801632, + "step": 3795 + }, + { + "epoch": 11.72952086553323, + "grad_norm": 0.46252962946891785, + "learning_rate": 4.889540264308852e-05, + "loss": 0.3689, + "num_input_tokens_seen": 5808800, + "step": 3800 + }, + { + "epoch": 11.72952086553323, + "eval_loss": 0.45051902532577515, + "eval_runtime": 6.3553, + "eval_samples_per_second": 90.476, + "eval_steps_per_second": 22.658, + "num_input_tokens_seen": 5808800, + "step": 3800 + }, + { + "epoch": 11.744976816074189, + "grad_norm": 0.4691567122936249, + "learning_rate": 4.889251480351986e-05, + "loss": 0.3885, + "num_input_tokens_seen": 5816256, + "step": 3805 + }, + { + "epoch": 11.760432766615146, + "grad_norm": 0.5146664977073669, + "learning_rate": 4.888962327942517e-05, + "loss": 0.341, + "num_input_tokens_seen": 5824224, + "step": 3810 + }, + { + "epoch": 11.775888717156105, + "grad_norm": 1.0315916538238525, + "learning_rate": 4.8886728071250356e-05, + "loss": 0.3594, + "num_input_tokens_seen": 5831968, + "step": 3815 + }, + { + "epoch": 11.791344667697063, + "grad_norm": 0.5395402908325195, + "learning_rate": 4.8883829179441884e-05, + "loss": 0.3655, + "num_input_tokens_seen": 5839488, + "step": 3820 + }, + { + "epoch": 11.806800618238022, + "grad_norm": 0.5381171703338623, + "learning_rate": 4.888092660444682e-05, + "loss": 0.3716, + "num_input_tokens_seen": 5846976, + "step": 3825 + }, + { + "epoch": 11.82225656877898, + "grad_norm": 0.5914226174354553, + "learning_rate": 4.887802034671276e-05, + "loss": 0.4149, + "num_input_tokens_seen": 5855168, + "step": 3830 + }, + { + "epoch": 11.837712519319938, + "grad_norm": 0.716601550579071, + "learning_rate": 4.88751104066879e-05, + "loss": 0.395, + "num_input_tokens_seen": 5862848, + "step": 3835 + }, + { + "epoch": 11.853168469860897, + "grad_norm": 0.7826097011566162, + "learning_rate": 4.887219678482098e-05, + "loss": 0.3966, + "num_input_tokens_seen": 5870432, + "step": 3840 + }, + { + "epoch": 11.868624420401854, + "grad_norm": 0.7037624716758728, + "learning_rate": 4.8869279481561316e-05, + "loss": 0.347, + "num_input_tokens_seen": 5878368, + "step": 3845 + }, + { + "epoch": 11.884080370942813, + "grad_norm": 0.7576178312301636, + "learning_rate": 4.88663584973588e-05, + "loss": 0.369, + "num_input_tokens_seen": 5885952, + "step": 3850 + }, + { + "epoch": 11.89953632148377, + "grad_norm": 1.085850477218628, + "learning_rate": 4.8863433832663874e-05, + "loss": 0.3914, + "num_input_tokens_seen": 5893664, + "step": 3855 + }, + { + "epoch": 11.91499227202473, + "grad_norm": 0.3421981930732727, + "learning_rate": 4.886050548792757e-05, + "loss": 0.3996, + "num_input_tokens_seen": 5901088, + "step": 3860 + }, + { + "epoch": 11.930448222565687, + "grad_norm": 0.7437856793403625, + "learning_rate": 4.8857573463601465e-05, + "loss": 0.3611, + "num_input_tokens_seen": 5908288, + "step": 3865 + }, + { + "epoch": 11.945904173106646, + "grad_norm": 0.6698380708694458, + "learning_rate": 4.885463776013772e-05, + "loss": 0.3978, + "num_input_tokens_seen": 5916160, + "step": 3870 + }, + { + "epoch": 11.961360123647605, + "grad_norm": 0.653241753578186, + "learning_rate": 4.8851698377989056e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5923744, + "step": 3875 + }, + { + "epoch": 11.976816074188562, + "grad_norm": 0.6145855784416199, + "learning_rate": 4.884875531760876e-05, + "loss": 0.3977, + "num_input_tokens_seen": 5931008, + "step": 3880 + }, + { + "epoch": 11.992272024729521, + "grad_norm": 0.45196717977523804, + "learning_rate": 4.88458085794507e-05, + "loss": 0.352, + "num_input_tokens_seen": 5938528, + "step": 3885 + }, + { + "epoch": 12.006182380216384, + "grad_norm": 1.3026444911956787, + "learning_rate": 4.884285816396929e-05, + "loss": 0.4294, + "num_input_tokens_seen": 5945360, + "step": 3890 + }, + { + "epoch": 12.021638330757341, + "grad_norm": 0.5575951933860779, + "learning_rate": 4.8839904071619526e-05, + "loss": 0.4175, + "num_input_tokens_seen": 5952752, + "step": 3895 + }, + { + "epoch": 12.0370942812983, + "grad_norm": 0.5202766060829163, + "learning_rate": 4.8836946302856955e-05, + "loss": 0.4087, + "num_input_tokens_seen": 5960336, + "step": 3900 + }, + { + "epoch": 12.052550231839259, + "grad_norm": 0.5107921361923218, + "learning_rate": 4.8833984858137715e-05, + "loss": 0.3584, + "num_input_tokens_seen": 5968016, + "step": 3905 + }, + { + "epoch": 12.068006182380216, + "grad_norm": 0.5772713422775269, + "learning_rate": 4.8831019737918494e-05, + "loss": 0.3195, + "num_input_tokens_seen": 5975472, + "step": 3910 + }, + { + "epoch": 12.083462132921175, + "grad_norm": 0.5582951307296753, + "learning_rate": 4.882805094265655e-05, + "loss": 0.4337, + "num_input_tokens_seen": 5983536, + "step": 3915 + }, + { + "epoch": 12.098918083462133, + "grad_norm": 0.4299047887325287, + "learning_rate": 4.8825078472809706e-05, + "loss": 0.3463, + "num_input_tokens_seen": 5991344, + "step": 3920 + }, + { + "epoch": 12.114374034003092, + "grad_norm": 0.5004845261573792, + "learning_rate": 4.882210232883635e-05, + "loss": 0.3749, + "num_input_tokens_seen": 5998864, + "step": 3925 + }, + { + "epoch": 12.129829984544049, + "grad_norm": 0.42182016372680664, + "learning_rate": 4.881912251119546e-05, + "loss": 0.3416, + "num_input_tokens_seen": 6006416, + "step": 3930 + }, + { + "epoch": 12.145285935085008, + "grad_norm": 0.7163271307945251, + "learning_rate": 4.881613902034654e-05, + "loss": 0.4273, + "num_input_tokens_seen": 6014512, + "step": 3935 + }, + { + "epoch": 12.160741885625965, + "grad_norm": 0.5021644234657288, + "learning_rate": 4.88131518567497e-05, + "loss": 0.4289, + "num_input_tokens_seen": 6022224, + "step": 3940 + }, + { + "epoch": 12.176197836166924, + "grad_norm": 0.8600055575370789, + "learning_rate": 4.881016102086558e-05, + "loss": 0.4116, + "num_input_tokens_seen": 6029936, + "step": 3945 + }, + { + "epoch": 12.191653786707883, + "grad_norm": 0.5912600159645081, + "learning_rate": 4.8807166513155425e-05, + "loss": 0.3578, + "num_input_tokens_seen": 6037424, + "step": 3950 + }, + { + "epoch": 12.20710973724884, + "grad_norm": 0.48267263174057007, + "learning_rate": 4.8804168334081004e-05, + "loss": 0.3235, + "num_input_tokens_seen": 6045200, + "step": 3955 + }, + { + "epoch": 12.2225656877898, + "grad_norm": 0.616167426109314, + "learning_rate": 4.880116648410468e-05, + "loss": 0.4091, + "num_input_tokens_seen": 6052752, + "step": 3960 + }, + { + "epoch": 12.238021638330757, + "grad_norm": 0.7051360607147217, + "learning_rate": 4.879816096368939e-05, + "loss": 0.325, + "num_input_tokens_seen": 6060848, + "step": 3965 + }, + { + "epoch": 12.253477588871716, + "grad_norm": 0.6484952569007874, + "learning_rate": 4.879515177329861e-05, + "loss": 0.5808, + "num_input_tokens_seen": 6068624, + "step": 3970 + }, + { + "epoch": 12.268933539412673, + "grad_norm": 0.5434243679046631, + "learning_rate": 4.8792138913396394e-05, + "loss": 0.4061, + "num_input_tokens_seen": 6076432, + "step": 3975 + }, + { + "epoch": 12.284389489953632, + "grad_norm": 1.0527878999710083, + "learning_rate": 4.8789122384447374e-05, + "loss": 0.4329, + "num_input_tokens_seen": 6083824, + "step": 3980 + }, + { + "epoch": 12.29984544049459, + "grad_norm": 0.42255696654319763, + "learning_rate": 4.878610218691673e-05, + "loss": 0.3833, + "num_input_tokens_seen": 6091408, + "step": 3985 + }, + { + "epoch": 12.315301391035549, + "grad_norm": 0.48011061549186707, + "learning_rate": 4.87830783212702e-05, + "loss": 0.3808, + "num_input_tokens_seen": 6099440, + "step": 3990 + }, + { + "epoch": 12.330757341576508, + "grad_norm": 0.34644415974617004, + "learning_rate": 4.878005078797413e-05, + "loss": 0.324, + "num_input_tokens_seen": 6107184, + "step": 3995 + }, + { + "epoch": 12.346213292117465, + "grad_norm": 0.5895490050315857, + "learning_rate": 4.877701958749539e-05, + "loss": 0.3246, + "num_input_tokens_seen": 6114608, + "step": 4000 + }, + { + "epoch": 12.346213292117465, + "eval_loss": 0.4437834620475769, + "eval_runtime": 6.3239, + "eval_samples_per_second": 90.925, + "eval_steps_per_second": 22.771, + "num_input_tokens_seen": 6114608, + "step": 4000 + }, + { + "epoch": 12.361669242658424, + "grad_norm": 0.916710376739502, + "learning_rate": 4.877398472030142e-05, + "loss": 0.4258, + "num_input_tokens_seen": 6122352, + "step": 4005 + }, + { + "epoch": 12.377125193199381, + "grad_norm": 0.7787930369377136, + "learning_rate": 4.877094618686024e-05, + "loss": 0.4005, + "num_input_tokens_seen": 6130064, + "step": 4010 + }, + { + "epoch": 12.39258114374034, + "grad_norm": 0.4999927878379822, + "learning_rate": 4.876790398764045e-05, + "loss": 0.4119, + "num_input_tokens_seen": 6137776, + "step": 4015 + }, + { + "epoch": 12.408037094281298, + "grad_norm": 0.8937094211578369, + "learning_rate": 4.8764858123111167e-05, + "loss": 0.3524, + "num_input_tokens_seen": 6144912, + "step": 4020 + }, + { + "epoch": 12.423493044822257, + "grad_norm": 0.6014568209648132, + "learning_rate": 4.876180859374212e-05, + "loss": 0.4113, + "num_input_tokens_seen": 6153200, + "step": 4025 + }, + { + "epoch": 12.438948995363216, + "grad_norm": 0.45381084084510803, + "learning_rate": 4.875875540000357e-05, + "loss": 0.5142, + "num_input_tokens_seen": 6160656, + "step": 4030 + }, + { + "epoch": 12.454404945904173, + "grad_norm": 0.57855623960495, + "learning_rate": 4.8755698542366376e-05, + "loss": 0.3357, + "num_input_tokens_seen": 6168176, + "step": 4035 + }, + { + "epoch": 12.469860896445132, + "grad_norm": 0.5826019048690796, + "learning_rate": 4.875263802130193e-05, + "loss": 0.4297, + "num_input_tokens_seen": 6175632, + "step": 4040 + }, + { + "epoch": 12.48531684698609, + "grad_norm": 0.6593685150146484, + "learning_rate": 4.8749573837282207e-05, + "loss": 0.3854, + "num_input_tokens_seen": 6182864, + "step": 4045 + }, + { + "epoch": 12.500772797527048, + "grad_norm": 0.40891310572624207, + "learning_rate": 4.874650599077974e-05, + "loss": 0.3429, + "num_input_tokens_seen": 6190896, + "step": 4050 + }, + { + "epoch": 12.516228748068006, + "grad_norm": 0.49381452798843384, + "learning_rate": 4.874343448226764e-05, + "loss": 0.2989, + "num_input_tokens_seen": 6198288, + "step": 4055 + }, + { + "epoch": 12.531684698608965, + "grad_norm": 0.7372799515724182, + "learning_rate": 4.874035931221955e-05, + "loss": 0.4659, + "num_input_tokens_seen": 6205904, + "step": 4060 + }, + { + "epoch": 12.547140649149922, + "grad_norm": 0.896119236946106, + "learning_rate": 4.8737280481109724e-05, + "loss": 0.3754, + "num_input_tokens_seen": 6213168, + "step": 4065 + }, + { + "epoch": 12.562596599690881, + "grad_norm": 0.6315919160842896, + "learning_rate": 4.873419798941294e-05, + "loss": 0.3564, + "num_input_tokens_seen": 6220816, + "step": 4070 + }, + { + "epoch": 12.578052550231838, + "grad_norm": 0.6594139337539673, + "learning_rate": 4.873111183760458e-05, + "loss": 0.3768, + "num_input_tokens_seen": 6228464, + "step": 4075 + }, + { + "epoch": 12.593508500772797, + "grad_norm": 0.662477433681488, + "learning_rate": 4.8728022026160537e-05, + "loss": 0.4264, + "num_input_tokens_seen": 6235920, + "step": 4080 + }, + { + "epoch": 12.608964451313756, + "grad_norm": 0.48820650577545166, + "learning_rate": 4.872492855555732e-05, + "loss": 0.3591, + "num_input_tokens_seen": 6244080, + "step": 4085 + }, + { + "epoch": 12.624420401854714, + "grad_norm": 0.44621309638023376, + "learning_rate": 4.8721831426271956e-05, + "loss": 0.371, + "num_input_tokens_seen": 6251984, + "step": 4090 + }, + { + "epoch": 12.639876352395673, + "grad_norm": 0.6159834861755371, + "learning_rate": 4.87187306387821e-05, + "loss": 0.35, + "num_input_tokens_seen": 6260208, + "step": 4095 + }, + { + "epoch": 12.65533230293663, + "grad_norm": 0.7855387926101685, + "learning_rate": 4.87156261935659e-05, + "loss": 0.3495, + "num_input_tokens_seen": 6267312, + "step": 4100 + }, + { + "epoch": 12.670788253477589, + "grad_norm": 0.6076005101203918, + "learning_rate": 4.871251809110211e-05, + "loss": 0.3551, + "num_input_tokens_seen": 6274992, + "step": 4105 + }, + { + "epoch": 12.686244204018546, + "grad_norm": 0.5833804607391357, + "learning_rate": 4.8709406331870044e-05, + "loss": 0.4085, + "num_input_tokens_seen": 6282352, + "step": 4110 + }, + { + "epoch": 12.701700154559505, + "grad_norm": 0.8255681395530701, + "learning_rate": 4.8706290916349574e-05, + "loss": 0.3463, + "num_input_tokens_seen": 6289200, + "step": 4115 + }, + { + "epoch": 12.717156105100464, + "grad_norm": 1.286970853805542, + "learning_rate": 4.8703171845021134e-05, + "loss": 0.433, + "num_input_tokens_seen": 6296368, + "step": 4120 + }, + { + "epoch": 12.732612055641422, + "grad_norm": 0.5098376274108887, + "learning_rate": 4.870004911836572e-05, + "loss": 0.3111, + "num_input_tokens_seen": 6304240, + "step": 4125 + }, + { + "epoch": 12.74806800618238, + "grad_norm": 0.43340370059013367, + "learning_rate": 4.869692273686489e-05, + "loss": 0.3329, + "num_input_tokens_seen": 6311920, + "step": 4130 + }, + { + "epoch": 12.763523956723338, + "grad_norm": 0.6638621687889099, + "learning_rate": 4.869379270100079e-05, + "loss": 0.3092, + "num_input_tokens_seen": 6319504, + "step": 4135 + }, + { + "epoch": 12.778979907264297, + "grad_norm": 0.5254101753234863, + "learning_rate": 4.86906590112561e-05, + "loss": 0.3039, + "num_input_tokens_seen": 6327216, + "step": 4140 + }, + { + "epoch": 12.794435857805254, + "grad_norm": 0.7661398649215698, + "learning_rate": 4.8687521668114064e-05, + "loss": 0.3321, + "num_input_tokens_seen": 6334608, + "step": 4145 + }, + { + "epoch": 12.809891808346213, + "grad_norm": 0.5853785872459412, + "learning_rate": 4.868438067205853e-05, + "loss": 0.4277, + "num_input_tokens_seen": 6342512, + "step": 4150 + }, + { + "epoch": 12.825347758887172, + "grad_norm": 0.8266714215278625, + "learning_rate": 4.8681236023573844e-05, + "loss": 0.3282, + "num_input_tokens_seen": 6349744, + "step": 4155 + }, + { + "epoch": 12.84080370942813, + "grad_norm": 0.3610658347606659, + "learning_rate": 4.867808772314497e-05, + "loss": 0.2947, + "num_input_tokens_seen": 6357136, + "step": 4160 + }, + { + "epoch": 12.856259659969089, + "grad_norm": 0.5576721429824829, + "learning_rate": 4.867493577125741e-05, + "loss": 0.3434, + "num_input_tokens_seen": 6364400, + "step": 4165 + }, + { + "epoch": 12.871715610510046, + "grad_norm": 0.8305234909057617, + "learning_rate": 4.867178016839725e-05, + "loss": 0.4268, + "num_input_tokens_seen": 6372304, + "step": 4170 + }, + { + "epoch": 12.887171561051005, + "grad_norm": 0.7812581062316895, + "learning_rate": 4.8668620915051094e-05, + "loss": 0.4335, + "num_input_tokens_seen": 6380112, + "step": 4175 + }, + { + "epoch": 12.902627511591962, + "grad_norm": 1.4319462776184082, + "learning_rate": 4.866545801170616e-05, + "loss": 0.5775, + "num_input_tokens_seen": 6387952, + "step": 4180 + }, + { + "epoch": 12.918083462132921, + "grad_norm": 0.5792189836502075, + "learning_rate": 4.86622914588502e-05, + "loss": 0.3427, + "num_input_tokens_seen": 6396208, + "step": 4185 + }, + { + "epoch": 12.933539412673879, + "grad_norm": 0.4015393853187561, + "learning_rate": 4.865912125697154e-05, + "loss": 0.2897, + "num_input_tokens_seen": 6403920, + "step": 4190 + }, + { + "epoch": 12.948995363214838, + "grad_norm": 0.4793500304222107, + "learning_rate": 4.865594740655907e-05, + "loss": 0.4333, + "num_input_tokens_seen": 6411536, + "step": 4195 + }, + { + "epoch": 12.964451313755795, + "grad_norm": 0.7075718641281128, + "learning_rate": 4.865276990810222e-05, + "loss": 0.4121, + "num_input_tokens_seen": 6419376, + "step": 4200 + }, + { + "epoch": 12.964451313755795, + "eval_loss": 0.4351543188095093, + "eval_runtime": 6.3111, + "eval_samples_per_second": 91.109, + "eval_steps_per_second": 22.817, + "num_input_tokens_seen": 6419376, + "step": 4200 + }, + { + "epoch": 12.979907264296754, + "grad_norm": 0.48225119709968567, + "learning_rate": 4.8649588762091016e-05, + "loss": 0.338, + "num_input_tokens_seen": 6427344, + "step": 4205 + }, + { + "epoch": 12.995363214837713, + "grad_norm": 0.45899316668510437, + "learning_rate": 4.8646403969016016e-05, + "loss": 0.3801, + "num_input_tokens_seen": 6434736, + "step": 4210 + }, + { + "epoch": 13.009273570324575, + "grad_norm": 0.8491412401199341, + "learning_rate": 4.864321552936838e-05, + "loss": 0.4345, + "num_input_tokens_seen": 6441312, + "step": 4215 + }, + { + "epoch": 13.024729520865533, + "grad_norm": 0.8008528351783752, + "learning_rate": 4.864002344363978e-05, + "loss": 0.3692, + "num_input_tokens_seen": 6448928, + "step": 4220 + }, + { + "epoch": 13.040185471406492, + "grad_norm": 0.6886972784996033, + "learning_rate": 4.863682771232248e-05, + "loss": 0.3854, + "num_input_tokens_seen": 6456288, + "step": 4225 + }, + { + "epoch": 13.055641421947449, + "grad_norm": 0.6287875175476074, + "learning_rate": 4.8633628335909324e-05, + "loss": 0.4181, + "num_input_tokens_seen": 6464000, + "step": 4230 + }, + { + "epoch": 13.071097372488408, + "grad_norm": 0.570147693157196, + "learning_rate": 4.8630425314893676e-05, + "loss": 0.2879, + "num_input_tokens_seen": 6471136, + "step": 4235 + }, + { + "epoch": 13.086553323029367, + "grad_norm": 0.5587822794914246, + "learning_rate": 4.862721864976948e-05, + "loss": 0.421, + "num_input_tokens_seen": 6478272, + "step": 4240 + }, + { + "epoch": 13.102009273570324, + "grad_norm": 0.6576064825057983, + "learning_rate": 4.862400834103125e-05, + "loss": 0.434, + "num_input_tokens_seen": 6486272, + "step": 4245 + }, + { + "epoch": 13.117465224111283, + "grad_norm": 0.8187768459320068, + "learning_rate": 4.862079438917406e-05, + "loss": 0.348, + "num_input_tokens_seen": 6493888, + "step": 4250 + }, + { + "epoch": 13.13292117465224, + "grad_norm": 0.5865563750267029, + "learning_rate": 4.8617576794693536e-05, + "loss": 0.3202, + "num_input_tokens_seen": 6501536, + "step": 4255 + }, + { + "epoch": 13.1483771251932, + "grad_norm": 0.4604993462562561, + "learning_rate": 4.8614355558085875e-05, + "loss": 0.3811, + "num_input_tokens_seen": 6509056, + "step": 4260 + }, + { + "epoch": 13.163833075734157, + "grad_norm": 0.6985146403312683, + "learning_rate": 4.861113067984783e-05, + "loss": 0.348, + "num_input_tokens_seen": 6516544, + "step": 4265 + }, + { + "epoch": 13.179289026275116, + "grad_norm": 0.3955577313899994, + "learning_rate": 4.860790216047671e-05, + "loss": 0.3648, + "num_input_tokens_seen": 6524032, + "step": 4270 + }, + { + "epoch": 13.194744976816073, + "grad_norm": 0.8517614006996155, + "learning_rate": 4.860467000047041e-05, + "loss": 0.473, + "num_input_tokens_seen": 6532096, + "step": 4275 + }, + { + "epoch": 13.210200927357032, + "grad_norm": 1.085478663444519, + "learning_rate": 4.860143420032737e-05, + "loss": 0.3835, + "num_input_tokens_seen": 6539936, + "step": 4280 + }, + { + "epoch": 13.225656877897991, + "grad_norm": 0.6375027894973755, + "learning_rate": 4.859819476054657e-05, + "loss": 0.3277, + "num_input_tokens_seen": 6547904, + "step": 4285 + }, + { + "epoch": 13.241112828438949, + "grad_norm": 0.6106756925582886, + "learning_rate": 4.859495168162758e-05, + "loss": 0.3165, + "num_input_tokens_seen": 6555392, + "step": 4290 + }, + { + "epoch": 13.256568778979908, + "grad_norm": 0.5164577960968018, + "learning_rate": 4.859170496407054e-05, + "loss": 0.3206, + "num_input_tokens_seen": 6563360, + "step": 4295 + }, + { + "epoch": 13.272024729520865, + "grad_norm": 0.7822169661521912, + "learning_rate": 4.8588454608376114e-05, + "loss": 0.4155, + "num_input_tokens_seen": 6571328, + "step": 4300 + }, + { + "epoch": 13.287480680061824, + "grad_norm": 0.4809003174304962, + "learning_rate": 4.8585200615045555e-05, + "loss": 0.4857, + "num_input_tokens_seen": 6578912, + "step": 4305 + }, + { + "epoch": 13.302936630602781, + "grad_norm": 0.8143923282623291, + "learning_rate": 4.8581942984580674e-05, + "loss": 0.4172, + "num_input_tokens_seen": 6585984, + "step": 4310 + }, + { + "epoch": 13.31839258114374, + "grad_norm": 0.6718477010726929, + "learning_rate": 4.857868171748384e-05, + "loss": 0.327, + "num_input_tokens_seen": 6593504, + "step": 4315 + }, + { + "epoch": 13.333848531684698, + "grad_norm": 0.6040611267089844, + "learning_rate": 4.8575416814257976e-05, + "loss": 0.332, + "num_input_tokens_seen": 6601184, + "step": 4320 + }, + { + "epoch": 13.349304482225657, + "grad_norm": 0.5825080275535583, + "learning_rate": 4.857214827540657e-05, + "loss": 0.348, + "num_input_tokens_seen": 6608416, + "step": 4325 + }, + { + "epoch": 13.364760432766616, + "grad_norm": 0.4764118790626526, + "learning_rate": 4.856887610143367e-05, + "loss": 0.3401, + "num_input_tokens_seen": 6616160, + "step": 4330 + }, + { + "epoch": 13.380216383307573, + "grad_norm": 0.48460766673088074, + "learning_rate": 4.8565600292843896e-05, + "loss": 0.3119, + "num_input_tokens_seen": 6623968, + "step": 4335 + }, + { + "epoch": 13.395672333848532, + "grad_norm": 0.8821828961372375, + "learning_rate": 4.856232085014241e-05, + "loss": 0.4223, + "num_input_tokens_seen": 6632192, + "step": 4340 + }, + { + "epoch": 13.41112828438949, + "grad_norm": 0.8100664019584656, + "learning_rate": 4.855903777383495e-05, + "loss": 0.3912, + "num_input_tokens_seen": 6639936, + "step": 4345 + }, + { + "epoch": 13.426584234930449, + "grad_norm": 0.7385225892066956, + "learning_rate": 4.85557510644278e-05, + "loss": 0.3297, + "num_input_tokens_seen": 6647648, + "step": 4350 + }, + { + "epoch": 13.442040185471406, + "grad_norm": 0.4200953543186188, + "learning_rate": 4.855246072242782e-05, + "loss": 0.2938, + "num_input_tokens_seen": 6655776, + "step": 4355 + }, + { + "epoch": 13.457496136012365, + "grad_norm": 1.1121962070465088, + "learning_rate": 4.8549166748342414e-05, + "loss": 0.3669, + "num_input_tokens_seen": 6663168, + "step": 4360 + }, + { + "epoch": 13.472952086553324, + "grad_norm": 0.4434937536716461, + "learning_rate": 4.8545869142679556e-05, + "loss": 0.3193, + "num_input_tokens_seen": 6671008, + "step": 4365 + }, + { + "epoch": 13.488408037094281, + "grad_norm": 0.5528193712234497, + "learning_rate": 4.8542567905947776e-05, + "loss": 0.356, + "num_input_tokens_seen": 6678944, + "step": 4370 + }, + { + "epoch": 13.50386398763524, + "grad_norm": 1.0701195001602173, + "learning_rate": 4.853926303865618e-05, + "loss": 0.3935, + "num_input_tokens_seen": 6687360, + "step": 4375 + }, + { + "epoch": 13.519319938176197, + "grad_norm": 1.1309890747070312, + "learning_rate": 4.853595454131441e-05, + "loss": 0.4689, + "num_input_tokens_seen": 6695296, + "step": 4380 + }, + { + "epoch": 13.534775888717157, + "grad_norm": 0.5731208920478821, + "learning_rate": 4.8532642414432674e-05, + "loss": 0.3775, + "num_input_tokens_seen": 6702528, + "step": 4385 + }, + { + "epoch": 13.550231839258114, + "grad_norm": 0.5712190270423889, + "learning_rate": 4.8529326658521754e-05, + "loss": 0.3031, + "num_input_tokens_seen": 6710432, + "step": 4390 + }, + { + "epoch": 13.565687789799073, + "grad_norm": 0.37513467669487, + "learning_rate": 4.8526007274092965e-05, + "loss": 0.3922, + "num_input_tokens_seen": 6717920, + "step": 4395 + }, + { + "epoch": 13.58114374034003, + "grad_norm": 0.6719768047332764, + "learning_rate": 4.852268426165822e-05, + "loss": 0.377, + "num_input_tokens_seen": 6725664, + "step": 4400 + }, + { + "epoch": 13.58114374034003, + "eval_loss": 0.42903241515159607, + "eval_runtime": 6.3106, + "eval_samples_per_second": 91.116, + "eval_steps_per_second": 22.819, + "num_input_tokens_seen": 6725664, + "step": 4400 + }, + { + "epoch": 13.59659969088099, + "grad_norm": 0.7628097534179688, + "learning_rate": 4.851935762172995e-05, + "loss": 0.5285, + "num_input_tokens_seen": 6733376, + "step": 4405 + }, + { + "epoch": 13.612055641421948, + "grad_norm": 0.8471111059188843, + "learning_rate": 4.8516027354821175e-05, + "loss": 0.3264, + "num_input_tokens_seen": 6740864, + "step": 4410 + }, + { + "epoch": 13.627511591962906, + "grad_norm": 0.45323848724365234, + "learning_rate": 4.851269346144546e-05, + "loss": 0.3321, + "num_input_tokens_seen": 6748256, + "step": 4415 + }, + { + "epoch": 13.642967542503865, + "grad_norm": 0.3803878724575043, + "learning_rate": 4.850935594211693e-05, + "loss": 0.4327, + "num_input_tokens_seen": 6755936, + "step": 4420 + }, + { + "epoch": 13.658423493044822, + "grad_norm": 0.5063687562942505, + "learning_rate": 4.850601479735029e-05, + "loss": 0.3377, + "num_input_tokens_seen": 6763680, + "step": 4425 + }, + { + "epoch": 13.673879443585781, + "grad_norm": 0.5770256519317627, + "learning_rate": 4.850267002766076e-05, + "loss": 0.3711, + "num_input_tokens_seen": 6770976, + "step": 4430 + }, + { + "epoch": 13.689335394126738, + "grad_norm": 0.7485835552215576, + "learning_rate": 4.849932163356417e-05, + "loss": 0.3989, + "num_input_tokens_seen": 6778624, + "step": 4435 + }, + { + "epoch": 13.704791344667697, + "grad_norm": 0.722798764705658, + "learning_rate": 4.8495969615576864e-05, + "loss": 0.4107, + "num_input_tokens_seen": 6785728, + "step": 4440 + }, + { + "epoch": 13.720247295208654, + "grad_norm": 0.3963666558265686, + "learning_rate": 4.849261397421577e-05, + "loss": 0.3465, + "num_input_tokens_seen": 6793152, + "step": 4445 + }, + { + "epoch": 13.735703245749614, + "grad_norm": 0.6036388278007507, + "learning_rate": 4.848925470999839e-05, + "loss": 0.3698, + "num_input_tokens_seen": 6801600, + "step": 4450 + }, + { + "epoch": 13.751159196290573, + "grad_norm": 0.828738808631897, + "learning_rate": 4.848589182344273e-05, + "loss": 0.3913, + "num_input_tokens_seen": 6809120, + "step": 4455 + }, + { + "epoch": 13.76661514683153, + "grad_norm": 0.4490659534931183, + "learning_rate": 4.848252531506742e-05, + "loss": 0.3968, + "num_input_tokens_seen": 6816768, + "step": 4460 + }, + { + "epoch": 13.782071097372489, + "grad_norm": 0.7000062465667725, + "learning_rate": 4.847915518539161e-05, + "loss": 0.3215, + "num_input_tokens_seen": 6824640, + "step": 4465 + }, + { + "epoch": 13.797527047913446, + "grad_norm": 0.5503018498420715, + "learning_rate": 4.847578143493501e-05, + "loss": 0.3333, + "num_input_tokens_seen": 6832672, + "step": 4470 + }, + { + "epoch": 13.812982998454405, + "grad_norm": 0.47761672735214233, + "learning_rate": 4.847240406421789e-05, + "loss": 0.3692, + "num_input_tokens_seen": 6839904, + "step": 4475 + }, + { + "epoch": 13.828438948995363, + "grad_norm": 0.48844125866889954, + "learning_rate": 4.84690230737611e-05, + "loss": 0.3752, + "num_input_tokens_seen": 6848416, + "step": 4480 + }, + { + "epoch": 13.843894899536322, + "grad_norm": 0.4621097147464752, + "learning_rate": 4.846563846408602e-05, + "loss": 0.3078, + "num_input_tokens_seen": 6855872, + "step": 4485 + }, + { + "epoch": 13.85935085007728, + "grad_norm": 0.5132933259010315, + "learning_rate": 4.84622502357146e-05, + "loss": 0.2712, + "num_input_tokens_seen": 6863744, + "step": 4490 + }, + { + "epoch": 13.874806800618238, + "grad_norm": 0.7185845375061035, + "learning_rate": 4.8458858389169345e-05, + "loss": 0.3857, + "num_input_tokens_seen": 6871648, + "step": 4495 + }, + { + "epoch": 13.890262751159197, + "grad_norm": 1.0464918613433838, + "learning_rate": 4.8455462924973334e-05, + "loss": 0.3598, + "num_input_tokens_seen": 6879328, + "step": 4500 + }, + { + "epoch": 13.905718701700154, + "grad_norm": 0.556227445602417, + "learning_rate": 4.845206384365018e-05, + "loss": 0.3716, + "num_input_tokens_seen": 6886752, + "step": 4505 + }, + { + "epoch": 13.921174652241113, + "grad_norm": 0.6991230845451355, + "learning_rate": 4.844866114572405e-05, + "loss": 0.3774, + "num_input_tokens_seen": 6894272, + "step": 4510 + }, + { + "epoch": 13.93663060278207, + "grad_norm": 0.6234842538833618, + "learning_rate": 4.8445254831719706e-05, + "loss": 0.3667, + "num_input_tokens_seen": 6901984, + "step": 4515 + }, + { + "epoch": 13.95208655332303, + "grad_norm": 0.8052206635475159, + "learning_rate": 4.8441844902162434e-05, + "loss": 0.3869, + "num_input_tokens_seen": 6909632, + "step": 4520 + }, + { + "epoch": 13.967542503863987, + "grad_norm": 0.49763819575309753, + "learning_rate": 4.843843135757809e-05, + "loss": 0.3948, + "num_input_tokens_seen": 6916896, + "step": 4525 + }, + { + "epoch": 13.982998454404946, + "grad_norm": 0.6514707803726196, + "learning_rate": 4.843501419849308e-05, + "loss": 0.3134, + "num_input_tokens_seen": 6925152, + "step": 4530 + }, + { + "epoch": 13.998454404945905, + "grad_norm": 0.6811992526054382, + "learning_rate": 4.8431593425434386e-05, + "loss": 0.3878, + "num_input_tokens_seen": 6932640, + "step": 4535 + }, + { + "epoch": 14.012364760432767, + "grad_norm": 0.5586262941360474, + "learning_rate": 4.8428169038929526e-05, + "loss": 0.2793, + "num_input_tokens_seen": 6939360, + "step": 4540 + }, + { + "epoch": 14.027820710973725, + "grad_norm": 1.032075047492981, + "learning_rate": 4.8424741039506575e-05, + "loss": 0.3717, + "num_input_tokens_seen": 6947072, + "step": 4545 + }, + { + "epoch": 14.043276661514684, + "grad_norm": 0.6978206634521484, + "learning_rate": 4.842130942769419e-05, + "loss": 0.3292, + "num_input_tokens_seen": 6954848, + "step": 4550 + }, + { + "epoch": 14.05873261205564, + "grad_norm": 1.0376440286636353, + "learning_rate": 4.841787420402156e-05, + "loss": 0.3608, + "num_input_tokens_seen": 6962272, + "step": 4555 + }, + { + "epoch": 14.0741885625966, + "grad_norm": 0.8480003476142883, + "learning_rate": 4.841443536901844e-05, + "loss": 0.343, + "num_input_tokens_seen": 6969888, + "step": 4560 + }, + { + "epoch": 14.089644513137557, + "grad_norm": 0.6580856442451477, + "learning_rate": 4.841099292321514e-05, + "loss": 0.3536, + "num_input_tokens_seen": 6977664, + "step": 4565 + }, + { + "epoch": 14.105100463678516, + "grad_norm": 0.674606204032898, + "learning_rate": 4.8407546867142525e-05, + "loss": 0.4087, + "num_input_tokens_seen": 6985056, + "step": 4570 + }, + { + "epoch": 14.120556414219475, + "grad_norm": 0.8916165828704834, + "learning_rate": 4.840409720133203e-05, + "loss": 0.3765, + "num_input_tokens_seen": 6992928, + "step": 4575 + }, + { + "epoch": 14.136012364760433, + "grad_norm": 0.41151750087738037, + "learning_rate": 4.8400643926315634e-05, + "loss": 0.2965, + "num_input_tokens_seen": 7000320, + "step": 4580 + }, + { + "epoch": 14.151468315301392, + "grad_norm": 0.4623599946498871, + "learning_rate": 4.839718704262587e-05, + "loss": 0.3183, + "num_input_tokens_seen": 7008032, + "step": 4585 + }, + { + "epoch": 14.166924265842349, + "grad_norm": 0.6608055830001831, + "learning_rate": 4.839372655079585e-05, + "loss": 0.3878, + "num_input_tokens_seen": 7015328, + "step": 4590 + }, + { + "epoch": 14.182380216383308, + "grad_norm": 0.6807621121406555, + "learning_rate": 4.83902624513592e-05, + "loss": 0.4461, + "num_input_tokens_seen": 7022688, + "step": 4595 + }, + { + "epoch": 14.197836166924265, + "grad_norm": 0.7003764510154724, + "learning_rate": 4.838679474485014e-05, + "loss": 0.3873, + "num_input_tokens_seen": 7030208, + "step": 4600 + }, + { + "epoch": 14.197836166924265, + "eval_loss": 0.4208325147628784, + "eval_runtime": 6.314, + "eval_samples_per_second": 91.067, + "eval_steps_per_second": 22.806, + "num_input_tokens_seen": 7030208, + "step": 4600 + }, + { + "epoch": 14.213292117465224, + "grad_norm": 0.3635127544403076, + "learning_rate": 4.838332343180343e-05, + "loss": 0.4065, + "num_input_tokens_seen": 7037984, + "step": 4605 + }, + { + "epoch": 14.228748068006182, + "grad_norm": 0.5490813851356506, + "learning_rate": 4.83798485127544e-05, + "loss": 0.3262, + "num_input_tokens_seen": 7045184, + "step": 4610 + }, + { + "epoch": 14.24420401854714, + "grad_norm": 0.5098409056663513, + "learning_rate": 4.837636998823892e-05, + "loss": 0.2926, + "num_input_tokens_seen": 7052992, + "step": 4615 + }, + { + "epoch": 14.2596599690881, + "grad_norm": 0.8080095052719116, + "learning_rate": 4.8372887858793414e-05, + "loss": 0.4013, + "num_input_tokens_seen": 7060800, + "step": 4620 + }, + { + "epoch": 14.275115919629057, + "grad_norm": 0.9534701704978943, + "learning_rate": 4.836940212495489e-05, + "loss": 0.3696, + "num_input_tokens_seen": 7067872, + "step": 4625 + }, + { + "epoch": 14.290571870170016, + "grad_norm": 1.046134114265442, + "learning_rate": 4.836591278726087e-05, + "loss": 0.3221, + "num_input_tokens_seen": 7075328, + "step": 4630 + }, + { + "epoch": 14.306027820710973, + "grad_norm": 0.5735355019569397, + "learning_rate": 4.836241984624947e-05, + "loss": 0.3941, + "num_input_tokens_seen": 7083008, + "step": 4635 + }, + { + "epoch": 14.321483771251932, + "grad_norm": 0.4469137489795685, + "learning_rate": 4.8358923302459336e-05, + "loss": 0.359, + "num_input_tokens_seen": 7090688, + "step": 4640 + }, + { + "epoch": 14.33693972179289, + "grad_norm": 0.660449206829071, + "learning_rate": 4.835542315642968e-05, + "loss": 0.3116, + "num_input_tokens_seen": 7098304, + "step": 4645 + }, + { + "epoch": 14.352395672333849, + "grad_norm": 0.45807981491088867, + "learning_rate": 4.8351919408700274e-05, + "loss": 0.2849, + "num_input_tokens_seen": 7105760, + "step": 4650 + }, + { + "epoch": 14.367851622874808, + "grad_norm": 0.45920631289482117, + "learning_rate": 4.834841205981144e-05, + "loss": 0.318, + "num_input_tokens_seen": 7113024, + "step": 4655 + }, + { + "epoch": 14.383307573415765, + "grad_norm": 0.4988662600517273, + "learning_rate": 4.8344901110304054e-05, + "loss": 0.3639, + "num_input_tokens_seen": 7120672, + "step": 4660 + }, + { + "epoch": 14.398763523956724, + "grad_norm": 0.3752710223197937, + "learning_rate": 4.8341386560719534e-05, + "loss": 0.3774, + "num_input_tokens_seen": 7128384, + "step": 4665 + }, + { + "epoch": 14.414219474497681, + "grad_norm": 0.8198193907737732, + "learning_rate": 4.833786841159989e-05, + "loss": 0.3211, + "num_input_tokens_seen": 7136000, + "step": 4670 + }, + { + "epoch": 14.42967542503864, + "grad_norm": 1.2980599403381348, + "learning_rate": 4.833434666348765e-05, + "loss": 0.4402, + "num_input_tokens_seen": 7143552, + "step": 4675 + }, + { + "epoch": 14.445131375579598, + "grad_norm": 0.4690445363521576, + "learning_rate": 4.833082131692592e-05, + "loss": 0.3998, + "num_input_tokens_seen": 7151360, + "step": 4680 + }, + { + "epoch": 14.460587326120557, + "grad_norm": 0.8062753081321716, + "learning_rate": 4.832729237245835e-05, + "loss": 0.445, + "num_input_tokens_seen": 7158592, + "step": 4685 + }, + { + "epoch": 14.476043276661514, + "grad_norm": 0.5264611840248108, + "learning_rate": 4.8323759830629145e-05, + "loss": 0.2724, + "num_input_tokens_seen": 7166016, + "step": 4690 + }, + { + "epoch": 14.491499227202473, + "grad_norm": 0.5958806276321411, + "learning_rate": 4.8320223691983066e-05, + "loss": 0.3493, + "num_input_tokens_seen": 7173312, + "step": 4695 + }, + { + "epoch": 14.506955177743432, + "grad_norm": 0.5057716965675354, + "learning_rate": 4.831668395706544e-05, + "loss": 0.4039, + "num_input_tokens_seen": 7181312, + "step": 4700 + }, + { + "epoch": 14.52241112828439, + "grad_norm": 0.935779333114624, + "learning_rate": 4.8313140626422125e-05, + "loss": 0.4549, + "num_input_tokens_seen": 7188608, + "step": 4705 + }, + { + "epoch": 14.537867078825348, + "grad_norm": 0.38399091362953186, + "learning_rate": 4.830959370059956e-05, + "loss": 0.4065, + "num_input_tokens_seen": 7196384, + "step": 4710 + }, + { + "epoch": 14.553323029366306, + "grad_norm": 0.7083280086517334, + "learning_rate": 4.830604318014472e-05, + "loss": 0.4699, + "num_input_tokens_seen": 7204032, + "step": 4715 + }, + { + "epoch": 14.568778979907265, + "grad_norm": 0.7457036375999451, + "learning_rate": 4.830248906560514e-05, + "loss": 0.3045, + "num_input_tokens_seen": 7211872, + "step": 4720 + }, + { + "epoch": 14.584234930448222, + "grad_norm": 0.8161150813102722, + "learning_rate": 4.829893135752891e-05, + "loss": 0.3618, + "num_input_tokens_seen": 7219488, + "step": 4725 + }, + { + "epoch": 14.599690880989181, + "grad_norm": 0.4117393493652344, + "learning_rate": 4.829537005646466e-05, + "loss": 0.3184, + "num_input_tokens_seen": 7227648, + "step": 4730 + }, + { + "epoch": 14.615146831530138, + "grad_norm": 0.661454975605011, + "learning_rate": 4.8291805162961615e-05, + "loss": 0.3948, + "num_input_tokens_seen": 7235040, + "step": 4735 + }, + { + "epoch": 14.630602782071097, + "grad_norm": 0.3840634524822235, + "learning_rate": 4.82882366775695e-05, + "loss": 0.3173, + "num_input_tokens_seen": 7242592, + "step": 4740 + }, + { + "epoch": 14.646058732612056, + "grad_norm": 0.7308349013328552, + "learning_rate": 4.828466460083864e-05, + "loss": 0.3171, + "num_input_tokens_seen": 7249824, + "step": 4745 + }, + { + "epoch": 14.661514683153014, + "grad_norm": 0.667025089263916, + "learning_rate": 4.8281088933319877e-05, + "loss": 0.3096, + "num_input_tokens_seen": 7258176, + "step": 4750 + }, + { + "epoch": 14.676970633693973, + "grad_norm": 0.6800103783607483, + "learning_rate": 4.827750967556464e-05, + "loss": 0.4206, + "num_input_tokens_seen": 7266080, + "step": 4755 + }, + { + "epoch": 14.69242658423493, + "grad_norm": 0.3654455840587616, + "learning_rate": 4.827392682812488e-05, + "loss": 0.3433, + "num_input_tokens_seen": 7274112, + "step": 4760 + }, + { + "epoch": 14.707882534775889, + "grad_norm": 0.3719217777252197, + "learning_rate": 4.827034039155312e-05, + "loss": 0.3919, + "num_input_tokens_seen": 7282016, + "step": 4765 + }, + { + "epoch": 14.723338485316846, + "grad_norm": 0.5746392011642456, + "learning_rate": 4.8266750366402445e-05, + "loss": 0.4413, + "num_input_tokens_seen": 7289856, + "step": 4770 + }, + { + "epoch": 14.738794435857805, + "grad_norm": 0.5686441659927368, + "learning_rate": 4.8263156753226476e-05, + "loss": 0.298, + "num_input_tokens_seen": 7297184, + "step": 4775 + }, + { + "epoch": 14.754250386398763, + "grad_norm": 0.5973247289657593, + "learning_rate": 4.8259559552579394e-05, + "loss": 0.386, + "num_input_tokens_seen": 7304832, + "step": 4780 + }, + { + "epoch": 14.769706336939722, + "grad_norm": 0.5941442251205444, + "learning_rate": 4.825595876501593e-05, + "loss": 0.3419, + "num_input_tokens_seen": 7312640, + "step": 4785 + }, + { + "epoch": 14.78516228748068, + "grad_norm": 0.6477248668670654, + "learning_rate": 4.825235439109137e-05, + "loss": 0.3763, + "num_input_tokens_seen": 7319840, + "step": 4790 + }, + { + "epoch": 14.800618238021638, + "grad_norm": 0.6037468314170837, + "learning_rate": 4.824874643136156e-05, + "loss": 0.3142, + "num_input_tokens_seen": 7327968, + "step": 4795 + }, + { + "epoch": 14.816074188562597, + "grad_norm": 0.5528859496116638, + "learning_rate": 4.824513488638288e-05, + "loss": 0.4336, + "num_input_tokens_seen": 7335712, + "step": 4800 + }, + { + "epoch": 14.816074188562597, + "eval_loss": 0.4164484739303589, + "eval_runtime": 6.3203, + "eval_samples_per_second": 90.977, + "eval_steps_per_second": 22.784, + "num_input_tokens_seen": 7335712, + "step": 4800 + }, + { + "epoch": 14.831530139103554, + "grad_norm": 0.7496144771575928, + "learning_rate": 4.8241519756712293e-05, + "loss": 0.3591, + "num_input_tokens_seen": 7342880, + "step": 4805 + }, + { + "epoch": 14.846986089644513, + "grad_norm": 0.7338517308235168, + "learning_rate": 4.8237901042907285e-05, + "loss": 0.3363, + "num_input_tokens_seen": 7350880, + "step": 4810 + }, + { + "epoch": 14.86244204018547, + "grad_norm": 0.5532991886138916, + "learning_rate": 4.823427874552591e-05, + "loss": 0.3393, + "num_input_tokens_seen": 7359072, + "step": 4815 + }, + { + "epoch": 14.87789799072643, + "grad_norm": 0.6163798570632935, + "learning_rate": 4.823065286512677e-05, + "loss": 0.323, + "num_input_tokens_seen": 7366080, + "step": 4820 + }, + { + "epoch": 14.893353941267389, + "grad_norm": 0.8125919699668884, + "learning_rate": 4.8227023402269025e-05, + "loss": 0.3351, + "num_input_tokens_seen": 7374144, + "step": 4825 + }, + { + "epoch": 14.908809891808346, + "grad_norm": 0.5089489221572876, + "learning_rate": 4.822339035751239e-05, + "loss": 0.3212, + "num_input_tokens_seen": 7381888, + "step": 4830 + }, + { + "epoch": 14.924265842349305, + "grad_norm": 0.46935680508613586, + "learning_rate": 4.8219753731417104e-05, + "loss": 0.4116, + "num_input_tokens_seen": 7389120, + "step": 4835 + }, + { + "epoch": 14.939721792890262, + "grad_norm": 0.5320732593536377, + "learning_rate": 4.821611352454401e-05, + "loss": 0.3555, + "num_input_tokens_seen": 7397024, + "step": 4840 + }, + { + "epoch": 14.955177743431221, + "grad_norm": 0.3197724223136902, + "learning_rate": 4.8212469737454444e-05, + "loss": 0.3853, + "num_input_tokens_seen": 7405120, + "step": 4845 + }, + { + "epoch": 14.970633693972179, + "grad_norm": 0.7888110876083374, + "learning_rate": 4.820882237071035e-05, + "loss": 0.352, + "num_input_tokens_seen": 7412992, + "step": 4850 + }, + { + "epoch": 14.986089644513138, + "grad_norm": 0.5779101252555847, + "learning_rate": 4.820517142487417e-05, + "loss": 0.3515, + "num_input_tokens_seen": 7420192, + "step": 4855 + }, + { + "epoch": 15.0, + "grad_norm": 0.6741199493408203, + "learning_rate": 4.8201516900508956e-05, + "loss": 0.2977, + "num_input_tokens_seen": 7427344, + "step": 4860 + }, + { + "epoch": 15.015455950540959, + "grad_norm": 0.428605318069458, + "learning_rate": 4.819785879817827e-05, + "loss": 0.3176, + "num_input_tokens_seen": 7434800, + "step": 4865 + }, + { + "epoch": 15.030911901081916, + "grad_norm": 0.5415523648262024, + "learning_rate": 4.8194197118446226e-05, + "loss": 0.388, + "num_input_tokens_seen": 7442416, + "step": 4870 + }, + { + "epoch": 15.046367851622875, + "grad_norm": 0.3321598470211029, + "learning_rate": 4.819053186187752e-05, + "loss": 0.345, + "num_input_tokens_seen": 7449872, + "step": 4875 + }, + { + "epoch": 15.061823802163833, + "grad_norm": 0.7349294424057007, + "learning_rate": 4.818686302903736e-05, + "loss": 0.4596, + "num_input_tokens_seen": 7457456, + "step": 4880 + }, + { + "epoch": 15.077279752704792, + "grad_norm": 1.2989522218704224, + "learning_rate": 4.818319062049154e-05, + "loss": 0.3328, + "num_input_tokens_seen": 7465520, + "step": 4885 + }, + { + "epoch": 15.092735703245749, + "grad_norm": 0.475929856300354, + "learning_rate": 4.817951463680639e-05, + "loss": 0.3379, + "num_input_tokens_seen": 7473232, + "step": 4890 + }, + { + "epoch": 15.108191653786708, + "grad_norm": 0.382153183221817, + "learning_rate": 4.817583507854879e-05, + "loss": 0.4489, + "num_input_tokens_seen": 7480784, + "step": 4895 + }, + { + "epoch": 15.123647604327665, + "grad_norm": 0.5600056648254395, + "learning_rate": 4.817215194628617e-05, + "loss": 0.4209, + "num_input_tokens_seen": 7488560, + "step": 4900 + }, + { + "epoch": 15.139103554868624, + "grad_norm": 0.469318151473999, + "learning_rate": 4.816846524058653e-05, + "loss": 0.433, + "num_input_tokens_seen": 7496752, + "step": 4905 + }, + { + "epoch": 15.154559505409583, + "grad_norm": 0.6526662111282349, + "learning_rate": 4.816477496201839e-05, + "loss": 0.2921, + "num_input_tokens_seen": 7504880, + "step": 4910 + }, + { + "epoch": 15.17001545595054, + "grad_norm": 0.4023224711418152, + "learning_rate": 4.8161081111150845e-05, + "loss": 0.33, + "num_input_tokens_seen": 7513008, + "step": 4915 + }, + { + "epoch": 15.1854714064915, + "grad_norm": 0.5180260539054871, + "learning_rate": 4.815738368855354e-05, + "loss": 0.2871, + "num_input_tokens_seen": 7520688, + "step": 4920 + }, + { + "epoch": 15.200927357032457, + "grad_norm": 0.6153039336204529, + "learning_rate": 4.815368269479664e-05, + "loss": 0.3622, + "num_input_tokens_seen": 7528240, + "step": 4925 + }, + { + "epoch": 15.216383307573416, + "grad_norm": 0.4467327296733856, + "learning_rate": 4.814997813045092e-05, + "loss": 0.4306, + "num_input_tokens_seen": 7535920, + "step": 4930 + }, + { + "epoch": 15.231839258114373, + "grad_norm": 0.4932445287704468, + "learning_rate": 4.814626999608764e-05, + "loss": 0.3287, + "num_input_tokens_seen": 7543280, + "step": 4935 + }, + { + "epoch": 15.247295208655332, + "grad_norm": 0.6774265170097351, + "learning_rate": 4.814255829227865e-05, + "loss": 0.3621, + "num_input_tokens_seen": 7550608, + "step": 4940 + }, + { + "epoch": 15.262751159196291, + "grad_norm": 0.6160472631454468, + "learning_rate": 4.813884301959635e-05, + "loss": 0.4025, + "num_input_tokens_seen": 7558160, + "step": 4945 + }, + { + "epoch": 15.278207109737249, + "grad_norm": 0.5851579308509827, + "learning_rate": 4.813512417861368e-05, + "loss": 0.3944, + "num_input_tokens_seen": 7565776, + "step": 4950 + }, + { + "epoch": 15.293663060278208, + "grad_norm": 0.4240354597568512, + "learning_rate": 4.813140176990411e-05, + "loss": 0.3093, + "num_input_tokens_seen": 7573808, + "step": 4955 + }, + { + "epoch": 15.309119010819165, + "grad_norm": 0.4602854251861572, + "learning_rate": 4.8127675794041714e-05, + "loss": 0.3083, + "num_input_tokens_seen": 7581392, + "step": 4960 + }, + { + "epoch": 15.324574961360124, + "grad_norm": 0.5500034689903259, + "learning_rate": 4.812394625160107e-05, + "loss": 0.3274, + "num_input_tokens_seen": 7588848, + "step": 4965 + }, + { + "epoch": 15.340030911901081, + "grad_norm": 0.4719586968421936, + "learning_rate": 4.812021314315732e-05, + "loss": 0.3343, + "num_input_tokens_seen": 7595952, + "step": 4970 + }, + { + "epoch": 15.35548686244204, + "grad_norm": 0.5938678979873657, + "learning_rate": 4.811647646928616e-05, + "loss": 0.3931, + "num_input_tokens_seen": 7603152, + "step": 4975 + }, + { + "epoch": 15.370942812982998, + "grad_norm": 0.734429121017456, + "learning_rate": 4.8112736230563814e-05, + "loss": 0.3712, + "num_input_tokens_seen": 7610192, + "step": 4980 + }, + { + "epoch": 15.386398763523957, + "grad_norm": 0.955437958240509, + "learning_rate": 4.81089924275671e-05, + "loss": 0.3825, + "num_input_tokens_seen": 7617968, + "step": 4985 + }, + { + "epoch": 15.401854714064916, + "grad_norm": 0.500339686870575, + "learning_rate": 4.810524506087335e-05, + "loss": 0.3515, + "num_input_tokens_seen": 7625680, + "step": 4990 + }, + { + "epoch": 15.417310664605873, + "grad_norm": 0.646614134311676, + "learning_rate": 4.810149413106044e-05, + "loss": 0.3412, + "num_input_tokens_seen": 7633456, + "step": 4995 + }, + { + "epoch": 15.432766615146832, + "grad_norm": 0.5217337012290955, + "learning_rate": 4.809773963870684e-05, + "loss": 0.2933, + "num_input_tokens_seen": 7641232, + "step": 5000 + }, + { + "epoch": 15.432766615146832, + "eval_loss": 0.41046273708343506, + "eval_runtime": 6.3205, + "eval_samples_per_second": 90.973, + "eval_steps_per_second": 22.783, + "num_input_tokens_seen": 7641232, + "step": 5000 + }, + { + "epoch": 15.44822256568779, + "grad_norm": 0.6909058094024658, + "learning_rate": 4.809398158439151e-05, + "loss": 0.3016, + "num_input_tokens_seen": 7648816, + "step": 5005 + }, + { + "epoch": 15.463678516228748, + "grad_norm": 0.7310165166854858, + "learning_rate": 4.8090219968694005e-05, + "loss": 0.3364, + "num_input_tokens_seen": 7656432, + "step": 5010 + }, + { + "epoch": 15.479134466769706, + "grad_norm": 1.0543469190597534, + "learning_rate": 4.808645479219442e-05, + "loss": 0.4362, + "num_input_tokens_seen": 7664144, + "step": 5015 + }, + { + "epoch": 15.494590417310665, + "grad_norm": 0.9122295379638672, + "learning_rate": 4.8082686055473375e-05, + "loss": 0.3104, + "num_input_tokens_seen": 7671440, + "step": 5020 + }, + { + "epoch": 15.510046367851622, + "grad_norm": 0.8394597172737122, + "learning_rate": 4.8078913759112066e-05, + "loss": 0.3384, + "num_input_tokens_seen": 7679248, + "step": 5025 + }, + { + "epoch": 15.525502318392581, + "grad_norm": 0.45587974786758423, + "learning_rate": 4.807513790369223e-05, + "loss": 0.2988, + "num_input_tokens_seen": 7686320, + "step": 5030 + }, + { + "epoch": 15.54095826893354, + "grad_norm": 0.5135344862937927, + "learning_rate": 4.8071358489796145e-05, + "loss": 0.4084, + "num_input_tokens_seen": 7694672, + "step": 5035 + }, + { + "epoch": 15.556414219474497, + "grad_norm": 0.5084633231163025, + "learning_rate": 4.806757551800665e-05, + "loss": 0.3858, + "num_input_tokens_seen": 7702704, + "step": 5040 + }, + { + "epoch": 15.571870170015456, + "grad_norm": 0.5363686084747314, + "learning_rate": 4.806378898890713e-05, + "loss": 0.2953, + "num_input_tokens_seen": 7710032, + "step": 5045 + }, + { + "epoch": 15.587326120556414, + "grad_norm": 0.6747179627418518, + "learning_rate": 4.80599989030815e-05, + "loss": 0.304, + "num_input_tokens_seen": 7717424, + "step": 5050 + }, + { + "epoch": 15.602782071097373, + "grad_norm": 0.7786310911178589, + "learning_rate": 4.805620526111426e-05, + "loss": 0.4381, + "num_input_tokens_seen": 7724304, + "step": 5055 + }, + { + "epoch": 15.61823802163833, + "grad_norm": 0.29762277007102966, + "learning_rate": 4.805240806359042e-05, + "loss": 0.3016, + "num_input_tokens_seen": 7732176, + "step": 5060 + }, + { + "epoch": 15.63369397217929, + "grad_norm": 0.9445527791976929, + "learning_rate": 4.804860731109557e-05, + "loss": 0.3439, + "num_input_tokens_seen": 7739824, + "step": 5065 + }, + { + "epoch": 15.649149922720248, + "grad_norm": 1.2385755777359009, + "learning_rate": 4.804480300421581e-05, + "loss": 0.4107, + "num_input_tokens_seen": 7747984, + "step": 5070 + }, + { + "epoch": 15.664605873261205, + "grad_norm": 0.535657525062561, + "learning_rate": 4.804099514353784e-05, + "loss": 0.3494, + "num_input_tokens_seen": 7755216, + "step": 5075 + }, + { + "epoch": 15.680061823802165, + "grad_norm": 0.7190422415733337, + "learning_rate": 4.8037183729648867e-05, + "loss": 0.3841, + "num_input_tokens_seen": 7762416, + "step": 5080 + }, + { + "epoch": 15.695517774343122, + "grad_norm": 0.5360310077667236, + "learning_rate": 4.803336876313666e-05, + "loss": 0.3872, + "num_input_tokens_seen": 7769936, + "step": 5085 + }, + { + "epoch": 15.71097372488408, + "grad_norm": 0.6709827780723572, + "learning_rate": 4.802955024458953e-05, + "loss": 0.3256, + "num_input_tokens_seen": 7778480, + "step": 5090 + }, + { + "epoch": 15.726429675425038, + "grad_norm": 0.7580018639564514, + "learning_rate": 4.802572817459634e-05, + "loss": 0.3898, + "num_input_tokens_seen": 7786064, + "step": 5095 + }, + { + "epoch": 15.741885625965997, + "grad_norm": 0.47846949100494385, + "learning_rate": 4.802190255374651e-05, + "loss": 0.3962, + "num_input_tokens_seen": 7793904, + "step": 5100 + }, + { + "epoch": 15.757341576506954, + "grad_norm": 0.682629406452179, + "learning_rate": 4.801807338263e-05, + "loss": 0.2994, + "num_input_tokens_seen": 7802000, + "step": 5105 + }, + { + "epoch": 15.772797527047913, + "grad_norm": 0.4951744079589844, + "learning_rate": 4.8014240661837306e-05, + "loss": 0.3335, + "num_input_tokens_seen": 7809584, + "step": 5110 + }, + { + "epoch": 15.788253477588873, + "grad_norm": 0.4092724919319153, + "learning_rate": 4.80104043919595e-05, + "loss": 0.3313, + "num_input_tokens_seen": 7817392, + "step": 5115 + }, + { + "epoch": 15.80370942812983, + "grad_norm": 0.5080147385597229, + "learning_rate": 4.800656457358815e-05, + "loss": 0.3818, + "num_input_tokens_seen": 7824496, + "step": 5120 + }, + { + "epoch": 15.819165378670789, + "grad_norm": 0.5440807938575745, + "learning_rate": 4.800272120731544e-05, + "loss": 0.3287, + "num_input_tokens_seen": 7832528, + "step": 5125 + }, + { + "epoch": 15.834621329211746, + "grad_norm": 0.8563321828842163, + "learning_rate": 4.799887429373404e-05, + "loss": 0.2792, + "num_input_tokens_seen": 7840048, + "step": 5130 + }, + { + "epoch": 15.850077279752705, + "grad_norm": 0.5260058641433716, + "learning_rate": 4.79950238334372e-05, + "loss": 0.3385, + "num_input_tokens_seen": 7847632, + "step": 5135 + }, + { + "epoch": 15.865533230293662, + "grad_norm": 0.47615575790405273, + "learning_rate": 4.799116982701872e-05, + "loss": 0.2997, + "num_input_tokens_seen": 7855120, + "step": 5140 + }, + { + "epoch": 15.880989180834622, + "grad_norm": 0.6369916200637817, + "learning_rate": 4.7987312275072926e-05, + "loss": 0.2622, + "num_input_tokens_seen": 7862928, + "step": 5145 + }, + { + "epoch": 15.896445131375579, + "grad_norm": 0.36661991477012634, + "learning_rate": 4.79834511781947e-05, + "loss": 0.2763, + "num_input_tokens_seen": 7870096, + "step": 5150 + }, + { + "epoch": 15.911901081916538, + "grad_norm": 0.4444720447063446, + "learning_rate": 4.797958653697947e-05, + "loss": 0.3146, + "num_input_tokens_seen": 7877456, + "step": 5155 + }, + { + "epoch": 15.927357032457497, + "grad_norm": 0.6931703686714172, + "learning_rate": 4.7975718352023225e-05, + "loss": 0.256, + "num_input_tokens_seen": 7884976, + "step": 5160 + }, + { + "epoch": 15.942812982998454, + "grad_norm": 0.5968213677406311, + "learning_rate": 4.7971846623922476e-05, + "loss": 0.2921, + "num_input_tokens_seen": 7892592, + "step": 5165 + }, + { + "epoch": 15.958268933539413, + "grad_norm": 0.5754075050354004, + "learning_rate": 4.7967971353274294e-05, + "loss": 0.359, + "num_input_tokens_seen": 7899632, + "step": 5170 + }, + { + "epoch": 15.97372488408037, + "grad_norm": 0.9230079650878906, + "learning_rate": 4.79640925406763e-05, + "loss": 0.4855, + "num_input_tokens_seen": 7907632, + "step": 5175 + }, + { + "epoch": 15.98918083462133, + "grad_norm": 0.602358877658844, + "learning_rate": 4.796021018672664e-05, + "loss": 0.3611, + "num_input_tokens_seen": 7915728, + "step": 5180 + }, + { + "epoch": 16.00309119010819, + "grad_norm": 0.3977338969707489, + "learning_rate": 4.795632429202405e-05, + "loss": 0.3236, + "num_input_tokens_seen": 7922672, + "step": 5185 + }, + { + "epoch": 16.01854714064915, + "grad_norm": 0.5924001932144165, + "learning_rate": 4.795243485716775e-05, + "loss": 0.3528, + "num_input_tokens_seen": 7929840, + "step": 5190 + }, + { + "epoch": 16.034003091190108, + "grad_norm": 0.8766772150993347, + "learning_rate": 4.794854188275757e-05, + "loss": 0.3566, + "num_input_tokens_seen": 7937264, + "step": 5195 + }, + { + "epoch": 16.049459041731065, + "grad_norm": 0.5038928389549255, + "learning_rate": 4.794464536939384e-05, + "loss": 0.3062, + "num_input_tokens_seen": 7945360, + "step": 5200 + }, + { + "epoch": 16.049459041731065, + "eval_loss": 0.4053465723991394, + "eval_runtime": 6.3222, + "eval_samples_per_second": 90.949, + "eval_steps_per_second": 22.777, + "num_input_tokens_seen": 7945360, + "step": 5200 + }, + { + "epoch": 16.064914992272026, + "grad_norm": 0.5822982788085938, + "learning_rate": 4.794074531767745e-05, + "loss": 0.2575, + "num_input_tokens_seen": 7952784, + "step": 5205 + }, + { + "epoch": 16.080370942812984, + "grad_norm": 0.46194201707839966, + "learning_rate": 4.7936841728209834e-05, + "loss": 0.3147, + "num_input_tokens_seen": 7960560, + "step": 5210 + }, + { + "epoch": 16.09582689335394, + "grad_norm": 0.4995709955692291, + "learning_rate": 4.7932934601593e-05, + "loss": 0.3396, + "num_input_tokens_seen": 7968368, + "step": 5215 + }, + { + "epoch": 16.111282843894898, + "grad_norm": 0.5919976830482483, + "learning_rate": 4.792902393842943e-05, + "loss": 0.429, + "num_input_tokens_seen": 7975952, + "step": 5220 + }, + { + "epoch": 16.12673879443586, + "grad_norm": 0.6028421521186829, + "learning_rate": 4.792510973932225e-05, + "loss": 0.3296, + "num_input_tokens_seen": 7983472, + "step": 5225 + }, + { + "epoch": 16.142194744976816, + "grad_norm": 0.5120159387588501, + "learning_rate": 4.7921192004875036e-05, + "loss": 0.3128, + "num_input_tokens_seen": 7990672, + "step": 5230 + }, + { + "epoch": 16.157650695517773, + "grad_norm": 0.40599045157432556, + "learning_rate": 4.791727073569198e-05, + "loss": 0.3275, + "num_input_tokens_seen": 7998288, + "step": 5235 + }, + { + "epoch": 16.173106646058734, + "grad_norm": 0.7088078260421753, + "learning_rate": 4.7913345932377775e-05, + "loss": 0.3388, + "num_input_tokens_seen": 8006000, + "step": 5240 + }, + { + "epoch": 16.18856259659969, + "grad_norm": 0.5177988409996033, + "learning_rate": 4.790941759553769e-05, + "loss": 0.3391, + "num_input_tokens_seen": 8013584, + "step": 5245 + }, + { + "epoch": 16.20401854714065, + "grad_norm": 0.6685429811477661, + "learning_rate": 4.79054857257775e-05, + "loss": 0.3314, + "num_input_tokens_seen": 8021808, + "step": 5250 + }, + { + "epoch": 16.219474497681606, + "grad_norm": 0.47418472170829773, + "learning_rate": 4.790155032370357e-05, + "loss": 0.3386, + "num_input_tokens_seen": 8029424, + "step": 5255 + }, + { + "epoch": 16.234930448222567, + "grad_norm": 0.3963150978088379, + "learning_rate": 4.789761138992278e-05, + "loss": 0.3539, + "num_input_tokens_seen": 8037232, + "step": 5260 + }, + { + "epoch": 16.250386398763524, + "grad_norm": 0.5104145407676697, + "learning_rate": 4.7893668925042565e-05, + "loss": 0.4085, + "num_input_tokens_seen": 8044880, + "step": 5265 + }, + { + "epoch": 16.26584234930448, + "grad_norm": 0.6486783623695374, + "learning_rate": 4.78897229296709e-05, + "loss": 0.3044, + "num_input_tokens_seen": 8053008, + "step": 5270 + }, + { + "epoch": 16.28129829984544, + "grad_norm": 0.8379858136177063, + "learning_rate": 4.7885773404416315e-05, + "loss": 0.3147, + "num_input_tokens_seen": 8060656, + "step": 5275 + }, + { + "epoch": 16.2967542503864, + "grad_norm": 0.5786405205726624, + "learning_rate": 4.788182034988786e-05, + "loss": 0.3161, + "num_input_tokens_seen": 8067920, + "step": 5280 + }, + { + "epoch": 16.312210200927357, + "grad_norm": 0.590939998626709, + "learning_rate": 4.787786376669516e-05, + "loss": 0.2528, + "num_input_tokens_seen": 8074832, + "step": 5285 + }, + { + "epoch": 16.327666151468314, + "grad_norm": 0.4342695474624634, + "learning_rate": 4.787390365544837e-05, + "loss": 0.3205, + "num_input_tokens_seen": 8082128, + "step": 5290 + }, + { + "epoch": 16.343122102009275, + "grad_norm": 0.6605080962181091, + "learning_rate": 4.786994001675818e-05, + "loss": 0.3977, + "num_input_tokens_seen": 8089936, + "step": 5295 + }, + { + "epoch": 16.358578052550232, + "grad_norm": 0.6259591579437256, + "learning_rate": 4.786597285123584e-05, + "loss": 0.293, + "num_input_tokens_seen": 8097456, + "step": 5300 + }, + { + "epoch": 16.37403400309119, + "grad_norm": 0.40839701890945435, + "learning_rate": 4.7862002159493135e-05, + "loss": 0.2807, + "num_input_tokens_seen": 8104752, + "step": 5305 + }, + { + "epoch": 16.389489953632147, + "grad_norm": 0.7222330570220947, + "learning_rate": 4.785802794214239e-05, + "loss": 0.3647, + "num_input_tokens_seen": 8112176, + "step": 5310 + }, + { + "epoch": 16.404945904173108, + "grad_norm": 0.9794154763221741, + "learning_rate": 4.7854050199796495e-05, + "loss": 0.3423, + "num_input_tokens_seen": 8119792, + "step": 5315 + }, + { + "epoch": 16.420401854714065, + "grad_norm": 1.0521703958511353, + "learning_rate": 4.7850068933068845e-05, + "loss": 0.3613, + "num_input_tokens_seen": 8127696, + "step": 5320 + }, + { + "epoch": 16.435857805255022, + "grad_norm": 0.5349413156509399, + "learning_rate": 4.7846084142573425e-05, + "loss": 0.2806, + "num_input_tokens_seen": 8135824, + "step": 5325 + }, + { + "epoch": 16.451313755795983, + "grad_norm": 0.8033201694488525, + "learning_rate": 4.7842095828924725e-05, + "loss": 0.3217, + "num_input_tokens_seen": 8144080, + "step": 5330 + }, + { + "epoch": 16.46676970633694, + "grad_norm": 0.6798512935638428, + "learning_rate": 4.783810399273779e-05, + "loss": 0.3366, + "num_input_tokens_seen": 8152208, + "step": 5335 + }, + { + "epoch": 16.482225656877898, + "grad_norm": 0.650684118270874, + "learning_rate": 4.7834108634628226e-05, + "loss": 0.3334, + "num_input_tokens_seen": 8159888, + "step": 5340 + }, + { + "epoch": 16.497681607418855, + "grad_norm": 0.4537340998649597, + "learning_rate": 4.783010975521216e-05, + "loss": 0.3058, + "num_input_tokens_seen": 8167568, + "step": 5345 + }, + { + "epoch": 16.513137557959816, + "grad_norm": 0.9267253875732422, + "learning_rate": 4.782610735510626e-05, + "loss": 0.3755, + "num_input_tokens_seen": 8174896, + "step": 5350 + }, + { + "epoch": 16.528593508500773, + "grad_norm": 0.38334423303604126, + "learning_rate": 4.782210143492776e-05, + "loss": 0.397, + "num_input_tokens_seen": 8183088, + "step": 5355 + }, + { + "epoch": 16.54404945904173, + "grad_norm": 0.7289148569107056, + "learning_rate": 4.781809199529442e-05, + "loss": 0.3262, + "num_input_tokens_seen": 8190608, + "step": 5360 + }, + { + "epoch": 16.55950540958269, + "grad_norm": 0.6135659217834473, + "learning_rate": 4.781407903682454e-05, + "loss": 0.3134, + "num_input_tokens_seen": 8198128, + "step": 5365 + }, + { + "epoch": 16.57496136012365, + "grad_norm": 0.4219799041748047, + "learning_rate": 4.781006256013698e-05, + "loss": 0.4055, + "num_input_tokens_seen": 8206512, + "step": 5370 + }, + { + "epoch": 16.590417310664606, + "grad_norm": 0.46800073981285095, + "learning_rate": 4.7806042565851115e-05, + "loss": 0.3624, + "num_input_tokens_seen": 8214064, + "step": 5375 + }, + { + "epoch": 16.605873261205563, + "grad_norm": 0.3633764088153839, + "learning_rate": 4.7802019054586895e-05, + "loss": 0.2806, + "num_input_tokens_seen": 8221968, + "step": 5380 + }, + { + "epoch": 16.621329211746524, + "grad_norm": 0.53122478723526, + "learning_rate": 4.779799202696479e-05, + "loss": 0.3657, + "num_input_tokens_seen": 8229360, + "step": 5385 + }, + { + "epoch": 16.63678516228748, + "grad_norm": 0.6909035444259644, + "learning_rate": 4.779396148360581e-05, + "loss": 0.337, + "num_input_tokens_seen": 8236944, + "step": 5390 + }, + { + "epoch": 16.652241112828438, + "grad_norm": 0.4873929023742676, + "learning_rate": 4.7789927425131517e-05, + "loss": 0.3846, + "num_input_tokens_seen": 8244880, + "step": 5395 + }, + { + "epoch": 16.667697063369395, + "grad_norm": 0.4304744303226471, + "learning_rate": 4.778588985216403e-05, + "loss": 0.2797, + "num_input_tokens_seen": 8252048, + "step": 5400 + }, + { + "epoch": 16.667697063369395, + "eval_loss": 0.40135371685028076, + "eval_runtime": 6.3301, + "eval_samples_per_second": 90.836, + "eval_steps_per_second": 22.749, + "num_input_tokens_seen": 8252048, + "step": 5400 + }, + { + "epoch": 16.683153013910356, + "grad_norm": 0.5512888431549072, + "learning_rate": 4.778184876532598e-05, + "loss": 0.363, + "num_input_tokens_seen": 8259344, + "step": 5405 + }, + { + "epoch": 16.698608964451314, + "grad_norm": 0.7662515044212341, + "learning_rate": 4.7777804165240556e-05, + "loss": 0.3545, + "num_input_tokens_seen": 8266480, + "step": 5410 + }, + { + "epoch": 16.71406491499227, + "grad_norm": 0.5469078421592712, + "learning_rate": 4.7773756052531485e-05, + "loss": 0.4089, + "num_input_tokens_seen": 8274480, + "step": 5415 + }, + { + "epoch": 16.72952086553323, + "grad_norm": 0.5851755738258362, + "learning_rate": 4.7769704427823035e-05, + "loss": 0.4048, + "num_input_tokens_seen": 8282128, + "step": 5420 + }, + { + "epoch": 16.74497681607419, + "grad_norm": 0.6705233454704285, + "learning_rate": 4.776564929174003e-05, + "loss": 0.3665, + "num_input_tokens_seen": 8289808, + "step": 5425 + }, + { + "epoch": 16.760432766615146, + "grad_norm": 0.5390312671661377, + "learning_rate": 4.7761590644907806e-05, + "loss": 0.2668, + "num_input_tokens_seen": 8297936, + "step": 5430 + }, + { + "epoch": 16.775888717156104, + "grad_norm": 0.6811644434928894, + "learning_rate": 4.7757528487952263e-05, + "loss": 0.3864, + "num_input_tokens_seen": 8305456, + "step": 5435 + }, + { + "epoch": 16.791344667697064, + "grad_norm": 0.43260058760643005, + "learning_rate": 4.7753462821499836e-05, + "loss": 0.3601, + "num_input_tokens_seen": 8313040, + "step": 5440 + }, + { + "epoch": 16.80680061823802, + "grad_norm": 0.5605297088623047, + "learning_rate": 4.774939364617751e-05, + "loss": 0.315, + "num_input_tokens_seen": 8320208, + "step": 5445 + }, + { + "epoch": 16.82225656877898, + "grad_norm": 0.7371994256973267, + "learning_rate": 4.7745320962612795e-05, + "loss": 0.4244, + "num_input_tokens_seen": 8327696, + "step": 5450 + }, + { + "epoch": 16.83771251931994, + "grad_norm": 0.7517450451850891, + "learning_rate": 4.7741244771433756e-05, + "loss": 0.3585, + "num_input_tokens_seen": 8335472, + "step": 5455 + }, + { + "epoch": 16.853168469860897, + "grad_norm": 0.40297001600265503, + "learning_rate": 4.7737165073268985e-05, + "loss": 0.3594, + "num_input_tokens_seen": 8343152, + "step": 5460 + }, + { + "epoch": 16.868624420401854, + "grad_norm": 0.6239968538284302, + "learning_rate": 4.7733081868747626e-05, + "loss": 0.3291, + "num_input_tokens_seen": 8350992, + "step": 5465 + }, + { + "epoch": 16.88408037094281, + "grad_norm": 0.40457794070243835, + "learning_rate": 4.772899515849936e-05, + "loss": 0.3196, + "num_input_tokens_seen": 8358288, + "step": 5470 + }, + { + "epoch": 16.899536321483772, + "grad_norm": 0.7475414276123047, + "learning_rate": 4.7724904943154414e-05, + "loss": 0.3684, + "num_input_tokens_seen": 8365552, + "step": 5475 + }, + { + "epoch": 16.91499227202473, + "grad_norm": 0.6856083273887634, + "learning_rate": 4.772081122334354e-05, + "loss": 0.2932, + "num_input_tokens_seen": 8373680, + "step": 5480 + }, + { + "epoch": 16.930448222565687, + "grad_norm": 1.2193773984909058, + "learning_rate": 4.771671399969806e-05, + "loss": 0.3897, + "num_input_tokens_seen": 8380752, + "step": 5485 + }, + { + "epoch": 16.945904173106648, + "grad_norm": 0.5515151023864746, + "learning_rate": 4.7712613272849794e-05, + "loss": 0.2991, + "num_input_tokens_seen": 8388368, + "step": 5490 + }, + { + "epoch": 16.961360123647605, + "grad_norm": 0.38224712014198303, + "learning_rate": 4.770850904343114e-05, + "loss": 0.3302, + "num_input_tokens_seen": 8396080, + "step": 5495 + }, + { + "epoch": 16.976816074188562, + "grad_norm": 0.6487033367156982, + "learning_rate": 4.770440131207502e-05, + "loss": 0.4696, + "num_input_tokens_seen": 8403824, + "step": 5500 + }, + { + "epoch": 16.99227202472952, + "grad_norm": 0.4376344680786133, + "learning_rate": 4.7700290079414896e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8412240, + "step": 5505 + }, + { + "epoch": 17.006182380216384, + "grad_norm": 1.3371526002883911, + "learning_rate": 4.769617534608477e-05, + "loss": 0.3989, + "num_input_tokens_seen": 8419392, + "step": 5510 + }, + { + "epoch": 17.02163833075734, + "grad_norm": 0.4830784201622009, + "learning_rate": 4.7692057112719193e-05, + "loss": 0.2843, + "num_input_tokens_seen": 8427136, + "step": 5515 + }, + { + "epoch": 17.037094281298298, + "grad_norm": 0.520561933517456, + "learning_rate": 4.7687935379953234e-05, + "loss": 0.2713, + "num_input_tokens_seen": 8434752, + "step": 5520 + }, + { + "epoch": 17.05255023183926, + "grad_norm": 0.6047643423080444, + "learning_rate": 4.7683810148422534e-05, + "loss": 0.2922, + "num_input_tokens_seen": 8442528, + "step": 5525 + }, + { + "epoch": 17.068006182380216, + "grad_norm": 0.32451334595680237, + "learning_rate": 4.767968141876324e-05, + "loss": 0.3368, + "num_input_tokens_seen": 8450176, + "step": 5530 + }, + { + "epoch": 17.083462132921174, + "grad_norm": 0.590020477771759, + "learning_rate": 4.767554919161207e-05, + "loss": 0.3262, + "num_input_tokens_seen": 8457568, + "step": 5535 + }, + { + "epoch": 17.098918083462134, + "grad_norm": 0.5035136938095093, + "learning_rate": 4.767141346760624e-05, + "loss": 0.3765, + "num_input_tokens_seen": 8465344, + "step": 5540 + }, + { + "epoch": 17.11437403400309, + "grad_norm": 0.4658122956752777, + "learning_rate": 4.766727424738356e-05, + "loss": 0.2574, + "num_input_tokens_seen": 8473120, + "step": 5545 + }, + { + "epoch": 17.12982998454405, + "grad_norm": 0.42445558309555054, + "learning_rate": 4.7663131531582325e-05, + "loss": 0.3279, + "num_input_tokens_seen": 8480960, + "step": 5550 + }, + { + "epoch": 17.145285935085006, + "grad_norm": 0.4421517848968506, + "learning_rate": 4.765898532084142e-05, + "loss": 0.3602, + "num_input_tokens_seen": 8488480, + "step": 5555 + }, + { + "epoch": 17.160741885625967, + "grad_norm": 0.49517160654067993, + "learning_rate": 4.765483561580022e-05, + "loss": 0.3342, + "num_input_tokens_seen": 8496256, + "step": 5560 + }, + { + "epoch": 17.176197836166924, + "grad_norm": 0.45521679520606995, + "learning_rate": 4.7650682417098666e-05, + "loss": 0.3265, + "num_input_tokens_seen": 8504032, + "step": 5565 + }, + { + "epoch": 17.19165378670788, + "grad_norm": 0.7814056277275085, + "learning_rate": 4.7646525725377244e-05, + "loss": 0.3369, + "num_input_tokens_seen": 8511680, + "step": 5570 + }, + { + "epoch": 17.207109737248842, + "grad_norm": 0.661605715751648, + "learning_rate": 4.764236554127696e-05, + "loss": 0.3802, + "num_input_tokens_seen": 8519680, + "step": 5575 + }, + { + "epoch": 17.2225656877898, + "grad_norm": 0.4802936315536499, + "learning_rate": 4.7638201865439356e-05, + "loss": 0.2794, + "num_input_tokens_seen": 8527456, + "step": 5580 + }, + { + "epoch": 17.238021638330757, + "grad_norm": 0.7207053899765015, + "learning_rate": 4.7634034698506545e-05, + "loss": 0.2978, + "num_input_tokens_seen": 8534912, + "step": 5585 + }, + { + "epoch": 17.253477588871714, + "grad_norm": 0.8799278140068054, + "learning_rate": 4.762986404112115e-05, + "loss": 0.3071, + "num_input_tokens_seen": 8541984, + "step": 5590 + }, + { + "epoch": 17.268933539412675, + "grad_norm": 0.5799072980880737, + "learning_rate": 4.762568989392633e-05, + "loss": 0.4227, + "num_input_tokens_seen": 8549472, + "step": 5595 + }, + { + "epoch": 17.284389489953632, + "grad_norm": 0.6769182682037354, + "learning_rate": 4.76215122575658e-05, + "loss": 0.3236, + "num_input_tokens_seen": 8557024, + "step": 5600 + }, + { + "epoch": 17.284389489953632, + "eval_loss": 0.39534708857536316, + "eval_runtime": 6.2881, + "eval_samples_per_second": 91.443, + "eval_steps_per_second": 22.901, + "num_input_tokens_seen": 8557024, + "step": 5600 + }, + { + "epoch": 17.29984544049459, + "grad_norm": 0.5564948320388794, + "learning_rate": 4.7617331132683795e-05, + "loss": 0.3766, + "num_input_tokens_seen": 8565152, + "step": 5605 + }, + { + "epoch": 17.315301391035547, + "grad_norm": 0.5693391561508179, + "learning_rate": 4.7613146519925105e-05, + "loss": 0.3798, + "num_input_tokens_seen": 8572672, + "step": 5610 + }, + { + "epoch": 17.330757341576508, + "grad_norm": 0.6645658016204834, + "learning_rate": 4.7608958419935045e-05, + "loss": 0.3149, + "num_input_tokens_seen": 8579744, + "step": 5615 + }, + { + "epoch": 17.346213292117465, + "grad_norm": 0.5317525863647461, + "learning_rate": 4.760476683335948e-05, + "loss": 0.3855, + "num_input_tokens_seen": 8587360, + "step": 5620 + }, + { + "epoch": 17.361669242658422, + "grad_norm": 0.6218377947807312, + "learning_rate": 4.760057176084479e-05, + "loss": 0.3364, + "num_input_tokens_seen": 8595456, + "step": 5625 + }, + { + "epoch": 17.377125193199383, + "grad_norm": 1.184591293334961, + "learning_rate": 4.759637320303793e-05, + "loss": 0.4108, + "num_input_tokens_seen": 8602976, + "step": 5630 + }, + { + "epoch": 17.39258114374034, + "grad_norm": 0.7135097980499268, + "learning_rate": 4.759217116058635e-05, + "loss": 0.3343, + "num_input_tokens_seen": 8611328, + "step": 5635 + }, + { + "epoch": 17.408037094281298, + "grad_norm": 0.6597471833229065, + "learning_rate": 4.758796563413807e-05, + "loss": 0.3789, + "num_input_tokens_seen": 8618592, + "step": 5640 + }, + { + "epoch": 17.423493044822255, + "grad_norm": 0.5384281277656555, + "learning_rate": 4.758375662434163e-05, + "loss": 0.3821, + "num_input_tokens_seen": 8626528, + "step": 5645 + }, + { + "epoch": 17.438948995363216, + "grad_norm": 1.3023817539215088, + "learning_rate": 4.7579544131846114e-05, + "loss": 0.4394, + "num_input_tokens_seen": 8634016, + "step": 5650 + }, + { + "epoch": 17.454404945904173, + "grad_norm": 1.3154947757720947, + "learning_rate": 4.757532815730114e-05, + "loss": 0.3723, + "num_input_tokens_seen": 8641792, + "step": 5655 + }, + { + "epoch": 17.46986089644513, + "grad_norm": 0.70183265209198, + "learning_rate": 4.7571108701356865e-05, + "loss": 0.3908, + "num_input_tokens_seen": 8649440, + "step": 5660 + }, + { + "epoch": 17.48531684698609, + "grad_norm": 0.6939021348953247, + "learning_rate": 4.756688576466398e-05, + "loss": 0.3351, + "num_input_tokens_seen": 8656960, + "step": 5665 + }, + { + "epoch": 17.50077279752705, + "grad_norm": 0.4241112768650055, + "learning_rate": 4.756265934787372e-05, + "loss": 0.2872, + "num_input_tokens_seen": 8664320, + "step": 5670 + }, + { + "epoch": 17.516228748068006, + "grad_norm": 0.8496220111846924, + "learning_rate": 4.755842945163785e-05, + "loss": 0.394, + "num_input_tokens_seen": 8672672, + "step": 5675 + }, + { + "epoch": 17.531684698608963, + "grad_norm": 0.4549221396446228, + "learning_rate": 4.755419607660867e-05, + "loss": 0.3106, + "num_input_tokens_seen": 8680384, + "step": 5680 + }, + { + "epoch": 17.547140649149924, + "grad_norm": 0.5808056592941284, + "learning_rate": 4.7549959223439016e-05, + "loss": 0.309, + "num_input_tokens_seen": 8687680, + "step": 5685 + }, + { + "epoch": 17.56259659969088, + "grad_norm": 0.5249761939048767, + "learning_rate": 4.754571889278228e-05, + "loss": 0.3376, + "num_input_tokens_seen": 8696096, + "step": 5690 + }, + { + "epoch": 17.57805255023184, + "grad_norm": 0.7619712352752686, + "learning_rate": 4.754147508529235e-05, + "loss": 0.3218, + "num_input_tokens_seen": 8703456, + "step": 5695 + }, + { + "epoch": 17.5935085007728, + "grad_norm": 0.7187018394470215, + "learning_rate": 4.75372278016237e-05, + "loss": 0.338, + "num_input_tokens_seen": 8710976, + "step": 5700 + }, + { + "epoch": 17.608964451313756, + "grad_norm": 0.7138301730155945, + "learning_rate": 4.753297704243129e-05, + "loss": 0.308, + "num_input_tokens_seen": 8718400, + "step": 5705 + }, + { + "epoch": 17.624420401854714, + "grad_norm": 0.474495530128479, + "learning_rate": 4.752872280837066e-05, + "loss": 0.2776, + "num_input_tokens_seen": 8726176, + "step": 5710 + }, + { + "epoch": 17.63987635239567, + "grad_norm": 0.44225460290908813, + "learning_rate": 4.752446510009786e-05, + "loss": 0.2738, + "num_input_tokens_seen": 8733792, + "step": 5715 + }, + { + "epoch": 17.655332302936632, + "grad_norm": 0.36202141642570496, + "learning_rate": 4.7520203918269476e-05, + "loss": 0.2811, + "num_input_tokens_seen": 8741472, + "step": 5720 + }, + { + "epoch": 17.67078825347759, + "grad_norm": 0.6718831062316895, + "learning_rate": 4.751593926354265e-05, + "loss": 0.3712, + "num_input_tokens_seen": 8749760, + "step": 5725 + }, + { + "epoch": 17.686244204018546, + "grad_norm": 0.5910775065422058, + "learning_rate": 4.751167113657503e-05, + "loss": 0.3391, + "num_input_tokens_seen": 8757344, + "step": 5730 + }, + { + "epoch": 17.701700154559504, + "grad_norm": 0.33752012252807617, + "learning_rate": 4.7507399538024834e-05, + "loss": 0.4379, + "num_input_tokens_seen": 8764192, + "step": 5735 + }, + { + "epoch": 17.717156105100464, + "grad_norm": 0.39182335138320923, + "learning_rate": 4.750312446855077e-05, + "loss": 0.271, + "num_input_tokens_seen": 8772064, + "step": 5740 + }, + { + "epoch": 17.73261205564142, + "grad_norm": 0.5137474536895752, + "learning_rate": 4.749884592881212e-05, + "loss": 0.3292, + "num_input_tokens_seen": 8779360, + "step": 5745 + }, + { + "epoch": 17.74806800618238, + "grad_norm": 0.692825198173523, + "learning_rate": 4.74945639194687e-05, + "loss": 0.2985, + "num_input_tokens_seen": 8787360, + "step": 5750 + }, + { + "epoch": 17.76352395672334, + "grad_norm": 0.643999457359314, + "learning_rate": 4.749027844118083e-05, + "loss": 0.3444, + "num_input_tokens_seen": 8795040, + "step": 5755 + }, + { + "epoch": 17.778979907264297, + "grad_norm": 0.45638686418533325, + "learning_rate": 4.7485989494609395e-05, + "loss": 0.3081, + "num_input_tokens_seen": 8802016, + "step": 5760 + }, + { + "epoch": 17.794435857805254, + "grad_norm": 0.41271668672561646, + "learning_rate": 4.748169708041581e-05, + "loss": 0.304, + "num_input_tokens_seen": 8809472, + "step": 5765 + }, + { + "epoch": 17.80989180834621, + "grad_norm": 0.4474446773529053, + "learning_rate": 4.7477401199262004e-05, + "loss": 0.2873, + "num_input_tokens_seen": 8816928, + "step": 5770 + }, + { + "epoch": 17.825347758887172, + "grad_norm": 0.7016522288322449, + "learning_rate": 4.747310185181048e-05, + "loss": 0.3145, + "num_input_tokens_seen": 8824640, + "step": 5775 + }, + { + "epoch": 17.84080370942813, + "grad_norm": 0.7349714040756226, + "learning_rate": 4.746879903872422e-05, + "loss": 0.3863, + "num_input_tokens_seen": 8831776, + "step": 5780 + }, + { + "epoch": 17.856259659969087, + "grad_norm": 0.5589519739151001, + "learning_rate": 4.746449276066679e-05, + "loss": 0.3012, + "num_input_tokens_seen": 8839136, + "step": 5785 + }, + { + "epoch": 17.871715610510048, + "grad_norm": 0.5933765172958374, + "learning_rate": 4.746018301830227e-05, + "loss": 0.2879, + "num_input_tokens_seen": 8846720, + "step": 5790 + }, + { + "epoch": 17.887171561051005, + "grad_norm": 0.34757882356643677, + "learning_rate": 4.7455869812295275e-05, + "loss": 0.3323, + "num_input_tokens_seen": 8854176, + "step": 5795 + }, + { + "epoch": 17.902627511591962, + "grad_norm": 0.617489755153656, + "learning_rate": 4.7451553143310964e-05, + "loss": 0.365, + "num_input_tokens_seen": 8862080, + "step": 5800 + }, + { + "epoch": 17.902627511591962, + "eval_loss": 0.38970205187797546, + "eval_runtime": 6.3051, + "eval_samples_per_second": 91.196, + "eval_steps_per_second": 22.839, + "num_input_tokens_seen": 8862080, + "step": 5800 + }, + { + "epoch": 17.91808346213292, + "grad_norm": 0.7817850112915039, + "learning_rate": 4.744723301201501e-05, + "loss": 0.3502, + "num_input_tokens_seen": 8869408, + "step": 5805 + }, + { + "epoch": 17.93353941267388, + "grad_norm": 0.8044437170028687, + "learning_rate": 4.744290941907364e-05, + "loss": 0.3671, + "num_input_tokens_seen": 8876992, + "step": 5810 + }, + { + "epoch": 17.948995363214838, + "grad_norm": 0.4442068636417389, + "learning_rate": 4.7438582365153594e-05, + "loss": 0.3211, + "num_input_tokens_seen": 8884512, + "step": 5815 + }, + { + "epoch": 17.964451313755795, + "grad_norm": 0.6503137946128845, + "learning_rate": 4.743425185092217e-05, + "loss": 0.358, + "num_input_tokens_seen": 8891936, + "step": 5820 + }, + { + "epoch": 17.979907264296756, + "grad_norm": 0.26411595940589905, + "learning_rate": 4.742991787704719e-05, + "loss": 0.2484, + "num_input_tokens_seen": 8899680, + "step": 5825 + }, + { + "epoch": 17.995363214837713, + "grad_norm": 0.34958985447883606, + "learning_rate": 4.7425580444196994e-05, + "loss": 0.3112, + "num_input_tokens_seen": 8907808, + "step": 5830 + }, + { + "epoch": 18.009273570324574, + "grad_norm": 0.675845205783844, + "learning_rate": 4.742123955304048e-05, + "loss": 0.3322, + "num_input_tokens_seen": 8914224, + "step": 5835 + }, + { + "epoch": 18.024729520865534, + "grad_norm": 0.6803374886512756, + "learning_rate": 4.741689520424706e-05, + "loss": 0.3454, + "num_input_tokens_seen": 8921808, + "step": 5840 + }, + { + "epoch": 18.04018547140649, + "grad_norm": 0.8593791127204895, + "learning_rate": 4.741254739848669e-05, + "loss": 0.3453, + "num_input_tokens_seen": 8929072, + "step": 5845 + }, + { + "epoch": 18.05564142194745, + "grad_norm": 0.4069240391254425, + "learning_rate": 4.740819613642987e-05, + "loss": 0.3141, + "num_input_tokens_seen": 8937008, + "step": 5850 + }, + { + "epoch": 18.071097372488406, + "grad_norm": 0.4809638261795044, + "learning_rate": 4.74038414187476e-05, + "loss": 0.2985, + "num_input_tokens_seen": 8944688, + "step": 5855 + }, + { + "epoch": 18.086553323029367, + "grad_norm": 0.43252480030059814, + "learning_rate": 4.739948324611144e-05, + "loss": 0.3491, + "num_input_tokens_seen": 8952304, + "step": 5860 + }, + { + "epoch": 18.102009273570324, + "grad_norm": 0.3463718891143799, + "learning_rate": 4.7395121619193465e-05, + "loss": 0.3998, + "num_input_tokens_seen": 8960240, + "step": 5865 + }, + { + "epoch": 18.11746522411128, + "grad_norm": 0.7154097557067871, + "learning_rate": 4.7390756538666313e-05, + "loss": 0.2951, + "num_input_tokens_seen": 8967856, + "step": 5870 + }, + { + "epoch": 18.132921174652243, + "grad_norm": 0.4457140862941742, + "learning_rate": 4.738638800520311e-05, + "loss": 0.3929, + "num_input_tokens_seen": 8975696, + "step": 5875 + }, + { + "epoch": 18.1483771251932, + "grad_norm": 0.7682527899742126, + "learning_rate": 4.738201601947757e-05, + "loss": 0.3247, + "num_input_tokens_seen": 8983056, + "step": 5880 + }, + { + "epoch": 18.163833075734157, + "grad_norm": 0.6177226901054382, + "learning_rate": 4.7377640582163876e-05, + "loss": 0.2934, + "num_input_tokens_seen": 8990768, + "step": 5885 + }, + { + "epoch": 18.179289026275114, + "grad_norm": 0.5769564509391785, + "learning_rate": 4.7373261693936786e-05, + "loss": 0.338, + "num_input_tokens_seen": 8998704, + "step": 5890 + }, + { + "epoch": 18.194744976816075, + "grad_norm": 0.4581329822540283, + "learning_rate": 4.7368879355471595e-05, + "loss": 0.2839, + "num_input_tokens_seen": 9006288, + "step": 5895 + }, + { + "epoch": 18.210200927357032, + "grad_norm": 0.523284912109375, + "learning_rate": 4.736449356744409e-05, + "loss": 0.3246, + "num_input_tokens_seen": 9014896, + "step": 5900 + }, + { + "epoch": 18.22565687789799, + "grad_norm": 0.49084869027137756, + "learning_rate": 4.736010433053064e-05, + "loss": 0.3419, + "num_input_tokens_seen": 9022352, + "step": 5905 + }, + { + "epoch": 18.24111282843895, + "grad_norm": 0.5490776300430298, + "learning_rate": 4.73557116454081e-05, + "loss": 0.3124, + "num_input_tokens_seen": 9029744, + "step": 5910 + }, + { + "epoch": 18.256568778979908, + "grad_norm": 1.1223102807998657, + "learning_rate": 4.735131551275389e-05, + "loss": 0.3113, + "num_input_tokens_seen": 9037840, + "step": 5915 + }, + { + "epoch": 18.272024729520865, + "grad_norm": 0.5000991225242615, + "learning_rate": 4.734691593324594e-05, + "loss": 0.2899, + "num_input_tokens_seen": 9045296, + "step": 5920 + }, + { + "epoch": 18.287480680061822, + "grad_norm": 0.6503373980522156, + "learning_rate": 4.734251290756272e-05, + "loss": 0.3089, + "num_input_tokens_seen": 9052656, + "step": 5925 + }, + { + "epoch": 18.302936630602783, + "grad_norm": 0.6458615064620972, + "learning_rate": 4.7338106436383246e-05, + "loss": 0.4043, + "num_input_tokens_seen": 9060240, + "step": 5930 + }, + { + "epoch": 18.31839258114374, + "grad_norm": 0.5474774241447449, + "learning_rate": 4.733369652038703e-05, + "loss": 0.3345, + "num_input_tokens_seen": 9067376, + "step": 5935 + }, + { + "epoch": 18.333848531684698, + "grad_norm": 0.6404802203178406, + "learning_rate": 4.7329283160254156e-05, + "loss": 0.3449, + "num_input_tokens_seen": 9075056, + "step": 5940 + }, + { + "epoch": 18.34930448222566, + "grad_norm": 0.6855826377868652, + "learning_rate": 4.732486635666521e-05, + "loss": 0.3304, + "num_input_tokens_seen": 9082544, + "step": 5945 + }, + { + "epoch": 18.364760432766616, + "grad_norm": 0.6664712429046631, + "learning_rate": 4.732044611030132e-05, + "loss": 0.2597, + "num_input_tokens_seen": 9090416, + "step": 5950 + }, + { + "epoch": 18.380216383307573, + "grad_norm": 0.4991304278373718, + "learning_rate": 4.731602242184414e-05, + "loss": 0.3195, + "num_input_tokens_seen": 9098032, + "step": 5955 + }, + { + "epoch": 18.39567233384853, + "grad_norm": 0.7524475455284119, + "learning_rate": 4.7311595291975864e-05, + "loss": 0.3246, + "num_input_tokens_seen": 9106448, + "step": 5960 + }, + { + "epoch": 18.41112828438949, + "grad_norm": 0.5573334097862244, + "learning_rate": 4.7307164721379216e-05, + "loss": 0.3268, + "num_input_tokens_seen": 9114416, + "step": 5965 + }, + { + "epoch": 18.42658423493045, + "grad_norm": 0.43922850489616394, + "learning_rate": 4.730273071073743e-05, + "loss": 0.3498, + "num_input_tokens_seen": 9122512, + "step": 5970 + }, + { + "epoch": 18.442040185471406, + "grad_norm": 0.8247613310813904, + "learning_rate": 4.729829326073429e-05, + "loss": 0.3911, + "num_input_tokens_seen": 9130064, + "step": 5975 + }, + { + "epoch": 18.457496136012363, + "grad_norm": 0.48820698261260986, + "learning_rate": 4.7293852372054126e-05, + "loss": 0.2682, + "num_input_tokens_seen": 9137328, + "step": 5980 + }, + { + "epoch": 18.472952086553324, + "grad_norm": 0.5283095240592957, + "learning_rate": 4.728940804538176e-05, + "loss": 0.2965, + "num_input_tokens_seen": 9145008, + "step": 5985 + }, + { + "epoch": 18.48840803709428, + "grad_norm": 1.1380741596221924, + "learning_rate": 4.7284960281402556e-05, + "loss": 0.3038, + "num_input_tokens_seen": 9152240, + "step": 5990 + }, + { + "epoch": 18.50386398763524, + "grad_norm": 0.48664528131484985, + "learning_rate": 4.728050908080244e-05, + "loss": 0.3, + "num_input_tokens_seen": 9159728, + "step": 5995 + }, + { + "epoch": 18.5193199381762, + "grad_norm": 0.5942592620849609, + "learning_rate": 4.727605444426782e-05, + "loss": 0.3191, + "num_input_tokens_seen": 9167248, + "step": 6000 + }, + { + "epoch": 18.5193199381762, + "eval_loss": 0.38718482851982117, + "eval_runtime": 6.2763, + "eval_samples_per_second": 91.614, + "eval_steps_per_second": 22.943, + "num_input_tokens_seen": 9167248, + "step": 6000 + }, + { + "epoch": 18.534775888717157, + "grad_norm": 0.4185212254524231, + "learning_rate": 4.727159637248567e-05, + "loss": 0.3249, + "num_input_tokens_seen": 9174704, + "step": 6005 + }, + { + "epoch": 18.550231839258114, + "grad_norm": 0.6179144382476807, + "learning_rate": 4.7267134866143474e-05, + "loss": 0.2501, + "num_input_tokens_seen": 9182352, + "step": 6010 + }, + { + "epoch": 18.56568778979907, + "grad_norm": 0.6531522870063782, + "learning_rate": 4.726266992592926e-05, + "loss": 0.3128, + "num_input_tokens_seen": 9189168, + "step": 6015 + }, + { + "epoch": 18.581143740340032, + "grad_norm": 0.9173106551170349, + "learning_rate": 4.725820155253157e-05, + "loss": 0.2498, + "num_input_tokens_seen": 9196944, + "step": 6020 + }, + { + "epoch": 18.59659969088099, + "grad_norm": 0.6179139018058777, + "learning_rate": 4.725372974663948e-05, + "loss": 0.3056, + "num_input_tokens_seen": 9204304, + "step": 6025 + }, + { + "epoch": 18.612055641421946, + "grad_norm": 0.6928637623786926, + "learning_rate": 4.724925450894262e-05, + "loss": 0.335, + "num_input_tokens_seen": 9211760, + "step": 6030 + }, + { + "epoch": 18.627511591962907, + "grad_norm": 0.420225590467453, + "learning_rate": 4.72447758401311e-05, + "loss": 0.2591, + "num_input_tokens_seen": 9219152, + "step": 6035 + }, + { + "epoch": 18.642967542503865, + "grad_norm": 0.7330701351165771, + "learning_rate": 4.7240293740895616e-05, + "loss": 0.4031, + "num_input_tokens_seen": 9227280, + "step": 6040 + }, + { + "epoch": 18.658423493044822, + "grad_norm": 0.5226097106933594, + "learning_rate": 4.723580821192733e-05, + "loss": 0.4624, + "num_input_tokens_seen": 9235152, + "step": 6045 + }, + { + "epoch": 18.67387944358578, + "grad_norm": 0.4756399989128113, + "learning_rate": 4.7231319253917996e-05, + "loss": 0.2764, + "num_input_tokens_seen": 9242896, + "step": 6050 + }, + { + "epoch": 18.68933539412674, + "grad_norm": 0.3195499777793884, + "learning_rate": 4.722682686755986e-05, + "loss": 0.2813, + "num_input_tokens_seen": 9250288, + "step": 6055 + }, + { + "epoch": 18.704791344667697, + "grad_norm": 0.8066638112068176, + "learning_rate": 4.722233105354569e-05, + "loss": 0.4141, + "num_input_tokens_seen": 9257488, + "step": 6060 + }, + { + "epoch": 18.720247295208654, + "grad_norm": 0.7525232434272766, + "learning_rate": 4.7217831812568815e-05, + "loss": 0.4407, + "num_input_tokens_seen": 9265264, + "step": 6065 + }, + { + "epoch": 18.735703245749615, + "grad_norm": 0.8864522576332092, + "learning_rate": 4.721332914532307e-05, + "loss": 0.2815, + "num_input_tokens_seen": 9273072, + "step": 6070 + }, + { + "epoch": 18.751159196290573, + "grad_norm": 0.6303654909133911, + "learning_rate": 4.720882305250281e-05, + "loss": 0.2763, + "num_input_tokens_seen": 9280944, + "step": 6075 + }, + { + "epoch": 18.76661514683153, + "grad_norm": 0.7337630391120911, + "learning_rate": 4.720431353480295e-05, + "loss": 0.4044, + "num_input_tokens_seen": 9289008, + "step": 6080 + }, + { + "epoch": 18.782071097372487, + "grad_norm": 0.5441291332244873, + "learning_rate": 4.719980059291891e-05, + "loss": 0.2777, + "num_input_tokens_seen": 9297008, + "step": 6085 + }, + { + "epoch": 18.797527047913448, + "grad_norm": 0.44888928532600403, + "learning_rate": 4.7195284227546634e-05, + "loss": 0.2523, + "num_input_tokens_seen": 9304368, + "step": 6090 + }, + { + "epoch": 18.812982998454405, + "grad_norm": 0.6870009899139404, + "learning_rate": 4.7190764439382604e-05, + "loss": 0.2919, + "num_input_tokens_seen": 9311952, + "step": 6095 + }, + { + "epoch": 18.828438948995363, + "grad_norm": 0.41449663043022156, + "learning_rate": 4.7186241229123826e-05, + "loss": 0.3001, + "num_input_tokens_seen": 9319856, + "step": 6100 + }, + { + "epoch": 18.84389489953632, + "grad_norm": 0.7256513237953186, + "learning_rate": 4.718171459746785e-05, + "loss": 0.3021, + "num_input_tokens_seen": 9327440, + "step": 6105 + }, + { + "epoch": 18.85935085007728, + "grad_norm": 0.4989817440509796, + "learning_rate": 4.717718454511273e-05, + "loss": 0.2561, + "num_input_tokens_seen": 9335024, + "step": 6110 + }, + { + "epoch": 18.874806800618238, + "grad_norm": 0.7761181592941284, + "learning_rate": 4.7172651072757056e-05, + "loss": 0.4833, + "num_input_tokens_seen": 9342736, + "step": 6115 + }, + { + "epoch": 18.890262751159195, + "grad_norm": 0.5178846716880798, + "learning_rate": 4.7168114181099945e-05, + "loss": 0.4042, + "num_input_tokens_seen": 9350224, + "step": 6120 + }, + { + "epoch": 18.905718701700156, + "grad_norm": 0.5797011852264404, + "learning_rate": 4.716357387084105e-05, + "loss": 0.3313, + "num_input_tokens_seen": 9358096, + "step": 6125 + }, + { + "epoch": 18.921174652241113, + "grad_norm": 0.49375608563423157, + "learning_rate": 4.715903014268054e-05, + "loss": 0.3499, + "num_input_tokens_seen": 9365552, + "step": 6130 + }, + { + "epoch": 18.93663060278207, + "grad_norm": 0.4847618341445923, + "learning_rate": 4.715448299731911e-05, + "loss": 0.347, + "num_input_tokens_seen": 9373584, + "step": 6135 + }, + { + "epoch": 18.952086553323028, + "grad_norm": 0.8159246444702148, + "learning_rate": 4.7149932435457986e-05, + "loss": 0.4184, + "num_input_tokens_seen": 9380784, + "step": 6140 + }, + { + "epoch": 18.96754250386399, + "grad_norm": 0.5199120044708252, + "learning_rate": 4.714537845779894e-05, + "loss": 0.2639, + "num_input_tokens_seen": 9389168, + "step": 6145 + }, + { + "epoch": 18.982998454404946, + "grad_norm": 0.8669658899307251, + "learning_rate": 4.714082106504423e-05, + "loss": 0.2906, + "num_input_tokens_seen": 9396528, + "step": 6150 + }, + { + "epoch": 18.998454404945903, + "grad_norm": 0.36678603291511536, + "learning_rate": 4.713626025789667e-05, + "loss": 0.3393, + "num_input_tokens_seen": 9404176, + "step": 6155 + }, + { + "epoch": 19.012364760432767, + "grad_norm": 0.44260597229003906, + "learning_rate": 4.7131696037059606e-05, + "loss": 0.3626, + "num_input_tokens_seen": 9410992, + "step": 6160 + }, + { + "epoch": 19.027820710973725, + "grad_norm": 0.637438178062439, + "learning_rate": 4.712712840323689e-05, + "loss": 0.2791, + "num_input_tokens_seen": 9418128, + "step": 6165 + }, + { + "epoch": 19.043276661514682, + "grad_norm": 0.7032508254051208, + "learning_rate": 4.71225573571329e-05, + "loss": 0.2902, + "num_input_tokens_seen": 9425712, + "step": 6170 + }, + { + "epoch": 19.058732612055643, + "grad_norm": 0.7133444547653198, + "learning_rate": 4.711798289945256e-05, + "loss": 0.3249, + "num_input_tokens_seen": 9433744, + "step": 6175 + }, + { + "epoch": 19.0741885625966, + "grad_norm": 1.0372039079666138, + "learning_rate": 4.71134050309013e-05, + "loss": 0.2839, + "num_input_tokens_seen": 9441456, + "step": 6180 + }, + { + "epoch": 19.089644513137557, + "grad_norm": 0.2826731204986572, + "learning_rate": 4.710882375218509e-05, + "loss": 0.24, + "num_input_tokens_seen": 9449040, + "step": 6185 + }, + { + "epoch": 19.105100463678518, + "grad_norm": 0.3522513508796692, + "learning_rate": 4.7104239064010424e-05, + "loss": 0.2995, + "num_input_tokens_seen": 9457008, + "step": 6190 + }, + { + "epoch": 19.120556414219475, + "grad_norm": 0.4697796106338501, + "learning_rate": 4.709965096708432e-05, + "loss": 0.3559, + "num_input_tokens_seen": 9465296, + "step": 6195 + }, + { + "epoch": 19.136012364760433, + "grad_norm": 0.4669201970100403, + "learning_rate": 4.709505946211431e-05, + "loss": 0.2915, + "num_input_tokens_seen": 9472816, + "step": 6200 + }, + { + "epoch": 19.136012364760433, + "eval_loss": 0.38108211755752563, + "eval_runtime": 6.2634, + "eval_samples_per_second": 91.803, + "eval_steps_per_second": 22.991, + "num_input_tokens_seen": 9472816, + "step": 6200 + }, + { + "epoch": 19.15146831530139, + "grad_norm": 0.8075712323188782, + "learning_rate": 4.709046454980846e-05, + "loss": 0.3069, + "num_input_tokens_seen": 9480368, + "step": 6205 + }, + { + "epoch": 19.16692426584235, + "grad_norm": 0.3910523056983948, + "learning_rate": 4.708586623087538e-05, + "loss": 0.2852, + "num_input_tokens_seen": 9488176, + "step": 6210 + }, + { + "epoch": 19.182380216383308, + "grad_norm": 0.9846481084823608, + "learning_rate": 4.708126450602418e-05, + "loss": 0.3543, + "num_input_tokens_seen": 9495888, + "step": 6215 + }, + { + "epoch": 19.197836166924265, + "grad_norm": 0.9892299771308899, + "learning_rate": 4.7076659375964495e-05, + "loss": 0.3728, + "num_input_tokens_seen": 9503120, + "step": 6220 + }, + { + "epoch": 19.213292117465222, + "grad_norm": 0.5551151037216187, + "learning_rate": 4.707205084140651e-05, + "loss": 0.394, + "num_input_tokens_seen": 9510608, + "step": 6225 + }, + { + "epoch": 19.228748068006183, + "grad_norm": 0.6394354701042175, + "learning_rate": 4.7067438903060904e-05, + "loss": 0.3792, + "num_input_tokens_seen": 9518608, + "step": 6230 + }, + { + "epoch": 19.24420401854714, + "grad_norm": 0.4732123613357544, + "learning_rate": 4.70628235616389e-05, + "loss": 0.3253, + "num_input_tokens_seen": 9526320, + "step": 6235 + }, + { + "epoch": 19.259659969088098, + "grad_norm": 0.8344833850860596, + "learning_rate": 4.7058204817852256e-05, + "loss": 0.2709, + "num_input_tokens_seen": 9533616, + "step": 6240 + }, + { + "epoch": 19.27511591962906, + "grad_norm": 0.6804960370063782, + "learning_rate": 4.705358267241322e-05, + "loss": 0.3505, + "num_input_tokens_seen": 9541424, + "step": 6245 + }, + { + "epoch": 19.290571870170016, + "grad_norm": 0.5506664514541626, + "learning_rate": 4.704895712603459e-05, + "loss": 0.3271, + "num_input_tokens_seen": 9549200, + "step": 6250 + }, + { + "epoch": 19.306027820710973, + "grad_norm": 0.5317472815513611, + "learning_rate": 4.704432817942969e-05, + "loss": 0.3401, + "num_input_tokens_seen": 9556496, + "step": 6255 + }, + { + "epoch": 19.32148377125193, + "grad_norm": 0.574023962020874, + "learning_rate": 4.703969583331236e-05, + "loss": 0.2917, + "num_input_tokens_seen": 9564592, + "step": 6260 + }, + { + "epoch": 19.33693972179289, + "grad_norm": 0.6826043725013733, + "learning_rate": 4.7035060088396965e-05, + "loss": 0.2882, + "num_input_tokens_seen": 9571760, + "step": 6265 + }, + { + "epoch": 19.35239567233385, + "grad_norm": 0.6060888171195984, + "learning_rate": 4.703042094539839e-05, + "loss": 0.3362, + "num_input_tokens_seen": 9579216, + "step": 6270 + }, + { + "epoch": 19.367851622874806, + "grad_norm": 0.30513593554496765, + "learning_rate": 4.702577840503206e-05, + "loss": 0.2768, + "num_input_tokens_seen": 9586960, + "step": 6275 + }, + { + "epoch": 19.383307573415767, + "grad_norm": 0.5196616053581238, + "learning_rate": 4.70211324680139e-05, + "loss": 0.3194, + "num_input_tokens_seen": 9594768, + "step": 6280 + }, + { + "epoch": 19.398763523956724, + "grad_norm": 0.6531980037689209, + "learning_rate": 4.7016483135060386e-05, + "loss": 0.2797, + "num_input_tokens_seen": 9602448, + "step": 6285 + }, + { + "epoch": 19.41421947449768, + "grad_norm": 0.6358378529548645, + "learning_rate": 4.701183040688849e-05, + "loss": 0.4147, + "num_input_tokens_seen": 9610960, + "step": 6290 + }, + { + "epoch": 19.42967542503864, + "grad_norm": 0.7148062586784363, + "learning_rate": 4.700717428421573e-05, + "loss": 0.3627, + "num_input_tokens_seen": 9618480, + "step": 6295 + }, + { + "epoch": 19.4451313755796, + "grad_norm": 0.39059755206108093, + "learning_rate": 4.700251476776014e-05, + "loss": 0.2569, + "num_input_tokens_seen": 9626384, + "step": 6300 + }, + { + "epoch": 19.460587326120557, + "grad_norm": 0.6733751893043518, + "learning_rate": 4.699785185824026e-05, + "loss": 0.3016, + "num_input_tokens_seen": 9634384, + "step": 6305 + }, + { + "epoch": 19.476043276661514, + "grad_norm": 0.47887277603149414, + "learning_rate": 4.699318555637519e-05, + "loss": 0.325, + "num_input_tokens_seen": 9642064, + "step": 6310 + }, + { + "epoch": 19.491499227202475, + "grad_norm": 0.8109880089759827, + "learning_rate": 4.6988515862884525e-05, + "loss": 0.2804, + "num_input_tokens_seen": 9649424, + "step": 6315 + }, + { + "epoch": 19.506955177743432, + "grad_norm": 0.399027943611145, + "learning_rate": 4.698384277848838e-05, + "loss": 0.2425, + "num_input_tokens_seen": 9656976, + "step": 6320 + }, + { + "epoch": 19.52241112828439, + "grad_norm": 0.5299573540687561, + "learning_rate": 4.6979166303907425e-05, + "loss": 0.3373, + "num_input_tokens_seen": 9664880, + "step": 6325 + }, + { + "epoch": 19.537867078825347, + "grad_norm": 0.4571613371372223, + "learning_rate": 4.697448643986281e-05, + "loss": 0.3135, + "num_input_tokens_seen": 9672656, + "step": 6330 + }, + { + "epoch": 19.553323029366307, + "grad_norm": 0.447256863117218, + "learning_rate": 4.696980318707624e-05, + "loss": 0.2943, + "num_input_tokens_seen": 9680240, + "step": 6335 + }, + { + "epoch": 19.568778979907265, + "grad_norm": 0.7557080388069153, + "learning_rate": 4.6965116546269924e-05, + "loss": 0.4175, + "num_input_tokens_seen": 9688528, + "step": 6340 + }, + { + "epoch": 19.584234930448222, + "grad_norm": 0.5100908279418945, + "learning_rate": 4.6960426518166615e-05, + "loss": 0.2447, + "num_input_tokens_seen": 9696400, + "step": 6345 + }, + { + "epoch": 19.59969088098918, + "grad_norm": 0.6640424132347107, + "learning_rate": 4.6955733103489556e-05, + "loss": 0.286, + "num_input_tokens_seen": 9704048, + "step": 6350 + }, + { + "epoch": 19.61514683153014, + "grad_norm": 0.6493605375289917, + "learning_rate": 4.695103630296255e-05, + "loss": 0.2487, + "num_input_tokens_seen": 9711568, + "step": 6355 + }, + { + "epoch": 19.630602782071097, + "grad_norm": 0.5879271626472473, + "learning_rate": 4.694633611730988e-05, + "loss": 0.3275, + "num_input_tokens_seen": 9719088, + "step": 6360 + }, + { + "epoch": 19.646058732612055, + "grad_norm": 0.5597089529037476, + "learning_rate": 4.694163254725639e-05, + "loss": 0.3624, + "num_input_tokens_seen": 9726512, + "step": 6365 + }, + { + "epoch": 19.661514683153015, + "grad_norm": 0.8104020357131958, + "learning_rate": 4.693692559352743e-05, + "loss": 0.3313, + "num_input_tokens_seen": 9734416, + "step": 6370 + }, + { + "epoch": 19.676970633693973, + "grad_norm": 0.6394801139831543, + "learning_rate": 4.693221525684886e-05, + "loss": 0.3532, + "num_input_tokens_seen": 9741680, + "step": 6375 + }, + { + "epoch": 19.69242658423493, + "grad_norm": 0.4389471411705017, + "learning_rate": 4.6927501537947084e-05, + "loss": 0.2739, + "num_input_tokens_seen": 9749328, + "step": 6380 + }, + { + "epoch": 19.707882534775887, + "grad_norm": 0.808586835861206, + "learning_rate": 4.692278443754901e-05, + "loss": 0.3484, + "num_input_tokens_seen": 9756560, + "step": 6385 + }, + { + "epoch": 19.723338485316848, + "grad_norm": 0.7129480838775635, + "learning_rate": 4.691806395638208e-05, + "loss": 0.3738, + "num_input_tokens_seen": 9763632, + "step": 6390 + }, + { + "epoch": 19.738794435857805, + "grad_norm": 0.7631579637527466, + "learning_rate": 4.6913340095174255e-05, + "loss": 0.3425, + "num_input_tokens_seen": 9771600, + "step": 6395 + }, + { + "epoch": 19.754250386398763, + "grad_norm": 0.8345904350280762, + "learning_rate": 4.690861285465399e-05, + "loss": 0.3482, + "num_input_tokens_seen": 9779344, + "step": 6400 + }, + { + "epoch": 19.754250386398763, + "eval_loss": 0.3782135546207428, + "eval_runtime": 6.2887, + "eval_samples_per_second": 91.433, + "eval_steps_per_second": 22.898, + "num_input_tokens_seen": 9779344, + "step": 6400 + }, + { + "epoch": 19.769706336939723, + "grad_norm": 0.4628579914569855, + "learning_rate": 4.690388223555031e-05, + "loss": 0.2831, + "num_input_tokens_seen": 9787440, + "step": 6405 + }, + { + "epoch": 19.78516228748068, + "grad_norm": 0.6261805295944214, + "learning_rate": 4.689914823859273e-05, + "loss": 0.292, + "num_input_tokens_seen": 9795280, + "step": 6410 + }, + { + "epoch": 19.800618238021638, + "grad_norm": 0.5171941518783569, + "learning_rate": 4.689441086451129e-05, + "loss": 0.2895, + "num_input_tokens_seen": 9802800, + "step": 6415 + }, + { + "epoch": 19.816074188562595, + "grad_norm": 0.35208576917648315, + "learning_rate": 4.688967011403655e-05, + "loss": 0.2568, + "num_input_tokens_seen": 9810192, + "step": 6420 + }, + { + "epoch": 19.831530139103556, + "grad_norm": 0.7234087586402893, + "learning_rate": 4.68849259878996e-05, + "loss": 0.3309, + "num_input_tokens_seen": 9817968, + "step": 6425 + }, + { + "epoch": 19.846986089644513, + "grad_norm": 0.5462307333946228, + "learning_rate": 4.6880178486832036e-05, + "loss": 0.379, + "num_input_tokens_seen": 9824976, + "step": 6430 + }, + { + "epoch": 19.86244204018547, + "grad_norm": 0.8462350964546204, + "learning_rate": 4.687542761156598e-05, + "loss": 0.2916, + "num_input_tokens_seen": 9832976, + "step": 6435 + }, + { + "epoch": 19.87789799072643, + "grad_norm": 0.6238448619842529, + "learning_rate": 4.6870673362834096e-05, + "loss": 0.3786, + "num_input_tokens_seen": 9840592, + "step": 6440 + }, + { + "epoch": 19.89335394126739, + "grad_norm": 0.611262857913971, + "learning_rate": 4.6865915741369526e-05, + "loss": 0.3671, + "num_input_tokens_seen": 9848144, + "step": 6445 + }, + { + "epoch": 19.908809891808346, + "grad_norm": 0.5896044373512268, + "learning_rate": 4.686115474790597e-05, + "loss": 0.314, + "num_input_tokens_seen": 9855344, + "step": 6450 + }, + { + "epoch": 19.924265842349303, + "grad_norm": 0.49990805983543396, + "learning_rate": 4.685639038317762e-05, + "loss": 0.2751, + "num_input_tokens_seen": 9863184, + "step": 6455 + }, + { + "epoch": 19.939721792890264, + "grad_norm": 1.2044134140014648, + "learning_rate": 4.685162264791921e-05, + "loss": 0.3846, + "num_input_tokens_seen": 9871472, + "step": 6460 + }, + { + "epoch": 19.95517774343122, + "grad_norm": 0.47959715127944946, + "learning_rate": 4.684685154286599e-05, + "loss": 0.4465, + "num_input_tokens_seen": 9879184, + "step": 6465 + }, + { + "epoch": 19.97063369397218, + "grad_norm": 0.7492236495018005, + "learning_rate": 4.684207706875371e-05, + "loss": 0.2922, + "num_input_tokens_seen": 9886960, + "step": 6470 + }, + { + "epoch": 19.986089644513136, + "grad_norm": 0.7180370092391968, + "learning_rate": 4.683729922631866e-05, + "loss": 0.3458, + "num_input_tokens_seen": 9894512, + "step": 6475 + }, + { + "epoch": 20.0, + "grad_norm": 1.0040011405944824, + "learning_rate": 4.683251801629765e-05, + "loss": 0.332, + "num_input_tokens_seen": 9901280, + "step": 6480 + }, + { + "epoch": 20.015455950540957, + "grad_norm": 0.4137362241744995, + "learning_rate": 4.6827733439428e-05, + "loss": 0.3085, + "num_input_tokens_seen": 9908448, + "step": 6485 + }, + { + "epoch": 20.030911901081918, + "grad_norm": 0.5318703055381775, + "learning_rate": 4.682294549644754e-05, + "loss": 0.2918, + "num_input_tokens_seen": 9915904, + "step": 6490 + }, + { + "epoch": 20.046367851622875, + "grad_norm": 0.851500928401947, + "learning_rate": 4.681815418809464e-05, + "loss": 0.3201, + "num_input_tokens_seen": 9924384, + "step": 6495 + }, + { + "epoch": 20.061823802163833, + "grad_norm": 0.8718507289886475, + "learning_rate": 4.681335951510819e-05, + "loss": 0.3004, + "num_input_tokens_seen": 9932448, + "step": 6500 + }, + { + "epoch": 20.07727975270479, + "grad_norm": 0.37704914808273315, + "learning_rate": 4.6808561478227576e-05, + "loss": 0.3033, + "num_input_tokens_seen": 9939744, + "step": 6505 + }, + { + "epoch": 20.09273570324575, + "grad_norm": 0.44853660464286804, + "learning_rate": 4.680376007819271e-05, + "loss": 0.2514, + "num_input_tokens_seen": 9947744, + "step": 6510 + }, + { + "epoch": 20.108191653786708, + "grad_norm": 1.1261627674102783, + "learning_rate": 4.679895531574405e-05, + "loss": 0.3533, + "num_input_tokens_seen": 9955072, + "step": 6515 + }, + { + "epoch": 20.123647604327665, + "grad_norm": 0.3747239410877228, + "learning_rate": 4.679414719162253e-05, + "loss": 0.3073, + "num_input_tokens_seen": 9962752, + "step": 6520 + }, + { + "epoch": 20.139103554868626, + "grad_norm": 0.5027480125427246, + "learning_rate": 4.6789335706569635e-05, + "loss": 0.287, + "num_input_tokens_seen": 9970784, + "step": 6525 + }, + { + "epoch": 20.154559505409583, + "grad_norm": 0.7386450171470642, + "learning_rate": 4.678452086132734e-05, + "loss": 0.2999, + "num_input_tokens_seen": 9978176, + "step": 6530 + }, + { + "epoch": 20.17001545595054, + "grad_norm": 0.563643217086792, + "learning_rate": 4.677970265663818e-05, + "loss": 0.3926, + "num_input_tokens_seen": 9985728, + "step": 6535 + }, + { + "epoch": 20.185471406491498, + "grad_norm": 0.6800942420959473, + "learning_rate": 4.677488109324517e-05, + "loss": 0.3284, + "num_input_tokens_seen": 9993568, + "step": 6540 + }, + { + "epoch": 20.20092735703246, + "grad_norm": 0.5700423121452332, + "learning_rate": 4.6770056171891846e-05, + "loss": 0.3589, + "num_input_tokens_seen": 10001344, + "step": 6545 + }, + { + "epoch": 20.216383307573416, + "grad_norm": 0.48728451132774353, + "learning_rate": 4.6765227893322286e-05, + "loss": 0.2827, + "num_input_tokens_seen": 10008512, + "step": 6550 + }, + { + "epoch": 20.231839258114373, + "grad_norm": 0.6094002723693848, + "learning_rate": 4.676039625828107e-05, + "loss": 0.3162, + "num_input_tokens_seen": 10015840, + "step": 6555 + }, + { + "epoch": 20.24729520865533, + "grad_norm": 0.47687828540802, + "learning_rate": 4.675556126751328e-05, + "loss": 0.2956, + "num_input_tokens_seen": 10023456, + "step": 6560 + }, + { + "epoch": 20.26275115919629, + "grad_norm": 0.45789995789527893, + "learning_rate": 4.6750722921764556e-05, + "loss": 0.412, + "num_input_tokens_seen": 10031104, + "step": 6565 + }, + { + "epoch": 20.27820710973725, + "grad_norm": 0.3525905907154083, + "learning_rate": 4.674588122178102e-05, + "loss": 0.3181, + "num_input_tokens_seen": 10039136, + "step": 6570 + }, + { + "epoch": 20.293663060278206, + "grad_norm": 0.4267673194408417, + "learning_rate": 4.674103616830931e-05, + "loss": 0.2771, + "num_input_tokens_seen": 10046944, + "step": 6575 + }, + { + "epoch": 20.309119010819167, + "grad_norm": 0.6155996322631836, + "learning_rate": 4.673618776209663e-05, + "loss": 0.3224, + "num_input_tokens_seen": 10054848, + "step": 6580 + }, + { + "epoch": 20.324574961360124, + "grad_norm": 0.779651403427124, + "learning_rate": 4.673133600389063e-05, + "loss": 0.3389, + "num_input_tokens_seen": 10062080, + "step": 6585 + }, + { + "epoch": 20.34003091190108, + "grad_norm": 0.6163356304168701, + "learning_rate": 4.672648089443953e-05, + "loss": 0.3357, + "num_input_tokens_seen": 10069600, + "step": 6590 + }, + { + "epoch": 20.35548686244204, + "grad_norm": 0.5118224024772644, + "learning_rate": 4.672162243449204e-05, + "loss": 0.2928, + "num_input_tokens_seen": 10078016, + "step": 6595 + }, + { + "epoch": 20.370942812983, + "grad_norm": 0.6092512011528015, + "learning_rate": 4.67167606247974e-05, + "loss": 0.2646, + "num_input_tokens_seen": 10085888, + "step": 6600 + }, + { + "epoch": 20.370942812983, + "eval_loss": 0.37446561455726624, + "eval_runtime": 6.3428, + "eval_samples_per_second": 90.654, + "eval_steps_per_second": 22.703, + "num_input_tokens_seen": 10085888, + "step": 6600 + }, + { + "epoch": 20.386398763523957, + "grad_norm": 0.3959643840789795, + "learning_rate": 4.671189546610536e-05, + "loss": 0.3086, + "num_input_tokens_seen": 10093152, + "step": 6605 + }, + { + "epoch": 20.401854714064914, + "grad_norm": 0.8311211466789246, + "learning_rate": 4.67070269591662e-05, + "loss": 0.3522, + "num_input_tokens_seen": 10100832, + "step": 6610 + }, + { + "epoch": 20.417310664605875, + "grad_norm": 0.5197929739952087, + "learning_rate": 4.670215510473068e-05, + "loss": 0.2768, + "num_input_tokens_seen": 10108384, + "step": 6615 + }, + { + "epoch": 20.432766615146832, + "grad_norm": 0.7418018579483032, + "learning_rate": 4.669727990355013e-05, + "loss": 0.3628, + "num_input_tokens_seen": 10116416, + "step": 6620 + }, + { + "epoch": 20.44822256568779, + "grad_norm": 1.2884290218353271, + "learning_rate": 4.669240135637635e-05, + "loss": 0.3292, + "num_input_tokens_seen": 10123584, + "step": 6625 + }, + { + "epoch": 20.463678516228747, + "grad_norm": 0.4514104127883911, + "learning_rate": 4.6687519463961675e-05, + "loss": 0.2979, + "num_input_tokens_seen": 10131520, + "step": 6630 + }, + { + "epoch": 20.479134466769708, + "grad_norm": 0.7831520438194275, + "learning_rate": 4.668263422705896e-05, + "loss": 0.4158, + "num_input_tokens_seen": 10139328, + "step": 6635 + }, + { + "epoch": 20.494590417310665, + "grad_norm": 0.5649375319480896, + "learning_rate": 4.667774564642156e-05, + "loss": 0.2795, + "num_input_tokens_seen": 10146720, + "step": 6640 + }, + { + "epoch": 20.510046367851622, + "grad_norm": 0.30011048913002014, + "learning_rate": 4.6672853722803365e-05, + "loss": 0.2516, + "num_input_tokens_seen": 10154272, + "step": 6645 + }, + { + "epoch": 20.525502318392583, + "grad_norm": 0.909736692905426, + "learning_rate": 4.666795845695877e-05, + "loss": 0.3655, + "num_input_tokens_seen": 10161920, + "step": 6650 + }, + { + "epoch": 20.54095826893354, + "grad_norm": 0.6733002066612244, + "learning_rate": 4.666305984964269e-05, + "loss": 0.2458, + "num_input_tokens_seen": 10169408, + "step": 6655 + }, + { + "epoch": 20.556414219474497, + "grad_norm": 0.5303699970245361, + "learning_rate": 4.6658157901610535e-05, + "loss": 0.294, + "num_input_tokens_seen": 10177504, + "step": 6660 + }, + { + "epoch": 20.571870170015455, + "grad_norm": 0.5757824182510376, + "learning_rate": 4.665325261361826e-05, + "loss": 0.3033, + "num_input_tokens_seen": 10185312, + "step": 6665 + }, + { + "epoch": 20.587326120556416, + "grad_norm": 0.38786187767982483, + "learning_rate": 4.664834398642232e-05, + "loss": 0.2906, + "num_input_tokens_seen": 10193184, + "step": 6670 + }, + { + "epoch": 20.602782071097373, + "grad_norm": 0.623489260673523, + "learning_rate": 4.6643432020779686e-05, + "loss": 0.3036, + "num_input_tokens_seen": 10200704, + "step": 6675 + }, + { + "epoch": 20.61823802163833, + "grad_norm": 0.7820562124252319, + "learning_rate": 4.663851671744786e-05, + "loss": 0.3195, + "num_input_tokens_seen": 10208768, + "step": 6680 + }, + { + "epoch": 20.633693972179287, + "grad_norm": 0.4612707793712616, + "learning_rate": 4.6633598077184815e-05, + "loss": 0.3425, + "num_input_tokens_seen": 10216160, + "step": 6685 + }, + { + "epoch": 20.649149922720248, + "grad_norm": 0.47800368070602417, + "learning_rate": 4.662867610074908e-05, + "loss": 0.389, + "num_input_tokens_seen": 10223712, + "step": 6690 + }, + { + "epoch": 20.664605873261205, + "grad_norm": 0.6952123641967773, + "learning_rate": 4.6623750788899696e-05, + "loss": 0.2867, + "num_input_tokens_seen": 10231680, + "step": 6695 + }, + { + "epoch": 20.680061823802163, + "grad_norm": 0.35630521178245544, + "learning_rate": 4.6618822142396195e-05, + "loss": 0.3866, + "num_input_tokens_seen": 10238944, + "step": 6700 + }, + { + "epoch": 20.695517774343124, + "grad_norm": 0.5338093042373657, + "learning_rate": 4.661389016199864e-05, + "loss": 0.3833, + "num_input_tokens_seen": 10247136, + "step": 6705 + }, + { + "epoch": 20.71097372488408, + "grad_norm": 0.4746811091899872, + "learning_rate": 4.660895484846761e-05, + "loss": 0.3267, + "num_input_tokens_seen": 10254848, + "step": 6710 + }, + { + "epoch": 20.726429675425038, + "grad_norm": 0.4697469472885132, + "learning_rate": 4.660401620256418e-05, + "loss": 0.3491, + "num_input_tokens_seen": 10262976, + "step": 6715 + }, + { + "epoch": 20.741885625965995, + "grad_norm": 0.3735218048095703, + "learning_rate": 4.659907422504997e-05, + "loss": 0.3209, + "num_input_tokens_seen": 10270688, + "step": 6720 + }, + { + "epoch": 20.757341576506956, + "grad_norm": 1.1122324466705322, + "learning_rate": 4.6594128916687074e-05, + "loss": 0.3339, + "num_input_tokens_seen": 10278336, + "step": 6725 + }, + { + "epoch": 20.772797527047913, + "grad_norm": 0.5539305210113525, + "learning_rate": 4.658918027823813e-05, + "loss": 0.2911, + "num_input_tokens_seen": 10286304, + "step": 6730 + }, + { + "epoch": 20.78825347758887, + "grad_norm": 0.4188987910747528, + "learning_rate": 4.658422831046628e-05, + "loss": 0.3143, + "num_input_tokens_seen": 10293856, + "step": 6735 + }, + { + "epoch": 20.80370942812983, + "grad_norm": 0.463886022567749, + "learning_rate": 4.657927301413518e-05, + "loss": 0.318, + "num_input_tokens_seen": 10301184, + "step": 6740 + }, + { + "epoch": 20.81916537867079, + "grad_norm": 0.5276710391044617, + "learning_rate": 4.657431439000901e-05, + "loss": 0.303, + "num_input_tokens_seen": 10308896, + "step": 6745 + }, + { + "epoch": 20.834621329211746, + "grad_norm": 0.3705928325653076, + "learning_rate": 4.656935243885243e-05, + "loss": 0.236, + "num_input_tokens_seen": 10316512, + "step": 6750 + }, + { + "epoch": 20.850077279752703, + "grad_norm": 0.6944977045059204, + "learning_rate": 4.656438716143066e-05, + "loss": 0.321, + "num_input_tokens_seen": 10324032, + "step": 6755 + }, + { + "epoch": 20.865533230293664, + "grad_norm": 0.7788947820663452, + "learning_rate": 4.6559418558509384e-05, + "loss": 0.2749, + "num_input_tokens_seen": 10331328, + "step": 6760 + }, + { + "epoch": 20.88098918083462, + "grad_norm": 0.6618296504020691, + "learning_rate": 4.6554446630854833e-05, + "loss": 0.3073, + "num_input_tokens_seen": 10339328, + "step": 6765 + }, + { + "epoch": 20.89644513137558, + "grad_norm": 0.34084552526474, + "learning_rate": 4.654947137923374e-05, + "loss": 0.2951, + "num_input_tokens_seen": 10346368, + "step": 6770 + }, + { + "epoch": 20.91190108191654, + "grad_norm": 0.6642453670501709, + "learning_rate": 4.654449280441335e-05, + "loss": 0.3791, + "num_input_tokens_seen": 10353920, + "step": 6775 + }, + { + "epoch": 20.927357032457497, + "grad_norm": 0.7078308463096619, + "learning_rate": 4.653951090716143e-05, + "loss": 0.2961, + "num_input_tokens_seen": 10361408, + "step": 6780 + }, + { + "epoch": 20.942812982998454, + "grad_norm": 0.4350404143333435, + "learning_rate": 4.653452568824625e-05, + "loss": 0.3561, + "num_input_tokens_seen": 10369728, + "step": 6785 + }, + { + "epoch": 20.95826893353941, + "grad_norm": 0.5600757598876953, + "learning_rate": 4.6529537148436585e-05, + "loss": 0.2303, + "num_input_tokens_seen": 10377184, + "step": 6790 + }, + { + "epoch": 20.973724884080372, + "grad_norm": 0.31091707944869995, + "learning_rate": 4.6524545288501734e-05, + "loss": 0.2645, + "num_input_tokens_seen": 10384480, + "step": 6795 + }, + { + "epoch": 20.98918083462133, + "grad_norm": 0.5507645606994629, + "learning_rate": 4.6519550109211506e-05, + "loss": 0.2759, + "num_input_tokens_seen": 10391904, + "step": 6800 + }, + { + "epoch": 20.98918083462133, + "eval_loss": 0.37025368213653564, + "eval_runtime": 6.3278, + "eval_samples_per_second": 90.869, + "eval_steps_per_second": 22.757, + "num_input_tokens_seen": 10391904, + "step": 6800 + }, + { + "epoch": 21.00309119010819, + "grad_norm": 0.6314232349395752, + "learning_rate": 4.651455161133622e-05, + "loss": 0.4299, + "num_input_tokens_seen": 10398144, + "step": 6805 + }, + { + "epoch": 21.01854714064915, + "grad_norm": 0.6916938424110413, + "learning_rate": 4.6509549795646704e-05, + "loss": 0.2923, + "num_input_tokens_seen": 10405568, + "step": 6810 + }, + { + "epoch": 21.034003091190108, + "grad_norm": 0.5412012338638306, + "learning_rate": 4.6504544662914306e-05, + "loss": 0.3242, + "num_input_tokens_seen": 10413728, + "step": 6815 + }, + { + "epoch": 21.049459041731065, + "grad_norm": 0.48048949241638184, + "learning_rate": 4.6499536213910876e-05, + "loss": 0.3396, + "num_input_tokens_seen": 10421408, + "step": 6820 + }, + { + "epoch": 21.064914992272026, + "grad_norm": 0.6655123233795166, + "learning_rate": 4.6494524449408786e-05, + "loss": 0.2533, + "num_input_tokens_seen": 10429568, + "step": 6825 + }, + { + "epoch": 21.080370942812984, + "grad_norm": 0.4339289963245392, + "learning_rate": 4.6489509370180903e-05, + "loss": 0.3598, + "num_input_tokens_seen": 10437696, + "step": 6830 + }, + { + "epoch": 21.09582689335394, + "grad_norm": 0.63597172498703, + "learning_rate": 4.648449097700063e-05, + "loss": 0.3387, + "num_input_tokens_seen": 10445344, + "step": 6835 + }, + { + "epoch": 21.111282843894898, + "grad_norm": 0.469043493270874, + "learning_rate": 4.647946927064185e-05, + "loss": 0.3061, + "num_input_tokens_seen": 10452960, + "step": 6840 + }, + { + "epoch": 21.12673879443586, + "grad_norm": 0.6493447422981262, + "learning_rate": 4.647444425187898e-05, + "loss": 0.261, + "num_input_tokens_seen": 10460768, + "step": 6845 + }, + { + "epoch": 21.142194744976816, + "grad_norm": 0.5965878963470459, + "learning_rate": 4.646941592148695e-05, + "loss": 0.2895, + "num_input_tokens_seen": 10468896, + "step": 6850 + }, + { + "epoch": 21.157650695517773, + "grad_norm": 0.758394181728363, + "learning_rate": 4.646438428024117e-05, + "loss": 0.3053, + "num_input_tokens_seen": 10476096, + "step": 6855 + }, + { + "epoch": 21.173106646058734, + "grad_norm": 0.32339611649513245, + "learning_rate": 4.64593493289176e-05, + "loss": 0.3095, + "num_input_tokens_seen": 10483872, + "step": 6860 + }, + { + "epoch": 21.18856259659969, + "grad_norm": 0.6981034874916077, + "learning_rate": 4.64543110682927e-05, + "loss": 0.3372, + "num_input_tokens_seen": 10491712, + "step": 6865 + }, + { + "epoch": 21.20401854714065, + "grad_norm": 0.6534978747367859, + "learning_rate": 4.644926949914341e-05, + "loss": 0.3203, + "num_input_tokens_seen": 10499136, + "step": 6870 + }, + { + "epoch": 21.219474497681606, + "grad_norm": 0.6923830509185791, + "learning_rate": 4.644422462224722e-05, + "loss": 0.373, + "num_input_tokens_seen": 10507328, + "step": 6875 + }, + { + "epoch": 21.234930448222567, + "grad_norm": 0.4354461431503296, + "learning_rate": 4.643917643838211e-05, + "loss": 0.3362, + "num_input_tokens_seen": 10515200, + "step": 6880 + }, + { + "epoch": 21.250386398763524, + "grad_norm": 0.5080179572105408, + "learning_rate": 4.6434124948326564e-05, + "loss": 0.3372, + "num_input_tokens_seen": 10522656, + "step": 6885 + }, + { + "epoch": 21.26584234930448, + "grad_norm": 0.6538461446762085, + "learning_rate": 4.6429070152859594e-05, + "loss": 0.2865, + "num_input_tokens_seen": 10530272, + "step": 6890 + }, + { + "epoch": 21.28129829984544, + "grad_norm": 0.47602543234825134, + "learning_rate": 4.6424012052760714e-05, + "loss": 0.2596, + "num_input_tokens_seen": 10537920, + "step": 6895 + }, + { + "epoch": 21.2967542503864, + "grad_norm": 0.6049955487251282, + "learning_rate": 4.6418950648809945e-05, + "loss": 0.306, + "num_input_tokens_seen": 10545632, + "step": 6900 + }, + { + "epoch": 21.312210200927357, + "grad_norm": 0.3865658640861511, + "learning_rate": 4.641388594178782e-05, + "loss": 0.3017, + "num_input_tokens_seen": 10552864, + "step": 6905 + }, + { + "epoch": 21.327666151468314, + "grad_norm": 0.5511044263839722, + "learning_rate": 4.640881793247538e-05, + "loss": 0.2758, + "num_input_tokens_seen": 10560960, + "step": 6910 + }, + { + "epoch": 21.343122102009275, + "grad_norm": 0.6072506904602051, + "learning_rate": 4.6403746621654173e-05, + "loss": 0.3083, + "num_input_tokens_seen": 10568032, + "step": 6915 + }, + { + "epoch": 21.358578052550232, + "grad_norm": 0.34026017785072327, + "learning_rate": 4.639867201010626e-05, + "loss": 0.2823, + "num_input_tokens_seen": 10575392, + "step": 6920 + }, + { + "epoch": 21.37403400309119, + "grad_norm": 0.6633991003036499, + "learning_rate": 4.6393594098614204e-05, + "loss": 0.4552, + "num_input_tokens_seen": 10583488, + "step": 6925 + }, + { + "epoch": 21.389489953632147, + "grad_norm": 0.7070685625076294, + "learning_rate": 4.63885128879611e-05, + "loss": 0.3651, + "num_input_tokens_seen": 10591488, + "step": 6930 + }, + { + "epoch": 21.404945904173108, + "grad_norm": 0.6399410367012024, + "learning_rate": 4.638342837893052e-05, + "loss": 0.3617, + "num_input_tokens_seen": 10598656, + "step": 6935 + }, + { + "epoch": 21.420401854714065, + "grad_norm": 0.5183849334716797, + "learning_rate": 4.6378340572306565e-05, + "loss": 0.3811, + "num_input_tokens_seen": 10606400, + "step": 6940 + }, + { + "epoch": 21.435857805255022, + "grad_norm": 0.2829866111278534, + "learning_rate": 4.6373249468873833e-05, + "loss": 0.2717, + "num_input_tokens_seen": 10613920, + "step": 6945 + }, + { + "epoch": 21.451313755795983, + "grad_norm": 0.7939328551292419, + "learning_rate": 4.636815506941744e-05, + "loss": 0.2467, + "num_input_tokens_seen": 10621856, + "step": 6950 + }, + { + "epoch": 21.46676970633694, + "grad_norm": 0.7023244500160217, + "learning_rate": 4.6363057374723004e-05, + "loss": 0.3262, + "num_input_tokens_seen": 10629248, + "step": 6955 + }, + { + "epoch": 21.482225656877898, + "grad_norm": 0.5786235928535461, + "learning_rate": 4.635795638557666e-05, + "loss": 0.3437, + "num_input_tokens_seen": 10636832, + "step": 6960 + }, + { + "epoch": 21.497681607418855, + "grad_norm": 0.507459819316864, + "learning_rate": 4.635285210276504e-05, + "loss": 0.3631, + "num_input_tokens_seen": 10644512, + "step": 6965 + }, + { + "epoch": 21.513137557959816, + "grad_norm": 0.3157813549041748, + "learning_rate": 4.6347744527075295e-05, + "loss": 0.2441, + "num_input_tokens_seen": 10652192, + "step": 6970 + }, + { + "epoch": 21.528593508500773, + "grad_norm": 0.6227090358734131, + "learning_rate": 4.634263365929506e-05, + "loss": 0.2764, + "num_input_tokens_seen": 10659712, + "step": 6975 + }, + { + "epoch": 21.54404945904173, + "grad_norm": 0.6656449437141418, + "learning_rate": 4.6337519500212515e-05, + "loss": 0.2947, + "num_input_tokens_seen": 10667648, + "step": 6980 + }, + { + "epoch": 21.55950540958269, + "grad_norm": 0.7498920559883118, + "learning_rate": 4.633240205061632e-05, + "loss": 0.3544, + "num_input_tokens_seen": 10674656, + "step": 6985 + }, + { + "epoch": 21.57496136012365, + "grad_norm": 0.6165667176246643, + "learning_rate": 4.632728131129565e-05, + "loss": 0.2779, + "num_input_tokens_seen": 10682016, + "step": 6990 + }, + { + "epoch": 21.590417310664606, + "grad_norm": 0.35079407691955566, + "learning_rate": 4.632215728304018e-05, + "loss": 0.2623, + "num_input_tokens_seen": 10689696, + "step": 6995 + }, + { + "epoch": 21.605873261205563, + "grad_norm": 0.3539835810661316, + "learning_rate": 4.63170299666401e-05, + "loss": 0.3038, + "num_input_tokens_seen": 10697664, + "step": 7000 + }, + { + "epoch": 21.605873261205563, + "eval_loss": 0.3680373430252075, + "eval_runtime": 6.2842, + "eval_samples_per_second": 91.499, + "eval_steps_per_second": 22.915, + "num_input_tokens_seen": 10697664, + "step": 7000 + }, + { + "epoch": 21.621329211746524, + "grad_norm": 0.55447918176651, + "learning_rate": 4.631189936288612e-05, + "loss": 0.2527, + "num_input_tokens_seen": 10705440, + "step": 7005 + }, + { + "epoch": 21.63678516228748, + "grad_norm": 0.305332213640213, + "learning_rate": 4.630676547256944e-05, + "loss": 0.3145, + "num_input_tokens_seen": 10712992, + "step": 7010 + }, + { + "epoch": 21.652241112828438, + "grad_norm": 0.4500327706336975, + "learning_rate": 4.630162829648176e-05, + "loss": 0.3043, + "num_input_tokens_seen": 10721376, + "step": 7015 + }, + { + "epoch": 21.667697063369395, + "grad_norm": 0.4773634970188141, + "learning_rate": 4.629648783541531e-05, + "loss": 0.3126, + "num_input_tokens_seen": 10729088, + "step": 7020 + }, + { + "epoch": 21.683153013910356, + "grad_norm": 0.6508457064628601, + "learning_rate": 4.6291344090162804e-05, + "loss": 0.3138, + "num_input_tokens_seen": 10736704, + "step": 7025 + }, + { + "epoch": 21.698608964451314, + "grad_norm": 0.35712313652038574, + "learning_rate": 4.628619706151748e-05, + "loss": 0.3363, + "num_input_tokens_seen": 10744480, + "step": 7030 + }, + { + "epoch": 21.71406491499227, + "grad_norm": 0.5219606161117554, + "learning_rate": 4.628104675027306e-05, + "loss": 0.2416, + "num_input_tokens_seen": 10751840, + "step": 7035 + }, + { + "epoch": 21.72952086553323, + "grad_norm": 0.4908055365085602, + "learning_rate": 4.6275893157223805e-05, + "loss": 0.2943, + "num_input_tokens_seen": 10759200, + "step": 7040 + }, + { + "epoch": 21.74497681607419, + "grad_norm": 0.6718637347221375, + "learning_rate": 4.627073628316445e-05, + "loss": 0.3508, + "num_input_tokens_seen": 10766688, + "step": 7045 + }, + { + "epoch": 21.760432766615146, + "grad_norm": 0.5742186903953552, + "learning_rate": 4.626557612889026e-05, + "loss": 0.3457, + "num_input_tokens_seen": 10773856, + "step": 7050 + }, + { + "epoch": 21.775888717156104, + "grad_norm": 0.4828290641307831, + "learning_rate": 4.626041269519699e-05, + "loss": 0.3085, + "num_input_tokens_seen": 10781888, + "step": 7055 + }, + { + "epoch": 21.791344667697064, + "grad_norm": 0.5139726400375366, + "learning_rate": 4.6255245982880905e-05, + "loss": 0.3708, + "num_input_tokens_seen": 10788832, + "step": 7060 + }, + { + "epoch": 21.80680061823802, + "grad_norm": 0.8231979012489319, + "learning_rate": 4.625007599273879e-05, + "loss": 0.3511, + "num_input_tokens_seen": 10796640, + "step": 7065 + }, + { + "epoch": 21.82225656877898, + "grad_norm": 0.4623977839946747, + "learning_rate": 4.6244902725567895e-05, + "loss": 0.2858, + "num_input_tokens_seen": 10804224, + "step": 7070 + }, + { + "epoch": 21.83771251931994, + "grad_norm": 0.3689199984073639, + "learning_rate": 4.6239726182166024e-05, + "loss": 0.27, + "num_input_tokens_seen": 10811904, + "step": 7075 + }, + { + "epoch": 21.853168469860897, + "grad_norm": 0.60722416639328, + "learning_rate": 4.623454636333147e-05, + "loss": 0.2648, + "num_input_tokens_seen": 10819712, + "step": 7080 + }, + { + "epoch": 21.868624420401854, + "grad_norm": 0.8616818785667419, + "learning_rate": 4.622936326986301e-05, + "loss": 0.3107, + "num_input_tokens_seen": 10827296, + "step": 7085 + }, + { + "epoch": 21.88408037094281, + "grad_norm": 0.8709008097648621, + "learning_rate": 4.6224176902559946e-05, + "loss": 0.3413, + "num_input_tokens_seen": 10834592, + "step": 7090 + }, + { + "epoch": 21.899536321483772, + "grad_norm": 0.34674233198165894, + "learning_rate": 4.621898726222209e-05, + "loss": 0.2938, + "num_input_tokens_seen": 10842048, + "step": 7095 + }, + { + "epoch": 21.91499227202473, + "grad_norm": 0.5478795170783997, + "learning_rate": 4.6213794349649744e-05, + "loss": 0.3541, + "num_input_tokens_seen": 10850240, + "step": 7100 + }, + { + "epoch": 21.930448222565687, + "grad_norm": 0.5890100002288818, + "learning_rate": 4.6208598165643715e-05, + "loss": 0.3331, + "num_input_tokens_seen": 10857856, + "step": 7105 + }, + { + "epoch": 21.945904173106648, + "grad_norm": 0.5115268230438232, + "learning_rate": 4.620339871100533e-05, + "loss": 0.3056, + "num_input_tokens_seen": 10865600, + "step": 7110 + }, + { + "epoch": 21.961360123647605, + "grad_norm": 0.891210675239563, + "learning_rate": 4.6198195986536394e-05, + "loss": 0.2924, + "num_input_tokens_seen": 10873216, + "step": 7115 + }, + { + "epoch": 21.976816074188562, + "grad_norm": 0.41079264879226685, + "learning_rate": 4.619298999303926e-05, + "loss": 0.2638, + "num_input_tokens_seen": 10880928, + "step": 7120 + }, + { + "epoch": 21.99227202472952, + "grad_norm": 0.7031074166297913, + "learning_rate": 4.618778073131673e-05, + "loss": 0.3202, + "num_input_tokens_seen": 10888480, + "step": 7125 + }, + { + "epoch": 22.006182380216384, + "grad_norm": 0.930313229560852, + "learning_rate": 4.618256820217215e-05, + "loss": 0.3062, + "num_input_tokens_seen": 10895232, + "step": 7130 + }, + { + "epoch": 22.02163833075734, + "grad_norm": 0.4496231973171234, + "learning_rate": 4.617735240640936e-05, + "loss": 0.2929, + "num_input_tokens_seen": 10902944, + "step": 7135 + }, + { + "epoch": 22.037094281298298, + "grad_norm": 0.6528662443161011, + "learning_rate": 4.6172133344832705e-05, + "loss": 0.2544, + "num_input_tokens_seen": 10910144, + "step": 7140 + }, + { + "epoch": 22.05255023183926, + "grad_norm": 0.39185675978660583, + "learning_rate": 4.6166911018247004e-05, + "loss": 0.3416, + "num_input_tokens_seen": 10917632, + "step": 7145 + }, + { + "epoch": 22.068006182380216, + "grad_norm": 0.48002609610557556, + "learning_rate": 4.616168542745764e-05, + "loss": 0.3055, + "num_input_tokens_seen": 10925120, + "step": 7150 + }, + { + "epoch": 22.083462132921174, + "grad_norm": 0.4196726977825165, + "learning_rate": 4.6156456573270446e-05, + "loss": 0.2456, + "num_input_tokens_seen": 10932384, + "step": 7155 + }, + { + "epoch": 22.098918083462134, + "grad_norm": 0.8278887867927551, + "learning_rate": 4.615122445649177e-05, + "loss": 0.3341, + "num_input_tokens_seen": 10939936, + "step": 7160 + }, + { + "epoch": 22.11437403400309, + "grad_norm": 0.6173334717750549, + "learning_rate": 4.6145989077928486e-05, + "loss": 0.2574, + "num_input_tokens_seen": 10947264, + "step": 7165 + }, + { + "epoch": 22.12982998454405, + "grad_norm": 0.5831827521324158, + "learning_rate": 4.6140750438387953e-05, + "loss": 0.3991, + "num_input_tokens_seen": 10954624, + "step": 7170 + }, + { + "epoch": 22.145285935085006, + "grad_norm": 0.44955000281333923, + "learning_rate": 4.613550853867803e-05, + "loss": 0.2752, + "num_input_tokens_seen": 10961952, + "step": 7175 + }, + { + "epoch": 22.160741885625967, + "grad_norm": 0.9405484199523926, + "learning_rate": 4.613026337960708e-05, + "loss": 0.3852, + "num_input_tokens_seen": 10969344, + "step": 7180 + }, + { + "epoch": 22.176197836166924, + "grad_norm": 0.7559421062469482, + "learning_rate": 4.612501496198398e-05, + "loss": 0.3368, + "num_input_tokens_seen": 10976608, + "step": 7185 + }, + { + "epoch": 22.19165378670788, + "grad_norm": 0.5196495652198792, + "learning_rate": 4.61197632866181e-05, + "loss": 0.2695, + "num_input_tokens_seen": 10984416, + "step": 7190 + }, + { + "epoch": 22.207109737248842, + "grad_norm": 0.5791073441505432, + "learning_rate": 4.611450835431931e-05, + "loss": 0.2663, + "num_input_tokens_seen": 10992320, + "step": 7195 + }, + { + "epoch": 22.2225656877898, + "grad_norm": 0.7712091207504272, + "learning_rate": 4.6109250165898e-05, + "loss": 0.313, + "num_input_tokens_seen": 11000832, + "step": 7200 + }, + { + "epoch": 22.2225656877898, + "eval_loss": 0.3668964207172394, + "eval_runtime": 6.3221, + "eval_samples_per_second": 90.951, + "eval_steps_per_second": 22.777, + "num_input_tokens_seen": 11000832, + "step": 7200 + }, + { + "epoch": 22.238021638330757, + "grad_norm": 0.5238305926322937, + "learning_rate": 4.610398872216503e-05, + "loss": 0.2664, + "num_input_tokens_seen": 11008576, + "step": 7205 + }, + { + "epoch": 22.253477588871714, + "grad_norm": 1.5130215883255005, + "learning_rate": 4.6098724023931796e-05, + "loss": 0.3401, + "num_input_tokens_seen": 11016416, + "step": 7210 + }, + { + "epoch": 22.268933539412675, + "grad_norm": 0.5417104959487915, + "learning_rate": 4.609345607201017e-05, + "loss": 0.299, + "num_input_tokens_seen": 11023968, + "step": 7215 + }, + { + "epoch": 22.284389489953632, + "grad_norm": 0.45903822779655457, + "learning_rate": 4.608818486721254e-05, + "loss": 0.2714, + "num_input_tokens_seen": 11031616, + "step": 7220 + }, + { + "epoch": 22.29984544049459, + "grad_norm": 0.44126826524734497, + "learning_rate": 4.608291041035179e-05, + "loss": 0.3889, + "num_input_tokens_seen": 11039136, + "step": 7225 + }, + { + "epoch": 22.315301391035547, + "grad_norm": 0.7781461477279663, + "learning_rate": 4.607763270224132e-05, + "loss": 0.3074, + "num_input_tokens_seen": 11046336, + "step": 7230 + }, + { + "epoch": 22.330757341576508, + "grad_norm": 0.40850475430488586, + "learning_rate": 4.6072351743695e-05, + "loss": 0.2472, + "num_input_tokens_seen": 11054400, + "step": 7235 + }, + { + "epoch": 22.346213292117465, + "grad_norm": 0.5661147832870483, + "learning_rate": 4.606706753552723e-05, + "loss": 0.3014, + "num_input_tokens_seen": 11062240, + "step": 7240 + }, + { + "epoch": 22.361669242658422, + "grad_norm": 0.573276162147522, + "learning_rate": 4.6061780078552906e-05, + "loss": 0.3693, + "num_input_tokens_seen": 11069856, + "step": 7245 + }, + { + "epoch": 22.377125193199383, + "grad_norm": 0.570249617099762, + "learning_rate": 4.605648937358742e-05, + "loss": 0.2308, + "num_input_tokens_seen": 11077440, + "step": 7250 + }, + { + "epoch": 22.39258114374034, + "grad_norm": 0.5394953489303589, + "learning_rate": 4.605119542144665e-05, + "loss": 0.2385, + "num_input_tokens_seen": 11084576, + "step": 7255 + }, + { + "epoch": 22.408037094281298, + "grad_norm": 0.5904368758201599, + "learning_rate": 4.604589822294701e-05, + "loss": 0.2768, + "num_input_tokens_seen": 11092544, + "step": 7260 + }, + { + "epoch": 22.423493044822255, + "grad_norm": 0.492351770401001, + "learning_rate": 4.604059777890537e-05, + "loss": 0.3082, + "num_input_tokens_seen": 11100288, + "step": 7265 + }, + { + "epoch": 22.438948995363216, + "grad_norm": 0.8156108856201172, + "learning_rate": 4.6035294090139145e-05, + "loss": 0.2647, + "num_input_tokens_seen": 11107808, + "step": 7270 + }, + { + "epoch": 22.454404945904173, + "grad_norm": 0.8619848489761353, + "learning_rate": 4.6029987157466226e-05, + "loss": 0.2872, + "num_input_tokens_seen": 11115648, + "step": 7275 + }, + { + "epoch": 22.46986089644513, + "grad_norm": 0.6211925745010376, + "learning_rate": 4.602467698170502e-05, + "loss": 0.3227, + "num_input_tokens_seen": 11123520, + "step": 7280 + }, + { + "epoch": 22.48531684698609, + "grad_norm": 0.7977660298347473, + "learning_rate": 4.601936356367439e-05, + "loss": 0.3196, + "num_input_tokens_seen": 11131040, + "step": 7285 + }, + { + "epoch": 22.50077279752705, + "grad_norm": 0.6427186727523804, + "learning_rate": 4.601404690419377e-05, + "loss": 0.2176, + "num_input_tokens_seen": 11138720, + "step": 7290 + }, + { + "epoch": 22.516228748068006, + "grad_norm": 0.8422905206680298, + "learning_rate": 4.600872700408303e-05, + "loss": 0.2807, + "num_input_tokens_seen": 11147200, + "step": 7295 + }, + { + "epoch": 22.531684698608963, + "grad_norm": 0.6601250171661377, + "learning_rate": 4.600340386416258e-05, + "loss": 0.3309, + "num_input_tokens_seen": 11155232, + "step": 7300 + }, + { + "epoch": 22.547140649149924, + "grad_norm": 0.7090789675712585, + "learning_rate": 4.5998077485253296e-05, + "loss": 0.2627, + "num_input_tokens_seen": 11162496, + "step": 7305 + }, + { + "epoch": 22.56259659969088, + "grad_norm": 0.7865273356437683, + "learning_rate": 4.59927478681766e-05, + "loss": 0.3857, + "num_input_tokens_seen": 11170080, + "step": 7310 + }, + { + "epoch": 22.57805255023184, + "grad_norm": 0.6344608664512634, + "learning_rate": 4.5987415013754366e-05, + "loss": 0.2794, + "num_input_tokens_seen": 11177344, + "step": 7315 + }, + { + "epoch": 22.5935085007728, + "grad_norm": 0.31252196431159973, + "learning_rate": 4.598207892280899e-05, + "loss": 0.325, + "num_input_tokens_seen": 11184960, + "step": 7320 + }, + { + "epoch": 22.608964451313756, + "grad_norm": 1.139236330986023, + "learning_rate": 4.597673959616337e-05, + "loss": 0.3643, + "num_input_tokens_seen": 11192160, + "step": 7325 + }, + { + "epoch": 22.624420401854714, + "grad_norm": 1.0849857330322266, + "learning_rate": 4.597139703464089e-05, + "loss": 0.3901, + "num_input_tokens_seen": 11200064, + "step": 7330 + }, + { + "epoch": 22.63987635239567, + "grad_norm": 0.33517372608184814, + "learning_rate": 4.596605123906545e-05, + "loss": 0.3109, + "num_input_tokens_seen": 11207680, + "step": 7335 + }, + { + "epoch": 22.655332302936632, + "grad_norm": 0.49710512161254883, + "learning_rate": 4.596070221026143e-05, + "loss": 0.331, + "num_input_tokens_seen": 11215200, + "step": 7340 + }, + { + "epoch": 22.67078825347759, + "grad_norm": 0.5442102551460266, + "learning_rate": 4.595534994905372e-05, + "loss": 0.2359, + "num_input_tokens_seen": 11223104, + "step": 7345 + }, + { + "epoch": 22.686244204018546, + "grad_norm": 0.4656025469303131, + "learning_rate": 4.594999445626771e-05, + "loss": 0.3869, + "num_input_tokens_seen": 11231040, + "step": 7350 + }, + { + "epoch": 22.701700154559504, + "grad_norm": 0.6386780142784119, + "learning_rate": 4.5944635732729276e-05, + "loss": 0.3155, + "num_input_tokens_seen": 11238816, + "step": 7355 + }, + { + "epoch": 22.717156105100464, + "grad_norm": 0.6280299425125122, + "learning_rate": 4.5939273779264804e-05, + "loss": 0.2985, + "num_input_tokens_seen": 11246784, + "step": 7360 + }, + { + "epoch": 22.73261205564142, + "grad_norm": 0.4551738500595093, + "learning_rate": 4.593390859670118e-05, + "loss": 0.2498, + "num_input_tokens_seen": 11254688, + "step": 7365 + }, + { + "epoch": 22.74806800618238, + "grad_norm": 0.8762960433959961, + "learning_rate": 4.5928540185865776e-05, + "loss": 0.3006, + "num_input_tokens_seen": 11262272, + "step": 7370 + }, + { + "epoch": 22.76352395672334, + "grad_norm": 0.8097842335700989, + "learning_rate": 4.592316854758648e-05, + "loss": 0.3166, + "num_input_tokens_seen": 11269952, + "step": 7375 + }, + { + "epoch": 22.778979907264297, + "grad_norm": 0.7054967284202576, + "learning_rate": 4.5917793682691646e-05, + "loss": 0.383, + "num_input_tokens_seen": 11277504, + "step": 7380 + }, + { + "epoch": 22.794435857805254, + "grad_norm": 0.7241359353065491, + "learning_rate": 4.5912415592010164e-05, + "loss": 0.3348, + "num_input_tokens_seen": 11285344, + "step": 7385 + }, + { + "epoch": 22.80989180834621, + "grad_norm": 0.35520538687705994, + "learning_rate": 4.5907034276371386e-05, + "loss": 0.2537, + "num_input_tokens_seen": 11293248, + "step": 7390 + }, + { + "epoch": 22.825347758887172, + "grad_norm": 0.7292298078536987, + "learning_rate": 4.5901649736605196e-05, + "loss": 0.2694, + "num_input_tokens_seen": 11300800, + "step": 7395 + }, + { + "epoch": 22.84080370942813, + "grad_norm": 0.731389582157135, + "learning_rate": 4.589626197354195e-05, + "loss": 0.2537, + "num_input_tokens_seen": 11308384, + "step": 7400 + }, + { + "epoch": 22.84080370942813, + "eval_loss": 0.36499109864234924, + "eval_runtime": 6.3158, + "eval_samples_per_second": 91.041, + "eval_steps_per_second": 22.8, + "num_input_tokens_seen": 11308384, + "step": 7400 + }, + { + "epoch": 22.856259659969087, + "grad_norm": 0.7141160368919373, + "learning_rate": 4.5890870988012504e-05, + "loss": 0.383, + "num_input_tokens_seen": 11316064, + "step": 7405 + }, + { + "epoch": 22.871715610510048, + "grad_norm": 0.3817422389984131, + "learning_rate": 4.5885476780848226e-05, + "loss": 0.3784, + "num_input_tokens_seen": 11323488, + "step": 7410 + }, + { + "epoch": 22.887171561051005, + "grad_norm": 0.585989773273468, + "learning_rate": 4.5880079352880964e-05, + "loss": 0.3605, + "num_input_tokens_seen": 11330560, + "step": 7415 + }, + { + "epoch": 22.902627511591962, + "grad_norm": 0.4276411533355713, + "learning_rate": 4.5874678704943065e-05, + "loss": 0.3578, + "num_input_tokens_seen": 11338464, + "step": 7420 + }, + { + "epoch": 22.91808346213292, + "grad_norm": 0.7376708388328552, + "learning_rate": 4.5869274837867394e-05, + "loss": 0.3036, + "num_input_tokens_seen": 11346464, + "step": 7425 + }, + { + "epoch": 22.93353941267388, + "grad_norm": 0.5020468831062317, + "learning_rate": 4.5863867752487275e-05, + "loss": 0.4406, + "num_input_tokens_seen": 11354784, + "step": 7430 + }, + { + "epoch": 22.948995363214838, + "grad_norm": 0.5590263605117798, + "learning_rate": 4.5858457449636554e-05, + "loss": 0.2837, + "num_input_tokens_seen": 11362176, + "step": 7435 + }, + { + "epoch": 22.964451313755795, + "grad_norm": 0.8856094479560852, + "learning_rate": 4.5853043930149574e-05, + "loss": 0.3532, + "num_input_tokens_seen": 11369536, + "step": 7440 + }, + { + "epoch": 22.979907264296756, + "grad_norm": 0.5163851976394653, + "learning_rate": 4.584762719486117e-05, + "loss": 0.2546, + "num_input_tokens_seen": 11377344, + "step": 7445 + }, + { + "epoch": 22.995363214837713, + "grad_norm": 0.4214087426662445, + "learning_rate": 4.584220724460665e-05, + "loss": 0.2228, + "num_input_tokens_seen": 11385376, + "step": 7450 + }, + { + "epoch": 23.009273570324574, + "grad_norm": 0.6245213150978088, + "learning_rate": 4.5836784080221865e-05, + "loss": 0.2769, + "num_input_tokens_seen": 11392192, + "step": 7455 + }, + { + "epoch": 23.024729520865534, + "grad_norm": 0.39155763387680054, + "learning_rate": 4.583135770254312e-05, + "loss": 0.2235, + "num_input_tokens_seen": 11399648, + "step": 7460 + }, + { + "epoch": 23.04018547140649, + "grad_norm": 0.7078691720962524, + "learning_rate": 4.5825928112407236e-05, + "loss": 0.2513, + "num_input_tokens_seen": 11407552, + "step": 7465 + }, + { + "epoch": 23.05564142194745, + "grad_norm": 0.749342143535614, + "learning_rate": 4.582049531065152e-05, + "loss": 0.3121, + "num_input_tokens_seen": 11415360, + "step": 7470 + }, + { + "epoch": 23.071097372488406, + "grad_norm": 0.4986439347267151, + "learning_rate": 4.5815059298113783e-05, + "loss": 0.2759, + "num_input_tokens_seen": 11423264, + "step": 7475 + }, + { + "epoch": 23.086553323029367, + "grad_norm": 0.45460888743400574, + "learning_rate": 4.580962007563232e-05, + "loss": 0.2617, + "num_input_tokens_seen": 11430784, + "step": 7480 + }, + { + "epoch": 23.102009273570324, + "grad_norm": 0.6938380599021912, + "learning_rate": 4.5804177644045935e-05, + "loss": 0.4279, + "num_input_tokens_seen": 11439136, + "step": 7485 + }, + { + "epoch": 23.11746522411128, + "grad_norm": 0.4370904564857483, + "learning_rate": 4.579873200419391e-05, + "loss": 0.3984, + "num_input_tokens_seen": 11445728, + "step": 7490 + }, + { + "epoch": 23.132921174652243, + "grad_norm": 0.5594463348388672, + "learning_rate": 4.5793283156916046e-05, + "loss": 0.3477, + "num_input_tokens_seen": 11453280, + "step": 7495 + }, + { + "epoch": 23.1483771251932, + "grad_norm": 1.0741807222366333, + "learning_rate": 4.578783110305261e-05, + "loss": 0.3235, + "num_input_tokens_seen": 11461248, + "step": 7500 + }, + { + "epoch": 23.163833075734157, + "grad_norm": 0.45252060890197754, + "learning_rate": 4.578237584344438e-05, + "loss": 0.2742, + "num_input_tokens_seen": 11468704, + "step": 7505 + }, + { + "epoch": 23.179289026275114, + "grad_norm": 0.3606509268283844, + "learning_rate": 4.577691737893263e-05, + "loss": 0.2815, + "num_input_tokens_seen": 11476864, + "step": 7510 + }, + { + "epoch": 23.194744976816075, + "grad_norm": 0.682913601398468, + "learning_rate": 4.577145571035912e-05, + "loss": 0.3609, + "num_input_tokens_seen": 11484480, + "step": 7515 + }, + { + "epoch": 23.210200927357032, + "grad_norm": 0.597296953201294, + "learning_rate": 4.576599083856611e-05, + "loss": 0.2694, + "num_input_tokens_seen": 11492672, + "step": 7520 + }, + { + "epoch": 23.22565687789799, + "grad_norm": 0.5913915634155273, + "learning_rate": 4.576052276439635e-05, + "loss": 0.2912, + "num_input_tokens_seen": 11499968, + "step": 7525 + }, + { + "epoch": 23.24111282843895, + "grad_norm": 0.756353497505188, + "learning_rate": 4.575505148869308e-05, + "loss": 0.2794, + "num_input_tokens_seen": 11508160, + "step": 7530 + }, + { + "epoch": 23.256568778979908, + "grad_norm": 0.4839560389518738, + "learning_rate": 4.574957701230006e-05, + "loss": 0.3205, + "num_input_tokens_seen": 11515328, + "step": 7535 + }, + { + "epoch": 23.272024729520865, + "grad_norm": 0.8603294491767883, + "learning_rate": 4.57440993360615e-05, + "loss": 0.3206, + "num_input_tokens_seen": 11522368, + "step": 7540 + }, + { + "epoch": 23.287480680061822, + "grad_norm": 0.5616886019706726, + "learning_rate": 4.5738618460822134e-05, + "loss": 0.2705, + "num_input_tokens_seen": 11530176, + "step": 7545 + }, + { + "epoch": 23.302936630602783, + "grad_norm": 0.41020452976226807, + "learning_rate": 4.573313438742719e-05, + "loss": 0.266, + "num_input_tokens_seen": 11537568, + "step": 7550 + }, + { + "epoch": 23.31839258114374, + "grad_norm": 0.4058946669101715, + "learning_rate": 4.5727647116722374e-05, + "loss": 0.2709, + "num_input_tokens_seen": 11545088, + "step": 7555 + }, + { + "epoch": 23.333848531684698, + "grad_norm": 0.9411524534225464, + "learning_rate": 4.5722156649553884e-05, + "loss": 0.3119, + "num_input_tokens_seen": 11552416, + "step": 7560 + }, + { + "epoch": 23.34930448222566, + "grad_norm": 0.4377877116203308, + "learning_rate": 4.571666298676843e-05, + "loss": 0.2741, + "num_input_tokens_seen": 11560160, + "step": 7565 + }, + { + "epoch": 23.364760432766616, + "grad_norm": 0.2839808762073517, + "learning_rate": 4.571116612921321e-05, + "loss": 0.2756, + "num_input_tokens_seen": 11567520, + "step": 7570 + }, + { + "epoch": 23.380216383307573, + "grad_norm": 0.8294721245765686, + "learning_rate": 4.57056660777359e-05, + "loss": 0.3354, + "num_input_tokens_seen": 11574976, + "step": 7575 + }, + { + "epoch": 23.39567233384853, + "grad_norm": 0.45717570185661316, + "learning_rate": 4.5700162833184666e-05, + "loss": 0.4136, + "num_input_tokens_seen": 11582752, + "step": 7580 + }, + { + "epoch": 23.41112828438949, + "grad_norm": 0.5063892006874084, + "learning_rate": 4.5694656396408195e-05, + "loss": 0.3508, + "num_input_tokens_seen": 11591104, + "step": 7585 + }, + { + "epoch": 23.42658423493045, + "grad_norm": 0.8479402661323547, + "learning_rate": 4.5689146768255646e-05, + "loss": 0.4067, + "num_input_tokens_seen": 11598944, + "step": 7590 + }, + { + "epoch": 23.442040185471406, + "grad_norm": 0.6617372632026672, + "learning_rate": 4.568363394957667e-05, + "loss": 0.3698, + "num_input_tokens_seen": 11606208, + "step": 7595 + }, + { + "epoch": 23.457496136012363, + "grad_norm": 0.5091160535812378, + "learning_rate": 4.567811794122141e-05, + "loss": 0.2978, + "num_input_tokens_seen": 11614048, + "step": 7600 + }, + { + "epoch": 23.457496136012363, + "eval_loss": 0.35944679379463196, + "eval_runtime": 6.2727, + "eval_samples_per_second": 91.666, + "eval_steps_per_second": 22.956, + "num_input_tokens_seen": 11614048, + "step": 7600 + }, + { + "epoch": 23.472952086553324, + "grad_norm": 0.5733751058578491, + "learning_rate": 4.56725987440405e-05, + "loss": 0.2944, + "num_input_tokens_seen": 11621888, + "step": 7605 + }, + { + "epoch": 23.48840803709428, + "grad_norm": 0.5269988775253296, + "learning_rate": 4.566707635888508e-05, + "loss": 0.3281, + "num_input_tokens_seen": 11629344, + "step": 7610 + }, + { + "epoch": 23.50386398763524, + "grad_norm": 0.39978811144828796, + "learning_rate": 4.566155078660677e-05, + "loss": 0.3515, + "num_input_tokens_seen": 11636864, + "step": 7615 + }, + { + "epoch": 23.5193199381762, + "grad_norm": 0.3153170943260193, + "learning_rate": 4.565602202805768e-05, + "loss": 0.3782, + "num_input_tokens_seen": 11644544, + "step": 7620 + }, + { + "epoch": 23.534775888717157, + "grad_norm": 0.6286723613739014, + "learning_rate": 4.56504900840904e-05, + "loss": 0.2636, + "num_input_tokens_seen": 11652096, + "step": 7625 + }, + { + "epoch": 23.550231839258114, + "grad_norm": 0.5083929300308228, + "learning_rate": 4.564495495555805e-05, + "loss": 0.2764, + "num_input_tokens_seen": 11660160, + "step": 7630 + }, + { + "epoch": 23.56568778979907, + "grad_norm": 0.6919136047363281, + "learning_rate": 4.5639416643314204e-05, + "loss": 0.2591, + "num_input_tokens_seen": 11667552, + "step": 7635 + }, + { + "epoch": 23.581143740340032, + "grad_norm": 0.3924018442630768, + "learning_rate": 4.5633875148212946e-05, + "loss": 0.2636, + "num_input_tokens_seen": 11675264, + "step": 7640 + }, + { + "epoch": 23.59659969088099, + "grad_norm": 1.0506514310836792, + "learning_rate": 4.562833047110883e-05, + "loss": 0.3504, + "num_input_tokens_seen": 11682496, + "step": 7645 + }, + { + "epoch": 23.612055641421946, + "grad_norm": 1.0131430625915527, + "learning_rate": 4.5622782612856923e-05, + "loss": 0.3345, + "num_input_tokens_seen": 11689536, + "step": 7650 + }, + { + "epoch": 23.627511591962907, + "grad_norm": 0.47396737337112427, + "learning_rate": 4.561723157431278e-05, + "loss": 0.3213, + "num_input_tokens_seen": 11696896, + "step": 7655 + }, + { + "epoch": 23.642967542503865, + "grad_norm": 0.7663398385047913, + "learning_rate": 4.5611677356332435e-05, + "loss": 0.3014, + "num_input_tokens_seen": 11705024, + "step": 7660 + }, + { + "epoch": 23.658423493044822, + "grad_norm": 0.4235014021396637, + "learning_rate": 4.560611995977242e-05, + "loss": 0.2907, + "num_input_tokens_seen": 11712608, + "step": 7665 + }, + { + "epoch": 23.67387944358578, + "grad_norm": 0.42633527517318726, + "learning_rate": 4.560055938548975e-05, + "loss": 0.2494, + "num_input_tokens_seen": 11720352, + "step": 7670 + }, + { + "epoch": 23.68933539412674, + "grad_norm": 0.3617478609085083, + "learning_rate": 4.5594995634341944e-05, + "loss": 0.2569, + "num_input_tokens_seen": 11727488, + "step": 7675 + }, + { + "epoch": 23.704791344667697, + "grad_norm": 0.42472437024116516, + "learning_rate": 4.5589428707187e-05, + "loss": 0.2269, + "num_input_tokens_seen": 11735104, + "step": 7680 + }, + { + "epoch": 23.720247295208654, + "grad_norm": 0.49009785056114197, + "learning_rate": 4.55838586048834e-05, + "loss": 0.2961, + "num_input_tokens_seen": 11743008, + "step": 7685 + }, + { + "epoch": 23.735703245749615, + "grad_norm": 1.190268635749817, + "learning_rate": 4.557828532829013e-05, + "loss": 0.3452, + "num_input_tokens_seen": 11750528, + "step": 7690 + }, + { + "epoch": 23.751159196290573, + "grad_norm": 0.6664317846298218, + "learning_rate": 4.557270887826667e-05, + "loss": 0.3238, + "num_input_tokens_seen": 11758432, + "step": 7695 + }, + { + "epoch": 23.76661514683153, + "grad_norm": 0.6927140355110168, + "learning_rate": 4.556712925567296e-05, + "loss": 0.3421, + "num_input_tokens_seen": 11765376, + "step": 7700 + }, + { + "epoch": 23.782071097372487, + "grad_norm": 0.5214433073997498, + "learning_rate": 4.5561546461369454e-05, + "loss": 0.2356, + "num_input_tokens_seen": 11772800, + "step": 7705 + }, + { + "epoch": 23.797527047913448, + "grad_norm": 0.6226105093955994, + "learning_rate": 4.55559604962171e-05, + "loss": 0.3122, + "num_input_tokens_seen": 11780512, + "step": 7710 + }, + { + "epoch": 23.812982998454405, + "grad_norm": 0.7088974714279175, + "learning_rate": 4.55503713610773e-05, + "loss": 0.2766, + "num_input_tokens_seen": 11787936, + "step": 7715 + }, + { + "epoch": 23.828438948995363, + "grad_norm": 0.9187988638877869, + "learning_rate": 4.5544779056812e-05, + "loss": 0.3387, + "num_input_tokens_seen": 11796128, + "step": 7720 + }, + { + "epoch": 23.84389489953632, + "grad_norm": 0.6568143963813782, + "learning_rate": 4.553918358428358e-05, + "loss": 0.3006, + "num_input_tokens_seen": 11804000, + "step": 7725 + }, + { + "epoch": 23.85935085007728, + "grad_norm": 0.5434647798538208, + "learning_rate": 4.553358494435494e-05, + "loss": 0.3592, + "num_input_tokens_seen": 11811776, + "step": 7730 + }, + { + "epoch": 23.874806800618238, + "grad_norm": 0.7357754111289978, + "learning_rate": 4.5527983137889464e-05, + "loss": 0.3273, + "num_input_tokens_seen": 11819776, + "step": 7735 + }, + { + "epoch": 23.890262751159195, + "grad_norm": 0.6726850867271423, + "learning_rate": 4.5522378165751015e-05, + "loss": 0.3006, + "num_input_tokens_seen": 11827520, + "step": 7740 + }, + { + "epoch": 23.905718701700156, + "grad_norm": 0.4270259141921997, + "learning_rate": 4.5516770028803954e-05, + "loss": 0.2162, + "num_input_tokens_seen": 11834848, + "step": 7745 + }, + { + "epoch": 23.921174652241113, + "grad_norm": 0.34485533833503723, + "learning_rate": 4.5511158727913116e-05, + "loss": 0.2603, + "num_input_tokens_seen": 11841984, + "step": 7750 + }, + { + "epoch": 23.93663060278207, + "grad_norm": 0.44614604115486145, + "learning_rate": 4.5505544263943856e-05, + "loss": 0.304, + "num_input_tokens_seen": 11849696, + "step": 7755 + }, + { + "epoch": 23.952086553323028, + "grad_norm": 0.6060568690299988, + "learning_rate": 4.549992663776197e-05, + "loss": 0.3216, + "num_input_tokens_seen": 11857408, + "step": 7760 + }, + { + "epoch": 23.96754250386399, + "grad_norm": 0.9875277876853943, + "learning_rate": 4.5494305850233786e-05, + "loss": 0.3155, + "num_input_tokens_seen": 11864736, + "step": 7765 + }, + { + "epoch": 23.982998454404946, + "grad_norm": 0.5144898891448975, + "learning_rate": 4.5488681902226094e-05, + "loss": 0.3024, + "num_input_tokens_seen": 11872576, + "step": 7770 + }, + { + "epoch": 23.998454404945903, + "grad_norm": 0.61871337890625, + "learning_rate": 4.5483054794606174e-05, + "loss": 0.2635, + "num_input_tokens_seen": 11880704, + "step": 7775 + }, + { + "epoch": 24.012364760432767, + "grad_norm": 0.3689112067222595, + "learning_rate": 4.547742452824179e-05, + "loss": 0.2539, + "num_input_tokens_seen": 11887152, + "step": 7780 + }, + { + "epoch": 24.027820710973725, + "grad_norm": 0.31149154901504517, + "learning_rate": 4.5471791104001215e-05, + "loss": 0.2873, + "num_input_tokens_seen": 11894576, + "step": 7785 + }, + { + "epoch": 24.043276661514682, + "grad_norm": 0.47497954964637756, + "learning_rate": 4.546615452275319e-05, + "loss": 0.2883, + "num_input_tokens_seen": 11901744, + "step": 7790 + }, + { + "epoch": 24.058732612055643, + "grad_norm": 0.41701218485832214, + "learning_rate": 4.5460514785366944e-05, + "loss": 0.275, + "num_input_tokens_seen": 11909584, + "step": 7795 + }, + { + "epoch": 24.0741885625966, + "grad_norm": 0.35079267621040344, + "learning_rate": 4.545487189271219e-05, + "loss": 0.3065, + "num_input_tokens_seen": 11917328, + "step": 7800 + }, + { + "epoch": 24.0741885625966, + "eval_loss": 0.3578772246837616, + "eval_runtime": 6.2943, + "eval_samples_per_second": 91.353, + "eval_steps_per_second": 22.878, + "num_input_tokens_seen": 11917328, + "step": 7800 + }, + { + "epoch": 24.089644513137557, + "grad_norm": 0.6926100254058838, + "learning_rate": 4.544922584565914e-05, + "loss": 0.2612, + "num_input_tokens_seen": 11924912, + "step": 7805 + }, + { + "epoch": 24.105100463678518, + "grad_norm": 0.31618571281433105, + "learning_rate": 4.544357664507848e-05, + "loss": 0.3068, + "num_input_tokens_seen": 11933168, + "step": 7810 + }, + { + "epoch": 24.120556414219475, + "grad_norm": 0.5200211405754089, + "learning_rate": 4.54379242918414e-05, + "loss": 0.3261, + "num_input_tokens_seen": 11940848, + "step": 7815 + }, + { + "epoch": 24.136012364760433, + "grad_norm": 0.5505560040473938, + "learning_rate": 4.543226878681955e-05, + "loss": 0.3053, + "num_input_tokens_seen": 11948400, + "step": 7820 + }, + { + "epoch": 24.15146831530139, + "grad_norm": 0.5437933206558228, + "learning_rate": 4.5426610130885087e-05, + "loss": 0.2961, + "num_input_tokens_seen": 11956272, + "step": 7825 + }, + { + "epoch": 24.16692426584235, + "grad_norm": 1.0014686584472656, + "learning_rate": 4.542094832491064e-05, + "loss": 0.3229, + "num_input_tokens_seen": 11964368, + "step": 7830 + }, + { + "epoch": 24.182380216383308, + "grad_norm": 0.39137837290763855, + "learning_rate": 4.541528336976934e-05, + "loss": 0.2511, + "num_input_tokens_seen": 11971952, + "step": 7835 + }, + { + "epoch": 24.197836166924265, + "grad_norm": 0.5482484102249146, + "learning_rate": 4.540961526633479e-05, + "loss": 0.3939, + "num_input_tokens_seen": 11979792, + "step": 7840 + }, + { + "epoch": 24.213292117465222, + "grad_norm": 0.4809196889400482, + "learning_rate": 4.540394401548108e-05, + "loss": 0.3208, + "num_input_tokens_seen": 11987856, + "step": 7845 + }, + { + "epoch": 24.228748068006183, + "grad_norm": 0.7456735968589783, + "learning_rate": 4.539826961808279e-05, + "loss": 0.4041, + "num_input_tokens_seen": 11995888, + "step": 7850 + }, + { + "epoch": 24.24420401854714, + "grad_norm": 0.3382224142551422, + "learning_rate": 4.5392592075014994e-05, + "loss": 0.2817, + "num_input_tokens_seen": 12003568, + "step": 7855 + }, + { + "epoch": 24.259659969088098, + "grad_norm": 0.5856668949127197, + "learning_rate": 4.538691138715322e-05, + "loss": 0.3532, + "num_input_tokens_seen": 12011600, + "step": 7860 + }, + { + "epoch": 24.27511591962906, + "grad_norm": 0.6760448217391968, + "learning_rate": 4.5381227555373516e-05, + "loss": 0.3628, + "num_input_tokens_seen": 12019408, + "step": 7865 + }, + { + "epoch": 24.290571870170016, + "grad_norm": 0.5081963539123535, + "learning_rate": 4.537554058055239e-05, + "loss": 0.3557, + "num_input_tokens_seen": 12027632, + "step": 7870 + }, + { + "epoch": 24.306027820710973, + "grad_norm": 0.7112916111946106, + "learning_rate": 4.5369850463566865e-05, + "loss": 0.2742, + "num_input_tokens_seen": 12034960, + "step": 7875 + }, + { + "epoch": 24.32148377125193, + "grad_norm": 0.7055877447128296, + "learning_rate": 4.5364157205294404e-05, + "loss": 0.2706, + "num_input_tokens_seen": 12042704, + "step": 7880 + }, + { + "epoch": 24.33693972179289, + "grad_norm": 0.44809406995773315, + "learning_rate": 4.5358460806612996e-05, + "loss": 0.3398, + "num_input_tokens_seen": 12050704, + "step": 7885 + }, + { + "epoch": 24.35239567233385, + "grad_norm": 0.6093704104423523, + "learning_rate": 4.535276126840109e-05, + "loss": 0.3312, + "num_input_tokens_seen": 12058704, + "step": 7890 + }, + { + "epoch": 24.367851622874806, + "grad_norm": 0.6561365127563477, + "learning_rate": 4.5347058591537626e-05, + "loss": 0.2988, + "num_input_tokens_seen": 12065872, + "step": 7895 + }, + { + "epoch": 24.383307573415767, + "grad_norm": 0.5231336355209351, + "learning_rate": 4.534135277690203e-05, + "loss": 0.2316, + "num_input_tokens_seen": 12073776, + "step": 7900 + }, + { + "epoch": 24.398763523956724, + "grad_norm": 0.3872199058532715, + "learning_rate": 4.533564382537421e-05, + "loss": 0.2823, + "num_input_tokens_seen": 12081424, + "step": 7905 + }, + { + "epoch": 24.41421947449768, + "grad_norm": 0.5598058700561523, + "learning_rate": 4.532993173783456e-05, + "loss": 0.3628, + "num_input_tokens_seen": 12088752, + "step": 7910 + }, + { + "epoch": 24.42967542503864, + "grad_norm": 0.6550191044807434, + "learning_rate": 4.5324216515163954e-05, + "loss": 0.3245, + "num_input_tokens_seen": 12096080, + "step": 7915 + }, + { + "epoch": 24.4451313755796, + "grad_norm": 0.7390193343162537, + "learning_rate": 4.531849815824375e-05, + "loss": 0.3288, + "num_input_tokens_seen": 12104240, + "step": 7920 + }, + { + "epoch": 24.460587326120557, + "grad_norm": 0.5031383633613586, + "learning_rate": 4.5312776667955795e-05, + "loss": 0.2639, + "num_input_tokens_seen": 12111696, + "step": 7925 + }, + { + "epoch": 24.476043276661514, + "grad_norm": 0.4438900649547577, + "learning_rate": 4.5307052045182405e-05, + "loss": 0.2338, + "num_input_tokens_seen": 12118800, + "step": 7930 + }, + { + "epoch": 24.491499227202475, + "grad_norm": 0.6100245714187622, + "learning_rate": 4.53013242908064e-05, + "loss": 0.2943, + "num_input_tokens_seen": 12126544, + "step": 7935 + }, + { + "epoch": 24.506955177743432, + "grad_norm": 0.5968477129936218, + "learning_rate": 4.529559340571107e-05, + "loss": 0.2568, + "num_input_tokens_seen": 12134800, + "step": 7940 + }, + { + "epoch": 24.52241112828439, + "grad_norm": 0.3129194676876068, + "learning_rate": 4.528985939078018e-05, + "loss": 0.2945, + "num_input_tokens_seen": 12142224, + "step": 7945 + }, + { + "epoch": 24.537867078825347, + "grad_norm": 0.4449930787086487, + "learning_rate": 4.5284122246898e-05, + "loss": 0.3008, + "num_input_tokens_seen": 12149776, + "step": 7950 + }, + { + "epoch": 24.553323029366307, + "grad_norm": 0.48216861486434937, + "learning_rate": 4.527838197494926e-05, + "loss": 0.3287, + "num_input_tokens_seen": 12157392, + "step": 7955 + }, + { + "epoch": 24.568778979907265, + "grad_norm": 0.5185562968254089, + "learning_rate": 4.527263857581918e-05, + "loss": 0.3474, + "num_input_tokens_seen": 12165040, + "step": 7960 + }, + { + "epoch": 24.584234930448222, + "grad_norm": 0.40028172731399536, + "learning_rate": 4.526689205039347e-05, + "loss": 0.3717, + "num_input_tokens_seen": 12172624, + "step": 7965 + }, + { + "epoch": 24.59969088098918, + "grad_norm": 0.46868157386779785, + "learning_rate": 4.5261142399558324e-05, + "loss": 0.3367, + "num_input_tokens_seen": 12180112, + "step": 7970 + }, + { + "epoch": 24.61514683153014, + "grad_norm": 0.40096068382263184, + "learning_rate": 4.525538962420041e-05, + "loss": 0.2218, + "num_input_tokens_seen": 12187280, + "step": 7975 + }, + { + "epoch": 24.630602782071097, + "grad_norm": 0.8295575380325317, + "learning_rate": 4.524963372520685e-05, + "loss": 0.2852, + "num_input_tokens_seen": 12194896, + "step": 7980 + }, + { + "epoch": 24.646058732612055, + "grad_norm": 0.42408037185668945, + "learning_rate": 4.524387470346531e-05, + "loss": 0.3719, + "num_input_tokens_seen": 12202352, + "step": 7985 + }, + { + "epoch": 24.661514683153015, + "grad_norm": 0.77329421043396, + "learning_rate": 4.5238112559863885e-05, + "loss": 0.291, + "num_input_tokens_seen": 12209776, + "step": 7990 + }, + { + "epoch": 24.676970633693973, + "grad_norm": 0.521843671798706, + "learning_rate": 4.5232347295291175e-05, + "loss": 0.2656, + "num_input_tokens_seen": 12217168, + "step": 7995 + }, + { + "epoch": 24.69242658423493, + "grad_norm": 0.7951492667198181, + "learning_rate": 4.522657891063626e-05, + "loss": 0.2827, + "num_input_tokens_seen": 12224848, + "step": 8000 + }, + { + "epoch": 24.69242658423493, + "eval_loss": 0.3559044301509857, + "eval_runtime": 6.2739, + "eval_samples_per_second": 91.65, + "eval_steps_per_second": 22.952, + "num_input_tokens_seen": 12224848, + "step": 8000 + }, + { + "epoch": 24.707882534775887, + "grad_norm": 0.4279574453830719, + "learning_rate": 4.52208074067887e-05, + "loss": 0.29, + "num_input_tokens_seen": 12232496, + "step": 8005 + }, + { + "epoch": 24.723338485316848, + "grad_norm": 0.6505544185638428, + "learning_rate": 4.5215032784638516e-05, + "loss": 0.2747, + "num_input_tokens_seen": 12239792, + "step": 8010 + }, + { + "epoch": 24.738794435857805, + "grad_norm": 0.6132794618606567, + "learning_rate": 4.5209255045076245e-05, + "loss": 0.226, + "num_input_tokens_seen": 12247504, + "step": 8015 + }, + { + "epoch": 24.754250386398763, + "grad_norm": 0.4616400897502899, + "learning_rate": 4.5203474188992875e-05, + "loss": 0.2303, + "num_input_tokens_seen": 12255312, + "step": 8020 + }, + { + "epoch": 24.769706336939723, + "grad_norm": 0.7419180870056152, + "learning_rate": 4.51976902172799e-05, + "loss": 0.3023, + "num_input_tokens_seen": 12262512, + "step": 8025 + }, + { + "epoch": 24.78516228748068, + "grad_norm": 0.6766722798347473, + "learning_rate": 4.519190313082927e-05, + "loss": 0.4045, + "num_input_tokens_seen": 12270384, + "step": 8030 + }, + { + "epoch": 24.800618238021638, + "grad_norm": 0.8024869561195374, + "learning_rate": 4.518611293053343e-05, + "loss": 0.2698, + "num_input_tokens_seen": 12278064, + "step": 8035 + }, + { + "epoch": 24.816074188562595, + "grad_norm": 0.9173161387443542, + "learning_rate": 4.51803196172853e-05, + "loss": 0.2848, + "num_input_tokens_seen": 12285712, + "step": 8040 + }, + { + "epoch": 24.831530139103556, + "grad_norm": 0.6292343139648438, + "learning_rate": 4.517452319197828e-05, + "loss": 0.3558, + "num_input_tokens_seen": 12293520, + "step": 8045 + }, + { + "epoch": 24.846986089644513, + "grad_norm": 0.9015294909477234, + "learning_rate": 4.5168723655506265e-05, + "loss": 0.2711, + "num_input_tokens_seen": 12301680, + "step": 8050 + }, + { + "epoch": 24.86244204018547, + "grad_norm": 0.6037262082099915, + "learning_rate": 4.51629210087636e-05, + "loss": 0.2706, + "num_input_tokens_seen": 12309744, + "step": 8055 + }, + { + "epoch": 24.87789799072643, + "grad_norm": 0.29164162278175354, + "learning_rate": 4.515711525264513e-05, + "loss": 0.3162, + "num_input_tokens_seen": 12316784, + "step": 8060 + }, + { + "epoch": 24.89335394126739, + "grad_norm": 0.29740461707115173, + "learning_rate": 4.5151306388046175e-05, + "loss": 0.2793, + "num_input_tokens_seen": 12324560, + "step": 8065 + }, + { + "epoch": 24.908809891808346, + "grad_norm": 0.6339672207832336, + "learning_rate": 4.514549441586255e-05, + "loss": 0.378, + "num_input_tokens_seen": 12332752, + "step": 8070 + }, + { + "epoch": 24.924265842349303, + "grad_norm": 0.5507486462593079, + "learning_rate": 4.513967933699051e-05, + "loss": 0.2777, + "num_input_tokens_seen": 12340016, + "step": 8075 + }, + { + "epoch": 24.939721792890264, + "grad_norm": 0.7650327086448669, + "learning_rate": 4.513386115232684e-05, + "loss": 0.2785, + "num_input_tokens_seen": 12347760, + "step": 8080 + }, + { + "epoch": 24.95517774343122, + "grad_norm": 0.6151114106178284, + "learning_rate": 4.5128039862768745e-05, + "loss": 0.3117, + "num_input_tokens_seen": 12355760, + "step": 8085 + }, + { + "epoch": 24.97063369397218, + "grad_norm": 0.7935745716094971, + "learning_rate": 4.512221546921397e-05, + "loss": 0.2377, + "num_input_tokens_seen": 12363344, + "step": 8090 + }, + { + "epoch": 24.986089644513136, + "grad_norm": 0.40688714385032654, + "learning_rate": 4.5116387972560694e-05, + "loss": 0.2884, + "num_input_tokens_seen": 12370992, + "step": 8095 + }, + { + "epoch": 25.0, + "grad_norm": 0.5236217379570007, + "learning_rate": 4.511055737370759e-05, + "loss": 0.2133, + "num_input_tokens_seen": 12377232, + "step": 8100 + }, + { + "epoch": 25.015455950540957, + "grad_norm": 0.5379239916801453, + "learning_rate": 4.510472367355383e-05, + "loss": 0.3296, + "num_input_tokens_seen": 12384528, + "step": 8105 + }, + { + "epoch": 25.030911901081918, + "grad_norm": 0.5690016746520996, + "learning_rate": 4.509888687299901e-05, + "loss": 0.2799, + "num_input_tokens_seen": 12391792, + "step": 8110 + }, + { + "epoch": 25.046367851622875, + "grad_norm": 0.465707391500473, + "learning_rate": 4.5093046972943266e-05, + "loss": 0.2428, + "num_input_tokens_seen": 12399152, + "step": 8115 + }, + { + "epoch": 25.061823802163833, + "grad_norm": 0.5652322173118591, + "learning_rate": 4.508720397428717e-05, + "loss": 0.2659, + "num_input_tokens_seen": 12407024, + "step": 8120 + }, + { + "epoch": 25.07727975270479, + "grad_norm": 0.6444937586784363, + "learning_rate": 4.508135787793178e-05, + "loss": 0.3004, + "num_input_tokens_seen": 12414640, + "step": 8125 + }, + { + "epoch": 25.09273570324575, + "grad_norm": 0.49726197123527527, + "learning_rate": 4.5075508684778664e-05, + "loss": 0.3104, + "num_input_tokens_seen": 12422448, + "step": 8130 + }, + { + "epoch": 25.108191653786708, + "grad_norm": 1.259697675704956, + "learning_rate": 4.506965639572982e-05, + "loss": 0.296, + "num_input_tokens_seen": 12430224, + "step": 8135 + }, + { + "epoch": 25.123647604327665, + "grad_norm": 0.5410287380218506, + "learning_rate": 4.506380101168774e-05, + "loss": 0.2472, + "num_input_tokens_seen": 12437520, + "step": 8140 + }, + { + "epoch": 25.139103554868626, + "grad_norm": 0.8109152317047119, + "learning_rate": 4.505794253355542e-05, + "loss": 0.3566, + "num_input_tokens_seen": 12445168, + "step": 8145 + }, + { + "epoch": 25.154559505409583, + "grad_norm": 0.8809698224067688, + "learning_rate": 4.5052080962236286e-05, + "loss": 0.2463, + "num_input_tokens_seen": 12452912, + "step": 8150 + }, + { + "epoch": 25.17001545595054, + "grad_norm": 0.6108117699623108, + "learning_rate": 4.504621629863428e-05, + "loss": 0.2949, + "num_input_tokens_seen": 12460272, + "step": 8155 + }, + { + "epoch": 25.185471406491498, + "grad_norm": 0.43693220615386963, + "learning_rate": 4.504034854365381e-05, + "loss": 0.3023, + "num_input_tokens_seen": 12467632, + "step": 8160 + }, + { + "epoch": 25.20092735703246, + "grad_norm": 0.56116783618927, + "learning_rate": 4.503447769819974e-05, + "loss": 0.2914, + "num_input_tokens_seen": 12475184, + "step": 8165 + }, + { + "epoch": 25.216383307573416, + "grad_norm": 0.5607390403747559, + "learning_rate": 4.502860376317745e-05, + "loss": 0.2611, + "num_input_tokens_seen": 12483024, + "step": 8170 + }, + { + "epoch": 25.231839258114373, + "grad_norm": 0.7514896988868713, + "learning_rate": 4.502272673949276e-05, + "loss": 0.2271, + "num_input_tokens_seen": 12490544, + "step": 8175 + }, + { + "epoch": 25.24729520865533, + "grad_norm": 0.5228771567344666, + "learning_rate": 4.501684662805199e-05, + "loss": 0.3331, + "num_input_tokens_seen": 12497936, + "step": 8180 + }, + { + "epoch": 25.26275115919629, + "grad_norm": 0.5124324560165405, + "learning_rate": 4.5010963429761924e-05, + "loss": 0.3167, + "num_input_tokens_seen": 12506448, + "step": 8185 + }, + { + "epoch": 25.27820710973725, + "grad_norm": 0.5526285171508789, + "learning_rate": 4.500507714552982e-05, + "loss": 0.3123, + "num_input_tokens_seen": 12514512, + "step": 8190 + }, + { + "epoch": 25.293663060278206, + "grad_norm": 0.5478415489196777, + "learning_rate": 4.499918777626342e-05, + "loss": 0.3239, + "num_input_tokens_seen": 12522224, + "step": 8195 + }, + { + "epoch": 25.309119010819167, + "grad_norm": 0.4241132438182831, + "learning_rate": 4.499329532287093e-05, + "loss": 0.258, + "num_input_tokens_seen": 12530128, + "step": 8200 + }, + { + "epoch": 25.309119010819167, + "eval_loss": 0.3544504940509796, + "eval_runtime": 6.3156, + "eval_samples_per_second": 91.045, + "eval_steps_per_second": 22.801, + "num_input_tokens_seen": 12530128, + "step": 8200 + }, + { + "epoch": 25.324574961360124, + "grad_norm": 0.3870442807674408, + "learning_rate": 4.4987399786261064e-05, + "loss": 0.2697, + "num_input_tokens_seen": 12537936, + "step": 8205 + }, + { + "epoch": 25.34003091190108, + "grad_norm": 0.49346089363098145, + "learning_rate": 4.498150116734297e-05, + "loss": 0.2481, + "num_input_tokens_seen": 12545616, + "step": 8210 + }, + { + "epoch": 25.35548686244204, + "grad_norm": 0.3878783881664276, + "learning_rate": 4.4975599467026294e-05, + "loss": 0.3808, + "num_input_tokens_seen": 12553680, + "step": 8215 + }, + { + "epoch": 25.370942812983, + "grad_norm": 0.7823365926742554, + "learning_rate": 4.496969468622114e-05, + "loss": 0.2839, + "num_input_tokens_seen": 12561168, + "step": 8220 + }, + { + "epoch": 25.386398763523957, + "grad_norm": 0.41787639260292053, + "learning_rate": 4.496378682583813e-05, + "loss": 0.2789, + "num_input_tokens_seen": 12568848, + "step": 8225 + }, + { + "epoch": 25.401854714064914, + "grad_norm": 0.5897759795188904, + "learning_rate": 4.495787588678829e-05, + "loss": 0.3524, + "num_input_tokens_seen": 12576016, + "step": 8230 + }, + { + "epoch": 25.417310664605875, + "grad_norm": 0.5262805223464966, + "learning_rate": 4.4951961869983196e-05, + "loss": 0.2558, + "num_input_tokens_seen": 12583600, + "step": 8235 + }, + { + "epoch": 25.432766615146832, + "grad_norm": 0.5645907521247864, + "learning_rate": 4.494604477633485e-05, + "loss": 0.2446, + "num_input_tokens_seen": 12591792, + "step": 8240 + }, + { + "epoch": 25.44822256568779, + "grad_norm": 0.8096599578857422, + "learning_rate": 4.4940124606755734e-05, + "loss": 0.3809, + "num_input_tokens_seen": 12599952, + "step": 8245 + }, + { + "epoch": 25.463678516228747, + "grad_norm": 0.506732702255249, + "learning_rate": 4.493420136215882e-05, + "loss": 0.2536, + "num_input_tokens_seen": 12607856, + "step": 8250 + }, + { + "epoch": 25.479134466769708, + "grad_norm": 0.3246931731700897, + "learning_rate": 4.492827504345756e-05, + "loss": 0.3195, + "num_input_tokens_seen": 12616016, + "step": 8255 + }, + { + "epoch": 25.494590417310665, + "grad_norm": 0.41973263025283813, + "learning_rate": 4.492234565156584e-05, + "loss": 0.2448, + "num_input_tokens_seen": 12624304, + "step": 8260 + }, + { + "epoch": 25.510046367851622, + "grad_norm": 0.8659217357635498, + "learning_rate": 4.491641318739807e-05, + "loss": 0.3196, + "num_input_tokens_seen": 12631632, + "step": 8265 + }, + { + "epoch": 25.525502318392583, + "grad_norm": 0.5655413269996643, + "learning_rate": 4.4910477651869096e-05, + "loss": 0.3469, + "num_input_tokens_seen": 12639504, + "step": 8270 + }, + { + "epoch": 25.54095826893354, + "grad_norm": 0.5182358622550964, + "learning_rate": 4.4904539045894254e-05, + "loss": 0.2462, + "num_input_tokens_seen": 12646896, + "step": 8275 + }, + { + "epoch": 25.556414219474497, + "grad_norm": 0.45341333746910095, + "learning_rate": 4.4898597370389364e-05, + "loss": 0.3563, + "num_input_tokens_seen": 12655248, + "step": 8280 + }, + { + "epoch": 25.571870170015455, + "grad_norm": 0.6001011729240417, + "learning_rate": 4.489265262627069e-05, + "loss": 0.2528, + "num_input_tokens_seen": 12662480, + "step": 8285 + }, + { + "epoch": 25.587326120556416, + "grad_norm": 0.8142700791358948, + "learning_rate": 4.488670481445499e-05, + "loss": 0.3109, + "num_input_tokens_seen": 12669936, + "step": 8290 + }, + { + "epoch": 25.602782071097373, + "grad_norm": 0.6598548293113708, + "learning_rate": 4.488075393585951e-05, + "loss": 0.3852, + "num_input_tokens_seen": 12677712, + "step": 8295 + }, + { + "epoch": 25.61823802163833, + "grad_norm": 0.6506301164627075, + "learning_rate": 4.487479999140193e-05, + "loss": 0.2846, + "num_input_tokens_seen": 12685264, + "step": 8300 + }, + { + "epoch": 25.633693972179287, + "grad_norm": 0.4133561849594116, + "learning_rate": 4.4868842982000425e-05, + "loss": 0.2395, + "num_input_tokens_seen": 12692656, + "step": 8305 + }, + { + "epoch": 25.649149922720248, + "grad_norm": 0.3655999004840851, + "learning_rate": 4.486288290857365e-05, + "loss": 0.2519, + "num_input_tokens_seen": 12700656, + "step": 8310 + }, + { + "epoch": 25.664605873261205, + "grad_norm": 0.42927414178848267, + "learning_rate": 4.4856919772040715e-05, + "loss": 0.2261, + "num_input_tokens_seen": 12708592, + "step": 8315 + }, + { + "epoch": 25.680061823802163, + "grad_norm": 0.7058945298194885, + "learning_rate": 4.485095357332122e-05, + "loss": 0.3699, + "num_input_tokens_seen": 12716272, + "step": 8320 + }, + { + "epoch": 25.695517774343124, + "grad_norm": 0.4000960886478424, + "learning_rate": 4.484498431333521e-05, + "loss": 0.3061, + "num_input_tokens_seen": 12724112, + "step": 8325 + }, + { + "epoch": 25.71097372488408, + "grad_norm": 0.7195389270782471, + "learning_rate": 4.4839011993003245e-05, + "loss": 0.2511, + "num_input_tokens_seen": 12731984, + "step": 8330 + }, + { + "epoch": 25.726429675425038, + "grad_norm": 0.5469504594802856, + "learning_rate": 4.4833036613246305e-05, + "loss": 0.2967, + "num_input_tokens_seen": 12739824, + "step": 8335 + }, + { + "epoch": 25.741885625965995, + "grad_norm": 0.8977856040000916, + "learning_rate": 4.482705817498589e-05, + "loss": 0.2669, + "num_input_tokens_seen": 12747344, + "step": 8340 + }, + { + "epoch": 25.757341576506956, + "grad_norm": 0.6187527775764465, + "learning_rate": 4.4821076679143934e-05, + "loss": 0.3275, + "num_input_tokens_seen": 12755024, + "step": 8345 + }, + { + "epoch": 25.772797527047913, + "grad_norm": 1.2011798620224, + "learning_rate": 4.481509212664288e-05, + "loss": 0.3806, + "num_input_tokens_seen": 12762128, + "step": 8350 + }, + { + "epoch": 25.78825347758887, + "grad_norm": 0.76594078540802, + "learning_rate": 4.480910451840559e-05, + "loss": 0.4012, + "num_input_tokens_seen": 12769488, + "step": 8355 + }, + { + "epoch": 25.80370942812983, + "grad_norm": 0.7856513857841492, + "learning_rate": 4.480311385535546e-05, + "loss": 0.2739, + "num_input_tokens_seen": 12777168, + "step": 8360 + }, + { + "epoch": 25.81916537867079, + "grad_norm": 0.6052625775337219, + "learning_rate": 4.47971201384163e-05, + "loss": 0.3051, + "num_input_tokens_seen": 12784368, + "step": 8365 + }, + { + "epoch": 25.834621329211746, + "grad_norm": 0.28710609674453735, + "learning_rate": 4.4791123368512446e-05, + "loss": 0.3208, + "num_input_tokens_seen": 12791728, + "step": 8370 + }, + { + "epoch": 25.850077279752703, + "grad_norm": 0.5931981801986694, + "learning_rate": 4.478512354656864e-05, + "loss": 0.334, + "num_input_tokens_seen": 12800208, + "step": 8375 + }, + { + "epoch": 25.865533230293664, + "grad_norm": 0.47175779938697815, + "learning_rate": 4.477912067351016e-05, + "loss": 0.2109, + "num_input_tokens_seen": 12807632, + "step": 8380 + }, + { + "epoch": 25.88098918083462, + "grad_norm": 0.6446034908294678, + "learning_rate": 4.477311475026271e-05, + "loss": 0.3993, + "num_input_tokens_seen": 12815664, + "step": 8385 + }, + { + "epoch": 25.89644513137558, + "grad_norm": 0.4591789245605469, + "learning_rate": 4.476710577775248e-05, + "loss": 0.3408, + "num_input_tokens_seen": 12822960, + "step": 8390 + }, + { + "epoch": 25.91190108191654, + "grad_norm": 0.8173422813415527, + "learning_rate": 4.476109375690612e-05, + "loss": 0.3079, + "num_input_tokens_seen": 12830032, + "step": 8395 + }, + { + "epoch": 25.927357032457497, + "grad_norm": 0.5890312194824219, + "learning_rate": 4.4755078688650784e-05, + "loss": 0.2731, + "num_input_tokens_seen": 12838032, + "step": 8400 + }, + { + "epoch": 25.927357032457497, + "eval_loss": 0.3515976369380951, + "eval_runtime": 6.3124, + "eval_samples_per_second": 91.091, + "eval_steps_per_second": 22.812, + "num_input_tokens_seen": 12838032, + "step": 8400 + }, + { + "epoch": 25.942812982998454, + "grad_norm": 0.45570802688598633, + "learning_rate": 4.474906057391406e-05, + "loss": 0.2322, + "num_input_tokens_seen": 12845680, + "step": 8405 + }, + { + "epoch": 25.95826893353941, + "grad_norm": 0.9430697560310364, + "learning_rate": 4.4743039413624e-05, + "loss": 0.2968, + "num_input_tokens_seen": 12852848, + "step": 8410 + }, + { + "epoch": 25.973724884080372, + "grad_norm": 0.5678683519363403, + "learning_rate": 4.473701520870916e-05, + "loss": 0.3744, + "num_input_tokens_seen": 12860560, + "step": 8415 + }, + { + "epoch": 25.98918083462133, + "grad_norm": 0.6295216679573059, + "learning_rate": 4.4730987960098544e-05, + "loss": 0.2606, + "num_input_tokens_seen": 12868176, + "step": 8420 + }, + { + "epoch": 26.00309119010819, + "grad_norm": 0.4064059853553772, + "learning_rate": 4.4724957668721635e-05, + "loss": 0.2849, + "num_input_tokens_seen": 12874512, + "step": 8425 + }, + { + "epoch": 26.01854714064915, + "grad_norm": 0.6860201954841614, + "learning_rate": 4.471892433550836e-05, + "loss": 0.3001, + "num_input_tokens_seen": 12882000, + "step": 8430 + }, + { + "epoch": 26.034003091190108, + "grad_norm": 0.429575651884079, + "learning_rate": 4.471288796138916e-05, + "loss": 0.2536, + "num_input_tokens_seen": 12890000, + "step": 8435 + }, + { + "epoch": 26.049459041731065, + "grad_norm": 0.5366010069847107, + "learning_rate": 4.470684854729491e-05, + "loss": 0.286, + "num_input_tokens_seen": 12897200, + "step": 8440 + }, + { + "epoch": 26.064914992272026, + "grad_norm": 0.4450090229511261, + "learning_rate": 4.4700806094156955e-05, + "loss": 0.3597, + "num_input_tokens_seen": 12905232, + "step": 8445 + }, + { + "epoch": 26.080370942812984, + "grad_norm": 0.6690120100975037, + "learning_rate": 4.469476060290713e-05, + "loss": 0.2964, + "num_input_tokens_seen": 12912912, + "step": 8450 + }, + { + "epoch": 26.09582689335394, + "grad_norm": 0.31342238187789917, + "learning_rate": 4.468871207447772e-05, + "loss": 0.2867, + "num_input_tokens_seen": 12920688, + "step": 8455 + }, + { + "epoch": 26.111282843894898, + "grad_norm": 0.42582476139068604, + "learning_rate": 4.4682660509801486e-05, + "loss": 0.3029, + "num_input_tokens_seen": 12928144, + "step": 8460 + }, + { + "epoch": 26.12673879443586, + "grad_norm": 0.44315823912620544, + "learning_rate": 4.467660590981165e-05, + "loss": 0.2359, + "num_input_tokens_seen": 12936560, + "step": 8465 + }, + { + "epoch": 26.142194744976816, + "grad_norm": 0.2803557813167572, + "learning_rate": 4.467054827544191e-05, + "loss": 0.3111, + "num_input_tokens_seen": 12943888, + "step": 8470 + }, + { + "epoch": 26.157650695517773, + "grad_norm": 0.7522674202919006, + "learning_rate": 4.4664487607626434e-05, + "loss": 0.3201, + "num_input_tokens_seen": 12951312, + "step": 8475 + }, + { + "epoch": 26.173106646058734, + "grad_norm": 0.8308438658714294, + "learning_rate": 4.4658423907299845e-05, + "loss": 0.3318, + "num_input_tokens_seen": 12959312, + "step": 8480 + }, + { + "epoch": 26.18856259659969, + "grad_norm": 0.7239246964454651, + "learning_rate": 4.465235717539725e-05, + "loss": 0.3262, + "num_input_tokens_seen": 12966736, + "step": 8485 + }, + { + "epoch": 26.20401854714065, + "grad_norm": 0.6353563666343689, + "learning_rate": 4.464628741285421e-05, + "loss": 0.3383, + "num_input_tokens_seen": 12975024, + "step": 8490 + }, + { + "epoch": 26.219474497681606, + "grad_norm": 0.4448980689048767, + "learning_rate": 4.4640214620606754e-05, + "loss": 0.2365, + "num_input_tokens_seen": 12982192, + "step": 8495 + }, + { + "epoch": 26.234930448222567, + "grad_norm": 0.313005656003952, + "learning_rate": 4.46341387995914e-05, + "loss": 0.301, + "num_input_tokens_seen": 12989456, + "step": 8500 + }, + { + "epoch": 26.250386398763524, + "grad_norm": 0.47910499572753906, + "learning_rate": 4.4628059950745106e-05, + "loss": 0.2186, + "num_input_tokens_seen": 12997008, + "step": 8505 + }, + { + "epoch": 26.26584234930448, + "grad_norm": 0.8150962591171265, + "learning_rate": 4.4621978075005297e-05, + "loss": 0.2735, + "num_input_tokens_seen": 13005072, + "step": 8510 + }, + { + "epoch": 26.28129829984544, + "grad_norm": 0.3989039361476898, + "learning_rate": 4.461589317330989e-05, + "loss": 0.2431, + "num_input_tokens_seen": 13012624, + "step": 8515 + }, + { + "epoch": 26.2967542503864, + "grad_norm": 0.5313324928283691, + "learning_rate": 4.460980524659724e-05, + "loss": 0.2875, + "num_input_tokens_seen": 13020240, + "step": 8520 + }, + { + "epoch": 26.312210200927357, + "grad_norm": 0.418815016746521, + "learning_rate": 4.46037142958062e-05, + "loss": 0.2452, + "num_input_tokens_seen": 13027120, + "step": 8525 + }, + { + "epoch": 26.327666151468314, + "grad_norm": 0.5997875332832336, + "learning_rate": 4.4597620321876046e-05, + "loss": 0.3014, + "num_input_tokens_seen": 13034896, + "step": 8530 + }, + { + "epoch": 26.343122102009275, + "grad_norm": 0.5591450929641724, + "learning_rate": 4.459152332574656e-05, + "loss": 0.386, + "num_input_tokens_seen": 13042704, + "step": 8535 + }, + { + "epoch": 26.358578052550232, + "grad_norm": 0.6926552057266235, + "learning_rate": 4.4585423308357985e-05, + "loss": 0.3516, + "num_input_tokens_seen": 13050544, + "step": 8540 + }, + { + "epoch": 26.37403400309119, + "grad_norm": 0.4852924644947052, + "learning_rate": 4.457932027065102e-05, + "loss": 0.3114, + "num_input_tokens_seen": 13058096, + "step": 8545 + }, + { + "epoch": 26.389489953632147, + "grad_norm": 0.4965086281299591, + "learning_rate": 4.45732142135668e-05, + "loss": 0.3206, + "num_input_tokens_seen": 13065872, + "step": 8550 + }, + { + "epoch": 26.404945904173108, + "grad_norm": 0.45236608386039734, + "learning_rate": 4.4567105138046986e-05, + "loss": 0.2981, + "num_input_tokens_seen": 13073744, + "step": 8555 + }, + { + "epoch": 26.420401854714065, + "grad_norm": 0.54484623670578, + "learning_rate": 4.456099304503365e-05, + "loss": 0.2772, + "num_input_tokens_seen": 13081488, + "step": 8560 + }, + { + "epoch": 26.435857805255022, + "grad_norm": 0.5481235980987549, + "learning_rate": 4.455487793546939e-05, + "loss": 0.2236, + "num_input_tokens_seen": 13089232, + "step": 8565 + }, + { + "epoch": 26.451313755795983, + "grad_norm": 0.3690958321094513, + "learning_rate": 4.454875981029719e-05, + "loss": 0.2304, + "num_input_tokens_seen": 13096176, + "step": 8570 + }, + { + "epoch": 26.46676970633694, + "grad_norm": 0.5113763809204102, + "learning_rate": 4.454263867046057e-05, + "loss": 0.3407, + "num_input_tokens_seen": 13103536, + "step": 8575 + }, + { + "epoch": 26.482225656877898, + "grad_norm": 1.4303327798843384, + "learning_rate": 4.4536514516903484e-05, + "loss": 0.2759, + "num_input_tokens_seen": 13111088, + "step": 8580 + }, + { + "epoch": 26.497681607418855, + "grad_norm": 0.5628947019577026, + "learning_rate": 4.453038735057034e-05, + "loss": 0.3171, + "num_input_tokens_seen": 13118736, + "step": 8585 + }, + { + "epoch": 26.513137557959816, + "grad_norm": 1.0864015817642212, + "learning_rate": 4.4524257172406034e-05, + "loss": 0.3613, + "num_input_tokens_seen": 13126448, + "step": 8590 + }, + { + "epoch": 26.528593508500773, + "grad_norm": 0.5571060180664062, + "learning_rate": 4.451812398335592e-05, + "loss": 0.2612, + "num_input_tokens_seen": 13133840, + "step": 8595 + }, + { + "epoch": 26.54404945904173, + "grad_norm": 0.6151509881019592, + "learning_rate": 4.4511987784365805e-05, + "loss": 0.3197, + "num_input_tokens_seen": 13142096, + "step": 8600 + }, + { + "epoch": 26.54404945904173, + "eval_loss": 0.35142090916633606, + "eval_runtime": 6.3201, + "eval_samples_per_second": 90.98, + "eval_steps_per_second": 22.785, + "num_input_tokens_seen": 13142096, + "step": 8600 + }, + { + "epoch": 26.55950540958269, + "grad_norm": 0.5520145893096924, + "learning_rate": 4.450584857638197e-05, + "loss": 0.3334, + "num_input_tokens_seen": 13149616, + "step": 8605 + }, + { + "epoch": 26.57496136012365, + "grad_norm": 0.3731798231601715, + "learning_rate": 4.449970636035116e-05, + "loss": 0.3215, + "num_input_tokens_seen": 13156944, + "step": 8610 + }, + { + "epoch": 26.590417310664606, + "grad_norm": 0.6272196769714355, + "learning_rate": 4.4493561137220574e-05, + "loss": 0.2571, + "num_input_tokens_seen": 13164400, + "step": 8615 + }, + { + "epoch": 26.605873261205563, + "grad_norm": 0.45888790488243103, + "learning_rate": 4.44874129079379e-05, + "loss": 0.2586, + "num_input_tokens_seen": 13172688, + "step": 8620 + }, + { + "epoch": 26.621329211746524, + "grad_norm": 0.7035101652145386, + "learning_rate": 4.4481261673451255e-05, + "loss": 0.2741, + "num_input_tokens_seen": 13180368, + "step": 8625 + }, + { + "epoch": 26.63678516228748, + "grad_norm": 0.3630964457988739, + "learning_rate": 4.4475107434709245e-05, + "loss": 0.2512, + "num_input_tokens_seen": 13187888, + "step": 8630 + }, + { + "epoch": 26.652241112828438, + "grad_norm": 0.3761705458164215, + "learning_rate": 4.446895019266093e-05, + "loss": 0.2436, + "num_input_tokens_seen": 13195312, + "step": 8635 + }, + { + "epoch": 26.667697063369395, + "grad_norm": 0.5136421918869019, + "learning_rate": 4.446278994825583e-05, + "loss": 0.3418, + "num_input_tokens_seen": 13203184, + "step": 8640 + }, + { + "epoch": 26.683153013910356, + "grad_norm": 0.4875640571117401, + "learning_rate": 4.445662670244394e-05, + "loss": 0.2205, + "num_input_tokens_seen": 13211344, + "step": 8645 + }, + { + "epoch": 26.698608964451314, + "grad_norm": 0.8344579339027405, + "learning_rate": 4.44504604561757e-05, + "loss": 0.3737, + "num_input_tokens_seen": 13218608, + "step": 8650 + }, + { + "epoch": 26.71406491499227, + "grad_norm": 0.7512809038162231, + "learning_rate": 4.4444291210402035e-05, + "loss": 0.3497, + "num_input_tokens_seen": 13226416, + "step": 8655 + }, + { + "epoch": 26.72952086553323, + "grad_norm": 0.6743752956390381, + "learning_rate": 4.443811896607431e-05, + "loss": 0.3248, + "num_input_tokens_seen": 13234352, + "step": 8660 + }, + { + "epoch": 26.74497681607419, + "grad_norm": 0.2814044654369354, + "learning_rate": 4.443194372414436e-05, + "loss": 0.3771, + "num_input_tokens_seen": 13242256, + "step": 8665 + }, + { + "epoch": 26.760432766615146, + "grad_norm": 0.7426875233650208, + "learning_rate": 4.442576548556449e-05, + "loss": 0.2928, + "num_input_tokens_seen": 13250160, + "step": 8670 + }, + { + "epoch": 26.775888717156104, + "grad_norm": 0.9409419894218445, + "learning_rate": 4.441958425128747e-05, + "loss": 0.2989, + "num_input_tokens_seen": 13257584, + "step": 8675 + }, + { + "epoch": 26.791344667697064, + "grad_norm": 0.546934962272644, + "learning_rate": 4.4413400022266515e-05, + "loss": 0.3224, + "num_input_tokens_seen": 13265104, + "step": 8680 + }, + { + "epoch": 26.80680061823802, + "grad_norm": 0.5068230032920837, + "learning_rate": 4.4407212799455313e-05, + "loss": 0.2927, + "num_input_tokens_seen": 13272496, + "step": 8685 + }, + { + "epoch": 26.82225656877898, + "grad_norm": 0.38829970359802246, + "learning_rate": 4.4401022583808003e-05, + "loss": 0.2901, + "num_input_tokens_seen": 13280048, + "step": 8690 + }, + { + "epoch": 26.83771251931994, + "grad_norm": 0.6055241823196411, + "learning_rate": 4.439482937627921e-05, + "loss": 0.3898, + "num_input_tokens_seen": 13287376, + "step": 8695 + }, + { + "epoch": 26.853168469860897, + "grad_norm": 0.6446306109428406, + "learning_rate": 4.4388633177824004e-05, + "loss": 0.3588, + "num_input_tokens_seen": 13295120, + "step": 8700 + }, + { + "epoch": 26.868624420401854, + "grad_norm": 0.4330461025238037, + "learning_rate": 4.4382433989397895e-05, + "loss": 0.2473, + "num_input_tokens_seen": 13302832, + "step": 8705 + }, + { + "epoch": 26.88408037094281, + "grad_norm": 0.699151873588562, + "learning_rate": 4.4376231811956895e-05, + "loss": 0.2374, + "num_input_tokens_seen": 13310608, + "step": 8710 + }, + { + "epoch": 26.899536321483772, + "grad_norm": 0.5024514198303223, + "learning_rate": 4.437002664645745e-05, + "loss": 0.2331, + "num_input_tokens_seen": 13318256, + "step": 8715 + }, + { + "epoch": 26.91499227202473, + "grad_norm": 0.6261274218559265, + "learning_rate": 4.436381849385649e-05, + "loss": 0.2897, + "num_input_tokens_seen": 13325744, + "step": 8720 + }, + { + "epoch": 26.930448222565687, + "grad_norm": 0.42051443457603455, + "learning_rate": 4.435760735511136e-05, + "loss": 0.3101, + "num_input_tokens_seen": 13333104, + "step": 8725 + }, + { + "epoch": 26.945904173106648, + "grad_norm": 0.34936216473579407, + "learning_rate": 4.435139323117992e-05, + "loss": 0.3167, + "num_input_tokens_seen": 13341488, + "step": 8730 + }, + { + "epoch": 26.961360123647605, + "grad_norm": 0.6066237092018127, + "learning_rate": 4.434517612302046e-05, + "loss": 0.2558, + "num_input_tokens_seen": 13349360, + "step": 8735 + }, + { + "epoch": 26.976816074188562, + "grad_norm": 0.6485305428504944, + "learning_rate": 4.433895603159174e-05, + "loss": 0.2976, + "num_input_tokens_seen": 13357488, + "step": 8740 + }, + { + "epoch": 26.99227202472952, + "grad_norm": 0.8013071417808533, + "learning_rate": 4.433273295785296e-05, + "loss": 0.2212, + "num_input_tokens_seen": 13364720, + "step": 8745 + }, + { + "epoch": 27.006182380216384, + "grad_norm": 0.4632079005241394, + "learning_rate": 4.432650690276382e-05, + "loss": 0.326, + "num_input_tokens_seen": 13371456, + "step": 8750 + }, + { + "epoch": 27.02163833075734, + "grad_norm": 0.3349049389362335, + "learning_rate": 4.4320277867284435e-05, + "loss": 0.3068, + "num_input_tokens_seen": 13378848, + "step": 8755 + }, + { + "epoch": 27.037094281298298, + "grad_norm": 0.6401165723800659, + "learning_rate": 4.431404585237541e-05, + "loss": 0.2568, + "num_input_tokens_seen": 13386400, + "step": 8760 + }, + { + "epoch": 27.05255023183926, + "grad_norm": 0.5713716745376587, + "learning_rate": 4.43078108589978e-05, + "loss": 0.2891, + "num_input_tokens_seen": 13394176, + "step": 8765 + }, + { + "epoch": 27.068006182380216, + "grad_norm": 0.44754350185394287, + "learning_rate": 4.4301572888113116e-05, + "loss": 0.3033, + "num_input_tokens_seen": 13401504, + "step": 8770 + }, + { + "epoch": 27.083462132921174, + "grad_norm": 0.3829056918621063, + "learning_rate": 4.4295331940683337e-05, + "loss": 0.2968, + "num_input_tokens_seen": 13409792, + "step": 8775 + }, + { + "epoch": 27.098918083462134, + "grad_norm": 0.5908631682395935, + "learning_rate": 4.428908801767089e-05, + "loss": 0.3106, + "num_input_tokens_seen": 13417120, + "step": 8780 + }, + { + "epoch": 27.11437403400309, + "grad_norm": 0.466864675283432, + "learning_rate": 4.428284112003868e-05, + "loss": 0.229, + "num_input_tokens_seen": 13424832, + "step": 8785 + }, + { + "epoch": 27.12982998454405, + "grad_norm": 0.42926129698753357, + "learning_rate": 4.4276591248750033e-05, + "loss": 0.2103, + "num_input_tokens_seen": 13432224, + "step": 8790 + }, + { + "epoch": 27.145285935085006, + "grad_norm": 0.6502004265785217, + "learning_rate": 4.4270338404768774e-05, + "loss": 0.2439, + "num_input_tokens_seen": 13440256, + "step": 8795 + }, + { + "epoch": 27.160741885625967, + "grad_norm": 0.40997394919395447, + "learning_rate": 4.426408258905917e-05, + "loss": 0.2863, + "num_input_tokens_seen": 13447712, + "step": 8800 + }, + { + "epoch": 27.160741885625967, + "eval_loss": 0.34981226921081543, + "eval_runtime": 6.3228, + "eval_samples_per_second": 90.94, + "eval_steps_per_second": 22.775, + "num_input_tokens_seen": 13447712, + "step": 8800 + }, + { + "epoch": 27.176197836166924, + "grad_norm": 0.6388409733772278, + "learning_rate": 4.425782380258594e-05, + "loss": 0.289, + "num_input_tokens_seen": 13455008, + "step": 8805 + }, + { + "epoch": 27.19165378670788, + "grad_norm": 0.4873054623603821, + "learning_rate": 4.425156204631427e-05, + "loss": 0.2647, + "num_input_tokens_seen": 13462464, + "step": 8810 + }, + { + "epoch": 27.207109737248842, + "grad_norm": 0.37333860993385315, + "learning_rate": 4.424529732120981e-05, + "loss": 0.3064, + "num_input_tokens_seen": 13470496, + "step": 8815 + }, + { + "epoch": 27.2225656877898, + "grad_norm": 0.40268343687057495, + "learning_rate": 4.423902962823864e-05, + "loss": 0.2344, + "num_input_tokens_seen": 13477920, + "step": 8820 + }, + { + "epoch": 27.238021638330757, + "grad_norm": 0.47473445534706116, + "learning_rate": 4.423275896836733e-05, + "loss": 0.3297, + "num_input_tokens_seen": 13485888, + "step": 8825 + }, + { + "epoch": 27.253477588871714, + "grad_norm": 0.6606960892677307, + "learning_rate": 4.42264853425629e-05, + "loss": 0.3107, + "num_input_tokens_seen": 13494080, + "step": 8830 + }, + { + "epoch": 27.268933539412675, + "grad_norm": 1.0731480121612549, + "learning_rate": 4.4220208751792816e-05, + "loss": 0.3805, + "num_input_tokens_seen": 13502336, + "step": 8835 + }, + { + "epoch": 27.284389489953632, + "grad_norm": 0.6454283595085144, + "learning_rate": 4.421392919702499e-05, + "loss": 0.3074, + "num_input_tokens_seen": 13509504, + "step": 8840 + }, + { + "epoch": 27.29984544049459, + "grad_norm": 0.3383397161960602, + "learning_rate": 4.4207646679227846e-05, + "loss": 0.2498, + "num_input_tokens_seen": 13516640, + "step": 8845 + }, + { + "epoch": 27.315301391035547, + "grad_norm": 0.4419497847557068, + "learning_rate": 4.42013611993702e-05, + "loss": 0.2432, + "num_input_tokens_seen": 13523584, + "step": 8850 + }, + { + "epoch": 27.330757341576508, + "grad_norm": 0.7152637839317322, + "learning_rate": 4.419507275842135e-05, + "loss": 0.2914, + "num_input_tokens_seen": 13531584, + "step": 8855 + }, + { + "epoch": 27.346213292117465, + "grad_norm": 0.45782414078712463, + "learning_rate": 4.418878135735106e-05, + "loss": 0.3179, + "num_input_tokens_seen": 13539456, + "step": 8860 + }, + { + "epoch": 27.361669242658422, + "grad_norm": 0.6958245635032654, + "learning_rate": 4.418248699712955e-05, + "loss": 0.2837, + "num_input_tokens_seen": 13546688, + "step": 8865 + }, + { + "epoch": 27.377125193199383, + "grad_norm": 0.5145497918128967, + "learning_rate": 4.417618967872748e-05, + "loss": 0.3179, + "num_input_tokens_seen": 13554432, + "step": 8870 + }, + { + "epoch": 27.39258114374034, + "grad_norm": 0.48617130517959595, + "learning_rate": 4.4169889403115985e-05, + "loss": 0.2608, + "num_input_tokens_seen": 13562176, + "step": 8875 + }, + { + "epoch": 27.408037094281298, + "grad_norm": 0.48130595684051514, + "learning_rate": 4.4163586171266627e-05, + "loss": 0.277, + "num_input_tokens_seen": 13569184, + "step": 8880 + }, + { + "epoch": 27.423493044822255, + "grad_norm": 0.3088991641998291, + "learning_rate": 4.415727998415147e-05, + "loss": 0.3708, + "num_input_tokens_seen": 13576960, + "step": 8885 + }, + { + "epoch": 27.438948995363216, + "grad_norm": 0.5498400330543518, + "learning_rate": 4.4150970842742985e-05, + "loss": 0.3079, + "num_input_tokens_seen": 13584032, + "step": 8890 + }, + { + "epoch": 27.454404945904173, + "grad_norm": 0.4773470461368561, + "learning_rate": 4.4144658748014134e-05, + "loss": 0.2964, + "num_input_tokens_seen": 13591552, + "step": 8895 + }, + { + "epoch": 27.46986089644513, + "grad_norm": 0.6048139929771423, + "learning_rate": 4.413834370093831e-05, + "loss": 0.298, + "num_input_tokens_seen": 13598496, + "step": 8900 + }, + { + "epoch": 27.48531684698609, + "grad_norm": 0.4554649591445923, + "learning_rate": 4.413202570248939e-05, + "loss": 0.3326, + "num_input_tokens_seen": 13605728, + "step": 8905 + }, + { + "epoch": 27.50077279752705, + "grad_norm": 0.5528598427772522, + "learning_rate": 4.412570475364167e-05, + "loss": 0.329, + "num_input_tokens_seen": 13613760, + "step": 8910 + }, + { + "epoch": 27.516228748068006, + "grad_norm": 0.562095582485199, + "learning_rate": 4.411938085536994e-05, + "loss": 0.3141, + "num_input_tokens_seen": 13621408, + "step": 8915 + }, + { + "epoch": 27.531684698608963, + "grad_norm": 0.7619901299476624, + "learning_rate": 4.41130540086494e-05, + "loss": 0.3717, + "num_input_tokens_seen": 13628928, + "step": 8920 + }, + { + "epoch": 27.547140649149924, + "grad_norm": 0.5878492593765259, + "learning_rate": 4.4106724214455754e-05, + "loss": 0.2367, + "num_input_tokens_seen": 13636224, + "step": 8925 + }, + { + "epoch": 27.56259659969088, + "grad_norm": 0.5497180223464966, + "learning_rate": 4.4100391473765115e-05, + "loss": 0.2578, + "num_input_tokens_seen": 13643904, + "step": 8930 + }, + { + "epoch": 27.57805255023184, + "grad_norm": 0.4377579391002655, + "learning_rate": 4.409405578755408e-05, + "loss": 0.2449, + "num_input_tokens_seen": 13651744, + "step": 8935 + }, + { + "epoch": 27.5935085007728, + "grad_norm": 0.9892991781234741, + "learning_rate": 4.4087717156799705e-05, + "loss": 0.2218, + "num_input_tokens_seen": 13659264, + "step": 8940 + }, + { + "epoch": 27.608964451313756, + "grad_norm": 0.7504569292068481, + "learning_rate": 4.408137558247946e-05, + "loss": 0.3186, + "num_input_tokens_seen": 13667072, + "step": 8945 + }, + { + "epoch": 27.624420401854714, + "grad_norm": 0.645056426525116, + "learning_rate": 4.4075031065571306e-05, + "loss": 0.2967, + "num_input_tokens_seen": 13674656, + "step": 8950 + }, + { + "epoch": 27.63987635239567, + "grad_norm": 0.44258272647857666, + "learning_rate": 4.406868360705366e-05, + "loss": 0.3517, + "num_input_tokens_seen": 13682432, + "step": 8955 + }, + { + "epoch": 27.655332302936632, + "grad_norm": 0.8714371919631958, + "learning_rate": 4.406233320790536e-05, + "loss": 0.3049, + "num_input_tokens_seen": 13689856, + "step": 8960 + }, + { + "epoch": 27.67078825347759, + "grad_norm": 1.4262443780899048, + "learning_rate": 4.4055979869105734e-05, + "loss": 0.2936, + "num_input_tokens_seen": 13697856, + "step": 8965 + }, + { + "epoch": 27.686244204018546, + "grad_norm": 0.4248619079589844, + "learning_rate": 4.404962359163454e-05, + "loss": 0.2291, + "num_input_tokens_seen": 13706112, + "step": 8970 + }, + { + "epoch": 27.701700154559504, + "grad_norm": 0.6349568963050842, + "learning_rate": 4.404326437647199e-05, + "loss": 0.3186, + "num_input_tokens_seen": 13713952, + "step": 8975 + }, + { + "epoch": 27.717156105100464, + "grad_norm": 0.8287075161933899, + "learning_rate": 4.403690222459877e-05, + "loss": 0.3191, + "num_input_tokens_seen": 13721984, + "step": 8980 + }, + { + "epoch": 27.73261205564142, + "grad_norm": 0.5484983325004578, + "learning_rate": 4.4030537136995984e-05, + "loss": 0.2671, + "num_input_tokens_seen": 13729344, + "step": 8985 + }, + { + "epoch": 27.74806800618238, + "grad_norm": 0.5016254186630249, + "learning_rate": 4.402416911464523e-05, + "loss": 0.2919, + "num_input_tokens_seen": 13737120, + "step": 8990 + }, + { + "epoch": 27.76352395672334, + "grad_norm": 1.3476248979568481, + "learning_rate": 4.4017798158528516e-05, + "loss": 0.334, + "num_input_tokens_seen": 13744864, + "step": 8995 + }, + { + "epoch": 27.778979907264297, + "grad_norm": 0.6082625985145569, + "learning_rate": 4.401142426962834e-05, + "loss": 0.3435, + "num_input_tokens_seen": 13751968, + "step": 9000 + }, + { + "epoch": 27.778979907264297, + "eval_loss": 0.34834975004196167, + "eval_runtime": 6.2949, + "eval_samples_per_second": 91.343, + "eval_steps_per_second": 22.876, + "num_input_tokens_seen": 13751968, + "step": 9000 + }, + { + "epoch": 27.794435857805254, + "grad_norm": 0.557402491569519, + "learning_rate": 4.400504744892763e-05, + "loss": 0.2533, + "num_input_tokens_seen": 13759776, + "step": 9005 + }, + { + "epoch": 27.80989180834621, + "grad_norm": 0.7182505130767822, + "learning_rate": 4.399866769740975e-05, + "loss": 0.2795, + "num_input_tokens_seen": 13767840, + "step": 9010 + }, + { + "epoch": 27.825347758887172, + "grad_norm": 0.5791873931884766, + "learning_rate": 4.399228501605859e-05, + "loss": 0.2454, + "num_input_tokens_seen": 13775296, + "step": 9015 + }, + { + "epoch": 27.84080370942813, + "grad_norm": 0.7732223272323608, + "learning_rate": 4.398589940585839e-05, + "loss": 0.2627, + "num_input_tokens_seen": 13782976, + "step": 9020 + }, + { + "epoch": 27.856259659969087, + "grad_norm": 0.7872280478477478, + "learning_rate": 4.3979510867793917e-05, + "loss": 0.2983, + "num_input_tokens_seen": 13791200, + "step": 9025 + }, + { + "epoch": 27.871715610510048, + "grad_norm": 0.6161679625511169, + "learning_rate": 4.3973119402850346e-05, + "loss": 0.3537, + "num_input_tokens_seen": 13798688, + "step": 9030 + }, + { + "epoch": 27.887171561051005, + "grad_norm": 0.6600807905197144, + "learning_rate": 4.396672501201334e-05, + "loss": 0.2814, + "num_input_tokens_seen": 13806272, + "step": 9035 + }, + { + "epoch": 27.902627511591962, + "grad_norm": 0.4497867524623871, + "learning_rate": 4.396032769626899e-05, + "loss": 0.2865, + "num_input_tokens_seen": 13813792, + "step": 9040 + }, + { + "epoch": 27.91808346213292, + "grad_norm": 0.6469016671180725, + "learning_rate": 4.395392745660384e-05, + "loss": 0.3403, + "num_input_tokens_seen": 13821408, + "step": 9045 + }, + { + "epoch": 27.93353941267388, + "grad_norm": 0.3905344009399414, + "learning_rate": 4.394752429400488e-05, + "loss": 0.2987, + "num_input_tokens_seen": 13829504, + "step": 9050 + }, + { + "epoch": 27.948995363214838, + "grad_norm": 0.4307565689086914, + "learning_rate": 4.394111820945957e-05, + "loss": 0.374, + "num_input_tokens_seen": 13837248, + "step": 9055 + }, + { + "epoch": 27.964451313755795, + "grad_norm": 0.6758072972297668, + "learning_rate": 4.393470920395579e-05, + "loss": 0.2499, + "num_input_tokens_seen": 13844448, + "step": 9060 + }, + { + "epoch": 27.979907264296756, + "grad_norm": 0.2701876163482666, + "learning_rate": 4.392829727848192e-05, + "loss": 0.218, + "num_input_tokens_seen": 13852384, + "step": 9065 + }, + { + "epoch": 27.995363214837713, + "grad_norm": 0.4430687129497528, + "learning_rate": 4.392188243402673e-05, + "loss": 0.3589, + "num_input_tokens_seen": 13860064, + "step": 9070 + }, + { + "epoch": 28.009273570324574, + "grad_norm": 0.46872109174728394, + "learning_rate": 4.391546467157949e-05, + "loss": 0.26, + "num_input_tokens_seen": 13866736, + "step": 9075 + }, + { + "epoch": 28.024729520865534, + "grad_norm": 0.4725654423236847, + "learning_rate": 4.390904399212988e-05, + "loss": 0.3201, + "num_input_tokens_seen": 13874576, + "step": 9080 + }, + { + "epoch": 28.04018547140649, + "grad_norm": 0.5076780319213867, + "learning_rate": 4.390262039666807e-05, + "loss": 0.2683, + "num_input_tokens_seen": 13882352, + "step": 9085 + }, + { + "epoch": 28.05564142194745, + "grad_norm": 0.4635952413082123, + "learning_rate": 4.389619388618464e-05, + "loss": 0.2397, + "num_input_tokens_seen": 13890064, + "step": 9090 + }, + { + "epoch": 28.071097372488406, + "grad_norm": 0.4904319941997528, + "learning_rate": 4.3889764461670655e-05, + "loss": 0.2949, + "num_input_tokens_seen": 13897968, + "step": 9095 + }, + { + "epoch": 28.086553323029367, + "grad_norm": 0.6687608957290649, + "learning_rate": 4.38833321241176e-05, + "loss": 0.2746, + "num_input_tokens_seen": 13905680, + "step": 9100 + }, + { + "epoch": 28.102009273570324, + "grad_norm": 0.583832323551178, + "learning_rate": 4.3876896874517434e-05, + "loss": 0.2903, + "num_input_tokens_seen": 13913360, + "step": 9105 + }, + { + "epoch": 28.11746522411128, + "grad_norm": 0.42697954177856445, + "learning_rate": 4.3870458713862554e-05, + "loss": 0.2324, + "num_input_tokens_seen": 13921296, + "step": 9110 + }, + { + "epoch": 28.132921174652243, + "grad_norm": 0.7508499026298523, + "learning_rate": 4.386401764314579e-05, + "loss": 0.2617, + "num_input_tokens_seen": 13929040, + "step": 9115 + }, + { + "epoch": 28.1483771251932, + "grad_norm": 0.6485130786895752, + "learning_rate": 4.385757366336045e-05, + "loss": 0.3023, + "num_input_tokens_seen": 13936720, + "step": 9120 + }, + { + "epoch": 28.163833075734157, + "grad_norm": 0.459101140499115, + "learning_rate": 4.385112677550027e-05, + "loss": 0.3019, + "num_input_tokens_seen": 13945008, + "step": 9125 + }, + { + "epoch": 28.179289026275114, + "grad_norm": 0.4527605175971985, + "learning_rate": 4.384467698055945e-05, + "loss": 0.2769, + "num_input_tokens_seen": 13952976, + "step": 9130 + }, + { + "epoch": 28.194744976816075, + "grad_norm": 0.5153471231460571, + "learning_rate": 4.383822427953261e-05, + "loss": 0.2915, + "num_input_tokens_seen": 13960848, + "step": 9135 + }, + { + "epoch": 28.210200927357032, + "grad_norm": 0.6531121730804443, + "learning_rate": 4.3831768673414864e-05, + "loss": 0.2829, + "num_input_tokens_seen": 13968272, + "step": 9140 + }, + { + "epoch": 28.22565687789799, + "grad_norm": 0.6914727091789246, + "learning_rate": 4.382531016320173e-05, + "loss": 0.2366, + "num_input_tokens_seen": 13975824, + "step": 9145 + }, + { + "epoch": 28.24111282843895, + "grad_norm": 0.6506092548370361, + "learning_rate": 4.3818848749889184e-05, + "loss": 0.3476, + "num_input_tokens_seen": 13983856, + "step": 9150 + }, + { + "epoch": 28.256568778979908, + "grad_norm": 0.5600615739822388, + "learning_rate": 4.381238443447368e-05, + "loss": 0.2982, + "num_input_tokens_seen": 13991568, + "step": 9155 + }, + { + "epoch": 28.272024729520865, + "grad_norm": 0.6205595135688782, + "learning_rate": 4.380591721795208e-05, + "loss": 0.3252, + "num_input_tokens_seen": 13999024, + "step": 9160 + }, + { + "epoch": 28.287480680061822, + "grad_norm": 0.4225098192691803, + "learning_rate": 4.3799447101321723e-05, + "loss": 0.3269, + "num_input_tokens_seen": 14007056, + "step": 9165 + }, + { + "epoch": 28.302936630602783, + "grad_norm": 0.6238294839859009, + "learning_rate": 4.379297408558036e-05, + "loss": 0.289, + "num_input_tokens_seen": 14014832, + "step": 9170 + }, + { + "epoch": 28.31839258114374, + "grad_norm": 0.4848296046257019, + "learning_rate": 4.378649817172624e-05, + "loss": 0.3079, + "num_input_tokens_seen": 14022000, + "step": 9175 + }, + { + "epoch": 28.333848531684698, + "grad_norm": 0.5073214769363403, + "learning_rate": 4.378001936075801e-05, + "loss": 0.305, + "num_input_tokens_seen": 14029584, + "step": 9180 + }, + { + "epoch": 28.34930448222566, + "grad_norm": 0.6509828567504883, + "learning_rate": 4.377353765367479e-05, + "loss": 0.2624, + "num_input_tokens_seen": 14037744, + "step": 9185 + }, + { + "epoch": 28.364760432766616, + "grad_norm": 0.6652111411094666, + "learning_rate": 4.376705305147614e-05, + "loss": 0.2617, + "num_input_tokens_seen": 14045264, + "step": 9190 + }, + { + "epoch": 28.380216383307573, + "grad_norm": 0.3995722830295563, + "learning_rate": 4.376056555516206e-05, + "loss": 0.2466, + "num_input_tokens_seen": 14052560, + "step": 9195 + }, + { + "epoch": 28.39567233384853, + "grad_norm": 0.5553077459335327, + "learning_rate": 4.375407516573302e-05, + "loss": 0.255, + "num_input_tokens_seen": 14060176, + "step": 9200 + }, + { + "epoch": 28.39567233384853, + "eval_loss": 0.3485540747642517, + "eval_runtime": 6.3137, + "eval_samples_per_second": 91.072, + "eval_steps_per_second": 22.808, + "num_input_tokens_seen": 14060176, + "step": 9200 + }, + { + "epoch": 28.41112828438949, + "grad_norm": 0.6983447670936584, + "learning_rate": 4.3747581884189913e-05, + "loss": 0.3123, + "num_input_tokens_seen": 14067888, + "step": 9205 + }, + { + "epoch": 28.42658423493045, + "grad_norm": 0.5630178451538086, + "learning_rate": 4.374108571153408e-05, + "loss": 0.2616, + "num_input_tokens_seen": 14075088, + "step": 9210 + }, + { + "epoch": 28.442040185471406, + "grad_norm": 0.5947656035423279, + "learning_rate": 4.3734586648767316e-05, + "loss": 0.3452, + "num_input_tokens_seen": 14083024, + "step": 9215 + }, + { + "epoch": 28.457496136012363, + "grad_norm": 0.3112902045249939, + "learning_rate": 4.372808469689186e-05, + "loss": 0.3312, + "num_input_tokens_seen": 14091056, + "step": 9220 + }, + { + "epoch": 28.472952086553324, + "grad_norm": 0.382099986076355, + "learning_rate": 4.372157985691039e-05, + "loss": 0.2967, + "num_input_tokens_seen": 14098160, + "step": 9225 + }, + { + "epoch": 28.48840803709428, + "grad_norm": 0.5762952566146851, + "learning_rate": 4.371507212982603e-05, + "loss": 0.2683, + "num_input_tokens_seen": 14105808, + "step": 9230 + }, + { + "epoch": 28.50386398763524, + "grad_norm": 0.5581258535385132, + "learning_rate": 4.370856151664236e-05, + "loss": 0.2355, + "num_input_tokens_seen": 14113200, + "step": 9235 + }, + { + "epoch": 28.5193199381762, + "grad_norm": 0.49782970547676086, + "learning_rate": 4.3702048018363404e-05, + "loss": 0.2901, + "num_input_tokens_seen": 14120368, + "step": 9240 + }, + { + "epoch": 28.534775888717157, + "grad_norm": 0.44524896144866943, + "learning_rate": 4.369553163599362e-05, + "loss": 0.3042, + "num_input_tokens_seen": 14128240, + "step": 9245 + }, + { + "epoch": 28.550231839258114, + "grad_norm": 0.42281365394592285, + "learning_rate": 4.3689012370537904e-05, + "loss": 0.3009, + "num_input_tokens_seen": 14136112, + "step": 9250 + }, + { + "epoch": 28.56568778979907, + "grad_norm": 0.5369840860366821, + "learning_rate": 4.368249022300164e-05, + "loss": 0.2855, + "num_input_tokens_seen": 14143920, + "step": 9255 + }, + { + "epoch": 28.581143740340032, + "grad_norm": 0.4155443012714386, + "learning_rate": 4.367596519439059e-05, + "loss": 0.2629, + "num_input_tokens_seen": 14151792, + "step": 9260 + }, + { + "epoch": 28.59659969088099, + "grad_norm": 0.22021521627902985, + "learning_rate": 4.366943728571101e-05, + "loss": 0.321, + "num_input_tokens_seen": 14159888, + "step": 9265 + }, + { + "epoch": 28.612055641421946, + "grad_norm": 0.7689139246940613, + "learning_rate": 4.366290649796959e-05, + "loss": 0.2861, + "num_input_tokens_seen": 14167600, + "step": 9270 + }, + { + "epoch": 28.627511591962907, + "grad_norm": 0.505626380443573, + "learning_rate": 4.3656372832173456e-05, + "loss": 0.3054, + "num_input_tokens_seen": 14174704, + "step": 9275 + }, + { + "epoch": 28.642967542503865, + "grad_norm": 0.44637686014175415, + "learning_rate": 4.364983628933017e-05, + "loss": 0.3721, + "num_input_tokens_seen": 14182384, + "step": 9280 + }, + { + "epoch": 28.658423493044822, + "grad_norm": 0.7772952318191528, + "learning_rate": 4.364329687044777e-05, + "loss": 0.3012, + "num_input_tokens_seen": 14189808, + "step": 9285 + }, + { + "epoch": 28.67387944358578, + "grad_norm": 0.4094494581222534, + "learning_rate": 4.36367545765347e-05, + "loss": 0.2415, + "num_input_tokens_seen": 14197616, + "step": 9290 + }, + { + "epoch": 28.68933539412674, + "grad_norm": 0.45087605714797974, + "learning_rate": 4.363020940859988e-05, + "loss": 0.2325, + "num_input_tokens_seen": 14204848, + "step": 9295 + }, + { + "epoch": 28.704791344667697, + "grad_norm": 0.5493640303611755, + "learning_rate": 4.362366136765263e-05, + "loss": 0.2285, + "num_input_tokens_seen": 14212112, + "step": 9300 + }, + { + "epoch": 28.720247295208654, + "grad_norm": 0.7755668759346008, + "learning_rate": 4.361711045470278e-05, + "loss": 0.2441, + "num_input_tokens_seen": 14219344, + "step": 9305 + }, + { + "epoch": 28.735703245749615, + "grad_norm": 0.5528512001037598, + "learning_rate": 4.3610556670760524e-05, + "loss": 0.3297, + "num_input_tokens_seen": 14226480, + "step": 9310 + }, + { + "epoch": 28.751159196290573, + "grad_norm": 0.513531506061554, + "learning_rate": 4.360400001683657e-05, + "loss": 0.2915, + "num_input_tokens_seen": 14233936, + "step": 9315 + }, + { + "epoch": 28.76661514683153, + "grad_norm": 0.6262357234954834, + "learning_rate": 4.3597440493942e-05, + "loss": 0.2816, + "num_input_tokens_seen": 14241712, + "step": 9320 + }, + { + "epoch": 28.782071097372487, + "grad_norm": 0.7280137538909912, + "learning_rate": 4.3590878103088405e-05, + "loss": 0.3208, + "num_input_tokens_seen": 14249968, + "step": 9325 + }, + { + "epoch": 28.797527047913448, + "grad_norm": 0.566424548625946, + "learning_rate": 4.358431284528779e-05, + "loss": 0.2891, + "num_input_tokens_seen": 14257680, + "step": 9330 + }, + { + "epoch": 28.812982998454405, + "grad_norm": 0.6419042348861694, + "learning_rate": 4.357774472155257e-05, + "loss": 0.3018, + "num_input_tokens_seen": 14265072, + "step": 9335 + }, + { + "epoch": 28.828438948995363, + "grad_norm": 0.44815436005592346, + "learning_rate": 4.3571173732895664e-05, + "loss": 0.3498, + "num_input_tokens_seen": 14273040, + "step": 9340 + }, + { + "epoch": 28.84389489953632, + "grad_norm": 0.574682891368866, + "learning_rate": 4.356459988033039e-05, + "loss": 0.2697, + "num_input_tokens_seen": 14280560, + "step": 9345 + }, + { + "epoch": 28.85935085007728, + "grad_norm": 0.5814764499664307, + "learning_rate": 4.355802316487051e-05, + "loss": 0.2965, + "num_input_tokens_seen": 14288528, + "step": 9350 + }, + { + "epoch": 28.874806800618238, + "grad_norm": 0.4984791874885559, + "learning_rate": 4.355144358753025e-05, + "loss": 0.2292, + "num_input_tokens_seen": 14296304, + "step": 9355 + }, + { + "epoch": 28.890262751159195, + "grad_norm": 0.31972312927246094, + "learning_rate": 4.354486114932425e-05, + "loss": 0.322, + "num_input_tokens_seen": 14303728, + "step": 9360 + }, + { + "epoch": 28.905718701700156, + "grad_norm": 0.43316248059272766, + "learning_rate": 4.353827585126762e-05, + "loss": 0.3595, + "num_input_tokens_seen": 14311344, + "step": 9365 + }, + { + "epoch": 28.921174652241113, + "grad_norm": 0.8190271854400635, + "learning_rate": 4.353168769437588e-05, + "loss": 0.2711, + "num_input_tokens_seen": 14318736, + "step": 9370 + }, + { + "epoch": 28.93663060278207, + "grad_norm": 0.5076532959938049, + "learning_rate": 4.3525096679665014e-05, + "loss": 0.3557, + "num_input_tokens_seen": 14326064, + "step": 9375 + }, + { + "epoch": 28.952086553323028, + "grad_norm": 0.3330712914466858, + "learning_rate": 4.351850280815144e-05, + "loss": 0.2744, + "num_input_tokens_seen": 14333776, + "step": 9380 + }, + { + "epoch": 28.96754250386399, + "grad_norm": 0.919866144657135, + "learning_rate": 4.3511906080852014e-05, + "loss": 0.2736, + "num_input_tokens_seen": 14341200, + "step": 9385 + }, + { + "epoch": 28.982998454404946, + "grad_norm": 0.8121787309646606, + "learning_rate": 4.350530649878404e-05, + "loss": 0.3497, + "num_input_tokens_seen": 14348848, + "step": 9390 + }, + { + "epoch": 28.998454404945903, + "grad_norm": 0.3992689847946167, + "learning_rate": 4.3498704062965246e-05, + "loss": 0.2562, + "num_input_tokens_seen": 14356464, + "step": 9395 + }, + { + "epoch": 29.012364760432767, + "grad_norm": 0.5045272707939148, + "learning_rate": 4.3492098774413815e-05, + "loss": 0.294, + "num_input_tokens_seen": 14362928, + "step": 9400 + }, + { + "epoch": 29.012364760432767, + "eval_loss": 0.3449360430240631, + "eval_runtime": 6.3595, + "eval_samples_per_second": 90.416, + "eval_steps_per_second": 22.643, + "num_input_tokens_seen": 14362928, + "step": 9400 + }, + { + "epoch": 29.027820710973725, + "grad_norm": 0.4878784120082855, + "learning_rate": 4.3485490634148375e-05, + "loss": 0.2384, + "num_input_tokens_seen": 14370288, + "step": 9405 + }, + { + "epoch": 29.043276661514682, + "grad_norm": 0.6614915728569031, + "learning_rate": 4.347887964318797e-05, + "loss": 0.3583, + "num_input_tokens_seen": 14378224, + "step": 9410 + }, + { + "epoch": 29.058732612055643, + "grad_norm": 0.42330384254455566, + "learning_rate": 4.34722658025521e-05, + "loss": 0.2674, + "num_input_tokens_seen": 14385808, + "step": 9415 + }, + { + "epoch": 29.0741885625966, + "grad_norm": 0.5929175019264221, + "learning_rate": 4.346564911326071e-05, + "loss": 0.3137, + "num_input_tokens_seen": 14393200, + "step": 9420 + }, + { + "epoch": 29.089644513137557, + "grad_norm": 0.6802027821540833, + "learning_rate": 4.345902957633418e-05, + "loss": 0.2912, + "num_input_tokens_seen": 14400112, + "step": 9425 + }, + { + "epoch": 29.105100463678518, + "grad_norm": 0.4804799556732178, + "learning_rate": 4.345240719279331e-05, + "loss": 0.2908, + "num_input_tokens_seen": 14407472, + "step": 9430 + }, + { + "epoch": 29.120556414219475, + "grad_norm": 0.6810155510902405, + "learning_rate": 4.3445781963659374e-05, + "loss": 0.3366, + "num_input_tokens_seen": 14415696, + "step": 9435 + }, + { + "epoch": 29.136012364760433, + "grad_norm": 0.8109704256057739, + "learning_rate": 4.3439153889954045e-05, + "loss": 0.3805, + "num_input_tokens_seen": 14423184, + "step": 9440 + }, + { + "epoch": 29.15146831530139, + "grad_norm": 0.4552246928215027, + "learning_rate": 4.343252297269946e-05, + "loss": 0.241, + "num_input_tokens_seen": 14430512, + "step": 9445 + }, + { + "epoch": 29.16692426584235, + "grad_norm": 0.30502381920814514, + "learning_rate": 4.342588921291821e-05, + "loss": 0.2751, + "num_input_tokens_seen": 14437872, + "step": 9450 + }, + { + "epoch": 29.182380216383308, + "grad_norm": 0.3996569812297821, + "learning_rate": 4.341925261163328e-05, + "loss": 0.285, + "num_input_tokens_seen": 14446192, + "step": 9455 + }, + { + "epoch": 29.197836166924265, + "grad_norm": 0.4796683192253113, + "learning_rate": 4.341261316986813e-05, + "loss": 0.2632, + "num_input_tokens_seen": 14454224, + "step": 9460 + }, + { + "epoch": 29.213292117465222, + "grad_norm": 0.3094743490219116, + "learning_rate": 4.340597088864664e-05, + "loss": 0.217, + "num_input_tokens_seen": 14462000, + "step": 9465 + }, + { + "epoch": 29.228748068006183, + "grad_norm": 0.49363869428634644, + "learning_rate": 4.339932576899313e-05, + "loss": 0.2887, + "num_input_tokens_seen": 14469424, + "step": 9470 + }, + { + "epoch": 29.24420401854714, + "grad_norm": 0.6344819068908691, + "learning_rate": 4.3392677811932375e-05, + "loss": 0.2653, + "num_input_tokens_seen": 14476976, + "step": 9475 + }, + { + "epoch": 29.259659969088098, + "grad_norm": 0.6357038617134094, + "learning_rate": 4.338602701848956e-05, + "loss": 0.3527, + "num_input_tokens_seen": 14484688, + "step": 9480 + }, + { + "epoch": 29.27511591962906, + "grad_norm": 0.5443391799926758, + "learning_rate": 4.337937338969033e-05, + "loss": 0.2833, + "num_input_tokens_seen": 14492496, + "step": 9485 + }, + { + "epoch": 29.290571870170016, + "grad_norm": 0.6021100282669067, + "learning_rate": 4.337271692656075e-05, + "loss": 0.2432, + "num_input_tokens_seen": 14500176, + "step": 9490 + }, + { + "epoch": 29.306027820710973, + "grad_norm": 0.9955346584320068, + "learning_rate": 4.336605763012733e-05, + "loss": 0.3885, + "num_input_tokens_seen": 14507984, + "step": 9495 + }, + { + "epoch": 29.32148377125193, + "grad_norm": 0.4640214443206787, + "learning_rate": 4.3359395501417026e-05, + "loss": 0.237, + "num_input_tokens_seen": 14515376, + "step": 9500 + }, + { + "epoch": 29.33693972179289, + "grad_norm": 0.3795275390148163, + "learning_rate": 4.335273054145722e-05, + "loss": 0.2401, + "num_input_tokens_seen": 14522960, + "step": 9505 + }, + { + "epoch": 29.35239567233385, + "grad_norm": 0.521424412727356, + "learning_rate": 4.334606275127572e-05, + "loss": 0.2656, + "num_input_tokens_seen": 14531184, + "step": 9510 + }, + { + "epoch": 29.367851622874806, + "grad_norm": 0.3199806213378906, + "learning_rate": 4.33393921319008e-05, + "loss": 0.2905, + "num_input_tokens_seen": 14538800, + "step": 9515 + }, + { + "epoch": 29.383307573415767, + "grad_norm": 0.5883857607841492, + "learning_rate": 4.3332718684361146e-05, + "loss": 0.31, + "num_input_tokens_seen": 14546640, + "step": 9520 + }, + { + "epoch": 29.398763523956724, + "grad_norm": 0.5903661847114563, + "learning_rate": 4.332604240968588e-05, + "loss": 0.2359, + "num_input_tokens_seen": 14554128, + "step": 9525 + }, + { + "epoch": 29.41421947449768, + "grad_norm": 0.520275890827179, + "learning_rate": 4.331936330890459e-05, + "loss": 0.3734, + "num_input_tokens_seen": 14561296, + "step": 9530 + }, + { + "epoch": 29.42967542503864, + "grad_norm": 0.49417999386787415, + "learning_rate": 4.331268138304725e-05, + "loss": 0.287, + "num_input_tokens_seen": 14569232, + "step": 9535 + }, + { + "epoch": 29.4451313755796, + "grad_norm": 0.5397806763648987, + "learning_rate": 4.330599663314431e-05, + "loss": 0.2742, + "num_input_tokens_seen": 14577072, + "step": 9540 + }, + { + "epoch": 29.460587326120557, + "grad_norm": 0.7935118675231934, + "learning_rate": 4.329930906022665e-05, + "loss": 0.3079, + "num_input_tokens_seen": 14584752, + "step": 9545 + }, + { + "epoch": 29.476043276661514, + "grad_norm": 0.64582359790802, + "learning_rate": 4.3292618665325564e-05, + "loss": 0.2607, + "num_input_tokens_seen": 14592880, + "step": 9550 + }, + { + "epoch": 29.491499227202475, + "grad_norm": 0.7000830173492432, + "learning_rate": 4.3285925449472796e-05, + "loss": 0.3217, + "num_input_tokens_seen": 14600176, + "step": 9555 + }, + { + "epoch": 29.506955177743432, + "grad_norm": 0.6772957444190979, + "learning_rate": 4.327922941370054e-05, + "loss": 0.2384, + "num_input_tokens_seen": 14608112, + "step": 9560 + }, + { + "epoch": 29.52241112828439, + "grad_norm": 0.4327539801597595, + "learning_rate": 4.3272530559041384e-05, + "loss": 0.2824, + "num_input_tokens_seen": 14615408, + "step": 9565 + }, + { + "epoch": 29.537867078825347, + "grad_norm": 0.7448055744171143, + "learning_rate": 4.32658288865284e-05, + "loss": 0.3216, + "num_input_tokens_seen": 14622896, + "step": 9570 + }, + { + "epoch": 29.553323029366307, + "grad_norm": 0.5187138915061951, + "learning_rate": 4.325912439719505e-05, + "loss": 0.2478, + "num_input_tokens_seen": 14630608, + "step": 9575 + }, + { + "epoch": 29.568778979907265, + "grad_norm": 0.6162310242652893, + "learning_rate": 4.3252417092075266e-05, + "loss": 0.2503, + "num_input_tokens_seen": 14638736, + "step": 9580 + }, + { + "epoch": 29.584234930448222, + "grad_norm": 0.5053537487983704, + "learning_rate": 4.3245706972203385e-05, + "loss": 0.2906, + "num_input_tokens_seen": 14646160, + "step": 9585 + }, + { + "epoch": 29.59969088098918, + "grad_norm": 0.41387858986854553, + "learning_rate": 4.323899403861421e-05, + "loss": 0.3279, + "num_input_tokens_seen": 14654064, + "step": 9590 + }, + { + "epoch": 29.61514683153014, + "grad_norm": 0.3348860442638397, + "learning_rate": 4.3232278292342935e-05, + "loss": 0.2607, + "num_input_tokens_seen": 14661392, + "step": 9595 + }, + { + "epoch": 29.630602782071097, + "grad_norm": 0.8921270370483398, + "learning_rate": 4.322555973442524e-05, + "loss": 0.2661, + "num_input_tokens_seen": 14669168, + "step": 9600 + }, + { + "epoch": 29.630602782071097, + "eval_loss": 0.3428919017314911, + "eval_runtime": 6.3153, + "eval_samples_per_second": 91.048, + "eval_steps_per_second": 22.802, + "num_input_tokens_seen": 14669168, + "step": 9600 + }, + { + "epoch": 29.646058732612055, + "grad_norm": 0.770913302898407, + "learning_rate": 4.3218838365897184e-05, + "loss": 0.2865, + "num_input_tokens_seen": 14676912, + "step": 9605 + }, + { + "epoch": 29.661514683153015, + "grad_norm": 0.8367490768432617, + "learning_rate": 4.3212114187795306e-05, + "loss": 0.3135, + "num_input_tokens_seen": 14684784, + "step": 9610 + }, + { + "epoch": 29.676970633693973, + "grad_norm": 0.4448355436325073, + "learning_rate": 4.320538720115656e-05, + "loss": 0.2786, + "num_input_tokens_seen": 14692464, + "step": 9615 + }, + { + "epoch": 29.69242658423493, + "grad_norm": 0.5847178101539612, + "learning_rate": 4.319865740701831e-05, + "loss": 0.2515, + "num_input_tokens_seen": 14700592, + "step": 9620 + }, + { + "epoch": 29.707882534775887, + "grad_norm": 0.5935328006744385, + "learning_rate": 4.3191924806418396e-05, + "loss": 0.3191, + "num_input_tokens_seen": 14707952, + "step": 9625 + }, + { + "epoch": 29.723338485316848, + "grad_norm": 1.1169129610061646, + "learning_rate": 4.318518940039507e-05, + "loss": 0.2297, + "num_input_tokens_seen": 14715152, + "step": 9630 + }, + { + "epoch": 29.738794435857805, + "grad_norm": 0.6130616664886475, + "learning_rate": 4.3178451189987e-05, + "loss": 0.2958, + "num_input_tokens_seen": 14722544, + "step": 9635 + }, + { + "epoch": 29.754250386398763, + "grad_norm": 0.584479570388794, + "learning_rate": 4.3171710176233315e-05, + "loss": 0.2371, + "num_input_tokens_seen": 14729968, + "step": 9640 + }, + { + "epoch": 29.769706336939723, + "grad_norm": 0.5889752507209778, + "learning_rate": 4.316496636017355e-05, + "loss": 0.3163, + "num_input_tokens_seen": 14737264, + "step": 9645 + }, + { + "epoch": 29.78516228748068, + "grad_norm": 0.9950096607208252, + "learning_rate": 4.315821974284771e-05, + "loss": 0.2861, + "num_input_tokens_seen": 14744400, + "step": 9650 + }, + { + "epoch": 29.800618238021638, + "grad_norm": 0.47749942541122437, + "learning_rate": 4.315147032529619e-05, + "loss": 0.2981, + "num_input_tokens_seen": 14751856, + "step": 9655 + }, + { + "epoch": 29.816074188562595, + "grad_norm": 0.9806463122367859, + "learning_rate": 4.3144718108559845e-05, + "loss": 0.3644, + "num_input_tokens_seen": 14759344, + "step": 9660 + }, + { + "epoch": 29.831530139103556, + "grad_norm": 0.3734319508075714, + "learning_rate": 4.3137963093679945e-05, + "loss": 0.2116, + "num_input_tokens_seen": 14766960, + "step": 9665 + }, + { + "epoch": 29.846986089644513, + "grad_norm": 1.0214723348617554, + "learning_rate": 4.31312052816982e-05, + "loss": 0.293, + "num_input_tokens_seen": 14774864, + "step": 9670 + }, + { + "epoch": 29.86244204018547, + "grad_norm": 0.5102330446243286, + "learning_rate": 4.312444467365675e-05, + "loss": 0.3007, + "num_input_tokens_seen": 14782736, + "step": 9675 + }, + { + "epoch": 29.87789799072643, + "grad_norm": 0.6442170739173889, + "learning_rate": 4.311768127059816e-05, + "loss": 0.2938, + "num_input_tokens_seen": 14791024, + "step": 9680 + }, + { + "epoch": 29.89335394126739, + "grad_norm": 0.5886996388435364, + "learning_rate": 4.3110915073565444e-05, + "loss": 0.3179, + "num_input_tokens_seen": 14798704, + "step": 9685 + }, + { + "epoch": 29.908809891808346, + "grad_norm": 0.7398491501808167, + "learning_rate": 4.310414608360203e-05, + "loss": 0.2569, + "num_input_tokens_seen": 14806320, + "step": 9690 + }, + { + "epoch": 29.924265842349303, + "grad_norm": 0.36095866560935974, + "learning_rate": 4.309737430175177e-05, + "loss": 0.2723, + "num_input_tokens_seen": 14813648, + "step": 9695 + }, + { + "epoch": 29.939721792890264, + "grad_norm": 0.4619748890399933, + "learning_rate": 4.309059972905897e-05, + "loss": 0.3549, + "num_input_tokens_seen": 14821840, + "step": 9700 + }, + { + "epoch": 29.95517774343122, + "grad_norm": 0.713893473148346, + "learning_rate": 4.308382236656836e-05, + "loss": 0.3072, + "num_input_tokens_seen": 14829456, + "step": 9705 + }, + { + "epoch": 29.97063369397218, + "grad_norm": 0.5511335730552673, + "learning_rate": 4.307704221532507e-05, + "loss": 0.2404, + "num_input_tokens_seen": 14837488, + "step": 9710 + }, + { + "epoch": 29.986089644513136, + "grad_norm": 0.4224947988986969, + "learning_rate": 4.307025927637471e-05, + "loss": 0.242, + "num_input_tokens_seen": 14845360, + "step": 9715 + }, + { + "epoch": 30.0, + "grad_norm": 0.5247539281845093, + "learning_rate": 4.306347355076328e-05, + "loss": 0.3073, + "num_input_tokens_seen": 14852096, + "step": 9720 + }, + { + "epoch": 30.015455950540957, + "grad_norm": 0.3860490620136261, + "learning_rate": 4.305668503953724e-05, + "loss": 0.3158, + "num_input_tokens_seen": 14859424, + "step": 9725 + }, + { + "epoch": 30.030911901081918, + "grad_norm": 0.5082833170890808, + "learning_rate": 4.3049893743743436e-05, + "loss": 0.3208, + "num_input_tokens_seen": 14867264, + "step": 9730 + }, + { + "epoch": 30.046367851622875, + "grad_norm": 0.658906102180481, + "learning_rate": 4.304309966442919e-05, + "loss": 0.2821, + "num_input_tokens_seen": 14874688, + "step": 9735 + }, + { + "epoch": 30.061823802163833, + "grad_norm": 0.44582608342170715, + "learning_rate": 4.303630280264224e-05, + "loss": 0.2516, + "num_input_tokens_seen": 14881984, + "step": 9740 + }, + { + "epoch": 30.07727975270479, + "grad_norm": 0.41515350341796875, + "learning_rate": 4.302950315943074e-05, + "loss": 0.2725, + "num_input_tokens_seen": 14889312, + "step": 9745 + }, + { + "epoch": 30.09273570324575, + "grad_norm": 0.5317512154579163, + "learning_rate": 4.3022700735843275e-05, + "loss": 0.3035, + "num_input_tokens_seen": 14897472, + "step": 9750 + }, + { + "epoch": 30.108191653786708, + "grad_norm": 0.7523840665817261, + "learning_rate": 4.301589553292887e-05, + "loss": 0.2566, + "num_input_tokens_seen": 14905664, + "step": 9755 + }, + { + "epoch": 30.123647604327665, + "grad_norm": 0.6439720392227173, + "learning_rate": 4.300908755173697e-05, + "loss": 0.3338, + "num_input_tokens_seen": 14913120, + "step": 9760 + }, + { + "epoch": 30.139103554868626, + "grad_norm": 0.5437300205230713, + "learning_rate": 4.300227679331745e-05, + "loss": 0.3412, + "num_input_tokens_seen": 14920576, + "step": 9765 + }, + { + "epoch": 30.154559505409583, + "grad_norm": 0.4416978359222412, + "learning_rate": 4.299546325872063e-05, + "loss": 0.2438, + "num_input_tokens_seen": 14928224, + "step": 9770 + }, + { + "epoch": 30.17001545595054, + "grad_norm": 0.5853207111358643, + "learning_rate": 4.2988646948997225e-05, + "loss": 0.3175, + "num_input_tokens_seen": 14935616, + "step": 9775 + }, + { + "epoch": 30.185471406491498, + "grad_norm": 0.5278336405754089, + "learning_rate": 4.29818278651984e-05, + "loss": 0.3433, + "num_input_tokens_seen": 14942752, + "step": 9780 + }, + { + "epoch": 30.20092735703246, + "grad_norm": 0.42982664704322815, + "learning_rate": 4.297500600837574e-05, + "loss": 0.3344, + "num_input_tokens_seen": 14950400, + "step": 9785 + }, + { + "epoch": 30.216383307573416, + "grad_norm": 0.6631214618682861, + "learning_rate": 4.2968181379581276e-05, + "loss": 0.2894, + "num_input_tokens_seen": 14958304, + "step": 9790 + }, + { + "epoch": 30.231839258114373, + "grad_norm": 0.6328229904174805, + "learning_rate": 4.296135397986743e-05, + "loss": 0.3734, + "num_input_tokens_seen": 14965792, + "step": 9795 + }, + { + "epoch": 30.24729520865533, + "grad_norm": 0.7435451149940491, + "learning_rate": 4.295452381028709e-05, + "loss": 0.3367, + "num_input_tokens_seen": 14973568, + "step": 9800 + }, + { + "epoch": 30.24729520865533, + "eval_loss": 0.3407024145126343, + "eval_runtime": 6.3127, + "eval_samples_per_second": 91.086, + "eval_steps_per_second": 22.811, + "num_input_tokens_seen": 14973568, + "step": 9800 + }, + { + "epoch": 30.26275115919629, + "grad_norm": 0.485881507396698, + "learning_rate": 4.294769087189354e-05, + "loss": 0.2972, + "num_input_tokens_seen": 14981120, + "step": 9805 + }, + { + "epoch": 30.27820710973725, + "grad_norm": 0.4764610826969147, + "learning_rate": 4.294085516574052e-05, + "loss": 0.2555, + "num_input_tokens_seen": 14988704, + "step": 9810 + }, + { + "epoch": 30.293663060278206, + "grad_norm": 0.42536014318466187, + "learning_rate": 4.2934016692882176e-05, + "loss": 0.2158, + "num_input_tokens_seen": 14996256, + "step": 9815 + }, + { + "epoch": 30.309119010819167, + "grad_norm": 0.5213482975959778, + "learning_rate": 4.292717545437308e-05, + "loss": 0.2673, + "num_input_tokens_seen": 15003904, + "step": 9820 + }, + { + "epoch": 30.324574961360124, + "grad_norm": 1.0703859329223633, + "learning_rate": 4.292033145126825e-05, + "loss": 0.2715, + "num_input_tokens_seen": 15011008, + "step": 9825 + }, + { + "epoch": 30.34003091190108, + "grad_norm": 0.5694833993911743, + "learning_rate": 4.29134846846231e-05, + "loss": 0.2913, + "num_input_tokens_seen": 15018688, + "step": 9830 + }, + { + "epoch": 30.35548686244204, + "grad_norm": 0.6567373871803284, + "learning_rate": 4.29066351554935e-05, + "loss": 0.2962, + "num_input_tokens_seen": 15026656, + "step": 9835 + }, + { + "epoch": 30.370942812983, + "grad_norm": 0.7680280208587646, + "learning_rate": 4.289978286493574e-05, + "loss": 0.2512, + "num_input_tokens_seen": 15034272, + "step": 9840 + }, + { + "epoch": 30.386398763523957, + "grad_norm": 0.49822965264320374, + "learning_rate": 4.28929278140065e-05, + "loss": 0.3243, + "num_input_tokens_seen": 15041888, + "step": 9845 + }, + { + "epoch": 30.401854714064914, + "grad_norm": 0.4930824339389801, + "learning_rate": 4.288607000376295e-05, + "loss": 0.2329, + "num_input_tokens_seen": 15049024, + "step": 9850 + }, + { + "epoch": 30.417310664605875, + "grad_norm": 0.5417259931564331, + "learning_rate": 4.2879209435262624e-05, + "loss": 0.3395, + "num_input_tokens_seen": 15056640, + "step": 9855 + }, + { + "epoch": 30.432766615146832, + "grad_norm": 0.6675532460212708, + "learning_rate": 4.287234610956353e-05, + "loss": 0.29, + "num_input_tokens_seen": 15063968, + "step": 9860 + }, + { + "epoch": 30.44822256568779, + "grad_norm": 0.7095792889595032, + "learning_rate": 4.2865480027724056e-05, + "loss": 0.3504, + "num_input_tokens_seen": 15072032, + "step": 9865 + }, + { + "epoch": 30.463678516228747, + "grad_norm": 0.7892898917198181, + "learning_rate": 4.285861119080306e-05, + "loss": 0.2428, + "num_input_tokens_seen": 15079552, + "step": 9870 + }, + { + "epoch": 30.479134466769708, + "grad_norm": 0.46731382608413696, + "learning_rate": 4.2851739599859784e-05, + "loss": 0.2182, + "num_input_tokens_seen": 15088064, + "step": 9875 + }, + { + "epoch": 30.494590417310665, + "grad_norm": 0.5686298608779907, + "learning_rate": 4.2844865255953934e-05, + "loss": 0.2891, + "num_input_tokens_seen": 15095456, + "step": 9880 + }, + { + "epoch": 30.510046367851622, + "grad_norm": 0.8118075132369995, + "learning_rate": 4.2837988160145605e-05, + "loss": 0.2264, + "num_input_tokens_seen": 15103200, + "step": 9885 + }, + { + "epoch": 30.525502318392583, + "grad_norm": 0.40033283829689026, + "learning_rate": 4.2831108313495336e-05, + "loss": 0.2666, + "num_input_tokens_seen": 15110368, + "step": 9890 + }, + { + "epoch": 30.54095826893354, + "grad_norm": 0.3136499524116516, + "learning_rate": 4.282422571706408e-05, + "loss": 0.2545, + "num_input_tokens_seen": 15117984, + "step": 9895 + }, + { + "epoch": 30.556414219474497, + "grad_norm": 0.4120291769504547, + "learning_rate": 4.281734037191323e-05, + "loss": 0.2745, + "num_input_tokens_seen": 15125664, + "step": 9900 + }, + { + "epoch": 30.571870170015455, + "grad_norm": 0.628220796585083, + "learning_rate": 4.281045227910459e-05, + "loss": 0.2261, + "num_input_tokens_seen": 15132896, + "step": 9905 + }, + { + "epoch": 30.587326120556416, + "grad_norm": 0.4190906286239624, + "learning_rate": 4.280356143970038e-05, + "loss": 0.3298, + "num_input_tokens_seen": 15140928, + "step": 9910 + }, + { + "epoch": 30.602782071097373, + "grad_norm": 0.3189743459224701, + "learning_rate": 4.279666785476327e-05, + "loss": 0.3045, + "num_input_tokens_seen": 15148832, + "step": 9915 + }, + { + "epoch": 30.61823802163833, + "grad_norm": 0.46872711181640625, + "learning_rate": 4.2789771525356325e-05, + "loss": 0.2644, + "num_input_tokens_seen": 15156384, + "step": 9920 + }, + { + "epoch": 30.633693972179287, + "grad_norm": 0.5871984362602234, + "learning_rate": 4.2782872452543056e-05, + "loss": 0.2325, + "num_input_tokens_seen": 15164000, + "step": 9925 + }, + { + "epoch": 30.649149922720248, + "grad_norm": 0.4311677813529968, + "learning_rate": 4.2775970637387376e-05, + "loss": 0.2642, + "num_input_tokens_seen": 15171456, + "step": 9930 + }, + { + "epoch": 30.664605873261205, + "grad_norm": 0.5393868088722229, + "learning_rate": 4.276906608095363e-05, + "loss": 0.3147, + "num_input_tokens_seen": 15179264, + "step": 9935 + }, + { + "epoch": 30.680061823802163, + "grad_norm": 0.709825336933136, + "learning_rate": 4.276215878430661e-05, + "loss": 0.2618, + "num_input_tokens_seen": 15186528, + "step": 9940 + }, + { + "epoch": 30.695517774343124, + "grad_norm": 0.6609684228897095, + "learning_rate": 4.275524874851149e-05, + "loss": 0.2738, + "num_input_tokens_seen": 15194016, + "step": 9945 + }, + { + "epoch": 30.71097372488408, + "grad_norm": 0.4208090603351593, + "learning_rate": 4.274833597463388e-05, + "loss": 0.279, + "num_input_tokens_seen": 15201408, + "step": 9950 + }, + { + "epoch": 30.726429675425038, + "grad_norm": 0.40816500782966614, + "learning_rate": 4.2741420463739824e-05, + "loss": 0.266, + "num_input_tokens_seen": 15209312, + "step": 9955 + }, + { + "epoch": 30.741885625965995, + "grad_norm": 0.5730279684066772, + "learning_rate": 4.273450221689578e-05, + "loss": 0.336, + "num_input_tokens_seen": 15217280, + "step": 9960 + }, + { + "epoch": 30.757341576506956, + "grad_norm": 0.5307969450950623, + "learning_rate": 4.272758123516863e-05, + "loss": 0.275, + "num_input_tokens_seen": 15225472, + "step": 9965 + }, + { + "epoch": 30.772797527047913, + "grad_norm": 0.7318836450576782, + "learning_rate": 4.272065751962567e-05, + "loss": 0.2508, + "num_input_tokens_seen": 15233024, + "step": 9970 + }, + { + "epoch": 30.78825347758887, + "grad_norm": 0.5756546258926392, + "learning_rate": 4.271373107133464e-05, + "loss": 0.4038, + "num_input_tokens_seen": 15240992, + "step": 9975 + }, + { + "epoch": 30.80370942812983, + "grad_norm": 0.7191903591156006, + "learning_rate": 4.270680189136366e-05, + "loss": 0.2404, + "num_input_tokens_seen": 15248576, + "step": 9980 + }, + { + "epoch": 30.81916537867079, + "grad_norm": 0.40319013595581055, + "learning_rate": 4.269986998078132e-05, + "loss": 0.2923, + "num_input_tokens_seen": 15256384, + "step": 9985 + }, + { + "epoch": 30.834621329211746, + "grad_norm": 0.6335333585739136, + "learning_rate": 4.2692935340656595e-05, + "loss": 0.2662, + "num_input_tokens_seen": 15263872, + "step": 9990 + }, + { + "epoch": 30.850077279752703, + "grad_norm": 0.457205206155777, + "learning_rate": 4.26859979720589e-05, + "loss": 0.285, + "num_input_tokens_seen": 15271936, + "step": 9995 + }, + { + "epoch": 30.865533230293664, + "grad_norm": 0.6257767081260681, + "learning_rate": 4.267905787605806e-05, + "loss": 0.2961, + "num_input_tokens_seen": 15279840, + "step": 10000 + }, + { + "epoch": 30.865533230293664, + "eval_loss": 0.3408345580101013, + "eval_runtime": 6.2752, + "eval_samples_per_second": 91.631, + "eval_steps_per_second": 22.948, + "num_input_tokens_seen": 15279840, + "step": 10000 + }, + { + "epoch": 30.88098918083462, + "grad_norm": 0.5293672680854797, + "learning_rate": 4.267211505372433e-05, + "loss": 0.2509, + "num_input_tokens_seen": 15287680, + "step": 10005 + }, + { + "epoch": 30.89644513137558, + "grad_norm": 0.46837523579597473, + "learning_rate": 4.266516950612837e-05, + "loss": 0.2474, + "num_input_tokens_seen": 15294816, + "step": 10010 + }, + { + "epoch": 30.91190108191654, + "grad_norm": 0.5652144551277161, + "learning_rate": 4.265822123434128e-05, + "loss": 0.2926, + "num_input_tokens_seen": 15302880, + "step": 10015 + }, + { + "epoch": 30.927357032457497, + "grad_norm": 0.3721048831939697, + "learning_rate": 4.265127023943457e-05, + "loss": 0.2979, + "num_input_tokens_seen": 15311008, + "step": 10020 + }, + { + "epoch": 30.942812982998454, + "grad_norm": 0.8226016759872437, + "learning_rate": 4.2644316522480176e-05, + "loss": 0.3355, + "num_input_tokens_seen": 15319232, + "step": 10025 + }, + { + "epoch": 30.95826893353941, + "grad_norm": 0.5333162546157837, + "learning_rate": 4.263736008455044e-05, + "loss": 0.3018, + "num_input_tokens_seen": 15326752, + "step": 10030 + }, + { + "epoch": 30.973724884080372, + "grad_norm": 0.5890175104141235, + "learning_rate": 4.2630400926718125e-05, + "loss": 0.3649, + "num_input_tokens_seen": 15334720, + "step": 10035 + }, + { + "epoch": 30.98918083462133, + "grad_norm": 0.6330624222755432, + "learning_rate": 4.262343905005644e-05, + "loss": 0.2623, + "num_input_tokens_seen": 15342240, + "step": 10040 + }, + { + "epoch": 31.00309119010819, + "grad_norm": 0.4780576825141907, + "learning_rate": 4.261647445563897e-05, + "loss": 0.2174, + "num_input_tokens_seen": 15349008, + "step": 10045 + }, + { + "epoch": 31.01854714064915, + "grad_norm": 0.5741416811943054, + "learning_rate": 4.260950714453976e-05, + "loss": 0.2548, + "num_input_tokens_seen": 15356656, + "step": 10050 + }, + { + "epoch": 31.034003091190108, + "grad_norm": 0.6281588077545166, + "learning_rate": 4.2602537117833266e-05, + "loss": 0.3068, + "num_input_tokens_seen": 15364464, + "step": 10055 + }, + { + "epoch": 31.049459041731065, + "grad_norm": 0.807810366153717, + "learning_rate": 4.259556437659433e-05, + "loss": 0.3219, + "num_input_tokens_seen": 15372272, + "step": 10060 + }, + { + "epoch": 31.064914992272026, + "grad_norm": 0.5424848794937134, + "learning_rate": 4.258858892189825e-05, + "loss": 0.3269, + "num_input_tokens_seen": 15379792, + "step": 10065 + }, + { + "epoch": 31.080370942812984, + "grad_norm": 0.597834050655365, + "learning_rate": 4.2581610754820725e-05, + "loss": 0.282, + "num_input_tokens_seen": 15387824, + "step": 10070 + }, + { + "epoch": 31.09582689335394, + "grad_norm": 0.39408233761787415, + "learning_rate": 4.2574629876437876e-05, + "loss": 0.2923, + "num_input_tokens_seen": 15395568, + "step": 10075 + }, + { + "epoch": 31.111282843894898, + "grad_norm": 0.7660068273544312, + "learning_rate": 4.256764628782625e-05, + "loss": 0.2433, + "num_input_tokens_seen": 15403120, + "step": 10080 + }, + { + "epoch": 31.12673879443586, + "grad_norm": 0.44384029507637024, + "learning_rate": 4.256065999006279e-05, + "loss": 0.2525, + "num_input_tokens_seen": 15410608, + "step": 10085 + }, + { + "epoch": 31.142194744976816, + "grad_norm": 0.7852585911750793, + "learning_rate": 4.2553670984224885e-05, + "loss": 0.3266, + "num_input_tokens_seen": 15418384, + "step": 10090 + }, + { + "epoch": 31.157650695517773, + "grad_norm": 0.6457718014717102, + "learning_rate": 4.254667927139032e-05, + "loss": 0.2823, + "num_input_tokens_seen": 15425808, + "step": 10095 + }, + { + "epoch": 31.173106646058734, + "grad_norm": 0.5525968074798584, + "learning_rate": 4.2539684852637295e-05, + "loss": 0.305, + "num_input_tokens_seen": 15434128, + "step": 10100 + }, + { + "epoch": 31.18856259659969, + "grad_norm": 0.5769906044006348, + "learning_rate": 4.253268772904446e-05, + "loss": 0.254, + "num_input_tokens_seen": 15441872, + "step": 10105 + }, + { + "epoch": 31.20401854714065, + "grad_norm": 0.5488436222076416, + "learning_rate": 4.252568790169085e-05, + "loss": 0.2737, + "num_input_tokens_seen": 15449040, + "step": 10110 + }, + { + "epoch": 31.219474497681606, + "grad_norm": 0.5212190747261047, + "learning_rate": 4.251868537165592e-05, + "loss": 0.2265, + "num_input_tokens_seen": 15457264, + "step": 10115 + }, + { + "epoch": 31.234930448222567, + "grad_norm": 0.8649855256080627, + "learning_rate": 4.251168014001955e-05, + "loss": 0.2669, + "num_input_tokens_seen": 15464912, + "step": 10120 + }, + { + "epoch": 31.250386398763524, + "grad_norm": 1.0315202474594116, + "learning_rate": 4.250467220786204e-05, + "loss": 0.3505, + "num_input_tokens_seen": 15472080, + "step": 10125 + }, + { + "epoch": 31.26584234930448, + "grad_norm": 0.508949875831604, + "learning_rate": 4.249766157626409e-05, + "loss": 0.2876, + "num_input_tokens_seen": 15479888, + "step": 10130 + }, + { + "epoch": 31.28129829984544, + "grad_norm": 0.4617001414299011, + "learning_rate": 4.249064824630684e-05, + "loss": 0.2315, + "num_input_tokens_seen": 15487088, + "step": 10135 + }, + { + "epoch": 31.2967542503864, + "grad_norm": 0.5922565460205078, + "learning_rate": 4.248363221907183e-05, + "loss": 0.2478, + "num_input_tokens_seen": 15494640, + "step": 10140 + }, + { + "epoch": 31.312210200927357, + "grad_norm": 0.5480470657348633, + "learning_rate": 4.2476613495641026e-05, + "loss": 0.3536, + "num_input_tokens_seen": 15502224, + "step": 10145 + }, + { + "epoch": 31.327666151468314, + "grad_norm": 0.9901421666145325, + "learning_rate": 4.246959207709679e-05, + "loss": 0.2608, + "num_input_tokens_seen": 15510256, + "step": 10150 + }, + { + "epoch": 31.343122102009275, + "grad_norm": 0.7880836129188538, + "learning_rate": 4.246256796452192e-05, + "loss": 0.2851, + "num_input_tokens_seen": 15517136, + "step": 10155 + }, + { + "epoch": 31.358578052550232, + "grad_norm": 0.3433455526828766, + "learning_rate": 4.245554115899962e-05, + "loss": 0.2478, + "num_input_tokens_seen": 15524816, + "step": 10160 + }, + { + "epoch": 31.37403400309119, + "grad_norm": 0.6122440099716187, + "learning_rate": 4.2448511661613514e-05, + "loss": 0.2739, + "num_input_tokens_seen": 15532368, + "step": 10165 + }, + { + "epoch": 31.389489953632147, + "grad_norm": 0.5416987538337708, + "learning_rate": 4.2441479473447635e-05, + "loss": 0.3554, + "num_input_tokens_seen": 15540912, + "step": 10170 + }, + { + "epoch": 31.404945904173108, + "grad_norm": 0.9207471013069153, + "learning_rate": 4.243444459558644e-05, + "loss": 0.2868, + "num_input_tokens_seen": 15548304, + "step": 10175 + }, + { + "epoch": 31.420401854714065, + "grad_norm": 0.41455453634262085, + "learning_rate": 4.24274070291148e-05, + "loss": 0.3845, + "num_input_tokens_seen": 15556048, + "step": 10180 + }, + { + "epoch": 31.435857805255022, + "grad_norm": 0.6432345509529114, + "learning_rate": 4.242036677511798e-05, + "loss": 0.2313, + "num_input_tokens_seen": 15563472, + "step": 10185 + }, + { + "epoch": 31.451313755795983, + "grad_norm": 0.46978598833084106, + "learning_rate": 4.241332383468169e-05, + "loss": 0.3005, + "num_input_tokens_seen": 15571280, + "step": 10190 + }, + { + "epoch": 31.46676970633694, + "grad_norm": 0.7473583817481995, + "learning_rate": 4.2406278208892034e-05, + "loss": 0.2749, + "num_input_tokens_seen": 15578736, + "step": 10195 + }, + { + "epoch": 31.482225656877898, + "grad_norm": 0.5385870933532715, + "learning_rate": 4.2399229898835536e-05, + "loss": 0.2853, + "num_input_tokens_seen": 15586352, + "step": 10200 + }, + { + "epoch": 31.482225656877898, + "eval_loss": 0.3394823670387268, + "eval_runtime": 6.3142, + "eval_samples_per_second": 91.065, + "eval_steps_per_second": 22.806, + "num_input_tokens_seen": 15586352, + "step": 10200 + }, + { + "epoch": 31.497681607418855, + "grad_norm": 0.7769055366516113, + "learning_rate": 4.239217890559914e-05, + "loss": 0.3173, + "num_input_tokens_seen": 15594032, + "step": 10205 + }, + { + "epoch": 31.513137557959816, + "grad_norm": 0.5724741220474243, + "learning_rate": 4.238512523027019e-05, + "loss": 0.2804, + "num_input_tokens_seen": 15601680, + "step": 10210 + }, + { + "epoch": 31.528593508500773, + "grad_norm": 0.3057754933834076, + "learning_rate": 4.237806887393645e-05, + "loss": 0.2902, + "num_input_tokens_seen": 15609424, + "step": 10215 + }, + { + "epoch": 31.54404945904173, + "grad_norm": 0.9923636317253113, + "learning_rate": 4.237100983768611e-05, + "loss": 0.2135, + "num_input_tokens_seen": 15617104, + "step": 10220 + }, + { + "epoch": 31.55950540958269, + "grad_norm": 0.6801339387893677, + "learning_rate": 4.2363948122607756e-05, + "loss": 0.277, + "num_input_tokens_seen": 15624528, + "step": 10225 + }, + { + "epoch": 31.57496136012365, + "grad_norm": 0.6390348076820374, + "learning_rate": 4.235688372979039e-05, + "loss": 0.2738, + "num_input_tokens_seen": 15632368, + "step": 10230 + }, + { + "epoch": 31.590417310664606, + "grad_norm": 0.8357334733009338, + "learning_rate": 4.234981666032343e-05, + "loss": 0.2828, + "num_input_tokens_seen": 15639984, + "step": 10235 + }, + { + "epoch": 31.605873261205563, + "grad_norm": 0.5288840532302856, + "learning_rate": 4.2342746915296704e-05, + "loss": 0.3347, + "num_input_tokens_seen": 15647248, + "step": 10240 + }, + { + "epoch": 31.621329211746524, + "grad_norm": 0.4395450949668884, + "learning_rate": 4.233567449580047e-05, + "loss": 0.2498, + "num_input_tokens_seen": 15654832, + "step": 10245 + }, + { + "epoch": 31.63678516228748, + "grad_norm": 0.5019509792327881, + "learning_rate": 4.232859940292537e-05, + "loss": 0.2762, + "num_input_tokens_seen": 15662608, + "step": 10250 + }, + { + "epoch": 31.652241112828438, + "grad_norm": 0.3621577322483063, + "learning_rate": 4.232152163776248e-05, + "loss": 0.2403, + "num_input_tokens_seen": 15670704, + "step": 10255 + }, + { + "epoch": 31.667697063369395, + "grad_norm": 0.43427619338035583, + "learning_rate": 4.231444120140328e-05, + "loss": 0.2384, + "num_input_tokens_seen": 15678320, + "step": 10260 + }, + { + "epoch": 31.683153013910356, + "grad_norm": 0.8052217960357666, + "learning_rate": 4.230735809493967e-05, + "loss": 0.2844, + "num_input_tokens_seen": 15685552, + "step": 10265 + }, + { + "epoch": 31.698608964451314, + "grad_norm": 0.7136706113815308, + "learning_rate": 4.2300272319463926e-05, + "loss": 0.2173, + "num_input_tokens_seen": 15693232, + "step": 10270 + }, + { + "epoch": 31.71406491499227, + "grad_norm": 0.4679260551929474, + "learning_rate": 4.2293183876068786e-05, + "loss": 0.2672, + "num_input_tokens_seen": 15700912, + "step": 10275 + }, + { + "epoch": 31.72952086553323, + "grad_norm": 0.6756159663200378, + "learning_rate": 4.228609276584737e-05, + "loss": 0.3588, + "num_input_tokens_seen": 15708848, + "step": 10280 + }, + { + "epoch": 31.74497681607419, + "grad_norm": 0.6697835922241211, + "learning_rate": 4.227899898989323e-05, + "loss": 0.2851, + "num_input_tokens_seen": 15716592, + "step": 10285 + }, + { + "epoch": 31.760432766615146, + "grad_norm": 0.8192082047462463, + "learning_rate": 4.2271902549300293e-05, + "loss": 0.245, + "num_input_tokens_seen": 15723760, + "step": 10290 + }, + { + "epoch": 31.775888717156104, + "grad_norm": 0.5367572903633118, + "learning_rate": 4.226480344516294e-05, + "loss": 0.4248, + "num_input_tokens_seen": 15731344, + "step": 10295 + }, + { + "epoch": 31.791344667697064, + "grad_norm": 0.39724451303482056, + "learning_rate": 4.2257701678575925e-05, + "loss": 0.2231, + "num_input_tokens_seen": 15739792, + "step": 10300 + }, + { + "epoch": 31.80680061823802, + "grad_norm": 0.5120067000389099, + "learning_rate": 4.225059725063444e-05, + "loss": 0.2407, + "num_input_tokens_seen": 15747408, + "step": 10305 + }, + { + "epoch": 31.82225656877898, + "grad_norm": 0.3083018958568573, + "learning_rate": 4.2243490162434074e-05, + "loss": 0.2938, + "num_input_tokens_seen": 15754736, + "step": 10310 + }, + { + "epoch": 31.83771251931994, + "grad_norm": 0.6205025911331177, + "learning_rate": 4.223638041507083e-05, + "loss": 0.3117, + "num_input_tokens_seen": 15762192, + "step": 10315 + }, + { + "epoch": 31.853168469860897, + "grad_norm": 0.5828123092651367, + "learning_rate": 4.2229268009641124e-05, + "loss": 0.3133, + "num_input_tokens_seen": 15770448, + "step": 10320 + }, + { + "epoch": 31.868624420401854, + "grad_norm": 0.7924843430519104, + "learning_rate": 4.222215294724177e-05, + "loss": 0.3312, + "num_input_tokens_seen": 15777872, + "step": 10325 + }, + { + "epoch": 31.88408037094281, + "grad_norm": 0.5721500515937805, + "learning_rate": 4.2215035228970005e-05, + "loss": 0.2628, + "num_input_tokens_seen": 15785648, + "step": 10330 + }, + { + "epoch": 31.899536321483772, + "grad_norm": 0.35242512822151184, + "learning_rate": 4.2207914855923464e-05, + "loss": 0.275, + "num_input_tokens_seen": 15793040, + "step": 10335 + }, + { + "epoch": 31.91499227202473, + "grad_norm": 0.3874468505382538, + "learning_rate": 4.220079182920021e-05, + "loss": 0.2644, + "num_input_tokens_seen": 15800848, + "step": 10340 + }, + { + "epoch": 31.930448222565687, + "grad_norm": 0.3500699996948242, + "learning_rate": 4.2193666149898705e-05, + "loss": 0.2925, + "num_input_tokens_seen": 15808432, + "step": 10345 + }, + { + "epoch": 31.945904173106648, + "grad_norm": 0.429762065410614, + "learning_rate": 4.21865378191178e-05, + "loss": 0.2978, + "num_input_tokens_seen": 15815856, + "step": 10350 + }, + { + "epoch": 31.961360123647605, + "grad_norm": 0.49037373065948486, + "learning_rate": 4.217940683795678e-05, + "loss": 0.3166, + "num_input_tokens_seen": 15823344, + "step": 10355 + }, + { + "epoch": 31.976816074188562, + "grad_norm": 0.49960917234420776, + "learning_rate": 4.217227320751534e-05, + "loss": 0.3086, + "num_input_tokens_seen": 15831056, + "step": 10360 + }, + { + "epoch": 31.99227202472952, + "grad_norm": 0.5772466659545898, + "learning_rate": 4.216513692889358e-05, + "loss": 0.2874, + "num_input_tokens_seen": 15839024, + "step": 10365 + }, + { + "epoch": 32.00618238021638, + "grad_norm": 0.896300733089447, + "learning_rate": 4.215799800319199e-05, + "loss": 0.2641, + "num_input_tokens_seen": 15845376, + "step": 10370 + }, + { + "epoch": 32.02163833075734, + "grad_norm": 0.4656875729560852, + "learning_rate": 4.2150856431511485e-05, + "loss": 0.2864, + "num_input_tokens_seen": 15853280, + "step": 10375 + }, + { + "epoch": 32.0370942812983, + "grad_norm": 0.44519996643066406, + "learning_rate": 4.214371221495339e-05, + "loss": 0.2579, + "num_input_tokens_seen": 15860352, + "step": 10380 + }, + { + "epoch": 32.052550231839255, + "grad_norm": 0.9609034657478333, + "learning_rate": 4.213656535461942e-05, + "loss": 0.3606, + "num_input_tokens_seen": 15867872, + "step": 10385 + }, + { + "epoch": 32.068006182380216, + "grad_norm": 0.45265987515449524, + "learning_rate": 4.2129415851611734e-05, + "loss": 0.2406, + "num_input_tokens_seen": 15875392, + "step": 10390 + }, + { + "epoch": 32.08346213292118, + "grad_norm": 0.5555565357208252, + "learning_rate": 4.2122263707032855e-05, + "loss": 0.3678, + "num_input_tokens_seen": 15883008, + "step": 10395 + }, + { + "epoch": 32.09891808346213, + "grad_norm": 0.7135065793991089, + "learning_rate": 4.211510892198574e-05, + "loss": 0.2809, + "num_input_tokens_seen": 15891232, + "step": 10400 + }, + { + "epoch": 32.09891808346213, + "eval_loss": 0.33650901913642883, + "eval_runtime": 6.3, + "eval_samples_per_second": 91.27, + "eval_steps_per_second": 22.857, + "num_input_tokens_seen": 15891232, + "step": 10400 + }, + { + "epoch": 32.11437403400309, + "grad_norm": 0.4945389926433563, + "learning_rate": 4.210795149757375e-05, + "loss": 0.2713, + "num_input_tokens_seen": 15899392, + "step": 10405 + }, + { + "epoch": 32.12982998454405, + "grad_norm": 0.5226321220397949, + "learning_rate": 4.210079143490065e-05, + "loss": 0.3038, + "num_input_tokens_seen": 15906496, + "step": 10410 + }, + { + "epoch": 32.145285935085006, + "grad_norm": 0.7635927796363831, + "learning_rate": 4.2093628735070604e-05, + "loss": 0.2804, + "num_input_tokens_seen": 15913920, + "step": 10415 + }, + { + "epoch": 32.16074188562597, + "grad_norm": 0.3532120883464813, + "learning_rate": 4.208646339918819e-05, + "loss": 0.2871, + "num_input_tokens_seen": 15921184, + "step": 10420 + }, + { + "epoch": 32.17619783616692, + "grad_norm": 0.5338006615638733, + "learning_rate": 4.2079295428358414e-05, + "loss": 0.3113, + "num_input_tokens_seen": 15928704, + "step": 10425 + }, + { + "epoch": 32.19165378670788, + "grad_norm": 0.7164326310157776, + "learning_rate": 4.207212482368664e-05, + "loss": 0.2663, + "num_input_tokens_seen": 15936800, + "step": 10430 + }, + { + "epoch": 32.20710973724884, + "grad_norm": 0.7742984294891357, + "learning_rate": 4.206495158627867e-05, + "loss": 0.3122, + "num_input_tokens_seen": 15945216, + "step": 10435 + }, + { + "epoch": 32.222565687789796, + "grad_norm": 0.6062212586402893, + "learning_rate": 4.205777571724073e-05, + "loss": 0.261, + "num_input_tokens_seen": 15952736, + "step": 10440 + }, + { + "epoch": 32.23802163833076, + "grad_norm": 0.402996689081192, + "learning_rate": 4.20505972176794e-05, + "loss": 0.3547, + "num_input_tokens_seen": 15960736, + "step": 10445 + }, + { + "epoch": 32.25347758887172, + "grad_norm": 0.5697880387306213, + "learning_rate": 4.204341608870171e-05, + "loss": 0.2553, + "num_input_tokens_seen": 15968256, + "step": 10450 + }, + { + "epoch": 32.26893353941267, + "grad_norm": 0.9647498726844788, + "learning_rate": 4.203623233141508e-05, + "loss": 0.2367, + "num_input_tokens_seen": 15975488, + "step": 10455 + }, + { + "epoch": 32.28438948995363, + "grad_norm": 0.5128077268600464, + "learning_rate": 4.2029045946927334e-05, + "loss": 0.3065, + "num_input_tokens_seen": 15983296, + "step": 10460 + }, + { + "epoch": 32.29984544049459, + "grad_norm": 1.3099311590194702, + "learning_rate": 4.20218569363467e-05, + "loss": 0.2376, + "num_input_tokens_seen": 15991488, + "step": 10465 + }, + { + "epoch": 32.31530139103555, + "grad_norm": 0.24936643242835999, + "learning_rate": 4.2014665300781834e-05, + "loss": 0.2818, + "num_input_tokens_seen": 15999744, + "step": 10470 + }, + { + "epoch": 32.33075734157651, + "grad_norm": 0.5298882722854614, + "learning_rate": 4.200747104134174e-05, + "loss": 0.3037, + "num_input_tokens_seen": 16007840, + "step": 10475 + }, + { + "epoch": 32.34621329211747, + "grad_norm": 0.5470780730247498, + "learning_rate": 4.200027415913588e-05, + "loss": 0.325, + "num_input_tokens_seen": 16015488, + "step": 10480 + }, + { + "epoch": 32.36166924265842, + "grad_norm": 0.46248185634613037, + "learning_rate": 4.1993074655274126e-05, + "loss": 0.403, + "num_input_tokens_seen": 16022496, + "step": 10485 + }, + { + "epoch": 32.37712519319938, + "grad_norm": 0.37832316756248474, + "learning_rate": 4.198587253086669e-05, + "loss": 0.2701, + "num_input_tokens_seen": 16030080, + "step": 10490 + }, + { + "epoch": 32.39258114374034, + "grad_norm": 0.468071848154068, + "learning_rate": 4.197866778702426e-05, + "loss": 0.2451, + "num_input_tokens_seen": 16037536, + "step": 10495 + }, + { + "epoch": 32.4080370942813, + "grad_norm": 0.41976872086524963, + "learning_rate": 4.197146042485789e-05, + "loss": 0.2743, + "num_input_tokens_seen": 16045440, + "step": 10500 + }, + { + "epoch": 32.42349304482226, + "grad_norm": 0.5778932571411133, + "learning_rate": 4.1964250445479046e-05, + "loss": 0.2184, + "num_input_tokens_seen": 16053024, + "step": 10505 + }, + { + "epoch": 32.43894899536321, + "grad_norm": 0.6987515091896057, + "learning_rate": 4.19570378499996e-05, + "loss": 0.2355, + "num_input_tokens_seen": 16060512, + "step": 10510 + }, + { + "epoch": 32.45440494590417, + "grad_norm": 0.8956790566444397, + "learning_rate": 4.194982263953182e-05, + "loss": 0.2159, + "num_input_tokens_seen": 16067648, + "step": 10515 + }, + { + "epoch": 32.469860896445134, + "grad_norm": 0.644355058670044, + "learning_rate": 4.194260481518838e-05, + "loss": 0.2833, + "num_input_tokens_seen": 16075296, + "step": 10520 + }, + { + "epoch": 32.48531684698609, + "grad_norm": 0.47336408495903015, + "learning_rate": 4.1935384378082366e-05, + "loss": 0.2811, + "num_input_tokens_seen": 16083136, + "step": 10525 + }, + { + "epoch": 32.50077279752705, + "grad_norm": 0.32257983088493347, + "learning_rate": 4.1928161329327267e-05, + "loss": 0.2807, + "num_input_tokens_seen": 16090144, + "step": 10530 + }, + { + "epoch": 32.51622874806801, + "grad_norm": 0.4642952084541321, + "learning_rate": 4.1920935670036945e-05, + "loss": 0.2691, + "num_input_tokens_seen": 16098208, + "step": 10535 + }, + { + "epoch": 32.53168469860896, + "grad_norm": 0.6482694149017334, + "learning_rate": 4.1913707401325705e-05, + "loss": 0.2785, + "num_input_tokens_seen": 16106208, + "step": 10540 + }, + { + "epoch": 32.547140649149924, + "grad_norm": 0.6181731224060059, + "learning_rate": 4.1906476524308235e-05, + "loss": 0.25, + "num_input_tokens_seen": 16114304, + "step": 10545 + }, + { + "epoch": 32.56259659969088, + "grad_norm": 0.49179911613464355, + "learning_rate": 4.189924304009962e-05, + "loss": 0.259, + "num_input_tokens_seen": 16121888, + "step": 10550 + }, + { + "epoch": 32.57805255023184, + "grad_norm": 0.5712064504623413, + "learning_rate": 4.189200694981537e-05, + "loss": 0.3285, + "num_input_tokens_seen": 16129440, + "step": 10555 + }, + { + "epoch": 32.5935085007728, + "grad_norm": 0.6262872219085693, + "learning_rate": 4.188476825457136e-05, + "loss": 0.2949, + "num_input_tokens_seen": 16137216, + "step": 10560 + }, + { + "epoch": 32.60896445131375, + "grad_norm": 0.23655512928962708, + "learning_rate": 4.18775269554839e-05, + "loss": 0.252, + "num_input_tokens_seen": 16144832, + "step": 10565 + }, + { + "epoch": 32.624420401854714, + "grad_norm": 0.6548280715942383, + "learning_rate": 4.187028305366969e-05, + "loss": 0.3044, + "num_input_tokens_seen": 16152832, + "step": 10570 + }, + { + "epoch": 32.639876352395675, + "grad_norm": 0.6177781224250793, + "learning_rate": 4.1863036550245824e-05, + "loss": 0.3329, + "num_input_tokens_seen": 16160416, + "step": 10575 + }, + { + "epoch": 32.65533230293663, + "grad_norm": 0.526555061340332, + "learning_rate": 4.1855787446329806e-05, + "loss": 0.2549, + "num_input_tokens_seen": 16168160, + "step": 10580 + }, + { + "epoch": 32.67078825347759, + "grad_norm": 0.6734768748283386, + "learning_rate": 4.184853574303955e-05, + "loss": 0.327, + "num_input_tokens_seen": 16175360, + "step": 10585 + }, + { + "epoch": 32.68624420401855, + "grad_norm": 0.5789259076118469, + "learning_rate": 4.184128144149334e-05, + "loss": 0.3852, + "num_input_tokens_seen": 16182752, + "step": 10590 + }, + { + "epoch": 32.701700154559504, + "grad_norm": 0.353307843208313, + "learning_rate": 4.1834024542809896e-05, + "loss": 0.2505, + "num_input_tokens_seen": 16190144, + "step": 10595 + }, + { + "epoch": 32.717156105100464, + "grad_norm": 0.5663338899612427, + "learning_rate": 4.1826765048108315e-05, + "loss": 0.2892, + "num_input_tokens_seen": 16197472, + "step": 10600 + }, + { + "epoch": 32.717156105100464, + "eval_loss": 0.3371313512325287, + "eval_runtime": 6.3175, + "eval_samples_per_second": 91.017, + "eval_steps_per_second": 22.794, + "num_input_tokens_seen": 16197472, + "step": 10600 + }, + { + "epoch": 32.732612055641425, + "grad_norm": 0.34873032569885254, + "learning_rate": 4.181950295850811e-05, + "loss": 0.2738, + "num_input_tokens_seen": 16205344, + "step": 10605 + }, + { + "epoch": 32.74806800618238, + "grad_norm": 0.46986183524131775, + "learning_rate": 4.181223827512918e-05, + "loss": 0.2598, + "num_input_tokens_seen": 16212896, + "step": 10610 + }, + { + "epoch": 32.76352395672334, + "grad_norm": 1.117040991783142, + "learning_rate": 4.180497099909183e-05, + "loss": 0.2511, + "num_input_tokens_seen": 16220544, + "step": 10615 + }, + { + "epoch": 32.778979907264294, + "grad_norm": 0.6258671283721924, + "learning_rate": 4.179770113151677e-05, + "loss": 0.3062, + "num_input_tokens_seen": 16227968, + "step": 10620 + }, + { + "epoch": 32.794435857805254, + "grad_norm": 0.3326128125190735, + "learning_rate": 4.179042867352511e-05, + "loss": 0.2996, + "num_input_tokens_seen": 16235840, + "step": 10625 + }, + { + "epoch": 32.809891808346215, + "grad_norm": 0.5758018493652344, + "learning_rate": 4.1783153626238334e-05, + "loss": 0.2471, + "num_input_tokens_seen": 16243616, + "step": 10630 + }, + { + "epoch": 32.82534775888717, + "grad_norm": 0.6333622336387634, + "learning_rate": 4.177587599077836e-05, + "loss": 0.2529, + "num_input_tokens_seen": 16250944, + "step": 10635 + }, + { + "epoch": 32.84080370942813, + "grad_norm": 0.3778810501098633, + "learning_rate": 4.1768595768267494e-05, + "loss": 0.2598, + "num_input_tokens_seen": 16258880, + "step": 10640 + }, + { + "epoch": 32.85625965996909, + "grad_norm": 0.4363964796066284, + "learning_rate": 4.176131295982843e-05, + "loss": 0.2779, + "num_input_tokens_seen": 16266496, + "step": 10645 + }, + { + "epoch": 32.871715610510044, + "grad_norm": 1.155600905418396, + "learning_rate": 4.1754027566584276e-05, + "loss": 0.3326, + "num_input_tokens_seen": 16274016, + "step": 10650 + }, + { + "epoch": 32.887171561051005, + "grad_norm": 0.5357420444488525, + "learning_rate": 4.174673958965852e-05, + "loss": 0.2633, + "num_input_tokens_seen": 16281696, + "step": 10655 + }, + { + "epoch": 32.902627511591966, + "grad_norm": 0.5983253717422485, + "learning_rate": 4.173944903017507e-05, + "loss": 0.3034, + "num_input_tokens_seen": 16289632, + "step": 10660 + }, + { + "epoch": 32.91808346213292, + "grad_norm": 0.5821060538291931, + "learning_rate": 4.173215588925822e-05, + "loss": 0.2686, + "num_input_tokens_seen": 16297344, + "step": 10665 + }, + { + "epoch": 32.93353941267388, + "grad_norm": 0.8568161725997925, + "learning_rate": 4.172486016803266e-05, + "loss": 0.2827, + "num_input_tokens_seen": 16304768, + "step": 10670 + }, + { + "epoch": 32.948995363214834, + "grad_norm": 1.036049246788025, + "learning_rate": 4.171756186762349e-05, + "loss": 0.2534, + "num_input_tokens_seen": 16312032, + "step": 10675 + }, + { + "epoch": 32.964451313755795, + "grad_norm": 0.4691803455352783, + "learning_rate": 4.171026098915619e-05, + "loss": 0.2605, + "num_input_tokens_seen": 16319104, + "step": 10680 + }, + { + "epoch": 32.979907264296756, + "grad_norm": 0.402124285697937, + "learning_rate": 4.170295753375665e-05, + "loss": 0.2728, + "num_input_tokens_seen": 16326816, + "step": 10685 + }, + { + "epoch": 32.99536321483771, + "grad_norm": 0.5058770179748535, + "learning_rate": 4.169565150255117e-05, + "loss": 0.2536, + "num_input_tokens_seen": 16334432, + "step": 10690 + }, + { + "epoch": 33.009273570324574, + "grad_norm": 0.39978158473968506, + "learning_rate": 4.16883428966664e-05, + "loss": 0.2556, + "num_input_tokens_seen": 16340928, + "step": 10695 + }, + { + "epoch": 33.024729520865534, + "grad_norm": 0.5495910048484802, + "learning_rate": 4.168103171722944e-05, + "loss": 0.2842, + "num_input_tokens_seen": 16348672, + "step": 10700 + }, + { + "epoch": 33.04018547140649, + "grad_norm": 0.7531049251556396, + "learning_rate": 4.167371796536777e-05, + "loss": 0.3089, + "num_input_tokens_seen": 16356000, + "step": 10705 + }, + { + "epoch": 33.05564142194745, + "grad_norm": 0.8697512149810791, + "learning_rate": 4.166640164220924e-05, + "loss": 0.2863, + "num_input_tokens_seen": 16363904, + "step": 10710 + }, + { + "epoch": 33.07109737248841, + "grad_norm": 0.6259536147117615, + "learning_rate": 4.1659082748882144e-05, + "loss": 0.2234, + "num_input_tokens_seen": 16371488, + "step": 10715 + }, + { + "epoch": 33.086553323029364, + "grad_norm": 0.4593915641307831, + "learning_rate": 4.1651761286515135e-05, + "loss": 0.2194, + "num_input_tokens_seen": 16379776, + "step": 10720 + }, + { + "epoch": 33.102009273570324, + "grad_norm": 0.4678657054901123, + "learning_rate": 4.164443725623728e-05, + "loss": 0.2868, + "num_input_tokens_seen": 16387360, + "step": 10725 + }, + { + "epoch": 33.117465224111285, + "grad_norm": 0.3755057752132416, + "learning_rate": 4.163711065917802e-05, + "loss": 0.3219, + "num_input_tokens_seen": 16394944, + "step": 10730 + }, + { + "epoch": 33.13292117465224, + "grad_norm": 0.5068818926811218, + "learning_rate": 4.1629781496467234e-05, + "loss": 0.339, + "num_input_tokens_seen": 16402592, + "step": 10735 + }, + { + "epoch": 33.1483771251932, + "grad_norm": 0.5669158697128296, + "learning_rate": 4.1622449769235164e-05, + "loss": 0.2834, + "num_input_tokens_seen": 16410240, + "step": 10740 + }, + { + "epoch": 33.16383307573416, + "grad_norm": 0.6034325361251831, + "learning_rate": 4.161511547861243e-05, + "loss": 0.2805, + "num_input_tokens_seen": 16417888, + "step": 10745 + }, + { + "epoch": 33.179289026275114, + "grad_norm": 0.5986834168434143, + "learning_rate": 4.1607778625730104e-05, + "loss": 0.2713, + "num_input_tokens_seen": 16425696, + "step": 10750 + }, + { + "epoch": 33.194744976816075, + "grad_norm": 0.5046502947807312, + "learning_rate": 4.160043921171961e-05, + "loss": 0.223, + "num_input_tokens_seen": 16433472, + "step": 10755 + }, + { + "epoch": 33.210200927357036, + "grad_norm": 0.5558559894561768, + "learning_rate": 4.159309723771276e-05, + "loss": 0.4528, + "num_input_tokens_seen": 16441152, + "step": 10760 + }, + { + "epoch": 33.22565687789799, + "grad_norm": 0.3867781162261963, + "learning_rate": 4.158575270484181e-05, + "loss": 0.2288, + "num_input_tokens_seen": 16449056, + "step": 10765 + }, + { + "epoch": 33.24111282843895, + "grad_norm": 0.7799686789512634, + "learning_rate": 4.157840561423936e-05, + "loss": 0.2938, + "num_input_tokens_seen": 16456352, + "step": 10770 + }, + { + "epoch": 33.256568778979904, + "grad_norm": 0.46214067935943604, + "learning_rate": 4.1571055967038416e-05, + "loss": 0.2478, + "num_input_tokens_seen": 16463680, + "step": 10775 + }, + { + "epoch": 33.272024729520865, + "grad_norm": 0.45140913128852844, + "learning_rate": 4.156370376437241e-05, + "loss": 0.2206, + "num_input_tokens_seen": 16471296, + "step": 10780 + }, + { + "epoch": 33.287480680061826, + "grad_norm": 0.8329147100448608, + "learning_rate": 4.155634900737513e-05, + "loss": 0.2834, + "num_input_tokens_seen": 16478944, + "step": 10785 + }, + { + "epoch": 33.30293663060278, + "grad_norm": 0.5158499479293823, + "learning_rate": 4.1548991697180764e-05, + "loss": 0.3444, + "num_input_tokens_seen": 16486464, + "step": 10790 + }, + { + "epoch": 33.31839258114374, + "grad_norm": 0.7548772692680359, + "learning_rate": 4.1541631834923914e-05, + "loss": 0.2695, + "num_input_tokens_seen": 16494144, + "step": 10795 + }, + { + "epoch": 33.3338485316847, + "grad_norm": 0.3607438802719116, + "learning_rate": 4.153426942173956e-05, + "loss": 0.2193, + "num_input_tokens_seen": 16500992, + "step": 10800 + }, + { + "epoch": 33.3338485316847, + "eval_loss": 0.33405768871307373, + "eval_runtime": 6.2926, + "eval_samples_per_second": 91.378, + "eval_steps_per_second": 22.884, + "num_input_tokens_seen": 16500992, + "step": 10800 + }, + { + "epoch": 33.349304482225655, + "grad_norm": 0.5317320227622986, + "learning_rate": 4.152690445876308e-05, + "loss": 0.324, + "num_input_tokens_seen": 16509088, + "step": 10805 + }, + { + "epoch": 33.364760432766616, + "grad_norm": 0.605313777923584, + "learning_rate": 4.1519536947130245e-05, + "loss": 0.2463, + "num_input_tokens_seen": 16517024, + "step": 10810 + }, + { + "epoch": 33.38021638330758, + "grad_norm": 0.5392521619796753, + "learning_rate": 4.151216688797722e-05, + "loss": 0.3092, + "num_input_tokens_seen": 16524768, + "step": 10815 + }, + { + "epoch": 33.39567233384853, + "grad_norm": 0.466656357049942, + "learning_rate": 4.150479428244054e-05, + "loss": 0.2349, + "num_input_tokens_seen": 16532512, + "step": 10820 + }, + { + "epoch": 33.41112828438949, + "grad_norm": 0.5393986701965332, + "learning_rate": 4.1497419131657176e-05, + "loss": 0.2691, + "num_input_tokens_seen": 16540224, + "step": 10825 + }, + { + "epoch": 33.426584234930445, + "grad_norm": 1.1573375463485718, + "learning_rate": 4.149004143676447e-05, + "loss": 0.327, + "num_input_tokens_seen": 16547584, + "step": 10830 + }, + { + "epoch": 33.442040185471406, + "grad_norm": 0.49528124928474426, + "learning_rate": 4.148266119890015e-05, + "loss": 0.257, + "num_input_tokens_seen": 16555104, + "step": 10835 + }, + { + "epoch": 33.45749613601237, + "grad_norm": 0.4797780215740204, + "learning_rate": 4.1475278419202324e-05, + "loss": 0.2193, + "num_input_tokens_seen": 16562464, + "step": 10840 + }, + { + "epoch": 33.47295208655332, + "grad_norm": 0.5116164684295654, + "learning_rate": 4.146789309880953e-05, + "loss": 0.2375, + "num_input_tokens_seen": 16569472, + "step": 10845 + }, + { + "epoch": 33.48840803709428, + "grad_norm": 0.7316502332687378, + "learning_rate": 4.146050523886068e-05, + "loss": 0.2837, + "num_input_tokens_seen": 16577056, + "step": 10850 + }, + { + "epoch": 33.50386398763524, + "grad_norm": 0.6634213328361511, + "learning_rate": 4.1453114840495055e-05, + "loss": 0.2686, + "num_input_tokens_seen": 16584864, + "step": 10855 + }, + { + "epoch": 33.519319938176196, + "grad_norm": 0.47906938195228577, + "learning_rate": 4.1445721904852364e-05, + "loss": 0.2552, + "num_input_tokens_seen": 16592960, + "step": 10860 + }, + { + "epoch": 33.53477588871716, + "grad_norm": 0.8630818128585815, + "learning_rate": 4.143832643307269e-05, + "loss": 0.2639, + "num_input_tokens_seen": 16600480, + "step": 10865 + }, + { + "epoch": 33.55023183925812, + "grad_norm": 0.8438584208488464, + "learning_rate": 4.1430928426296503e-05, + "loss": 0.3184, + "num_input_tokens_seen": 16608320, + "step": 10870 + }, + { + "epoch": 33.56568778979907, + "grad_norm": 0.4904278814792633, + "learning_rate": 4.142352788566466e-05, + "loss": 0.3279, + "num_input_tokens_seen": 16615872, + "step": 10875 + }, + { + "epoch": 33.58114374034003, + "grad_norm": 0.4150509834289551, + "learning_rate": 4.1416124812318424e-05, + "loss": 0.3081, + "num_input_tokens_seen": 16623072, + "step": 10880 + }, + { + "epoch": 33.59659969088099, + "grad_norm": 0.3447813093662262, + "learning_rate": 4.1408719207399453e-05, + "loss": 0.2802, + "num_input_tokens_seen": 16630016, + "step": 10885 + }, + { + "epoch": 33.61205564142195, + "grad_norm": 0.5646242499351501, + "learning_rate": 4.140131107204978e-05, + "loss": 0.265, + "num_input_tokens_seen": 16638144, + "step": 10890 + }, + { + "epoch": 33.62751159196291, + "grad_norm": 0.8926159143447876, + "learning_rate": 4.139390040741182e-05, + "loss": 0.2765, + "num_input_tokens_seen": 16646336, + "step": 10895 + }, + { + "epoch": 33.64296754250386, + "grad_norm": 0.6287103891372681, + "learning_rate": 4.1386487214628396e-05, + "loss": 0.3029, + "num_input_tokens_seen": 16654304, + "step": 10900 + }, + { + "epoch": 33.65842349304482, + "grad_norm": 0.4183264672756195, + "learning_rate": 4.137907149484272e-05, + "loss": 0.3105, + "num_input_tokens_seen": 16662304, + "step": 10905 + }, + { + "epoch": 33.67387944358578, + "grad_norm": 0.42322972416877747, + "learning_rate": 4.137165324919839e-05, + "loss": 0.2937, + "num_input_tokens_seen": 16670400, + "step": 10910 + }, + { + "epoch": 33.689335394126736, + "grad_norm": 0.3977496325969696, + "learning_rate": 4.136423247883939e-05, + "loss": 0.2617, + "num_input_tokens_seen": 16677728, + "step": 10915 + }, + { + "epoch": 33.7047913446677, + "grad_norm": 0.4152752459049225, + "learning_rate": 4.135680918491009e-05, + "loss": 0.222, + "num_input_tokens_seen": 16685280, + "step": 10920 + }, + { + "epoch": 33.72024729520866, + "grad_norm": 0.8185412883758545, + "learning_rate": 4.1349383368555265e-05, + "loss": 0.341, + "num_input_tokens_seen": 16692608, + "step": 10925 + }, + { + "epoch": 33.73570324574961, + "grad_norm": 0.4986433684825897, + "learning_rate": 4.1341955030920065e-05, + "loss": 0.3364, + "num_input_tokens_seen": 16700736, + "step": 10930 + }, + { + "epoch": 33.75115919629057, + "grad_norm": 0.627949059009552, + "learning_rate": 4.1334524173150036e-05, + "loss": 0.3932, + "num_input_tokens_seen": 16708160, + "step": 10935 + }, + { + "epoch": 33.76661514683153, + "grad_norm": 0.6256363391876221, + "learning_rate": 4.13270907963911e-05, + "loss": 0.2343, + "num_input_tokens_seen": 16715744, + "step": 10940 + }, + { + "epoch": 33.78207109737249, + "grad_norm": 0.4442964494228363, + "learning_rate": 4.131965490178959e-05, + "loss": 0.2865, + "num_input_tokens_seen": 16723744, + "step": 10945 + }, + { + "epoch": 33.79752704791345, + "grad_norm": 0.5951035022735596, + "learning_rate": 4.131221649049222e-05, + "loss": 0.2853, + "num_input_tokens_seen": 16731744, + "step": 10950 + }, + { + "epoch": 33.8129829984544, + "grad_norm": 0.6733617782592773, + "learning_rate": 4.130477556364606e-05, + "loss": 0.269, + "num_input_tokens_seen": 16739904, + "step": 10955 + }, + { + "epoch": 33.82843894899536, + "grad_norm": 0.38272279500961304, + "learning_rate": 4.129733212239861e-05, + "loss": 0.2275, + "num_input_tokens_seen": 16747296, + "step": 10960 + }, + { + "epoch": 33.84389489953632, + "grad_norm": 1.1275099515914917, + "learning_rate": 4.128988616789774e-05, + "loss": 0.2375, + "num_input_tokens_seen": 16754432, + "step": 10965 + }, + { + "epoch": 33.85935085007728, + "grad_norm": 0.5487701296806335, + "learning_rate": 4.1282437701291724e-05, + "loss": 0.3737, + "num_input_tokens_seen": 16762016, + "step": 10970 + }, + { + "epoch": 33.87480680061824, + "grad_norm": 0.3713403642177582, + "learning_rate": 4.1274986723729184e-05, + "loss": 0.2868, + "num_input_tokens_seen": 16769312, + "step": 10975 + }, + { + "epoch": 33.8902627511592, + "grad_norm": 0.5884156227111816, + "learning_rate": 4.126753323635917e-05, + "loss": 0.2876, + "num_input_tokens_seen": 16777088, + "step": 10980 + }, + { + "epoch": 33.90571870170015, + "grad_norm": 0.482374370098114, + "learning_rate": 4.12600772403311e-05, + "loss": 0.2649, + "num_input_tokens_seen": 16784768, + "step": 10985 + }, + { + "epoch": 33.92117465224111, + "grad_norm": 0.5613670945167542, + "learning_rate": 4.125261873679479e-05, + "loss": 0.2604, + "num_input_tokens_seen": 16792576, + "step": 10990 + }, + { + "epoch": 33.936630602782074, + "grad_norm": 0.5626015067100525, + "learning_rate": 4.124515772690042e-05, + "loss": 0.2153, + "num_input_tokens_seen": 16799872, + "step": 10995 + }, + { + "epoch": 33.95208655332303, + "grad_norm": 0.4851844608783722, + "learning_rate": 4.123769421179858e-05, + "loss": 0.2344, + "num_input_tokens_seen": 16807808, + "step": 11000 + }, + { + "epoch": 33.95208655332303, + "eval_loss": 0.33431512117385864, + "eval_runtime": 6.3258, + "eval_samples_per_second": 90.897, + "eval_steps_per_second": 22.764, + "num_input_tokens_seen": 16807808, + "step": 11000 + }, + { + "epoch": 33.96754250386399, + "grad_norm": 0.39309757947921753, + "learning_rate": 4.1230228192640236e-05, + "loss": 0.2301, + "num_input_tokens_seen": 16815840, + "step": 11005 + }, + { + "epoch": 33.98299845440495, + "grad_norm": 0.964069664478302, + "learning_rate": 4.122275967057675e-05, + "loss": 0.2894, + "num_input_tokens_seen": 16823680, + "step": 11010 + }, + { + "epoch": 33.9984544049459, + "grad_norm": 0.4506332576274872, + "learning_rate": 4.1215288646759846e-05, + "loss": 0.3207, + "num_input_tokens_seen": 16831264, + "step": 11015 + }, + { + "epoch": 34.01236476043277, + "grad_norm": 0.6462857723236084, + "learning_rate": 4.120781512234166e-05, + "loss": 0.2938, + "num_input_tokens_seen": 16838336, + "step": 11020 + }, + { + "epoch": 34.02782071097373, + "grad_norm": 0.4097870886325836, + "learning_rate": 4.120033909847471e-05, + "loss": 0.3058, + "num_input_tokens_seen": 16845952, + "step": 11025 + }, + { + "epoch": 34.04327666151468, + "grad_norm": 0.35269248485565186, + "learning_rate": 4.119286057631187e-05, + "loss": 0.2073, + "num_input_tokens_seen": 16853440, + "step": 11030 + }, + { + "epoch": 34.05873261205564, + "grad_norm": 0.45569556951522827, + "learning_rate": 4.118537955700646e-05, + "loss": 0.2279, + "num_input_tokens_seen": 16860800, + "step": 11035 + }, + { + "epoch": 34.074188562596596, + "grad_norm": 0.6904122829437256, + "learning_rate": 4.11778960417121e-05, + "loss": 0.4197, + "num_input_tokens_seen": 16868288, + "step": 11040 + }, + { + "epoch": 34.08964451313756, + "grad_norm": 0.6630672216415405, + "learning_rate": 4.117041003158288e-05, + "loss": 0.3672, + "num_input_tokens_seen": 16875808, + "step": 11045 + }, + { + "epoch": 34.10510046367852, + "grad_norm": 0.5365743041038513, + "learning_rate": 4.1162921527773215e-05, + "loss": 0.2877, + "num_input_tokens_seen": 16884032, + "step": 11050 + }, + { + "epoch": 34.12055641421947, + "grad_norm": 0.48944851756095886, + "learning_rate": 4.115543053143794e-05, + "loss": 0.2243, + "num_input_tokens_seen": 16891616, + "step": 11055 + }, + { + "epoch": 34.13601236476043, + "grad_norm": 0.579908013343811, + "learning_rate": 4.114793704373226e-05, + "loss": 0.3108, + "num_input_tokens_seen": 16898944, + "step": 11060 + }, + { + "epoch": 34.15146831530139, + "grad_norm": 0.26483792066574097, + "learning_rate": 4.114044106581175e-05, + "loss": 0.2854, + "num_input_tokens_seen": 16906144, + "step": 11065 + }, + { + "epoch": 34.16692426584235, + "grad_norm": 1.593031644821167, + "learning_rate": 4.11329425988324e-05, + "loss": 0.3442, + "num_input_tokens_seen": 16913856, + "step": 11070 + }, + { + "epoch": 34.18238021638331, + "grad_norm": 0.5521802306175232, + "learning_rate": 4.112544164395056e-05, + "loss": 0.2634, + "num_input_tokens_seen": 16921120, + "step": 11075 + }, + { + "epoch": 34.19783616692427, + "grad_norm": 0.42971915006637573, + "learning_rate": 4.111793820232297e-05, + "loss": 0.2478, + "num_input_tokens_seen": 16929216, + "step": 11080 + }, + { + "epoch": 34.21329211746522, + "grad_norm": 0.5247380137443542, + "learning_rate": 4.1110432275106767e-05, + "loss": 0.2836, + "num_input_tokens_seen": 16936320, + "step": 11085 + }, + { + "epoch": 34.22874806800618, + "grad_norm": 0.4390627145767212, + "learning_rate": 4.110292386345944e-05, + "loss": 0.2517, + "num_input_tokens_seen": 16943488, + "step": 11090 + }, + { + "epoch": 34.244204018547144, + "grad_norm": 0.4149060845375061, + "learning_rate": 4.109541296853891e-05, + "loss": 0.3119, + "num_input_tokens_seen": 16951520, + "step": 11095 + }, + { + "epoch": 34.2596599690881, + "grad_norm": 0.7236697673797607, + "learning_rate": 4.108789959150341e-05, + "loss": 0.2602, + "num_input_tokens_seen": 16958784, + "step": 11100 + }, + { + "epoch": 34.27511591962906, + "grad_norm": 0.6080955862998962, + "learning_rate": 4.108038373351163e-05, + "loss": 0.3248, + "num_input_tokens_seen": 16966080, + "step": 11105 + }, + { + "epoch": 34.29057187017001, + "grad_norm": 0.8201320767402649, + "learning_rate": 4.10728653957226e-05, + "loss": 0.2919, + "num_input_tokens_seen": 16973536, + "step": 11110 + }, + { + "epoch": 34.30602782071097, + "grad_norm": 0.6771089434623718, + "learning_rate": 4.106534457929575e-05, + "loss": 0.2836, + "num_input_tokens_seen": 16981376, + "step": 11115 + }, + { + "epoch": 34.321483771251934, + "grad_norm": 0.5430486798286438, + "learning_rate": 4.105782128539086e-05, + "loss": 0.2669, + "num_input_tokens_seen": 16989760, + "step": 11120 + }, + { + "epoch": 34.33693972179289, + "grad_norm": 0.3784869909286499, + "learning_rate": 4.1050295515168144e-05, + "loss": 0.2648, + "num_input_tokens_seen": 16997440, + "step": 11125 + }, + { + "epoch": 34.35239567233385, + "grad_norm": 0.5437233448028564, + "learning_rate": 4.1042767269788155e-05, + "loss": 0.3225, + "num_input_tokens_seen": 17005600, + "step": 11130 + }, + { + "epoch": 34.36785162287481, + "grad_norm": 0.30865907669067383, + "learning_rate": 4.103523655041185e-05, + "loss": 0.2008, + "num_input_tokens_seen": 17012960, + "step": 11135 + }, + { + "epoch": 34.38330757341576, + "grad_norm": 0.5708175301551819, + "learning_rate": 4.102770335820055e-05, + "loss": 0.2402, + "num_input_tokens_seen": 17021088, + "step": 11140 + }, + { + "epoch": 34.398763523956724, + "grad_norm": 0.4537992775440216, + "learning_rate": 4.1020167694315984e-05, + "loss": 0.2562, + "num_input_tokens_seen": 17028736, + "step": 11145 + }, + { + "epoch": 34.414219474497685, + "grad_norm": 0.5733794569969177, + "learning_rate": 4.101262955992023e-05, + "loss": 0.2731, + "num_input_tokens_seen": 17036544, + "step": 11150 + }, + { + "epoch": 34.42967542503864, + "grad_norm": 0.7268709540367126, + "learning_rate": 4.100508895617578e-05, + "loss": 0.2251, + "num_input_tokens_seen": 17043936, + "step": 11155 + }, + { + "epoch": 34.4451313755796, + "grad_norm": 0.4137454032897949, + "learning_rate": 4.099754588424547e-05, + "loss": 0.2766, + "num_input_tokens_seen": 17051552, + "step": 11160 + }, + { + "epoch": 34.46058732612055, + "grad_norm": 0.5163319110870361, + "learning_rate": 4.0990000345292546e-05, + "loss": 0.2457, + "num_input_tokens_seen": 17058976, + "step": 11165 + }, + { + "epoch": 34.476043276661514, + "grad_norm": 0.7630552053451538, + "learning_rate": 4.098245234048064e-05, + "loss": 0.3508, + "num_input_tokens_seen": 17067008, + "step": 11170 + }, + { + "epoch": 34.491499227202475, + "grad_norm": 0.6769641041755676, + "learning_rate": 4.0974901870973726e-05, + "loss": 0.4011, + "num_input_tokens_seen": 17074880, + "step": 11175 + }, + { + "epoch": 34.50695517774343, + "grad_norm": 0.7769568562507629, + "learning_rate": 4.096734893793619e-05, + "loss": 0.3369, + "num_input_tokens_seen": 17082528, + "step": 11180 + }, + { + "epoch": 34.52241112828439, + "grad_norm": 0.5854775309562683, + "learning_rate": 4.095979354253279e-05, + "loss": 0.2763, + "num_input_tokens_seen": 17089696, + "step": 11185 + }, + { + "epoch": 34.53786707882535, + "grad_norm": 0.6252934336662292, + "learning_rate": 4.0952235685928656e-05, + "loss": 0.2536, + "num_input_tokens_seen": 17097024, + "step": 11190 + }, + { + "epoch": 34.553323029366304, + "grad_norm": 0.6213139295578003, + "learning_rate": 4.094467536928932e-05, + "loss": 0.2413, + "num_input_tokens_seen": 17104736, + "step": 11195 + }, + { + "epoch": 34.568778979907265, + "grad_norm": 0.32232987880706787, + "learning_rate": 4.093711259378067e-05, + "loss": 0.2338, + "num_input_tokens_seen": 17112928, + "step": 11200 + }, + { + "epoch": 34.568778979907265, + "eval_loss": 0.3325132727622986, + "eval_runtime": 6.3074, + "eval_samples_per_second": 91.163, + "eval_steps_per_second": 22.83, + "num_input_tokens_seen": 17112928, + "step": 11200 + }, + { + "epoch": 34.584234930448225, + "grad_norm": 0.5411344766616821, + "learning_rate": 4.092954736056897e-05, + "loss": 0.2373, + "num_input_tokens_seen": 17120256, + "step": 11205 + }, + { + "epoch": 34.59969088098918, + "grad_norm": 0.6928764581680298, + "learning_rate": 4.09219796708209e-05, + "loss": 0.274, + "num_input_tokens_seen": 17127584, + "step": 11210 + }, + { + "epoch": 34.61514683153014, + "grad_norm": 0.4733209013938904, + "learning_rate": 4.0914409525703464e-05, + "loss": 0.2602, + "num_input_tokens_seen": 17135488, + "step": 11215 + }, + { + "epoch": 34.630602782071094, + "grad_norm": 0.49553003907203674, + "learning_rate": 4.090683692638408e-05, + "loss": 0.3403, + "num_input_tokens_seen": 17143104, + "step": 11220 + }, + { + "epoch": 34.646058732612055, + "grad_norm": 0.6413846015930176, + "learning_rate": 4.089926187403056e-05, + "loss": 0.3327, + "num_input_tokens_seen": 17150624, + "step": 11225 + }, + { + "epoch": 34.661514683153015, + "grad_norm": 0.7520970702171326, + "learning_rate": 4.0891684369811044e-05, + "loss": 0.2803, + "num_input_tokens_seen": 17157984, + "step": 11230 + }, + { + "epoch": 34.67697063369397, + "grad_norm": 0.5333845615386963, + "learning_rate": 4.0884104414894107e-05, + "loss": 0.2772, + "num_input_tokens_seen": 17165792, + "step": 11235 + }, + { + "epoch": 34.69242658423493, + "grad_norm": 0.4259519875049591, + "learning_rate": 4.087652201044864e-05, + "loss": 0.2666, + "num_input_tokens_seen": 17173376, + "step": 11240 + }, + { + "epoch": 34.70788253477589, + "grad_norm": 0.8669977188110352, + "learning_rate": 4.086893715764397e-05, + "loss": 0.3044, + "num_input_tokens_seen": 17180704, + "step": 11245 + }, + { + "epoch": 34.723338485316845, + "grad_norm": 0.5869132280349731, + "learning_rate": 4.086134985764977e-05, + "loss": 0.2235, + "num_input_tokens_seen": 17188544, + "step": 11250 + }, + { + "epoch": 34.738794435857805, + "grad_norm": 0.857377827167511, + "learning_rate": 4.0853760111636085e-05, + "loss": 0.2457, + "num_input_tokens_seen": 17196256, + "step": 11255 + }, + { + "epoch": 34.754250386398766, + "grad_norm": 0.420545756816864, + "learning_rate": 4.084616792077337e-05, + "loss": 0.2451, + "num_input_tokens_seen": 17204128, + "step": 11260 + }, + { + "epoch": 34.76970633693972, + "grad_norm": 0.47880011796951294, + "learning_rate": 4.083857328623243e-05, + "loss": 0.3336, + "num_input_tokens_seen": 17212192, + "step": 11265 + }, + { + "epoch": 34.78516228748068, + "grad_norm": 0.6168875694274902, + "learning_rate": 4.083097620918444e-05, + "loss": 0.3449, + "num_input_tokens_seen": 17220256, + "step": 11270 + }, + { + "epoch": 34.80061823802164, + "grad_norm": 0.8231050372123718, + "learning_rate": 4.082337669080097e-05, + "loss": 0.2741, + "num_input_tokens_seen": 17228320, + "step": 11275 + }, + { + "epoch": 34.816074188562595, + "grad_norm": 0.37989041209220886, + "learning_rate": 4.081577473225398e-05, + "loss": 0.2595, + "num_input_tokens_seen": 17236160, + "step": 11280 + }, + { + "epoch": 34.831530139103556, + "grad_norm": 0.758781373500824, + "learning_rate": 4.080817033471577e-05, + "loss": 0.2245, + "num_input_tokens_seen": 17244192, + "step": 11285 + }, + { + "epoch": 34.84698608964451, + "grad_norm": 0.31239405274391174, + "learning_rate": 4.080056349935903e-05, + "loss": 0.2437, + "num_input_tokens_seen": 17252256, + "step": 11290 + }, + { + "epoch": 34.86244204018547, + "grad_norm": 1.0896598100662231, + "learning_rate": 4.079295422735684e-05, + "loss": 0.2928, + "num_input_tokens_seen": 17259808, + "step": 11295 + }, + { + "epoch": 34.87789799072643, + "grad_norm": 0.4050532579421997, + "learning_rate": 4.078534251988264e-05, + "loss": 0.2592, + "num_input_tokens_seen": 17267584, + "step": 11300 + }, + { + "epoch": 34.893353941267385, + "grad_norm": 0.5727129578590393, + "learning_rate": 4.077772837811025e-05, + "loss": 0.2995, + "num_input_tokens_seen": 17275200, + "step": 11305 + }, + { + "epoch": 34.908809891808346, + "grad_norm": 0.4074660539627075, + "learning_rate": 4.0770111803213874e-05, + "loss": 0.2866, + "num_input_tokens_seen": 17283616, + "step": 11310 + }, + { + "epoch": 34.92426584234931, + "grad_norm": 0.28152546286582947, + "learning_rate": 4.076249279636807e-05, + "loss": 0.2442, + "num_input_tokens_seen": 17291104, + "step": 11315 + }, + { + "epoch": 34.93972179289026, + "grad_norm": 0.48101991415023804, + "learning_rate": 4.075487135874781e-05, + "loss": 0.3118, + "num_input_tokens_seen": 17298464, + "step": 11320 + }, + { + "epoch": 34.95517774343122, + "grad_norm": 0.503472089767456, + "learning_rate": 4.074724749152837e-05, + "loss": 0.2308, + "num_input_tokens_seen": 17305600, + "step": 11325 + }, + { + "epoch": 34.97063369397218, + "grad_norm": 0.6053091287612915, + "learning_rate": 4.07396211958855e-05, + "loss": 0.2276, + "num_input_tokens_seen": 17312832, + "step": 11330 + }, + { + "epoch": 34.986089644513136, + "grad_norm": 0.4057449996471405, + "learning_rate": 4.073199247299523e-05, + "loss": 0.2331, + "num_input_tokens_seen": 17320800, + "step": 11335 + }, + { + "epoch": 35.0, + "grad_norm": 1.1346317529678345, + "learning_rate": 4.072436132403403e-05, + "loss": 0.2544, + "num_input_tokens_seen": 17327664, + "step": 11340 + }, + { + "epoch": 35.01545595054096, + "grad_norm": 0.5703853368759155, + "learning_rate": 4.0716727750178704e-05, + "loss": 0.2616, + "num_input_tokens_seen": 17334992, + "step": 11345 + }, + { + "epoch": 35.030911901081915, + "grad_norm": 0.8882664442062378, + "learning_rate": 4.0709091752606455e-05, + "loss": 0.354, + "num_input_tokens_seen": 17342896, + "step": 11350 + }, + { + "epoch": 35.046367851622875, + "grad_norm": 0.3973292410373688, + "learning_rate": 4.070145333249484e-05, + "loss": 0.2767, + "num_input_tokens_seen": 17350480, + "step": 11355 + }, + { + "epoch": 35.061823802163836, + "grad_norm": 0.3912928104400635, + "learning_rate": 4.069381249102181e-05, + "loss": 0.2484, + "num_input_tokens_seen": 17358192, + "step": 11360 + }, + { + "epoch": 35.07727975270479, + "grad_norm": 0.438869446516037, + "learning_rate": 4.0686169229365665e-05, + "loss": 0.3061, + "num_input_tokens_seen": 17365840, + "step": 11365 + }, + { + "epoch": 35.09273570324575, + "grad_norm": 0.5466052293777466, + "learning_rate": 4.067852354870511e-05, + "loss": 0.254, + "num_input_tokens_seen": 17373520, + "step": 11370 + }, + { + "epoch": 35.108191653786704, + "grad_norm": 0.6231051683425903, + "learning_rate": 4.067087545021919e-05, + "loss": 0.2491, + "num_input_tokens_seen": 17381136, + "step": 11375 + }, + { + "epoch": 35.123647604327665, + "grad_norm": 0.6748988032341003, + "learning_rate": 4.066322493508734e-05, + "loss": 0.2346, + "num_input_tokens_seen": 17388656, + "step": 11380 + }, + { + "epoch": 35.139103554868626, + "grad_norm": 0.47157129645347595, + "learning_rate": 4.065557200448937e-05, + "loss": 0.3138, + "num_input_tokens_seen": 17396688, + "step": 11385 + }, + { + "epoch": 35.15455950540958, + "grad_norm": 0.5953348875045776, + "learning_rate": 4.064791665960546e-05, + "loss": 0.299, + "num_input_tokens_seen": 17404240, + "step": 11390 + }, + { + "epoch": 35.17001545595054, + "grad_norm": 0.6345654129981995, + "learning_rate": 4.064025890161615e-05, + "loss": 0.3257, + "num_input_tokens_seen": 17412112, + "step": 11395 + }, + { + "epoch": 35.1854714064915, + "grad_norm": 0.39346951246261597, + "learning_rate": 4.0632598731702373e-05, + "loss": 0.2304, + "num_input_tokens_seen": 17420016, + "step": 11400 + }, + { + "epoch": 35.1854714064915, + "eval_loss": 0.3333474397659302, + "eval_runtime": 6.3002, + "eval_samples_per_second": 91.267, + "eval_steps_per_second": 22.856, + "num_input_tokens_seen": 17420016, + "step": 11400 + }, + { + "epoch": 35.200927357032455, + "grad_norm": 0.9372071623802185, + "learning_rate": 4.0624936151045426e-05, + "loss": 0.2716, + "num_input_tokens_seen": 17427920, + "step": 11405 + }, + { + "epoch": 35.216383307573416, + "grad_norm": 0.5972718000411987, + "learning_rate": 4.061727116082696e-05, + "loss": 0.2546, + "num_input_tokens_seen": 17435152, + "step": 11410 + }, + { + "epoch": 35.23183925811438, + "grad_norm": 0.32251670956611633, + "learning_rate": 4.060960376222903e-05, + "loss": 0.2572, + "num_input_tokens_seen": 17443408, + "step": 11415 + }, + { + "epoch": 35.24729520865533, + "grad_norm": 0.524914562702179, + "learning_rate": 4.0601933956434034e-05, + "loss": 0.2937, + "num_input_tokens_seen": 17451120, + "step": 11420 + }, + { + "epoch": 35.26275115919629, + "grad_norm": 0.4171120226383209, + "learning_rate": 4.059426174462476e-05, + "loss": 0.2998, + "num_input_tokens_seen": 17458864, + "step": 11425 + }, + { + "epoch": 35.27820710973725, + "grad_norm": 0.8284816145896912, + "learning_rate": 4.058658712798435e-05, + "loss": 0.2987, + "num_input_tokens_seen": 17466608, + "step": 11430 + }, + { + "epoch": 35.293663060278206, + "grad_norm": 0.39807090163230896, + "learning_rate": 4.0578910107696336e-05, + "loss": 0.2865, + "num_input_tokens_seen": 17474288, + "step": 11435 + }, + { + "epoch": 35.30911901081917, + "grad_norm": 0.4690052568912506, + "learning_rate": 4.05712306849446e-05, + "loss": 0.2551, + "num_input_tokens_seen": 17482032, + "step": 11440 + }, + { + "epoch": 35.32457496136012, + "grad_norm": 0.6222814321517944, + "learning_rate": 4.0563548860913415e-05, + "loss": 0.3086, + "num_input_tokens_seen": 17489840, + "step": 11445 + }, + { + "epoch": 35.34003091190108, + "grad_norm": 0.7695972323417664, + "learning_rate": 4.0555864636787414e-05, + "loss": 0.2821, + "num_input_tokens_seen": 17497584, + "step": 11450 + }, + { + "epoch": 35.35548686244204, + "grad_norm": 0.5063489079475403, + "learning_rate": 4.054817801375159e-05, + "loss": 0.2932, + "num_input_tokens_seen": 17505104, + "step": 11455 + }, + { + "epoch": 35.370942812982996, + "grad_norm": 0.3080472946166992, + "learning_rate": 4.054048899299134e-05, + "loss": 0.3143, + "num_input_tokens_seen": 17512816, + "step": 11460 + }, + { + "epoch": 35.38639876352396, + "grad_norm": 0.5370438694953918, + "learning_rate": 4.0532797575692385e-05, + "loss": 0.2948, + "num_input_tokens_seen": 17520240, + "step": 11465 + }, + { + "epoch": 35.40185471406492, + "grad_norm": 0.5191564559936523, + "learning_rate": 4.052510376304085e-05, + "loss": 0.3295, + "num_input_tokens_seen": 17527984, + "step": 11470 + }, + { + "epoch": 35.41731066460587, + "grad_norm": 0.8370712995529175, + "learning_rate": 4.051740755622321e-05, + "loss": 0.3649, + "num_input_tokens_seen": 17535504, + "step": 11475 + }, + { + "epoch": 35.43276661514683, + "grad_norm": 0.5998862981796265, + "learning_rate": 4.050970895642632e-05, + "loss": 0.2525, + "num_input_tokens_seen": 17543280, + "step": 11480 + }, + { + "epoch": 35.44822256568779, + "grad_norm": 0.7632616758346558, + "learning_rate": 4.050200796483741e-05, + "loss": 0.3301, + "num_input_tokens_seen": 17550896, + "step": 11485 + }, + { + "epoch": 35.46367851622875, + "grad_norm": 0.33948060870170593, + "learning_rate": 4.049430458264405e-05, + "loss": 0.283, + "num_input_tokens_seen": 17558160, + "step": 11490 + }, + { + "epoch": 35.47913446676971, + "grad_norm": 0.47389107942581177, + "learning_rate": 4.048659881103422e-05, + "loss": 0.2518, + "num_input_tokens_seen": 17566224, + "step": 11495 + }, + { + "epoch": 35.49459041731066, + "grad_norm": 0.9787896275520325, + "learning_rate": 4.0478890651196235e-05, + "loss": 0.2979, + "num_input_tokens_seen": 17574032, + "step": 11500 + }, + { + "epoch": 35.51004636785162, + "grad_norm": 0.33817967772483826, + "learning_rate": 4.047118010431879e-05, + "loss": 0.243, + "num_input_tokens_seen": 17581392, + "step": 11505 + }, + { + "epoch": 35.52550231839258, + "grad_norm": 0.41461870074272156, + "learning_rate": 4.046346717159094e-05, + "loss": 0.2451, + "num_input_tokens_seen": 17588560, + "step": 11510 + }, + { + "epoch": 35.54095826893354, + "grad_norm": 0.43665555119514465, + "learning_rate": 4.045575185420214e-05, + "loss": 0.2993, + "num_input_tokens_seen": 17595696, + "step": 11515 + }, + { + "epoch": 35.5564142194745, + "grad_norm": 0.39743074774742126, + "learning_rate": 4.0448034153342165e-05, + "loss": 0.2861, + "num_input_tokens_seen": 17603440, + "step": 11520 + }, + { + "epoch": 35.57187017001546, + "grad_norm": 0.628815770149231, + "learning_rate": 4.0440314070201194e-05, + "loss": 0.3136, + "num_input_tokens_seen": 17611568, + "step": 11525 + }, + { + "epoch": 35.58732612055641, + "grad_norm": 0.7435958385467529, + "learning_rate": 4.043259160596976e-05, + "loss": 0.2693, + "num_input_tokens_seen": 17619024, + "step": 11530 + }, + { + "epoch": 35.60278207109737, + "grad_norm": 0.42274025082588196, + "learning_rate": 4.0424866761838767e-05, + "loss": 0.3302, + "num_input_tokens_seen": 17626640, + "step": 11535 + }, + { + "epoch": 35.618238021638334, + "grad_norm": 1.0039665699005127, + "learning_rate": 4.041713953899948e-05, + "loss": 0.248, + "num_input_tokens_seen": 17634896, + "step": 11540 + }, + { + "epoch": 35.63369397217929, + "grad_norm": 0.7791439890861511, + "learning_rate": 4.0409409938643515e-05, + "loss": 0.2901, + "num_input_tokens_seen": 17642288, + "step": 11545 + }, + { + "epoch": 35.64914992272025, + "grad_norm": 0.35127243399620056, + "learning_rate": 4.0401677961962904e-05, + "loss": 0.2689, + "num_input_tokens_seen": 17650064, + "step": 11550 + }, + { + "epoch": 35.66460587326121, + "grad_norm": 0.36503633856773376, + "learning_rate": 4.039394361015001e-05, + "loss": 0.2718, + "num_input_tokens_seen": 17657680, + "step": 11555 + }, + { + "epoch": 35.68006182380216, + "grad_norm": 0.5887255668640137, + "learning_rate": 4.038620688439755e-05, + "loss": 0.2777, + "num_input_tokens_seen": 17665232, + "step": 11560 + }, + { + "epoch": 35.695517774343124, + "grad_norm": 0.44581139087677, + "learning_rate": 4.037846778589862e-05, + "loss": 0.2765, + "num_input_tokens_seen": 17672496, + "step": 11565 + }, + { + "epoch": 35.71097372488408, + "grad_norm": 0.5159468054771423, + "learning_rate": 4.0370726315846715e-05, + "loss": 0.2798, + "num_input_tokens_seen": 17680272, + "step": 11570 + }, + { + "epoch": 35.72642967542504, + "grad_norm": 0.3588799238204956, + "learning_rate": 4.036298247543565e-05, + "loss": 0.2196, + "num_input_tokens_seen": 17687952, + "step": 11575 + }, + { + "epoch": 35.741885625966, + "grad_norm": 0.39758700132369995, + "learning_rate": 4.035523626585962e-05, + "loss": 0.2935, + "num_input_tokens_seen": 17695760, + "step": 11580 + }, + { + "epoch": 35.75734157650695, + "grad_norm": 0.435641348361969, + "learning_rate": 4.0347487688313194e-05, + "loss": 0.2413, + "num_input_tokens_seen": 17703632, + "step": 11585 + }, + { + "epoch": 35.77279752704791, + "grad_norm": 0.3352557420730591, + "learning_rate": 4.0339736743991296e-05, + "loss": 0.2091, + "num_input_tokens_seen": 17711056, + "step": 11590 + }, + { + "epoch": 35.788253477588874, + "grad_norm": 0.3231052756309509, + "learning_rate": 4.0331983434089227e-05, + "loss": 0.2338, + "num_input_tokens_seen": 17718512, + "step": 11595 + }, + { + "epoch": 35.80370942812983, + "grad_norm": 0.7289990186691284, + "learning_rate": 4.032422775980264e-05, + "loss": 0.3045, + "num_input_tokens_seen": 17726608, + "step": 11600 + }, + { + "epoch": 35.80370942812983, + "eval_loss": 0.3308725953102112, + "eval_runtime": 6.3128, + "eval_samples_per_second": 91.085, + "eval_steps_per_second": 22.811, + "num_input_tokens_seen": 17726608, + "step": 11600 + }, + { + "epoch": 35.81916537867079, + "grad_norm": 0.4882926642894745, + "learning_rate": 4.031646972232754e-05, + "loss": 0.2667, + "num_input_tokens_seen": 17733904, + "step": 11605 + }, + { + "epoch": 35.83462132921175, + "grad_norm": 0.462530255317688, + "learning_rate": 4.0308709322860344e-05, + "loss": 0.3057, + "num_input_tokens_seen": 17742128, + "step": 11610 + }, + { + "epoch": 35.8500772797527, + "grad_norm": 0.4469016194343567, + "learning_rate": 4.0300946562597784e-05, + "loss": 0.3075, + "num_input_tokens_seen": 17749744, + "step": 11615 + }, + { + "epoch": 35.865533230293664, + "grad_norm": 0.6201682090759277, + "learning_rate": 4.029318144273698e-05, + "loss": 0.2465, + "num_input_tokens_seen": 17757104, + "step": 11620 + }, + { + "epoch": 35.88098918083462, + "grad_norm": 0.5639595985412598, + "learning_rate": 4.0285413964475415e-05, + "loss": 0.229, + "num_input_tokens_seen": 17764144, + "step": 11625 + }, + { + "epoch": 35.89644513137558, + "grad_norm": 0.5469321608543396, + "learning_rate": 4.0277644129010927e-05, + "loss": 0.2564, + "num_input_tokens_seen": 17771696, + "step": 11630 + }, + { + "epoch": 35.91190108191654, + "grad_norm": 0.5986455678939819, + "learning_rate": 4.0269871937541724e-05, + "loss": 0.2476, + "num_input_tokens_seen": 17779312, + "step": 11635 + }, + { + "epoch": 35.92735703245749, + "grad_norm": 0.7460610270500183, + "learning_rate": 4.026209739126637e-05, + "loss": 0.2646, + "num_input_tokens_seen": 17786896, + "step": 11640 + }, + { + "epoch": 35.942812982998454, + "grad_norm": 0.28978297114372253, + "learning_rate": 4.025432049138381e-05, + "loss": 0.2329, + "num_input_tokens_seen": 17794448, + "step": 11645 + }, + { + "epoch": 35.958268933539415, + "grad_norm": 0.5790079832077026, + "learning_rate": 4.0246541239093325e-05, + "loss": 0.2838, + "num_input_tokens_seen": 17802352, + "step": 11650 + }, + { + "epoch": 35.97372488408037, + "grad_norm": 0.47736701369285583, + "learning_rate": 4.023875963559459e-05, + "loss": 0.2616, + "num_input_tokens_seen": 17809776, + "step": 11655 + }, + { + "epoch": 35.98918083462133, + "grad_norm": 0.674104630947113, + "learning_rate": 4.023097568208761e-05, + "loss": 0.2764, + "num_input_tokens_seen": 17817872, + "step": 11660 + }, + { + "epoch": 36.003091190108194, + "grad_norm": 0.36211296916007996, + "learning_rate": 4.022318937977277e-05, + "loss": 0.2652, + "num_input_tokens_seen": 17824432, + "step": 11665 + }, + { + "epoch": 36.01854714064915, + "grad_norm": 0.9717457890510559, + "learning_rate": 4.021540072985084e-05, + "loss": 0.3202, + "num_input_tokens_seen": 17832272, + "step": 11670 + }, + { + "epoch": 36.03400309119011, + "grad_norm": 0.685950517654419, + "learning_rate": 4.020760973352289e-05, + "loss": 0.2764, + "num_input_tokens_seen": 17839664, + "step": 11675 + }, + { + "epoch": 36.04945904173107, + "grad_norm": 0.6859360337257385, + "learning_rate": 4.019981639199042e-05, + "loss": 0.2957, + "num_input_tokens_seen": 17846928, + "step": 11680 + }, + { + "epoch": 36.06491499227202, + "grad_norm": 0.6803675889968872, + "learning_rate": 4.0192020706455245e-05, + "loss": 0.2739, + "num_input_tokens_seen": 17854416, + "step": 11685 + }, + { + "epoch": 36.08037094281298, + "grad_norm": 0.7670580744743347, + "learning_rate": 4.018422267811956e-05, + "loss": 0.3901, + "num_input_tokens_seen": 17861744, + "step": 11690 + }, + { + "epoch": 36.095826893353944, + "grad_norm": 0.47071707248687744, + "learning_rate": 4.017642230818592e-05, + "loss": 0.2781, + "num_input_tokens_seen": 17869488, + "step": 11695 + }, + { + "epoch": 36.1112828438949, + "grad_norm": 0.4728805124759674, + "learning_rate": 4.0168619597857246e-05, + "loss": 0.2775, + "num_input_tokens_seen": 17877424, + "step": 11700 + }, + { + "epoch": 36.12673879443586, + "grad_norm": 0.4804753363132477, + "learning_rate": 4.016081454833681e-05, + "loss": 0.2354, + "num_input_tokens_seen": 17884976, + "step": 11705 + }, + { + "epoch": 36.14219474497681, + "grad_norm": 0.6491743922233582, + "learning_rate": 4.0153007160828245e-05, + "loss": 0.2471, + "num_input_tokens_seen": 17892112, + "step": 11710 + }, + { + "epoch": 36.15765069551777, + "grad_norm": 0.4678434729576111, + "learning_rate": 4.0145197436535555e-05, + "loss": 0.2971, + "num_input_tokens_seen": 17899632, + "step": 11715 + }, + { + "epoch": 36.173106646058734, + "grad_norm": 0.5247890949249268, + "learning_rate": 4.0137385376663095e-05, + "loss": 0.2406, + "num_input_tokens_seen": 17907824, + "step": 11720 + }, + { + "epoch": 36.18856259659969, + "grad_norm": 0.5082129240036011, + "learning_rate": 4.012957098241558e-05, + "loss": 0.2636, + "num_input_tokens_seen": 17916016, + "step": 11725 + }, + { + "epoch": 36.20401854714065, + "grad_norm": 0.6188139319419861, + "learning_rate": 4.0121754254998076e-05, + "loss": 0.297, + "num_input_tokens_seen": 17923568, + "step": 11730 + }, + { + "epoch": 36.21947449768161, + "grad_norm": 0.5539929270744324, + "learning_rate": 4.011393519561606e-05, + "loss": 0.2912, + "num_input_tokens_seen": 17930576, + "step": 11735 + }, + { + "epoch": 36.23493044822256, + "grad_norm": 0.4388905167579651, + "learning_rate": 4.010611380547529e-05, + "loss": 0.2491, + "num_input_tokens_seen": 17938000, + "step": 11740 + }, + { + "epoch": 36.250386398763524, + "grad_norm": 0.6207563281059265, + "learning_rate": 4.009829008578192e-05, + "loss": 0.2221, + "num_input_tokens_seen": 17945264, + "step": 11745 + }, + { + "epoch": 36.265842349304485, + "grad_norm": 0.5368770956993103, + "learning_rate": 4.00904640377425e-05, + "loss": 0.2857, + "num_input_tokens_seen": 17952848, + "step": 11750 + }, + { + "epoch": 36.28129829984544, + "grad_norm": 0.3644900918006897, + "learning_rate": 4.0082635662563886e-05, + "loss": 0.2797, + "num_input_tokens_seen": 17960176, + "step": 11755 + }, + { + "epoch": 36.2967542503864, + "grad_norm": 0.47985440492630005, + "learning_rate": 4.007480496145331e-05, + "loss": 0.3535, + "num_input_tokens_seen": 17968624, + "step": 11760 + }, + { + "epoch": 36.31221020092736, + "grad_norm": 0.5949994921684265, + "learning_rate": 4.006697193561837e-05, + "loss": 0.2684, + "num_input_tokens_seen": 17975888, + "step": 11765 + }, + { + "epoch": 36.327666151468314, + "grad_norm": 0.6844872236251831, + "learning_rate": 4.005913658626701e-05, + "loss": 0.3013, + "num_input_tokens_seen": 17983632, + "step": 11770 + }, + { + "epoch": 36.343122102009275, + "grad_norm": 0.26703789830207825, + "learning_rate": 4.005129891460754e-05, + "loss": 0.3247, + "num_input_tokens_seen": 17991312, + "step": 11775 + }, + { + "epoch": 36.35857805255023, + "grad_norm": 0.4724946618080139, + "learning_rate": 4.004345892184864e-05, + "loss": 0.2867, + "num_input_tokens_seen": 17999056, + "step": 11780 + }, + { + "epoch": 36.37403400309119, + "grad_norm": 0.7307482957839966, + "learning_rate": 4.003561660919932e-05, + "loss": 0.2772, + "num_input_tokens_seen": 18007344, + "step": 11785 + }, + { + "epoch": 36.38948995363215, + "grad_norm": 1.0756562948226929, + "learning_rate": 4.002777197786897e-05, + "loss": 0.3214, + "num_input_tokens_seen": 18015120, + "step": 11790 + }, + { + "epoch": 36.404945904173104, + "grad_norm": 0.36984291672706604, + "learning_rate": 4.0019925029067326e-05, + "loss": 0.2312, + "num_input_tokens_seen": 18022384, + "step": 11795 + }, + { + "epoch": 36.420401854714065, + "grad_norm": 0.7804802060127258, + "learning_rate": 4.0012075764004495e-05, + "loss": 0.2831, + "num_input_tokens_seen": 18030288, + "step": 11800 + }, + { + "epoch": 36.420401854714065, + "eval_loss": 0.3292371332645416, + "eval_runtime": 6.3161, + "eval_samples_per_second": 91.037, + "eval_steps_per_second": 22.799, + "num_input_tokens_seen": 18030288, + "step": 11800 + }, + { + "epoch": 36.435857805255026, + "grad_norm": 0.3054010272026062, + "learning_rate": 4.000422418389094e-05, + "loss": 0.2354, + "num_input_tokens_seen": 18037776, + "step": 11805 + }, + { + "epoch": 36.45131375579598, + "grad_norm": 0.5055350661277771, + "learning_rate": 3.999637028993744e-05, + "loss": 0.2673, + "num_input_tokens_seen": 18045136, + "step": 11810 + }, + { + "epoch": 36.46676970633694, + "grad_norm": 0.33882540464401245, + "learning_rate": 3.99885140833552e-05, + "loss": 0.2946, + "num_input_tokens_seen": 18053072, + "step": 11815 + }, + { + "epoch": 36.4822256568779, + "grad_norm": 0.8275111317634583, + "learning_rate": 3.998065556535572e-05, + "loss": 0.3042, + "num_input_tokens_seen": 18060816, + "step": 11820 + }, + { + "epoch": 36.497681607418855, + "grad_norm": 0.32030022144317627, + "learning_rate": 3.9972794737150895e-05, + "loss": 0.2562, + "num_input_tokens_seen": 18067824, + "step": 11825 + }, + { + "epoch": 36.513137557959816, + "grad_norm": 0.4429665207862854, + "learning_rate": 3.996493159995297e-05, + "loss": 0.2316, + "num_input_tokens_seen": 18075504, + "step": 11830 + }, + { + "epoch": 36.52859350850077, + "grad_norm": 0.5345675349235535, + "learning_rate": 3.995706615497453e-05, + "loss": 0.2344, + "num_input_tokens_seen": 18083472, + "step": 11835 + }, + { + "epoch": 36.54404945904173, + "grad_norm": 0.3219548463821411, + "learning_rate": 3.994919840342852e-05, + "loss": 0.3153, + "num_input_tokens_seen": 18090992, + "step": 11840 + }, + { + "epoch": 36.55950540958269, + "grad_norm": 0.6827722191810608, + "learning_rate": 3.994132834652825e-05, + "loss": 0.2099, + "num_input_tokens_seen": 18099120, + "step": 11845 + }, + { + "epoch": 36.574961360123645, + "grad_norm": 0.4954260587692261, + "learning_rate": 3.99334559854874e-05, + "loss": 0.3187, + "num_input_tokens_seen": 18107248, + "step": 11850 + }, + { + "epoch": 36.590417310664606, + "grad_norm": 0.4012429118156433, + "learning_rate": 3.9925581321519955e-05, + "loss": 0.2365, + "num_input_tokens_seen": 18114800, + "step": 11855 + }, + { + "epoch": 36.605873261205566, + "grad_norm": 0.4871174395084381, + "learning_rate": 3.991770435584031e-05, + "loss": 0.2927, + "num_input_tokens_seen": 18123280, + "step": 11860 + }, + { + "epoch": 36.62132921174652, + "grad_norm": 0.3178427219390869, + "learning_rate": 3.990982508966319e-05, + "loss": 0.2139, + "num_input_tokens_seen": 18130704, + "step": 11865 + }, + { + "epoch": 36.63678516228748, + "grad_norm": 0.518674373626709, + "learning_rate": 3.990194352420367e-05, + "loss": 0.2351, + "num_input_tokens_seen": 18138416, + "step": 11870 + }, + { + "epoch": 36.65224111282844, + "grad_norm": 0.7026209235191345, + "learning_rate": 3.9894059660677184e-05, + "loss": 0.2716, + "num_input_tokens_seen": 18146064, + "step": 11875 + }, + { + "epoch": 36.667697063369395, + "grad_norm": 0.7143779993057251, + "learning_rate": 3.9886173500299526e-05, + "loss": 0.2598, + "num_input_tokens_seen": 18153296, + "step": 11880 + }, + { + "epoch": 36.683153013910356, + "grad_norm": 0.5370947122573853, + "learning_rate": 3.987828504428685e-05, + "loss": 0.2759, + "num_input_tokens_seen": 18161264, + "step": 11885 + }, + { + "epoch": 36.69860896445132, + "grad_norm": 0.49401670694351196, + "learning_rate": 3.987039429385565e-05, + "loss": 0.2346, + "num_input_tokens_seen": 18169136, + "step": 11890 + }, + { + "epoch": 36.71406491499227, + "grad_norm": 0.4211377203464508, + "learning_rate": 3.986250125022277e-05, + "loss": 0.2648, + "num_input_tokens_seen": 18176560, + "step": 11895 + }, + { + "epoch": 36.72952086553323, + "grad_norm": 0.2872360944747925, + "learning_rate": 3.985460591460544e-05, + "loss": 0.2482, + "num_input_tokens_seen": 18184656, + "step": 11900 + }, + { + "epoch": 36.744976816074185, + "grad_norm": 0.5306445360183716, + "learning_rate": 3.984670828822118e-05, + "loss": 0.2801, + "num_input_tokens_seen": 18192880, + "step": 11905 + }, + { + "epoch": 36.760432766615146, + "grad_norm": 0.46122339367866516, + "learning_rate": 3.983880837228794e-05, + "loss": 0.2902, + "num_input_tokens_seen": 18201552, + "step": 11910 + }, + { + "epoch": 36.77588871715611, + "grad_norm": 0.6720304489135742, + "learning_rate": 3.983090616802396e-05, + "loss": 0.3176, + "num_input_tokens_seen": 18208816, + "step": 11915 + }, + { + "epoch": 36.79134466769706, + "grad_norm": 0.380221962928772, + "learning_rate": 3.982300167664788e-05, + "loss": 0.2437, + "num_input_tokens_seen": 18216112, + "step": 11920 + }, + { + "epoch": 36.80680061823802, + "grad_norm": 0.45581310987472534, + "learning_rate": 3.981509489937868e-05, + "loss": 0.2344, + "num_input_tokens_seen": 18223888, + "step": 11925 + }, + { + "epoch": 36.82225656877898, + "grad_norm": 0.5410979986190796, + "learning_rate": 3.9807185837435643e-05, + "loss": 0.2879, + "num_input_tokens_seen": 18231568, + "step": 11930 + }, + { + "epoch": 36.837712519319936, + "grad_norm": 0.4882701337337494, + "learning_rate": 3.9799274492038484e-05, + "loss": 0.3067, + "num_input_tokens_seen": 18239216, + "step": 11935 + }, + { + "epoch": 36.8531684698609, + "grad_norm": 0.7972468733787537, + "learning_rate": 3.979136086440722e-05, + "loss": 0.3117, + "num_input_tokens_seen": 18246736, + "step": 11940 + }, + { + "epoch": 36.86862442040186, + "grad_norm": 0.42449989914894104, + "learning_rate": 3.9783444955762226e-05, + "loss": 0.232, + "num_input_tokens_seen": 18254032, + "step": 11945 + }, + { + "epoch": 36.88408037094281, + "grad_norm": 0.8177435398101807, + "learning_rate": 3.977552676732424e-05, + "loss": 0.2645, + "num_input_tokens_seen": 18261616, + "step": 11950 + }, + { + "epoch": 36.89953632148377, + "grad_norm": 0.4996887743473053, + "learning_rate": 3.976760630031435e-05, + "loss": 0.256, + "num_input_tokens_seen": 18269776, + "step": 11955 + }, + { + "epoch": 36.914992272024726, + "grad_norm": 0.49187445640563965, + "learning_rate": 3.975968355595398e-05, + "loss": 0.2301, + "num_input_tokens_seen": 18277680, + "step": 11960 + }, + { + "epoch": 36.93044822256569, + "grad_norm": 0.9605548977851868, + "learning_rate": 3.9751758535464935e-05, + "loss": 0.3572, + "num_input_tokens_seen": 18284720, + "step": 11965 + }, + { + "epoch": 36.94590417310665, + "grad_norm": 0.8909050226211548, + "learning_rate": 3.9743831240069326e-05, + "loss": 0.3234, + "num_input_tokens_seen": 18292208, + "step": 11970 + }, + { + "epoch": 36.9613601236476, + "grad_norm": 0.4744971692562103, + "learning_rate": 3.9735901670989675e-05, + "loss": 0.245, + "num_input_tokens_seen": 18299792, + "step": 11975 + }, + { + "epoch": 36.97681607418856, + "grad_norm": 0.5548980832099915, + "learning_rate": 3.97279698294488e-05, + "loss": 0.3134, + "num_input_tokens_seen": 18307600, + "step": 11980 + }, + { + "epoch": 36.99227202472952, + "grad_norm": 0.5088147521018982, + "learning_rate": 3.9720035716669876e-05, + "loss": 0.2601, + "num_input_tokens_seen": 18315344, + "step": 11985 + }, + { + "epoch": 37.00618238021638, + "grad_norm": 0.6560157537460327, + "learning_rate": 3.9712099333876474e-05, + "loss": 0.2699, + "num_input_tokens_seen": 18322256, + "step": 11990 + }, + { + "epoch": 37.02163833075734, + "grad_norm": 0.6372279524803162, + "learning_rate": 3.9704160682292475e-05, + "loss": 0.2995, + "num_input_tokens_seen": 18330096, + "step": 11995 + }, + { + "epoch": 37.0370942812983, + "grad_norm": 0.5552882552146912, + "learning_rate": 3.9696219763142106e-05, + "loss": 0.2686, + "num_input_tokens_seen": 18337584, + "step": 12000 + }, + { + "epoch": 37.0370942812983, + "eval_loss": 0.32933881878852844, + "eval_runtime": 6.3002, + "eval_samples_per_second": 91.266, + "eval_steps_per_second": 22.856, + "num_input_tokens_seen": 18337584, + "step": 12000 + }, + { + "epoch": 37.052550231839255, + "grad_norm": 0.3649972975254059, + "learning_rate": 3.968827657764997e-05, + "loss": 0.2558, + "num_input_tokens_seen": 18345328, + "step": 12005 + }, + { + "epoch": 37.068006182380216, + "grad_norm": 0.9128829836845398, + "learning_rate": 3.9680331127041e-05, + "loss": 0.3356, + "num_input_tokens_seen": 18353744, + "step": 12010 + }, + { + "epoch": 37.08346213292118, + "grad_norm": 0.4005635380744934, + "learning_rate": 3.9672383412540495e-05, + "loss": 0.2129, + "num_input_tokens_seen": 18361040, + "step": 12015 + }, + { + "epoch": 37.09891808346213, + "grad_norm": 0.5036453008651733, + "learning_rate": 3.966443343537407e-05, + "loss": 0.2554, + "num_input_tokens_seen": 18369136, + "step": 12020 + }, + { + "epoch": 37.11437403400309, + "grad_norm": 0.5621324777603149, + "learning_rate": 3.965648119676772e-05, + "loss": 0.2485, + "num_input_tokens_seen": 18376528, + "step": 12025 + }, + { + "epoch": 37.12982998454405, + "grad_norm": 0.44643738865852356, + "learning_rate": 3.96485266979478e-05, + "loss": 0.2509, + "num_input_tokens_seen": 18383920, + "step": 12030 + }, + { + "epoch": 37.145285935085006, + "grad_norm": 0.5964872241020203, + "learning_rate": 3.9640569940140974e-05, + "loss": 0.2834, + "num_input_tokens_seen": 18391344, + "step": 12035 + }, + { + "epoch": 37.16074188562597, + "grad_norm": 0.4487094283103943, + "learning_rate": 3.963261092457428e-05, + "loss": 0.2231, + "num_input_tokens_seen": 18398480, + "step": 12040 + }, + { + "epoch": 37.17619783616692, + "grad_norm": 0.47922107577323914, + "learning_rate": 3.962464965247509e-05, + "loss": 0.2602, + "num_input_tokens_seen": 18406160, + "step": 12045 + }, + { + "epoch": 37.19165378670788, + "grad_norm": 0.7663019299507141, + "learning_rate": 3.9616686125071135e-05, + "loss": 0.2264, + "num_input_tokens_seen": 18413264, + "step": 12050 + }, + { + "epoch": 37.20710973724884, + "grad_norm": 0.6670501828193665, + "learning_rate": 3.9608720343590506e-05, + "loss": 0.2724, + "num_input_tokens_seen": 18420560, + "step": 12055 + }, + { + "epoch": 37.222565687789796, + "grad_norm": 0.7461984753608704, + "learning_rate": 3.960075230926161e-05, + "loss": 0.3722, + "num_input_tokens_seen": 18427696, + "step": 12060 + }, + { + "epoch": 37.23802163833076, + "grad_norm": 0.7417424321174622, + "learning_rate": 3.959278202331322e-05, + "loss": 0.2917, + "num_input_tokens_seen": 18434800, + "step": 12065 + }, + { + "epoch": 37.25347758887172, + "grad_norm": 0.4899305999279022, + "learning_rate": 3.958480948697446e-05, + "loss": 0.2525, + "num_input_tokens_seen": 18442224, + "step": 12070 + }, + { + "epoch": 37.26893353941267, + "grad_norm": 0.67018061876297, + "learning_rate": 3.95768347014748e-05, + "loss": 0.294, + "num_input_tokens_seen": 18450032, + "step": 12075 + }, + { + "epoch": 37.28438948995363, + "grad_norm": 0.6597607135772705, + "learning_rate": 3.956885766804404e-05, + "loss": 0.377, + "num_input_tokens_seen": 18457744, + "step": 12080 + }, + { + "epoch": 37.29984544049459, + "grad_norm": 0.5235077142715454, + "learning_rate": 3.956087838791235e-05, + "loss": 0.2507, + "num_input_tokens_seen": 18465008, + "step": 12085 + }, + { + "epoch": 37.31530139103555, + "grad_norm": 0.46869999170303345, + "learning_rate": 3.955289686231022e-05, + "loss": 0.2724, + "num_input_tokens_seen": 18472464, + "step": 12090 + }, + { + "epoch": 37.33075734157651, + "grad_norm": 0.8084605932235718, + "learning_rate": 3.9544913092468504e-05, + "loss": 0.2975, + "num_input_tokens_seen": 18480496, + "step": 12095 + }, + { + "epoch": 37.34621329211747, + "grad_norm": 0.4093535542488098, + "learning_rate": 3.9536927079618425e-05, + "loss": 0.2367, + "num_input_tokens_seen": 18488304, + "step": 12100 + }, + { + "epoch": 37.36166924265842, + "grad_norm": 0.3711329698562622, + "learning_rate": 3.9528938824991494e-05, + "loss": 0.2368, + "num_input_tokens_seen": 18495728, + "step": 12105 + }, + { + "epoch": 37.37712519319938, + "grad_norm": 0.6685149669647217, + "learning_rate": 3.952094832981962e-05, + "loss": 0.3112, + "num_input_tokens_seen": 18503792, + "step": 12110 + }, + { + "epoch": 37.39258114374034, + "grad_norm": 0.7090384364128113, + "learning_rate": 3.951295559533503e-05, + "loss": 0.2703, + "num_input_tokens_seen": 18510704, + "step": 12115 + }, + { + "epoch": 37.4080370942813, + "grad_norm": 0.3952358067035675, + "learning_rate": 3.95049606227703e-05, + "loss": 0.386, + "num_input_tokens_seen": 18518736, + "step": 12120 + }, + { + "epoch": 37.42349304482226, + "grad_norm": 0.6072391867637634, + "learning_rate": 3.949696341335838e-05, + "loss": 0.2848, + "num_input_tokens_seen": 18526288, + "step": 12125 + }, + { + "epoch": 37.43894899536321, + "grad_norm": 0.4008665680885315, + "learning_rate": 3.9488963968332503e-05, + "loss": 0.3046, + "num_input_tokens_seen": 18533648, + "step": 12130 + }, + { + "epoch": 37.45440494590417, + "grad_norm": 0.49116435647010803, + "learning_rate": 3.948096228892631e-05, + "loss": 0.3063, + "num_input_tokens_seen": 18541264, + "step": 12135 + }, + { + "epoch": 37.469860896445134, + "grad_norm": 0.6011390089988708, + "learning_rate": 3.947295837637375e-05, + "loss": 0.2982, + "num_input_tokens_seen": 18549136, + "step": 12140 + }, + { + "epoch": 37.48531684698609, + "grad_norm": 0.34753134846687317, + "learning_rate": 3.9464952231909135e-05, + "loss": 0.2582, + "num_input_tokens_seen": 18556720, + "step": 12145 + }, + { + "epoch": 37.50077279752705, + "grad_norm": 0.4869195520877838, + "learning_rate": 3.945694385676711e-05, + "loss": 0.2562, + "num_input_tokens_seen": 18564528, + "step": 12150 + }, + { + "epoch": 37.51622874806801, + "grad_norm": 0.42900633811950684, + "learning_rate": 3.944893325218265e-05, + "loss": 0.2481, + "num_input_tokens_seen": 18571984, + "step": 12155 + }, + { + "epoch": 37.53168469860896, + "grad_norm": 0.5416392087936401, + "learning_rate": 3.944092041939112e-05, + "loss": 0.2984, + "num_input_tokens_seen": 18579280, + "step": 12160 + }, + { + "epoch": 37.547140649149924, + "grad_norm": 0.41032829880714417, + "learning_rate": 3.943290535962818e-05, + "loss": 0.2449, + "num_input_tokens_seen": 18586928, + "step": 12165 + }, + { + "epoch": 37.56259659969088, + "grad_norm": 0.5104154944419861, + "learning_rate": 3.942488807412985e-05, + "loss": 0.2267, + "num_input_tokens_seen": 18594704, + "step": 12170 + }, + { + "epoch": 37.57805255023184, + "grad_norm": 0.3924248218536377, + "learning_rate": 3.941686856413251e-05, + "loss": 0.2077, + "num_input_tokens_seen": 18602608, + "step": 12175 + }, + { + "epoch": 37.5935085007728, + "grad_norm": 0.7071428298950195, + "learning_rate": 3.9408846830872874e-05, + "loss": 0.2414, + "num_input_tokens_seen": 18610160, + "step": 12180 + }, + { + "epoch": 37.60896445131375, + "grad_norm": 0.35429832339286804, + "learning_rate": 3.940082287558798e-05, + "loss": 0.2596, + "num_input_tokens_seen": 18618032, + "step": 12185 + }, + { + "epoch": 37.624420401854714, + "grad_norm": 0.5945603847503662, + "learning_rate": 3.939279669951522e-05, + "loss": 0.3103, + "num_input_tokens_seen": 18625520, + "step": 12190 + }, + { + "epoch": 37.639876352395675, + "grad_norm": 0.3272571563720703, + "learning_rate": 3.938476830389234e-05, + "loss": 0.2148, + "num_input_tokens_seen": 18633008, + "step": 12195 + }, + { + "epoch": 37.65533230293663, + "grad_norm": 0.6469860672950745, + "learning_rate": 3.937673768995742e-05, + "loss": 0.2861, + "num_input_tokens_seen": 18640720, + "step": 12200 + }, + { + "epoch": 37.65533230293663, + "eval_loss": 0.3285144567489624, + "eval_runtime": 6.3684, + "eval_samples_per_second": 90.289, + "eval_steps_per_second": 22.612, + "num_input_tokens_seen": 18640720, + "step": 12200 + }, + { + "epoch": 37.67078825347759, + "grad_norm": 0.4040759205818176, + "learning_rate": 3.936870485894888e-05, + "loss": 0.2779, + "num_input_tokens_seen": 18648720, + "step": 12205 + }, + { + "epoch": 37.68624420401855, + "grad_norm": 0.45105937123298645, + "learning_rate": 3.9360669812105475e-05, + "loss": 0.2308, + "num_input_tokens_seen": 18656592, + "step": 12210 + }, + { + "epoch": 37.701700154559504, + "grad_norm": 0.5140300989151001, + "learning_rate": 3.9352632550666325e-05, + "loss": 0.2636, + "num_input_tokens_seen": 18664240, + "step": 12215 + }, + { + "epoch": 37.717156105100464, + "grad_norm": 0.5072177648544312, + "learning_rate": 3.9344593075870866e-05, + "loss": 0.2448, + "num_input_tokens_seen": 18671344, + "step": 12220 + }, + { + "epoch": 37.732612055641425, + "grad_norm": 0.41530144214630127, + "learning_rate": 3.933655138895889e-05, + "loss": 0.33, + "num_input_tokens_seen": 18679760, + "step": 12225 + }, + { + "epoch": 37.74806800618238, + "grad_norm": 0.5138903856277466, + "learning_rate": 3.932850749117053e-05, + "loss": 0.2526, + "num_input_tokens_seen": 18687568, + "step": 12230 + }, + { + "epoch": 37.76352395672334, + "grad_norm": 0.5483144521713257, + "learning_rate": 3.932046138374624e-05, + "loss": 0.2478, + "num_input_tokens_seen": 18695248, + "step": 12235 + }, + { + "epoch": 37.778979907264294, + "grad_norm": 0.52303147315979, + "learning_rate": 3.9312413067926854e-05, + "loss": 0.2415, + "num_input_tokens_seen": 18702576, + "step": 12240 + }, + { + "epoch": 37.794435857805254, + "grad_norm": 0.6684170365333557, + "learning_rate": 3.9304362544953506e-05, + "loss": 0.316, + "num_input_tokens_seen": 18710192, + "step": 12245 + }, + { + "epoch": 37.809891808346215, + "grad_norm": 0.5616987347602844, + "learning_rate": 3.929630981606769e-05, + "loss": 0.243, + "num_input_tokens_seen": 18717840, + "step": 12250 + }, + { + "epoch": 37.82534775888717, + "grad_norm": 0.5151954889297485, + "learning_rate": 3.928825488251124e-05, + "loss": 0.2939, + "num_input_tokens_seen": 18725552, + "step": 12255 + }, + { + "epoch": 37.84080370942813, + "grad_norm": 0.576481282711029, + "learning_rate": 3.9280197745526344e-05, + "loss": 0.2349, + "num_input_tokens_seen": 18733936, + "step": 12260 + }, + { + "epoch": 37.85625965996909, + "grad_norm": 0.9225079417228699, + "learning_rate": 3.9272138406355495e-05, + "loss": 0.3919, + "num_input_tokens_seen": 18741488, + "step": 12265 + }, + { + "epoch": 37.871715610510044, + "grad_norm": 0.5070661902427673, + "learning_rate": 3.926407686624154e-05, + "loss": 0.2664, + "num_input_tokens_seen": 18748944, + "step": 12270 + }, + { + "epoch": 37.887171561051005, + "grad_norm": 0.7180739045143127, + "learning_rate": 3.9256013126427684e-05, + "loss": 0.2518, + "num_input_tokens_seen": 18757104, + "step": 12275 + }, + { + "epoch": 37.902627511591966, + "grad_norm": 0.3466077148914337, + "learning_rate": 3.9247947188157455e-05, + "loss": 0.3548, + "num_input_tokens_seen": 18764944, + "step": 12280 + }, + { + "epoch": 37.91808346213292, + "grad_norm": 0.409285306930542, + "learning_rate": 3.9239879052674715e-05, + "loss": 0.3433, + "num_input_tokens_seen": 18772912, + "step": 12285 + }, + { + "epoch": 37.93353941267388, + "grad_norm": 0.774101197719574, + "learning_rate": 3.9231808721223673e-05, + "loss": 0.2376, + "num_input_tokens_seen": 18780464, + "step": 12290 + }, + { + "epoch": 37.948995363214834, + "grad_norm": 0.4522714614868164, + "learning_rate": 3.9223736195048886e-05, + "loss": 0.2475, + "num_input_tokens_seen": 18787888, + "step": 12295 + }, + { + "epoch": 37.964451313755795, + "grad_norm": 0.2872186303138733, + "learning_rate": 3.921566147539523e-05, + "loss": 0.2342, + "num_input_tokens_seen": 18795696, + "step": 12300 + }, + { + "epoch": 37.979907264296756, + "grad_norm": 0.5235509276390076, + "learning_rate": 3.920758456350792e-05, + "loss": 0.3416, + "num_input_tokens_seen": 18803696, + "step": 12305 + }, + { + "epoch": 37.99536321483771, + "grad_norm": 0.9465931057929993, + "learning_rate": 3.919950546063253e-05, + "loss": 0.3217, + "num_input_tokens_seen": 18811728, + "step": 12310 + }, + { + "epoch": 38.009273570324574, + "grad_norm": 0.512494683265686, + "learning_rate": 3.919142416801496e-05, + "loss": 0.2804, + "num_input_tokens_seen": 18817984, + "step": 12315 + }, + { + "epoch": 38.024729520865534, + "grad_norm": 0.8197565078735352, + "learning_rate": 3.918334068690144e-05, + "loss": 0.2969, + "num_input_tokens_seen": 18825984, + "step": 12320 + }, + { + "epoch": 38.04018547140649, + "grad_norm": 0.46029332280158997, + "learning_rate": 3.917525501853855e-05, + "loss": 0.256, + "num_input_tokens_seen": 18833248, + "step": 12325 + }, + { + "epoch": 38.05564142194745, + "grad_norm": 0.44462263584136963, + "learning_rate": 3.916716716417319e-05, + "loss": 0.2095, + "num_input_tokens_seen": 18840544, + "step": 12330 + }, + { + "epoch": 38.07109737248841, + "grad_norm": 0.5493929982185364, + "learning_rate": 3.915907712505263e-05, + "loss": 0.3217, + "num_input_tokens_seen": 18848544, + "step": 12335 + }, + { + "epoch": 38.086553323029364, + "grad_norm": 0.5047240257263184, + "learning_rate": 3.915098490242444e-05, + "loss": 0.2409, + "num_input_tokens_seen": 18856384, + "step": 12340 + }, + { + "epoch": 38.102009273570324, + "grad_norm": 0.4259386360645294, + "learning_rate": 3.914289049753654e-05, + "loss": 0.2982, + "num_input_tokens_seen": 18864512, + "step": 12345 + }, + { + "epoch": 38.117465224111285, + "grad_norm": 0.48375728726387024, + "learning_rate": 3.913479391163719e-05, + "loss": 0.2429, + "num_input_tokens_seen": 18871872, + "step": 12350 + }, + { + "epoch": 38.13292117465224, + "grad_norm": 0.49243125319480896, + "learning_rate": 3.9126695145975e-05, + "loss": 0.2554, + "num_input_tokens_seen": 18879328, + "step": 12355 + }, + { + "epoch": 38.1483771251932, + "grad_norm": 0.7898686528205872, + "learning_rate": 3.911859420179889e-05, + "loss": 0.3127, + "num_input_tokens_seen": 18886880, + "step": 12360 + }, + { + "epoch": 38.16383307573416, + "grad_norm": 0.5985245108604431, + "learning_rate": 3.911049108035813e-05, + "loss": 0.2667, + "num_input_tokens_seen": 18894112, + "step": 12365 + }, + { + "epoch": 38.179289026275114, + "grad_norm": 0.7598841190338135, + "learning_rate": 3.910238578290232e-05, + "loss": 0.3102, + "num_input_tokens_seen": 18901472, + "step": 12370 + }, + { + "epoch": 38.194744976816075, + "grad_norm": 0.34954965114593506, + "learning_rate": 3.90942783106814e-05, + "loss": 0.2786, + "num_input_tokens_seen": 18908800, + "step": 12375 + }, + { + "epoch": 38.210200927357036, + "grad_norm": 0.5041497945785522, + "learning_rate": 3.908616866494564e-05, + "loss": 0.272, + "num_input_tokens_seen": 18916640, + "step": 12380 + }, + { + "epoch": 38.22565687789799, + "grad_norm": 0.5117724537849426, + "learning_rate": 3.907805684694566e-05, + "loss": 0.2705, + "num_input_tokens_seen": 18923808, + "step": 12385 + }, + { + "epoch": 38.24111282843895, + "grad_norm": 0.798118531703949, + "learning_rate": 3.90699428579324e-05, + "loss": 0.3062, + "num_input_tokens_seen": 18931456, + "step": 12390 + }, + { + "epoch": 38.256568778979904, + "grad_norm": 0.4184088408946991, + "learning_rate": 3.906182669915713e-05, + "loss": 0.286, + "num_input_tokens_seen": 18938368, + "step": 12395 + }, + { + "epoch": 38.272024729520865, + "grad_norm": 0.4974973499774933, + "learning_rate": 3.9053708371871476e-05, + "loss": 0.263, + "num_input_tokens_seen": 18946400, + "step": 12400 + }, + { + "epoch": 38.272024729520865, + "eval_loss": 0.32841652631759644, + "eval_runtime": 6.3359, + "eval_samples_per_second": 90.752, + "eval_steps_per_second": 22.728, + "num_input_tokens_seen": 18946400, + "step": 12400 + }, + { + "epoch": 38.287480680061826, + "grad_norm": 0.7639886140823364, + "learning_rate": 3.904558787732738e-05, + "loss": 0.2488, + "num_input_tokens_seen": 18953984, + "step": 12405 + }, + { + "epoch": 38.30293663060278, + "grad_norm": 0.4414333701133728, + "learning_rate": 3.9037465216777135e-05, + "loss": 0.2492, + "num_input_tokens_seen": 18961504, + "step": 12410 + }, + { + "epoch": 38.31839258114374, + "grad_norm": 0.43646782636642456, + "learning_rate": 3.902934039147334e-05, + "loss": 0.2611, + "num_input_tokens_seen": 18969312, + "step": 12415 + }, + { + "epoch": 38.3338485316847, + "grad_norm": 0.8645695447921753, + "learning_rate": 3.902121340266894e-05, + "loss": 0.2084, + "num_input_tokens_seen": 18976736, + "step": 12420 + }, + { + "epoch": 38.349304482225655, + "grad_norm": 0.6356896162033081, + "learning_rate": 3.9013084251617246e-05, + "loss": 0.2927, + "num_input_tokens_seen": 18984064, + "step": 12425 + }, + { + "epoch": 38.364760432766616, + "grad_norm": 0.6355364322662354, + "learning_rate": 3.9004952939571865e-05, + "loss": 0.2764, + "num_input_tokens_seen": 18992288, + "step": 12430 + }, + { + "epoch": 38.38021638330758, + "grad_norm": 0.5206319689750671, + "learning_rate": 3.899681946778673e-05, + "loss": 0.2697, + "num_input_tokens_seen": 19000416, + "step": 12435 + }, + { + "epoch": 38.39567233384853, + "grad_norm": 0.5786249041557312, + "learning_rate": 3.898868383751615e-05, + "loss": 0.3026, + "num_input_tokens_seen": 19007744, + "step": 12440 + }, + { + "epoch": 38.41112828438949, + "grad_norm": 0.4732385575771332, + "learning_rate": 3.8980546050014724e-05, + "loss": 0.3439, + "num_input_tokens_seen": 19015232, + "step": 12445 + }, + { + "epoch": 38.426584234930445, + "grad_norm": 0.4439452588558197, + "learning_rate": 3.897240610653741e-05, + "loss": 0.2625, + "num_input_tokens_seen": 19023136, + "step": 12450 + }, + { + "epoch": 38.442040185471406, + "grad_norm": 0.45414283871650696, + "learning_rate": 3.896426400833948e-05, + "loss": 0.3177, + "num_input_tokens_seen": 19031328, + "step": 12455 + }, + { + "epoch": 38.45749613601237, + "grad_norm": 0.7426843643188477, + "learning_rate": 3.895611975667656e-05, + "loss": 0.2839, + "num_input_tokens_seen": 19039104, + "step": 12460 + }, + { + "epoch": 38.47295208655332, + "grad_norm": 0.5927363634109497, + "learning_rate": 3.8947973352804584e-05, + "loss": 0.2517, + "num_input_tokens_seen": 19047392, + "step": 12465 + }, + { + "epoch": 38.48840803709428, + "grad_norm": 0.5481699109077454, + "learning_rate": 3.893982479797984e-05, + "loss": 0.3084, + "num_input_tokens_seen": 19055104, + "step": 12470 + }, + { + "epoch": 38.50386398763524, + "grad_norm": 0.41304081678390503, + "learning_rate": 3.8931674093458926e-05, + "loss": 0.2442, + "num_input_tokens_seen": 19063104, + "step": 12475 + }, + { + "epoch": 38.519319938176196, + "grad_norm": 0.38148942589759827, + "learning_rate": 3.89235212404988e-05, + "loss": 0.2593, + "num_input_tokens_seen": 19070592, + "step": 12480 + }, + { + "epoch": 38.53477588871716, + "grad_norm": 0.4082585275173187, + "learning_rate": 3.891536624035672e-05, + "loss": 0.2599, + "num_input_tokens_seen": 19077952, + "step": 12485 + }, + { + "epoch": 38.55023183925812, + "grad_norm": 0.5006064772605896, + "learning_rate": 3.8907209094290295e-05, + "loss": 0.2528, + "num_input_tokens_seen": 19085728, + "step": 12490 + }, + { + "epoch": 38.56568778979907, + "grad_norm": 0.5510046482086182, + "learning_rate": 3.8899049803557466e-05, + "loss": 0.255, + "num_input_tokens_seen": 19094240, + "step": 12495 + }, + { + "epoch": 38.58114374034003, + "grad_norm": 0.5486768484115601, + "learning_rate": 3.889088836941648e-05, + "loss": 0.2104, + "num_input_tokens_seen": 19101792, + "step": 12500 + }, + { + "epoch": 38.59659969088099, + "grad_norm": 0.5754913687705994, + "learning_rate": 3.8882724793125946e-05, + "loss": 0.3742, + "num_input_tokens_seen": 19109600, + "step": 12505 + }, + { + "epoch": 38.61205564142195, + "grad_norm": 0.6559274196624756, + "learning_rate": 3.8874559075944794e-05, + "loss": 0.3033, + "num_input_tokens_seen": 19117184, + "step": 12510 + }, + { + "epoch": 38.62751159196291, + "grad_norm": 0.47707292437553406, + "learning_rate": 3.886639121913227e-05, + "loss": 0.203, + "num_input_tokens_seen": 19124992, + "step": 12515 + }, + { + "epoch": 38.64296754250386, + "grad_norm": 0.4853302538394928, + "learning_rate": 3.885822122394797e-05, + "loss": 0.2707, + "num_input_tokens_seen": 19133120, + "step": 12520 + }, + { + "epoch": 38.65842349304482, + "grad_norm": 0.6087316274642944, + "learning_rate": 3.8850049091651794e-05, + "loss": 0.2342, + "num_input_tokens_seen": 19140896, + "step": 12525 + }, + { + "epoch": 38.67387944358578, + "grad_norm": 0.49577608704566956, + "learning_rate": 3.8841874823504e-05, + "loss": 0.2425, + "num_input_tokens_seen": 19148704, + "step": 12530 + }, + { + "epoch": 38.689335394126736, + "grad_norm": 0.8096317052841187, + "learning_rate": 3.8833698420765157e-05, + "loss": 0.3115, + "num_input_tokens_seen": 19156352, + "step": 12535 + }, + { + "epoch": 38.7047913446677, + "grad_norm": 0.21851059794425964, + "learning_rate": 3.882551988469618e-05, + "loss": 0.2378, + "num_input_tokens_seen": 19163136, + "step": 12540 + }, + { + "epoch": 38.72024729520866, + "grad_norm": 0.6988590955734253, + "learning_rate": 3.881733921655829e-05, + "loss": 0.2588, + "num_input_tokens_seen": 19170944, + "step": 12545 + }, + { + "epoch": 38.73570324574961, + "grad_norm": 0.4430778920650482, + "learning_rate": 3.8809156417613054e-05, + "loss": 0.3536, + "num_input_tokens_seen": 19178624, + "step": 12550 + }, + { + "epoch": 38.75115919629057, + "grad_norm": 0.39257726073265076, + "learning_rate": 3.8800971489122364e-05, + "loss": 0.2411, + "num_input_tokens_seen": 19186528, + "step": 12555 + }, + { + "epoch": 38.76661514683153, + "grad_norm": 0.8712234497070312, + "learning_rate": 3.8792784432348434e-05, + "loss": 0.2456, + "num_input_tokens_seen": 19194176, + "step": 12560 + }, + { + "epoch": 38.78207109737249, + "grad_norm": 0.6537286639213562, + "learning_rate": 3.878459524855381e-05, + "loss": 0.2848, + "num_input_tokens_seen": 19201760, + "step": 12565 + }, + { + "epoch": 38.79752704791345, + "grad_norm": 0.8079940676689148, + "learning_rate": 3.8776403939001384e-05, + "loss": 0.2661, + "num_input_tokens_seen": 19209568, + "step": 12570 + }, + { + "epoch": 38.8129829984544, + "grad_norm": 0.5042819976806641, + "learning_rate": 3.876821050495433e-05, + "loss": 0.2661, + "num_input_tokens_seen": 19217152, + "step": 12575 + }, + { + "epoch": 38.82843894899536, + "grad_norm": 0.30260124802589417, + "learning_rate": 3.87600149476762e-05, + "loss": 0.2673, + "num_input_tokens_seen": 19224608, + "step": 12580 + }, + { + "epoch": 38.84389489953632, + "grad_norm": 0.4733465015888214, + "learning_rate": 3.8751817268430843e-05, + "loss": 0.2871, + "num_input_tokens_seen": 19232192, + "step": 12585 + }, + { + "epoch": 38.85935085007728, + "grad_norm": 0.468075156211853, + "learning_rate": 3.8743617468482464e-05, + "loss": 0.2515, + "num_input_tokens_seen": 19239232, + "step": 12590 + }, + { + "epoch": 38.87480680061824, + "grad_norm": 0.402505099773407, + "learning_rate": 3.8735415549095535e-05, + "loss": 0.2595, + "num_input_tokens_seen": 19246272, + "step": 12595 + }, + { + "epoch": 38.8902627511592, + "grad_norm": 0.8529613018035889, + "learning_rate": 3.8727211511534934e-05, + "loss": 0.2913, + "num_input_tokens_seen": 19254240, + "step": 12600 + }, + { + "epoch": 38.8902627511592, + "eval_loss": 0.3276170790195465, + "eval_runtime": 6.2951, + "eval_samples_per_second": 91.341, + "eval_steps_per_second": 22.875, + "num_input_tokens_seen": 19254240, + "step": 12600 + }, + { + "epoch": 38.90571870170015, + "grad_norm": 0.8746472597122192, + "learning_rate": 3.8719005357065804e-05, + "loss": 0.2725, + "num_input_tokens_seen": 19261408, + "step": 12605 + }, + { + "epoch": 38.92117465224111, + "grad_norm": 0.5958709716796875, + "learning_rate": 3.8710797086953645e-05, + "loss": 0.2328, + "num_input_tokens_seen": 19269184, + "step": 12610 + }, + { + "epoch": 38.936630602782074, + "grad_norm": 0.4961085915565491, + "learning_rate": 3.870258670246427e-05, + "loss": 0.3029, + "num_input_tokens_seen": 19277152, + "step": 12615 + }, + { + "epoch": 38.95208655332303, + "grad_norm": 0.525568425655365, + "learning_rate": 3.869437420486384e-05, + "loss": 0.3459, + "num_input_tokens_seen": 19284704, + "step": 12620 + }, + { + "epoch": 38.96754250386399, + "grad_norm": 0.46607691049575806, + "learning_rate": 3.8686159595418805e-05, + "loss": 0.211, + "num_input_tokens_seen": 19291808, + "step": 12625 + }, + { + "epoch": 38.98299845440495, + "grad_norm": 0.44301947951316833, + "learning_rate": 3.867794287539597e-05, + "loss": 0.3114, + "num_input_tokens_seen": 19299456, + "step": 12630 + }, + { + "epoch": 38.9984544049459, + "grad_norm": 0.7089433670043945, + "learning_rate": 3.866972404606245e-05, + "loss": 0.26, + "num_input_tokens_seen": 19307424, + "step": 12635 + }, + { + "epoch": 39.01236476043277, + "grad_norm": 0.6502958536148071, + "learning_rate": 3.866150310868571e-05, + "loss": 0.2601, + "num_input_tokens_seen": 19314016, + "step": 12640 + }, + { + "epoch": 39.02782071097373, + "grad_norm": 0.43621328473091125, + "learning_rate": 3.8653280064533506e-05, + "loss": 0.2805, + "num_input_tokens_seen": 19322016, + "step": 12645 + }, + { + "epoch": 39.04327666151468, + "grad_norm": 0.5833430290222168, + "learning_rate": 3.864505491487394e-05, + "loss": 0.2409, + "num_input_tokens_seen": 19330112, + "step": 12650 + }, + { + "epoch": 39.05873261205564, + "grad_norm": 0.5088956952095032, + "learning_rate": 3.8636827660975414e-05, + "loss": 0.2004, + "num_input_tokens_seen": 19337376, + "step": 12655 + }, + { + "epoch": 39.074188562596596, + "grad_norm": 0.6825577020645142, + "learning_rate": 3.862859830410671e-05, + "loss": 0.2079, + "num_input_tokens_seen": 19344800, + "step": 12660 + }, + { + "epoch": 39.08964451313756, + "grad_norm": 0.5629070997238159, + "learning_rate": 3.862036684553688e-05, + "loss": 0.3126, + "num_input_tokens_seen": 19352704, + "step": 12665 + }, + { + "epoch": 39.10510046367852, + "grad_norm": 0.3565267026424408, + "learning_rate": 3.8612133286535314e-05, + "loss": 0.2456, + "num_input_tokens_seen": 19359776, + "step": 12670 + }, + { + "epoch": 39.12055641421947, + "grad_norm": 0.7786404490470886, + "learning_rate": 3.860389762837173e-05, + "loss": 0.3088, + "num_input_tokens_seen": 19367680, + "step": 12675 + }, + { + "epoch": 39.13601236476043, + "grad_norm": 0.33781811594963074, + "learning_rate": 3.859565987231618e-05, + "loss": 0.227, + "num_input_tokens_seen": 19374944, + "step": 12680 + }, + { + "epoch": 39.15146831530139, + "grad_norm": 0.45941129326820374, + "learning_rate": 3.858742001963902e-05, + "loss": 0.2708, + "num_input_tokens_seen": 19382560, + "step": 12685 + }, + { + "epoch": 39.16692426584235, + "grad_norm": 0.49177101254463196, + "learning_rate": 3.857917807161094e-05, + "loss": 0.2797, + "num_input_tokens_seen": 19390784, + "step": 12690 + }, + { + "epoch": 39.18238021638331, + "grad_norm": 0.625834047794342, + "learning_rate": 3.857093402950296e-05, + "loss": 0.3068, + "num_input_tokens_seen": 19399200, + "step": 12695 + }, + { + "epoch": 39.19783616692427, + "grad_norm": 0.42301589250564575, + "learning_rate": 3.8562687894586414e-05, + "loss": 0.293, + "num_input_tokens_seen": 19406688, + "step": 12700 + }, + { + "epoch": 39.21329211746522, + "grad_norm": 0.572422981262207, + "learning_rate": 3.8554439668132946e-05, + "loss": 0.2249, + "num_input_tokens_seen": 19413952, + "step": 12705 + }, + { + "epoch": 39.22874806800618, + "grad_norm": 0.41608673334121704, + "learning_rate": 3.854618935141455e-05, + "loss": 0.3909, + "num_input_tokens_seen": 19422016, + "step": 12710 + }, + { + "epoch": 39.244204018547144, + "grad_norm": 0.6333625912666321, + "learning_rate": 3.8537936945703525e-05, + "loss": 0.281, + "num_input_tokens_seen": 19429472, + "step": 12715 + }, + { + "epoch": 39.2596599690881, + "grad_norm": 0.687062680721283, + "learning_rate": 3.852968245227249e-05, + "loss": 0.3011, + "num_input_tokens_seen": 19437408, + "step": 12720 + }, + { + "epoch": 39.27511591962906, + "grad_norm": 0.4896988272666931, + "learning_rate": 3.85214258723944e-05, + "loss": 0.2912, + "num_input_tokens_seen": 19444384, + "step": 12725 + }, + { + "epoch": 39.29057187017001, + "grad_norm": 0.4971073865890503, + "learning_rate": 3.8513167207342524e-05, + "loss": 0.3472, + "num_input_tokens_seen": 19451488, + "step": 12730 + }, + { + "epoch": 39.30602782071097, + "grad_norm": 0.46042975783348083, + "learning_rate": 3.850490645839044e-05, + "loss": 0.2254, + "num_input_tokens_seen": 19458784, + "step": 12735 + }, + { + "epoch": 39.321483771251934, + "grad_norm": 0.4978675842285156, + "learning_rate": 3.849664362681207e-05, + "loss": 0.269, + "num_input_tokens_seen": 19466272, + "step": 12740 + }, + { + "epoch": 39.33693972179289, + "grad_norm": 0.7618266344070435, + "learning_rate": 3.848837871388165e-05, + "loss": 0.3022, + "num_input_tokens_seen": 19473760, + "step": 12745 + }, + { + "epoch": 39.35239567233385, + "grad_norm": 0.5615552663803101, + "learning_rate": 3.848011172087371e-05, + "loss": 0.308, + "num_input_tokens_seen": 19481344, + "step": 12750 + }, + { + "epoch": 39.36785162287481, + "grad_norm": 0.3590501844882965, + "learning_rate": 3.847184264906315e-05, + "loss": 0.2866, + "num_input_tokens_seen": 19489248, + "step": 12755 + }, + { + "epoch": 39.38330757341576, + "grad_norm": 0.45323416590690613, + "learning_rate": 3.846357149972516e-05, + "loss": 0.255, + "num_input_tokens_seen": 19497152, + "step": 12760 + }, + { + "epoch": 39.398763523956724, + "grad_norm": 0.6781359314918518, + "learning_rate": 3.8455298274135246e-05, + "loss": 0.2784, + "num_input_tokens_seen": 19504736, + "step": 12765 + }, + { + "epoch": 39.414219474497685, + "grad_norm": 0.5248453617095947, + "learning_rate": 3.8447022973569254e-05, + "loss": 0.3107, + "num_input_tokens_seen": 19512608, + "step": 12770 + }, + { + "epoch": 39.42967542503864, + "grad_norm": 0.5258142948150635, + "learning_rate": 3.843874559930332e-05, + "loss": 0.3014, + "num_input_tokens_seen": 19519904, + "step": 12775 + }, + { + "epoch": 39.4451313755796, + "grad_norm": 0.8654763698577881, + "learning_rate": 3.843046615261394e-05, + "loss": 0.2438, + "num_input_tokens_seen": 19527392, + "step": 12780 + }, + { + "epoch": 39.46058732612055, + "grad_norm": 0.4098304212093353, + "learning_rate": 3.842218463477791e-05, + "loss": 0.3265, + "num_input_tokens_seen": 19535328, + "step": 12785 + }, + { + "epoch": 39.476043276661514, + "grad_norm": 0.6466229557991028, + "learning_rate": 3.841390104707233e-05, + "loss": 0.3791, + "num_input_tokens_seen": 19542720, + "step": 12790 + }, + { + "epoch": 39.491499227202475, + "grad_norm": 0.5484912991523743, + "learning_rate": 3.8405615390774643e-05, + "loss": 0.2919, + "num_input_tokens_seen": 19550464, + "step": 12795 + }, + { + "epoch": 39.50695517774343, + "grad_norm": 0.49317118525505066, + "learning_rate": 3.839732766716259e-05, + "loss": 0.273, + "num_input_tokens_seen": 19558592, + "step": 12800 + }, + { + "epoch": 39.50695517774343, + "eval_loss": 0.32532036304473877, + "eval_runtime": 6.3142, + "eval_samples_per_second": 91.064, + "eval_steps_per_second": 22.806, + "num_input_tokens_seen": 19558592, + "step": 12800 + }, + { + "epoch": 39.52241112828439, + "grad_norm": 0.4397670328617096, + "learning_rate": 3.838903787751425e-05, + "loss": 0.2684, + "num_input_tokens_seen": 19566240, + "step": 12805 + }, + { + "epoch": 39.53786707882535, + "grad_norm": 0.5179854035377502, + "learning_rate": 3.838074602310802e-05, + "loss": 0.2624, + "num_input_tokens_seen": 19573280, + "step": 12810 + }, + { + "epoch": 39.553323029366304, + "grad_norm": 0.5172144770622253, + "learning_rate": 3.837245210522258e-05, + "loss": 0.2125, + "num_input_tokens_seen": 19581792, + "step": 12815 + }, + { + "epoch": 39.568778979907265, + "grad_norm": 0.8769711256027222, + "learning_rate": 3.8364156125136996e-05, + "loss": 0.2429, + "num_input_tokens_seen": 19588960, + "step": 12820 + }, + { + "epoch": 39.584234930448225, + "grad_norm": 0.37004661560058594, + "learning_rate": 3.835585808413059e-05, + "loss": 0.2852, + "num_input_tokens_seen": 19596992, + "step": 12825 + }, + { + "epoch": 39.59969088098918, + "grad_norm": 0.5629948973655701, + "learning_rate": 3.8347557983483024e-05, + "loss": 0.2737, + "num_input_tokens_seen": 19604352, + "step": 12830 + }, + { + "epoch": 39.61514683153014, + "grad_norm": 0.441839337348938, + "learning_rate": 3.833925582447428e-05, + "loss": 0.234, + "num_input_tokens_seen": 19611968, + "step": 12835 + }, + { + "epoch": 39.630602782071094, + "grad_norm": 0.5507423877716064, + "learning_rate": 3.8330951608384656e-05, + "loss": 0.2686, + "num_input_tokens_seen": 19619040, + "step": 12840 + }, + { + "epoch": 39.646058732612055, + "grad_norm": 0.4320237338542938, + "learning_rate": 3.832264533649477e-05, + "loss": 0.3219, + "num_input_tokens_seen": 19626848, + "step": 12845 + }, + { + "epoch": 39.661514683153015, + "grad_norm": 0.7224617004394531, + "learning_rate": 3.8314337010085555e-05, + "loss": 0.2859, + "num_input_tokens_seen": 19634208, + "step": 12850 + }, + { + "epoch": 39.67697063369397, + "grad_norm": 0.5871168971061707, + "learning_rate": 3.830602663043824e-05, + "loss": 0.2467, + "num_input_tokens_seen": 19641312, + "step": 12855 + }, + { + "epoch": 39.69242658423493, + "grad_norm": 0.39102551341056824, + "learning_rate": 3.8297714198834414e-05, + "loss": 0.2183, + "num_input_tokens_seen": 19649056, + "step": 12860 + }, + { + "epoch": 39.70788253477589, + "grad_norm": 0.3679935336112976, + "learning_rate": 3.828939971655595e-05, + "loss": 0.2176, + "num_input_tokens_seen": 19656384, + "step": 12865 + }, + { + "epoch": 39.723338485316845, + "grad_norm": 0.4880983829498291, + "learning_rate": 3.828108318488505e-05, + "loss": 0.2501, + "num_input_tokens_seen": 19663616, + "step": 12870 + }, + { + "epoch": 39.738794435857805, + "grad_norm": 0.5657460689544678, + "learning_rate": 3.8272764605104216e-05, + "loss": 0.2461, + "num_input_tokens_seen": 19671040, + "step": 12875 + }, + { + "epoch": 39.754250386398766, + "grad_norm": 0.7492674589157104, + "learning_rate": 3.826444397849628e-05, + "loss": 0.2232, + "num_input_tokens_seen": 19678336, + "step": 12880 + }, + { + "epoch": 39.76970633693972, + "grad_norm": 0.45540979504585266, + "learning_rate": 3.825612130634439e-05, + "loss": 0.3033, + "num_input_tokens_seen": 19686656, + "step": 12885 + }, + { + "epoch": 39.78516228748068, + "grad_norm": 0.7022525668144226, + "learning_rate": 3.824779658993202e-05, + "loss": 0.3114, + "num_input_tokens_seen": 19694144, + "step": 12890 + }, + { + "epoch": 39.80061823802164, + "grad_norm": 0.41018545627593994, + "learning_rate": 3.823946983054292e-05, + "loss": 0.2508, + "num_input_tokens_seen": 19702144, + "step": 12895 + }, + { + "epoch": 39.816074188562595, + "grad_norm": 0.36648473143577576, + "learning_rate": 3.82311410294612e-05, + "loss": 0.232, + "num_input_tokens_seen": 19710368, + "step": 12900 + }, + { + "epoch": 39.831530139103556, + "grad_norm": 0.431270569562912, + "learning_rate": 3.822281018797127e-05, + "loss": 0.2708, + "num_input_tokens_seen": 19718240, + "step": 12905 + }, + { + "epoch": 39.84698608964451, + "grad_norm": 0.6518776416778564, + "learning_rate": 3.821447730735783e-05, + "loss": 0.2661, + "num_input_tokens_seen": 19726208, + "step": 12910 + }, + { + "epoch": 39.86244204018547, + "grad_norm": 0.49240681529045105, + "learning_rate": 3.820614238890592e-05, + "loss": 0.2424, + "num_input_tokens_seen": 19734112, + "step": 12915 + }, + { + "epoch": 39.87789799072643, + "grad_norm": 0.48195531964302063, + "learning_rate": 3.819780543390091e-05, + "loss": 0.2211, + "num_input_tokens_seen": 19741408, + "step": 12920 + }, + { + "epoch": 39.893353941267385, + "grad_norm": 0.36459678411483765, + "learning_rate": 3.818946644362844e-05, + "loss": 0.2451, + "num_input_tokens_seen": 19749408, + "step": 12925 + }, + { + "epoch": 39.908809891808346, + "grad_norm": 0.3968496322631836, + "learning_rate": 3.81811254193745e-05, + "loss": 0.2254, + "num_input_tokens_seen": 19757024, + "step": 12930 + }, + { + "epoch": 39.92426584234931, + "grad_norm": 0.5145406723022461, + "learning_rate": 3.8172782362425366e-05, + "loss": 0.2616, + "num_input_tokens_seen": 19764576, + "step": 12935 + }, + { + "epoch": 39.93972179289026, + "grad_norm": 0.5705386400222778, + "learning_rate": 3.816443727406765e-05, + "loss": 0.2749, + "num_input_tokens_seen": 19772288, + "step": 12940 + }, + { + "epoch": 39.95517774343122, + "grad_norm": 0.37253624200820923, + "learning_rate": 3.815609015558829e-05, + "loss": 0.2806, + "num_input_tokens_seen": 19780032, + "step": 12945 + }, + { + "epoch": 39.97063369397218, + "grad_norm": 0.4122242033481598, + "learning_rate": 3.814774100827448e-05, + "loss": 0.2866, + "num_input_tokens_seen": 19787776, + "step": 12950 + }, + { + "epoch": 39.986089644513136, + "grad_norm": 0.4599798917770386, + "learning_rate": 3.813938983341379e-05, + "loss": 0.3408, + "num_input_tokens_seen": 19795136, + "step": 12955 + }, + { + "epoch": 40.0, + "grad_norm": 1.2167463302612305, + "learning_rate": 3.813103663229407e-05, + "loss": 0.2661, + "num_input_tokens_seen": 19801744, + "step": 12960 + }, + { + "epoch": 40.01545595054096, + "grad_norm": 0.32418596744537354, + "learning_rate": 3.812268140620349e-05, + "loss": 0.2928, + "num_input_tokens_seen": 19808944, + "step": 12965 + }, + { + "epoch": 40.030911901081915, + "grad_norm": 0.5195584297180176, + "learning_rate": 3.811432415643051e-05, + "loss": 0.3212, + "num_input_tokens_seen": 19816048, + "step": 12970 + }, + { + "epoch": 40.046367851622875, + "grad_norm": 1.0479048490524292, + "learning_rate": 3.8105964884263954e-05, + "loss": 0.2758, + "num_input_tokens_seen": 19823664, + "step": 12975 + }, + { + "epoch": 40.061823802163836, + "grad_norm": 0.34361231327056885, + "learning_rate": 3.809760359099291e-05, + "loss": 0.2712, + "num_input_tokens_seen": 19831312, + "step": 12980 + }, + { + "epoch": 40.07727975270479, + "grad_norm": 0.3898048996925354, + "learning_rate": 3.8089240277906804e-05, + "loss": 0.2347, + "num_input_tokens_seen": 19838544, + "step": 12985 + }, + { + "epoch": 40.09273570324575, + "grad_norm": 0.589039146900177, + "learning_rate": 3.808087494629535e-05, + "loss": 0.1996, + "num_input_tokens_seen": 19846064, + "step": 12990 + }, + { + "epoch": 40.108191653786704, + "grad_norm": 0.41748902201652527, + "learning_rate": 3.8072507597448595e-05, + "loss": 0.2581, + "num_input_tokens_seen": 19853776, + "step": 12995 + }, + { + "epoch": 40.123647604327665, + "grad_norm": 0.5849087238311768, + "learning_rate": 3.806413823265689e-05, + "loss": 0.2321, + "num_input_tokens_seen": 19861168, + "step": 13000 + }, + { + "epoch": 40.123647604327665, + "eval_loss": 0.32579630613327026, + "eval_runtime": 6.3042, + "eval_samples_per_second": 91.208, + "eval_steps_per_second": 22.842, + "num_input_tokens_seen": 19861168, + "step": 13000 + }, + { + "epoch": 40.139103554868626, + "grad_norm": 0.7174578309059143, + "learning_rate": 3.805576685321089e-05, + "loss": 0.2937, + "num_input_tokens_seen": 19868464, + "step": 13005 + }, + { + "epoch": 40.15455950540958, + "grad_norm": 0.8591083884239197, + "learning_rate": 3.804739346040158e-05, + "loss": 0.2976, + "num_input_tokens_seen": 19876240, + "step": 13010 + }, + { + "epoch": 40.17001545595054, + "grad_norm": 1.067629098892212, + "learning_rate": 3.8039018055520234e-05, + "loss": 0.2942, + "num_input_tokens_seen": 19883824, + "step": 13015 + }, + { + "epoch": 40.1854714064915, + "grad_norm": 0.6578458547592163, + "learning_rate": 3.803064063985844e-05, + "loss": 0.2272, + "num_input_tokens_seen": 19891728, + "step": 13020 + }, + { + "epoch": 40.200927357032455, + "grad_norm": 0.5661362409591675, + "learning_rate": 3.802226121470811e-05, + "loss": 0.2692, + "num_input_tokens_seen": 19899248, + "step": 13025 + }, + { + "epoch": 40.216383307573416, + "grad_norm": 0.5504807233810425, + "learning_rate": 3.801387978136145e-05, + "loss": 0.2424, + "num_input_tokens_seen": 19906576, + "step": 13030 + }, + { + "epoch": 40.23183925811438, + "grad_norm": 0.3998510539531708, + "learning_rate": 3.800549634111099e-05, + "loss": 0.2297, + "num_input_tokens_seen": 19914704, + "step": 13035 + }, + { + "epoch": 40.24729520865533, + "grad_norm": 0.4933226704597473, + "learning_rate": 3.799711089524955e-05, + "loss": 0.2678, + "num_input_tokens_seen": 19921968, + "step": 13040 + }, + { + "epoch": 40.26275115919629, + "grad_norm": 0.5535529851913452, + "learning_rate": 3.7988723445070285e-05, + "loss": 0.2699, + "num_input_tokens_seen": 19929840, + "step": 13045 + }, + { + "epoch": 40.27820710973725, + "grad_norm": 0.5027062296867371, + "learning_rate": 3.798033399186663e-05, + "loss": 0.215, + "num_input_tokens_seen": 19937616, + "step": 13050 + }, + { + "epoch": 40.293663060278206, + "grad_norm": 0.4069463312625885, + "learning_rate": 3.797194253693237e-05, + "loss": 0.2005, + "num_input_tokens_seen": 19945456, + "step": 13055 + }, + { + "epoch": 40.30911901081917, + "grad_norm": 0.6046401262283325, + "learning_rate": 3.796354908156153e-05, + "loss": 0.2976, + "num_input_tokens_seen": 19953360, + "step": 13060 + }, + { + "epoch": 40.32457496136012, + "grad_norm": 0.6060523986816406, + "learning_rate": 3.795515362704853e-05, + "loss": 0.2528, + "num_input_tokens_seen": 19960752, + "step": 13065 + }, + { + "epoch": 40.34003091190108, + "grad_norm": 0.6410747170448303, + "learning_rate": 3.794675617468803e-05, + "loss": 0.262, + "num_input_tokens_seen": 19968752, + "step": 13070 + }, + { + "epoch": 40.35548686244204, + "grad_norm": 0.6961711049079895, + "learning_rate": 3.793835672577503e-05, + "loss": 0.3325, + "num_input_tokens_seen": 19976400, + "step": 13075 + }, + { + "epoch": 40.370942812982996, + "grad_norm": 0.5995051860809326, + "learning_rate": 3.7929955281604826e-05, + "loss": 0.2676, + "num_input_tokens_seen": 19983696, + "step": 13080 + }, + { + "epoch": 40.38639876352396, + "grad_norm": 0.3117564022541046, + "learning_rate": 3.7921551843473036e-05, + "loss": 0.2974, + "num_input_tokens_seen": 19991248, + "step": 13085 + }, + { + "epoch": 40.40185471406492, + "grad_norm": 0.46521449089050293, + "learning_rate": 3.791314641267557e-05, + "loss": 0.3076, + "num_input_tokens_seen": 19998512, + "step": 13090 + }, + { + "epoch": 40.41731066460587, + "grad_norm": 0.7473383545875549, + "learning_rate": 3.790473899050864e-05, + "loss": 0.3557, + "num_input_tokens_seen": 20005968, + "step": 13095 + }, + { + "epoch": 40.43276661514683, + "grad_norm": 1.0579453706741333, + "learning_rate": 3.7896329578268794e-05, + "loss": 0.2997, + "num_input_tokens_seen": 20013808, + "step": 13100 + }, + { + "epoch": 40.44822256568779, + "grad_norm": 0.4535798132419586, + "learning_rate": 3.7887918177252855e-05, + "loss": 0.2636, + "num_input_tokens_seen": 20021456, + "step": 13105 + }, + { + "epoch": 40.46367851622875, + "grad_norm": 0.4685685336589813, + "learning_rate": 3.787950478875798e-05, + "loss": 0.2106, + "num_input_tokens_seen": 20029200, + "step": 13110 + }, + { + "epoch": 40.47913446676971, + "grad_norm": 0.5882577896118164, + "learning_rate": 3.787108941408162e-05, + "loss": 0.2403, + "num_input_tokens_seen": 20037232, + "step": 13115 + }, + { + "epoch": 40.49459041731066, + "grad_norm": 0.5715230107307434, + "learning_rate": 3.786267205452151e-05, + "loss": 0.256, + "num_input_tokens_seen": 20044720, + "step": 13120 + }, + { + "epoch": 40.51004636785162, + "grad_norm": 0.7141552567481995, + "learning_rate": 3.785425271137573e-05, + "loss": 0.2323, + "num_input_tokens_seen": 20052112, + "step": 13125 + }, + { + "epoch": 40.52550231839258, + "grad_norm": 0.33257123827934265, + "learning_rate": 3.7845831385942655e-05, + "loss": 0.2827, + "num_input_tokens_seen": 20059728, + "step": 13130 + }, + { + "epoch": 40.54095826893354, + "grad_norm": 0.6744012832641602, + "learning_rate": 3.7837408079520944e-05, + "loss": 0.2928, + "num_input_tokens_seen": 20067408, + "step": 13135 + }, + { + "epoch": 40.5564142194745, + "grad_norm": 0.41801485419273376, + "learning_rate": 3.782898279340957e-05, + "loss": 0.2412, + "num_input_tokens_seen": 20075440, + "step": 13140 + }, + { + "epoch": 40.57187017001546, + "grad_norm": 1.0870693922042847, + "learning_rate": 3.782055552890784e-05, + "loss": 0.2587, + "num_input_tokens_seen": 20082960, + "step": 13145 + }, + { + "epoch": 40.58732612055641, + "grad_norm": 0.4285912811756134, + "learning_rate": 3.781212628731534e-05, + "loss": 0.276, + "num_input_tokens_seen": 20090736, + "step": 13150 + }, + { + "epoch": 40.60278207109737, + "grad_norm": 0.7007280588150024, + "learning_rate": 3.7803695069931946e-05, + "loss": 0.2509, + "num_input_tokens_seen": 20099152, + "step": 13155 + }, + { + "epoch": 40.618238021638334, + "grad_norm": 0.5222724676132202, + "learning_rate": 3.779526187805789e-05, + "loss": 0.2389, + "num_input_tokens_seen": 20106512, + "step": 13160 + }, + { + "epoch": 40.63369397217929, + "grad_norm": 0.38026225566864014, + "learning_rate": 3.778682671299364e-05, + "loss": 0.2612, + "num_input_tokens_seen": 20114608, + "step": 13165 + }, + { + "epoch": 40.64914992272025, + "grad_norm": 0.8207507729530334, + "learning_rate": 3.777838957604003e-05, + "loss": 0.2378, + "num_input_tokens_seen": 20122384, + "step": 13170 + }, + { + "epoch": 40.66460587326121, + "grad_norm": 0.5814533233642578, + "learning_rate": 3.776995046849816e-05, + "loss": 0.3198, + "num_input_tokens_seen": 20130640, + "step": 13175 + }, + { + "epoch": 40.68006182380216, + "grad_norm": 0.546974778175354, + "learning_rate": 3.776150939166945e-05, + "loss": 0.3153, + "num_input_tokens_seen": 20138384, + "step": 13180 + }, + { + "epoch": 40.695517774343124, + "grad_norm": 0.3581986427307129, + "learning_rate": 3.775306634685562e-05, + "loss": 0.2919, + "num_input_tokens_seen": 20146608, + "step": 13185 + }, + { + "epoch": 40.71097372488408, + "grad_norm": 0.3467352092266083, + "learning_rate": 3.7744621335358696e-05, + "loss": 0.2351, + "num_input_tokens_seen": 20154512, + "step": 13190 + }, + { + "epoch": 40.72642967542504, + "grad_norm": 0.6446689963340759, + "learning_rate": 3.7736174358481e-05, + "loss": 0.2534, + "num_input_tokens_seen": 20161808, + "step": 13195 + }, + { + "epoch": 40.741885625966, + "grad_norm": 0.5651279091835022, + "learning_rate": 3.7727725417525175e-05, + "loss": 0.2927, + "num_input_tokens_seen": 20169712, + "step": 13200 + }, + { + "epoch": 40.741885625966, + "eval_loss": 0.32498475909233093, + "eval_runtime": 6.317, + "eval_samples_per_second": 91.024, + "eval_steps_per_second": 22.796, + "num_input_tokens_seen": 20169712, + "step": 13200 + }, + { + "epoch": 40.75734157650695, + "grad_norm": 0.48841336369514465, + "learning_rate": 3.771927451379414e-05, + "loss": 0.2265, + "num_input_tokens_seen": 20177520, + "step": 13205 + }, + { + "epoch": 40.77279752704791, + "grad_norm": 0.38440239429473877, + "learning_rate": 3.7710821648591135e-05, + "loss": 0.2817, + "num_input_tokens_seen": 20185456, + "step": 13210 + }, + { + "epoch": 40.788253477588874, + "grad_norm": 0.7945720553398132, + "learning_rate": 3.7702366823219694e-05, + "loss": 0.2517, + "num_input_tokens_seen": 20193264, + "step": 13215 + }, + { + "epoch": 40.80370942812983, + "grad_norm": 0.9607544541358948, + "learning_rate": 3.769391003898366e-05, + "loss": 0.372, + "num_input_tokens_seen": 20201200, + "step": 13220 + }, + { + "epoch": 40.81916537867079, + "grad_norm": 0.6475789546966553, + "learning_rate": 3.768545129718718e-05, + "loss": 0.2992, + "num_input_tokens_seen": 20208592, + "step": 13225 + }, + { + "epoch": 40.83462132921175, + "grad_norm": 0.6419808864593506, + "learning_rate": 3.7676990599134686e-05, + "loss": 0.266, + "num_input_tokens_seen": 20216432, + "step": 13230 + }, + { + "epoch": 40.8500772797527, + "grad_norm": 0.880935549736023, + "learning_rate": 3.766852794613095e-05, + "loss": 0.2411, + "num_input_tokens_seen": 20223568, + "step": 13235 + }, + { + "epoch": 40.865533230293664, + "grad_norm": 0.3211355209350586, + "learning_rate": 3.766006333948099e-05, + "loss": 0.2322, + "num_input_tokens_seen": 20231856, + "step": 13240 + }, + { + "epoch": 40.88098918083462, + "grad_norm": 0.8035355806350708, + "learning_rate": 3.765159678049017e-05, + "loss": 0.3506, + "num_input_tokens_seen": 20239632, + "step": 13245 + }, + { + "epoch": 40.89644513137558, + "grad_norm": 0.5169605612754822, + "learning_rate": 3.7643128270464134e-05, + "loss": 0.2619, + "num_input_tokens_seen": 20246768, + "step": 13250 + }, + { + "epoch": 40.91190108191654, + "grad_norm": 0.45257139205932617, + "learning_rate": 3.763465781070884e-05, + "loss": 0.2901, + "num_input_tokens_seen": 20254576, + "step": 13255 + }, + { + "epoch": 40.92735703245749, + "grad_norm": 0.6272189617156982, + "learning_rate": 3.762618540253052e-05, + "loss": 0.2702, + "num_input_tokens_seen": 20262512, + "step": 13260 + }, + { + "epoch": 40.942812982998454, + "grad_norm": 0.7577395439147949, + "learning_rate": 3.761771104723576e-05, + "loss": 0.3047, + "num_input_tokens_seen": 20270288, + "step": 13265 + }, + { + "epoch": 40.958268933539415, + "grad_norm": 0.4510180652141571, + "learning_rate": 3.7609234746131386e-05, + "loss": 0.2314, + "num_input_tokens_seen": 20278032, + "step": 13270 + }, + { + "epoch": 40.97372488408037, + "grad_norm": 0.6318406462669373, + "learning_rate": 3.7600756500524556e-05, + "loss": 0.2667, + "num_input_tokens_seen": 20285392, + "step": 13275 + }, + { + "epoch": 40.98918083462133, + "grad_norm": 0.8984829187393188, + "learning_rate": 3.759227631172271e-05, + "loss": 0.323, + "num_input_tokens_seen": 20292560, + "step": 13280 + }, + { + "epoch": 41.003091190108194, + "grad_norm": 0.5255023837089539, + "learning_rate": 3.758379418103363e-05, + "loss": 0.3484, + "num_input_tokens_seen": 20298816, + "step": 13285 + }, + { + "epoch": 41.01854714064915, + "grad_norm": 0.49530693888664246, + "learning_rate": 3.757531010976534e-05, + "loss": 0.2752, + "num_input_tokens_seen": 20306336, + "step": 13290 + }, + { + "epoch": 41.03400309119011, + "grad_norm": 0.41223427653312683, + "learning_rate": 3.75668240992262e-05, + "loss": 0.2506, + "num_input_tokens_seen": 20314240, + "step": 13295 + }, + { + "epoch": 41.04945904173107, + "grad_norm": 0.4172406792640686, + "learning_rate": 3.7558336150724865e-05, + "loss": 0.2488, + "num_input_tokens_seen": 20321408, + "step": 13300 + }, + { + "epoch": 41.06491499227202, + "grad_norm": 0.6659147143363953, + "learning_rate": 3.754984626557028e-05, + "loss": 0.2532, + "num_input_tokens_seen": 20329792, + "step": 13305 + }, + { + "epoch": 41.08037094281298, + "grad_norm": 0.5128026008605957, + "learning_rate": 3.754135444507168e-05, + "loss": 0.2456, + "num_input_tokens_seen": 20337504, + "step": 13310 + }, + { + "epoch": 41.095826893353944, + "grad_norm": 0.6184148192405701, + "learning_rate": 3.753286069053863e-05, + "loss": 0.2235, + "num_input_tokens_seen": 20345184, + "step": 13315 + }, + { + "epoch": 41.1112828438949, + "grad_norm": 0.6964574456214905, + "learning_rate": 3.7524365003280945e-05, + "loss": 0.211, + "num_input_tokens_seen": 20352096, + "step": 13320 + }, + { + "epoch": 41.12673879443586, + "grad_norm": 0.38679903745651245, + "learning_rate": 3.75158673846088e-05, + "loss": 0.2139, + "num_input_tokens_seen": 20359840, + "step": 13325 + }, + { + "epoch": 41.14219474497681, + "grad_norm": 0.4662390649318695, + "learning_rate": 3.750736783583262e-05, + "loss": 0.2365, + "num_input_tokens_seen": 20367936, + "step": 13330 + }, + { + "epoch": 41.15765069551777, + "grad_norm": 0.4374399781227112, + "learning_rate": 3.7498866358263144e-05, + "loss": 0.2159, + "num_input_tokens_seen": 20375520, + "step": 13335 + }, + { + "epoch": 41.173106646058734, + "grad_norm": 0.3222748041152954, + "learning_rate": 3.74903629532114e-05, + "loss": 0.3528, + "num_input_tokens_seen": 20382976, + "step": 13340 + }, + { + "epoch": 41.18856259659969, + "grad_norm": 0.3872283399105072, + "learning_rate": 3.748185762198873e-05, + "loss": 0.3063, + "num_input_tokens_seen": 20390656, + "step": 13345 + }, + { + "epoch": 41.20401854714065, + "grad_norm": 0.5774600505828857, + "learning_rate": 3.747335036590676e-05, + "loss": 0.2707, + "num_input_tokens_seen": 20398624, + "step": 13350 + }, + { + "epoch": 41.21947449768161, + "grad_norm": 0.4094473123550415, + "learning_rate": 3.7464841186277405e-05, + "loss": 0.2228, + "num_input_tokens_seen": 20406816, + "step": 13355 + }, + { + "epoch": 41.23493044822256, + "grad_norm": 0.704920768737793, + "learning_rate": 3.7456330084412896e-05, + "loss": 0.2776, + "num_input_tokens_seen": 20413984, + "step": 13360 + }, + { + "epoch": 41.250386398763524, + "grad_norm": 0.6582883596420288, + "learning_rate": 3.744781706162576e-05, + "loss": 0.2688, + "num_input_tokens_seen": 20421504, + "step": 13365 + }, + { + "epoch": 41.265842349304485, + "grad_norm": 0.3534213900566101, + "learning_rate": 3.743930211922879e-05, + "loss": 0.241, + "num_input_tokens_seen": 20428864, + "step": 13370 + }, + { + "epoch": 41.28129829984544, + "grad_norm": 0.5985316634178162, + "learning_rate": 3.743078525853513e-05, + "loss": 0.2871, + "num_input_tokens_seen": 20436416, + "step": 13375 + }, + { + "epoch": 41.2967542503864, + "grad_norm": 0.3722984492778778, + "learning_rate": 3.7422266480858154e-05, + "loss": 0.2298, + "num_input_tokens_seen": 20444672, + "step": 13380 + }, + { + "epoch": 41.31221020092736, + "grad_norm": 0.3129578232765198, + "learning_rate": 3.741374578751158e-05, + "loss": 0.298, + "num_input_tokens_seen": 20452736, + "step": 13385 + }, + { + "epoch": 41.327666151468314, + "grad_norm": 0.49010875821113586, + "learning_rate": 3.740522317980941e-05, + "loss": 0.2463, + "num_input_tokens_seen": 20460032, + "step": 13390 + }, + { + "epoch": 41.343122102009275, + "grad_norm": 0.5035573840141296, + "learning_rate": 3.739669865906593e-05, + "loss": 0.2479, + "num_input_tokens_seen": 20467520, + "step": 13395 + }, + { + "epoch": 41.35857805255023, + "grad_norm": 0.4080961048603058, + "learning_rate": 3.738817222659573e-05, + "loss": 0.2533, + "num_input_tokens_seen": 20475008, + "step": 13400 + }, + { + "epoch": 41.35857805255023, + "eval_loss": 0.3245210647583008, + "eval_runtime": 6.3059, + "eval_samples_per_second": 91.185, + "eval_steps_per_second": 22.836, + "num_input_tokens_seen": 20475008, + "step": 13400 + }, + { + "epoch": 41.37403400309119, + "grad_norm": 0.3255915343761444, + "learning_rate": 3.73796438837137e-05, + "loss": 0.2398, + "num_input_tokens_seen": 20482432, + "step": 13405 + }, + { + "epoch": 41.38948995363215, + "grad_norm": 0.6158708333969116, + "learning_rate": 3.7371113631735e-05, + "loss": 0.2786, + "num_input_tokens_seen": 20490080, + "step": 13410 + }, + { + "epoch": 41.404945904173104, + "grad_norm": 0.4459121823310852, + "learning_rate": 3.736258147197512e-05, + "loss": 0.3378, + "num_input_tokens_seen": 20497568, + "step": 13415 + }, + { + "epoch": 41.420401854714065, + "grad_norm": 0.3904368281364441, + "learning_rate": 3.735404740574981e-05, + "loss": 0.22, + "num_input_tokens_seen": 20505248, + "step": 13420 + }, + { + "epoch": 41.435857805255026, + "grad_norm": 0.5459337830543518, + "learning_rate": 3.7345511434375145e-05, + "loss": 0.2835, + "num_input_tokens_seen": 20513088, + "step": 13425 + }, + { + "epoch": 41.45131375579598, + "grad_norm": 0.6681903600692749, + "learning_rate": 3.733697355916748e-05, + "loss": 0.2497, + "num_input_tokens_seen": 20520960, + "step": 13430 + }, + { + "epoch": 41.46676970633694, + "grad_norm": 0.7155354022979736, + "learning_rate": 3.732843378144345e-05, + "loss": 0.3121, + "num_input_tokens_seen": 20528864, + "step": 13435 + }, + { + "epoch": 41.4822256568779, + "grad_norm": 0.7104299068450928, + "learning_rate": 3.7319892102519995e-05, + "loss": 0.27, + "num_input_tokens_seen": 20536768, + "step": 13440 + }, + { + "epoch": 41.497681607418855, + "grad_norm": 0.7799752950668335, + "learning_rate": 3.731134852371436e-05, + "loss": 0.2621, + "num_input_tokens_seen": 20544992, + "step": 13445 + }, + { + "epoch": 41.513137557959816, + "grad_norm": 0.36044907569885254, + "learning_rate": 3.730280304634408e-05, + "loss": 0.3353, + "num_input_tokens_seen": 20553376, + "step": 13450 + }, + { + "epoch": 41.52859350850077, + "grad_norm": 0.38550013303756714, + "learning_rate": 3.729425567172696e-05, + "loss": 0.2728, + "num_input_tokens_seen": 20560576, + "step": 13455 + }, + { + "epoch": 41.54404945904173, + "grad_norm": 0.5625228881835938, + "learning_rate": 3.728570640118111e-05, + "loss": 0.3489, + "num_input_tokens_seen": 20568128, + "step": 13460 + }, + { + "epoch": 41.55950540958269, + "grad_norm": 0.522523045539856, + "learning_rate": 3.727715523602494e-05, + "loss": 0.2582, + "num_input_tokens_seen": 20575360, + "step": 13465 + }, + { + "epoch": 41.574961360123645, + "grad_norm": 0.731666088104248, + "learning_rate": 3.726860217757715e-05, + "loss": 0.2993, + "num_input_tokens_seen": 20582848, + "step": 13470 + }, + { + "epoch": 41.590417310664606, + "grad_norm": 0.4030451774597168, + "learning_rate": 3.726004722715673e-05, + "loss": 0.3479, + "num_input_tokens_seen": 20590816, + "step": 13475 + }, + { + "epoch": 41.605873261205566, + "grad_norm": 0.7571209073066711, + "learning_rate": 3.725149038608296e-05, + "loss": 0.2368, + "num_input_tokens_seen": 20598432, + "step": 13480 + }, + { + "epoch": 41.62132921174652, + "grad_norm": 1.0218031406402588, + "learning_rate": 3.7242931655675404e-05, + "loss": 0.3581, + "num_input_tokens_seen": 20605792, + "step": 13485 + }, + { + "epoch": 41.63678516228748, + "grad_norm": 0.3862861692905426, + "learning_rate": 3.7234371037253937e-05, + "loss": 0.2804, + "num_input_tokens_seen": 20613600, + "step": 13490 + }, + { + "epoch": 41.65224111282844, + "grad_norm": 0.5710163712501526, + "learning_rate": 3.7225808532138705e-05, + "loss": 0.293, + "num_input_tokens_seen": 20620928, + "step": 13495 + }, + { + "epoch": 41.667697063369395, + "grad_norm": 0.40712082386016846, + "learning_rate": 3.721724414165016e-05, + "loss": 0.2362, + "num_input_tokens_seen": 20628064, + "step": 13500 + }, + { + "epoch": 41.683153013910356, + "grad_norm": 0.5247051119804382, + "learning_rate": 3.720867786710904e-05, + "loss": 0.3003, + "num_input_tokens_seen": 20635840, + "step": 13505 + }, + { + "epoch": 41.69860896445132, + "grad_norm": 0.4109409749507904, + "learning_rate": 3.7200109709836366e-05, + "loss": 0.2645, + "num_input_tokens_seen": 20643488, + "step": 13510 + }, + { + "epoch": 41.71406491499227, + "grad_norm": 0.3880438506603241, + "learning_rate": 3.7191539671153465e-05, + "loss": 0.3236, + "num_input_tokens_seen": 20650976, + "step": 13515 + }, + { + "epoch": 41.72952086553323, + "grad_norm": 0.32446038722991943, + "learning_rate": 3.718296775238193e-05, + "loss": 0.2459, + "num_input_tokens_seen": 20658720, + "step": 13520 + }, + { + "epoch": 41.744976816074185, + "grad_norm": 0.7425022721290588, + "learning_rate": 3.7174393954843675e-05, + "loss": 0.2723, + "num_input_tokens_seen": 20666272, + "step": 13525 + }, + { + "epoch": 41.760432766615146, + "grad_norm": 0.43158283829689026, + "learning_rate": 3.716581827986087e-05, + "loss": 0.2476, + "num_input_tokens_seen": 20673984, + "step": 13530 + }, + { + "epoch": 41.77588871715611, + "grad_norm": 0.8921719789505005, + "learning_rate": 3.7157240728756004e-05, + "loss": 0.2904, + "num_input_tokens_seen": 20681920, + "step": 13535 + }, + { + "epoch": 41.79134466769706, + "grad_norm": 0.6287435293197632, + "learning_rate": 3.714866130285184e-05, + "loss": 0.2495, + "num_input_tokens_seen": 20689152, + "step": 13540 + }, + { + "epoch": 41.80680061823802, + "grad_norm": 0.5173242688179016, + "learning_rate": 3.714008000347143e-05, + "loss": 0.2864, + "num_input_tokens_seen": 20696960, + "step": 13545 + }, + { + "epoch": 41.82225656877898, + "grad_norm": 0.43559351563453674, + "learning_rate": 3.7131496831938126e-05, + "loss": 0.2751, + "num_input_tokens_seen": 20704672, + "step": 13550 + }, + { + "epoch": 41.837712519319936, + "grad_norm": 0.5194693803787231, + "learning_rate": 3.7122911789575565e-05, + "loss": 0.2874, + "num_input_tokens_seen": 20712352, + "step": 13555 + }, + { + "epoch": 41.8531684698609, + "grad_norm": 0.53900146484375, + "learning_rate": 3.711432487770765e-05, + "loss": 0.2092, + "num_input_tokens_seen": 20720064, + "step": 13560 + }, + { + "epoch": 41.86862442040186, + "grad_norm": 0.35565677285194397, + "learning_rate": 3.710573609765861e-05, + "loss": 0.2417, + "num_input_tokens_seen": 20727840, + "step": 13565 + }, + { + "epoch": 41.88408037094281, + "grad_norm": 0.6627592444419861, + "learning_rate": 3.709714545075292e-05, + "loss": 0.2678, + "num_input_tokens_seen": 20735008, + "step": 13570 + }, + { + "epoch": 41.89953632148377, + "grad_norm": 0.4610256850719452, + "learning_rate": 3.708855293831538e-05, + "loss": 0.2623, + "num_input_tokens_seen": 20742848, + "step": 13575 + }, + { + "epoch": 41.914992272024726, + "grad_norm": 0.4095945358276367, + "learning_rate": 3.707995856167107e-05, + "loss": 0.2443, + "num_input_tokens_seen": 20751296, + "step": 13580 + }, + { + "epoch": 41.93044822256569, + "grad_norm": 0.496228963136673, + "learning_rate": 3.707136232214534e-05, + "loss": 0.2183, + "num_input_tokens_seen": 20759744, + "step": 13585 + }, + { + "epoch": 41.94590417310665, + "grad_norm": 0.4468134939670563, + "learning_rate": 3.7062764221063844e-05, + "loss": 0.3219, + "num_input_tokens_seen": 20767072, + "step": 13590 + }, + { + "epoch": 41.9613601236476, + "grad_norm": 0.49693506956100464, + "learning_rate": 3.705416425975252e-05, + "loss": 0.2953, + "num_input_tokens_seen": 20774528, + "step": 13595 + }, + { + "epoch": 41.97681607418856, + "grad_norm": 0.5834470987319946, + "learning_rate": 3.704556243953758e-05, + "loss": 0.2798, + "num_input_tokens_seen": 20782016, + "step": 13600 + }, + { + "epoch": 41.97681607418856, + "eval_loss": 0.32343047857284546, + "eval_runtime": 6.2668, + "eval_samples_per_second": 91.754, + "eval_steps_per_second": 22.978, + "num_input_tokens_seen": 20782016, + "step": 13600 + }, + { + "epoch": 41.99227202472952, + "grad_norm": 1.0365673303604126, + "learning_rate": 3.7036958761745535e-05, + "loss": 0.2776, + "num_input_tokens_seen": 20789248, + "step": 13605 + }, + { + "epoch": 42.00618238021638, + "grad_norm": 0.5089357495307922, + "learning_rate": 3.702835322770318e-05, + "loss": 0.2425, + "num_input_tokens_seen": 20796032, + "step": 13610 + }, + { + "epoch": 42.02163833075734, + "grad_norm": 0.41125166416168213, + "learning_rate": 3.701974583873761e-05, + "loss": 0.2422, + "num_input_tokens_seen": 20803904, + "step": 13615 + }, + { + "epoch": 42.0370942812983, + "grad_norm": 0.5421667695045471, + "learning_rate": 3.701113659617618e-05, + "loss": 0.2638, + "num_input_tokens_seen": 20810784, + "step": 13620 + }, + { + "epoch": 42.052550231839255, + "grad_norm": 0.5001305341720581, + "learning_rate": 3.7002525501346535e-05, + "loss": 0.2703, + "num_input_tokens_seen": 20818752, + "step": 13625 + }, + { + "epoch": 42.068006182380216, + "grad_norm": 0.372221440076828, + "learning_rate": 3.699391255557664e-05, + "loss": 0.3028, + "num_input_tokens_seen": 20826656, + "step": 13630 + }, + { + "epoch": 42.08346213292118, + "grad_norm": 0.42728325724601746, + "learning_rate": 3.69852977601947e-05, + "loss": 0.2597, + "num_input_tokens_seen": 20834240, + "step": 13635 + }, + { + "epoch": 42.09891808346213, + "grad_norm": 0.3599265217781067, + "learning_rate": 3.697668111652922e-05, + "loss": 0.2663, + "num_input_tokens_seen": 20841792, + "step": 13640 + }, + { + "epoch": 42.11437403400309, + "grad_norm": 0.5496059060096741, + "learning_rate": 3.6968062625909005e-05, + "loss": 0.2738, + "num_input_tokens_seen": 20850048, + "step": 13645 + }, + { + "epoch": 42.12982998454405, + "grad_norm": 0.4658702313899994, + "learning_rate": 3.6959442289663135e-05, + "loss": 0.2249, + "num_input_tokens_seen": 20857472, + "step": 13650 + }, + { + "epoch": 42.145285935085006, + "grad_norm": 0.3452933728694916, + "learning_rate": 3.695082010912098e-05, + "loss": 0.2879, + "num_input_tokens_seen": 20865184, + "step": 13655 + }, + { + "epoch": 42.16074188562597, + "grad_norm": 0.5360816717147827, + "learning_rate": 3.694219608561217e-05, + "loss": 0.3253, + "num_input_tokens_seen": 20873248, + "step": 13660 + }, + { + "epoch": 42.17619783616692, + "grad_norm": 0.5538164377212524, + "learning_rate": 3.693357022046665e-05, + "loss": 0.2553, + "num_input_tokens_seen": 20880992, + "step": 13665 + }, + { + "epoch": 42.19165378670788, + "grad_norm": 0.44397643208503723, + "learning_rate": 3.6924942515014644e-05, + "loss": 0.3248, + "num_input_tokens_seen": 20888960, + "step": 13670 + }, + { + "epoch": 42.20710973724884, + "grad_norm": 0.45188573002815247, + "learning_rate": 3.691631297058664e-05, + "loss": 0.2678, + "num_input_tokens_seen": 20896672, + "step": 13675 + }, + { + "epoch": 42.222565687789796, + "grad_norm": 0.5431495904922485, + "learning_rate": 3.6907681588513424e-05, + "loss": 0.2301, + "num_input_tokens_seen": 20904192, + "step": 13680 + }, + { + "epoch": 42.23802163833076, + "grad_norm": 0.4647657871246338, + "learning_rate": 3.689904837012606e-05, + "loss": 0.2877, + "num_input_tokens_seen": 20911712, + "step": 13685 + }, + { + "epoch": 42.25347758887172, + "grad_norm": 0.4909169673919678, + "learning_rate": 3.689041331675591e-05, + "loss": 0.2753, + "num_input_tokens_seen": 20919392, + "step": 13690 + }, + { + "epoch": 42.26893353941267, + "grad_norm": 0.4251636862754822, + "learning_rate": 3.688177642973461e-05, + "loss": 0.2314, + "num_input_tokens_seen": 20926752, + "step": 13695 + }, + { + "epoch": 42.28438948995363, + "grad_norm": 0.6579927802085876, + "learning_rate": 3.687313771039406e-05, + "loss": 0.3122, + "num_input_tokens_seen": 20934752, + "step": 13700 + }, + { + "epoch": 42.29984544049459, + "grad_norm": 0.37501227855682373, + "learning_rate": 3.686449716006647e-05, + "loss": 0.2048, + "num_input_tokens_seen": 20942016, + "step": 13705 + }, + { + "epoch": 42.31530139103555, + "grad_norm": 0.5216811299324036, + "learning_rate": 3.685585478008432e-05, + "loss": 0.2191, + "num_input_tokens_seen": 20949312, + "step": 13710 + }, + { + "epoch": 42.33075734157651, + "grad_norm": 0.37672901153564453, + "learning_rate": 3.6847210571780364e-05, + "loss": 0.3463, + "num_input_tokens_seen": 20956640, + "step": 13715 + }, + { + "epoch": 42.34621329211747, + "grad_norm": 0.41250619292259216, + "learning_rate": 3.683856453648767e-05, + "loss": 0.2305, + "num_input_tokens_seen": 20964064, + "step": 13720 + }, + { + "epoch": 42.36166924265842, + "grad_norm": 0.612395167350769, + "learning_rate": 3.682991667553954e-05, + "loss": 0.2692, + "num_input_tokens_seen": 20971712, + "step": 13725 + }, + { + "epoch": 42.37712519319938, + "grad_norm": 0.45617055892944336, + "learning_rate": 3.6821266990269606e-05, + "loss": 0.2568, + "num_input_tokens_seen": 20979616, + "step": 13730 + }, + { + "epoch": 42.39258114374034, + "grad_norm": 0.5788162350654602, + "learning_rate": 3.681261548201174e-05, + "loss": 0.3053, + "num_input_tokens_seen": 20986944, + "step": 13735 + }, + { + "epoch": 42.4080370942813, + "grad_norm": 0.49446648359298706, + "learning_rate": 3.6803962152100125e-05, + "loss": 0.3581, + "num_input_tokens_seen": 20994528, + "step": 13740 + }, + { + "epoch": 42.42349304482226, + "grad_norm": 0.6050933003425598, + "learning_rate": 3.67953070018692e-05, + "loss": 0.2167, + "num_input_tokens_seen": 21002272, + "step": 13745 + }, + { + "epoch": 42.43894899536321, + "grad_norm": 0.5227155089378357, + "learning_rate": 3.678665003265371e-05, + "loss": 0.2508, + "num_input_tokens_seen": 21009664, + "step": 13750 + }, + { + "epoch": 42.45440494590417, + "grad_norm": 0.25583258271217346, + "learning_rate": 3.677799124578867e-05, + "loss": 0.1906, + "num_input_tokens_seen": 21017088, + "step": 13755 + }, + { + "epoch": 42.469860896445134, + "grad_norm": 0.43215665221214294, + "learning_rate": 3.676933064260937e-05, + "loss": 0.2413, + "num_input_tokens_seen": 21024288, + "step": 13760 + }, + { + "epoch": 42.48531684698609, + "grad_norm": 0.4435845911502838, + "learning_rate": 3.6760668224451365e-05, + "loss": 0.308, + "num_input_tokens_seen": 21032416, + "step": 13765 + }, + { + "epoch": 42.50077279752705, + "grad_norm": 0.5856855511665344, + "learning_rate": 3.675200399265054e-05, + "loss": 0.3075, + "num_input_tokens_seen": 21039232, + "step": 13770 + }, + { + "epoch": 42.51622874806801, + "grad_norm": 0.37153205275535583, + "learning_rate": 3.6743337948543014e-05, + "loss": 0.2266, + "num_input_tokens_seen": 21047104, + "step": 13775 + }, + { + "epoch": 42.53168469860896, + "grad_norm": 0.6741389036178589, + "learning_rate": 3.6734670093465204e-05, + "loss": 0.2304, + "num_input_tokens_seen": 21054400, + "step": 13780 + }, + { + "epoch": 42.547140649149924, + "grad_norm": 0.24861064553260803, + "learning_rate": 3.672600042875379e-05, + "loss": 0.241, + "num_input_tokens_seen": 21062016, + "step": 13785 + }, + { + "epoch": 42.56259659969088, + "grad_norm": 0.5838108062744141, + "learning_rate": 3.671732895574575e-05, + "loss": 0.2387, + "num_input_tokens_seen": 21070176, + "step": 13790 + }, + { + "epoch": 42.57805255023184, + "grad_norm": 0.7237746715545654, + "learning_rate": 3.670865567577834e-05, + "loss": 0.3065, + "num_input_tokens_seen": 21077856, + "step": 13795 + }, + { + "epoch": 42.5935085007728, + "grad_norm": 0.48044532537460327, + "learning_rate": 3.669998059018909e-05, + "loss": 0.2724, + "num_input_tokens_seen": 21085440, + "step": 13800 + }, + { + "epoch": 42.5935085007728, + "eval_loss": 0.32274046540260315, + "eval_runtime": 6.3094, + "eval_samples_per_second": 91.134, + "eval_steps_per_second": 22.823, + "num_input_tokens_seen": 21085440, + "step": 13800 + }, + { + "epoch": 42.60896445131375, + "grad_norm": 0.48136278986930847, + "learning_rate": 3.6691303700315796e-05, + "loss": 0.3154, + "num_input_tokens_seen": 21093312, + "step": 13805 + }, + { + "epoch": 42.624420401854714, + "grad_norm": 0.857109010219574, + "learning_rate": 3.668262500749655e-05, + "loss": 0.2909, + "num_input_tokens_seen": 21101024, + "step": 13810 + }, + { + "epoch": 42.639876352395675, + "grad_norm": 0.6593394875526428, + "learning_rate": 3.667394451306971e-05, + "loss": 0.2709, + "num_input_tokens_seen": 21108128, + "step": 13815 + }, + { + "epoch": 42.65533230293663, + "grad_norm": 0.4592283070087433, + "learning_rate": 3.666526221837393e-05, + "loss": 0.2251, + "num_input_tokens_seen": 21115936, + "step": 13820 + }, + { + "epoch": 42.67078825347759, + "grad_norm": 0.6585188508033752, + "learning_rate": 3.665657812474812e-05, + "loss": 0.3453, + "num_input_tokens_seen": 21123488, + "step": 13825 + }, + { + "epoch": 42.68624420401855, + "grad_norm": 0.6196401119232178, + "learning_rate": 3.664789223353147e-05, + "loss": 0.2352, + "num_input_tokens_seen": 21131616, + "step": 13830 + }, + { + "epoch": 42.701700154559504, + "grad_norm": 0.5145904421806335, + "learning_rate": 3.663920454606347e-05, + "loss": 0.2686, + "num_input_tokens_seen": 21139776, + "step": 13835 + }, + { + "epoch": 42.717156105100464, + "grad_norm": 0.4179391860961914, + "learning_rate": 3.6630515063683856e-05, + "loss": 0.2644, + "num_input_tokens_seen": 21147136, + "step": 13840 + }, + { + "epoch": 42.732612055641425, + "grad_norm": 0.30970704555511475, + "learning_rate": 3.662182378773267e-05, + "loss": 0.3031, + "num_input_tokens_seen": 21155264, + "step": 13845 + }, + { + "epoch": 42.74806800618238, + "grad_norm": 0.3590744435787201, + "learning_rate": 3.66131307195502e-05, + "loss": 0.3291, + "num_input_tokens_seen": 21162848, + "step": 13850 + }, + { + "epoch": 42.76352395672334, + "grad_norm": 0.6009094715118408, + "learning_rate": 3.6604435860477034e-05, + "loss": 0.2709, + "num_input_tokens_seen": 21170336, + "step": 13855 + }, + { + "epoch": 42.778979907264294, + "grad_norm": 0.5153399705886841, + "learning_rate": 3.6595739211854025e-05, + "loss": 0.2095, + "num_input_tokens_seen": 21178336, + "step": 13860 + }, + { + "epoch": 42.794435857805254, + "grad_norm": 0.43230077624320984, + "learning_rate": 3.658704077502231e-05, + "loss": 0.3415, + "num_input_tokens_seen": 21185824, + "step": 13865 + }, + { + "epoch": 42.809891808346215, + "grad_norm": 0.35635703802108765, + "learning_rate": 3.65783405513233e-05, + "loss": 0.2982, + "num_input_tokens_seen": 21193568, + "step": 13870 + }, + { + "epoch": 42.82534775888717, + "grad_norm": 0.48729756474494934, + "learning_rate": 3.656963854209867e-05, + "loss": 0.2378, + "num_input_tokens_seen": 21200800, + "step": 13875 + }, + { + "epoch": 42.84080370942813, + "grad_norm": 0.4996480643749237, + "learning_rate": 3.656093474869038e-05, + "loss": 0.2507, + "num_input_tokens_seen": 21208896, + "step": 13880 + }, + { + "epoch": 42.85625965996909, + "grad_norm": 0.3303280770778656, + "learning_rate": 3.655222917244068e-05, + "loss": 0.2165, + "num_input_tokens_seen": 21217376, + "step": 13885 + }, + { + "epoch": 42.871715610510044, + "grad_norm": 0.46984541416168213, + "learning_rate": 3.6543521814692054e-05, + "loss": 0.2416, + "num_input_tokens_seen": 21224864, + "step": 13890 + }, + { + "epoch": 42.887171561051005, + "grad_norm": 0.3740047216415405, + "learning_rate": 3.653481267678731e-05, + "loss": 0.2952, + "num_input_tokens_seen": 21232320, + "step": 13895 + }, + { + "epoch": 42.902627511591966, + "grad_norm": 0.5144497156143188, + "learning_rate": 3.652610176006949e-05, + "loss": 0.3085, + "num_input_tokens_seen": 21239424, + "step": 13900 + }, + { + "epoch": 42.91808346213292, + "grad_norm": 0.42223307490348816, + "learning_rate": 3.6517389065881925e-05, + "loss": 0.2684, + "num_input_tokens_seen": 21246528, + "step": 13905 + }, + { + "epoch": 42.93353941267388, + "grad_norm": 0.4501033425331116, + "learning_rate": 3.650867459556824e-05, + "loss": 0.2187, + "num_input_tokens_seen": 21254400, + "step": 13910 + }, + { + "epoch": 42.948995363214834, + "grad_norm": 0.921261191368103, + "learning_rate": 3.64999583504723e-05, + "loss": 0.3295, + "num_input_tokens_seen": 21262016, + "step": 13915 + }, + { + "epoch": 42.964451313755795, + "grad_norm": 0.5360414981842041, + "learning_rate": 3.649124033193827e-05, + "loss": 0.2513, + "num_input_tokens_seen": 21269376, + "step": 13920 + }, + { + "epoch": 42.979907264296756, + "grad_norm": 0.3849940598011017, + "learning_rate": 3.648252054131057e-05, + "loss": 0.3398, + "num_input_tokens_seen": 21277216, + "step": 13925 + }, + { + "epoch": 42.99536321483771, + "grad_norm": 0.469023197889328, + "learning_rate": 3.647379897993391e-05, + "loss": 0.2137, + "num_input_tokens_seen": 21284832, + "step": 13930 + }, + { + "epoch": 43.009273570324574, + "grad_norm": 0.9656540155410767, + "learning_rate": 3.646507564915325e-05, + "loss": 0.2837, + "num_input_tokens_seen": 21291488, + "step": 13935 + }, + { + "epoch": 43.024729520865534, + "grad_norm": 0.36914148926734924, + "learning_rate": 3.645635055031385e-05, + "loss": 0.2528, + "num_input_tokens_seen": 21299008, + "step": 13940 + }, + { + "epoch": 43.04018547140649, + "grad_norm": 0.6820820569992065, + "learning_rate": 3.6447623684761224e-05, + "loss": 0.3741, + "num_input_tokens_seen": 21306688, + "step": 13945 + }, + { + "epoch": 43.05564142194745, + "grad_norm": 0.49069976806640625, + "learning_rate": 3.643889505384117e-05, + "loss": 0.3068, + "num_input_tokens_seen": 21314368, + "step": 13950 + }, + { + "epoch": 43.07109737248841, + "grad_norm": 0.32117101550102234, + "learning_rate": 3.6430164658899744e-05, + "loss": 0.2121, + "num_input_tokens_seen": 21322304, + "step": 13955 + }, + { + "epoch": 43.086553323029364, + "grad_norm": 0.3749277591705322, + "learning_rate": 3.642143250128329e-05, + "loss": 0.2479, + "num_input_tokens_seen": 21330560, + "step": 13960 + }, + { + "epoch": 43.102009273570324, + "grad_norm": 0.5789420008659363, + "learning_rate": 3.641269858233841e-05, + "loss": 0.2572, + "num_input_tokens_seen": 21337696, + "step": 13965 + }, + { + "epoch": 43.117465224111285, + "grad_norm": 0.546373724937439, + "learning_rate": 3.640396290341199e-05, + "loss": 0.2315, + "num_input_tokens_seen": 21345600, + "step": 13970 + }, + { + "epoch": 43.13292117465224, + "grad_norm": 0.5015021562576294, + "learning_rate": 3.639522546585118e-05, + "loss": 0.243, + "num_input_tokens_seen": 21353088, + "step": 13975 + }, + { + "epoch": 43.1483771251932, + "grad_norm": 0.8835340142250061, + "learning_rate": 3.6386486271003404e-05, + "loss": 0.2286, + "num_input_tokens_seen": 21360896, + "step": 13980 + }, + { + "epoch": 43.16383307573416, + "grad_norm": 0.587333083152771, + "learning_rate": 3.6377745320216346e-05, + "loss": 0.2315, + "num_input_tokens_seen": 21368256, + "step": 13985 + }, + { + "epoch": 43.179289026275114, + "grad_norm": 0.5407262444496155, + "learning_rate": 3.636900261483798e-05, + "loss": 0.3157, + "num_input_tokens_seen": 21376128, + "step": 13990 + }, + { + "epoch": 43.194744976816075, + "grad_norm": 0.6197136044502258, + "learning_rate": 3.636025815621654e-05, + "loss": 0.2707, + "num_input_tokens_seen": 21383968, + "step": 13995 + }, + { + "epoch": 43.210200927357036, + "grad_norm": 0.7015054225921631, + "learning_rate": 3.635151194570054e-05, + "loss": 0.2989, + "num_input_tokens_seen": 21391616, + "step": 14000 + }, + { + "epoch": 43.210200927357036, + "eval_loss": 0.321704238653183, + "eval_runtime": 6.27, + "eval_samples_per_second": 91.706, + "eval_steps_per_second": 22.966, + "num_input_tokens_seen": 21391616, + "step": 14000 + }, + { + "epoch": 43.22565687789799, + "grad_norm": 0.4422539472579956, + "learning_rate": 3.634276398463873e-05, + "loss": 0.2213, + "num_input_tokens_seen": 21399360, + "step": 14005 + }, + { + "epoch": 43.24111282843895, + "grad_norm": 0.2966010570526123, + "learning_rate": 3.633401427438018e-05, + "loss": 0.2798, + "num_input_tokens_seen": 21406240, + "step": 14010 + }, + { + "epoch": 43.256568778979904, + "grad_norm": 0.707040548324585, + "learning_rate": 3.63252628162742e-05, + "loss": 0.3722, + "num_input_tokens_seen": 21414272, + "step": 14015 + }, + { + "epoch": 43.272024729520865, + "grad_norm": 0.5937957763671875, + "learning_rate": 3.6316509611670364e-05, + "loss": 0.3006, + "num_input_tokens_seen": 21421600, + "step": 14020 + }, + { + "epoch": 43.287480680061826, + "grad_norm": 0.32738208770751953, + "learning_rate": 3.630775466191854e-05, + "loss": 0.22, + "num_input_tokens_seen": 21429920, + "step": 14025 + }, + { + "epoch": 43.30293663060278, + "grad_norm": 0.5392811894416809, + "learning_rate": 3.629899796836884e-05, + "loss": 0.264, + "num_input_tokens_seen": 21437216, + "step": 14030 + }, + { + "epoch": 43.31839258114374, + "grad_norm": 0.6855170130729675, + "learning_rate": 3.6290239532371666e-05, + "loss": 0.2538, + "num_input_tokens_seen": 21444576, + "step": 14035 + }, + { + "epoch": 43.3338485316847, + "grad_norm": 0.5233397483825684, + "learning_rate": 3.628147935527767e-05, + "loss": 0.249, + "num_input_tokens_seen": 21452192, + "step": 14040 + }, + { + "epoch": 43.349304482225655, + "grad_norm": 0.7088304162025452, + "learning_rate": 3.627271743843779e-05, + "loss": 0.32, + "num_input_tokens_seen": 21459808, + "step": 14045 + }, + { + "epoch": 43.364760432766616, + "grad_norm": 0.39498186111450195, + "learning_rate": 3.626395378320321e-05, + "loss": 0.2733, + "num_input_tokens_seen": 21467744, + "step": 14050 + }, + { + "epoch": 43.38021638330758, + "grad_norm": 0.476036012172699, + "learning_rate": 3.625518839092541e-05, + "loss": 0.2439, + "num_input_tokens_seen": 21475424, + "step": 14055 + }, + { + "epoch": 43.39567233384853, + "grad_norm": 0.3943033218383789, + "learning_rate": 3.624642126295612e-05, + "loss": 0.3126, + "num_input_tokens_seen": 21482496, + "step": 14060 + }, + { + "epoch": 43.41112828438949, + "grad_norm": 0.7503748536109924, + "learning_rate": 3.6237652400647345e-05, + "loss": 0.2968, + "num_input_tokens_seen": 21489824, + "step": 14065 + }, + { + "epoch": 43.426584234930445, + "grad_norm": 0.5474340915679932, + "learning_rate": 3.622888180535134e-05, + "loss": 0.2156, + "num_input_tokens_seen": 21497280, + "step": 14070 + }, + { + "epoch": 43.442040185471406, + "grad_norm": 0.5552473068237305, + "learning_rate": 3.6220109478420655e-05, + "loss": 0.2594, + "num_input_tokens_seen": 21504640, + "step": 14075 + }, + { + "epoch": 43.45749613601237, + "grad_norm": 0.4796220064163208, + "learning_rate": 3.6211335421208084e-05, + "loss": 0.2763, + "num_input_tokens_seen": 21512320, + "step": 14080 + }, + { + "epoch": 43.47295208655332, + "grad_norm": 0.6664683222770691, + "learning_rate": 3.62025596350667e-05, + "loss": 0.2771, + "num_input_tokens_seen": 21520256, + "step": 14085 + }, + { + "epoch": 43.48840803709428, + "grad_norm": 0.576176106929779, + "learning_rate": 3.619378212134984e-05, + "loss": 0.2583, + "num_input_tokens_seen": 21528096, + "step": 14090 + }, + { + "epoch": 43.50386398763524, + "grad_norm": 0.4102894961833954, + "learning_rate": 3.618500288141111e-05, + "loss": 0.2271, + "num_input_tokens_seen": 21535488, + "step": 14095 + }, + { + "epoch": 43.519319938176196, + "grad_norm": 0.25697508454322815, + "learning_rate": 3.617622191660438e-05, + "loss": 0.2556, + "num_input_tokens_seen": 21543168, + "step": 14100 + }, + { + "epoch": 43.53477588871716, + "grad_norm": 0.5918314456939697, + "learning_rate": 3.616743922828377e-05, + "loss": 0.245, + "num_input_tokens_seen": 21550656, + "step": 14105 + }, + { + "epoch": 43.55023183925812, + "grad_norm": 0.2809102535247803, + "learning_rate": 3.615865481780371e-05, + "loss": 0.2451, + "num_input_tokens_seen": 21558144, + "step": 14110 + }, + { + "epoch": 43.56568778979907, + "grad_norm": 0.46640628576278687, + "learning_rate": 3.614986868651883e-05, + "loss": 0.2298, + "num_input_tokens_seen": 21565760, + "step": 14115 + }, + { + "epoch": 43.58114374034003, + "grad_norm": 0.6473149061203003, + "learning_rate": 3.614108083578409e-05, + "loss": 0.2624, + "num_input_tokens_seen": 21573472, + "step": 14120 + }, + { + "epoch": 43.59659969088099, + "grad_norm": 0.49279680848121643, + "learning_rate": 3.613229126695467e-05, + "loss": 0.2695, + "num_input_tokens_seen": 21580864, + "step": 14125 + }, + { + "epoch": 43.61205564142195, + "grad_norm": 0.5887806415557861, + "learning_rate": 3.612349998138605e-05, + "loss": 0.2155, + "num_input_tokens_seen": 21589024, + "step": 14130 + }, + { + "epoch": 43.62751159196291, + "grad_norm": 0.41317930817604065, + "learning_rate": 3.6114706980433946e-05, + "loss": 0.2499, + "num_input_tokens_seen": 21596576, + "step": 14135 + }, + { + "epoch": 43.64296754250386, + "grad_norm": 0.3085242807865143, + "learning_rate": 3.610591226545435e-05, + "loss": 0.2692, + "num_input_tokens_seen": 21604480, + "step": 14140 + }, + { + "epoch": 43.65842349304482, + "grad_norm": 0.4412974715232849, + "learning_rate": 3.6097115837803505e-05, + "loss": 0.2543, + "num_input_tokens_seen": 21611936, + "step": 14145 + }, + { + "epoch": 43.67387944358578, + "grad_norm": 0.33325499296188354, + "learning_rate": 3.608831769883795e-05, + "loss": 0.2444, + "num_input_tokens_seen": 21619776, + "step": 14150 + }, + { + "epoch": 43.689335394126736, + "grad_norm": 0.48152005672454834, + "learning_rate": 3.607951784991446e-05, + "loss": 0.2699, + "num_input_tokens_seen": 21627616, + "step": 14155 + }, + { + "epoch": 43.7047913446677, + "grad_norm": 0.4691998362541199, + "learning_rate": 3.6070716292390085e-05, + "loss": 0.3326, + "num_input_tokens_seen": 21635136, + "step": 14160 + }, + { + "epoch": 43.72024729520866, + "grad_norm": 0.6445431113243103, + "learning_rate": 3.606191302762213e-05, + "loss": 0.2485, + "num_input_tokens_seen": 21642400, + "step": 14165 + }, + { + "epoch": 43.73570324574961, + "grad_norm": 0.745509684085846, + "learning_rate": 3.605310805696818e-05, + "loss": 0.3331, + "num_input_tokens_seen": 21650400, + "step": 14170 + }, + { + "epoch": 43.75115919629057, + "grad_norm": 0.47958359122276306, + "learning_rate": 3.6044301381786067e-05, + "loss": 0.2555, + "num_input_tokens_seen": 21658304, + "step": 14175 + }, + { + "epoch": 43.76661514683153, + "grad_norm": 0.5007508993148804, + "learning_rate": 3.6035493003433883e-05, + "loss": 0.2788, + "num_input_tokens_seen": 21666208, + "step": 14180 + }, + { + "epoch": 43.78207109737249, + "grad_norm": 0.43658408522605896, + "learning_rate": 3.6026682923269994e-05, + "loss": 0.2198, + "num_input_tokens_seen": 21673600, + "step": 14185 + }, + { + "epoch": 43.79752704791345, + "grad_norm": 0.46105989813804626, + "learning_rate": 3.6017871142653034e-05, + "loss": 0.2989, + "num_input_tokens_seen": 21681408, + "step": 14190 + }, + { + "epoch": 43.8129829984544, + "grad_norm": 0.5262457132339478, + "learning_rate": 3.600905766294189e-05, + "loss": 0.2663, + "num_input_tokens_seen": 21689152, + "step": 14195 + }, + { + "epoch": 43.82843894899536, + "grad_norm": 0.6207807660102844, + "learning_rate": 3.60002424854957e-05, + "loss": 0.2997, + "num_input_tokens_seen": 21696768, + "step": 14200 + }, + { + "epoch": 43.82843894899536, + "eval_loss": 0.32105186581611633, + "eval_runtime": 6.29, + "eval_samples_per_second": 91.415, + "eval_steps_per_second": 22.894, + "num_input_tokens_seen": 21696768, + "step": 14200 + }, + { + "epoch": 43.84389489953632, + "grad_norm": 0.6822209358215332, + "learning_rate": 3.5991425611673876e-05, + "loss": 0.3682, + "num_input_tokens_seen": 21704480, + "step": 14205 + }, + { + "epoch": 43.85935085007728, + "grad_norm": 0.36109721660614014, + "learning_rate": 3.5982607042836105e-05, + "loss": 0.2174, + "num_input_tokens_seen": 21711808, + "step": 14210 + }, + { + "epoch": 43.87480680061824, + "grad_norm": 0.5395616292953491, + "learning_rate": 3.597378678034231e-05, + "loss": 0.2245, + "num_input_tokens_seen": 21719744, + "step": 14215 + }, + { + "epoch": 43.8902627511592, + "grad_norm": 0.6104339957237244, + "learning_rate": 3.596496482555269e-05, + "loss": 0.242, + "num_input_tokens_seen": 21727648, + "step": 14220 + }, + { + "epoch": 43.90571870170015, + "grad_norm": 0.690853476524353, + "learning_rate": 3.595614117982769e-05, + "loss": 0.3193, + "num_input_tokens_seen": 21735392, + "step": 14225 + }, + { + "epoch": 43.92117465224111, + "grad_norm": 0.42248275876045227, + "learning_rate": 3.594731584452805e-05, + "loss": 0.2129, + "num_input_tokens_seen": 21742976, + "step": 14230 + }, + { + "epoch": 43.936630602782074, + "grad_norm": 0.5229154229164124, + "learning_rate": 3.593848882101472e-05, + "loss": 0.2955, + "num_input_tokens_seen": 21750880, + "step": 14235 + }, + { + "epoch": 43.95208655332303, + "grad_norm": 0.5050122737884521, + "learning_rate": 3.592966011064896e-05, + "loss": 0.2562, + "num_input_tokens_seen": 21758848, + "step": 14240 + }, + { + "epoch": 43.96754250386399, + "grad_norm": 0.3765043318271637, + "learning_rate": 3.592082971479226e-05, + "loss": 0.3425, + "num_input_tokens_seen": 21766656, + "step": 14245 + }, + { + "epoch": 43.98299845440495, + "grad_norm": 0.7008997201919556, + "learning_rate": 3.5911997634806385e-05, + "loss": 0.3059, + "num_input_tokens_seen": 21773760, + "step": 14250 + }, + { + "epoch": 43.9984544049459, + "grad_norm": 0.44311094284057617, + "learning_rate": 3.5903163872053336e-05, + "loss": 0.271, + "num_input_tokens_seen": 21781536, + "step": 14255 + }, + { + "epoch": 44.01236476043277, + "grad_norm": 0.4463657736778259, + "learning_rate": 3.58943284278954e-05, + "loss": 0.2457, + "num_input_tokens_seen": 21788016, + "step": 14260 + }, + { + "epoch": 44.02782071097373, + "grad_norm": 0.3654845058917999, + "learning_rate": 3.588549130369512e-05, + "loss": 0.2673, + "num_input_tokens_seen": 21795664, + "step": 14265 + }, + { + "epoch": 44.04327666151468, + "grad_norm": 0.8714410662651062, + "learning_rate": 3.5876652500815274e-05, + "loss": 0.2156, + "num_input_tokens_seen": 21803920, + "step": 14270 + }, + { + "epoch": 44.05873261205564, + "grad_norm": 0.4076497554779053, + "learning_rate": 3.586781202061894e-05, + "loss": 0.255, + "num_input_tokens_seen": 21811312, + "step": 14275 + }, + { + "epoch": 44.074188562596596, + "grad_norm": 0.5460442900657654, + "learning_rate": 3.585896986446942e-05, + "loss": 0.2747, + "num_input_tokens_seen": 21818896, + "step": 14280 + }, + { + "epoch": 44.08964451313756, + "grad_norm": 0.6512629985809326, + "learning_rate": 3.585012603373028e-05, + "loss": 0.3395, + "num_input_tokens_seen": 21826288, + "step": 14285 + }, + { + "epoch": 44.10510046367852, + "grad_norm": 0.3549835681915283, + "learning_rate": 3.584128052976535e-05, + "loss": 0.2599, + "num_input_tokens_seen": 21834288, + "step": 14290 + }, + { + "epoch": 44.12055641421947, + "grad_norm": 0.34578466415405273, + "learning_rate": 3.5832433353938724e-05, + "loss": 0.2479, + "num_input_tokens_seen": 21841648, + "step": 14295 + }, + { + "epoch": 44.13601236476043, + "grad_norm": 0.6387801170349121, + "learning_rate": 3.5823584507614746e-05, + "loss": 0.2533, + "num_input_tokens_seen": 21849232, + "step": 14300 + }, + { + "epoch": 44.15146831530139, + "grad_norm": 0.814266562461853, + "learning_rate": 3.581473399215802e-05, + "loss": 0.2364, + "num_input_tokens_seen": 21856720, + "step": 14305 + }, + { + "epoch": 44.16692426584235, + "grad_norm": 0.5496731400489807, + "learning_rate": 3.580588180893341e-05, + "loss": 0.273, + "num_input_tokens_seen": 21864656, + "step": 14310 + }, + { + "epoch": 44.18238021638331, + "grad_norm": 1.092163324356079, + "learning_rate": 3.579702795930602e-05, + "loss": 0.2543, + "num_input_tokens_seen": 21872176, + "step": 14315 + }, + { + "epoch": 44.19783616692427, + "grad_norm": 0.41228142380714417, + "learning_rate": 3.578817244464125e-05, + "loss": 0.208, + "num_input_tokens_seen": 21879792, + "step": 14320 + }, + { + "epoch": 44.21329211746522, + "grad_norm": 0.4766063392162323, + "learning_rate": 3.577931526630471e-05, + "loss": 0.2354, + "num_input_tokens_seen": 21887216, + "step": 14325 + }, + { + "epoch": 44.22874806800618, + "grad_norm": 0.614540159702301, + "learning_rate": 3.577045642566229e-05, + "loss": 0.2789, + "num_input_tokens_seen": 21894928, + "step": 14330 + }, + { + "epoch": 44.244204018547144, + "grad_norm": 0.43378156423568726, + "learning_rate": 3.576159592408014e-05, + "loss": 0.2648, + "num_input_tokens_seen": 21902672, + "step": 14335 + }, + { + "epoch": 44.2596599690881, + "grad_norm": 0.5748655796051025, + "learning_rate": 3.575273376292466e-05, + "loss": 0.295, + "num_input_tokens_seen": 21910352, + "step": 14340 + }, + { + "epoch": 44.27511591962906, + "grad_norm": 1.039884090423584, + "learning_rate": 3.574386994356251e-05, + "loss": 0.3044, + "num_input_tokens_seen": 21918480, + "step": 14345 + }, + { + "epoch": 44.29057187017001, + "grad_norm": 0.4741714894771576, + "learning_rate": 3.573500446736059e-05, + "loss": 0.2459, + "num_input_tokens_seen": 21925648, + "step": 14350 + }, + { + "epoch": 44.30602782071097, + "grad_norm": 0.7626579403877258, + "learning_rate": 3.5726137335686094e-05, + "loss": 0.2982, + "num_input_tokens_seen": 21933488, + "step": 14355 + }, + { + "epoch": 44.321483771251934, + "grad_norm": 0.6127970814704895, + "learning_rate": 3.571726854990642e-05, + "loss": 0.2644, + "num_input_tokens_seen": 21940848, + "step": 14360 + }, + { + "epoch": 44.33693972179289, + "grad_norm": 1.0687636137008667, + "learning_rate": 3.570839811138925e-05, + "loss": 0.337, + "num_input_tokens_seen": 21949072, + "step": 14365 + }, + { + "epoch": 44.35239567233385, + "grad_norm": 0.35114070773124695, + "learning_rate": 3.569952602150252e-05, + "loss": 0.251, + "num_input_tokens_seen": 21956816, + "step": 14370 + }, + { + "epoch": 44.36785162287481, + "grad_norm": 0.4424717426300049, + "learning_rate": 3.569065228161442e-05, + "loss": 0.2981, + "num_input_tokens_seen": 21964112, + "step": 14375 + }, + { + "epoch": 44.38330757341576, + "grad_norm": 0.6663159728050232, + "learning_rate": 3.5681776893093395e-05, + "loss": 0.2334, + "num_input_tokens_seen": 21971920, + "step": 14380 + }, + { + "epoch": 44.398763523956724, + "grad_norm": 0.7737947106361389, + "learning_rate": 3.5672899857308134e-05, + "loss": 0.3304, + "num_input_tokens_seen": 21979344, + "step": 14385 + }, + { + "epoch": 44.414219474497685, + "grad_norm": 0.39692285656929016, + "learning_rate": 3.566402117562759e-05, + "loss": 0.313, + "num_input_tokens_seen": 21987024, + "step": 14390 + }, + { + "epoch": 44.42967542503864, + "grad_norm": 0.3525376617908478, + "learning_rate": 3.565514084942097e-05, + "loss": 0.2592, + "num_input_tokens_seen": 21994448, + "step": 14395 + }, + { + "epoch": 44.4451313755796, + "grad_norm": 0.5287626385688782, + "learning_rate": 3.564625888005773e-05, + "loss": 0.2512, + "num_input_tokens_seen": 22001488, + "step": 14400 + }, + { + "epoch": 44.4451313755796, + "eval_loss": 0.3205941319465637, + "eval_runtime": 6.3204, + "eval_samples_per_second": 90.976, + "eval_steps_per_second": 22.784, + "num_input_tokens_seen": 22001488, + "step": 14400 + }, + { + "epoch": 44.46058732612055, + "grad_norm": 0.5194658041000366, + "learning_rate": 3.563737526890759e-05, + "loss": 0.3315, + "num_input_tokens_seen": 22009456, + "step": 14405 + }, + { + "epoch": 44.476043276661514, + "grad_norm": 0.4201609790325165, + "learning_rate": 3.562849001734049e-05, + "loss": 0.2474, + "num_input_tokens_seen": 22017424, + "step": 14410 + }, + { + "epoch": 44.491499227202475, + "grad_norm": 0.4316908121109009, + "learning_rate": 3.561960312672667e-05, + "loss": 0.217, + "num_input_tokens_seen": 22025200, + "step": 14415 + }, + { + "epoch": 44.50695517774343, + "grad_norm": 0.7261341214179993, + "learning_rate": 3.5610714598436596e-05, + "loss": 0.2516, + "num_input_tokens_seen": 22033232, + "step": 14420 + }, + { + "epoch": 44.52241112828439, + "grad_norm": 0.4827432930469513, + "learning_rate": 3.5601824433840986e-05, + "loss": 0.2457, + "num_input_tokens_seen": 22040624, + "step": 14425 + }, + { + "epoch": 44.53786707882535, + "grad_norm": 0.5418756604194641, + "learning_rate": 3.559293263431082e-05, + "loss": 0.2667, + "num_input_tokens_seen": 22048528, + "step": 14430 + }, + { + "epoch": 44.553323029366304, + "grad_norm": 0.5807805061340332, + "learning_rate": 3.558403920121732e-05, + "loss": 0.2948, + "num_input_tokens_seen": 22056464, + "step": 14435 + }, + { + "epoch": 44.568778979907265, + "grad_norm": 0.5938735604286194, + "learning_rate": 3.557514413593197e-05, + "loss": 0.2571, + "num_input_tokens_seen": 22064592, + "step": 14440 + }, + { + "epoch": 44.584234930448225, + "grad_norm": 0.39336875081062317, + "learning_rate": 3.55662474398265e-05, + "loss": 0.2576, + "num_input_tokens_seen": 22072208, + "step": 14445 + }, + { + "epoch": 44.59969088098918, + "grad_norm": 0.44110044836997986, + "learning_rate": 3.555734911427288e-05, + "loss": 0.2251, + "num_input_tokens_seen": 22079696, + "step": 14450 + }, + { + "epoch": 44.61514683153014, + "grad_norm": 0.9974276423454285, + "learning_rate": 3.5548449160643363e-05, + "loss": 0.254, + "num_input_tokens_seen": 22086864, + "step": 14455 + }, + { + "epoch": 44.630602782071094, + "grad_norm": 0.5383582711219788, + "learning_rate": 3.553954758031043e-05, + "loss": 0.269, + "num_input_tokens_seen": 22095216, + "step": 14460 + }, + { + "epoch": 44.646058732612055, + "grad_norm": 0.6671627759933472, + "learning_rate": 3.5530644374646815e-05, + "loss": 0.2783, + "num_input_tokens_seen": 22102832, + "step": 14465 + }, + { + "epoch": 44.661514683153015, + "grad_norm": 0.7147595882415771, + "learning_rate": 3.552173954502549e-05, + "loss": 0.2404, + "num_input_tokens_seen": 22110896, + "step": 14470 + }, + { + "epoch": 44.67697063369397, + "grad_norm": 0.4684768319129944, + "learning_rate": 3.55128330928197e-05, + "loss": 0.3382, + "num_input_tokens_seen": 22118416, + "step": 14475 + }, + { + "epoch": 44.69242658423493, + "grad_norm": 0.3367319107055664, + "learning_rate": 3.550392501940294e-05, + "loss": 0.2506, + "num_input_tokens_seen": 22125744, + "step": 14480 + }, + { + "epoch": 44.70788253477589, + "grad_norm": 0.49183422327041626, + "learning_rate": 3.5495015326148945e-05, + "loss": 0.2722, + "num_input_tokens_seen": 22133584, + "step": 14485 + }, + { + "epoch": 44.723338485316845, + "grad_norm": 0.4865533709526062, + "learning_rate": 3.548610401443169e-05, + "loss": 0.2795, + "num_input_tokens_seen": 22141264, + "step": 14490 + }, + { + "epoch": 44.738794435857805, + "grad_norm": 0.6017870306968689, + "learning_rate": 3.547719108562543e-05, + "loss": 0.3004, + "num_input_tokens_seen": 22149104, + "step": 14495 + }, + { + "epoch": 44.754250386398766, + "grad_norm": 0.48695114254951477, + "learning_rate": 3.546827654110464e-05, + "loss": 0.2203, + "num_input_tokens_seen": 22156944, + "step": 14500 + }, + { + "epoch": 44.76970633693972, + "grad_norm": 0.5728229880332947, + "learning_rate": 3.545936038224405e-05, + "loss": 0.2214, + "num_input_tokens_seen": 22164368, + "step": 14505 + }, + { + "epoch": 44.78516228748068, + "grad_norm": 0.45526835322380066, + "learning_rate": 3.545044261041864e-05, + "loss": 0.2438, + "num_input_tokens_seen": 22172496, + "step": 14510 + }, + { + "epoch": 44.80061823802164, + "grad_norm": 0.42624038457870483, + "learning_rate": 3.5441523227003657e-05, + "loss": 0.2145, + "num_input_tokens_seen": 22180112, + "step": 14515 + }, + { + "epoch": 44.816074188562595, + "grad_norm": 0.7557064294815063, + "learning_rate": 3.543260223337459e-05, + "loss": 0.2394, + "num_input_tokens_seen": 22187120, + "step": 14520 + }, + { + "epoch": 44.831530139103556, + "grad_norm": 0.5791212320327759, + "learning_rate": 3.542367963090714e-05, + "loss": 0.2564, + "num_input_tokens_seen": 22195152, + "step": 14525 + }, + { + "epoch": 44.84698608964451, + "grad_norm": 0.6766620874404907, + "learning_rate": 3.5414755420977295e-05, + "loss": 0.3444, + "num_input_tokens_seen": 22202192, + "step": 14530 + }, + { + "epoch": 44.86244204018547, + "grad_norm": 0.4410088062286377, + "learning_rate": 3.54058296049613e-05, + "loss": 0.2592, + "num_input_tokens_seen": 22209872, + "step": 14535 + }, + { + "epoch": 44.87789799072643, + "grad_norm": 0.5745851397514343, + "learning_rate": 3.53969021842356e-05, + "loss": 0.3317, + "num_input_tokens_seen": 22217232, + "step": 14540 + }, + { + "epoch": 44.893353941267385, + "grad_norm": 0.5391982197761536, + "learning_rate": 3.5387973160176926e-05, + "loss": 0.2824, + "num_input_tokens_seen": 22224720, + "step": 14545 + }, + { + "epoch": 44.908809891808346, + "grad_norm": 0.742670476436615, + "learning_rate": 3.537904253416224e-05, + "loss": 0.3277, + "num_input_tokens_seen": 22232592, + "step": 14550 + }, + { + "epoch": 44.92426584234931, + "grad_norm": 0.37013736367225647, + "learning_rate": 3.537011030756878e-05, + "loss": 0.2526, + "num_input_tokens_seen": 22239920, + "step": 14555 + }, + { + "epoch": 44.93972179289026, + "grad_norm": 0.40351372957229614, + "learning_rate": 3.536117648177399e-05, + "loss": 0.2523, + "num_input_tokens_seen": 22247536, + "step": 14560 + }, + { + "epoch": 44.95517774343122, + "grad_norm": 0.39307117462158203, + "learning_rate": 3.535224105815558e-05, + "loss": 0.2255, + "num_input_tokens_seen": 22255056, + "step": 14565 + }, + { + "epoch": 44.97063369397218, + "grad_norm": 0.37278875708580017, + "learning_rate": 3.5343304038091494e-05, + "loss": 0.2192, + "num_input_tokens_seen": 22262896, + "step": 14570 + }, + { + "epoch": 44.986089644513136, + "grad_norm": 0.6142448782920837, + "learning_rate": 3.5334365422959955e-05, + "loss": 0.2892, + "num_input_tokens_seen": 22270384, + "step": 14575 + }, + { + "epoch": 45.0, + "grad_norm": 1.098564624786377, + "learning_rate": 3.5325425214139396e-05, + "loss": 0.28, + "num_input_tokens_seen": 22276688, + "step": 14580 + }, + { + "epoch": 45.01545595054096, + "grad_norm": 0.6343981623649597, + "learning_rate": 3.531648341300851e-05, + "loss": 0.2931, + "num_input_tokens_seen": 22284144, + "step": 14585 + }, + { + "epoch": 45.030911901081915, + "grad_norm": 0.3408297896385193, + "learning_rate": 3.530754002094623e-05, + "loss": 0.2383, + "num_input_tokens_seen": 22291792, + "step": 14590 + }, + { + "epoch": 45.046367851622875, + "grad_norm": 0.4498598575592041, + "learning_rate": 3.529859503933175e-05, + "loss": 0.219, + "num_input_tokens_seen": 22299568, + "step": 14595 + }, + { + "epoch": 45.061823802163836, + "grad_norm": 0.5778750777244568, + "learning_rate": 3.52896484695445e-05, + "loss": 0.2577, + "num_input_tokens_seen": 22307216, + "step": 14600 + }, + { + "epoch": 45.061823802163836, + "eval_loss": 0.32277873158454895, + "eval_runtime": 6.3018, + "eval_samples_per_second": 91.244, + "eval_steps_per_second": 22.851, + "num_input_tokens_seen": 22307216, + "step": 14600 + }, + { + "epoch": 45.07727975270479, + "grad_norm": 0.44991588592529297, + "learning_rate": 3.528070031296414e-05, + "loss": 0.2349, + "num_input_tokens_seen": 22314864, + "step": 14605 + }, + { + "epoch": 45.09273570324575, + "grad_norm": 1.0038070678710938, + "learning_rate": 3.5271750570970605e-05, + "loss": 0.3288, + "num_input_tokens_seen": 22322576, + "step": 14610 + }, + { + "epoch": 45.108191653786704, + "grad_norm": 0.4619998633861542, + "learning_rate": 3.526279924494405e-05, + "loss": 0.253, + "num_input_tokens_seen": 22330544, + "step": 14615 + }, + { + "epoch": 45.123647604327665, + "grad_norm": 0.344941109418869, + "learning_rate": 3.5253846336264874e-05, + "loss": 0.2115, + "num_input_tokens_seen": 22338448, + "step": 14620 + }, + { + "epoch": 45.139103554868626, + "grad_norm": 0.28441792726516724, + "learning_rate": 3.5244891846313736e-05, + "loss": 0.1913, + "num_input_tokens_seen": 22345744, + "step": 14625 + }, + { + "epoch": 45.15455950540958, + "grad_norm": 0.6554847955703735, + "learning_rate": 3.5235935776471527e-05, + "loss": 0.2662, + "num_input_tokens_seen": 22353328, + "step": 14630 + }, + { + "epoch": 45.17001545595054, + "grad_norm": 0.5812717080116272, + "learning_rate": 3.522697812811939e-05, + "loss": 0.2741, + "num_input_tokens_seen": 22361008, + "step": 14635 + }, + { + "epoch": 45.1854714064915, + "grad_norm": 0.34142255783081055, + "learning_rate": 3.521801890263871e-05, + "loss": 0.2455, + "num_input_tokens_seen": 22368272, + "step": 14640 + }, + { + "epoch": 45.200927357032455, + "grad_norm": 0.37588340044021606, + "learning_rate": 3.5209058101411114e-05, + "loss": 0.2702, + "num_input_tokens_seen": 22375856, + "step": 14645 + }, + { + "epoch": 45.216383307573416, + "grad_norm": 0.6931722164154053, + "learning_rate": 3.520009572581845e-05, + "loss": 0.2867, + "num_input_tokens_seen": 22383440, + "step": 14650 + }, + { + "epoch": 45.23183925811438, + "grad_norm": 0.5510474443435669, + "learning_rate": 3.519113177724285e-05, + "loss": 0.2742, + "num_input_tokens_seen": 22390576, + "step": 14655 + }, + { + "epoch": 45.24729520865533, + "grad_norm": 0.3575286269187927, + "learning_rate": 3.5182166257066656e-05, + "loss": 0.2803, + "num_input_tokens_seen": 22398672, + "step": 14660 + }, + { + "epoch": 45.26275115919629, + "grad_norm": 0.3955017924308777, + "learning_rate": 3.517319916667247e-05, + "loss": 0.2858, + "num_input_tokens_seen": 22406256, + "step": 14665 + }, + { + "epoch": 45.27820710973725, + "grad_norm": 0.42738813161849976, + "learning_rate": 3.516423050744313e-05, + "loss": 0.1902, + "num_input_tokens_seen": 22413936, + "step": 14670 + }, + { + "epoch": 45.293663060278206, + "grad_norm": 0.3982919454574585, + "learning_rate": 3.5155260280761704e-05, + "loss": 0.2808, + "num_input_tokens_seen": 22420752, + "step": 14675 + }, + { + "epoch": 45.30911901081917, + "grad_norm": 0.5295397639274597, + "learning_rate": 3.514628848801154e-05, + "loss": 0.2607, + "num_input_tokens_seen": 22428432, + "step": 14680 + }, + { + "epoch": 45.32457496136012, + "grad_norm": 0.6211256384849548, + "learning_rate": 3.5137315130576174e-05, + "loss": 0.2649, + "num_input_tokens_seen": 22436656, + "step": 14685 + }, + { + "epoch": 45.34003091190108, + "grad_norm": 0.6561644673347473, + "learning_rate": 3.512834020983942e-05, + "loss": 0.2552, + "num_input_tokens_seen": 22444144, + "step": 14690 + }, + { + "epoch": 45.35548686244204, + "grad_norm": 0.5991784930229187, + "learning_rate": 3.5119363727185334e-05, + "loss": 0.2889, + "num_input_tokens_seen": 22451536, + "step": 14695 + }, + { + "epoch": 45.370942812982996, + "grad_norm": 0.9413470029830933, + "learning_rate": 3.511038568399819e-05, + "loss": 0.3659, + "num_input_tokens_seen": 22458864, + "step": 14700 + }, + { + "epoch": 45.38639876352396, + "grad_norm": 0.8585204482078552, + "learning_rate": 3.510140608166251e-05, + "loss": 0.2788, + "num_input_tokens_seen": 22466544, + "step": 14705 + }, + { + "epoch": 45.40185471406492, + "grad_norm": 0.3299621641635895, + "learning_rate": 3.509242492156308e-05, + "loss": 0.3025, + "num_input_tokens_seen": 22474608, + "step": 14710 + }, + { + "epoch": 45.41731066460587, + "grad_norm": 0.8452068567276001, + "learning_rate": 3.5083442205084896e-05, + "loss": 0.3824, + "num_input_tokens_seen": 22481808, + "step": 14715 + }, + { + "epoch": 45.43276661514683, + "grad_norm": 0.4975554347038269, + "learning_rate": 3.507445793361321e-05, + "loss": 0.2228, + "num_input_tokens_seen": 22489392, + "step": 14720 + }, + { + "epoch": 45.44822256568779, + "grad_norm": 0.564784586429596, + "learning_rate": 3.5065472108533505e-05, + "loss": 0.2354, + "num_input_tokens_seen": 22497040, + "step": 14725 + }, + { + "epoch": 45.46367851622875, + "grad_norm": 0.5430232882499695, + "learning_rate": 3.5056484731231504e-05, + "loss": 0.2947, + "num_input_tokens_seen": 22504656, + "step": 14730 + }, + { + "epoch": 45.47913446676971, + "grad_norm": 0.4942256510257721, + "learning_rate": 3.504749580309319e-05, + "loss": 0.208, + "num_input_tokens_seen": 22512272, + "step": 14735 + }, + { + "epoch": 45.49459041731066, + "grad_norm": 0.5689105987548828, + "learning_rate": 3.5038505325504753e-05, + "loss": 0.2022, + "num_input_tokens_seen": 22520496, + "step": 14740 + }, + { + "epoch": 45.51004636785162, + "grad_norm": 0.7372163534164429, + "learning_rate": 3.502951329985264e-05, + "loss": 0.2554, + "num_input_tokens_seen": 22528176, + "step": 14745 + }, + { + "epoch": 45.52550231839258, + "grad_norm": 0.4365151822566986, + "learning_rate": 3.502051972752354e-05, + "loss": 0.2524, + "num_input_tokens_seen": 22536112, + "step": 14750 + }, + { + "epoch": 45.54095826893354, + "grad_norm": 0.5945054888725281, + "learning_rate": 3.5011524609904374e-05, + "loss": 0.3101, + "num_input_tokens_seen": 22543312, + "step": 14755 + }, + { + "epoch": 45.5564142194745, + "grad_norm": 0.924187958240509, + "learning_rate": 3.50025279483823e-05, + "loss": 0.2118, + "num_input_tokens_seen": 22551024, + "step": 14760 + }, + { + "epoch": 45.57187017001546, + "grad_norm": 0.4760534465312958, + "learning_rate": 3.499352974434472e-05, + "loss": 0.2493, + "num_input_tokens_seen": 22558512, + "step": 14765 + }, + { + "epoch": 45.58732612055641, + "grad_norm": 0.6613631844520569, + "learning_rate": 3.498452999917926e-05, + "loss": 0.2867, + "num_input_tokens_seen": 22566128, + "step": 14770 + }, + { + "epoch": 45.60278207109737, + "grad_norm": 0.492745041847229, + "learning_rate": 3.4975528714273795e-05, + "loss": 0.2382, + "num_input_tokens_seen": 22573328, + "step": 14775 + }, + { + "epoch": 45.618238021638334, + "grad_norm": 0.4409641623497009, + "learning_rate": 3.4966525891016454e-05, + "loss": 0.2609, + "num_input_tokens_seen": 22580880, + "step": 14780 + }, + { + "epoch": 45.63369397217929, + "grad_norm": 0.7877684235572815, + "learning_rate": 3.495752153079557e-05, + "loss": 0.2637, + "num_input_tokens_seen": 22588880, + "step": 14785 + }, + { + "epoch": 45.64914992272025, + "grad_norm": 0.9460625648498535, + "learning_rate": 3.494851563499974e-05, + "loss": 0.2381, + "num_input_tokens_seen": 22596880, + "step": 14790 + }, + { + "epoch": 45.66460587326121, + "grad_norm": 0.35789185762405396, + "learning_rate": 3.493950820501777e-05, + "loss": 0.2802, + "num_input_tokens_seen": 22604464, + "step": 14795 + }, + { + "epoch": 45.68006182380216, + "grad_norm": 0.35306498408317566, + "learning_rate": 3.493049924223872e-05, + "loss": 0.256, + "num_input_tokens_seen": 22612016, + "step": 14800 + }, + { + "epoch": 45.68006182380216, + "eval_loss": 0.3205316960811615, + "eval_runtime": 6.3136, + "eval_samples_per_second": 91.073, + "eval_steps_per_second": 22.808, + "num_input_tokens_seen": 22612016, + "step": 14800 + }, + { + "epoch": 45.695517774343124, + "grad_norm": 0.6995148658752441, + "learning_rate": 3.49214887480519e-05, + "loss": 0.3892, + "num_input_tokens_seen": 22620080, + "step": 14805 + }, + { + "epoch": 45.71097372488408, + "grad_norm": 0.537512481212616, + "learning_rate": 3.4912476723846834e-05, + "loss": 0.2901, + "num_input_tokens_seen": 22627856, + "step": 14810 + }, + { + "epoch": 45.72642967542504, + "grad_norm": 0.5057835578918457, + "learning_rate": 3.490346317101328e-05, + "loss": 0.281, + "num_input_tokens_seen": 22634864, + "step": 14815 + }, + { + "epoch": 45.741885625966, + "grad_norm": 1.2056019306182861, + "learning_rate": 3.4894448090941266e-05, + "loss": 0.2458, + "num_input_tokens_seen": 22642448, + "step": 14820 + }, + { + "epoch": 45.75734157650695, + "grad_norm": 0.4047897458076477, + "learning_rate": 3.488543148502101e-05, + "loss": 0.2862, + "num_input_tokens_seen": 22649872, + "step": 14825 + }, + { + "epoch": 45.77279752704791, + "grad_norm": 0.5056330561637878, + "learning_rate": 3.487641335464299e-05, + "loss": 0.2775, + "num_input_tokens_seen": 22657680, + "step": 14830 + }, + { + "epoch": 45.788253477588874, + "grad_norm": 0.8056687712669373, + "learning_rate": 3.4867393701197914e-05, + "loss": 0.3957, + "num_input_tokens_seen": 22665296, + "step": 14835 + }, + { + "epoch": 45.80370942812983, + "grad_norm": 0.7953941822052002, + "learning_rate": 3.485837252607673e-05, + "loss": 0.2764, + "num_input_tokens_seen": 22673168, + "step": 14840 + }, + { + "epoch": 45.81916537867079, + "grad_norm": 0.5825561881065369, + "learning_rate": 3.4849349830670615e-05, + "loss": 0.2632, + "num_input_tokens_seen": 22680880, + "step": 14845 + }, + { + "epoch": 45.83462132921175, + "grad_norm": 0.39454415440559387, + "learning_rate": 3.4840325616370976e-05, + "loss": 0.26, + "num_input_tokens_seen": 22688176, + "step": 14850 + }, + { + "epoch": 45.8500772797527, + "grad_norm": 0.48550450801849365, + "learning_rate": 3.483129988456947e-05, + "loss": 0.2406, + "num_input_tokens_seen": 22696368, + "step": 14855 + }, + { + "epoch": 45.865533230293664, + "grad_norm": 0.7708332538604736, + "learning_rate": 3.482227263665797e-05, + "loss": 0.2301, + "num_input_tokens_seen": 22704112, + "step": 14860 + }, + { + "epoch": 45.88098918083462, + "grad_norm": 0.6748814582824707, + "learning_rate": 3.48132438740286e-05, + "loss": 0.3271, + "num_input_tokens_seen": 22711920, + "step": 14865 + }, + { + "epoch": 45.89644513137558, + "grad_norm": 0.5269950032234192, + "learning_rate": 3.48042135980737e-05, + "loss": 0.2839, + "num_input_tokens_seen": 22719760, + "step": 14870 + }, + { + "epoch": 45.91190108191654, + "grad_norm": 0.4313519597053528, + "learning_rate": 3.479518181018586e-05, + "loss": 0.219, + "num_input_tokens_seen": 22727568, + "step": 14875 + }, + { + "epoch": 45.92735703245749, + "grad_norm": 0.3637554347515106, + "learning_rate": 3.4786148511757886e-05, + "loss": 0.2852, + "num_input_tokens_seen": 22735440, + "step": 14880 + }, + { + "epoch": 45.942812982998454, + "grad_norm": 0.8375332355499268, + "learning_rate": 3.477711370418284e-05, + "loss": 0.2087, + "num_input_tokens_seen": 22743344, + "step": 14885 + }, + { + "epoch": 45.958268933539415, + "grad_norm": 0.627729058265686, + "learning_rate": 3.476807738885399e-05, + "loss": 0.2512, + "num_input_tokens_seen": 22751088, + "step": 14890 + }, + { + "epoch": 45.97372488408037, + "grad_norm": 0.5067960023880005, + "learning_rate": 3.475903956716485e-05, + "loss": 0.2079, + "num_input_tokens_seen": 22759056, + "step": 14895 + }, + { + "epoch": 45.98918083462133, + "grad_norm": 0.7160232067108154, + "learning_rate": 3.475000024050917e-05, + "loss": 0.3043, + "num_input_tokens_seen": 22766672, + "step": 14900 + }, + { + "epoch": 46.003091190108194, + "grad_norm": 0.8672434687614441, + "learning_rate": 3.4740959410280926e-05, + "loss": 0.2704, + "num_input_tokens_seen": 22772816, + "step": 14905 + }, + { + "epoch": 46.01854714064915, + "grad_norm": 0.5611032247543335, + "learning_rate": 3.4731917077874324e-05, + "loss": 0.2697, + "num_input_tokens_seen": 22780624, + "step": 14910 + }, + { + "epoch": 46.03400309119011, + "grad_norm": 0.2885143458843231, + "learning_rate": 3.4722873244683816e-05, + "loss": 0.2415, + "num_input_tokens_seen": 22787920, + "step": 14915 + }, + { + "epoch": 46.04945904173107, + "grad_norm": 0.4603523313999176, + "learning_rate": 3.4713827912104065e-05, + "loss": 0.2731, + "num_input_tokens_seen": 22796528, + "step": 14920 + }, + { + "epoch": 46.06491499227202, + "grad_norm": 0.31535884737968445, + "learning_rate": 3.470478108152998e-05, + "loss": 0.3032, + "num_input_tokens_seen": 22803984, + "step": 14925 + }, + { + "epoch": 46.08037094281298, + "grad_norm": 0.6037811040878296, + "learning_rate": 3.4695732754356695e-05, + "loss": 0.2173, + "num_input_tokens_seen": 22811600, + "step": 14930 + }, + { + "epoch": 46.095826893353944, + "grad_norm": 0.528181791305542, + "learning_rate": 3.4686682931979576e-05, + "loss": 0.2667, + "num_input_tokens_seen": 22819248, + "step": 14935 + }, + { + "epoch": 46.1112828438949, + "grad_norm": 0.6260986328125, + "learning_rate": 3.467763161579422e-05, + "loss": 0.3002, + "num_input_tokens_seen": 22826448, + "step": 14940 + }, + { + "epoch": 46.12673879443586, + "grad_norm": 0.4452609717845917, + "learning_rate": 3.466857880719645e-05, + "loss": 0.2585, + "num_input_tokens_seen": 22834160, + "step": 14945 + }, + { + "epoch": 46.14219474497681, + "grad_norm": 0.4563736319541931, + "learning_rate": 3.465952450758233e-05, + "loss": 0.2816, + "num_input_tokens_seen": 22841840, + "step": 14950 + }, + { + "epoch": 46.15765069551777, + "grad_norm": 0.383274644613266, + "learning_rate": 3.4650468718348126e-05, + "loss": 0.2197, + "num_input_tokens_seen": 22849008, + "step": 14955 + }, + { + "epoch": 46.173106646058734, + "grad_norm": 0.26582205295562744, + "learning_rate": 3.464141144089038e-05, + "loss": 0.2634, + "num_input_tokens_seen": 22856816, + "step": 14960 + }, + { + "epoch": 46.18856259659969, + "grad_norm": 0.6306109428405762, + "learning_rate": 3.463235267660583e-05, + "loss": 0.247, + "num_input_tokens_seen": 22864144, + "step": 14965 + }, + { + "epoch": 46.20401854714065, + "grad_norm": 0.5875097513198853, + "learning_rate": 3.462329242689145e-05, + "loss": 0.2221, + "num_input_tokens_seen": 22871824, + "step": 14970 + }, + { + "epoch": 46.21947449768161, + "grad_norm": 0.40974879264831543, + "learning_rate": 3.461423069314444e-05, + "loss": 0.2326, + "num_input_tokens_seen": 22879376, + "step": 14975 + }, + { + "epoch": 46.23493044822256, + "grad_norm": 0.44374316930770874, + "learning_rate": 3.460516747676224e-05, + "loss": 0.2687, + "num_input_tokens_seen": 22887248, + "step": 14980 + }, + { + "epoch": 46.250386398763524, + "grad_norm": 0.5204342007637024, + "learning_rate": 3.459610277914251e-05, + "loss": 0.2601, + "num_input_tokens_seen": 22894992, + "step": 14985 + }, + { + "epoch": 46.265842349304485, + "grad_norm": 0.42809662222862244, + "learning_rate": 3.458703660168314e-05, + "loss": 0.2498, + "num_input_tokens_seen": 22902704, + "step": 14990 + }, + { + "epoch": 46.28129829984544, + "grad_norm": 0.45999079942703247, + "learning_rate": 3.457796894578224e-05, + "loss": 0.2682, + "num_input_tokens_seen": 22910672, + "step": 14995 + }, + { + "epoch": 46.2967542503864, + "grad_norm": 0.5032544732093811, + "learning_rate": 3.456889981283817e-05, + "loss": 0.2296, + "num_input_tokens_seen": 22917744, + "step": 15000 + }, + { + "epoch": 46.2967542503864, + "eval_loss": 0.3193829357624054, + "eval_runtime": 6.3951, + "eval_samples_per_second": 89.913, + "eval_steps_per_second": 22.517, + "num_input_tokens_seen": 22917744, + "step": 15000 + }, + { + "epoch": 46.31221020092736, + "grad_norm": 1.0776301622390747, + "learning_rate": 3.45598292042495e-05, + "loss": 0.2593, + "num_input_tokens_seen": 22925008, + "step": 15005 + }, + { + "epoch": 46.327666151468314, + "grad_norm": 0.419946551322937, + "learning_rate": 3.4550757121415035e-05, + "loss": 0.2146, + "num_input_tokens_seen": 22932368, + "step": 15010 + }, + { + "epoch": 46.343122102009275, + "grad_norm": 0.3884050250053406, + "learning_rate": 3.454168356573378e-05, + "loss": 0.2422, + "num_input_tokens_seen": 22940016, + "step": 15015 + }, + { + "epoch": 46.35857805255023, + "grad_norm": 0.4939338266849518, + "learning_rate": 3.453260853860503e-05, + "loss": 0.22, + "num_input_tokens_seen": 22947216, + "step": 15020 + }, + { + "epoch": 46.37403400309119, + "grad_norm": 0.6178098320960999, + "learning_rate": 3.452353204142824e-05, + "loss": 0.3045, + "num_input_tokens_seen": 22955824, + "step": 15025 + }, + { + "epoch": 46.38948995363215, + "grad_norm": 0.8149932026863098, + "learning_rate": 3.4514454075603136e-05, + "loss": 0.2462, + "num_input_tokens_seen": 22963696, + "step": 15030 + }, + { + "epoch": 46.404945904173104, + "grad_norm": 0.4467146098613739, + "learning_rate": 3.450537464252964e-05, + "loss": 0.2512, + "num_input_tokens_seen": 22970672, + "step": 15035 + }, + { + "epoch": 46.420401854714065, + "grad_norm": 0.7094013094902039, + "learning_rate": 3.4496293743607925e-05, + "loss": 0.289, + "num_input_tokens_seen": 22978384, + "step": 15040 + }, + { + "epoch": 46.435857805255026, + "grad_norm": 0.4409451484680176, + "learning_rate": 3.448721138023838e-05, + "loss": 0.2986, + "num_input_tokens_seen": 22985936, + "step": 15045 + }, + { + "epoch": 46.45131375579598, + "grad_norm": 0.533575177192688, + "learning_rate": 3.447812755382162e-05, + "loss": 0.2721, + "num_input_tokens_seen": 22993488, + "step": 15050 + }, + { + "epoch": 46.46676970633694, + "grad_norm": 0.3468853235244751, + "learning_rate": 3.446904226575847e-05, + "loss": 0.2538, + "num_input_tokens_seen": 23001520, + "step": 15055 + }, + { + "epoch": 46.4822256568779, + "grad_norm": 0.7084349393844604, + "learning_rate": 3.445995551745002e-05, + "loss": 0.2101, + "num_input_tokens_seen": 23009328, + "step": 15060 + }, + { + "epoch": 46.497681607418855, + "grad_norm": 0.42190322279930115, + "learning_rate": 3.445086731029753e-05, + "loss": 0.3085, + "num_input_tokens_seen": 23016912, + "step": 15065 + }, + { + "epoch": 46.513137557959816, + "grad_norm": 0.7313929200172424, + "learning_rate": 3.444177764570255e-05, + "loss": 0.3299, + "num_input_tokens_seen": 23024592, + "step": 15070 + }, + { + "epoch": 46.52859350850077, + "grad_norm": 0.4098299741744995, + "learning_rate": 3.44326865250668e-05, + "loss": 0.2928, + "num_input_tokens_seen": 23031952, + "step": 15075 + }, + { + "epoch": 46.54404945904173, + "grad_norm": 0.6591498255729675, + "learning_rate": 3.442359394979225e-05, + "loss": 0.3124, + "num_input_tokens_seen": 23039440, + "step": 15080 + }, + { + "epoch": 46.55950540958269, + "grad_norm": 0.37423980236053467, + "learning_rate": 3.441449992128108e-05, + "loss": 0.2219, + "num_input_tokens_seen": 23046416, + "step": 15085 + }, + { + "epoch": 46.574961360123645, + "grad_norm": 0.5162524580955505, + "learning_rate": 3.440540444093573e-05, + "loss": 0.2114, + "num_input_tokens_seen": 23054256, + "step": 15090 + }, + { + "epoch": 46.590417310664606, + "grad_norm": 0.6858354210853577, + "learning_rate": 3.43963075101588e-05, + "loss": 0.2403, + "num_input_tokens_seen": 23062672, + "step": 15095 + }, + { + "epoch": 46.605873261205566, + "grad_norm": 0.8237178921699524, + "learning_rate": 3.438720913035318e-05, + "loss": 0.2549, + "num_input_tokens_seen": 23070576, + "step": 15100 + }, + { + "epoch": 46.62132921174652, + "grad_norm": 0.5445351004600525, + "learning_rate": 3.437810930292195e-05, + "loss": 0.249, + "num_input_tokens_seen": 23078384, + "step": 15105 + }, + { + "epoch": 46.63678516228748, + "grad_norm": 0.5136814117431641, + "learning_rate": 3.43690080292684e-05, + "loss": 0.2638, + "num_input_tokens_seen": 23086032, + "step": 15110 + }, + { + "epoch": 46.65224111282844, + "grad_norm": 0.6592784523963928, + "learning_rate": 3.435990531079608e-05, + "loss": 0.2918, + "num_input_tokens_seen": 23093872, + "step": 15115 + }, + { + "epoch": 46.667697063369395, + "grad_norm": 0.4237399101257324, + "learning_rate": 3.435080114890874e-05, + "loss": 0.2316, + "num_input_tokens_seen": 23101296, + "step": 15120 + }, + { + "epoch": 46.683153013910356, + "grad_norm": 0.5086627006530762, + "learning_rate": 3.434169554501035e-05, + "loss": 0.2728, + "num_input_tokens_seen": 23109168, + "step": 15125 + }, + { + "epoch": 46.69860896445132, + "grad_norm": 0.6446278095245361, + "learning_rate": 3.433258850050511e-05, + "loss": 0.2582, + "num_input_tokens_seen": 23116624, + "step": 15130 + }, + { + "epoch": 46.71406491499227, + "grad_norm": 0.510057806968689, + "learning_rate": 3.4323480016797446e-05, + "loss": 0.2831, + "num_input_tokens_seen": 23124368, + "step": 15135 + }, + { + "epoch": 46.72952086553323, + "grad_norm": 0.4434848129749298, + "learning_rate": 3.4314370095291995e-05, + "loss": 0.2834, + "num_input_tokens_seen": 23132176, + "step": 15140 + }, + { + "epoch": 46.744976816074185, + "grad_norm": 0.6021254062652588, + "learning_rate": 3.430525873739363e-05, + "loss": 0.3081, + "num_input_tokens_seen": 23139888, + "step": 15145 + }, + { + "epoch": 46.760432766615146, + "grad_norm": 0.6405128240585327, + "learning_rate": 3.429614594450743e-05, + "loss": 0.3095, + "num_input_tokens_seen": 23147824, + "step": 15150 + }, + { + "epoch": 46.77588871715611, + "grad_norm": 0.682361900806427, + "learning_rate": 3.428703171803869e-05, + "loss": 0.2807, + "num_input_tokens_seen": 23155568, + "step": 15155 + }, + { + "epoch": 46.79134466769706, + "grad_norm": 0.4249824583530426, + "learning_rate": 3.4277916059392964e-05, + "loss": 0.2523, + "num_input_tokens_seen": 23163024, + "step": 15160 + }, + { + "epoch": 46.80680061823802, + "grad_norm": 0.3887919485569, + "learning_rate": 3.426879896997598e-05, + "loss": 0.2156, + "num_input_tokens_seen": 23170704, + "step": 15165 + }, + { + "epoch": 46.82225656877898, + "grad_norm": 0.5317782163619995, + "learning_rate": 3.425968045119372e-05, + "loss": 0.3148, + "num_input_tokens_seen": 23178192, + "step": 15170 + }, + { + "epoch": 46.837712519319936, + "grad_norm": 0.34987562894821167, + "learning_rate": 3.425056050445237e-05, + "loss": 0.2806, + "num_input_tokens_seen": 23185616, + "step": 15175 + }, + { + "epoch": 46.8531684698609, + "grad_norm": 0.5510576367378235, + "learning_rate": 3.4241439131158336e-05, + "loss": 0.2269, + "num_input_tokens_seen": 23193072, + "step": 15180 + }, + { + "epoch": 46.86862442040186, + "grad_norm": 0.7096011638641357, + "learning_rate": 3.423231633271825e-05, + "loss": 0.2494, + "num_input_tokens_seen": 23201232, + "step": 15185 + }, + { + "epoch": 46.88408037094281, + "grad_norm": 0.4397924840450287, + "learning_rate": 3.4223192110538985e-05, + "loss": 0.2849, + "num_input_tokens_seen": 23209424, + "step": 15190 + }, + { + "epoch": 46.89953632148377, + "grad_norm": 0.4831060767173767, + "learning_rate": 3.4214066466027575e-05, + "loss": 0.2951, + "num_input_tokens_seen": 23217104, + "step": 15195 + }, + { + "epoch": 46.914992272024726, + "grad_norm": 0.4750826060771942, + "learning_rate": 3.4204939400591325e-05, + "loss": 0.2335, + "num_input_tokens_seen": 23224720, + "step": 15200 + }, + { + "epoch": 46.914992272024726, + "eval_loss": 0.3180605173110962, + "eval_runtime": 6.3106, + "eval_samples_per_second": 91.116, + "eval_steps_per_second": 22.819, + "num_input_tokens_seen": 23224720, + "step": 15200 + }, + { + "epoch": 46.93044822256569, + "grad_norm": 0.4630907475948334, + "learning_rate": 3.419581091563775e-05, + "loss": 0.3393, + "num_input_tokens_seen": 23232080, + "step": 15205 + }, + { + "epoch": 46.94590417310665, + "grad_norm": 0.5276147723197937, + "learning_rate": 3.418668101257456e-05, + "loss": 0.3822, + "num_input_tokens_seen": 23240144, + "step": 15210 + }, + { + "epoch": 46.9613601236476, + "grad_norm": 0.376772940158844, + "learning_rate": 3.417754969280971e-05, + "loss": 0.2807, + "num_input_tokens_seen": 23247728, + "step": 15215 + }, + { + "epoch": 46.97681607418856, + "grad_norm": 0.5669904947280884, + "learning_rate": 3.416841695775137e-05, + "loss": 0.2027, + "num_input_tokens_seen": 23255376, + "step": 15220 + }, + { + "epoch": 46.99227202472952, + "grad_norm": 0.6492288708686829, + "learning_rate": 3.415928280880792e-05, + "loss": 0.2466, + "num_input_tokens_seen": 23263184, + "step": 15225 + }, + { + "epoch": 47.00618238021638, + "grad_norm": 0.3961988389492035, + "learning_rate": 3.4150147247387965e-05, + "loss": 0.245, + "num_input_tokens_seen": 23270752, + "step": 15230 + }, + { + "epoch": 47.02163833075734, + "grad_norm": 0.3591473400592804, + "learning_rate": 3.4141010274900306e-05, + "loss": 0.227, + "num_input_tokens_seen": 23278528, + "step": 15235 + }, + { + "epoch": 47.0370942812983, + "grad_norm": 0.5533758401870728, + "learning_rate": 3.413187189275399e-05, + "loss": 0.2544, + "num_input_tokens_seen": 23286592, + "step": 15240 + }, + { + "epoch": 47.052550231839255, + "grad_norm": 0.8133221864700317, + "learning_rate": 3.4122732102358265e-05, + "loss": 0.2705, + "num_input_tokens_seen": 23294240, + "step": 15245 + }, + { + "epoch": 47.068006182380216, + "grad_norm": 0.7000631093978882, + "learning_rate": 3.411359090512261e-05, + "loss": 0.2977, + "num_input_tokens_seen": 23301600, + "step": 15250 + }, + { + "epoch": 47.08346213292118, + "grad_norm": 0.4220869541168213, + "learning_rate": 3.410444830245672e-05, + "loss": 0.231, + "num_input_tokens_seen": 23309472, + "step": 15255 + }, + { + "epoch": 47.09891808346213, + "grad_norm": 0.7145861983299255, + "learning_rate": 3.409530429577048e-05, + "loss": 0.2312, + "num_input_tokens_seen": 23317376, + "step": 15260 + }, + { + "epoch": 47.11437403400309, + "grad_norm": 0.7187508344650269, + "learning_rate": 3.408615888647402e-05, + "loss": 0.2278, + "num_input_tokens_seen": 23324896, + "step": 15265 + }, + { + "epoch": 47.12982998454405, + "grad_norm": 0.7695075869560242, + "learning_rate": 3.4077012075977675e-05, + "loss": 0.2558, + "num_input_tokens_seen": 23332800, + "step": 15270 + }, + { + "epoch": 47.145285935085006, + "grad_norm": 0.3555656671524048, + "learning_rate": 3.4067863865692e-05, + "loss": 0.239, + "num_input_tokens_seen": 23340512, + "step": 15275 + }, + { + "epoch": 47.16074188562597, + "grad_norm": 0.41469210386276245, + "learning_rate": 3.4058714257027755e-05, + "loss": 0.2962, + "num_input_tokens_seen": 23348448, + "step": 15280 + }, + { + "epoch": 47.17619783616692, + "grad_norm": 0.5766644477844238, + "learning_rate": 3.404956325139594e-05, + "loss": 0.2411, + "num_input_tokens_seen": 23356576, + "step": 15285 + }, + { + "epoch": 47.19165378670788, + "grad_norm": 0.5426275134086609, + "learning_rate": 3.404041085020775e-05, + "loss": 0.2994, + "num_input_tokens_seen": 23364128, + "step": 15290 + }, + { + "epoch": 47.20710973724884, + "grad_norm": 0.531288206577301, + "learning_rate": 3.403125705487459e-05, + "loss": 0.2828, + "num_input_tokens_seen": 23371744, + "step": 15295 + }, + { + "epoch": 47.222565687789796, + "grad_norm": 0.6323620676994324, + "learning_rate": 3.402210186680811e-05, + "loss": 0.2137, + "num_input_tokens_seen": 23379040, + "step": 15300 + }, + { + "epoch": 47.23802163833076, + "grad_norm": 0.5622113347053528, + "learning_rate": 3.4012945287420137e-05, + "loss": 0.2167, + "num_input_tokens_seen": 23387136, + "step": 15305 + }, + { + "epoch": 47.25347758887172, + "grad_norm": 0.5842376947402954, + "learning_rate": 3.400378731812274e-05, + "loss": 0.2702, + "num_input_tokens_seen": 23394272, + "step": 15310 + }, + { + "epoch": 47.26893353941267, + "grad_norm": 0.704547643661499, + "learning_rate": 3.399462796032817e-05, + "loss": 0.2387, + "num_input_tokens_seen": 23401600, + "step": 15315 + }, + { + "epoch": 47.28438948995363, + "grad_norm": 0.5551791191101074, + "learning_rate": 3.3985467215448954e-05, + "loss": 0.33, + "num_input_tokens_seen": 23409120, + "step": 15320 + }, + { + "epoch": 47.29984544049459, + "grad_norm": 0.3413110077381134, + "learning_rate": 3.3976305084897776e-05, + "loss": 0.2061, + "num_input_tokens_seen": 23416640, + "step": 15325 + }, + { + "epoch": 47.31530139103555, + "grad_norm": 0.3489742875099182, + "learning_rate": 3.3967141570087544e-05, + "loss": 0.3428, + "num_input_tokens_seen": 23423936, + "step": 15330 + }, + { + "epoch": 47.33075734157651, + "grad_norm": 0.5715504288673401, + "learning_rate": 3.39579766724314e-05, + "loss": 0.2245, + "num_input_tokens_seen": 23431456, + "step": 15335 + }, + { + "epoch": 47.34621329211747, + "grad_norm": 0.2931464910507202, + "learning_rate": 3.3948810393342677e-05, + "loss": 0.2621, + "num_input_tokens_seen": 23438880, + "step": 15340 + }, + { + "epoch": 47.36166924265842, + "grad_norm": 0.7430688738822937, + "learning_rate": 3.3939642734234936e-05, + "loss": 0.2453, + "num_input_tokens_seen": 23446656, + "step": 15345 + }, + { + "epoch": 47.37712519319938, + "grad_norm": 0.5743961930274963, + "learning_rate": 3.393047369652194e-05, + "loss": 0.2567, + "num_input_tokens_seen": 23454656, + "step": 15350 + }, + { + "epoch": 47.39258114374034, + "grad_norm": 0.77553391456604, + "learning_rate": 3.3921303281617664e-05, + "loss": 0.2725, + "num_input_tokens_seen": 23462336, + "step": 15355 + }, + { + "epoch": 47.4080370942813, + "grad_norm": 0.5050553679466248, + "learning_rate": 3.391213149093632e-05, + "loss": 0.2271, + "num_input_tokens_seen": 23470464, + "step": 15360 + }, + { + "epoch": 47.42349304482226, + "grad_norm": 0.6782698631286621, + "learning_rate": 3.3902958325892303e-05, + "loss": 0.2484, + "num_input_tokens_seen": 23478304, + "step": 15365 + }, + { + "epoch": 47.43894899536321, + "grad_norm": 0.48489001393318176, + "learning_rate": 3.389378378790023e-05, + "loss": 0.2453, + "num_input_tokens_seen": 23485984, + "step": 15370 + }, + { + "epoch": 47.45440494590417, + "grad_norm": 0.4368564784526825, + "learning_rate": 3.388460787837493e-05, + "loss": 0.3124, + "num_input_tokens_seen": 23493408, + "step": 15375 + }, + { + "epoch": 47.469860896445134, + "grad_norm": 0.5235582590103149, + "learning_rate": 3.387543059873145e-05, + "loss": 0.3201, + "num_input_tokens_seen": 23500448, + "step": 15380 + }, + { + "epoch": 47.48531684698609, + "grad_norm": 0.6864351034164429, + "learning_rate": 3.386625195038503e-05, + "loss": 0.3846, + "num_input_tokens_seen": 23507840, + "step": 15385 + }, + { + "epoch": 47.50077279752705, + "grad_norm": 0.5929641127586365, + "learning_rate": 3.3857071934751136e-05, + "loss": 0.257, + "num_input_tokens_seen": 23515648, + "step": 15390 + }, + { + "epoch": 47.51622874806801, + "grad_norm": 0.3864864706993103, + "learning_rate": 3.384789055324544e-05, + "loss": 0.3156, + "num_input_tokens_seen": 23523424, + "step": 15395 + }, + { + "epoch": 47.53168469860896, + "grad_norm": 0.5417267680168152, + "learning_rate": 3.3838707807283843e-05, + "loss": 0.2468, + "num_input_tokens_seen": 23531040, + "step": 15400 + }, + { + "epoch": 47.53168469860896, + "eval_loss": 0.3173469305038452, + "eval_runtime": 6.313, + "eval_samples_per_second": 91.082, + "eval_steps_per_second": 22.81, + "num_input_tokens_seen": 23531040, + "step": 15400 + }, + { + "epoch": 47.547140649149924, + "grad_norm": 0.5309116244316101, + "learning_rate": 3.382952369828243e-05, + "loss": 0.3795, + "num_input_tokens_seen": 23539136, + "step": 15405 + }, + { + "epoch": 47.56259659969088, + "grad_norm": 0.3905530571937561, + "learning_rate": 3.38203382276575e-05, + "loss": 0.252, + "num_input_tokens_seen": 23547104, + "step": 15410 + }, + { + "epoch": 47.57805255023184, + "grad_norm": 0.561226487159729, + "learning_rate": 3.381115139682557e-05, + "loss": 0.2445, + "num_input_tokens_seen": 23554528, + "step": 15415 + }, + { + "epoch": 47.5935085007728, + "grad_norm": 0.4713115394115448, + "learning_rate": 3.3801963207203366e-05, + "loss": 0.2415, + "num_input_tokens_seen": 23563008, + "step": 15420 + }, + { + "epoch": 47.60896445131375, + "grad_norm": 0.4932906925678253, + "learning_rate": 3.379277366020782e-05, + "loss": 0.2783, + "num_input_tokens_seen": 23570752, + "step": 15425 + }, + { + "epoch": 47.624420401854714, + "grad_norm": 0.4353971779346466, + "learning_rate": 3.3783582757256085e-05, + "loss": 0.2986, + "num_input_tokens_seen": 23577984, + "step": 15430 + }, + { + "epoch": 47.639876352395675, + "grad_norm": 0.7052789926528931, + "learning_rate": 3.3774390499765504e-05, + "loss": 0.2647, + "num_input_tokens_seen": 23585280, + "step": 15435 + }, + { + "epoch": 47.65533230293663, + "grad_norm": 0.4613003730773926, + "learning_rate": 3.376519688915364e-05, + "loss": 0.2058, + "num_input_tokens_seen": 23592672, + "step": 15440 + }, + { + "epoch": 47.67078825347759, + "grad_norm": 0.7666558623313904, + "learning_rate": 3.3756001926838273e-05, + "loss": 0.2725, + "num_input_tokens_seen": 23600736, + "step": 15445 + }, + { + "epoch": 47.68624420401855, + "grad_norm": 0.39773109555244446, + "learning_rate": 3.374680561423737e-05, + "loss": 0.218, + "num_input_tokens_seen": 23608576, + "step": 15450 + }, + { + "epoch": 47.701700154559504, + "grad_norm": 0.3368648290634155, + "learning_rate": 3.373760795276912e-05, + "loss": 0.2235, + "num_input_tokens_seen": 23616096, + "step": 15455 + }, + { + "epoch": 47.717156105100464, + "grad_norm": 0.6017543077468872, + "learning_rate": 3.372840894385192e-05, + "loss": 0.274, + "num_input_tokens_seen": 23623424, + "step": 15460 + }, + { + "epoch": 47.732612055641425, + "grad_norm": 0.4972898066043854, + "learning_rate": 3.3719208588904375e-05, + "loss": 0.2217, + "num_input_tokens_seen": 23630816, + "step": 15465 + }, + { + "epoch": 47.74806800618238, + "grad_norm": 0.9636270999908447, + "learning_rate": 3.371000688934529e-05, + "loss": 0.3262, + "num_input_tokens_seen": 23637760, + "step": 15470 + }, + { + "epoch": 47.76352395672334, + "grad_norm": 0.6207860112190247, + "learning_rate": 3.370080384659369e-05, + "loss": 0.2958, + "num_input_tokens_seen": 23645280, + "step": 15475 + }, + { + "epoch": 47.778979907264294, + "grad_norm": 0.4447523355484009, + "learning_rate": 3.36915994620688e-05, + "loss": 0.2465, + "num_input_tokens_seen": 23652864, + "step": 15480 + }, + { + "epoch": 47.794435857805254, + "grad_norm": 0.49886077642440796, + "learning_rate": 3.3682393737190035e-05, + "loss": 0.3267, + "num_input_tokens_seen": 23660448, + "step": 15485 + }, + { + "epoch": 47.809891808346215, + "grad_norm": 0.7481591105461121, + "learning_rate": 3.3673186673377054e-05, + "loss": 0.2283, + "num_input_tokens_seen": 23668096, + "step": 15490 + }, + { + "epoch": 47.82534775888717, + "grad_norm": 0.45409736037254333, + "learning_rate": 3.366397827204969e-05, + "loss": 0.2927, + "num_input_tokens_seen": 23676032, + "step": 15495 + }, + { + "epoch": 47.84080370942813, + "grad_norm": 0.7038403749465942, + "learning_rate": 3.3654768534628e-05, + "loss": 0.3093, + "num_input_tokens_seen": 23683616, + "step": 15500 + }, + { + "epoch": 47.85625965996909, + "grad_norm": 0.940424919128418, + "learning_rate": 3.3645557462532245e-05, + "loss": 0.2894, + "num_input_tokens_seen": 23691296, + "step": 15505 + }, + { + "epoch": 47.871715610510044, + "grad_norm": 0.5090105533599854, + "learning_rate": 3.363634505718288e-05, + "loss": 0.2611, + "num_input_tokens_seen": 23698880, + "step": 15510 + }, + { + "epoch": 47.887171561051005, + "grad_norm": 0.34392622113227844, + "learning_rate": 3.362713132000057e-05, + "loss": 0.2039, + "num_input_tokens_seen": 23706496, + "step": 15515 + }, + { + "epoch": 47.902627511591966, + "grad_norm": 0.5057738423347473, + "learning_rate": 3.36179162524062e-05, + "loss": 0.3338, + "num_input_tokens_seen": 23714208, + "step": 15520 + }, + { + "epoch": 47.91808346213292, + "grad_norm": 0.8580009937286377, + "learning_rate": 3.3608699855820846e-05, + "loss": 0.2722, + "num_input_tokens_seen": 23721728, + "step": 15525 + }, + { + "epoch": 47.93353941267388, + "grad_norm": 0.35892385244369507, + "learning_rate": 3.359948213166578e-05, + "loss": 0.3046, + "num_input_tokens_seen": 23729184, + "step": 15530 + }, + { + "epoch": 47.948995363214834, + "grad_norm": 0.40362223982810974, + "learning_rate": 3.359026308136252e-05, + "loss": 0.2305, + "num_input_tokens_seen": 23737024, + "step": 15535 + }, + { + "epoch": 47.964451313755795, + "grad_norm": 0.4861481189727783, + "learning_rate": 3.358104270633272e-05, + "loss": 0.2413, + "num_input_tokens_seen": 23744160, + "step": 15540 + }, + { + "epoch": 47.979907264296756, + "grad_norm": 0.37147587537765503, + "learning_rate": 3.357182100799831e-05, + "loss": 0.2156, + "num_input_tokens_seen": 23751456, + "step": 15545 + }, + { + "epoch": 47.99536321483771, + "grad_norm": 0.7374624609947205, + "learning_rate": 3.3562597987781384e-05, + "loss": 0.2424, + "num_input_tokens_seen": 23759424, + "step": 15550 + }, + { + "epoch": 48.009273570324574, + "grad_norm": 0.6204301118850708, + "learning_rate": 3.355337364710424e-05, + "loss": 0.2318, + "num_input_tokens_seen": 23766192, + "step": 15555 + }, + { + "epoch": 48.024729520865534, + "grad_norm": 0.4842032790184021, + "learning_rate": 3.354414798738939e-05, + "loss": 0.2228, + "num_input_tokens_seen": 23773680, + "step": 15560 + }, + { + "epoch": 48.04018547140649, + "grad_norm": 0.34465786814689636, + "learning_rate": 3.353492101005955e-05, + "loss": 0.2436, + "num_input_tokens_seen": 23781488, + "step": 15565 + }, + { + "epoch": 48.05564142194745, + "grad_norm": 0.5044572353363037, + "learning_rate": 3.352569271653763e-05, + "loss": 0.2967, + "num_input_tokens_seen": 23789104, + "step": 15570 + }, + { + "epoch": 48.07109737248841, + "grad_norm": 0.4766616225242615, + "learning_rate": 3.351646310824675e-05, + "loss": 0.2718, + "num_input_tokens_seen": 23796720, + "step": 15575 + }, + { + "epoch": 48.086553323029364, + "grad_norm": 0.8273824453353882, + "learning_rate": 3.350723218661023e-05, + "loss": 0.3861, + "num_input_tokens_seen": 23804880, + "step": 15580 + }, + { + "epoch": 48.102009273570324, + "grad_norm": 0.46773409843444824, + "learning_rate": 3.349799995305162e-05, + "loss": 0.2462, + "num_input_tokens_seen": 23812784, + "step": 15585 + }, + { + "epoch": 48.117465224111285, + "grad_norm": 0.5409140586853027, + "learning_rate": 3.348876640899461e-05, + "loss": 0.2612, + "num_input_tokens_seen": 23820528, + "step": 15590 + }, + { + "epoch": 48.13292117465224, + "grad_norm": 0.6030458807945251, + "learning_rate": 3.3479531555863144e-05, + "loss": 0.2954, + "num_input_tokens_seen": 23828464, + "step": 15595 + }, + { + "epoch": 48.1483771251932, + "grad_norm": 0.3377331793308258, + "learning_rate": 3.3470295395081344e-05, + "loss": 0.2409, + "num_input_tokens_seen": 23836048, + "step": 15600 + }, + { + "epoch": 48.1483771251932, + "eval_loss": 0.3177674114704132, + "eval_runtime": 6.301, + "eval_samples_per_second": 91.255, + "eval_steps_per_second": 22.853, + "num_input_tokens_seen": 23836048, + "step": 15600 + }, + { + "epoch": 48.16383307573416, + "grad_norm": 0.8303706049919128, + "learning_rate": 3.3461057928073556e-05, + "loss": 0.2854, + "num_input_tokens_seen": 23843824, + "step": 15605 + }, + { + "epoch": 48.179289026275114, + "grad_norm": 0.6685774922370911, + "learning_rate": 3.345181915626431e-05, + "loss": 0.2706, + "num_input_tokens_seen": 23851312, + "step": 15610 + }, + { + "epoch": 48.194744976816075, + "grad_norm": 0.38278311491012573, + "learning_rate": 3.344257908107834e-05, + "loss": 0.2636, + "num_input_tokens_seen": 23858800, + "step": 15615 + }, + { + "epoch": 48.210200927357036, + "grad_norm": 0.6031194925308228, + "learning_rate": 3.343333770394058e-05, + "loss": 0.2348, + "num_input_tokens_seen": 23866384, + "step": 15620 + }, + { + "epoch": 48.22565687789799, + "grad_norm": 0.5638322234153748, + "learning_rate": 3.342409502627616e-05, + "loss": 0.2546, + "num_input_tokens_seen": 23874160, + "step": 15625 + }, + { + "epoch": 48.24111282843895, + "grad_norm": 0.43243688344955444, + "learning_rate": 3.341485104951043e-05, + "loss": 0.2699, + "num_input_tokens_seen": 23881744, + "step": 15630 + }, + { + "epoch": 48.256568778979904, + "grad_norm": 0.3991020619869232, + "learning_rate": 3.340560577506892e-05, + "loss": 0.2263, + "num_input_tokens_seen": 23889104, + "step": 15635 + }, + { + "epoch": 48.272024729520865, + "grad_norm": 0.6018113493919373, + "learning_rate": 3.339635920437735e-05, + "loss": 0.264, + "num_input_tokens_seen": 23896592, + "step": 15640 + }, + { + "epoch": 48.287480680061826, + "grad_norm": 0.5648847222328186, + "learning_rate": 3.338711133886169e-05, + "loss": 0.3283, + "num_input_tokens_seen": 23904368, + "step": 15645 + }, + { + "epoch": 48.30293663060278, + "grad_norm": 0.6835505962371826, + "learning_rate": 3.3377862179948064e-05, + "loss": 0.2985, + "num_input_tokens_seen": 23911664, + "step": 15650 + }, + { + "epoch": 48.31839258114374, + "grad_norm": 0.3691290020942688, + "learning_rate": 3.336861172906281e-05, + "loss": 0.3206, + "num_input_tokens_seen": 23919888, + "step": 15655 + }, + { + "epoch": 48.3338485316847, + "grad_norm": 0.49712690711021423, + "learning_rate": 3.335935998763245e-05, + "loss": 0.2683, + "num_input_tokens_seen": 23927888, + "step": 15660 + }, + { + "epoch": 48.349304482225655, + "grad_norm": 0.7357456088066101, + "learning_rate": 3.3350106957083744e-05, + "loss": 0.2579, + "num_input_tokens_seen": 23935152, + "step": 15665 + }, + { + "epoch": 48.364760432766616, + "grad_norm": 0.4700267016887665, + "learning_rate": 3.33408526388436e-05, + "loss": 0.2445, + "num_input_tokens_seen": 23942736, + "step": 15670 + }, + { + "epoch": 48.38021638330758, + "grad_norm": 0.6374909281730652, + "learning_rate": 3.3331597034339166e-05, + "loss": 0.3363, + "num_input_tokens_seen": 23950192, + "step": 15675 + }, + { + "epoch": 48.39567233384853, + "grad_norm": 0.40766704082489014, + "learning_rate": 3.3322340144997764e-05, + "loss": 0.2646, + "num_input_tokens_seen": 23958256, + "step": 15680 + }, + { + "epoch": 48.41112828438949, + "grad_norm": 0.44435232877731323, + "learning_rate": 3.331308197224693e-05, + "loss": 0.3208, + "num_input_tokens_seen": 23966288, + "step": 15685 + }, + { + "epoch": 48.426584234930445, + "grad_norm": 0.952881395816803, + "learning_rate": 3.330382251751438e-05, + "loss": 0.257, + "num_input_tokens_seen": 23973296, + "step": 15690 + }, + { + "epoch": 48.442040185471406, + "grad_norm": 0.5888374447822571, + "learning_rate": 3.3294561782228054e-05, + "loss": 0.2834, + "num_input_tokens_seen": 23980592, + "step": 15695 + }, + { + "epoch": 48.45749613601237, + "grad_norm": 0.4392663240432739, + "learning_rate": 3.328529976781607e-05, + "loss": 0.2653, + "num_input_tokens_seen": 23988080, + "step": 15700 + }, + { + "epoch": 48.47295208655332, + "grad_norm": 0.7435940504074097, + "learning_rate": 3.327603647570673e-05, + "loss": 0.2736, + "num_input_tokens_seen": 23995408, + "step": 15705 + }, + { + "epoch": 48.48840803709428, + "grad_norm": 0.578467607498169, + "learning_rate": 3.326677190732857e-05, + "loss": 0.252, + "num_input_tokens_seen": 24002928, + "step": 15710 + }, + { + "epoch": 48.50386398763524, + "grad_norm": 0.8923315405845642, + "learning_rate": 3.325750606411029e-05, + "loss": 0.2205, + "num_input_tokens_seen": 24010480, + "step": 15715 + }, + { + "epoch": 48.519319938176196, + "grad_norm": 0.7465397715568542, + "learning_rate": 3.3248238947480804e-05, + "loss": 0.2812, + "num_input_tokens_seen": 24018224, + "step": 15720 + }, + { + "epoch": 48.53477588871716, + "grad_norm": 0.313720166683197, + "learning_rate": 3.323897055886922e-05, + "loss": 0.2919, + "num_input_tokens_seen": 24025808, + "step": 15725 + }, + { + "epoch": 48.55023183925812, + "grad_norm": 0.44646769762039185, + "learning_rate": 3.322970089970484e-05, + "loss": 0.2408, + "num_input_tokens_seen": 24033296, + "step": 15730 + }, + { + "epoch": 48.56568778979907, + "grad_norm": 0.33822768926620483, + "learning_rate": 3.3220429971417165e-05, + "loss": 0.2038, + "num_input_tokens_seen": 24040368, + "step": 15735 + }, + { + "epoch": 48.58114374034003, + "grad_norm": 0.5024354457855225, + "learning_rate": 3.321115777543588e-05, + "loss": 0.2198, + "num_input_tokens_seen": 24047792, + "step": 15740 + }, + { + "epoch": 48.59659969088099, + "grad_norm": 0.45519810914993286, + "learning_rate": 3.320188431319088e-05, + "loss": 0.2271, + "num_input_tokens_seen": 24055888, + "step": 15745 + }, + { + "epoch": 48.61205564142195, + "grad_norm": 0.6402506232261658, + "learning_rate": 3.319260958611224e-05, + "loss": 0.2582, + "num_input_tokens_seen": 24063536, + "step": 15750 + }, + { + "epoch": 48.62751159196291, + "grad_norm": 0.4314610958099365, + "learning_rate": 3.3183333595630256e-05, + "loss": 0.2243, + "num_input_tokens_seen": 24071184, + "step": 15755 + }, + { + "epoch": 48.64296754250386, + "grad_norm": 0.5666975378990173, + "learning_rate": 3.317405634317538e-05, + "loss": 0.2618, + "num_input_tokens_seen": 24078672, + "step": 15760 + }, + { + "epoch": 48.65842349304482, + "grad_norm": 0.4079296886920929, + "learning_rate": 3.3164777830178315e-05, + "loss": 0.1937, + "num_input_tokens_seen": 24085744, + "step": 15765 + }, + { + "epoch": 48.67387944358578, + "grad_norm": 0.3042692542076111, + "learning_rate": 3.315549805806989e-05, + "loss": 0.3913, + "num_input_tokens_seen": 24093936, + "step": 15770 + }, + { + "epoch": 48.689335394126736, + "grad_norm": 0.47524282336235046, + "learning_rate": 3.314621702828118e-05, + "loss": 0.2793, + "num_input_tokens_seen": 24101456, + "step": 15775 + }, + { + "epoch": 48.7047913446677, + "grad_norm": 0.5627877712249756, + "learning_rate": 3.313693474224342e-05, + "loss": 0.2251, + "num_input_tokens_seen": 24108848, + "step": 15780 + }, + { + "epoch": 48.72024729520866, + "grad_norm": 0.7148708701133728, + "learning_rate": 3.312765120138809e-05, + "loss": 0.2814, + "num_input_tokens_seen": 24116528, + "step": 15785 + }, + { + "epoch": 48.73570324574961, + "grad_norm": 0.4907703101634979, + "learning_rate": 3.311836640714679e-05, + "loss": 0.2102, + "num_input_tokens_seen": 24124080, + "step": 15790 + }, + { + "epoch": 48.75115919629057, + "grad_norm": 0.8061583042144775, + "learning_rate": 3.310908036095137e-05, + "loss": 0.2582, + "num_input_tokens_seen": 24132016, + "step": 15795 + }, + { + "epoch": 48.76661514683153, + "grad_norm": 0.4644472897052765, + "learning_rate": 3.309979306423386e-05, + "loss": 0.2948, + "num_input_tokens_seen": 24140240, + "step": 15800 + }, + { + "epoch": 48.76661514683153, + "eval_loss": 0.3169686496257782, + "eval_runtime": 6.3049, + "eval_samples_per_second": 91.2, + "eval_steps_per_second": 22.84, + "num_input_tokens_seen": 24140240, + "step": 15800 + }, + { + "epoch": 48.78207109737249, + "grad_norm": 0.49075084924697876, + "learning_rate": 3.309050451842647e-05, + "loss": 0.2217, + "num_input_tokens_seen": 24148144, + "step": 15805 + }, + { + "epoch": 48.79752704791345, + "grad_norm": 0.7130388617515564, + "learning_rate": 3.3081214724961604e-05, + "loss": 0.2471, + "num_input_tokens_seen": 24155888, + "step": 15810 + }, + { + "epoch": 48.8129829984544, + "grad_norm": 0.5143466591835022, + "learning_rate": 3.307192368527188e-05, + "loss": 0.2558, + "num_input_tokens_seen": 24163184, + "step": 15815 + }, + { + "epoch": 48.82843894899536, + "grad_norm": 0.6613137722015381, + "learning_rate": 3.306263140079008e-05, + "loss": 0.2347, + "num_input_tokens_seen": 24171152, + "step": 15820 + }, + { + "epoch": 48.84389489953632, + "grad_norm": 0.4996977746486664, + "learning_rate": 3.30533378729492e-05, + "loss": 0.2075, + "num_input_tokens_seen": 24178352, + "step": 15825 + }, + { + "epoch": 48.85935085007728, + "grad_norm": 0.6025872230529785, + "learning_rate": 3.304404310318242e-05, + "loss": 0.2509, + "num_input_tokens_seen": 24186352, + "step": 15830 + }, + { + "epoch": 48.87480680061824, + "grad_norm": 0.4587246775627136, + "learning_rate": 3.3034747092923105e-05, + "loss": 0.2912, + "num_input_tokens_seen": 24193680, + "step": 15835 + }, + { + "epoch": 48.8902627511592, + "grad_norm": 0.4310637414455414, + "learning_rate": 3.3025449843604806e-05, + "loss": 0.2662, + "num_input_tokens_seen": 24201104, + "step": 15840 + }, + { + "epoch": 48.90571870170015, + "grad_norm": 0.76615971326828, + "learning_rate": 3.30161513566613e-05, + "loss": 0.2703, + "num_input_tokens_seen": 24208848, + "step": 15845 + }, + { + "epoch": 48.92117465224111, + "grad_norm": 0.3465260863304138, + "learning_rate": 3.3006851633526506e-05, + "loss": 0.2547, + "num_input_tokens_seen": 24216624, + "step": 15850 + }, + { + "epoch": 48.936630602782074, + "grad_norm": 0.6946966052055359, + "learning_rate": 3.2997550675634584e-05, + "loss": 0.2443, + "num_input_tokens_seen": 24223920, + "step": 15855 + }, + { + "epoch": 48.95208655332303, + "grad_norm": 0.5708033442497253, + "learning_rate": 3.2988248484419825e-05, + "loss": 0.3262, + "num_input_tokens_seen": 24231248, + "step": 15860 + }, + { + "epoch": 48.96754250386399, + "grad_norm": 0.406107097864151, + "learning_rate": 3.2978945061316776e-05, + "loss": 0.2513, + "num_input_tokens_seen": 24239184, + "step": 15865 + }, + { + "epoch": 48.98299845440495, + "grad_norm": 0.5293879508972168, + "learning_rate": 3.296964040776013e-05, + "loss": 0.2431, + "num_input_tokens_seen": 24246672, + "step": 15870 + }, + { + "epoch": 48.9984544049459, + "grad_norm": 0.5942738652229309, + "learning_rate": 3.296033452518478e-05, + "loss": 0.2727, + "num_input_tokens_seen": 24254448, + "step": 15875 + }, + { + "epoch": 49.01236476043277, + "grad_norm": 0.4988233745098114, + "learning_rate": 3.2951027415025806e-05, + "loss": 0.2418, + "num_input_tokens_seen": 24261264, + "step": 15880 + }, + { + "epoch": 49.02782071097373, + "grad_norm": 0.3583526909351349, + "learning_rate": 3.294171907871849e-05, + "loss": 0.2411, + "num_input_tokens_seen": 24268720, + "step": 15885 + }, + { + "epoch": 49.04327666151468, + "grad_norm": 0.5047385096549988, + "learning_rate": 3.293240951769828e-05, + "loss": 0.2305, + "num_input_tokens_seen": 24276496, + "step": 15890 + }, + { + "epoch": 49.05873261205564, + "grad_norm": 0.44770851731300354, + "learning_rate": 3.2923098733400846e-05, + "loss": 0.2891, + "num_input_tokens_seen": 24284016, + "step": 15895 + }, + { + "epoch": 49.074188562596596, + "grad_norm": 0.8449902534484863, + "learning_rate": 3.291378672726202e-05, + "loss": 0.3978, + "num_input_tokens_seen": 24291664, + "step": 15900 + }, + { + "epoch": 49.08964451313756, + "grad_norm": 0.5976800322532654, + "learning_rate": 3.2904473500717824e-05, + "loss": 0.2155, + "num_input_tokens_seen": 24298768, + "step": 15905 + }, + { + "epoch": 49.10510046367852, + "grad_norm": 0.8061519861221313, + "learning_rate": 3.289515905520449e-05, + "loss": 0.3373, + "num_input_tokens_seen": 24306672, + "step": 15910 + }, + { + "epoch": 49.12055641421947, + "grad_norm": 0.4264138340950012, + "learning_rate": 3.288584339215841e-05, + "loss": 0.2746, + "num_input_tokens_seen": 24314160, + "step": 15915 + }, + { + "epoch": 49.13601236476043, + "grad_norm": 0.4502829909324646, + "learning_rate": 3.287652651301617e-05, + "loss": 0.3534, + "num_input_tokens_seen": 24322064, + "step": 15920 + }, + { + "epoch": 49.15146831530139, + "grad_norm": 0.2581764757633209, + "learning_rate": 3.286720841921457e-05, + "loss": 0.1954, + "num_input_tokens_seen": 24329584, + "step": 15925 + }, + { + "epoch": 49.16692426584235, + "grad_norm": 0.4788527488708496, + "learning_rate": 3.285788911219056e-05, + "loss": 0.2285, + "num_input_tokens_seen": 24337264, + "step": 15930 + }, + { + "epoch": 49.18238021638331, + "grad_norm": 0.8393002152442932, + "learning_rate": 3.284856859338131e-05, + "loss": 0.2647, + "num_input_tokens_seen": 24344720, + "step": 15935 + }, + { + "epoch": 49.19783616692427, + "grad_norm": 0.47231820225715637, + "learning_rate": 3.283924686422414e-05, + "loss": 0.236, + "num_input_tokens_seen": 24352656, + "step": 15940 + }, + { + "epoch": 49.21329211746522, + "grad_norm": 0.325156033039093, + "learning_rate": 3.282992392615659e-05, + "loss": 0.222, + "num_input_tokens_seen": 24360208, + "step": 15945 + }, + { + "epoch": 49.22874806800618, + "grad_norm": 0.5404468178749084, + "learning_rate": 3.282059978061638e-05, + "loss": 0.2274, + "num_input_tokens_seen": 24368176, + "step": 15950 + }, + { + "epoch": 49.244204018547144, + "grad_norm": 0.618349015712738, + "learning_rate": 3.28112744290414e-05, + "loss": 0.2488, + "num_input_tokens_seen": 24375984, + "step": 15955 + }, + { + "epoch": 49.2596599690881, + "grad_norm": 0.5664922595024109, + "learning_rate": 3.280194787286974e-05, + "loss": 0.2114, + "num_input_tokens_seen": 24383952, + "step": 15960 + }, + { + "epoch": 49.27511591962906, + "grad_norm": 0.5828094482421875, + "learning_rate": 3.2792620113539674e-05, + "loss": 0.2565, + "num_input_tokens_seen": 24391664, + "step": 15965 + }, + { + "epoch": 49.29057187017001, + "grad_norm": 0.4621988832950592, + "learning_rate": 3.278329115248966e-05, + "loss": 0.2737, + "num_input_tokens_seen": 24399344, + "step": 15970 + }, + { + "epoch": 49.30602782071097, + "grad_norm": 0.5617520809173584, + "learning_rate": 3.277396099115834e-05, + "loss": 0.2422, + "num_input_tokens_seen": 24406704, + "step": 15975 + }, + { + "epoch": 49.321483771251934, + "grad_norm": 0.3872825801372528, + "learning_rate": 3.276462963098454e-05, + "loss": 0.2538, + "num_input_tokens_seen": 24414000, + "step": 15980 + }, + { + "epoch": 49.33693972179289, + "grad_norm": 0.5686657428741455, + "learning_rate": 3.275529707340728e-05, + "loss": 0.2421, + "num_input_tokens_seen": 24421424, + "step": 15985 + }, + { + "epoch": 49.35239567233385, + "grad_norm": 0.604526698589325, + "learning_rate": 3.274596331986574e-05, + "loss": 0.2341, + "num_input_tokens_seen": 24429552, + "step": 15990 + }, + { + "epoch": 49.36785162287481, + "grad_norm": 0.37907347083091736, + "learning_rate": 3.273662837179932e-05, + "loss": 0.2148, + "num_input_tokens_seen": 24437168, + "step": 15995 + }, + { + "epoch": 49.38330757341576, + "grad_norm": 0.6652953624725342, + "learning_rate": 3.272729223064758e-05, + "loss": 0.29, + "num_input_tokens_seen": 24445104, + "step": 16000 + }, + { + "epoch": 49.38330757341576, + "eval_loss": 0.31683194637298584, + "eval_runtime": 6.3178, + "eval_samples_per_second": 91.013, + "eval_steps_per_second": 22.793, + "num_input_tokens_seen": 24445104, + "step": 16000 + }, + { + "epoch": 49.398763523956724, + "grad_norm": 0.46356311440467834, + "learning_rate": 3.2717954897850264e-05, + "loss": 0.2975, + "num_input_tokens_seen": 24452624, + "step": 16005 + }, + { + "epoch": 49.414219474497685, + "grad_norm": 0.49919453263282776, + "learning_rate": 3.270861637484733e-05, + "loss": 0.2985, + "num_input_tokens_seen": 24460144, + "step": 16010 + }, + { + "epoch": 49.42967542503864, + "grad_norm": 0.2843089699745178, + "learning_rate": 3.2699276663078867e-05, + "loss": 0.2445, + "num_input_tokens_seen": 24467696, + "step": 16015 + }, + { + "epoch": 49.4451313755796, + "grad_norm": 0.6231794953346252, + "learning_rate": 3.268993576398519e-05, + "loss": 0.2428, + "num_input_tokens_seen": 24475664, + "step": 16020 + }, + { + "epoch": 49.46058732612055, + "grad_norm": 0.36897730827331543, + "learning_rate": 3.268059367900678e-05, + "loss": 0.234, + "num_input_tokens_seen": 24483216, + "step": 16025 + }, + { + "epoch": 49.476043276661514, + "grad_norm": 0.5073255896568298, + "learning_rate": 3.26712504095843e-05, + "loss": 0.2726, + "num_input_tokens_seen": 24490416, + "step": 16030 + }, + { + "epoch": 49.491499227202475, + "grad_norm": 0.4438701570034027, + "learning_rate": 3.2661905957158615e-05, + "loss": 0.2839, + "num_input_tokens_seen": 24498128, + "step": 16035 + }, + { + "epoch": 49.50695517774343, + "grad_norm": 0.5323871970176697, + "learning_rate": 3.2652560323170734e-05, + "loss": 0.3182, + "num_input_tokens_seen": 24505808, + "step": 16040 + }, + { + "epoch": 49.52241112828439, + "grad_norm": 0.3100323975086212, + "learning_rate": 3.264321350906189e-05, + "loss": 0.2576, + "num_input_tokens_seen": 24513648, + "step": 16045 + }, + { + "epoch": 49.53786707882535, + "grad_norm": 0.42708197236061096, + "learning_rate": 3.263386551627346e-05, + "loss": 0.2729, + "num_input_tokens_seen": 24521552, + "step": 16050 + }, + { + "epoch": 49.553323029366304, + "grad_norm": 0.4065527617931366, + "learning_rate": 3.2624516346247055e-05, + "loss": 0.2748, + "num_input_tokens_seen": 24529104, + "step": 16055 + }, + { + "epoch": 49.568778979907265, + "grad_norm": 0.5642593502998352, + "learning_rate": 3.2615166000424404e-05, + "loss": 0.298, + "num_input_tokens_seen": 24537424, + "step": 16060 + }, + { + "epoch": 49.584234930448225, + "grad_norm": 0.32338011264801025, + "learning_rate": 3.260581448024745e-05, + "loss": 0.2262, + "num_input_tokens_seen": 24544688, + "step": 16065 + }, + { + "epoch": 49.59969088098918, + "grad_norm": 0.5745888352394104, + "learning_rate": 3.2596461787158335e-05, + "loss": 0.2678, + "num_input_tokens_seen": 24552592, + "step": 16070 + }, + { + "epoch": 49.61514683153014, + "grad_norm": 0.7219393849372864, + "learning_rate": 3.258710792259934e-05, + "loss": 0.2595, + "num_input_tokens_seen": 24559824, + "step": 16075 + }, + { + "epoch": 49.630602782071094, + "grad_norm": 0.39848577976226807, + "learning_rate": 3.257775288801296e-05, + "loss": 0.2305, + "num_input_tokens_seen": 24567952, + "step": 16080 + }, + { + "epoch": 49.646058732612055, + "grad_norm": 0.3830896019935608, + "learning_rate": 3.256839668484186e-05, + "loss": 0.2912, + "num_input_tokens_seen": 24575952, + "step": 16085 + }, + { + "epoch": 49.661514683153015, + "grad_norm": 0.5375733375549316, + "learning_rate": 3.255903931452888e-05, + "loss": 0.2486, + "num_input_tokens_seen": 24583536, + "step": 16090 + }, + { + "epoch": 49.67697063369397, + "grad_norm": 0.6512746810913086, + "learning_rate": 3.2549680778517045e-05, + "loss": 0.3182, + "num_input_tokens_seen": 24591504, + "step": 16095 + }, + { + "epoch": 49.69242658423493, + "grad_norm": 0.5498485565185547, + "learning_rate": 3.2540321078249556e-05, + "loss": 0.2275, + "num_input_tokens_seen": 24598800, + "step": 16100 + }, + { + "epoch": 49.70788253477589, + "grad_norm": 0.780023992061615, + "learning_rate": 3.2530960215169795e-05, + "loss": 0.2499, + "num_input_tokens_seen": 24606352, + "step": 16105 + }, + { + "epoch": 49.723338485316845, + "grad_norm": 0.48530665040016174, + "learning_rate": 3.2521598190721345e-05, + "loss": 0.3026, + "num_input_tokens_seen": 24614320, + "step": 16110 + }, + { + "epoch": 49.738794435857805, + "grad_norm": 0.44959938526153564, + "learning_rate": 3.251223500634792e-05, + "loss": 0.2557, + "num_input_tokens_seen": 24621936, + "step": 16115 + }, + { + "epoch": 49.754250386398766, + "grad_norm": 0.6927440762519836, + "learning_rate": 3.2502870663493445e-05, + "loss": 0.2854, + "num_input_tokens_seen": 24629232, + "step": 16120 + }, + { + "epoch": 49.76970633693972, + "grad_norm": 0.6212608814239502, + "learning_rate": 3.249350516360203e-05, + "loss": 0.2319, + "num_input_tokens_seen": 24636592, + "step": 16125 + }, + { + "epoch": 49.78516228748068, + "grad_norm": 0.51052325963974, + "learning_rate": 3.248413850811797e-05, + "loss": 0.2479, + "num_input_tokens_seen": 24644080, + "step": 16130 + }, + { + "epoch": 49.80061823802164, + "grad_norm": 0.399612158536911, + "learning_rate": 3.2474770698485677e-05, + "loss": 0.29, + "num_input_tokens_seen": 24651664, + "step": 16135 + }, + { + "epoch": 49.816074188562595, + "grad_norm": 0.5250032544136047, + "learning_rate": 3.246540173614983e-05, + "loss": 0.2557, + "num_input_tokens_seen": 24658768, + "step": 16140 + }, + { + "epoch": 49.831530139103556, + "grad_norm": 0.6035742163658142, + "learning_rate": 3.2456031622555197e-05, + "loss": 0.2839, + "num_input_tokens_seen": 24665968, + "step": 16145 + }, + { + "epoch": 49.84698608964451, + "grad_norm": 0.520489513874054, + "learning_rate": 3.2446660359146794e-05, + "loss": 0.2363, + "num_input_tokens_seen": 24674000, + "step": 16150 + }, + { + "epoch": 49.86244204018547, + "grad_norm": 0.749563455581665, + "learning_rate": 3.2437287947369786e-05, + "loss": 0.2886, + "num_input_tokens_seen": 24681424, + "step": 16155 + }, + { + "epoch": 49.87789799072643, + "grad_norm": 0.4947766661643982, + "learning_rate": 3.2427914388669525e-05, + "loss": 0.2565, + "num_input_tokens_seen": 24688720, + "step": 16160 + }, + { + "epoch": 49.893353941267385, + "grad_norm": 0.4453890323638916, + "learning_rate": 3.241853968449151e-05, + "loss": 0.2363, + "num_input_tokens_seen": 24696848, + "step": 16165 + }, + { + "epoch": 49.908809891808346, + "grad_norm": 0.36543846130371094, + "learning_rate": 3.240916383628144e-05, + "loss": 0.2636, + "num_input_tokens_seen": 24704528, + "step": 16170 + }, + { + "epoch": 49.92426584234931, + "grad_norm": 0.4840046167373657, + "learning_rate": 3.239978684548521e-05, + "loss": 0.254, + "num_input_tokens_seen": 24711984, + "step": 16175 + }, + { + "epoch": 49.93972179289026, + "grad_norm": 0.3557453155517578, + "learning_rate": 3.239040871354885e-05, + "loss": 0.3044, + "num_input_tokens_seen": 24719184, + "step": 16180 + }, + { + "epoch": 49.95517774343122, + "grad_norm": 0.4184607267379761, + "learning_rate": 3.2381029441918596e-05, + "loss": 0.2348, + "num_input_tokens_seen": 24727664, + "step": 16185 + }, + { + "epoch": 49.97063369397218, + "grad_norm": 0.89964359998703, + "learning_rate": 3.2371649032040845e-05, + "loss": 0.2363, + "num_input_tokens_seen": 24735632, + "step": 16190 + }, + { + "epoch": 49.986089644513136, + "grad_norm": 0.6153954267501831, + "learning_rate": 3.2362267485362174e-05, + "loss": 0.3466, + "num_input_tokens_seen": 24743152, + "step": 16195 + }, + { + "epoch": 50.0, + "grad_norm": 0.7429007887840271, + "learning_rate": 3.235288480332934e-05, + "loss": 0.2291, + "num_input_tokens_seen": 24750256, + "step": 16200 + }, + { + "epoch": 50.0, + "eval_loss": 0.3159416615962982, + "eval_runtime": 6.2838, + "eval_samples_per_second": 91.505, + "eval_steps_per_second": 22.916, + "num_input_tokens_seen": 24750256, + "step": 16200 + }, + { + "epoch": 50.01545595054096, + "grad_norm": 0.3714267909526825, + "learning_rate": 3.234350098738927e-05, + "loss": 0.3116, + "num_input_tokens_seen": 24757680, + "step": 16205 + }, + { + "epoch": 50.030911901081915, + "grad_norm": 0.5455105304718018, + "learning_rate": 3.233411603898906e-05, + "loss": 0.233, + "num_input_tokens_seen": 24765168, + "step": 16210 + }, + { + "epoch": 50.046367851622875, + "grad_norm": 0.7928998470306396, + "learning_rate": 3.232472995957599e-05, + "loss": 0.3657, + "num_input_tokens_seen": 24772272, + "step": 16215 + }, + { + "epoch": 50.061823802163836, + "grad_norm": 0.5258522629737854, + "learning_rate": 3.231534275059751e-05, + "loss": 0.313, + "num_input_tokens_seen": 24780368, + "step": 16220 + }, + { + "epoch": 50.07727975270479, + "grad_norm": 0.8557379245758057, + "learning_rate": 3.230595441350125e-05, + "loss": 0.3051, + "num_input_tokens_seen": 24788272, + "step": 16225 + }, + { + "epoch": 50.09273570324575, + "grad_norm": 0.3367864787578583, + "learning_rate": 3.2296564949735e-05, + "loss": 0.2344, + "num_input_tokens_seen": 24795952, + "step": 16230 + }, + { + "epoch": 50.108191653786704, + "grad_norm": 0.6115015149116516, + "learning_rate": 3.228717436074675e-05, + "loss": 0.2349, + "num_input_tokens_seen": 24803312, + "step": 16235 + }, + { + "epoch": 50.123647604327665, + "grad_norm": 0.6149800419807434, + "learning_rate": 3.227778264798463e-05, + "loss": 0.2676, + "num_input_tokens_seen": 24810896, + "step": 16240 + }, + { + "epoch": 50.139103554868626, + "grad_norm": 0.47352537512779236, + "learning_rate": 3.226838981289698e-05, + "loss": 0.2674, + "num_input_tokens_seen": 24817904, + "step": 16245 + }, + { + "epoch": 50.15455950540958, + "grad_norm": 0.4919855296611786, + "learning_rate": 3.225899585693227e-05, + "loss": 0.2237, + "num_input_tokens_seen": 24825776, + "step": 16250 + }, + { + "epoch": 50.17001545595054, + "grad_norm": 0.46672746539115906, + "learning_rate": 3.224960078153918e-05, + "loss": 0.2715, + "num_input_tokens_seen": 24833136, + "step": 16255 + }, + { + "epoch": 50.1854714064915, + "grad_norm": 0.5938121676445007, + "learning_rate": 3.224020458816655e-05, + "loss": 0.2245, + "num_input_tokens_seen": 24840144, + "step": 16260 + }, + { + "epoch": 50.200927357032455, + "grad_norm": 0.42155885696411133, + "learning_rate": 3.223080727826337e-05, + "loss": 0.2415, + "num_input_tokens_seen": 24847728, + "step": 16265 + }, + { + "epoch": 50.216383307573416, + "grad_norm": 0.5290868878364563, + "learning_rate": 3.222140885327885e-05, + "loss": 0.2577, + "num_input_tokens_seen": 24855344, + "step": 16270 + }, + { + "epoch": 50.23183925811438, + "grad_norm": 0.4989933967590332, + "learning_rate": 3.221200931466234e-05, + "loss": 0.2096, + "num_input_tokens_seen": 24862800, + "step": 16275 + }, + { + "epoch": 50.24729520865533, + "grad_norm": 0.4330855906009674, + "learning_rate": 3.220260866386336e-05, + "loss": 0.2859, + "num_input_tokens_seen": 24870800, + "step": 16280 + }, + { + "epoch": 50.26275115919629, + "grad_norm": 0.32111656665802, + "learning_rate": 3.21932069023316e-05, + "loss": 0.315, + "num_input_tokens_seen": 24878448, + "step": 16285 + }, + { + "epoch": 50.27820710973725, + "grad_norm": 0.4845902621746063, + "learning_rate": 3.218380403151695e-05, + "loss": 0.2837, + "num_input_tokens_seen": 24886416, + "step": 16290 + }, + { + "epoch": 50.293663060278206, + "grad_norm": 0.356852263212204, + "learning_rate": 3.217440005286943e-05, + "loss": 0.2229, + "num_input_tokens_seen": 24893840, + "step": 16295 + }, + { + "epoch": 50.30911901081917, + "grad_norm": 0.4096072316169739, + "learning_rate": 3.216499496783928e-05, + "loss": 0.2074, + "num_input_tokens_seen": 24900720, + "step": 16300 + }, + { + "epoch": 50.32457496136012, + "grad_norm": 0.5501698851585388, + "learning_rate": 3.2155588777876856e-05, + "loss": 0.3634, + "num_input_tokens_seen": 24908464, + "step": 16305 + }, + { + "epoch": 50.34003091190108, + "grad_norm": 0.5230047106742859, + "learning_rate": 3.214618148443273e-05, + "loss": 0.2088, + "num_input_tokens_seen": 24916080, + "step": 16310 + }, + { + "epoch": 50.35548686244204, + "grad_norm": 0.36626628041267395, + "learning_rate": 3.2136773088957595e-05, + "loss": 0.2349, + "num_input_tokens_seen": 24923536, + "step": 16315 + }, + { + "epoch": 50.370942812982996, + "grad_norm": 0.36631444096565247, + "learning_rate": 3.2127363592902374e-05, + "loss": 0.2532, + "num_input_tokens_seen": 24931152, + "step": 16320 + }, + { + "epoch": 50.38639876352396, + "grad_norm": 0.5628075003623962, + "learning_rate": 3.211795299771812e-05, + "loss": 0.2717, + "num_input_tokens_seen": 24938768, + "step": 16325 + }, + { + "epoch": 50.40185471406492, + "grad_norm": 0.41110342741012573, + "learning_rate": 3.210854130485605e-05, + "loss": 0.252, + "num_input_tokens_seen": 24945968, + "step": 16330 + }, + { + "epoch": 50.41731066460587, + "grad_norm": 0.4724985957145691, + "learning_rate": 3.209912851576759e-05, + "loss": 0.2468, + "num_input_tokens_seen": 24953648, + "step": 16335 + }, + { + "epoch": 50.43276661514683, + "grad_norm": 0.3954441249370575, + "learning_rate": 3.208971463190431e-05, + "loss": 0.2507, + "num_input_tokens_seen": 24961392, + "step": 16340 + }, + { + "epoch": 50.44822256568779, + "grad_norm": 1.0600833892822266, + "learning_rate": 3.208029965471793e-05, + "loss": 0.3232, + "num_input_tokens_seen": 24969104, + "step": 16345 + }, + { + "epoch": 50.46367851622875, + "grad_norm": 0.5381712913513184, + "learning_rate": 3.2070883585660364e-05, + "loss": 0.2417, + "num_input_tokens_seen": 24976752, + "step": 16350 + }, + { + "epoch": 50.47913446676971, + "grad_norm": 0.7766026258468628, + "learning_rate": 3.20614664261837e-05, + "loss": 0.3302, + "num_input_tokens_seen": 24984336, + "step": 16355 + }, + { + "epoch": 50.49459041731066, + "grad_norm": 0.4419419765472412, + "learning_rate": 3.205204817774016e-05, + "loss": 0.2142, + "num_input_tokens_seen": 24992144, + "step": 16360 + }, + { + "epoch": 50.51004636785162, + "grad_norm": 0.393686443567276, + "learning_rate": 3.204262884178218e-05, + "loss": 0.2562, + "num_input_tokens_seen": 24999792, + "step": 16365 + }, + { + "epoch": 50.52550231839258, + "grad_norm": 0.7704620957374573, + "learning_rate": 3.2033208419762314e-05, + "loss": 0.335, + "num_input_tokens_seen": 25007824, + "step": 16370 + }, + { + "epoch": 50.54095826893354, + "grad_norm": 0.49729883670806885, + "learning_rate": 3.2023786913133344e-05, + "loss": 0.2685, + "num_input_tokens_seen": 25015568, + "step": 16375 + }, + { + "epoch": 50.5564142194745, + "grad_norm": 0.6618556976318359, + "learning_rate": 3.201436432334816e-05, + "loss": 0.2409, + "num_input_tokens_seen": 25023408, + "step": 16380 + }, + { + "epoch": 50.57187017001546, + "grad_norm": 0.6579931378364563, + "learning_rate": 3.2004940651859844e-05, + "loss": 0.3109, + "num_input_tokens_seen": 25031120, + "step": 16385 + }, + { + "epoch": 50.58732612055641, + "grad_norm": 0.38889333605766296, + "learning_rate": 3.1995515900121655e-05, + "loss": 0.2494, + "num_input_tokens_seen": 25039408, + "step": 16390 + }, + { + "epoch": 50.60278207109737, + "grad_norm": 0.48646998405456543, + "learning_rate": 3.1986090069587e-05, + "loss": 0.2668, + "num_input_tokens_seen": 25047408, + "step": 16395 + }, + { + "epoch": 50.618238021638334, + "grad_norm": 0.5991451144218445, + "learning_rate": 3.1976663161709466e-05, + "loss": 0.2685, + "num_input_tokens_seen": 25055056, + "step": 16400 + }, + { + "epoch": 50.618238021638334, + "eval_loss": 0.31614547967910767, + "eval_runtime": 6.3255, + "eval_samples_per_second": 90.902, + "eval_steps_per_second": 22.765, + "num_input_tokens_seen": 25055056, + "step": 16400 + }, + { + "epoch": 50.63369397217929, + "grad_norm": 0.4481904208660126, + "learning_rate": 3.196723517794279e-05, + "loss": 0.2218, + "num_input_tokens_seen": 25063056, + "step": 16405 + }, + { + "epoch": 50.64914992272025, + "grad_norm": 0.4563565254211426, + "learning_rate": 3.19578061197409e-05, + "loss": 0.3084, + "num_input_tokens_seen": 25070992, + "step": 16410 + }, + { + "epoch": 50.66460587326121, + "grad_norm": 0.4527413845062256, + "learning_rate": 3.194837598855787e-05, + "loss": 0.2645, + "num_input_tokens_seen": 25078224, + "step": 16415 + }, + { + "epoch": 50.68006182380216, + "grad_norm": 0.6317687630653381, + "learning_rate": 3.193894478584794e-05, + "loss": 0.3064, + "num_input_tokens_seen": 25086480, + "step": 16420 + }, + { + "epoch": 50.695517774343124, + "grad_norm": 0.5777757167816162, + "learning_rate": 3.192951251306553e-05, + "loss": 0.2814, + "num_input_tokens_seen": 25093776, + "step": 16425 + }, + { + "epoch": 50.71097372488408, + "grad_norm": 0.4865227937698364, + "learning_rate": 3.192007917166521e-05, + "loss": 0.2293, + "num_input_tokens_seen": 25101232, + "step": 16430 + }, + { + "epoch": 50.72642967542504, + "grad_norm": 0.378996342420578, + "learning_rate": 3.191064476310171e-05, + "loss": 0.2297, + "num_input_tokens_seen": 25108656, + "step": 16435 + }, + { + "epoch": 50.741885625966, + "grad_norm": 0.5088365077972412, + "learning_rate": 3.1901209288829944e-05, + "loss": 0.2307, + "num_input_tokens_seen": 25116976, + "step": 16440 + }, + { + "epoch": 50.75734157650695, + "grad_norm": 0.6280990242958069, + "learning_rate": 3.1891772750304985e-05, + "loss": 0.2916, + "num_input_tokens_seen": 25124624, + "step": 16445 + }, + { + "epoch": 50.77279752704791, + "grad_norm": 0.6540954113006592, + "learning_rate": 3.188233514898206e-05, + "loss": 0.2598, + "num_input_tokens_seen": 25132560, + "step": 16450 + }, + { + "epoch": 50.788253477588874, + "grad_norm": 0.3471432030200958, + "learning_rate": 3.187289648631657e-05, + "loss": 0.2477, + "num_input_tokens_seen": 25140624, + "step": 16455 + }, + { + "epoch": 50.80370942812983, + "grad_norm": 0.3370917737483978, + "learning_rate": 3.186345676376406e-05, + "loss": 0.2299, + "num_input_tokens_seen": 25148496, + "step": 16460 + }, + { + "epoch": 50.81916537867079, + "grad_norm": 0.45670318603515625, + "learning_rate": 3.1854015982780275e-05, + "loss": 0.1975, + "num_input_tokens_seen": 25156432, + "step": 16465 + }, + { + "epoch": 50.83462132921175, + "grad_norm": 0.6453576683998108, + "learning_rate": 3.1844574144821084e-05, + "loss": 0.213, + "num_input_tokens_seen": 25164240, + "step": 16470 + }, + { + "epoch": 50.8500772797527, + "grad_norm": 0.8542863130569458, + "learning_rate": 3.1835131251342554e-05, + "loss": 0.2609, + "num_input_tokens_seen": 25171760, + "step": 16475 + }, + { + "epoch": 50.865533230293664, + "grad_norm": 0.41076862812042236, + "learning_rate": 3.182568730380089e-05, + "loss": 0.2634, + "num_input_tokens_seen": 25179504, + "step": 16480 + }, + { + "epoch": 50.88098918083462, + "grad_norm": 0.6864318251609802, + "learning_rate": 3.181624230365245e-05, + "loss": 0.3094, + "num_input_tokens_seen": 25186896, + "step": 16485 + }, + { + "epoch": 50.89644513137558, + "grad_norm": 0.7534364461898804, + "learning_rate": 3.180679625235381e-05, + "loss": 0.2326, + "num_input_tokens_seen": 25194704, + "step": 16490 + }, + { + "epoch": 50.91190108191654, + "grad_norm": 0.5446121096611023, + "learning_rate": 3.1797349151361646e-05, + "loss": 0.2603, + "num_input_tokens_seen": 25202096, + "step": 16495 + }, + { + "epoch": 50.92735703245749, + "grad_norm": 0.46491578221321106, + "learning_rate": 3.178790100213281e-05, + "loss": 0.2279, + "num_input_tokens_seen": 25209360, + "step": 16500 + }, + { + "epoch": 50.942812982998454, + "grad_norm": 0.6760479211807251, + "learning_rate": 3.1778451806124346e-05, + "loss": 0.2356, + "num_input_tokens_seen": 25217168, + "step": 16505 + }, + { + "epoch": 50.958268933539415, + "grad_norm": 0.6679714322090149, + "learning_rate": 3.176900156479342e-05, + "loss": 0.2984, + "num_input_tokens_seen": 25224688, + "step": 16510 + }, + { + "epoch": 50.97372488408037, + "grad_norm": 0.5691900253295898, + "learning_rate": 3.17595502795974e-05, + "loss": 0.2484, + "num_input_tokens_seen": 25232624, + "step": 16515 + }, + { + "epoch": 50.98918083462133, + "grad_norm": 0.8480483293533325, + "learning_rate": 3.175009795199377e-05, + "loss": 0.2784, + "num_input_tokens_seen": 25240368, + "step": 16520 + }, + { + "epoch": 51.003091190108194, + "grad_norm": 0.45839619636535645, + "learning_rate": 3.1740644583440224e-05, + "loss": 0.2684, + "num_input_tokens_seen": 25247024, + "step": 16525 + }, + { + "epoch": 51.01854714064915, + "grad_norm": 0.7835742235183716, + "learning_rate": 3.173119017539457e-05, + "loss": 0.2609, + "num_input_tokens_seen": 25254352, + "step": 16530 + }, + { + "epoch": 51.03400309119011, + "grad_norm": 0.4312191903591156, + "learning_rate": 3.172173472931479e-05, + "loss": 0.2452, + "num_input_tokens_seen": 25261680, + "step": 16535 + }, + { + "epoch": 51.04945904173107, + "grad_norm": 0.5156587958335876, + "learning_rate": 3.1712278246659055e-05, + "loss": 0.3268, + "num_input_tokens_seen": 25269008, + "step": 16540 + }, + { + "epoch": 51.06491499227202, + "grad_norm": 0.6268177628517151, + "learning_rate": 3.170282072888566e-05, + "loss": 0.2555, + "num_input_tokens_seen": 25276464, + "step": 16545 + }, + { + "epoch": 51.08037094281298, + "grad_norm": 0.24602356553077698, + "learning_rate": 3.169336217745307e-05, + "loss": 0.2703, + "num_input_tokens_seen": 25284400, + "step": 16550 + }, + { + "epoch": 51.095826893353944, + "grad_norm": 0.3915085792541504, + "learning_rate": 3.1683902593819924e-05, + "loss": 0.2036, + "num_input_tokens_seen": 25292464, + "step": 16555 + }, + { + "epoch": 51.1112828438949, + "grad_norm": 0.4236101806163788, + "learning_rate": 3.1674441979445e-05, + "loss": 0.2012, + "num_input_tokens_seen": 25299920, + "step": 16560 + }, + { + "epoch": 51.12673879443586, + "grad_norm": 0.6341826915740967, + "learning_rate": 3.166498033578725e-05, + "loss": 0.3092, + "num_input_tokens_seen": 25307824, + "step": 16565 + }, + { + "epoch": 51.14219474497681, + "grad_norm": 0.2911948561668396, + "learning_rate": 3.165551766430578e-05, + "loss": 0.2525, + "num_input_tokens_seen": 25314960, + "step": 16570 + }, + { + "epoch": 51.15765069551777, + "grad_norm": 0.5051543116569519, + "learning_rate": 3.164605396645984e-05, + "loss": 0.2706, + "num_input_tokens_seen": 25322096, + "step": 16575 + }, + { + "epoch": 51.173106646058734, + "grad_norm": 0.6210150122642517, + "learning_rate": 3.163658924370886e-05, + "loss": 0.2811, + "num_input_tokens_seen": 25329808, + "step": 16580 + }, + { + "epoch": 51.18856259659969, + "grad_norm": 0.6521432399749756, + "learning_rate": 3.1627123497512415e-05, + "loss": 0.2565, + "num_input_tokens_seen": 25337712, + "step": 16585 + }, + { + "epoch": 51.20401854714065, + "grad_norm": 0.4901999235153198, + "learning_rate": 3.1617656729330245e-05, + "loss": 0.2644, + "num_input_tokens_seen": 25345456, + "step": 16590 + }, + { + "epoch": 51.21947449768161, + "grad_norm": 0.8377752900123596, + "learning_rate": 3.1608188940622255e-05, + "loss": 0.2418, + "num_input_tokens_seen": 25352720, + "step": 16595 + }, + { + "epoch": 51.23493044822256, + "grad_norm": 0.597045361995697, + "learning_rate": 3.159872013284847e-05, + "loss": 0.2823, + "num_input_tokens_seen": 25360976, + "step": 16600 + }, + { + "epoch": 51.23493044822256, + "eval_loss": 0.3140900135040283, + "eval_runtime": 6.3144, + "eval_samples_per_second": 91.061, + "eval_steps_per_second": 22.805, + "num_input_tokens_seen": 25360976, + "step": 16600 + }, + { + "epoch": 51.250386398763524, + "grad_norm": 0.5598127841949463, + "learning_rate": 3.1589250307469134e-05, + "loss": 0.2248, + "num_input_tokens_seen": 25369360, + "step": 16605 + }, + { + "epoch": 51.265842349304485, + "grad_norm": 0.7669420838356018, + "learning_rate": 3.1579779465944586e-05, + "loss": 0.2558, + "num_input_tokens_seen": 25377328, + "step": 16610 + }, + { + "epoch": 51.28129829984544, + "grad_norm": 0.5673779249191284, + "learning_rate": 3.1570307609735363e-05, + "loss": 0.3251, + "num_input_tokens_seen": 25385712, + "step": 16615 + }, + { + "epoch": 51.2967542503864, + "grad_norm": 0.5076112747192383, + "learning_rate": 3.156083474030213e-05, + "loss": 0.2675, + "num_input_tokens_seen": 25392848, + "step": 16620 + }, + { + "epoch": 51.31221020092736, + "grad_norm": 0.4664924740791321, + "learning_rate": 3.155136085910573e-05, + "loss": 0.2154, + "num_input_tokens_seen": 25400944, + "step": 16625 + }, + { + "epoch": 51.327666151468314, + "grad_norm": 0.7010108828544617, + "learning_rate": 3.154188596760717e-05, + "loss": 0.2814, + "num_input_tokens_seen": 25408048, + "step": 16630 + }, + { + "epoch": 51.343122102009275, + "grad_norm": 0.745057225227356, + "learning_rate": 3.153241006726757e-05, + "loss": 0.276, + "num_input_tokens_seen": 25415248, + "step": 16635 + }, + { + "epoch": 51.35857805255023, + "grad_norm": 0.5705413222312927, + "learning_rate": 3.152293315954825e-05, + "loss": 0.2362, + "num_input_tokens_seen": 25423088, + "step": 16640 + }, + { + "epoch": 51.37403400309119, + "grad_norm": 0.5206142663955688, + "learning_rate": 3.1513455245910666e-05, + "loss": 0.2653, + "num_input_tokens_seen": 25431024, + "step": 16645 + }, + { + "epoch": 51.38948995363215, + "grad_norm": 0.5310730934143066, + "learning_rate": 3.150397632781643e-05, + "loss": 0.261, + "num_input_tokens_seen": 25438832, + "step": 16650 + }, + { + "epoch": 51.404945904173104, + "grad_norm": 0.665678083896637, + "learning_rate": 3.149449640672731e-05, + "loss": 0.2719, + "num_input_tokens_seen": 25446672, + "step": 16655 + }, + { + "epoch": 51.420401854714065, + "grad_norm": 0.3671247661113739, + "learning_rate": 3.148501548410523e-05, + "loss": 0.23, + "num_input_tokens_seen": 25454544, + "step": 16660 + }, + { + "epoch": 51.435857805255026, + "grad_norm": 0.42991721630096436, + "learning_rate": 3.1475533561412256e-05, + "loss": 0.2284, + "num_input_tokens_seen": 25462160, + "step": 16665 + }, + { + "epoch": 51.45131375579598, + "grad_norm": 0.5450485944747925, + "learning_rate": 3.146605064011065e-05, + "loss": 0.2214, + "num_input_tokens_seen": 25470160, + "step": 16670 + }, + { + "epoch": 51.46676970633694, + "grad_norm": 0.5950295925140381, + "learning_rate": 3.145656672166277e-05, + "loss": 0.2352, + "num_input_tokens_seen": 25477488, + "step": 16675 + }, + { + "epoch": 51.4822256568779, + "grad_norm": 0.7224443554878235, + "learning_rate": 3.144708180753116e-05, + "loss": 0.253, + "num_input_tokens_seen": 25485584, + "step": 16680 + }, + { + "epoch": 51.497681607418855, + "grad_norm": 0.44728174805641174, + "learning_rate": 3.143759589917851e-05, + "loss": 0.3285, + "num_input_tokens_seen": 25493072, + "step": 16685 + }, + { + "epoch": 51.513137557959816, + "grad_norm": 0.3595462143421173, + "learning_rate": 3.142810899806768e-05, + "loss": 0.2398, + "num_input_tokens_seen": 25500688, + "step": 16690 + }, + { + "epoch": 51.52859350850077, + "grad_norm": 0.3742568790912628, + "learning_rate": 3.141862110566166e-05, + "loss": 0.2772, + "num_input_tokens_seen": 25508912, + "step": 16695 + }, + { + "epoch": 51.54404945904173, + "grad_norm": 0.4883486032485962, + "learning_rate": 3.1409132223423606e-05, + "loss": 0.2423, + "num_input_tokens_seen": 25516144, + "step": 16700 + }, + { + "epoch": 51.55950540958269, + "grad_norm": 0.3661101758480072, + "learning_rate": 3.139964235281682e-05, + "loss": 0.2964, + "num_input_tokens_seen": 25524080, + "step": 16705 + }, + { + "epoch": 51.574961360123645, + "grad_norm": 0.4953065514564514, + "learning_rate": 3.139015149530476e-05, + "loss": 0.243, + "num_input_tokens_seen": 25531408, + "step": 16710 + }, + { + "epoch": 51.590417310664606, + "grad_norm": 0.4401352107524872, + "learning_rate": 3.1380659652351034e-05, + "loss": 0.2164, + "num_input_tokens_seen": 25539312, + "step": 16715 + }, + { + "epoch": 51.605873261205566, + "grad_norm": 0.5090393424034119, + "learning_rate": 3.137116682541941e-05, + "loss": 0.2431, + "num_input_tokens_seen": 25546736, + "step": 16720 + }, + { + "epoch": 51.62132921174652, + "grad_norm": 0.43014514446258545, + "learning_rate": 3.136167301597379e-05, + "loss": 0.2914, + "num_input_tokens_seen": 25554608, + "step": 16725 + }, + { + "epoch": 51.63678516228748, + "grad_norm": 0.4757860004901886, + "learning_rate": 3.1352178225478254e-05, + "loss": 0.2719, + "num_input_tokens_seen": 25562032, + "step": 16730 + }, + { + "epoch": 51.65224111282844, + "grad_norm": 0.6423874497413635, + "learning_rate": 3.1342682455396996e-05, + "loss": 0.2559, + "num_input_tokens_seen": 25569552, + "step": 16735 + }, + { + "epoch": 51.667697063369395, + "grad_norm": 0.525138258934021, + "learning_rate": 3.133318570719441e-05, + "loss": 0.2161, + "num_input_tokens_seen": 25577136, + "step": 16740 + }, + { + "epoch": 51.683153013910356, + "grad_norm": 0.5056799054145813, + "learning_rate": 3.132368798233499e-05, + "loss": 0.3376, + "num_input_tokens_seen": 25585136, + "step": 16745 + }, + { + "epoch": 51.69860896445132, + "grad_norm": 0.7756030559539795, + "learning_rate": 3.131418928228342e-05, + "loss": 0.3053, + "num_input_tokens_seen": 25592528, + "step": 16750 + }, + { + "epoch": 51.71406491499227, + "grad_norm": 0.5350439548492432, + "learning_rate": 3.1304689608504514e-05, + "loss": 0.3454, + "num_input_tokens_seen": 25600208, + "step": 16755 + }, + { + "epoch": 51.72952086553323, + "grad_norm": 0.4051505923271179, + "learning_rate": 3.129518896246324e-05, + "loss": 0.2293, + "num_input_tokens_seen": 25607824, + "step": 16760 + }, + { + "epoch": 51.744976816074185, + "grad_norm": 0.9270668029785156, + "learning_rate": 3.128568734562472e-05, + "loss": 0.277, + "num_input_tokens_seen": 25615504, + "step": 16765 + }, + { + "epoch": 51.760432766615146, + "grad_norm": 0.47722724080085754, + "learning_rate": 3.127618475945421e-05, + "loss": 0.2364, + "num_input_tokens_seen": 25623408, + "step": 16770 + }, + { + "epoch": 51.77588871715611, + "grad_norm": 0.5351205468177795, + "learning_rate": 3.126668120541715e-05, + "loss": 0.2851, + "num_input_tokens_seen": 25630928, + "step": 16775 + }, + { + "epoch": 51.79134466769706, + "grad_norm": 0.5677598714828491, + "learning_rate": 3.1257176684979096e-05, + "loss": 0.3004, + "num_input_tokens_seen": 25638224, + "step": 16780 + }, + { + "epoch": 51.80680061823802, + "grad_norm": 0.6548559069633484, + "learning_rate": 3.124767119960576e-05, + "loss": 0.2364, + "num_input_tokens_seen": 25646384, + "step": 16785 + }, + { + "epoch": 51.82225656877898, + "grad_norm": 0.49318477511405945, + "learning_rate": 3.123816475076301e-05, + "loss": 0.2188, + "num_input_tokens_seen": 25653968, + "step": 16790 + }, + { + "epoch": 51.837712519319936, + "grad_norm": 0.43314266204833984, + "learning_rate": 3.122865733991687e-05, + "loss": 0.2269, + "num_input_tokens_seen": 25661520, + "step": 16795 + }, + { + "epoch": 51.8531684698609, + "grad_norm": 0.5371931195259094, + "learning_rate": 3.1219148968533486e-05, + "loss": 0.2348, + "num_input_tokens_seen": 25669136, + "step": 16800 + }, + { + "epoch": 51.8531684698609, + "eval_loss": 0.3148178458213806, + "eval_runtime": 6.3068, + "eval_samples_per_second": 91.172, + "eval_steps_per_second": 22.833, + "num_input_tokens_seen": 25669136, + "step": 16800 + }, + { + "epoch": 51.86862442040186, + "grad_norm": 0.39724403619766235, + "learning_rate": 3.120963963807918e-05, + "loss": 0.3112, + "num_input_tokens_seen": 25676912, + "step": 16805 + }, + { + "epoch": 51.88408037094281, + "grad_norm": 0.43200013041496277, + "learning_rate": 3.12001293500204e-05, + "loss": 0.2264, + "num_input_tokens_seen": 25684432, + "step": 16810 + }, + { + "epoch": 51.89953632148377, + "grad_norm": 0.5046736001968384, + "learning_rate": 3.1190618105823765e-05, + "loss": 0.247, + "num_input_tokens_seen": 25691920, + "step": 16815 + }, + { + "epoch": 51.914992272024726, + "grad_norm": 0.4014033079147339, + "learning_rate": 3.118110590695603e-05, + "loss": 0.2429, + "num_input_tokens_seen": 25700208, + "step": 16820 + }, + { + "epoch": 51.93044822256569, + "grad_norm": 0.4742620289325714, + "learning_rate": 3.117159275488407e-05, + "loss": 0.2907, + "num_input_tokens_seen": 25707696, + "step": 16825 + }, + { + "epoch": 51.94590417310665, + "grad_norm": 0.4832633435726166, + "learning_rate": 3.1162078651074956e-05, + "loss": 0.2823, + "num_input_tokens_seen": 25715184, + "step": 16830 + }, + { + "epoch": 51.9613601236476, + "grad_norm": 0.4050208330154419, + "learning_rate": 3.1152563596995885e-05, + "loss": 0.2315, + "num_input_tokens_seen": 25722704, + "step": 16835 + }, + { + "epoch": 51.97681607418856, + "grad_norm": 0.6341502070426941, + "learning_rate": 3.1143047594114186e-05, + "loss": 0.3434, + "num_input_tokens_seen": 25730544, + "step": 16840 + }, + { + "epoch": 51.99227202472952, + "grad_norm": 0.886052131652832, + "learning_rate": 3.113353064389734e-05, + "loss": 0.2215, + "num_input_tokens_seen": 25737904, + "step": 16845 + }, + { + "epoch": 52.00618238021638, + "grad_norm": 0.42958348989486694, + "learning_rate": 3.1124012747812993e-05, + "loss": 0.2093, + "num_input_tokens_seen": 25744400, + "step": 16850 + }, + { + "epoch": 52.02163833075734, + "grad_norm": 0.4403996169567108, + "learning_rate": 3.1114493907328936e-05, + "loss": 0.2308, + "num_input_tokens_seen": 25752272, + "step": 16855 + }, + { + "epoch": 52.0370942812983, + "grad_norm": 0.4954417049884796, + "learning_rate": 3.110497412391306e-05, + "loss": 0.2706, + "num_input_tokens_seen": 25759664, + "step": 16860 + }, + { + "epoch": 52.052550231839255, + "grad_norm": 0.629467248916626, + "learning_rate": 3.1095453399033466e-05, + "loss": 0.2659, + "num_input_tokens_seen": 25767344, + "step": 16865 + }, + { + "epoch": 52.068006182380216, + "grad_norm": 0.5154246091842651, + "learning_rate": 3.108593173415835e-05, + "loss": 0.2538, + "num_input_tokens_seen": 25774992, + "step": 16870 + }, + { + "epoch": 52.08346213292118, + "grad_norm": 0.3127736449241638, + "learning_rate": 3.107640913075609e-05, + "loss": 0.2127, + "num_input_tokens_seen": 25782160, + "step": 16875 + }, + { + "epoch": 52.09891808346213, + "grad_norm": 0.513810396194458, + "learning_rate": 3.106688559029517e-05, + "loss": 0.2221, + "num_input_tokens_seen": 25789680, + "step": 16880 + }, + { + "epoch": 52.11437403400309, + "grad_norm": 0.31364795565605164, + "learning_rate": 3.105736111424425e-05, + "loss": 0.2363, + "num_input_tokens_seen": 25797424, + "step": 16885 + }, + { + "epoch": 52.12982998454405, + "grad_norm": 0.4910198748111725, + "learning_rate": 3.1047835704072136e-05, + "loss": 0.2176, + "num_input_tokens_seen": 25804496, + "step": 16890 + }, + { + "epoch": 52.145285935085006, + "grad_norm": 0.3620013892650604, + "learning_rate": 3.103830936124775e-05, + "loss": 0.3207, + "num_input_tokens_seen": 25812112, + "step": 16895 + }, + { + "epoch": 52.16074188562597, + "grad_norm": 0.4373488426208496, + "learning_rate": 3.102878208724018e-05, + "loss": 0.2332, + "num_input_tokens_seen": 25819664, + "step": 16900 + }, + { + "epoch": 52.17619783616692, + "grad_norm": 0.5368002653121948, + "learning_rate": 3.101925388351865e-05, + "loss": 0.2136, + "num_input_tokens_seen": 25827408, + "step": 16905 + }, + { + "epoch": 52.19165378670788, + "grad_norm": 0.3645363748073578, + "learning_rate": 3.1009724751552515e-05, + "loss": 0.2363, + "num_input_tokens_seen": 25834960, + "step": 16910 + }, + { + "epoch": 52.20710973724884, + "grad_norm": 0.4466310143470764, + "learning_rate": 3.100019469281131e-05, + "loss": 0.2518, + "num_input_tokens_seen": 25842192, + "step": 16915 + }, + { + "epoch": 52.222565687789796, + "grad_norm": 0.32382553815841675, + "learning_rate": 3.0990663708764685e-05, + "loss": 0.2359, + "num_input_tokens_seen": 25849264, + "step": 16920 + }, + { + "epoch": 52.23802163833076, + "grad_norm": 1.003248929977417, + "learning_rate": 3.098113180088243e-05, + "loss": 0.3715, + "num_input_tokens_seen": 25857136, + "step": 16925 + }, + { + "epoch": 52.25347758887172, + "grad_norm": 0.4678618609905243, + "learning_rate": 3.097159897063448e-05, + "loss": 0.2451, + "num_input_tokens_seen": 25864784, + "step": 16930 + }, + { + "epoch": 52.26893353941267, + "grad_norm": 0.5458438396453857, + "learning_rate": 3.096206521949094e-05, + "loss": 0.3141, + "num_input_tokens_seen": 25872528, + "step": 16935 + }, + { + "epoch": 52.28438948995363, + "grad_norm": 0.7360817193984985, + "learning_rate": 3.0952530548922006e-05, + "loss": 0.2847, + "num_input_tokens_seen": 25880528, + "step": 16940 + }, + { + "epoch": 52.29984544049459, + "grad_norm": 0.8665243983268738, + "learning_rate": 3.0942994960398064e-05, + "loss": 0.2741, + "num_input_tokens_seen": 25888848, + "step": 16945 + }, + { + "epoch": 52.31530139103555, + "grad_norm": 0.6714643836021423, + "learning_rate": 3.093345845538961e-05, + "loss": 0.2167, + "num_input_tokens_seen": 25896176, + "step": 16950 + }, + { + "epoch": 52.33075734157651, + "grad_norm": 0.628822386264801, + "learning_rate": 3.09239210353673e-05, + "loss": 0.2988, + "num_input_tokens_seen": 25904080, + "step": 16955 + }, + { + "epoch": 52.34621329211747, + "grad_norm": 0.4324052333831787, + "learning_rate": 3.0914382701801926e-05, + "loss": 0.2902, + "num_input_tokens_seen": 25912112, + "step": 16960 + }, + { + "epoch": 52.36166924265842, + "grad_norm": 0.5570214986801147, + "learning_rate": 3.090484345616441e-05, + "loss": 0.276, + "num_input_tokens_seen": 25919408, + "step": 16965 + }, + { + "epoch": 52.37712519319938, + "grad_norm": 0.6727666258811951, + "learning_rate": 3.0895303299925825e-05, + "loss": 0.2679, + "num_input_tokens_seen": 25926352, + "step": 16970 + }, + { + "epoch": 52.39258114374034, + "grad_norm": 0.4341881573200226, + "learning_rate": 3.0885762234557393e-05, + "loss": 0.3083, + "num_input_tokens_seen": 25934192, + "step": 16975 + }, + { + "epoch": 52.4080370942813, + "grad_norm": 0.3781125247478485, + "learning_rate": 3.087622026153045e-05, + "loss": 0.2769, + "num_input_tokens_seen": 25941840, + "step": 16980 + }, + { + "epoch": 52.42349304482226, + "grad_norm": 0.5077627301216125, + "learning_rate": 3.086667738231651e-05, + "loss": 0.2633, + "num_input_tokens_seen": 25949552, + "step": 16985 + }, + { + "epoch": 52.43894899536321, + "grad_norm": 0.6681261658668518, + "learning_rate": 3.085713359838718e-05, + "loss": 0.321, + "num_input_tokens_seen": 25957648, + "step": 16990 + }, + { + "epoch": 52.45440494590417, + "grad_norm": 0.629081666469574, + "learning_rate": 3.084758891121425e-05, + "loss": 0.2591, + "num_input_tokens_seen": 25965264, + "step": 16995 + }, + { + "epoch": 52.469860896445134, + "grad_norm": 0.5859989523887634, + "learning_rate": 3.083804332226963e-05, + "loss": 0.2632, + "num_input_tokens_seen": 25972400, + "step": 17000 + }, + { + "epoch": 52.469860896445134, + "eval_loss": 0.3149999976158142, + "eval_runtime": 6.2987, + "eval_samples_per_second": 91.289, + "eval_steps_per_second": 22.862, + "num_input_tokens_seen": 25972400, + "step": 17000 + }, + { + "epoch": 52.48531684698609, + "grad_norm": 0.411560982465744, + "learning_rate": 3.082849683302536e-05, + "loss": 0.2099, + "num_input_tokens_seen": 25979856, + "step": 17005 + }, + { + "epoch": 52.50077279752705, + "grad_norm": 0.6945553421974182, + "learning_rate": 3.081894944495363e-05, + "loss": 0.2557, + "num_input_tokens_seen": 25987376, + "step": 17010 + }, + { + "epoch": 52.51622874806801, + "grad_norm": 0.39090055227279663, + "learning_rate": 3.080940115952677e-05, + "loss": 0.2235, + "num_input_tokens_seen": 25995472, + "step": 17015 + }, + { + "epoch": 52.53168469860896, + "grad_norm": 0.3978305757045746, + "learning_rate": 3.0799851978217245e-05, + "loss": 0.2111, + "num_input_tokens_seen": 26002800, + "step": 17020 + }, + { + "epoch": 52.547140649149924, + "grad_norm": 0.3770975172519684, + "learning_rate": 3.0790301902497666e-05, + "loss": 0.2855, + "num_input_tokens_seen": 26010000, + "step": 17025 + }, + { + "epoch": 52.56259659969088, + "grad_norm": 0.49629759788513184, + "learning_rate": 3.078075093384076e-05, + "loss": 0.2738, + "num_input_tokens_seen": 26017904, + "step": 17030 + }, + { + "epoch": 52.57805255023184, + "grad_norm": 0.48910221457481384, + "learning_rate": 3.077119907371942e-05, + "loss": 0.2185, + "num_input_tokens_seen": 26026160, + "step": 17035 + }, + { + "epoch": 52.5935085007728, + "grad_norm": 0.6052612662315369, + "learning_rate": 3.076164632360666e-05, + "loss": 0.317, + "num_input_tokens_seen": 26033968, + "step": 17040 + }, + { + "epoch": 52.60896445131375, + "grad_norm": 0.43207594752311707, + "learning_rate": 3.075209268497563e-05, + "loss": 0.2393, + "num_input_tokens_seen": 26041968, + "step": 17045 + }, + { + "epoch": 52.624420401854714, + "grad_norm": 0.5738855004310608, + "learning_rate": 3.074253815929961e-05, + "loss": 0.2545, + "num_input_tokens_seen": 26049904, + "step": 17050 + }, + { + "epoch": 52.639876352395675, + "grad_norm": 0.2902267575263977, + "learning_rate": 3.0732982748052054e-05, + "loss": 0.2672, + "num_input_tokens_seen": 26057616, + "step": 17055 + }, + { + "epoch": 52.65533230293663, + "grad_norm": 0.3078409433364868, + "learning_rate": 3.072342645270651e-05, + "loss": 0.2977, + "num_input_tokens_seen": 26065264, + "step": 17060 + }, + { + "epoch": 52.67078825347759, + "grad_norm": 0.44309455156326294, + "learning_rate": 3.071386927473668e-05, + "loss": 0.251, + "num_input_tokens_seen": 26072848, + "step": 17065 + }, + { + "epoch": 52.68624420401855, + "grad_norm": 0.4663460850715637, + "learning_rate": 3.0704311215616404e-05, + "loss": 0.3446, + "num_input_tokens_seen": 26081232, + "step": 17070 + }, + { + "epoch": 52.701700154559504, + "grad_norm": 0.5422477722167969, + "learning_rate": 3.0694752276819656e-05, + "loss": 0.2644, + "num_input_tokens_seen": 26089648, + "step": 17075 + }, + { + "epoch": 52.717156105100464, + "grad_norm": 0.6999765038490295, + "learning_rate": 3.068519245982054e-05, + "loss": 0.2691, + "num_input_tokens_seen": 26097520, + "step": 17080 + }, + { + "epoch": 52.732612055641425, + "grad_norm": 0.32003548741340637, + "learning_rate": 3.0675631766093304e-05, + "loss": 0.2156, + "num_input_tokens_seen": 26105200, + "step": 17085 + }, + { + "epoch": 52.74806800618238, + "grad_norm": 0.6667821407318115, + "learning_rate": 3.066607019711232e-05, + "loss": 0.2945, + "num_input_tokens_seen": 26112816, + "step": 17090 + }, + { + "epoch": 52.76352395672334, + "grad_norm": 0.35217973589897156, + "learning_rate": 3.065650775435211e-05, + "loss": 0.332, + "num_input_tokens_seen": 26120848, + "step": 17095 + }, + { + "epoch": 52.778979907264294, + "grad_norm": 0.4215817153453827, + "learning_rate": 3.0646944439287326e-05, + "loss": 0.2173, + "num_input_tokens_seen": 26128336, + "step": 17100 + }, + { + "epoch": 52.794435857805254, + "grad_norm": 0.37672942876815796, + "learning_rate": 3.0637380253392736e-05, + "loss": 0.2639, + "num_input_tokens_seen": 26135856, + "step": 17105 + }, + { + "epoch": 52.809891808346215, + "grad_norm": 0.5604127645492554, + "learning_rate": 3.062781519814327e-05, + "loss": 0.2232, + "num_input_tokens_seen": 26143216, + "step": 17110 + }, + { + "epoch": 52.82534775888717, + "grad_norm": 0.6165072917938232, + "learning_rate": 3.0618249275013985e-05, + "loss": 0.2378, + "num_input_tokens_seen": 26150672, + "step": 17115 + }, + { + "epoch": 52.84080370942813, + "grad_norm": 0.37457624077796936, + "learning_rate": 3.060868248548005e-05, + "loss": 0.2826, + "num_input_tokens_seen": 26158416, + "step": 17120 + }, + { + "epoch": 52.85625965996909, + "grad_norm": 0.7610495090484619, + "learning_rate": 3.0599114831016796e-05, + "loss": 0.2761, + "num_input_tokens_seen": 26166832, + "step": 17125 + }, + { + "epoch": 52.871715610510044, + "grad_norm": 0.5308418869972229, + "learning_rate": 3.0589546313099666e-05, + "loss": 0.2327, + "num_input_tokens_seen": 26174000, + "step": 17130 + }, + { + "epoch": 52.887171561051005, + "grad_norm": 0.579084038734436, + "learning_rate": 3.0579976933204255e-05, + "loss": 0.2556, + "num_input_tokens_seen": 26181904, + "step": 17135 + }, + { + "epoch": 52.902627511591966, + "grad_norm": 0.49289771914482117, + "learning_rate": 3.0570406692806284e-05, + "loss": 0.2569, + "num_input_tokens_seen": 26189392, + "step": 17140 + }, + { + "epoch": 52.91808346213292, + "grad_norm": 0.4968765079975128, + "learning_rate": 3.05608355933816e-05, + "loss": 0.2153, + "num_input_tokens_seen": 26196848, + "step": 17145 + }, + { + "epoch": 52.93353941267388, + "grad_norm": 0.6334372758865356, + "learning_rate": 3.055126363640618e-05, + "loss": 0.2535, + "num_input_tokens_seen": 26204368, + "step": 17150 + }, + { + "epoch": 52.948995363214834, + "grad_norm": 0.7013767957687378, + "learning_rate": 3.0541690823356146e-05, + "loss": 0.2737, + "num_input_tokens_seen": 26211472, + "step": 17155 + }, + { + "epoch": 52.964451313755795, + "grad_norm": 0.5478538274765015, + "learning_rate": 3.053211715570775e-05, + "loss": 0.3114, + "num_input_tokens_seen": 26219120, + "step": 17160 + }, + { + "epoch": 52.979907264296756, + "grad_norm": 0.7515898942947388, + "learning_rate": 3.052254263493736e-05, + "loss": 0.22, + "num_input_tokens_seen": 26226896, + "step": 17165 + }, + { + "epoch": 52.99536321483771, + "grad_norm": 1.6193257570266724, + "learning_rate": 3.0512967262521498e-05, + "loss": 0.275, + "num_input_tokens_seen": 26234832, + "step": 17170 + }, + { + "epoch": 53.009273570324574, + "grad_norm": 0.3699585795402527, + "learning_rate": 3.0503391039936803e-05, + "loss": 0.2558, + "num_input_tokens_seen": 26241584, + "step": 17175 + }, + { + "epoch": 53.024729520865534, + "grad_norm": 0.8647301197052002, + "learning_rate": 3.0493813968660056e-05, + "loss": 0.225, + "num_input_tokens_seen": 26249200, + "step": 17180 + }, + { + "epoch": 53.04018547140649, + "grad_norm": 0.6605184078216553, + "learning_rate": 3.0484236050168153e-05, + "loss": 0.24, + "num_input_tokens_seen": 26256560, + "step": 17185 + }, + { + "epoch": 53.05564142194745, + "grad_norm": 0.34593063592910767, + "learning_rate": 3.0474657285938123e-05, + "loss": 0.2777, + "num_input_tokens_seen": 26264464, + "step": 17190 + }, + { + "epoch": 53.07109737248841, + "grad_norm": 0.6122977137565613, + "learning_rate": 3.046507767744715e-05, + "loss": 0.2381, + "num_input_tokens_seen": 26271888, + "step": 17195 + }, + { + "epoch": 53.086553323029364, + "grad_norm": 0.5501947999000549, + "learning_rate": 3.045549722617252e-05, + "loss": 0.2542, + "num_input_tokens_seen": 26280272, + "step": 17200 + }, + { + "epoch": 53.086553323029364, + "eval_loss": 0.3144283592700958, + "eval_runtime": 6.2647, + "eval_samples_per_second": 91.785, + "eval_steps_per_second": 22.986, + "num_input_tokens_seen": 26280272, + "step": 17200 + }, + { + "epoch": 53.102009273570324, + "grad_norm": 0.6723687648773193, + "learning_rate": 3.0445915933591658e-05, + "loss": 0.2652, + "num_input_tokens_seen": 26287792, + "step": 17205 + }, + { + "epoch": 53.117465224111285, + "grad_norm": 0.4364381432533264, + "learning_rate": 3.0436333801182114e-05, + "loss": 0.302, + "num_input_tokens_seen": 26295216, + "step": 17210 + }, + { + "epoch": 53.13292117465224, + "grad_norm": 0.6726999282836914, + "learning_rate": 3.0426750830421596e-05, + "loss": 0.2275, + "num_input_tokens_seen": 26302480, + "step": 17215 + }, + { + "epoch": 53.1483771251932, + "grad_norm": 0.4049278795719147, + "learning_rate": 3.0417167022787897e-05, + "loss": 0.3694, + "num_input_tokens_seen": 26310000, + "step": 17220 + }, + { + "epoch": 53.16383307573416, + "grad_norm": 0.4968661665916443, + "learning_rate": 3.0407582379758966e-05, + "loss": 0.336, + "num_input_tokens_seen": 26317584, + "step": 17225 + }, + { + "epoch": 53.179289026275114, + "grad_norm": 0.7343124747276306, + "learning_rate": 3.039799690281287e-05, + "loss": 0.2365, + "num_input_tokens_seen": 26325360, + "step": 17230 + }, + { + "epoch": 53.194744976816075, + "grad_norm": 0.51253741979599, + "learning_rate": 3.0388410593427823e-05, + "loss": 0.2024, + "num_input_tokens_seen": 26332272, + "step": 17235 + }, + { + "epoch": 53.210200927357036, + "grad_norm": 0.3493416905403137, + "learning_rate": 3.0378823453082146e-05, + "loss": 0.2301, + "num_input_tokens_seen": 26340240, + "step": 17240 + }, + { + "epoch": 53.22565687789799, + "grad_norm": 0.4844779670238495, + "learning_rate": 3.03692354832543e-05, + "loss": 0.2487, + "num_input_tokens_seen": 26348112, + "step": 17245 + }, + { + "epoch": 53.24111282843895, + "grad_norm": 0.6483993530273438, + "learning_rate": 3.0359646685422865e-05, + "loss": 0.3359, + "num_input_tokens_seen": 26355888, + "step": 17250 + }, + { + "epoch": 53.256568778979904, + "grad_norm": 0.49502214789390564, + "learning_rate": 3.035005706106656e-05, + "loss": 0.248, + "num_input_tokens_seen": 26363088, + "step": 17255 + }, + { + "epoch": 53.272024729520865, + "grad_norm": 0.7990154027938843, + "learning_rate": 3.034046661166422e-05, + "loss": 0.2375, + "num_input_tokens_seen": 26370384, + "step": 17260 + }, + { + "epoch": 53.287480680061826, + "grad_norm": 0.5331736207008362, + "learning_rate": 3.033087533869482e-05, + "loss": 0.2496, + "num_input_tokens_seen": 26378192, + "step": 17265 + }, + { + "epoch": 53.30293663060278, + "grad_norm": 0.8321058750152588, + "learning_rate": 3.0321283243637444e-05, + "loss": 0.2559, + "num_input_tokens_seen": 26385584, + "step": 17270 + }, + { + "epoch": 53.31839258114374, + "grad_norm": 0.36901453137397766, + "learning_rate": 3.0311690327971326e-05, + "loss": 0.1975, + "num_input_tokens_seen": 26393392, + "step": 17275 + }, + { + "epoch": 53.3338485316847, + "grad_norm": 0.44938674569129944, + "learning_rate": 3.030209659317581e-05, + "loss": 0.3, + "num_input_tokens_seen": 26400880, + "step": 17280 + }, + { + "epoch": 53.349304482225655, + "grad_norm": 0.3454243540763855, + "learning_rate": 3.0292502040730362e-05, + "loss": 0.2511, + "num_input_tokens_seen": 26408272, + "step": 17285 + }, + { + "epoch": 53.364760432766616, + "grad_norm": 0.3214719891548157, + "learning_rate": 3.0282906672114597e-05, + "loss": 0.2424, + "num_input_tokens_seen": 26416144, + "step": 17290 + }, + { + "epoch": 53.38021638330758, + "grad_norm": 0.5973691940307617, + "learning_rate": 3.027331048880823e-05, + "loss": 0.2412, + "num_input_tokens_seen": 26423824, + "step": 17295 + }, + { + "epoch": 53.39567233384853, + "grad_norm": 0.5549883246421814, + "learning_rate": 3.0263713492291123e-05, + "loss": 0.4092, + "num_input_tokens_seen": 26431984, + "step": 17300 + }, + { + "epoch": 53.41112828438949, + "grad_norm": 0.7135182023048401, + "learning_rate": 3.0254115684043242e-05, + "loss": 0.3599, + "num_input_tokens_seen": 26439824, + "step": 17305 + }, + { + "epoch": 53.426584234930445, + "grad_norm": 0.34848740696907043, + "learning_rate": 3.024451706554469e-05, + "loss": 0.2462, + "num_input_tokens_seen": 26447312, + "step": 17310 + }, + { + "epoch": 53.442040185471406, + "grad_norm": 0.49563974142074585, + "learning_rate": 3.0234917638275705e-05, + "loss": 0.3438, + "num_input_tokens_seen": 26454864, + "step": 17315 + }, + { + "epoch": 53.45749613601237, + "grad_norm": 0.6348243355751038, + "learning_rate": 3.0225317403716635e-05, + "loss": 0.2351, + "num_input_tokens_seen": 26462640, + "step": 17320 + }, + { + "epoch": 53.47295208655332, + "grad_norm": 0.8735719323158264, + "learning_rate": 3.0215716363347956e-05, + "loss": 0.2092, + "num_input_tokens_seen": 26470096, + "step": 17325 + }, + { + "epoch": 53.48840803709428, + "grad_norm": 0.4010702967643738, + "learning_rate": 3.0206114518650275e-05, + "loss": 0.2442, + "num_input_tokens_seen": 26477328, + "step": 17330 + }, + { + "epoch": 53.50386398763524, + "grad_norm": 0.36304038763046265, + "learning_rate": 3.0196511871104304e-05, + "loss": 0.2057, + "num_input_tokens_seen": 26485744, + "step": 17335 + }, + { + "epoch": 53.519319938176196, + "grad_norm": 0.45182356238365173, + "learning_rate": 3.01869084221909e-05, + "loss": 0.2288, + "num_input_tokens_seen": 26493264, + "step": 17340 + }, + { + "epoch": 53.53477588871716, + "grad_norm": 0.56695955991745, + "learning_rate": 3.0177304173391037e-05, + "loss": 0.2342, + "num_input_tokens_seen": 26501040, + "step": 17345 + }, + { + "epoch": 53.55023183925812, + "grad_norm": 0.5890248417854309, + "learning_rate": 3.01676991261858e-05, + "loss": 0.2193, + "num_input_tokens_seen": 26508496, + "step": 17350 + }, + { + "epoch": 53.56568778979907, + "grad_norm": 0.4628632068634033, + "learning_rate": 3.015809328205642e-05, + "loss": 0.2621, + "num_input_tokens_seen": 26515824, + "step": 17355 + }, + { + "epoch": 53.58114374034003, + "grad_norm": 0.48404696583747864, + "learning_rate": 3.0148486642484248e-05, + "loss": 0.2271, + "num_input_tokens_seen": 26523664, + "step": 17360 + }, + { + "epoch": 53.59659969088099, + "grad_norm": 0.7052038311958313, + "learning_rate": 3.0138879208950722e-05, + "loss": 0.2195, + "num_input_tokens_seen": 26531056, + "step": 17365 + }, + { + "epoch": 53.61205564142195, + "grad_norm": 0.6883875727653503, + "learning_rate": 3.012927098293744e-05, + "loss": 0.2476, + "num_input_tokens_seen": 26538640, + "step": 17370 + }, + { + "epoch": 53.62751159196291, + "grad_norm": 0.42996299266815186, + "learning_rate": 3.0119661965926123e-05, + "loss": 0.2455, + "num_input_tokens_seen": 26546160, + "step": 17375 + }, + { + "epoch": 53.64296754250386, + "grad_norm": 0.3128948509693146, + "learning_rate": 3.0110052159398587e-05, + "loss": 0.2073, + "num_input_tokens_seen": 26553840, + "step": 17380 + }, + { + "epoch": 53.65842349304482, + "grad_norm": 0.44717153906822205, + "learning_rate": 3.0100441564836802e-05, + "loss": 0.2466, + "num_input_tokens_seen": 26561168, + "step": 17385 + }, + { + "epoch": 53.67387944358578, + "grad_norm": 0.5196998119354248, + "learning_rate": 3.0090830183722817e-05, + "loss": 0.2822, + "num_input_tokens_seen": 26568176, + "step": 17390 + }, + { + "epoch": 53.689335394126736, + "grad_norm": 0.2768576443195343, + "learning_rate": 3.0081218017538852e-05, + "loss": 0.2209, + "num_input_tokens_seen": 26575600, + "step": 17395 + }, + { + "epoch": 53.7047913446677, + "grad_norm": 0.6517194509506226, + "learning_rate": 3.0071605067767212e-05, + "loss": 0.2637, + "num_input_tokens_seen": 26583184, + "step": 17400 + }, + { + "epoch": 53.7047913446677, + "eval_loss": 0.31341245770454407, + "eval_runtime": 6.3038, + "eval_samples_per_second": 91.215, + "eval_steps_per_second": 22.843, + "num_input_tokens_seen": 26583184, + "step": 17400 + }, + { + "epoch": 53.72024729520866, + "grad_norm": 0.4189993739128113, + "learning_rate": 3.006199133589034e-05, + "loss": 0.2327, + "num_input_tokens_seen": 26590672, + "step": 17405 + }, + { + "epoch": 53.73570324574961, + "grad_norm": 0.4412473440170288, + "learning_rate": 3.005237682339079e-05, + "loss": 0.296, + "num_input_tokens_seen": 26598384, + "step": 17410 + }, + { + "epoch": 53.75115919629057, + "grad_norm": 0.4803880751132965, + "learning_rate": 3.0042761531751228e-05, + "loss": 0.2352, + "num_input_tokens_seen": 26606128, + "step": 17415 + }, + { + "epoch": 53.76661514683153, + "grad_norm": 0.830177366733551, + "learning_rate": 3.0033145462454482e-05, + "loss": 0.2323, + "num_input_tokens_seen": 26613808, + "step": 17420 + }, + { + "epoch": 53.78207109737249, + "grad_norm": 0.549387514591217, + "learning_rate": 3.002352861698345e-05, + "loss": 0.3068, + "num_input_tokens_seen": 26621904, + "step": 17425 + }, + { + "epoch": 53.79752704791345, + "grad_norm": 0.842488169670105, + "learning_rate": 3.0013910996821178e-05, + "loss": 0.3388, + "num_input_tokens_seen": 26629776, + "step": 17430 + }, + { + "epoch": 53.8129829984544, + "grad_norm": 0.5147616267204285, + "learning_rate": 3.0004292603450817e-05, + "loss": 0.2891, + "num_input_tokens_seen": 26637488, + "step": 17435 + }, + { + "epoch": 53.82843894899536, + "grad_norm": 0.554401159286499, + "learning_rate": 2.9994673438355653e-05, + "loss": 0.2768, + "num_input_tokens_seen": 26644880, + "step": 17440 + }, + { + "epoch": 53.84389489953632, + "grad_norm": 0.7010182738304138, + "learning_rate": 2.9985053503019078e-05, + "loss": 0.2787, + "num_input_tokens_seen": 26652848, + "step": 17445 + }, + { + "epoch": 53.85935085007728, + "grad_norm": 0.6857694983482361, + "learning_rate": 2.99754327989246e-05, + "loss": 0.2466, + "num_input_tokens_seen": 26660816, + "step": 17450 + }, + { + "epoch": 53.87480680061824, + "grad_norm": 0.5207811594009399, + "learning_rate": 2.9965811327555864e-05, + "loss": 0.2982, + "num_input_tokens_seen": 26669104, + "step": 17455 + }, + { + "epoch": 53.8902627511592, + "grad_norm": 0.6542634963989258, + "learning_rate": 2.995618909039662e-05, + "loss": 0.2552, + "num_input_tokens_seen": 26676976, + "step": 17460 + }, + { + "epoch": 53.90571870170015, + "grad_norm": 0.4239993095397949, + "learning_rate": 2.9946566088930727e-05, + "loss": 0.2329, + "num_input_tokens_seen": 26685168, + "step": 17465 + }, + { + "epoch": 53.92117465224111, + "grad_norm": 0.689130425453186, + "learning_rate": 2.9936942324642192e-05, + "loss": 0.295, + "num_input_tokens_seen": 26692304, + "step": 17470 + }, + { + "epoch": 53.936630602782074, + "grad_norm": 0.9486216306686401, + "learning_rate": 2.9927317799015097e-05, + "loss": 0.2763, + "num_input_tokens_seen": 26699920, + "step": 17475 + }, + { + "epoch": 53.95208655332303, + "grad_norm": 0.4095277190208435, + "learning_rate": 2.9917692513533685e-05, + "loss": 0.1855, + "num_input_tokens_seen": 26708048, + "step": 17480 + }, + { + "epoch": 53.96754250386399, + "grad_norm": 0.5438172817230225, + "learning_rate": 2.990806646968229e-05, + "loss": 0.2798, + "num_input_tokens_seen": 26715888, + "step": 17485 + }, + { + "epoch": 53.98299845440495, + "grad_norm": 0.48286277055740356, + "learning_rate": 2.989843966894536e-05, + "loss": 0.2606, + "num_input_tokens_seen": 26723472, + "step": 17490 + }, + { + "epoch": 53.9984544049459, + "grad_norm": 0.5249704718589783, + "learning_rate": 2.9888812112807472e-05, + "loss": 0.2437, + "num_input_tokens_seen": 26731056, + "step": 17495 + }, + { + "epoch": 54.01236476043277, + "grad_norm": 0.5486212372779846, + "learning_rate": 2.987918380275333e-05, + "loss": 0.2571, + "num_input_tokens_seen": 26738000, + "step": 17500 + }, + { + "epoch": 54.02782071097373, + "grad_norm": 0.40201979875564575, + "learning_rate": 2.9869554740267724e-05, + "loss": 0.2555, + "num_input_tokens_seen": 26746160, + "step": 17505 + }, + { + "epoch": 54.04327666151468, + "grad_norm": 0.744016170501709, + "learning_rate": 2.9859924926835585e-05, + "loss": 0.2241, + "num_input_tokens_seen": 26753776, + "step": 17510 + }, + { + "epoch": 54.05873261205564, + "grad_norm": 0.44436952471733093, + "learning_rate": 2.9850294363941944e-05, + "loss": 0.2725, + "num_input_tokens_seen": 26760944, + "step": 17515 + }, + { + "epoch": 54.074188562596596, + "grad_norm": 0.5290988683700562, + "learning_rate": 2.9840663053071967e-05, + "loss": 0.2149, + "num_input_tokens_seen": 26768496, + "step": 17520 + }, + { + "epoch": 54.08964451313756, + "grad_norm": 0.5520220398902893, + "learning_rate": 2.983103099571091e-05, + "loss": 0.3101, + "num_input_tokens_seen": 26776368, + "step": 17525 + }, + { + "epoch": 54.10510046367852, + "grad_norm": 0.6042412519454956, + "learning_rate": 2.9821398193344164e-05, + "loss": 0.2443, + "num_input_tokens_seen": 26783728, + "step": 17530 + }, + { + "epoch": 54.12055641421947, + "grad_norm": 0.6714248061180115, + "learning_rate": 2.9811764647457226e-05, + "loss": 0.2488, + "num_input_tokens_seen": 26791120, + "step": 17535 + }, + { + "epoch": 54.13601236476043, + "grad_norm": 0.43105348944664, + "learning_rate": 2.9802130359535714e-05, + "loss": 0.2443, + "num_input_tokens_seen": 26798576, + "step": 17540 + }, + { + "epoch": 54.15146831530139, + "grad_norm": 0.3026086091995239, + "learning_rate": 2.979249533106535e-05, + "loss": 0.3597, + "num_input_tokens_seen": 26806128, + "step": 17545 + }, + { + "epoch": 54.16692426584235, + "grad_norm": 0.5152492523193359, + "learning_rate": 2.9782859563531986e-05, + "loss": 0.2614, + "num_input_tokens_seen": 26813904, + "step": 17550 + }, + { + "epoch": 54.18238021638331, + "grad_norm": 0.5153869390487671, + "learning_rate": 2.977322305842156e-05, + "loss": 0.2789, + "num_input_tokens_seen": 26821328, + "step": 17555 + }, + { + "epoch": 54.19783616692427, + "grad_norm": 0.4043024480342865, + "learning_rate": 2.9763585817220162e-05, + "loss": 0.2099, + "num_input_tokens_seen": 26829328, + "step": 17560 + }, + { + "epoch": 54.21329211746522, + "grad_norm": 0.3261878788471222, + "learning_rate": 2.975394784141397e-05, + "loss": 0.2401, + "num_input_tokens_seen": 26837392, + "step": 17565 + }, + { + "epoch": 54.22874806800618, + "grad_norm": 0.501804769039154, + "learning_rate": 2.974430913248928e-05, + "loss": 0.2495, + "num_input_tokens_seen": 26844912, + "step": 17570 + }, + { + "epoch": 54.244204018547144, + "grad_norm": 0.5434606075286865, + "learning_rate": 2.9734669691932497e-05, + "loss": 0.2278, + "num_input_tokens_seen": 26852688, + "step": 17575 + }, + { + "epoch": 54.2596599690881, + "grad_norm": 0.4424755275249481, + "learning_rate": 2.9725029521230147e-05, + "loss": 0.271, + "num_input_tokens_seen": 26860624, + "step": 17580 + }, + { + "epoch": 54.27511591962906, + "grad_norm": 0.47354549169540405, + "learning_rate": 2.9715388621868873e-05, + "loss": 0.2606, + "num_input_tokens_seen": 26868112, + "step": 17585 + }, + { + "epoch": 54.29057187017001, + "grad_norm": 0.5604641437530518, + "learning_rate": 2.970574699533541e-05, + "loss": 0.2753, + "num_input_tokens_seen": 26875920, + "step": 17590 + }, + { + "epoch": 54.30602782071097, + "grad_norm": 0.6575353741645813, + "learning_rate": 2.969610464311662e-05, + "loss": 0.204, + "num_input_tokens_seen": 26883536, + "step": 17595 + }, + { + "epoch": 54.321483771251934, + "grad_norm": 0.5431023836135864, + "learning_rate": 2.9686461566699487e-05, + "loss": 0.2026, + "num_input_tokens_seen": 26891152, + "step": 17600 + }, + { + "epoch": 54.321483771251934, + "eval_loss": 0.3146488666534424, + "eval_runtime": 6.3117, + "eval_samples_per_second": 91.101, + "eval_steps_per_second": 22.815, + "num_input_tokens_seen": 26891152, + "step": 17600 + }, + { + "epoch": 54.33693972179289, + "grad_norm": 0.3233765959739685, + "learning_rate": 2.9676817767571086e-05, + "loss": 0.2438, + "num_input_tokens_seen": 26899088, + "step": 17605 + }, + { + "epoch": 54.35239567233385, + "grad_norm": 0.6578189730644226, + "learning_rate": 2.966717324721861e-05, + "loss": 0.2552, + "num_input_tokens_seen": 26906608, + "step": 17610 + }, + { + "epoch": 54.36785162287481, + "grad_norm": 0.4838980436325073, + "learning_rate": 2.9657528007129366e-05, + "loss": 0.2683, + "num_input_tokens_seen": 26913968, + "step": 17615 + }, + { + "epoch": 54.38330757341576, + "grad_norm": 0.6564570069313049, + "learning_rate": 2.9647882048790777e-05, + "loss": 0.2546, + "num_input_tokens_seen": 26921936, + "step": 17620 + }, + { + "epoch": 54.398763523956724, + "grad_norm": 0.6373048424720764, + "learning_rate": 2.963823537369037e-05, + "loss": 0.298, + "num_input_tokens_seen": 26929872, + "step": 17625 + }, + { + "epoch": 54.414219474497685, + "grad_norm": 0.5906521081924438, + "learning_rate": 2.9628587983315775e-05, + "loss": 0.2392, + "num_input_tokens_seen": 26937584, + "step": 17630 + }, + { + "epoch": 54.42967542503864, + "grad_norm": 0.7208932042121887, + "learning_rate": 2.9618939879154746e-05, + "loss": 0.2303, + "num_input_tokens_seen": 26946128, + "step": 17635 + }, + { + "epoch": 54.4451313755796, + "grad_norm": 0.3905966579914093, + "learning_rate": 2.9609291062695143e-05, + "loss": 0.2889, + "num_input_tokens_seen": 26954160, + "step": 17640 + }, + { + "epoch": 54.46058732612055, + "grad_norm": 0.3760851323604584, + "learning_rate": 2.9599641535424938e-05, + "loss": 0.2511, + "num_input_tokens_seen": 26962096, + "step": 17645 + }, + { + "epoch": 54.476043276661514, + "grad_norm": 0.5528916120529175, + "learning_rate": 2.9589991298832202e-05, + "loss": 0.2431, + "num_input_tokens_seen": 26969424, + "step": 17650 + }, + { + "epoch": 54.491499227202475, + "grad_norm": 0.4113554060459137, + "learning_rate": 2.958034035440513e-05, + "loss": 0.2869, + "num_input_tokens_seen": 26977072, + "step": 17655 + }, + { + "epoch": 54.50695517774343, + "grad_norm": 0.5577666759490967, + "learning_rate": 2.957068870363201e-05, + "loss": 0.2857, + "num_input_tokens_seen": 26984560, + "step": 17660 + }, + { + "epoch": 54.52241112828439, + "grad_norm": 0.3754449486732483, + "learning_rate": 2.956103634800126e-05, + "loss": 0.2153, + "num_input_tokens_seen": 26991952, + "step": 17665 + }, + { + "epoch": 54.53786707882535, + "grad_norm": 0.6527158617973328, + "learning_rate": 2.9551383289001384e-05, + "loss": 0.287, + "num_input_tokens_seen": 26999344, + "step": 17670 + }, + { + "epoch": 54.553323029366304, + "grad_norm": 0.4804117679595947, + "learning_rate": 2.9541729528121005e-05, + "loss": 0.2686, + "num_input_tokens_seen": 27007088, + "step": 17675 + }, + { + "epoch": 54.568778979907265, + "grad_norm": 0.6717604398727417, + "learning_rate": 2.9532075066848856e-05, + "loss": 0.2458, + "num_input_tokens_seen": 27015184, + "step": 17680 + }, + { + "epoch": 54.584234930448225, + "grad_norm": 0.28565508127212524, + "learning_rate": 2.9522419906673786e-05, + "loss": 0.2471, + "num_input_tokens_seen": 27022928, + "step": 17685 + }, + { + "epoch": 54.59969088098918, + "grad_norm": 0.3142811059951782, + "learning_rate": 2.951276404908474e-05, + "loss": 0.2664, + "num_input_tokens_seen": 27030672, + "step": 17690 + }, + { + "epoch": 54.61514683153014, + "grad_norm": 0.47443103790283203, + "learning_rate": 2.9503107495570752e-05, + "loss": 0.3346, + "num_input_tokens_seen": 27039056, + "step": 17695 + }, + { + "epoch": 54.630602782071094, + "grad_norm": 0.6933510899543762, + "learning_rate": 2.9493450247621003e-05, + "loss": 0.2497, + "num_input_tokens_seen": 27046768, + "step": 17700 + }, + { + "epoch": 54.646058732612055, + "grad_norm": 0.3843940496444702, + "learning_rate": 2.948379230672476e-05, + "loss": 0.2911, + "num_input_tokens_seen": 27054288, + "step": 17705 + }, + { + "epoch": 54.661514683153015, + "grad_norm": 0.35456016659736633, + "learning_rate": 2.9474133674371396e-05, + "loss": 0.2249, + "num_input_tokens_seen": 27061808, + "step": 17710 + }, + { + "epoch": 54.67697063369397, + "grad_norm": 0.4385337829589844, + "learning_rate": 2.9464474352050387e-05, + "loss": 0.2958, + "num_input_tokens_seen": 27069520, + "step": 17715 + }, + { + "epoch": 54.69242658423493, + "grad_norm": 0.5164781808853149, + "learning_rate": 2.9454814341251336e-05, + "loss": 0.2355, + "num_input_tokens_seen": 27076656, + "step": 17720 + }, + { + "epoch": 54.70788253477589, + "grad_norm": 0.6433148384094238, + "learning_rate": 2.9445153643463942e-05, + "loss": 0.2621, + "num_input_tokens_seen": 27084272, + "step": 17725 + }, + { + "epoch": 54.723338485316845, + "grad_norm": 0.42695555090904236, + "learning_rate": 2.943549226017798e-05, + "loss": 0.1996, + "num_input_tokens_seen": 27091120, + "step": 17730 + }, + { + "epoch": 54.738794435857805, + "grad_norm": 0.5465254187583923, + "learning_rate": 2.942583019288337e-05, + "loss": 0.2347, + "num_input_tokens_seen": 27098768, + "step": 17735 + }, + { + "epoch": 54.754250386398766, + "grad_norm": 0.40517476201057434, + "learning_rate": 2.9416167443070132e-05, + "loss": 0.2476, + "num_input_tokens_seen": 27105936, + "step": 17740 + }, + { + "epoch": 54.76970633693972, + "grad_norm": 0.5253052115440369, + "learning_rate": 2.9406504012228375e-05, + "loss": 0.3347, + "num_input_tokens_seen": 27113648, + "step": 17745 + }, + { + "epoch": 54.78516228748068, + "grad_norm": 0.29543229937553406, + "learning_rate": 2.939683990184832e-05, + "loss": 0.2848, + "num_input_tokens_seen": 27121520, + "step": 17750 + }, + { + "epoch": 54.80061823802164, + "grad_norm": 0.38247382640838623, + "learning_rate": 2.93871751134203e-05, + "loss": 0.2146, + "num_input_tokens_seen": 27128816, + "step": 17755 + }, + { + "epoch": 54.816074188562595, + "grad_norm": 0.4650510251522064, + "learning_rate": 2.9377509648434752e-05, + "loss": 0.2857, + "num_input_tokens_seen": 27136336, + "step": 17760 + }, + { + "epoch": 54.831530139103556, + "grad_norm": 0.886834442615509, + "learning_rate": 2.9367843508382203e-05, + "loss": 0.2135, + "num_input_tokens_seen": 27143792, + "step": 17765 + }, + { + "epoch": 54.84698608964451, + "grad_norm": 0.7253538966178894, + "learning_rate": 2.9358176694753293e-05, + "loss": 0.2272, + "num_input_tokens_seen": 27151088, + "step": 17770 + }, + { + "epoch": 54.86244204018547, + "grad_norm": 0.7688599228858948, + "learning_rate": 2.9348509209038766e-05, + "loss": 0.2923, + "num_input_tokens_seen": 27158448, + "step": 17775 + }, + { + "epoch": 54.87789799072643, + "grad_norm": 0.3948313891887665, + "learning_rate": 2.933884105272947e-05, + "loss": 0.3, + "num_input_tokens_seen": 27166384, + "step": 17780 + }, + { + "epoch": 54.893353941267385, + "grad_norm": 0.5325972437858582, + "learning_rate": 2.9329172227316366e-05, + "loss": 0.2607, + "num_input_tokens_seen": 27173968, + "step": 17785 + }, + { + "epoch": 54.908809891808346, + "grad_norm": 0.4389657974243164, + "learning_rate": 2.93195027342905e-05, + "loss": 0.3092, + "num_input_tokens_seen": 27181296, + "step": 17790 + }, + { + "epoch": 54.92426584234931, + "grad_norm": 0.33522874116897583, + "learning_rate": 2.9309832575143024e-05, + "loss": 0.2235, + "num_input_tokens_seen": 27189040, + "step": 17795 + }, + { + "epoch": 54.93972179289026, + "grad_norm": 0.6152422428131104, + "learning_rate": 2.930016175136521e-05, + "loss": 0.2113, + "num_input_tokens_seen": 27197008, + "step": 17800 + }, + { + "epoch": 54.93972179289026, + "eval_loss": 0.3138073682785034, + "eval_runtime": 6.3504, + "eval_samples_per_second": 90.545, + "eval_steps_per_second": 22.676, + "num_input_tokens_seen": 27197008, + "step": 17800 + }, + { + "epoch": 54.95517774343122, + "grad_norm": 0.5384479761123657, + "learning_rate": 2.9290490264448412e-05, + "loss": 0.2688, + "num_input_tokens_seen": 27204816, + "step": 17805 + }, + { + "epoch": 54.97063369397218, + "grad_norm": 0.7600142359733582, + "learning_rate": 2.9280818115884094e-05, + "loss": 0.2921, + "num_input_tokens_seen": 27212592, + "step": 17810 + }, + { + "epoch": 54.986089644513136, + "grad_norm": 0.4528822600841522, + "learning_rate": 2.9271145307163828e-05, + "loss": 0.3007, + "num_input_tokens_seen": 27220176, + "step": 17815 + }, + { + "epoch": 55.0, + "grad_norm": 0.866976261138916, + "learning_rate": 2.9261471839779287e-05, + "loss": 0.3289, + "num_input_tokens_seen": 27226720, + "step": 17820 + }, + { + "epoch": 55.01545595054096, + "grad_norm": 0.38685500621795654, + "learning_rate": 2.925179771522223e-05, + "loss": 0.2016, + "num_input_tokens_seen": 27234240, + "step": 17825 + }, + { + "epoch": 55.030911901081915, + "grad_norm": 0.9111426472663879, + "learning_rate": 2.9242122934984535e-05, + "loss": 0.2596, + "num_input_tokens_seen": 27241728, + "step": 17830 + }, + { + "epoch": 55.046367851622875, + "grad_norm": 0.5269106030464172, + "learning_rate": 2.9232447500558176e-05, + "loss": 0.3022, + "num_input_tokens_seen": 27249600, + "step": 17835 + }, + { + "epoch": 55.061823802163836, + "grad_norm": 0.40986746549606323, + "learning_rate": 2.9222771413435225e-05, + "loss": 0.2471, + "num_input_tokens_seen": 27257024, + "step": 17840 + }, + { + "epoch": 55.07727975270479, + "grad_norm": 0.5866854190826416, + "learning_rate": 2.9213094675107848e-05, + "loss": 0.2775, + "num_input_tokens_seen": 27264160, + "step": 17845 + }, + { + "epoch": 55.09273570324575, + "grad_norm": 0.40081432461738586, + "learning_rate": 2.9203417287068335e-05, + "loss": 0.2183, + "num_input_tokens_seen": 27272096, + "step": 17850 + }, + { + "epoch": 55.108191653786704, + "grad_norm": 0.47491881251335144, + "learning_rate": 2.9193739250809042e-05, + "loss": 0.3164, + "num_input_tokens_seen": 27279648, + "step": 17855 + }, + { + "epoch": 55.123647604327665, + "grad_norm": 0.502302348613739, + "learning_rate": 2.9184060567822463e-05, + "loss": 0.2661, + "num_input_tokens_seen": 27286848, + "step": 17860 + }, + { + "epoch": 55.139103554868626, + "grad_norm": 0.3036981523036957, + "learning_rate": 2.9174381239601166e-05, + "loss": 0.2469, + "num_input_tokens_seen": 27294496, + "step": 17865 + }, + { + "epoch": 55.15455950540958, + "grad_norm": 0.5739202499389648, + "learning_rate": 2.916470126763783e-05, + "loss": 0.2093, + "num_input_tokens_seen": 27301952, + "step": 17870 + }, + { + "epoch": 55.17001545595054, + "grad_norm": 0.43614253401756287, + "learning_rate": 2.9155020653425203e-05, + "loss": 0.2212, + "num_input_tokens_seen": 27309376, + "step": 17875 + }, + { + "epoch": 55.1854714064915, + "grad_norm": 0.42214643955230713, + "learning_rate": 2.9145339398456184e-05, + "loss": 0.2391, + "num_input_tokens_seen": 27317248, + "step": 17880 + }, + { + "epoch": 55.200927357032455, + "grad_norm": 0.777840256690979, + "learning_rate": 2.913565750422374e-05, + "loss": 0.3301, + "num_input_tokens_seen": 27324896, + "step": 17885 + }, + { + "epoch": 55.216383307573416, + "grad_norm": 0.6917327046394348, + "learning_rate": 2.9125974972220938e-05, + "loss": 0.2467, + "num_input_tokens_seen": 27332480, + "step": 17890 + }, + { + "epoch": 55.23183925811438, + "grad_norm": 0.5825452208518982, + "learning_rate": 2.9116291803940932e-05, + "loss": 0.2373, + "num_input_tokens_seen": 27340416, + "step": 17895 + }, + { + "epoch": 55.24729520865533, + "grad_norm": 0.7261101603507996, + "learning_rate": 2.910660800087701e-05, + "loss": 0.2042, + "num_input_tokens_seen": 27348416, + "step": 17900 + }, + { + "epoch": 55.26275115919629, + "grad_norm": 0.8874297738075256, + "learning_rate": 2.909692356452254e-05, + "loss": 0.3552, + "num_input_tokens_seen": 27355872, + "step": 17905 + }, + { + "epoch": 55.27820710973725, + "grad_norm": 0.6218092441558838, + "learning_rate": 2.9087238496370962e-05, + "loss": 0.2583, + "num_input_tokens_seen": 27362976, + "step": 17910 + }, + { + "epoch": 55.293663060278206, + "grad_norm": 0.5869262218475342, + "learning_rate": 2.907755279791583e-05, + "loss": 0.2504, + "num_input_tokens_seen": 27369984, + "step": 17915 + }, + { + "epoch": 55.30911901081917, + "grad_norm": 0.5123754143714905, + "learning_rate": 2.906786647065083e-05, + "loss": 0.3094, + "num_input_tokens_seen": 27377568, + "step": 17920 + }, + { + "epoch": 55.32457496136012, + "grad_norm": 0.3510196805000305, + "learning_rate": 2.9058179516069695e-05, + "loss": 0.2171, + "num_input_tokens_seen": 27385248, + "step": 17925 + }, + { + "epoch": 55.34003091190108, + "grad_norm": 0.3690684139728546, + "learning_rate": 2.9048491935666282e-05, + "loss": 0.295, + "num_input_tokens_seen": 27393376, + "step": 17930 + }, + { + "epoch": 55.35548686244204, + "grad_norm": 0.9847258925437927, + "learning_rate": 2.9038803730934534e-05, + "loss": 0.2506, + "num_input_tokens_seen": 27401024, + "step": 17935 + }, + { + "epoch": 55.370942812982996, + "grad_norm": 0.4634341299533844, + "learning_rate": 2.9029114903368503e-05, + "loss": 0.2232, + "num_input_tokens_seen": 27408640, + "step": 17940 + }, + { + "epoch": 55.38639876352396, + "grad_norm": 0.42162612080574036, + "learning_rate": 2.9019425454462318e-05, + "loss": 0.242, + "num_input_tokens_seen": 27415872, + "step": 17945 + }, + { + "epoch": 55.40185471406492, + "grad_norm": 0.5391274690628052, + "learning_rate": 2.9009735385710212e-05, + "loss": 0.2266, + "num_input_tokens_seen": 27423712, + "step": 17950 + }, + { + "epoch": 55.41731066460587, + "grad_norm": 0.479792982339859, + "learning_rate": 2.900004469860652e-05, + "loss": 0.2033, + "num_input_tokens_seen": 27431136, + "step": 17955 + }, + { + "epoch": 55.43276661514683, + "grad_norm": 0.45719096064567566, + "learning_rate": 2.8990353394645668e-05, + "loss": 0.2504, + "num_input_tokens_seen": 27438560, + "step": 17960 + }, + { + "epoch": 55.44822256568779, + "grad_norm": 0.6904475688934326, + "learning_rate": 2.8980661475322186e-05, + "loss": 0.3278, + "num_input_tokens_seen": 27446432, + "step": 17965 + }, + { + "epoch": 55.46367851622875, + "grad_norm": 0.45930564403533936, + "learning_rate": 2.897096894213067e-05, + "loss": 0.236, + "num_input_tokens_seen": 27454048, + "step": 17970 + }, + { + "epoch": 55.47913446676971, + "grad_norm": 0.425035685300827, + "learning_rate": 2.8961275796565845e-05, + "loss": 0.2979, + "num_input_tokens_seen": 27461408, + "step": 17975 + }, + { + "epoch": 55.49459041731066, + "grad_norm": 0.5010126829147339, + "learning_rate": 2.8951582040122517e-05, + "loss": 0.2806, + "num_input_tokens_seen": 27469184, + "step": 17980 + }, + { + "epoch": 55.51004636785162, + "grad_norm": 0.6399075388908386, + "learning_rate": 2.894188767429557e-05, + "loss": 0.2477, + "num_input_tokens_seen": 27477184, + "step": 17985 + }, + { + "epoch": 55.52550231839258, + "grad_norm": 1.0056524276733398, + "learning_rate": 2.8932192700580014e-05, + "loss": 0.2372, + "num_input_tokens_seen": 27485024, + "step": 17990 + }, + { + "epoch": 55.54095826893354, + "grad_norm": 0.3795710504055023, + "learning_rate": 2.8922497120470916e-05, + "loss": 0.2511, + "num_input_tokens_seen": 27492672, + "step": 17995 + }, + { + "epoch": 55.5564142194745, + "grad_norm": 0.6231082081794739, + "learning_rate": 2.891280093546348e-05, + "loss": 0.3108, + "num_input_tokens_seen": 27500160, + "step": 18000 + }, + { + "epoch": 55.5564142194745, + "eval_loss": 0.3130464553833008, + "eval_runtime": 6.2742, + "eval_samples_per_second": 91.644, + "eval_steps_per_second": 22.951, + "num_input_tokens_seen": 27500160, + "step": 18000 + }, + { + "epoch": 55.57187017001546, + "grad_norm": 0.540459930896759, + "learning_rate": 2.890310414705297e-05, + "loss": 0.2538, + "num_input_tokens_seen": 27507968, + "step": 18005 + }, + { + "epoch": 55.58732612055641, + "grad_norm": 0.41197532415390015, + "learning_rate": 2.8893406756734742e-05, + "loss": 0.2871, + "num_input_tokens_seen": 27516288, + "step": 18010 + }, + { + "epoch": 55.60278207109737, + "grad_norm": 0.5188208818435669, + "learning_rate": 2.888370876600427e-05, + "loss": 0.2987, + "num_input_tokens_seen": 27523616, + "step": 18015 + }, + { + "epoch": 55.618238021638334, + "grad_norm": 0.5052193403244019, + "learning_rate": 2.8874010176357104e-05, + "loss": 0.2408, + "num_input_tokens_seen": 27531520, + "step": 18020 + }, + { + "epoch": 55.63369397217929, + "grad_norm": 0.4987712502479553, + "learning_rate": 2.886431098928888e-05, + "loss": 0.2708, + "num_input_tokens_seen": 27538848, + "step": 18025 + }, + { + "epoch": 55.64914992272025, + "grad_norm": 0.5695444345474243, + "learning_rate": 2.885461120629534e-05, + "loss": 0.2793, + "num_input_tokens_seen": 27546272, + "step": 18030 + }, + { + "epoch": 55.66460587326121, + "grad_norm": 0.6451480984687805, + "learning_rate": 2.8844910828872317e-05, + "loss": 0.3034, + "num_input_tokens_seen": 27553920, + "step": 18035 + }, + { + "epoch": 55.68006182380216, + "grad_norm": 0.5242071151733398, + "learning_rate": 2.8835209858515715e-05, + "loss": 0.2556, + "num_input_tokens_seen": 27561216, + "step": 18040 + }, + { + "epoch": 55.695517774343124, + "grad_norm": 0.5245334506034851, + "learning_rate": 2.8825508296721566e-05, + "loss": 0.2507, + "num_input_tokens_seen": 27568704, + "step": 18045 + }, + { + "epoch": 55.71097372488408, + "grad_norm": 0.3593696355819702, + "learning_rate": 2.881580614498596e-05, + "loss": 0.2312, + "num_input_tokens_seen": 27576192, + "step": 18050 + }, + { + "epoch": 55.72642967542504, + "grad_norm": 0.3575533628463745, + "learning_rate": 2.8806103404805103e-05, + "loss": 0.2446, + "num_input_tokens_seen": 27584160, + "step": 18055 + }, + { + "epoch": 55.741885625966, + "grad_norm": 0.5246678590774536, + "learning_rate": 2.8796400077675257e-05, + "loss": 0.2504, + "num_input_tokens_seen": 27592000, + "step": 18060 + }, + { + "epoch": 55.75734157650695, + "grad_norm": 0.5394699573516846, + "learning_rate": 2.8786696165092812e-05, + "loss": 0.304, + "num_input_tokens_seen": 27599776, + "step": 18065 + }, + { + "epoch": 55.77279752704791, + "grad_norm": 0.8271961808204651, + "learning_rate": 2.8776991668554236e-05, + "loss": 0.3431, + "num_input_tokens_seen": 27607584, + "step": 18070 + }, + { + "epoch": 55.788253477588874, + "grad_norm": 0.5385125279426575, + "learning_rate": 2.876728658955608e-05, + "loss": 0.2626, + "num_input_tokens_seen": 27614784, + "step": 18075 + }, + { + "epoch": 55.80370942812983, + "grad_norm": 1.6975570917129517, + "learning_rate": 2.8757580929594986e-05, + "loss": 0.2769, + "num_input_tokens_seen": 27622272, + "step": 18080 + }, + { + "epoch": 55.81916537867079, + "grad_norm": 0.3898678421974182, + "learning_rate": 2.87478746901677e-05, + "loss": 0.2586, + "num_input_tokens_seen": 27630208, + "step": 18085 + }, + { + "epoch": 55.83462132921175, + "grad_norm": 0.4767809212207794, + "learning_rate": 2.873816787277103e-05, + "loss": 0.2496, + "num_input_tokens_seen": 27637824, + "step": 18090 + }, + { + "epoch": 55.8500772797527, + "grad_norm": 0.3978579342365265, + "learning_rate": 2.8728460478901903e-05, + "loss": 0.2614, + "num_input_tokens_seen": 27645504, + "step": 18095 + }, + { + "epoch": 55.865533230293664, + "grad_norm": 0.5896540880203247, + "learning_rate": 2.8718752510057307e-05, + "loss": 0.3157, + "num_input_tokens_seen": 27652768, + "step": 18100 + }, + { + "epoch": 55.88098918083462, + "grad_norm": 0.9459717273712158, + "learning_rate": 2.870904396773435e-05, + "loss": 0.214, + "num_input_tokens_seen": 27660064, + "step": 18105 + }, + { + "epoch": 55.89644513137558, + "grad_norm": 0.2965084910392761, + "learning_rate": 2.86993348534302e-05, + "loss": 0.1813, + "num_input_tokens_seen": 27668256, + "step": 18110 + }, + { + "epoch": 55.91190108191654, + "grad_norm": 0.6338528990745544, + "learning_rate": 2.868962516864212e-05, + "loss": 0.2317, + "num_input_tokens_seen": 27676032, + "step": 18115 + }, + { + "epoch": 55.92735703245749, + "grad_norm": 0.4228910207748413, + "learning_rate": 2.8679914914867477e-05, + "loss": 0.2435, + "num_input_tokens_seen": 27683744, + "step": 18120 + }, + { + "epoch": 55.942812982998454, + "grad_norm": 0.4894598722457886, + "learning_rate": 2.8670204093603713e-05, + "loss": 0.2095, + "num_input_tokens_seen": 27691200, + "step": 18125 + }, + { + "epoch": 55.958268933539415, + "grad_norm": 0.5336100459098816, + "learning_rate": 2.8660492706348357e-05, + "loss": 0.2402, + "num_input_tokens_seen": 27698976, + "step": 18130 + }, + { + "epoch": 55.97372488408037, + "grad_norm": 0.4875941872596741, + "learning_rate": 2.8650780754599022e-05, + "loss": 0.2155, + "num_input_tokens_seen": 27706656, + "step": 18135 + }, + { + "epoch": 55.98918083462133, + "grad_norm": 0.6596088409423828, + "learning_rate": 2.8641068239853407e-05, + "loss": 0.2444, + "num_input_tokens_seen": 27714240, + "step": 18140 + }, + { + "epoch": 56.003091190108194, + "grad_norm": 0.5138448476791382, + "learning_rate": 2.863135516360932e-05, + "loss": 0.2793, + "num_input_tokens_seen": 27721008, + "step": 18145 + }, + { + "epoch": 56.01854714064915, + "grad_norm": 0.42333775758743286, + "learning_rate": 2.8621641527364633e-05, + "loss": 0.2401, + "num_input_tokens_seen": 27728976, + "step": 18150 + }, + { + "epoch": 56.03400309119011, + "grad_norm": 0.43493202328681946, + "learning_rate": 2.8611927332617313e-05, + "loss": 0.2659, + "num_input_tokens_seen": 27736336, + "step": 18155 + }, + { + "epoch": 56.04945904173107, + "grad_norm": 0.5168222784996033, + "learning_rate": 2.8602212580865405e-05, + "loss": 0.2848, + "num_input_tokens_seen": 27743920, + "step": 18160 + }, + { + "epoch": 56.06491499227202, + "grad_norm": 0.4054471552371979, + "learning_rate": 2.859249727360705e-05, + "loss": 0.2435, + "num_input_tokens_seen": 27751632, + "step": 18165 + }, + { + "epoch": 56.08037094281298, + "grad_norm": 0.3206298351287842, + "learning_rate": 2.8582781412340465e-05, + "loss": 0.3047, + "num_input_tokens_seen": 27759120, + "step": 18170 + }, + { + "epoch": 56.095826893353944, + "grad_norm": 0.5753397345542908, + "learning_rate": 2.857306499856397e-05, + "loss": 0.272, + "num_input_tokens_seen": 27766928, + "step": 18175 + }, + { + "epoch": 56.1112828438949, + "grad_norm": 0.9183152914047241, + "learning_rate": 2.856334803377594e-05, + "loss": 0.3575, + "num_input_tokens_seen": 27774544, + "step": 18180 + }, + { + "epoch": 56.12673879443586, + "grad_norm": 0.6963825225830078, + "learning_rate": 2.8553630519474867e-05, + "loss": 0.2208, + "num_input_tokens_seen": 27782096, + "step": 18185 + }, + { + "epoch": 56.14219474497681, + "grad_norm": 0.365804523229599, + "learning_rate": 2.8543912457159317e-05, + "loss": 0.2719, + "num_input_tokens_seen": 27790096, + "step": 18190 + }, + { + "epoch": 56.15765069551777, + "grad_norm": 0.4793349504470825, + "learning_rate": 2.853419384832792e-05, + "loss": 0.2914, + "num_input_tokens_seen": 27798000, + "step": 18195 + }, + { + "epoch": 56.173106646058734, + "grad_norm": 0.40847092866897583, + "learning_rate": 2.8524474694479423e-05, + "loss": 0.2994, + "num_input_tokens_seen": 27805616, + "step": 18200 + }, + { + "epoch": 56.173106646058734, + "eval_loss": 0.31025317311286926, + "eval_runtime": 6.3026, + "eval_samples_per_second": 91.231, + "eval_steps_per_second": 22.848, + "num_input_tokens_seen": 27805616, + "step": 18200 + }, + { + "epoch": 56.18856259659969, + "grad_norm": 0.35625341534614563, + "learning_rate": 2.851475499711264e-05, + "loss": 0.2328, + "num_input_tokens_seen": 27813200, + "step": 18205 + }, + { + "epoch": 56.20401854714065, + "grad_norm": 0.4687705338001251, + "learning_rate": 2.8505034757726468e-05, + "loss": 0.2187, + "num_input_tokens_seen": 27821008, + "step": 18210 + }, + { + "epoch": 56.21947449768161, + "grad_norm": 0.5692394375801086, + "learning_rate": 2.8495313977819886e-05, + "loss": 0.2024, + "num_input_tokens_seen": 27828464, + "step": 18215 + }, + { + "epoch": 56.23493044822256, + "grad_norm": 0.5773736238479614, + "learning_rate": 2.8485592658891956e-05, + "loss": 0.2911, + "num_input_tokens_seen": 27836240, + "step": 18220 + }, + { + "epoch": 56.250386398763524, + "grad_norm": 0.5278369188308716, + "learning_rate": 2.8475870802441844e-05, + "loss": 0.1924, + "num_input_tokens_seen": 27843472, + "step": 18225 + }, + { + "epoch": 56.265842349304485, + "grad_norm": 0.3118089437484741, + "learning_rate": 2.8466148409968774e-05, + "loss": 0.289, + "num_input_tokens_seen": 27851344, + "step": 18230 + }, + { + "epoch": 56.28129829984544, + "grad_norm": 0.5889444351196289, + "learning_rate": 2.8456425482972067e-05, + "loss": 0.2637, + "num_input_tokens_seen": 27859792, + "step": 18235 + }, + { + "epoch": 56.2967542503864, + "grad_norm": 0.48136189579963684, + "learning_rate": 2.84467020229511e-05, + "loss": 0.2389, + "num_input_tokens_seen": 27867760, + "step": 18240 + }, + { + "epoch": 56.31221020092736, + "grad_norm": 0.32653480768203735, + "learning_rate": 2.8436978031405375e-05, + "loss": 0.2624, + "num_input_tokens_seen": 27875376, + "step": 18245 + }, + { + "epoch": 56.327666151468314, + "grad_norm": 0.6306765675544739, + "learning_rate": 2.842725350983445e-05, + "loss": 0.3007, + "num_input_tokens_seen": 27882576, + "step": 18250 + }, + { + "epoch": 56.343122102009275, + "grad_norm": 0.3083963990211487, + "learning_rate": 2.8417528459737957e-05, + "loss": 0.207, + "num_input_tokens_seen": 27889936, + "step": 18255 + }, + { + "epoch": 56.35857805255023, + "grad_norm": 0.4508289396762848, + "learning_rate": 2.8407802882615624e-05, + "loss": 0.2563, + "num_input_tokens_seen": 27897808, + "step": 18260 + }, + { + "epoch": 56.37403400309119, + "grad_norm": 0.36426857113838196, + "learning_rate": 2.8398076779967277e-05, + "loss": 0.3031, + "num_input_tokens_seen": 27905296, + "step": 18265 + }, + { + "epoch": 56.38948995363215, + "grad_norm": 0.6908025741577148, + "learning_rate": 2.8388350153292774e-05, + "loss": 0.2869, + "num_input_tokens_seen": 27913040, + "step": 18270 + }, + { + "epoch": 56.404945904173104, + "grad_norm": 0.5403527021408081, + "learning_rate": 2.8378623004092103e-05, + "loss": 0.3497, + "num_input_tokens_seen": 27920464, + "step": 18275 + }, + { + "epoch": 56.420401854714065, + "grad_norm": 0.6120466589927673, + "learning_rate": 2.8368895333865302e-05, + "loss": 0.2496, + "num_input_tokens_seen": 27928272, + "step": 18280 + }, + { + "epoch": 56.435857805255026, + "grad_norm": 0.3526031970977783, + "learning_rate": 2.835916714411251e-05, + "loss": 0.2384, + "num_input_tokens_seen": 27935824, + "step": 18285 + }, + { + "epoch": 56.45131375579598, + "grad_norm": 0.7502318024635315, + "learning_rate": 2.8349438436333926e-05, + "loss": 0.2255, + "num_input_tokens_seen": 27943344, + "step": 18290 + }, + { + "epoch": 56.46676970633694, + "grad_norm": 0.7657824754714966, + "learning_rate": 2.833970921202984e-05, + "loss": 0.2574, + "num_input_tokens_seen": 27950832, + "step": 18295 + }, + { + "epoch": 56.4822256568779, + "grad_norm": 0.6423833966255188, + "learning_rate": 2.8329979472700628e-05, + "loss": 0.2046, + "num_input_tokens_seen": 27958096, + "step": 18300 + }, + { + "epoch": 56.497681607418855, + "grad_norm": 0.6204164624214172, + "learning_rate": 2.832024921984674e-05, + "loss": 0.2913, + "num_input_tokens_seen": 27966000, + "step": 18305 + }, + { + "epoch": 56.513137557959816, + "grad_norm": 0.5659399628639221, + "learning_rate": 2.8310518454968693e-05, + "loss": 0.2398, + "num_input_tokens_seen": 27973168, + "step": 18310 + }, + { + "epoch": 56.52859350850077, + "grad_norm": 0.640693187713623, + "learning_rate": 2.8300787179567095e-05, + "loss": 0.2128, + "num_input_tokens_seen": 27980912, + "step": 18315 + }, + { + "epoch": 56.54404945904173, + "grad_norm": 0.3575538098812103, + "learning_rate": 2.8291055395142636e-05, + "loss": 0.2281, + "num_input_tokens_seen": 27988304, + "step": 18320 + }, + { + "epoch": 56.55950540958269, + "grad_norm": 0.5761309862136841, + "learning_rate": 2.8281323103196073e-05, + "loss": 0.2591, + "num_input_tokens_seen": 27996048, + "step": 18325 + }, + { + "epoch": 56.574961360123645, + "grad_norm": 0.6152256727218628, + "learning_rate": 2.8271590305228256e-05, + "loss": 0.2855, + "num_input_tokens_seen": 28004080, + "step": 18330 + }, + { + "epoch": 56.590417310664606, + "grad_norm": 0.6570716500282288, + "learning_rate": 2.82618570027401e-05, + "loss": 0.2621, + "num_input_tokens_seen": 28011312, + "step": 18335 + }, + { + "epoch": 56.605873261205566, + "grad_norm": 0.536148726940155, + "learning_rate": 2.8252123197232604e-05, + "loss": 0.2344, + "num_input_tokens_seen": 28018608, + "step": 18340 + }, + { + "epoch": 56.62132921174652, + "grad_norm": 0.4584887623786926, + "learning_rate": 2.8242388890206843e-05, + "loss": 0.2454, + "num_input_tokens_seen": 28026704, + "step": 18345 + }, + { + "epoch": 56.63678516228748, + "grad_norm": 0.48298829793930054, + "learning_rate": 2.8232654083163967e-05, + "loss": 0.2429, + "num_input_tokens_seen": 28034448, + "step": 18350 + }, + { + "epoch": 56.65224111282844, + "grad_norm": 0.3717045485973358, + "learning_rate": 2.822291877760521e-05, + "loss": 0.2755, + "num_input_tokens_seen": 28042160, + "step": 18355 + }, + { + "epoch": 56.667697063369395, + "grad_norm": 0.40757524967193604, + "learning_rate": 2.8213182975031864e-05, + "loss": 0.2613, + "num_input_tokens_seen": 28049936, + "step": 18360 + }, + { + "epoch": 56.683153013910356, + "grad_norm": 0.3828763961791992, + "learning_rate": 2.8203446676945337e-05, + "loss": 0.2753, + "num_input_tokens_seen": 28057904, + "step": 18365 + }, + { + "epoch": 56.69860896445132, + "grad_norm": 0.3930385112762451, + "learning_rate": 2.8193709884847075e-05, + "loss": 0.197, + "num_input_tokens_seen": 28065520, + "step": 18370 + }, + { + "epoch": 56.71406491499227, + "grad_norm": 0.6058446168899536, + "learning_rate": 2.8183972600238605e-05, + "loss": 0.2778, + "num_input_tokens_seen": 28073680, + "step": 18375 + }, + { + "epoch": 56.72952086553323, + "grad_norm": 0.5795854926109314, + "learning_rate": 2.817423482462156e-05, + "loss": 0.2369, + "num_input_tokens_seen": 28081520, + "step": 18380 + }, + { + "epoch": 56.744976816074185, + "grad_norm": 0.5206186175346375, + "learning_rate": 2.8164496559497605e-05, + "loss": 0.2487, + "num_input_tokens_seen": 28088496, + "step": 18385 + }, + { + "epoch": 56.760432766615146, + "grad_norm": 0.6470784544944763, + "learning_rate": 2.815475780636852e-05, + "loss": 0.2224, + "num_input_tokens_seen": 28096688, + "step": 18390 + }, + { + "epoch": 56.77588871715611, + "grad_norm": 0.39347830414772034, + "learning_rate": 2.814501856673613e-05, + "loss": 0.2413, + "num_input_tokens_seen": 28104368, + "step": 18395 + }, + { + "epoch": 56.79134466769706, + "grad_norm": 0.43400198221206665, + "learning_rate": 2.8135278842102353e-05, + "loss": 0.317, + "num_input_tokens_seen": 28112336, + "step": 18400 + }, + { + "epoch": 56.79134466769706, + "eval_loss": 0.31230923533439636, + "eval_runtime": 6.3057, + "eval_samples_per_second": 91.187, + "eval_steps_per_second": 22.836, + "num_input_tokens_seen": 28112336, + "step": 18400 + }, + { + "epoch": 56.80680061823802, + "grad_norm": 0.7371638417243958, + "learning_rate": 2.8125538633969183e-05, + "loss": 0.2744, + "num_input_tokens_seen": 28119856, + "step": 18405 + }, + { + "epoch": 56.82225656877898, + "grad_norm": 0.3547092378139496, + "learning_rate": 2.8115797943838677e-05, + "loss": 0.2457, + "num_input_tokens_seen": 28127952, + "step": 18410 + }, + { + "epoch": 56.837712519319936, + "grad_norm": 0.5143808722496033, + "learning_rate": 2.810605677321298e-05, + "loss": 0.1988, + "num_input_tokens_seen": 28135632, + "step": 18415 + }, + { + "epoch": 56.8531684698609, + "grad_norm": 0.3766460716724396, + "learning_rate": 2.809631512359428e-05, + "loss": 0.2263, + "num_input_tokens_seen": 28143120, + "step": 18420 + }, + { + "epoch": 56.86862442040186, + "grad_norm": 0.5016581416130066, + "learning_rate": 2.8086572996484884e-05, + "loss": 0.2322, + "num_input_tokens_seen": 28151312, + "step": 18425 + }, + { + "epoch": 56.88408037094281, + "grad_norm": 0.3595437705516815, + "learning_rate": 2.8076830393387143e-05, + "loss": 0.2774, + "num_input_tokens_seen": 28159056, + "step": 18430 + }, + { + "epoch": 56.89953632148377, + "grad_norm": 0.4086596965789795, + "learning_rate": 2.8067087315803497e-05, + "loss": 0.2644, + "num_input_tokens_seen": 28166320, + "step": 18435 + }, + { + "epoch": 56.914992272024726, + "grad_norm": 0.4378129541873932, + "learning_rate": 2.8057343765236433e-05, + "loss": 0.2143, + "num_input_tokens_seen": 28173776, + "step": 18440 + }, + { + "epoch": 56.93044822256569, + "grad_norm": 0.6545950770378113, + "learning_rate": 2.804759974318854e-05, + "loss": 0.3219, + "num_input_tokens_seen": 28181584, + "step": 18445 + }, + { + "epoch": 56.94590417310665, + "grad_norm": 0.4307551681995392, + "learning_rate": 2.8037855251162482e-05, + "loss": 0.2566, + "num_input_tokens_seen": 28189680, + "step": 18450 + }, + { + "epoch": 56.9613601236476, + "grad_norm": 0.40611889958381653, + "learning_rate": 2.802811029066096e-05, + "loss": 0.2838, + "num_input_tokens_seen": 28197040, + "step": 18455 + }, + { + "epoch": 56.97681607418856, + "grad_norm": 0.6621159315109253, + "learning_rate": 2.8018364863186764e-05, + "loss": 0.1961, + "num_input_tokens_seen": 28204560, + "step": 18460 + }, + { + "epoch": 56.99227202472952, + "grad_norm": 0.7132711410522461, + "learning_rate": 2.800861897024279e-05, + "loss": 0.3163, + "num_input_tokens_seen": 28211760, + "step": 18465 + }, + { + "epoch": 57.00618238021638, + "grad_norm": 0.6290728449821472, + "learning_rate": 2.799887261333196e-05, + "loss": 0.3584, + "num_input_tokens_seen": 28218704, + "step": 18470 + }, + { + "epoch": 57.02163833075734, + "grad_norm": 0.3603161573410034, + "learning_rate": 2.798912579395728e-05, + "loss": 0.259, + "num_input_tokens_seen": 28225968, + "step": 18475 + }, + { + "epoch": 57.0370942812983, + "grad_norm": 0.43437719345092773, + "learning_rate": 2.797937851362185e-05, + "loss": 0.2987, + "num_input_tokens_seen": 28234000, + "step": 18480 + }, + { + "epoch": 57.052550231839255, + "grad_norm": 0.38359859585762024, + "learning_rate": 2.7969630773828802e-05, + "loss": 0.2411, + "num_input_tokens_seen": 28241744, + "step": 18485 + }, + { + "epoch": 57.068006182380216, + "grad_norm": 0.6145579218864441, + "learning_rate": 2.7959882576081382e-05, + "loss": 0.2816, + "num_input_tokens_seen": 28250000, + "step": 18490 + }, + { + "epoch": 57.08346213292118, + "grad_norm": 0.31740012764930725, + "learning_rate": 2.795013392188286e-05, + "loss": 0.2155, + "num_input_tokens_seen": 28258640, + "step": 18495 + }, + { + "epoch": 57.09891808346213, + "grad_norm": 1.1704787015914917, + "learning_rate": 2.7940384812736614e-05, + "loss": 0.2719, + "num_input_tokens_seen": 28266160, + "step": 18500 + }, + { + "epoch": 57.11437403400309, + "grad_norm": 0.41550031304359436, + "learning_rate": 2.7930635250146087e-05, + "loss": 0.2271, + "num_input_tokens_seen": 28273328, + "step": 18505 + }, + { + "epoch": 57.12982998454405, + "grad_norm": 0.5305392742156982, + "learning_rate": 2.792088523561477e-05, + "loss": 0.2261, + "num_input_tokens_seen": 28280752, + "step": 18510 + }, + { + "epoch": 57.145285935085006, + "grad_norm": 0.4676353931427002, + "learning_rate": 2.7911134770646246e-05, + "loss": 0.2317, + "num_input_tokens_seen": 28288016, + "step": 18515 + }, + { + "epoch": 57.16074188562597, + "grad_norm": 0.683792233467102, + "learning_rate": 2.7901383856744157e-05, + "loss": 0.3108, + "num_input_tokens_seen": 28295632, + "step": 18520 + }, + { + "epoch": 57.17619783616692, + "grad_norm": 0.635486900806427, + "learning_rate": 2.7891632495412217e-05, + "loss": 0.2091, + "num_input_tokens_seen": 28302704, + "step": 18525 + }, + { + "epoch": 57.19165378670788, + "grad_norm": 0.7183573842048645, + "learning_rate": 2.7881880688154205e-05, + "loss": 0.2358, + "num_input_tokens_seen": 28310416, + "step": 18530 + }, + { + "epoch": 57.20710973724884, + "grad_norm": 0.4516482353210449, + "learning_rate": 2.7872128436473977e-05, + "loss": 0.263, + "num_input_tokens_seen": 28318480, + "step": 18535 + }, + { + "epoch": 57.222565687789796, + "grad_norm": 0.6394111514091492, + "learning_rate": 2.7862375741875448e-05, + "loss": 0.2411, + "num_input_tokens_seen": 28326864, + "step": 18540 + }, + { + "epoch": 57.23802163833076, + "grad_norm": 0.5547207593917847, + "learning_rate": 2.785262260586261e-05, + "loss": 0.308, + "num_input_tokens_seen": 28335056, + "step": 18545 + }, + { + "epoch": 57.25347758887172, + "grad_norm": 0.34453123807907104, + "learning_rate": 2.7842869029939517e-05, + "loss": 0.2619, + "num_input_tokens_seen": 28342704, + "step": 18550 + }, + { + "epoch": 57.26893353941267, + "grad_norm": 1.2247164249420166, + "learning_rate": 2.7833115015610296e-05, + "loss": 0.2459, + "num_input_tokens_seen": 28350192, + "step": 18555 + }, + { + "epoch": 57.28438948995363, + "grad_norm": 0.4615039825439453, + "learning_rate": 2.7823360564379136e-05, + "loss": 0.316, + "num_input_tokens_seen": 28358000, + "step": 18560 + }, + { + "epoch": 57.29984544049459, + "grad_norm": 0.7598435282707214, + "learning_rate": 2.7813605677750297e-05, + "loss": 0.2867, + "num_input_tokens_seen": 28366256, + "step": 18565 + }, + { + "epoch": 57.31530139103555, + "grad_norm": 0.35858333110809326, + "learning_rate": 2.7803850357228102e-05, + "loss": 0.2554, + "num_input_tokens_seen": 28373968, + "step": 18570 + }, + { + "epoch": 57.33075734157651, + "grad_norm": 0.41979241371154785, + "learning_rate": 2.779409460431695e-05, + "loss": 0.2422, + "num_input_tokens_seen": 28381296, + "step": 18575 + }, + { + "epoch": 57.34621329211747, + "grad_norm": 0.5138543248176575, + "learning_rate": 2.778433842052129e-05, + "loss": 0.2768, + "num_input_tokens_seen": 28388432, + "step": 18580 + }, + { + "epoch": 57.36166924265842, + "grad_norm": 0.5200363397598267, + "learning_rate": 2.7774581807345664e-05, + "loss": 0.2463, + "num_input_tokens_seen": 28396560, + "step": 18585 + }, + { + "epoch": 57.37712519319938, + "grad_norm": 0.4776303172111511, + "learning_rate": 2.776482476629465e-05, + "loss": 0.2311, + "num_input_tokens_seen": 28404368, + "step": 18590 + }, + { + "epoch": 57.39258114374034, + "grad_norm": 0.3699359893798828, + "learning_rate": 2.7755067298872924e-05, + "loss": 0.237, + "num_input_tokens_seen": 28412304, + "step": 18595 + }, + { + "epoch": 57.4080370942813, + "grad_norm": 0.5341569781303406, + "learning_rate": 2.774530940658518e-05, + "loss": 0.2248, + "num_input_tokens_seen": 28419888, + "step": 18600 + }, + { + "epoch": 57.4080370942813, + "eval_loss": 0.31159210205078125, + "eval_runtime": 6.3178, + "eval_samples_per_second": 91.013, + "eval_steps_per_second": 22.793, + "num_input_tokens_seen": 28419888, + "step": 18600 + }, + { + "epoch": 57.42349304482226, + "grad_norm": 0.4835396111011505, + "learning_rate": 2.7735551090936236e-05, + "loss": 0.262, + "num_input_tokens_seen": 28427504, + "step": 18605 + }, + { + "epoch": 57.43894899536321, + "grad_norm": 0.6006073951721191, + "learning_rate": 2.7725792353430934e-05, + "loss": 0.2233, + "num_input_tokens_seen": 28434608, + "step": 18610 + }, + { + "epoch": 57.45440494590417, + "grad_norm": 0.4869166910648346, + "learning_rate": 2.77160331955742e-05, + "loss": 0.2686, + "num_input_tokens_seen": 28442160, + "step": 18615 + }, + { + "epoch": 57.469860896445134, + "grad_norm": 0.44707250595092773, + "learning_rate": 2.7706273618871008e-05, + "loss": 0.2106, + "num_input_tokens_seen": 28449872, + "step": 18620 + }, + { + "epoch": 57.48531684698609, + "grad_norm": 0.368731826543808, + "learning_rate": 2.769651362482642e-05, + "loss": 0.238, + "num_input_tokens_seen": 28457264, + "step": 18625 + }, + { + "epoch": 57.50077279752705, + "grad_norm": 0.35751771926879883, + "learning_rate": 2.768675321494555e-05, + "loss": 0.227, + "num_input_tokens_seen": 28465264, + "step": 18630 + }, + { + "epoch": 57.51622874806801, + "grad_norm": 0.46474388241767883, + "learning_rate": 2.7676992390733565e-05, + "loss": 0.2919, + "num_input_tokens_seen": 28473136, + "step": 18635 + }, + { + "epoch": 57.53168469860896, + "grad_norm": 0.7636638283729553, + "learning_rate": 2.766723115369571e-05, + "loss": 0.2975, + "num_input_tokens_seen": 28480688, + "step": 18640 + }, + { + "epoch": 57.547140649149924, + "grad_norm": 0.5495980978012085, + "learning_rate": 2.765746950533729e-05, + "loss": 0.2985, + "num_input_tokens_seen": 28488272, + "step": 18645 + }, + { + "epoch": 57.56259659969088, + "grad_norm": 0.4795461595058441, + "learning_rate": 2.7647707447163684e-05, + "loss": 0.2519, + "num_input_tokens_seen": 28495280, + "step": 18650 + }, + { + "epoch": 57.57805255023184, + "grad_norm": 0.6262984275817871, + "learning_rate": 2.7637944980680315e-05, + "loss": 0.2455, + "num_input_tokens_seen": 28502960, + "step": 18655 + }, + { + "epoch": 57.5935085007728, + "grad_norm": 0.4030863642692566, + "learning_rate": 2.762818210739268e-05, + "loss": 0.2992, + "num_input_tokens_seen": 28510416, + "step": 18660 + }, + { + "epoch": 57.60896445131375, + "grad_norm": 0.627160906791687, + "learning_rate": 2.7618418828806332e-05, + "loss": 0.2424, + "num_input_tokens_seen": 28518384, + "step": 18665 + }, + { + "epoch": 57.624420401854714, + "grad_norm": 0.8829951882362366, + "learning_rate": 2.76086551464269e-05, + "loss": 0.303, + "num_input_tokens_seen": 28525840, + "step": 18670 + }, + { + "epoch": 57.639876352395675, + "grad_norm": 0.49173370003700256, + "learning_rate": 2.759889106176006e-05, + "loss": 0.2939, + "num_input_tokens_seen": 28533648, + "step": 18675 + }, + { + "epoch": 57.65533230293663, + "grad_norm": 0.48604312539100647, + "learning_rate": 2.758912657631156e-05, + "loss": 0.2174, + "num_input_tokens_seen": 28541104, + "step": 18680 + }, + { + "epoch": 57.67078825347759, + "grad_norm": 0.9200245141983032, + "learning_rate": 2.7579361691587198e-05, + "loss": 0.2474, + "num_input_tokens_seen": 28548880, + "step": 18685 + }, + { + "epoch": 57.68624420401855, + "grad_norm": 0.6062650680541992, + "learning_rate": 2.756959640909285e-05, + "loss": 0.2425, + "num_input_tokens_seen": 28556560, + "step": 18690 + }, + { + "epoch": 57.701700154559504, + "grad_norm": 0.7808263897895813, + "learning_rate": 2.7559830730334452e-05, + "loss": 0.2245, + "num_input_tokens_seen": 28564176, + "step": 18695 + }, + { + "epoch": 57.717156105100464, + "grad_norm": 0.6969781517982483, + "learning_rate": 2.7550064656817988e-05, + "loss": 0.2234, + "num_input_tokens_seen": 28571600, + "step": 18700 + }, + { + "epoch": 57.732612055641425, + "grad_norm": 0.6800720691680908, + "learning_rate": 2.7540298190049503e-05, + "loss": 0.2203, + "num_input_tokens_seen": 28579760, + "step": 18705 + }, + { + "epoch": 57.74806800618238, + "grad_norm": 0.6230520009994507, + "learning_rate": 2.7530531331535107e-05, + "loss": 0.2651, + "num_input_tokens_seen": 28587248, + "step": 18710 + }, + { + "epoch": 57.76352395672334, + "grad_norm": 0.6472727656364441, + "learning_rate": 2.752076408278099e-05, + "loss": 0.2367, + "num_input_tokens_seen": 28594960, + "step": 18715 + }, + { + "epoch": 57.778979907264294, + "grad_norm": 0.5586429834365845, + "learning_rate": 2.751099644529337e-05, + "loss": 0.2732, + "num_input_tokens_seen": 28602672, + "step": 18720 + }, + { + "epoch": 57.794435857805254, + "grad_norm": 0.5392041206359863, + "learning_rate": 2.7501228420578533e-05, + "loss": 0.2947, + "num_input_tokens_seen": 28610192, + "step": 18725 + }, + { + "epoch": 57.809891808346215, + "grad_norm": 0.3165722191333771, + "learning_rate": 2.7491460010142857e-05, + "loss": 0.2812, + "num_input_tokens_seen": 28617744, + "step": 18730 + }, + { + "epoch": 57.82534775888717, + "grad_norm": 0.5770792365074158, + "learning_rate": 2.7481691215492727e-05, + "loss": 0.2448, + "num_input_tokens_seen": 28625264, + "step": 18735 + }, + { + "epoch": 57.84080370942813, + "grad_norm": 0.6739878058433533, + "learning_rate": 2.747192203813463e-05, + "loss": 0.2264, + "num_input_tokens_seen": 28632336, + "step": 18740 + }, + { + "epoch": 57.85625965996909, + "grad_norm": 0.4900580644607544, + "learning_rate": 2.7462152479575087e-05, + "loss": 0.2685, + "num_input_tokens_seen": 28639760, + "step": 18745 + }, + { + "epoch": 57.871715610510044, + "grad_norm": 0.49300214648246765, + "learning_rate": 2.7452382541320697e-05, + "loss": 0.2485, + "num_input_tokens_seen": 28647504, + "step": 18750 + }, + { + "epoch": 57.887171561051005, + "grad_norm": 0.9041780233383179, + "learning_rate": 2.7442612224878096e-05, + "loss": 0.3833, + "num_input_tokens_seen": 28654640, + "step": 18755 + }, + { + "epoch": 57.902627511591966, + "grad_norm": 0.8179636597633362, + "learning_rate": 2.7432841531753994e-05, + "loss": 0.2082, + "num_input_tokens_seen": 28662448, + "step": 18760 + }, + { + "epoch": 57.91808346213292, + "grad_norm": 0.6178157329559326, + "learning_rate": 2.7423070463455147e-05, + "loss": 0.2575, + "num_input_tokens_seen": 28669744, + "step": 18765 + }, + { + "epoch": 57.93353941267388, + "grad_norm": 0.621040940284729, + "learning_rate": 2.7413299021488397e-05, + "loss": 0.2628, + "num_input_tokens_seen": 28677968, + "step": 18770 + }, + { + "epoch": 57.948995363214834, + "grad_norm": 0.5201172828674316, + "learning_rate": 2.7403527207360615e-05, + "loss": 0.3008, + "num_input_tokens_seen": 28686640, + "step": 18775 + }, + { + "epoch": 57.964451313755795, + "grad_norm": 0.9973020553588867, + "learning_rate": 2.7393755022578722e-05, + "loss": 0.2953, + "num_input_tokens_seen": 28694704, + "step": 18780 + }, + { + "epoch": 57.979907264296756, + "grad_norm": 0.6131154298782349, + "learning_rate": 2.7383982468649714e-05, + "loss": 0.2427, + "num_input_tokens_seen": 28702320, + "step": 18785 + }, + { + "epoch": 57.99536321483771, + "grad_norm": 0.7871760725975037, + "learning_rate": 2.7374209547080665e-05, + "loss": 0.235, + "num_input_tokens_seen": 28709360, + "step": 18790 + }, + { + "epoch": 58.009273570324574, + "grad_norm": 0.5990124940872192, + "learning_rate": 2.7364436259378663e-05, + "loss": 0.2407, + "num_input_tokens_seen": 28716416, + "step": 18795 + }, + { + "epoch": 58.024729520865534, + "grad_norm": 0.7015490531921387, + "learning_rate": 2.735466260705088e-05, + "loss": 0.2551, + "num_input_tokens_seen": 28724096, + "step": 18800 + }, + { + "epoch": 58.024729520865534, + "eval_loss": 0.31153854727745056, + "eval_runtime": 6.268, + "eval_samples_per_second": 91.736, + "eval_steps_per_second": 22.974, + "num_input_tokens_seen": 28724096, + "step": 18800 + }, + { + "epoch": 58.04018547140649, + "grad_norm": 0.448857843875885, + "learning_rate": 2.7344888591604524e-05, + "loss": 0.2325, + "num_input_tokens_seen": 28731840, + "step": 18805 + }, + { + "epoch": 58.05564142194745, + "grad_norm": 0.4318247139453888, + "learning_rate": 2.7335114214546893e-05, + "loss": 0.2271, + "num_input_tokens_seen": 28739712, + "step": 18810 + }, + { + "epoch": 58.07109737248841, + "grad_norm": 0.32377564907073975, + "learning_rate": 2.7325339477385293e-05, + "loss": 0.2459, + "num_input_tokens_seen": 28747456, + "step": 18815 + }, + { + "epoch": 58.086553323029364, + "grad_norm": 0.35842904448509216, + "learning_rate": 2.7315564381627128e-05, + "loss": 0.2749, + "num_input_tokens_seen": 28754784, + "step": 18820 + }, + { + "epoch": 58.102009273570324, + "grad_norm": 0.5742428302764893, + "learning_rate": 2.7305788928779835e-05, + "loss": 0.2953, + "num_input_tokens_seen": 28762528, + "step": 18825 + }, + { + "epoch": 58.117465224111285, + "grad_norm": 0.7245960831642151, + "learning_rate": 2.729601312035091e-05, + "loss": 0.289, + "num_input_tokens_seen": 28770240, + "step": 18830 + }, + { + "epoch": 58.13292117465224, + "grad_norm": 0.5019060969352722, + "learning_rate": 2.7286236957847915e-05, + "loss": 0.2044, + "num_input_tokens_seen": 28777312, + "step": 18835 + }, + { + "epoch": 58.1483771251932, + "grad_norm": 0.3680460751056671, + "learning_rate": 2.7276460442778446e-05, + "loss": 0.2781, + "num_input_tokens_seen": 28784768, + "step": 18840 + }, + { + "epoch": 58.16383307573416, + "grad_norm": 0.3865100145339966, + "learning_rate": 2.726668357665017e-05, + "loss": 0.2546, + "num_input_tokens_seen": 28792672, + "step": 18845 + }, + { + "epoch": 58.179289026275114, + "grad_norm": 0.6618106961250305, + "learning_rate": 2.7256906360970808e-05, + "loss": 0.2134, + "num_input_tokens_seen": 28800800, + "step": 18850 + }, + { + "epoch": 58.194744976816075, + "grad_norm": 0.6161372065544128, + "learning_rate": 2.7247128797248117e-05, + "loss": 0.2377, + "num_input_tokens_seen": 28807968, + "step": 18855 + }, + { + "epoch": 58.210200927357036, + "grad_norm": 0.5074812173843384, + "learning_rate": 2.7237350886989925e-05, + "loss": 0.2564, + "num_input_tokens_seen": 28815808, + "step": 18860 + }, + { + "epoch": 58.22565687789799, + "grad_norm": 0.5336150527000427, + "learning_rate": 2.7227572631704107e-05, + "loss": 0.3096, + "num_input_tokens_seen": 28823104, + "step": 18865 + }, + { + "epoch": 58.24111282843895, + "grad_norm": 0.5550016760826111, + "learning_rate": 2.7217794032898596e-05, + "loss": 0.2281, + "num_input_tokens_seen": 28831104, + "step": 18870 + }, + { + "epoch": 58.256568778979904, + "grad_norm": 0.37999069690704346, + "learning_rate": 2.7208015092081384e-05, + "loss": 0.2842, + "num_input_tokens_seen": 28839008, + "step": 18875 + }, + { + "epoch": 58.272024729520865, + "grad_norm": 0.5474669933319092, + "learning_rate": 2.719823581076049e-05, + "loss": 0.2533, + "num_input_tokens_seen": 28846624, + "step": 18880 + }, + { + "epoch": 58.287480680061826, + "grad_norm": 0.6639845967292786, + "learning_rate": 2.718845619044401e-05, + "loss": 0.2882, + "num_input_tokens_seen": 28854624, + "step": 18885 + }, + { + "epoch": 58.30293663060278, + "grad_norm": 0.39774468541145325, + "learning_rate": 2.7178676232640088e-05, + "loss": 0.2728, + "num_input_tokens_seen": 28862048, + "step": 18890 + }, + { + "epoch": 58.31839258114374, + "grad_norm": 0.34505945444107056, + "learning_rate": 2.716889593885691e-05, + "loss": 0.269, + "num_input_tokens_seen": 28869792, + "step": 18895 + }, + { + "epoch": 58.3338485316847, + "grad_norm": 0.6997979879379272, + "learning_rate": 2.7159115310602716e-05, + "loss": 0.2374, + "num_input_tokens_seen": 28877440, + "step": 18900 + }, + { + "epoch": 58.349304482225655, + "grad_norm": 0.603663444519043, + "learning_rate": 2.7149334349385814e-05, + "loss": 0.2157, + "num_input_tokens_seen": 28885312, + "step": 18905 + }, + { + "epoch": 58.364760432766616, + "grad_norm": 0.6782128214836121, + "learning_rate": 2.713955305671454e-05, + "loss": 0.2986, + "num_input_tokens_seen": 28893056, + "step": 18910 + }, + { + "epoch": 58.38021638330758, + "grad_norm": 0.31456416845321655, + "learning_rate": 2.71297714340973e-05, + "loss": 0.2072, + "num_input_tokens_seen": 28900672, + "step": 18915 + }, + { + "epoch": 58.39567233384853, + "grad_norm": 0.388980895280838, + "learning_rate": 2.7119989483042545e-05, + "loss": 0.272, + "num_input_tokens_seen": 28907968, + "step": 18920 + }, + { + "epoch": 58.41112828438949, + "grad_norm": 0.3718777000904083, + "learning_rate": 2.7110207205058768e-05, + "loss": 0.2444, + "num_input_tokens_seen": 28915360, + "step": 18925 + }, + { + "epoch": 58.426584234930445, + "grad_norm": 0.5576260685920715, + "learning_rate": 2.7100424601654517e-05, + "loss": 0.2519, + "num_input_tokens_seen": 28922880, + "step": 18930 + }, + { + "epoch": 58.442040185471406, + "grad_norm": 0.4745454490184784, + "learning_rate": 2.7090641674338403e-05, + "loss": 0.2693, + "num_input_tokens_seen": 28931296, + "step": 18935 + }, + { + "epoch": 58.45749613601237, + "grad_norm": 0.39475658535957336, + "learning_rate": 2.7080858424619072e-05, + "loss": 0.3056, + "num_input_tokens_seen": 28939360, + "step": 18940 + }, + { + "epoch": 58.47295208655332, + "grad_norm": 0.5872281789779663, + "learning_rate": 2.707107485400521e-05, + "loss": 0.2313, + "num_input_tokens_seen": 28947040, + "step": 18945 + }, + { + "epoch": 58.48840803709428, + "grad_norm": 0.5172808170318604, + "learning_rate": 2.7061290964005586e-05, + "loss": 0.2288, + "num_input_tokens_seen": 28955136, + "step": 18950 + }, + { + "epoch": 58.50386398763524, + "grad_norm": 0.4795955419540405, + "learning_rate": 2.7051506756129e-05, + "loss": 0.2139, + "num_input_tokens_seen": 28962560, + "step": 18955 + }, + { + "epoch": 58.519319938176196, + "grad_norm": 0.6390344500541687, + "learning_rate": 2.704172223188428e-05, + "loss": 0.3118, + "num_input_tokens_seen": 28969760, + "step": 18960 + }, + { + "epoch": 58.53477588871716, + "grad_norm": 0.5160568356513977, + "learning_rate": 2.7031937392780334e-05, + "loss": 0.1997, + "num_input_tokens_seen": 28977248, + "step": 18965 + }, + { + "epoch": 58.55023183925812, + "grad_norm": 0.8641074299812317, + "learning_rate": 2.702215224032611e-05, + "loss": 0.2227, + "num_input_tokens_seen": 28984288, + "step": 18970 + }, + { + "epoch": 58.56568778979907, + "grad_norm": 0.45701926946640015, + "learning_rate": 2.70123667760306e-05, + "loss": 0.2389, + "num_input_tokens_seen": 28991904, + "step": 18975 + }, + { + "epoch": 58.58114374034003, + "grad_norm": 0.44091975688934326, + "learning_rate": 2.7002581001402845e-05, + "loss": 0.2987, + "num_input_tokens_seen": 28999872, + "step": 18980 + }, + { + "epoch": 58.59659969088099, + "grad_norm": 0.7115287780761719, + "learning_rate": 2.6992794917951923e-05, + "loss": 0.2989, + "num_input_tokens_seen": 29007488, + "step": 18985 + }, + { + "epoch": 58.61205564142195, + "grad_norm": 0.4172707200050354, + "learning_rate": 2.6983008527187e-05, + "loss": 0.2397, + "num_input_tokens_seen": 29015552, + "step": 18990 + }, + { + "epoch": 58.62751159196291, + "grad_norm": 0.4336620569229126, + "learning_rate": 2.697322183061723e-05, + "loss": 0.2598, + "num_input_tokens_seen": 29023296, + "step": 18995 + }, + { + "epoch": 58.64296754250386, + "grad_norm": 0.5269997119903564, + "learning_rate": 2.696343482975186e-05, + "loss": 0.2405, + "num_input_tokens_seen": 29031328, + "step": 19000 + }, + { + "epoch": 58.64296754250386, + "eval_loss": 0.3120494782924652, + "eval_runtime": 6.3233, + "eval_samples_per_second": 90.934, + "eval_steps_per_second": 22.773, + "num_input_tokens_seen": 29031328, + "step": 19000 + }, + { + "epoch": 58.65842349304482, + "grad_norm": 0.39929646253585815, + "learning_rate": 2.695364752610016e-05, + "loss": 0.2332, + "num_input_tokens_seen": 29039200, + "step": 19005 + }, + { + "epoch": 58.67387944358578, + "grad_norm": 0.5953931212425232, + "learning_rate": 2.6943859921171467e-05, + "loss": 0.305, + "num_input_tokens_seen": 29046752, + "step": 19010 + }, + { + "epoch": 58.689335394126736, + "grad_norm": 0.7048777341842651, + "learning_rate": 2.6934072016475143e-05, + "loss": 0.2898, + "num_input_tokens_seen": 29054080, + "step": 19015 + }, + { + "epoch": 58.7047913446677, + "grad_norm": 0.43196094036102295, + "learning_rate": 2.6924283813520606e-05, + "loss": 0.2056, + "num_input_tokens_seen": 29061792, + "step": 19020 + }, + { + "epoch": 58.72024729520866, + "grad_norm": 0.673856258392334, + "learning_rate": 2.691449531381733e-05, + "loss": 0.324, + "num_input_tokens_seen": 29069024, + "step": 19025 + }, + { + "epoch": 58.73570324574961, + "grad_norm": 0.5278506875038147, + "learning_rate": 2.6904706518874816e-05, + "loss": 0.3145, + "num_input_tokens_seen": 29076256, + "step": 19030 + }, + { + "epoch": 58.75115919629057, + "grad_norm": 0.47341036796569824, + "learning_rate": 2.6894917430202615e-05, + "loss": 0.2744, + "num_input_tokens_seen": 29084128, + "step": 19035 + }, + { + "epoch": 58.76661514683153, + "grad_norm": 0.47636619210243225, + "learning_rate": 2.6885128049310343e-05, + "loss": 0.2363, + "num_input_tokens_seen": 29091872, + "step": 19040 + }, + { + "epoch": 58.78207109737249, + "grad_norm": 0.3332866430282593, + "learning_rate": 2.687533837770762e-05, + "loss": 0.2054, + "num_input_tokens_seen": 29099296, + "step": 19045 + }, + { + "epoch": 58.79752704791345, + "grad_norm": 0.4789353907108307, + "learning_rate": 2.6865548416904162e-05, + "loss": 0.2656, + "num_input_tokens_seen": 29107232, + "step": 19050 + }, + { + "epoch": 58.8129829984544, + "grad_norm": 0.5565450191497803, + "learning_rate": 2.68557581684097e-05, + "loss": 0.2741, + "num_input_tokens_seen": 29114848, + "step": 19055 + }, + { + "epoch": 58.82843894899536, + "grad_norm": 0.6453842520713806, + "learning_rate": 2.6845967633733998e-05, + "loss": 0.2579, + "num_input_tokens_seen": 29122336, + "step": 19060 + }, + { + "epoch": 58.84389489953632, + "grad_norm": 0.6325498223304749, + "learning_rate": 2.683617681438689e-05, + "loss": 0.2414, + "num_input_tokens_seen": 29130112, + "step": 19065 + }, + { + "epoch": 58.85935085007728, + "grad_norm": 0.7612115740776062, + "learning_rate": 2.682638571187825e-05, + "loss": 0.3783, + "num_input_tokens_seen": 29137632, + "step": 19070 + }, + { + "epoch": 58.87480680061824, + "grad_norm": 0.3553233742713928, + "learning_rate": 2.6816594327717976e-05, + "loss": 0.2076, + "num_input_tokens_seen": 29145696, + "step": 19075 + }, + { + "epoch": 58.8902627511592, + "grad_norm": 0.5250456929206848, + "learning_rate": 2.680680266341603e-05, + "loss": 0.2573, + "num_input_tokens_seen": 29153344, + "step": 19080 + }, + { + "epoch": 58.90571870170015, + "grad_norm": 0.5421446561813354, + "learning_rate": 2.67970107204824e-05, + "loss": 0.2681, + "num_input_tokens_seen": 29161792, + "step": 19085 + }, + { + "epoch": 58.92117465224111, + "grad_norm": 0.4840373694896698, + "learning_rate": 2.6787218500427142e-05, + "loss": 0.2129, + "num_input_tokens_seen": 29169440, + "step": 19090 + }, + { + "epoch": 58.936630602782074, + "grad_norm": 0.3086816966533661, + "learning_rate": 2.6777426004760332e-05, + "loss": 0.2274, + "num_input_tokens_seen": 29177184, + "step": 19095 + }, + { + "epoch": 58.95208655332303, + "grad_norm": 0.6877962350845337, + "learning_rate": 2.6767633234992094e-05, + "loss": 0.3033, + "num_input_tokens_seen": 29184320, + "step": 19100 + }, + { + "epoch": 58.96754250386399, + "grad_norm": 0.4098924398422241, + "learning_rate": 2.6757840192632598e-05, + "loss": 0.2101, + "num_input_tokens_seen": 29191648, + "step": 19105 + }, + { + "epoch": 58.98299845440495, + "grad_norm": 0.6138983964920044, + "learning_rate": 2.6748046879192052e-05, + "loss": 0.2968, + "num_input_tokens_seen": 29199424, + "step": 19110 + }, + { + "epoch": 58.9984544049459, + "grad_norm": 0.39976781606674194, + "learning_rate": 2.673825329618071e-05, + "loss": 0.2641, + "num_input_tokens_seen": 29207264, + "step": 19115 + }, + { + "epoch": 59.01236476043277, + "grad_norm": 0.5011347532272339, + "learning_rate": 2.6728459445108866e-05, + "loss": 0.3609, + "num_input_tokens_seen": 29214000, + "step": 19120 + }, + { + "epoch": 59.02782071097373, + "grad_norm": 0.37677013874053955, + "learning_rate": 2.6718665327486854e-05, + "loss": 0.2285, + "num_input_tokens_seen": 29221552, + "step": 19125 + }, + { + "epoch": 59.04327666151468, + "grad_norm": 0.5372051000595093, + "learning_rate": 2.6708870944825048e-05, + "loss": 0.2214, + "num_input_tokens_seen": 29228816, + "step": 19130 + }, + { + "epoch": 59.05873261205564, + "grad_norm": 0.5490147471427917, + "learning_rate": 2.6699076298633874e-05, + "loss": 0.2466, + "num_input_tokens_seen": 29237040, + "step": 19135 + }, + { + "epoch": 59.074188562596596, + "grad_norm": 0.3863569498062134, + "learning_rate": 2.6689281390423788e-05, + "loss": 0.2265, + "num_input_tokens_seen": 29244592, + "step": 19140 + }, + { + "epoch": 59.08964451313756, + "grad_norm": 0.5585085153579712, + "learning_rate": 2.667948622170527e-05, + "loss": 0.2901, + "num_input_tokens_seen": 29252432, + "step": 19145 + }, + { + "epoch": 59.10510046367852, + "grad_norm": 0.35923805832862854, + "learning_rate": 2.6669690793988873e-05, + "loss": 0.2319, + "num_input_tokens_seen": 29259632, + "step": 19150 + }, + { + "epoch": 59.12055641421947, + "grad_norm": 0.3297685384750366, + "learning_rate": 2.665989510878518e-05, + "loss": 0.2511, + "num_input_tokens_seen": 29267088, + "step": 19155 + }, + { + "epoch": 59.13601236476043, + "grad_norm": 0.4422866106033325, + "learning_rate": 2.6650099167604793e-05, + "loss": 0.2691, + "num_input_tokens_seen": 29275472, + "step": 19160 + }, + { + "epoch": 59.15146831530139, + "grad_norm": 0.4481583535671234, + "learning_rate": 2.6640302971958376e-05, + "loss": 0.1929, + "num_input_tokens_seen": 29283152, + "step": 19165 + }, + { + "epoch": 59.16692426584235, + "grad_norm": 0.6568586230278015, + "learning_rate": 2.6630506523356635e-05, + "loss": 0.2612, + "num_input_tokens_seen": 29290672, + "step": 19170 + }, + { + "epoch": 59.18238021638331, + "grad_norm": 0.8444507718086243, + "learning_rate": 2.6620709823310297e-05, + "loss": 0.2536, + "num_input_tokens_seen": 29298416, + "step": 19175 + }, + { + "epoch": 59.19783616692427, + "grad_norm": 0.7666417360305786, + "learning_rate": 2.661091287333014e-05, + "loss": 0.2546, + "num_input_tokens_seen": 29305904, + "step": 19180 + }, + { + "epoch": 59.21329211746522, + "grad_norm": 0.29359689354896545, + "learning_rate": 2.660111567492696e-05, + "loss": 0.2529, + "num_input_tokens_seen": 29312912, + "step": 19185 + }, + { + "epoch": 59.22874806800618, + "grad_norm": 0.5245959758758545, + "learning_rate": 2.6591318229611635e-05, + "loss": 0.2794, + "num_input_tokens_seen": 29320656, + "step": 19190 + }, + { + "epoch": 59.244204018547144, + "grad_norm": 0.5056788325309753, + "learning_rate": 2.6581520538895037e-05, + "loss": 0.2379, + "num_input_tokens_seen": 29328752, + "step": 19195 + }, + { + "epoch": 59.2596599690881, + "grad_norm": 0.8207859992980957, + "learning_rate": 2.6571722604288102e-05, + "loss": 0.2108, + "num_input_tokens_seen": 29336560, + "step": 19200 + }, + { + "epoch": 59.2596599690881, + "eval_loss": 0.31101518869400024, + "eval_runtime": 6.2752, + "eval_samples_per_second": 91.631, + "eval_steps_per_second": 22.948, + "num_input_tokens_seen": 29336560, + "step": 19200 + }, + { + "epoch": 59.27511591962906, + "grad_norm": 0.3135232925415039, + "learning_rate": 2.656192442730179e-05, + "loss": 0.273, + "num_input_tokens_seen": 29344208, + "step": 19205 + }, + { + "epoch": 59.29057187017001, + "grad_norm": 0.4690769910812378, + "learning_rate": 2.6552126009447098e-05, + "loss": 0.2421, + "num_input_tokens_seen": 29351344, + "step": 19210 + }, + { + "epoch": 59.30602782071097, + "grad_norm": 0.4778822064399719, + "learning_rate": 2.654232735223507e-05, + "loss": 0.3151, + "num_input_tokens_seen": 29358864, + "step": 19215 + }, + { + "epoch": 59.321483771251934, + "grad_norm": 0.5750157833099365, + "learning_rate": 2.6532528457176787e-05, + "loss": 0.2842, + "num_input_tokens_seen": 29366864, + "step": 19220 + }, + { + "epoch": 59.33693972179289, + "grad_norm": 0.41795679926872253, + "learning_rate": 2.6522729325783348e-05, + "loss": 0.264, + "num_input_tokens_seen": 29374544, + "step": 19225 + }, + { + "epoch": 59.35239567233385, + "grad_norm": 0.6546184420585632, + "learning_rate": 2.6512929959565914e-05, + "loss": 0.3081, + "num_input_tokens_seen": 29382128, + "step": 19230 + }, + { + "epoch": 59.36785162287481, + "grad_norm": 0.46875646710395813, + "learning_rate": 2.6503130360035673e-05, + "loss": 0.2159, + "num_input_tokens_seen": 29389360, + "step": 19235 + }, + { + "epoch": 59.38330757341576, + "grad_norm": 0.4326598644256592, + "learning_rate": 2.6493330528703835e-05, + "loss": 0.2323, + "num_input_tokens_seen": 29396656, + "step": 19240 + }, + { + "epoch": 59.398763523956724, + "grad_norm": 0.26689577102661133, + "learning_rate": 2.648353046708167e-05, + "loss": 0.2581, + "num_input_tokens_seen": 29403824, + "step": 19245 + }, + { + "epoch": 59.414219474497685, + "grad_norm": 0.40223318338394165, + "learning_rate": 2.647373017668046e-05, + "loss": 0.2983, + "num_input_tokens_seen": 29411152, + "step": 19250 + }, + { + "epoch": 59.42967542503864, + "grad_norm": 0.4257497787475586, + "learning_rate": 2.6463929659011537e-05, + "loss": 0.2215, + "num_input_tokens_seen": 29418288, + "step": 19255 + }, + { + "epoch": 59.4451313755796, + "grad_norm": 0.43655920028686523, + "learning_rate": 2.6454128915586262e-05, + "loss": 0.2015, + "num_input_tokens_seen": 29425808, + "step": 19260 + }, + { + "epoch": 59.46058732612055, + "grad_norm": 0.5490999817848206, + "learning_rate": 2.6444327947916036e-05, + "loss": 0.2382, + "num_input_tokens_seen": 29433520, + "step": 19265 + }, + { + "epoch": 59.476043276661514, + "grad_norm": 0.5190300345420837, + "learning_rate": 2.6434526757512292e-05, + "loss": 0.3225, + "num_input_tokens_seen": 29441488, + "step": 19270 + }, + { + "epoch": 59.491499227202475, + "grad_norm": 0.38623592257499695, + "learning_rate": 2.6424725345886486e-05, + "loss": 0.2233, + "num_input_tokens_seen": 29449424, + "step": 19275 + }, + { + "epoch": 59.50695517774343, + "grad_norm": 0.37971821427345276, + "learning_rate": 2.641492371455014e-05, + "loss": 0.2497, + "num_input_tokens_seen": 29456464, + "step": 19280 + }, + { + "epoch": 59.52241112828439, + "grad_norm": 0.579084038734436, + "learning_rate": 2.640512186501477e-05, + "loss": 0.2806, + "num_input_tokens_seen": 29464944, + "step": 19285 + }, + { + "epoch": 59.53786707882535, + "grad_norm": 0.6059790253639221, + "learning_rate": 2.639531979879195e-05, + "loss": 0.2316, + "num_input_tokens_seen": 29472304, + "step": 19290 + }, + { + "epoch": 59.553323029366304, + "grad_norm": 0.4753325879573822, + "learning_rate": 2.638551751739328e-05, + "loss": 0.224, + "num_input_tokens_seen": 29479888, + "step": 19295 + }, + { + "epoch": 59.568778979907265, + "grad_norm": 0.2665259540081024, + "learning_rate": 2.6375715022330404e-05, + "loss": 0.2498, + "num_input_tokens_seen": 29487344, + "step": 19300 + }, + { + "epoch": 59.584234930448225, + "grad_norm": 0.49230098724365234, + "learning_rate": 2.6365912315114976e-05, + "loss": 0.3099, + "num_input_tokens_seen": 29495184, + "step": 19305 + }, + { + "epoch": 59.59969088098918, + "grad_norm": 0.7153189778327942, + "learning_rate": 2.6356109397258704e-05, + "loss": 0.2107, + "num_input_tokens_seen": 29503024, + "step": 19310 + }, + { + "epoch": 59.61514683153014, + "grad_norm": 0.36599236726760864, + "learning_rate": 2.6346306270273325e-05, + "loss": 0.2209, + "num_input_tokens_seen": 29510416, + "step": 19315 + }, + { + "epoch": 59.630602782071094, + "grad_norm": 0.5735997557640076, + "learning_rate": 2.6336502935670608e-05, + "loss": 0.2386, + "num_input_tokens_seen": 29517808, + "step": 19320 + }, + { + "epoch": 59.646058732612055, + "grad_norm": 0.4811175763607025, + "learning_rate": 2.6326699394962333e-05, + "loss": 0.2405, + "num_input_tokens_seen": 29525648, + "step": 19325 + }, + { + "epoch": 59.661514683153015, + "grad_norm": 0.6744702458381653, + "learning_rate": 2.6316895649660334e-05, + "loss": 0.2321, + "num_input_tokens_seen": 29533456, + "step": 19330 + }, + { + "epoch": 59.67697063369397, + "grad_norm": 0.8927619457244873, + "learning_rate": 2.6307091701276486e-05, + "loss": 0.3005, + "num_input_tokens_seen": 29541232, + "step": 19335 + }, + { + "epoch": 59.69242658423493, + "grad_norm": 0.4401887357234955, + "learning_rate": 2.629728755132267e-05, + "loss": 0.2774, + "num_input_tokens_seen": 29548944, + "step": 19340 + }, + { + "epoch": 59.70788253477589, + "grad_norm": 0.3920145332813263, + "learning_rate": 2.628748320131081e-05, + "loss": 0.1925, + "num_input_tokens_seen": 29556048, + "step": 19345 + }, + { + "epoch": 59.723338485316845, + "grad_norm": 0.31074267625808716, + "learning_rate": 2.6277678652752856e-05, + "loss": 0.2373, + "num_input_tokens_seen": 29563440, + "step": 19350 + }, + { + "epoch": 59.738794435857805, + "grad_norm": 0.6751255393028259, + "learning_rate": 2.6267873907160807e-05, + "loss": 0.2862, + "num_input_tokens_seen": 29572240, + "step": 19355 + }, + { + "epoch": 59.754250386398766, + "grad_norm": 0.8568524122238159, + "learning_rate": 2.6258068966046668e-05, + "loss": 0.3494, + "num_input_tokens_seen": 29580080, + "step": 19360 + }, + { + "epoch": 59.76970633693972, + "grad_norm": 0.3302035331726074, + "learning_rate": 2.6248263830922475e-05, + "loss": 0.2602, + "num_input_tokens_seen": 29587472, + "step": 19365 + }, + { + "epoch": 59.78516228748068, + "grad_norm": 0.3638732433319092, + "learning_rate": 2.6238458503300318e-05, + "loss": 0.2475, + "num_input_tokens_seen": 29594896, + "step": 19370 + }, + { + "epoch": 59.80061823802164, + "grad_norm": 0.47695600986480713, + "learning_rate": 2.6228652984692292e-05, + "loss": 0.2251, + "num_input_tokens_seen": 29602704, + "step": 19375 + }, + { + "epoch": 59.816074188562595, + "grad_norm": 0.5130065679550171, + "learning_rate": 2.621884727661054e-05, + "loss": 0.2329, + "num_input_tokens_seen": 29609968, + "step": 19380 + }, + { + "epoch": 59.831530139103556, + "grad_norm": 0.6137351989746094, + "learning_rate": 2.6209041380567222e-05, + "loss": 0.3021, + "num_input_tokens_seen": 29617968, + "step": 19385 + }, + { + "epoch": 59.84698608964451, + "grad_norm": 0.5058630108833313, + "learning_rate": 2.6199235298074527e-05, + "loss": 0.2921, + "num_input_tokens_seen": 29625904, + "step": 19390 + }, + { + "epoch": 59.86244204018547, + "grad_norm": 0.5605476498603821, + "learning_rate": 2.618942903064468e-05, + "loss": 0.3033, + "num_input_tokens_seen": 29634000, + "step": 19395 + }, + { + "epoch": 59.87789799072643, + "grad_norm": 0.4392697215080261, + "learning_rate": 2.6179622579789932e-05, + "loss": 0.2993, + "num_input_tokens_seen": 29642224, + "step": 19400 + }, + { + "epoch": 59.87789799072643, + "eval_loss": 0.3109235465526581, + "eval_runtime": 6.3031, + "eval_samples_per_second": 91.225, + "eval_steps_per_second": 22.846, + "num_input_tokens_seen": 29642224, + "step": 19400 + }, + { + "epoch": 59.893353941267385, + "grad_norm": 0.6009784936904907, + "learning_rate": 2.6169815947022553e-05, + "loss": 0.3203, + "num_input_tokens_seen": 29649936, + "step": 19405 + }, + { + "epoch": 59.908809891808346, + "grad_norm": 0.35244134068489075, + "learning_rate": 2.6160009133854853e-05, + "loss": 0.2163, + "num_input_tokens_seen": 29657936, + "step": 19410 + }, + { + "epoch": 59.92426584234931, + "grad_norm": 0.4608634412288666, + "learning_rate": 2.6150202141799168e-05, + "loss": 0.2387, + "num_input_tokens_seen": 29665008, + "step": 19415 + }, + { + "epoch": 59.93972179289026, + "grad_norm": 0.3579333424568176, + "learning_rate": 2.614039497236786e-05, + "loss": 0.2618, + "num_input_tokens_seen": 29672752, + "step": 19420 + }, + { + "epoch": 59.95517774343122, + "grad_norm": 0.29864776134490967, + "learning_rate": 2.6130587627073315e-05, + "loss": 0.2521, + "num_input_tokens_seen": 29680176, + "step": 19425 + }, + { + "epoch": 59.97063369397218, + "grad_norm": 0.492160826921463, + "learning_rate": 2.6120780107427956e-05, + "loss": 0.2438, + "num_input_tokens_seen": 29687408, + "step": 19430 + }, + { + "epoch": 59.986089644513136, + "grad_norm": 0.7506211996078491, + "learning_rate": 2.6110972414944214e-05, + "loss": 0.2223, + "num_input_tokens_seen": 29695216, + "step": 19435 + }, + { + "epoch": 60.0, + "grad_norm": 0.9189118146896362, + "learning_rate": 2.6101164551134565e-05, + "loss": 0.2351, + "num_input_tokens_seen": 29702208, + "step": 19440 + }, + { + "epoch": 60.01545595054096, + "grad_norm": 0.3370526432991028, + "learning_rate": 2.6091356517511505e-05, + "loss": 0.2879, + "num_input_tokens_seen": 29709664, + "step": 19445 + }, + { + "epoch": 60.030911901081915, + "grad_norm": 0.6783708333969116, + "learning_rate": 2.608154831558755e-05, + "loss": 0.2906, + "num_input_tokens_seen": 29717216, + "step": 19450 + }, + { + "epoch": 60.046367851622875, + "grad_norm": 0.605964183807373, + "learning_rate": 2.607173994687526e-05, + "loss": 0.2601, + "num_input_tokens_seen": 29725152, + "step": 19455 + }, + { + "epoch": 60.061823802163836, + "grad_norm": 0.4876057505607605, + "learning_rate": 2.6061931412887196e-05, + "loss": 0.2535, + "num_input_tokens_seen": 29732928, + "step": 19460 + }, + { + "epoch": 60.07727975270479, + "grad_norm": 0.7937220931053162, + "learning_rate": 2.6052122715135973e-05, + "loss": 0.2596, + "num_input_tokens_seen": 29740320, + "step": 19465 + }, + { + "epoch": 60.09273570324575, + "grad_norm": 0.814091682434082, + "learning_rate": 2.60423138551342e-05, + "loss": 0.3945, + "num_input_tokens_seen": 29748224, + "step": 19470 + }, + { + "epoch": 60.108191653786704, + "grad_norm": 0.4751949906349182, + "learning_rate": 2.6032504834394527e-05, + "loss": 0.3046, + "num_input_tokens_seen": 29756160, + "step": 19475 + }, + { + "epoch": 60.123647604327665, + "grad_norm": 0.832263708114624, + "learning_rate": 2.602269565442964e-05, + "loss": 0.2396, + "num_input_tokens_seen": 29764032, + "step": 19480 + }, + { + "epoch": 60.139103554868626, + "grad_norm": 0.466379314661026, + "learning_rate": 2.6012886316752227e-05, + "loss": 0.2505, + "num_input_tokens_seen": 29771936, + "step": 19485 + }, + { + "epoch": 60.15455950540958, + "grad_norm": 0.40228891372680664, + "learning_rate": 2.6003076822875018e-05, + "loss": 0.2443, + "num_input_tokens_seen": 29779744, + "step": 19490 + }, + { + "epoch": 60.17001545595054, + "grad_norm": 0.6277418732643127, + "learning_rate": 2.5993267174310755e-05, + "loss": 0.2512, + "num_input_tokens_seen": 29787328, + "step": 19495 + }, + { + "epoch": 60.1854714064915, + "grad_norm": 0.38332703709602356, + "learning_rate": 2.5983457372572218e-05, + "loss": 0.2333, + "num_input_tokens_seen": 29794912, + "step": 19500 + }, + { + "epoch": 60.200927357032455, + "grad_norm": 0.7865648865699768, + "learning_rate": 2.597364741917219e-05, + "loss": 0.3108, + "num_input_tokens_seen": 29802848, + "step": 19505 + }, + { + "epoch": 60.216383307573416, + "grad_norm": 0.45820873975753784, + "learning_rate": 2.5963837315623492e-05, + "loss": 0.2766, + "num_input_tokens_seen": 29810368, + "step": 19510 + }, + { + "epoch": 60.23183925811438, + "grad_norm": 0.5513409972190857, + "learning_rate": 2.595402706343897e-05, + "loss": 0.2096, + "num_input_tokens_seen": 29817824, + "step": 19515 + }, + { + "epoch": 60.24729520865533, + "grad_norm": 0.33118319511413574, + "learning_rate": 2.594421666413148e-05, + "loss": 0.2231, + "num_input_tokens_seen": 29825120, + "step": 19520 + }, + { + "epoch": 60.26275115919629, + "grad_norm": 0.418691486120224, + "learning_rate": 2.5934406119213928e-05, + "loss": 0.281, + "num_input_tokens_seen": 29832640, + "step": 19525 + }, + { + "epoch": 60.27820710973725, + "grad_norm": 0.5979465246200562, + "learning_rate": 2.5924595430199193e-05, + "loss": 0.2832, + "num_input_tokens_seen": 29840544, + "step": 19530 + }, + { + "epoch": 60.293663060278206, + "grad_norm": 0.583078145980835, + "learning_rate": 2.5914784598600238e-05, + "loss": 0.2516, + "num_input_tokens_seen": 29848096, + "step": 19535 + }, + { + "epoch": 60.30911901081917, + "grad_norm": 0.40154188871383667, + "learning_rate": 2.5904973625930002e-05, + "loss": 0.2357, + "num_input_tokens_seen": 29855392, + "step": 19540 + }, + { + "epoch": 60.32457496136012, + "grad_norm": 0.4431767165660858, + "learning_rate": 2.5895162513701456e-05, + "loss": 0.2523, + "num_input_tokens_seen": 29863008, + "step": 19545 + }, + { + "epoch": 60.34003091190108, + "grad_norm": 0.5258204936981201, + "learning_rate": 2.5885351263427593e-05, + "loss": 0.3021, + "num_input_tokens_seen": 29870976, + "step": 19550 + }, + { + "epoch": 60.35548686244204, + "grad_norm": 0.5217841863632202, + "learning_rate": 2.5875539876621448e-05, + "loss": 0.286, + "num_input_tokens_seen": 29878656, + "step": 19555 + }, + { + "epoch": 60.370942812982996, + "grad_norm": 0.5254432559013367, + "learning_rate": 2.586572835479605e-05, + "loss": 0.2151, + "num_input_tokens_seen": 29886560, + "step": 19560 + }, + { + "epoch": 60.38639876352396, + "grad_norm": 0.41874927282333374, + "learning_rate": 2.585591669946446e-05, + "loss": 0.2035, + "num_input_tokens_seen": 29894048, + "step": 19565 + }, + { + "epoch": 60.40185471406492, + "grad_norm": 0.3731996715068817, + "learning_rate": 2.5846104912139756e-05, + "loss": 0.2273, + "num_input_tokens_seen": 29901984, + "step": 19570 + }, + { + "epoch": 60.41731066460587, + "grad_norm": 0.3233911991119385, + "learning_rate": 2.583629299433505e-05, + "loss": 0.2305, + "num_input_tokens_seen": 29909504, + "step": 19575 + }, + { + "epoch": 60.43276661514683, + "grad_norm": 0.7147181630134583, + "learning_rate": 2.582648094756345e-05, + "loss": 0.2921, + "num_input_tokens_seen": 29917056, + "step": 19580 + }, + { + "epoch": 60.44822256568779, + "grad_norm": 0.5312479138374329, + "learning_rate": 2.5816668773338098e-05, + "loss": 0.2493, + "num_input_tokens_seen": 29924480, + "step": 19585 + }, + { + "epoch": 60.46367851622875, + "grad_norm": 0.9041016697883606, + "learning_rate": 2.580685647317216e-05, + "loss": 0.2726, + "num_input_tokens_seen": 29932032, + "step": 19590 + }, + { + "epoch": 60.47913446676971, + "grad_norm": 0.41857287287712097, + "learning_rate": 2.5797044048578818e-05, + "loss": 0.2804, + "num_input_tokens_seen": 29939840, + "step": 19595 + }, + { + "epoch": 60.49459041731066, + "grad_norm": 0.754546046257019, + "learning_rate": 2.5787231501071262e-05, + "loss": 0.2417, + "num_input_tokens_seen": 29947456, + "step": 19600 + }, + { + "epoch": 60.49459041731066, + "eval_loss": 0.309762179851532, + "eval_runtime": 6.2629, + "eval_samples_per_second": 91.81, + "eval_steps_per_second": 22.992, + "num_input_tokens_seen": 29947456, + "step": 19600 + }, + { + "epoch": 60.51004636785162, + "grad_norm": 0.5293994545936584, + "learning_rate": 2.577741883216272e-05, + "loss": 0.2303, + "num_input_tokens_seen": 29955136, + "step": 19605 + }, + { + "epoch": 60.52550231839258, + "grad_norm": 0.5579532980918884, + "learning_rate": 2.576760604336642e-05, + "loss": 0.2844, + "num_input_tokens_seen": 29962208, + "step": 19610 + }, + { + "epoch": 60.54095826893354, + "grad_norm": 0.38528192043304443, + "learning_rate": 2.575779313619563e-05, + "loss": 0.2624, + "num_input_tokens_seen": 29969888, + "step": 19615 + }, + { + "epoch": 60.5564142194745, + "grad_norm": 0.36679473519325256, + "learning_rate": 2.5747980112163605e-05, + "loss": 0.2622, + "num_input_tokens_seen": 29978304, + "step": 19620 + }, + { + "epoch": 60.57187017001546, + "grad_norm": 0.3796131908893585, + "learning_rate": 2.5738166972783656e-05, + "loss": 0.234, + "num_input_tokens_seen": 29985728, + "step": 19625 + }, + { + "epoch": 60.58732612055641, + "grad_norm": 0.6394279599189758, + "learning_rate": 2.5728353719569075e-05, + "loss": 0.2408, + "num_input_tokens_seen": 29993504, + "step": 19630 + }, + { + "epoch": 60.60278207109737, + "grad_norm": 0.5147260427474976, + "learning_rate": 2.57185403540332e-05, + "loss": 0.2362, + "num_input_tokens_seen": 30001120, + "step": 19635 + }, + { + "epoch": 60.618238021638334, + "grad_norm": 0.6912987232208252, + "learning_rate": 2.5708726877689375e-05, + "loss": 0.2227, + "num_input_tokens_seen": 30008608, + "step": 19640 + }, + { + "epoch": 60.63369397217929, + "grad_norm": 0.44740572571754456, + "learning_rate": 2.5698913292050964e-05, + "loss": 0.2164, + "num_input_tokens_seen": 30016032, + "step": 19645 + }, + { + "epoch": 60.64914992272025, + "grad_norm": 0.374461829662323, + "learning_rate": 2.568909959863133e-05, + "loss": 0.2751, + "num_input_tokens_seen": 30023520, + "step": 19650 + }, + { + "epoch": 60.66460587326121, + "grad_norm": 0.45388150215148926, + "learning_rate": 2.5679285798943887e-05, + "loss": 0.2097, + "num_input_tokens_seen": 30030720, + "step": 19655 + }, + { + "epoch": 60.68006182380216, + "grad_norm": 0.629582405090332, + "learning_rate": 2.5669471894502035e-05, + "loss": 0.2489, + "num_input_tokens_seen": 30038272, + "step": 19660 + }, + { + "epoch": 60.695517774343124, + "grad_norm": 0.6767942905426025, + "learning_rate": 2.56596578868192e-05, + "loss": 0.2322, + "num_input_tokens_seen": 30045856, + "step": 19665 + }, + { + "epoch": 60.71097372488408, + "grad_norm": 0.4797068238258362, + "learning_rate": 2.564984377740883e-05, + "loss": 0.2238, + "num_input_tokens_seen": 30052800, + "step": 19670 + }, + { + "epoch": 60.72642967542504, + "grad_norm": 0.6864298582077026, + "learning_rate": 2.564002956778438e-05, + "loss": 0.263, + "num_input_tokens_seen": 30061312, + "step": 19675 + }, + { + "epoch": 60.741885625966, + "grad_norm": 0.6708416938781738, + "learning_rate": 2.563021525945934e-05, + "loss": 0.2347, + "num_input_tokens_seen": 30068672, + "step": 19680 + }, + { + "epoch": 60.75734157650695, + "grad_norm": 0.40188294649124146, + "learning_rate": 2.562040085394718e-05, + "loss": 0.211, + "num_input_tokens_seen": 30076768, + "step": 19685 + }, + { + "epoch": 60.77279752704791, + "grad_norm": 0.2239050269126892, + "learning_rate": 2.56105863527614e-05, + "loss": 0.2105, + "num_input_tokens_seen": 30084064, + "step": 19690 + }, + { + "epoch": 60.788253477588874, + "grad_norm": 0.5900670289993286, + "learning_rate": 2.5600771757415548e-05, + "loss": 0.3159, + "num_input_tokens_seen": 30092128, + "step": 19695 + }, + { + "epoch": 60.80370942812983, + "grad_norm": 0.4411679804325104, + "learning_rate": 2.5590957069423134e-05, + "loss": 0.2861, + "num_input_tokens_seen": 30099360, + "step": 19700 + }, + { + "epoch": 60.81916537867079, + "grad_norm": 0.40251997113227844, + "learning_rate": 2.5581142290297716e-05, + "loss": 0.2509, + "num_input_tokens_seen": 30107072, + "step": 19705 + }, + { + "epoch": 60.83462132921175, + "grad_norm": 0.7377637028694153, + "learning_rate": 2.557132742155285e-05, + "loss": 0.3029, + "num_input_tokens_seen": 30114752, + "step": 19710 + }, + { + "epoch": 60.8500772797527, + "grad_norm": 0.6543982028961182, + "learning_rate": 2.556151246470212e-05, + "loss": 0.2638, + "num_input_tokens_seen": 30122432, + "step": 19715 + }, + { + "epoch": 60.865533230293664, + "grad_norm": 0.4145892858505249, + "learning_rate": 2.5551697421259114e-05, + "loss": 0.2258, + "num_input_tokens_seen": 30130592, + "step": 19720 + }, + { + "epoch": 60.88098918083462, + "grad_norm": 0.4463726878166199, + "learning_rate": 2.554188229273743e-05, + "loss": 0.2303, + "num_input_tokens_seen": 30138304, + "step": 19725 + }, + { + "epoch": 60.89644513137558, + "grad_norm": 0.6864736676216125, + "learning_rate": 2.5532067080650678e-05, + "loss": 0.2387, + "num_input_tokens_seen": 30145952, + "step": 19730 + }, + { + "epoch": 60.91190108191654, + "grad_norm": 0.48091834783554077, + "learning_rate": 2.55222517865125e-05, + "loss": 0.244, + "num_input_tokens_seen": 30153504, + "step": 19735 + }, + { + "epoch": 60.92735703245749, + "grad_norm": 0.4156259298324585, + "learning_rate": 2.5512436411836538e-05, + "loss": 0.2909, + "num_input_tokens_seen": 30161248, + "step": 19740 + }, + { + "epoch": 60.942812982998454, + "grad_norm": 0.5172837972640991, + "learning_rate": 2.5502620958136443e-05, + "loss": 0.2258, + "num_input_tokens_seen": 30169216, + "step": 19745 + }, + { + "epoch": 60.958268933539415, + "grad_norm": 0.3773114085197449, + "learning_rate": 2.5492805426925874e-05, + "loss": 0.2727, + "num_input_tokens_seen": 30176832, + "step": 19750 + }, + { + "epoch": 60.97372488408037, + "grad_norm": 1.3681590557098389, + "learning_rate": 2.5482989819718523e-05, + "loss": 0.2777, + "num_input_tokens_seen": 30184256, + "step": 19755 + }, + { + "epoch": 60.98918083462133, + "grad_norm": 0.6723235249519348, + "learning_rate": 2.5473174138028065e-05, + "loss": 0.3042, + "num_input_tokens_seen": 30191680, + "step": 19760 + }, + { + "epoch": 61.003091190108194, + "grad_norm": 0.5181425213813782, + "learning_rate": 2.5463358383368212e-05, + "loss": 0.2646, + "num_input_tokens_seen": 30198624, + "step": 19765 + }, + { + "epoch": 61.01854714064915, + "grad_norm": 0.816684365272522, + "learning_rate": 2.545354255725267e-05, + "loss": 0.2317, + "num_input_tokens_seen": 30206400, + "step": 19770 + }, + { + "epoch": 61.03400309119011, + "grad_norm": 0.44200533628463745, + "learning_rate": 2.5443726661195165e-05, + "loss": 0.2201, + "num_input_tokens_seen": 30214240, + "step": 19775 + }, + { + "epoch": 61.04945904173107, + "grad_norm": 0.5039185881614685, + "learning_rate": 2.543391069670944e-05, + "loss": 0.2831, + "num_input_tokens_seen": 30221888, + "step": 19780 + }, + { + "epoch": 61.06491499227202, + "grad_norm": 0.4412546753883362, + "learning_rate": 2.5424094665309228e-05, + "loss": 0.2388, + "num_input_tokens_seen": 30229344, + "step": 19785 + }, + { + "epoch": 61.08037094281298, + "grad_norm": 0.7769919037818909, + "learning_rate": 2.5414278568508292e-05, + "loss": 0.2471, + "num_input_tokens_seen": 30236768, + "step": 19790 + }, + { + "epoch": 61.095826893353944, + "grad_norm": 0.5686861872673035, + "learning_rate": 2.540446240782039e-05, + "loss": 0.2225, + "num_input_tokens_seen": 30244192, + "step": 19795 + }, + { + "epoch": 61.1112828438949, + "grad_norm": 0.38959017395973206, + "learning_rate": 2.5394646184759307e-05, + "loss": 0.3663, + "num_input_tokens_seen": 30252288, + "step": 19800 + }, + { + "epoch": 61.1112828438949, + "eval_loss": 0.3118714690208435, + "eval_runtime": 6.2968, + "eval_samples_per_second": 91.316, + "eval_steps_per_second": 22.869, + "num_input_tokens_seen": 30252288, + "step": 19800 + }, + { + "epoch": 61.12673879443586, + "grad_norm": 0.3849567770957947, + "learning_rate": 2.538482990083882e-05, + "loss": 0.2133, + "num_input_tokens_seen": 30260736, + "step": 19805 + }, + { + "epoch": 61.14219474497681, + "grad_norm": 0.6707568764686584, + "learning_rate": 2.5375013557572725e-05, + "loss": 0.2582, + "num_input_tokens_seen": 30269152, + "step": 19810 + }, + { + "epoch": 61.15765069551777, + "grad_norm": 0.2635015845298767, + "learning_rate": 2.536519715647483e-05, + "loss": 0.2536, + "num_input_tokens_seen": 30276512, + "step": 19815 + }, + { + "epoch": 61.173106646058734, + "grad_norm": 0.42538705468177795, + "learning_rate": 2.535538069905894e-05, + "loss": 0.2691, + "num_input_tokens_seen": 30284256, + "step": 19820 + }, + { + "epoch": 61.18856259659969, + "grad_norm": 0.5661433935165405, + "learning_rate": 2.534556418683888e-05, + "loss": 0.2685, + "num_input_tokens_seen": 30292384, + "step": 19825 + }, + { + "epoch": 61.20401854714065, + "grad_norm": 0.9897286891937256, + "learning_rate": 2.5335747621328486e-05, + "loss": 0.2076, + "num_input_tokens_seen": 30299520, + "step": 19830 + }, + { + "epoch": 61.21947449768161, + "grad_norm": 0.3331243693828583, + "learning_rate": 2.5325931004041586e-05, + "loss": 0.2471, + "num_input_tokens_seen": 30307168, + "step": 19835 + }, + { + "epoch": 61.23493044822256, + "grad_norm": 0.5194005370140076, + "learning_rate": 2.5316114336492032e-05, + "loss": 0.2166, + "num_input_tokens_seen": 30314560, + "step": 19840 + }, + { + "epoch": 61.250386398763524, + "grad_norm": 0.3963828384876251, + "learning_rate": 2.530629762019367e-05, + "loss": 0.2438, + "num_input_tokens_seen": 30322016, + "step": 19845 + }, + { + "epoch": 61.265842349304485, + "grad_norm": 0.485324889421463, + "learning_rate": 2.5296480856660364e-05, + "loss": 0.2787, + "num_input_tokens_seen": 30330112, + "step": 19850 + }, + { + "epoch": 61.28129829984544, + "grad_norm": 0.4367891252040863, + "learning_rate": 2.528666404740599e-05, + "loss": 0.3021, + "num_input_tokens_seen": 30337344, + "step": 19855 + }, + { + "epoch": 61.2967542503864, + "grad_norm": 0.24776288866996765, + "learning_rate": 2.527684719394442e-05, + "loss": 0.2363, + "num_input_tokens_seen": 30344864, + "step": 19860 + }, + { + "epoch": 61.31221020092736, + "grad_norm": 0.36377930641174316, + "learning_rate": 2.526703029778953e-05, + "loss": 0.2127, + "num_input_tokens_seen": 30352544, + "step": 19865 + }, + { + "epoch": 61.327666151468314, + "grad_norm": 0.7563863396644592, + "learning_rate": 2.5257213360455208e-05, + "loss": 0.2445, + "num_input_tokens_seen": 30360064, + "step": 19870 + }, + { + "epoch": 61.343122102009275, + "grad_norm": 0.6173843145370483, + "learning_rate": 2.5247396383455353e-05, + "loss": 0.2451, + "num_input_tokens_seen": 30367424, + "step": 19875 + }, + { + "epoch": 61.35857805255023, + "grad_norm": 0.43856942653656006, + "learning_rate": 2.523757936830387e-05, + "loss": 0.2253, + "num_input_tokens_seen": 30375264, + "step": 19880 + }, + { + "epoch": 61.37403400309119, + "grad_norm": 0.31684544682502747, + "learning_rate": 2.5227762316514662e-05, + "loss": 0.2824, + "num_input_tokens_seen": 30383040, + "step": 19885 + }, + { + "epoch": 61.38948995363215, + "grad_norm": 0.5195931196212769, + "learning_rate": 2.5217945229601648e-05, + "loss": 0.259, + "num_input_tokens_seen": 30390848, + "step": 19890 + }, + { + "epoch": 61.404945904173104, + "grad_norm": 0.5768713355064392, + "learning_rate": 2.5208128109078738e-05, + "loss": 0.2246, + "num_input_tokens_seen": 30398688, + "step": 19895 + }, + { + "epoch": 61.420401854714065, + "grad_norm": 0.3142842948436737, + "learning_rate": 2.5198310956459853e-05, + "loss": 0.2515, + "num_input_tokens_seen": 30405696, + "step": 19900 + }, + { + "epoch": 61.435857805255026, + "grad_norm": 0.44374901056289673, + "learning_rate": 2.518849377325893e-05, + "loss": 0.2801, + "num_input_tokens_seen": 30413728, + "step": 19905 + }, + { + "epoch": 61.45131375579598, + "grad_norm": 0.9951769709587097, + "learning_rate": 2.51786765609899e-05, + "loss": 0.2738, + "num_input_tokens_seen": 30421152, + "step": 19910 + }, + { + "epoch": 61.46676970633694, + "grad_norm": 0.38461270928382874, + "learning_rate": 2.5168859321166694e-05, + "loss": 0.2386, + "num_input_tokens_seen": 30428672, + "step": 19915 + }, + { + "epoch": 61.4822256568779, + "grad_norm": 0.46342357993125916, + "learning_rate": 2.515904205530326e-05, + "loss": 0.2478, + "num_input_tokens_seen": 30436544, + "step": 19920 + }, + { + "epoch": 61.497681607418855, + "grad_norm": 0.38035455346107483, + "learning_rate": 2.514922476491355e-05, + "loss": 0.3039, + "num_input_tokens_seen": 30444000, + "step": 19925 + }, + { + "epoch": 61.513137557959816, + "grad_norm": 0.7306350469589233, + "learning_rate": 2.51394074515115e-05, + "loss": 0.2877, + "num_input_tokens_seen": 30451168, + "step": 19930 + }, + { + "epoch": 61.52859350850077, + "grad_norm": 0.45781421661376953, + "learning_rate": 2.5129590116611067e-05, + "loss": 0.2659, + "num_input_tokens_seen": 30458944, + "step": 19935 + }, + { + "epoch": 61.54404945904173, + "grad_norm": 0.9433178901672363, + "learning_rate": 2.5119772761726212e-05, + "loss": 0.2603, + "num_input_tokens_seen": 30466688, + "step": 19940 + }, + { + "epoch": 61.55950540958269, + "grad_norm": 0.4888220429420471, + "learning_rate": 2.5109955388370893e-05, + "loss": 0.2466, + "num_input_tokens_seen": 30473984, + "step": 19945 + }, + { + "epoch": 61.574961360123645, + "grad_norm": 0.4369438886642456, + "learning_rate": 2.510013799805907e-05, + "loss": 0.2941, + "num_input_tokens_seen": 30481472, + "step": 19950 + }, + { + "epoch": 61.590417310664606, + "grad_norm": 0.5444830060005188, + "learning_rate": 2.5090320592304706e-05, + "loss": 0.2061, + "num_input_tokens_seen": 30489184, + "step": 19955 + }, + { + "epoch": 61.605873261205566, + "grad_norm": 0.4156999886035919, + "learning_rate": 2.5080503172621777e-05, + "loss": 0.3291, + "num_input_tokens_seen": 30496288, + "step": 19960 + }, + { + "epoch": 61.62132921174652, + "grad_norm": 0.6248978972434998, + "learning_rate": 2.5070685740524246e-05, + "loss": 0.3694, + "num_input_tokens_seen": 30503552, + "step": 19965 + }, + { + "epoch": 61.63678516228748, + "grad_norm": 0.5306891202926636, + "learning_rate": 2.5060868297526084e-05, + "loss": 0.2592, + "num_input_tokens_seen": 30511456, + "step": 19970 + }, + { + "epoch": 61.65224111282844, + "grad_norm": 0.4065871834754944, + "learning_rate": 2.5051050845141267e-05, + "loss": 0.2507, + "num_input_tokens_seen": 30518752, + "step": 19975 + }, + { + "epoch": 61.667697063369395, + "grad_norm": 0.39237192273139954, + "learning_rate": 2.5041233384883765e-05, + "loss": 0.2829, + "num_input_tokens_seen": 30526624, + "step": 19980 + }, + { + "epoch": 61.683153013910356, + "grad_norm": 0.45468077063560486, + "learning_rate": 2.5031415918267564e-05, + "loss": 0.2131, + "num_input_tokens_seen": 30534784, + "step": 19985 + }, + { + "epoch": 61.69860896445132, + "grad_norm": 0.5064849257469177, + "learning_rate": 2.5021598446806626e-05, + "loss": 0.2446, + "num_input_tokens_seen": 30542016, + "step": 19990 + }, + { + "epoch": 61.71406491499227, + "grad_norm": 0.6269859671592712, + "learning_rate": 2.5011780972014937e-05, + "loss": 0.2357, + "num_input_tokens_seen": 30549632, + "step": 19995 + }, + { + "epoch": 61.72952086553323, + "grad_norm": 0.3178132176399231, + "learning_rate": 2.5001963495406478e-05, + "loss": 0.2843, + "num_input_tokens_seen": 30557408, + "step": 20000 + }, + { + "epoch": 61.72952086553323, + "eval_loss": 0.3084073066711426, + "eval_runtime": 6.307, + "eval_samples_per_second": 91.169, + "eval_steps_per_second": 22.832, + "num_input_tokens_seen": 30557408, + "step": 20000 + }, + { + "epoch": 61.744976816074185, + "grad_norm": 0.5031725168228149, + "learning_rate": 2.499214601849522e-05, + "loss": 0.2173, + "num_input_tokens_seen": 30564544, + "step": 20005 + }, + { + "epoch": 61.760432766615146, + "grad_norm": 0.364938884973526, + "learning_rate": 2.4982328542795148e-05, + "loss": 0.2641, + "num_input_tokens_seen": 30572320, + "step": 20010 + }, + { + "epoch": 61.77588871715611, + "grad_norm": 0.5233353972434998, + "learning_rate": 2.497251106982024e-05, + "loss": 0.2168, + "num_input_tokens_seen": 30579456, + "step": 20015 + }, + { + "epoch": 61.79134466769706, + "grad_norm": 0.6200485825538635, + "learning_rate": 2.4962693601084458e-05, + "loss": 0.2542, + "num_input_tokens_seen": 30587200, + "step": 20020 + }, + { + "epoch": 61.80680061823802, + "grad_norm": 0.3618623614311218, + "learning_rate": 2.4952876138101794e-05, + "loss": 0.2952, + "num_input_tokens_seen": 30595232, + "step": 20025 + }, + { + "epoch": 61.82225656877898, + "grad_norm": 0.5032217502593994, + "learning_rate": 2.4943058682386233e-05, + "loss": 0.2437, + "num_input_tokens_seen": 30603360, + "step": 20030 + }, + { + "epoch": 61.837712519319936, + "grad_norm": 0.34730014204978943, + "learning_rate": 2.493324123545173e-05, + "loss": 0.2887, + "num_input_tokens_seen": 30610528, + "step": 20035 + }, + { + "epoch": 61.8531684698609, + "grad_norm": 0.450151264667511, + "learning_rate": 2.4923423798812272e-05, + "loss": 0.3093, + "num_input_tokens_seen": 30618208, + "step": 20040 + }, + { + "epoch": 61.86862442040186, + "grad_norm": 0.7037802934646606, + "learning_rate": 2.4913606373981825e-05, + "loss": 0.2212, + "num_input_tokens_seen": 30626080, + "step": 20045 + }, + { + "epoch": 61.88408037094281, + "grad_norm": 0.482852041721344, + "learning_rate": 2.4903788962474357e-05, + "loss": 0.258, + "num_input_tokens_seen": 30633760, + "step": 20050 + }, + { + "epoch": 61.89953632148377, + "grad_norm": 0.6357303857803345, + "learning_rate": 2.489397156580385e-05, + "loss": 0.2495, + "num_input_tokens_seen": 30641440, + "step": 20055 + }, + { + "epoch": 61.914992272024726, + "grad_norm": 0.5169270634651184, + "learning_rate": 2.4884154185484246e-05, + "loss": 0.2316, + "num_input_tokens_seen": 30649184, + "step": 20060 + }, + { + "epoch": 61.93044822256569, + "grad_norm": 0.6205847263336182, + "learning_rate": 2.4874336823029526e-05, + "loss": 0.2383, + "num_input_tokens_seen": 30656672, + "step": 20065 + }, + { + "epoch": 61.94590417310665, + "grad_norm": 0.6269769072532654, + "learning_rate": 2.4864519479953656e-05, + "loss": 0.2343, + "num_input_tokens_seen": 30664960, + "step": 20070 + }, + { + "epoch": 61.9613601236476, + "grad_norm": 0.6048909425735474, + "learning_rate": 2.485470215777058e-05, + "loss": 0.2491, + "num_input_tokens_seen": 30672704, + "step": 20075 + }, + { + "epoch": 61.97681607418856, + "grad_norm": 0.5390631556510925, + "learning_rate": 2.4844884857994258e-05, + "loss": 0.2523, + "num_input_tokens_seen": 30679936, + "step": 20080 + }, + { + "epoch": 61.99227202472952, + "grad_norm": 0.6583005785942078, + "learning_rate": 2.4835067582138638e-05, + "loss": 0.3069, + "num_input_tokens_seen": 30688000, + "step": 20085 + }, + { + "epoch": 62.00618238021638, + "grad_norm": 0.48761722445487976, + "learning_rate": 2.4825250331717666e-05, + "loss": 0.1957, + "num_input_tokens_seen": 30694656, + "step": 20090 + }, + { + "epoch": 62.02163833075734, + "grad_norm": 0.40561822056770325, + "learning_rate": 2.4815433108245298e-05, + "loss": 0.2914, + "num_input_tokens_seen": 30702208, + "step": 20095 + }, + { + "epoch": 62.0370942812983, + "grad_norm": 0.4118557274341583, + "learning_rate": 2.4805615913235456e-05, + "loss": 0.2814, + "num_input_tokens_seen": 30709728, + "step": 20100 + }, + { + "epoch": 62.052550231839255, + "grad_norm": 0.4259851276874542, + "learning_rate": 2.479579874820208e-05, + "loss": 0.2826, + "num_input_tokens_seen": 30717120, + "step": 20105 + }, + { + "epoch": 62.068006182380216, + "grad_norm": 0.46736958622932434, + "learning_rate": 2.4785981614659115e-05, + "loss": 0.2851, + "num_input_tokens_seen": 30725184, + "step": 20110 + }, + { + "epoch": 62.08346213292118, + "grad_norm": 0.5548693537712097, + "learning_rate": 2.477616451412047e-05, + "loss": 0.3549, + "num_input_tokens_seen": 30733184, + "step": 20115 + }, + { + "epoch": 62.09891808346213, + "grad_norm": 0.3765344023704529, + "learning_rate": 2.476634744810007e-05, + "loss": 0.2352, + "num_input_tokens_seen": 30740416, + "step": 20120 + }, + { + "epoch": 62.11437403400309, + "grad_norm": 0.3304935097694397, + "learning_rate": 2.475653041811183e-05, + "loss": 0.2723, + "num_input_tokens_seen": 30748256, + "step": 20125 + }, + { + "epoch": 62.12982998454405, + "grad_norm": 0.30658233165740967, + "learning_rate": 2.4746713425669652e-05, + "loss": 0.2619, + "num_input_tokens_seen": 30755488, + "step": 20130 + }, + { + "epoch": 62.145285935085006, + "grad_norm": 0.4195229411125183, + "learning_rate": 2.4736896472287458e-05, + "loss": 0.2589, + "num_input_tokens_seen": 30763072, + "step": 20135 + }, + { + "epoch": 62.16074188562597, + "grad_norm": 0.3236118257045746, + "learning_rate": 2.4727079559479124e-05, + "loss": 0.2555, + "num_input_tokens_seen": 30771040, + "step": 20140 + }, + { + "epoch": 62.17619783616692, + "grad_norm": 0.53208988904953, + "learning_rate": 2.4717262688758557e-05, + "loss": 0.219, + "num_input_tokens_seen": 30778592, + "step": 20145 + }, + { + "epoch": 62.19165378670788, + "grad_norm": 0.5826901197433472, + "learning_rate": 2.4707445861639637e-05, + "loss": 0.2477, + "num_input_tokens_seen": 30786080, + "step": 20150 + }, + { + "epoch": 62.20710973724884, + "grad_norm": 0.5391337871551514, + "learning_rate": 2.4697629079636244e-05, + "loss": 0.225, + "num_input_tokens_seen": 30793472, + "step": 20155 + }, + { + "epoch": 62.222565687789796, + "grad_norm": 0.3697952926158905, + "learning_rate": 2.4687812344262244e-05, + "loss": 0.2196, + "num_input_tokens_seen": 30801600, + "step": 20160 + }, + { + "epoch": 62.23802163833076, + "grad_norm": 0.5692461133003235, + "learning_rate": 2.46779956570315e-05, + "loss": 0.2585, + "num_input_tokens_seen": 30809280, + "step": 20165 + }, + { + "epoch": 62.25347758887172, + "grad_norm": 0.33265867829322815, + "learning_rate": 2.466817901945787e-05, + "loss": 0.2458, + "num_input_tokens_seen": 30817248, + "step": 20170 + }, + { + "epoch": 62.26893353941267, + "grad_norm": 0.42996013164520264, + "learning_rate": 2.4658362433055217e-05, + "loss": 0.2243, + "num_input_tokens_seen": 30824832, + "step": 20175 + }, + { + "epoch": 62.28438948995363, + "grad_norm": 0.6310046315193176, + "learning_rate": 2.4648545899337356e-05, + "loss": 0.2399, + "num_input_tokens_seen": 30832064, + "step": 20180 + }, + { + "epoch": 62.29984544049459, + "grad_norm": 0.4467822015285492, + "learning_rate": 2.4638729419818143e-05, + "loss": 0.188, + "num_input_tokens_seen": 30839744, + "step": 20185 + }, + { + "epoch": 62.31530139103555, + "grad_norm": 0.5606081485748291, + "learning_rate": 2.46289129960114e-05, + "loss": 0.2633, + "num_input_tokens_seen": 30847552, + "step": 20190 + }, + { + "epoch": 62.33075734157651, + "grad_norm": 0.4572191834449768, + "learning_rate": 2.4619096629430924e-05, + "loss": 0.3025, + "num_input_tokens_seen": 30855264, + "step": 20195 + }, + { + "epoch": 62.34621329211747, + "grad_norm": 0.630942702293396, + "learning_rate": 2.4609280321590543e-05, + "loss": 0.2279, + "num_input_tokens_seen": 30862656, + "step": 20200 + }, + { + "epoch": 62.34621329211747, + "eval_loss": 0.31080830097198486, + "eval_runtime": 6.2854, + "eval_samples_per_second": 91.481, + "eval_steps_per_second": 22.91, + "num_input_tokens_seen": 30862656, + "step": 20200 + }, + { + "epoch": 62.36166924265842, + "grad_norm": 0.4777791500091553, + "learning_rate": 2.4599464074004037e-05, + "loss": 0.2258, + "num_input_tokens_seen": 30870112, + "step": 20205 + }, + { + "epoch": 62.37712519319938, + "grad_norm": 0.6119409203529358, + "learning_rate": 2.4589647888185204e-05, + "loss": 0.253, + "num_input_tokens_seen": 30877568, + "step": 20210 + }, + { + "epoch": 62.39258114374034, + "grad_norm": 0.285862535238266, + "learning_rate": 2.4579831765647836e-05, + "loss": 0.2327, + "num_input_tokens_seen": 30885376, + "step": 20215 + }, + { + "epoch": 62.4080370942813, + "grad_norm": 0.528700590133667, + "learning_rate": 2.4570015707905676e-05, + "loss": 0.2349, + "num_input_tokens_seen": 30893152, + "step": 20220 + }, + { + "epoch": 62.42349304482226, + "grad_norm": 1.2211004495620728, + "learning_rate": 2.4560199716472508e-05, + "loss": 0.2762, + "num_input_tokens_seen": 30900416, + "step": 20225 + }, + { + "epoch": 62.43894899536321, + "grad_norm": 0.5128402709960938, + "learning_rate": 2.455038379286207e-05, + "loss": 0.2329, + "num_input_tokens_seen": 30908000, + "step": 20230 + }, + { + "epoch": 62.45440494590417, + "grad_norm": 0.5586560368537903, + "learning_rate": 2.4540567938588095e-05, + "loss": 0.2607, + "num_input_tokens_seen": 30915712, + "step": 20235 + }, + { + "epoch": 62.469860896445134, + "grad_norm": 0.3861485719680786, + "learning_rate": 2.4530752155164328e-05, + "loss": 0.2335, + "num_input_tokens_seen": 30923040, + "step": 20240 + }, + { + "epoch": 62.48531684698609, + "grad_norm": 0.5375968217849731, + "learning_rate": 2.4520936444104463e-05, + "loss": 0.2016, + "num_input_tokens_seen": 30931680, + "step": 20245 + }, + { + "epoch": 62.50077279752705, + "grad_norm": 0.4049575626850128, + "learning_rate": 2.4511120806922218e-05, + "loss": 0.3333, + "num_input_tokens_seen": 30938688, + "step": 20250 + }, + { + "epoch": 62.51622874806801, + "grad_norm": 0.30570659041404724, + "learning_rate": 2.45013052451313e-05, + "loss": 0.2396, + "num_input_tokens_seen": 30946240, + "step": 20255 + }, + { + "epoch": 62.53168469860896, + "grad_norm": 0.4434534013271332, + "learning_rate": 2.4491489760245376e-05, + "loss": 0.203, + "num_input_tokens_seen": 30954656, + "step": 20260 + }, + { + "epoch": 62.547140649149924, + "grad_norm": 0.4198566973209381, + "learning_rate": 2.4481674353778115e-05, + "loss": 0.2244, + "num_input_tokens_seen": 30962432, + "step": 20265 + }, + { + "epoch": 62.56259659969088, + "grad_norm": 0.5693820118904114, + "learning_rate": 2.447185902724319e-05, + "loss": 0.2509, + "num_input_tokens_seen": 30969888, + "step": 20270 + }, + { + "epoch": 62.57805255023184, + "grad_norm": 0.9390968084335327, + "learning_rate": 2.4462043782154233e-05, + "loss": 0.2801, + "num_input_tokens_seen": 30977728, + "step": 20275 + }, + { + "epoch": 62.5935085007728, + "grad_norm": 0.6551035046577454, + "learning_rate": 2.4452228620024895e-05, + "loss": 0.269, + "num_input_tokens_seen": 30985440, + "step": 20280 + }, + { + "epoch": 62.60896445131375, + "grad_norm": 0.6879645586013794, + "learning_rate": 2.4442413542368776e-05, + "loss": 0.2691, + "num_input_tokens_seen": 30992960, + "step": 20285 + }, + { + "epoch": 62.624420401854714, + "grad_norm": 0.406587690114975, + "learning_rate": 2.4432598550699502e-05, + "loss": 0.2412, + "num_input_tokens_seen": 31000160, + "step": 20290 + }, + { + "epoch": 62.639876352395675, + "grad_norm": 0.5007425546646118, + "learning_rate": 2.4422783646530663e-05, + "loss": 0.2501, + "num_input_tokens_seen": 31007904, + "step": 20295 + }, + { + "epoch": 62.65533230293663, + "grad_norm": 0.4546840786933899, + "learning_rate": 2.441296883137584e-05, + "loss": 0.279, + "num_input_tokens_seen": 31015168, + "step": 20300 + }, + { + "epoch": 62.67078825347759, + "grad_norm": 0.4602547287940979, + "learning_rate": 2.4403154106748592e-05, + "loss": 0.2579, + "num_input_tokens_seen": 31022944, + "step": 20305 + }, + { + "epoch": 62.68624420401855, + "grad_norm": 0.40555959939956665, + "learning_rate": 2.4393339474162494e-05, + "loss": 0.2631, + "num_input_tokens_seen": 31030624, + "step": 20310 + }, + { + "epoch": 62.701700154559504, + "grad_norm": 0.41487011313438416, + "learning_rate": 2.4383524935131062e-05, + "loss": 0.2421, + "num_input_tokens_seen": 31038272, + "step": 20315 + }, + { + "epoch": 62.717156105100464, + "grad_norm": 0.2877490818500519, + "learning_rate": 2.437371049116784e-05, + "loss": 0.29, + "num_input_tokens_seen": 31045920, + "step": 20320 + }, + { + "epoch": 62.732612055641425, + "grad_norm": 0.4799184799194336, + "learning_rate": 2.436389614378632e-05, + "loss": 0.2133, + "num_input_tokens_seen": 31053600, + "step": 20325 + }, + { + "epoch": 62.74806800618238, + "grad_norm": 0.3794982433319092, + "learning_rate": 2.435408189450002e-05, + "loss": 0.2217, + "num_input_tokens_seen": 31061152, + "step": 20330 + }, + { + "epoch": 62.76352395672334, + "grad_norm": 0.6014113426208496, + "learning_rate": 2.4344267744822406e-05, + "loss": 0.2892, + "num_input_tokens_seen": 31068800, + "step": 20335 + }, + { + "epoch": 62.778979907264294, + "grad_norm": 0.34584668278694153, + "learning_rate": 2.4334453696266944e-05, + "loss": 0.3327, + "num_input_tokens_seen": 31076032, + "step": 20340 + }, + { + "epoch": 62.794435857805254, + "grad_norm": 0.45513588190078735, + "learning_rate": 2.432463975034708e-05, + "loss": 0.2676, + "num_input_tokens_seen": 31083200, + "step": 20345 + }, + { + "epoch": 62.809891808346215, + "grad_norm": 0.381002813577652, + "learning_rate": 2.4314825908576265e-05, + "loss": 0.2281, + "num_input_tokens_seen": 31091328, + "step": 20350 + }, + { + "epoch": 62.82534775888717, + "grad_norm": 0.6628137230873108, + "learning_rate": 2.4305012172467897e-05, + "loss": 0.2136, + "num_input_tokens_seen": 31098976, + "step": 20355 + }, + { + "epoch": 62.84080370942813, + "grad_norm": 0.5028077960014343, + "learning_rate": 2.4295198543535393e-05, + "loss": 0.2675, + "num_input_tokens_seen": 31107104, + "step": 20360 + }, + { + "epoch": 62.85625965996909, + "grad_norm": 0.5075802803039551, + "learning_rate": 2.4285385023292124e-05, + "loss": 0.2769, + "num_input_tokens_seen": 31114688, + "step": 20365 + }, + { + "epoch": 62.871715610510044, + "grad_norm": 0.6314690709114075, + "learning_rate": 2.427557161325147e-05, + "loss": 0.2555, + "num_input_tokens_seen": 31122976, + "step": 20370 + }, + { + "epoch": 62.887171561051005, + "grad_norm": 0.7505627870559692, + "learning_rate": 2.4265758314926778e-05, + "loss": 0.2453, + "num_input_tokens_seen": 31130336, + "step": 20375 + }, + { + "epoch": 62.902627511591966, + "grad_norm": 0.7448394894599915, + "learning_rate": 2.4255945129831373e-05, + "loss": 0.3118, + "num_input_tokens_seen": 31137920, + "step": 20380 + }, + { + "epoch": 62.91808346213292, + "grad_norm": 0.9159676432609558, + "learning_rate": 2.4246132059478578e-05, + "loss": 0.2801, + "num_input_tokens_seen": 31146016, + "step": 20385 + }, + { + "epoch": 62.93353941267388, + "grad_norm": 0.4354098439216614, + "learning_rate": 2.4236319105381706e-05, + "loss": 0.2788, + "num_input_tokens_seen": 31153696, + "step": 20390 + }, + { + "epoch": 62.948995363214834, + "grad_norm": 0.48785343766212463, + "learning_rate": 2.422650626905401e-05, + "loss": 0.3047, + "num_input_tokens_seen": 31161728, + "step": 20395 + }, + { + "epoch": 62.964451313755795, + "grad_norm": 0.3510212302207947, + "learning_rate": 2.4216693552008785e-05, + "loss": 0.2419, + "num_input_tokens_seen": 31169472, + "step": 20400 + }, + { + "epoch": 62.964451313755795, + "eval_loss": 0.3090716600418091, + "eval_runtime": 6.3086, + "eval_samples_per_second": 91.146, + "eval_steps_per_second": 22.826, + "num_input_tokens_seen": 31169472, + "step": 20400 + }, + { + "epoch": 62.979907264296756, + "grad_norm": 0.7040321230888367, + "learning_rate": 2.4206880955759247e-05, + "loss": 0.2438, + "num_input_tokens_seen": 31177344, + "step": 20405 + }, + { + "epoch": 62.99536321483771, + "grad_norm": 0.5338494777679443, + "learning_rate": 2.419706848181863e-05, + "loss": 0.2516, + "num_input_tokens_seen": 31185056, + "step": 20410 + }, + { + "epoch": 63.009273570324574, + "grad_norm": 0.6697709560394287, + "learning_rate": 2.4187256131700153e-05, + "loss": 0.2129, + "num_input_tokens_seen": 31191664, + "step": 20415 + }, + { + "epoch": 63.024729520865534, + "grad_norm": 0.6711438894271851, + "learning_rate": 2.4177443906916985e-05, + "loss": 0.2367, + "num_input_tokens_seen": 31199248, + "step": 20420 + }, + { + "epoch": 63.04018547140649, + "grad_norm": 0.5892099738121033, + "learning_rate": 2.4167631808982303e-05, + "loss": 0.2043, + "num_input_tokens_seen": 31206352, + "step": 20425 + }, + { + "epoch": 63.05564142194745, + "grad_norm": 0.5006194710731506, + "learning_rate": 2.4157819839409264e-05, + "loss": 0.2754, + "num_input_tokens_seen": 31213776, + "step": 20430 + }, + { + "epoch": 63.07109737248841, + "grad_norm": 0.374762624502182, + "learning_rate": 2.414800799971098e-05, + "loss": 0.225, + "num_input_tokens_seen": 31221936, + "step": 20435 + }, + { + "epoch": 63.086553323029364, + "grad_norm": 0.7322760820388794, + "learning_rate": 2.4138196291400582e-05, + "loss": 0.2241, + "num_input_tokens_seen": 31229776, + "step": 20440 + }, + { + "epoch": 63.102009273570324, + "grad_norm": 0.5891981720924377, + "learning_rate": 2.412838471599114e-05, + "loss": 0.2707, + "num_input_tokens_seen": 31237616, + "step": 20445 + }, + { + "epoch": 63.117465224111285, + "grad_norm": 0.5309884548187256, + "learning_rate": 2.411857327499572e-05, + "loss": 0.2402, + "num_input_tokens_seen": 31245520, + "step": 20450 + }, + { + "epoch": 63.13292117465224, + "grad_norm": 0.47360265254974365, + "learning_rate": 2.410876196992739e-05, + "loss": 0.2176, + "num_input_tokens_seen": 31253264, + "step": 20455 + }, + { + "epoch": 63.1483771251932, + "grad_norm": 0.42743685841560364, + "learning_rate": 2.4098950802299156e-05, + "loss": 0.2265, + "num_input_tokens_seen": 31260720, + "step": 20460 + }, + { + "epoch": 63.16383307573416, + "grad_norm": 0.544838547706604, + "learning_rate": 2.4089139773624027e-05, + "loss": 0.2023, + "num_input_tokens_seen": 31268080, + "step": 20465 + }, + { + "epoch": 63.179289026275114, + "grad_norm": 0.4812687337398529, + "learning_rate": 2.4079328885415007e-05, + "loss": 0.2329, + "num_input_tokens_seen": 31275888, + "step": 20470 + }, + { + "epoch": 63.194744976816075, + "grad_norm": 0.8770227432250977, + "learning_rate": 2.4069518139185036e-05, + "loss": 0.2166, + "num_input_tokens_seen": 31283856, + "step": 20475 + }, + { + "epoch": 63.210200927357036, + "grad_norm": 0.690126359462738, + "learning_rate": 2.405970753644706e-05, + "loss": 0.2399, + "num_input_tokens_seen": 31291280, + "step": 20480 + }, + { + "epoch": 63.22565687789799, + "grad_norm": 0.7610945105552673, + "learning_rate": 2.4049897078714e-05, + "loss": 0.349, + "num_input_tokens_seen": 31299120, + "step": 20485 + }, + { + "epoch": 63.24111282843895, + "grad_norm": 0.4827478229999542, + "learning_rate": 2.404008676749874e-05, + "loss": 0.2519, + "num_input_tokens_seen": 31307056, + "step": 20490 + }, + { + "epoch": 63.256568778979904, + "grad_norm": 0.5456489324569702, + "learning_rate": 2.403027660431418e-05, + "loss": 0.3125, + "num_input_tokens_seen": 31315056, + "step": 20495 + }, + { + "epoch": 63.272024729520865, + "grad_norm": 0.658113420009613, + "learning_rate": 2.402046659067314e-05, + "loss": 0.2478, + "num_input_tokens_seen": 31322864, + "step": 20500 + }, + { + "epoch": 63.287480680061826, + "grad_norm": 0.2532230615615845, + "learning_rate": 2.401065672808847e-05, + "loss": 0.2307, + "num_input_tokens_seen": 31330512, + "step": 20505 + }, + { + "epoch": 63.30293663060278, + "grad_norm": 0.6328368186950684, + "learning_rate": 2.400084701807296e-05, + "loss": 0.2087, + "num_input_tokens_seen": 31337648, + "step": 20510 + }, + { + "epoch": 63.31839258114374, + "grad_norm": 0.6028344035148621, + "learning_rate": 2.39910374621394e-05, + "loss": 0.2431, + "num_input_tokens_seen": 31345456, + "step": 20515 + }, + { + "epoch": 63.3338485316847, + "grad_norm": 0.44632941484451294, + "learning_rate": 2.3981228061800544e-05, + "loss": 0.2124, + "num_input_tokens_seen": 31352848, + "step": 20520 + }, + { + "epoch": 63.349304482225655, + "grad_norm": 0.6475494503974915, + "learning_rate": 2.3971418818569115e-05, + "loss": 0.2291, + "num_input_tokens_seen": 31360656, + "step": 20525 + }, + { + "epoch": 63.364760432766616, + "grad_norm": 0.5224652290344238, + "learning_rate": 2.3961609733957832e-05, + "loss": 0.3035, + "num_input_tokens_seen": 31367888, + "step": 20530 + }, + { + "epoch": 63.38021638330758, + "grad_norm": 0.6909589767456055, + "learning_rate": 2.395180080947939e-05, + "loss": 0.2764, + "num_input_tokens_seen": 31375792, + "step": 20535 + }, + { + "epoch": 63.39567233384853, + "grad_norm": 0.7850907444953918, + "learning_rate": 2.394199204664642e-05, + "loss": 0.2918, + "num_input_tokens_seen": 31383152, + "step": 20540 + }, + { + "epoch": 63.41112828438949, + "grad_norm": 0.5226409435272217, + "learning_rate": 2.3932183446971583e-05, + "loss": 0.2499, + "num_input_tokens_seen": 31390608, + "step": 20545 + }, + { + "epoch": 63.426584234930445, + "grad_norm": 0.8076445460319519, + "learning_rate": 2.3922375011967473e-05, + "loss": 0.2356, + "num_input_tokens_seen": 31398096, + "step": 20550 + }, + { + "epoch": 63.442040185471406, + "grad_norm": 0.40298840403556824, + "learning_rate": 2.3912566743146676e-05, + "loss": 0.2758, + "num_input_tokens_seen": 31405104, + "step": 20555 + }, + { + "epoch": 63.45749613601237, + "grad_norm": 0.46340295672416687, + "learning_rate": 2.390275864202176e-05, + "loss": 0.3489, + "num_input_tokens_seen": 31412560, + "step": 20560 + }, + { + "epoch": 63.47295208655332, + "grad_norm": 0.384308397769928, + "learning_rate": 2.3892950710105243e-05, + "loss": 0.2327, + "num_input_tokens_seen": 31420272, + "step": 20565 + }, + { + "epoch": 63.48840803709428, + "grad_norm": 0.5156206488609314, + "learning_rate": 2.3883142948909635e-05, + "loss": 0.2545, + "num_input_tokens_seen": 31428272, + "step": 20570 + }, + { + "epoch": 63.50386398763524, + "grad_norm": 0.5493907928466797, + "learning_rate": 2.3873335359947433e-05, + "loss": 0.285, + "num_input_tokens_seen": 31436176, + "step": 20575 + }, + { + "epoch": 63.519319938176196, + "grad_norm": 0.331451952457428, + "learning_rate": 2.3863527944731066e-05, + "loss": 0.3431, + "num_input_tokens_seen": 31443632, + "step": 20580 + }, + { + "epoch": 63.53477588871716, + "grad_norm": 0.4288462996482849, + "learning_rate": 2.385372070477298e-05, + "loss": 0.2845, + "num_input_tokens_seen": 31450960, + "step": 20585 + }, + { + "epoch": 63.55023183925812, + "grad_norm": 0.8642883896827698, + "learning_rate": 2.384391364158556e-05, + "loss": 0.2894, + "num_input_tokens_seen": 31458384, + "step": 20590 + }, + { + "epoch": 63.56568778979907, + "grad_norm": 0.36189788579940796, + "learning_rate": 2.3834106756681185e-05, + "loss": 0.2544, + "num_input_tokens_seen": 31466448, + "step": 20595 + }, + { + "epoch": 63.58114374034003, + "grad_norm": 0.4593207836151123, + "learning_rate": 2.3824300051572206e-05, + "loss": 0.2191, + "num_input_tokens_seen": 31474928, + "step": 20600 + }, + { + "epoch": 63.58114374034003, + "eval_loss": 0.31003808975219727, + "eval_runtime": 6.3415, + "eval_samples_per_second": 90.673, + "eval_steps_per_second": 22.708, + "num_input_tokens_seen": 31474928, + "step": 20600 + }, + { + "epoch": 63.59659969088099, + "grad_norm": 0.31507667899131775, + "learning_rate": 2.3814493527770923e-05, + "loss": 0.2574, + "num_input_tokens_seen": 31482256, + "step": 20605 + }, + { + "epoch": 63.61205564142195, + "grad_norm": 0.5502341389656067, + "learning_rate": 2.3804687186789637e-05, + "loss": 0.2281, + "num_input_tokens_seen": 31489328, + "step": 20610 + }, + { + "epoch": 63.62751159196291, + "grad_norm": 0.3695066273212433, + "learning_rate": 2.379488103014062e-05, + "loss": 0.3365, + "num_input_tokens_seen": 31497136, + "step": 20615 + }, + { + "epoch": 63.64296754250386, + "grad_norm": 0.49995988607406616, + "learning_rate": 2.3785075059336086e-05, + "loss": 0.2311, + "num_input_tokens_seen": 31504912, + "step": 20620 + }, + { + "epoch": 63.65842349304482, + "grad_norm": 0.4457852244377136, + "learning_rate": 2.3775269275888248e-05, + "loss": 0.3283, + "num_input_tokens_seen": 31512240, + "step": 20625 + }, + { + "epoch": 63.67387944358578, + "grad_norm": 0.6148441433906555, + "learning_rate": 2.3765463681309274e-05, + "loss": 0.2313, + "num_input_tokens_seen": 31519504, + "step": 20630 + }, + { + "epoch": 63.689335394126736, + "grad_norm": 0.3018190264701843, + "learning_rate": 2.3755658277111313e-05, + "loss": 0.2377, + "num_input_tokens_seen": 31526992, + "step": 20635 + }, + { + "epoch": 63.7047913446677, + "grad_norm": 0.6490955948829651, + "learning_rate": 2.374585306480649e-05, + "loss": 0.198, + "num_input_tokens_seen": 31534480, + "step": 20640 + }, + { + "epoch": 63.72024729520866, + "grad_norm": 0.559270977973938, + "learning_rate": 2.3736048045906877e-05, + "loss": 0.2968, + "num_input_tokens_seen": 31541904, + "step": 20645 + }, + { + "epoch": 63.73570324574961, + "grad_norm": 0.40424439311027527, + "learning_rate": 2.372624322192454e-05, + "loss": 0.2509, + "num_input_tokens_seen": 31549776, + "step": 20650 + }, + { + "epoch": 63.75115919629057, + "grad_norm": 0.6104480624198914, + "learning_rate": 2.3716438594371516e-05, + "loss": 0.2363, + "num_input_tokens_seen": 31557296, + "step": 20655 + }, + { + "epoch": 63.76661514683153, + "grad_norm": 0.6294844746589661, + "learning_rate": 2.3706634164759784e-05, + "loss": 0.2715, + "num_input_tokens_seen": 31565232, + "step": 20660 + }, + { + "epoch": 63.78207109737249, + "grad_norm": 0.5703375339508057, + "learning_rate": 2.3696829934601323e-05, + "loss": 0.2899, + "num_input_tokens_seen": 31572816, + "step": 20665 + }, + { + "epoch": 63.79752704791345, + "grad_norm": 0.41500699520111084, + "learning_rate": 2.3687025905408053e-05, + "loss": 0.2389, + "num_input_tokens_seen": 31580528, + "step": 20670 + }, + { + "epoch": 63.8129829984544, + "grad_norm": 0.6186912655830383, + "learning_rate": 2.3677222078691886e-05, + "loss": 0.3017, + "num_input_tokens_seen": 31588624, + "step": 20675 + }, + { + "epoch": 63.82843894899536, + "grad_norm": 0.3837515115737915, + "learning_rate": 2.366741845596471e-05, + "loss": 0.2032, + "num_input_tokens_seen": 31595984, + "step": 20680 + }, + { + "epoch": 63.84389489953632, + "grad_norm": 0.3842329680919647, + "learning_rate": 2.3657615038738343e-05, + "loss": 0.2554, + "num_input_tokens_seen": 31604080, + "step": 20685 + }, + { + "epoch": 63.85935085007728, + "grad_norm": 0.34775134921073914, + "learning_rate": 2.3647811828524614e-05, + "loss": 0.1994, + "num_input_tokens_seen": 31611856, + "step": 20690 + }, + { + "epoch": 63.87480680061824, + "grad_norm": 0.3969894051551819, + "learning_rate": 2.363800882683529e-05, + "loss": 0.24, + "num_input_tokens_seen": 31619280, + "step": 20695 + }, + { + "epoch": 63.8902627511592, + "grad_norm": 0.3255986273288727, + "learning_rate": 2.3628206035182125e-05, + "loss": 0.255, + "num_input_tokens_seen": 31627312, + "step": 20700 + }, + { + "epoch": 63.90571870170015, + "grad_norm": 0.5568186044692993, + "learning_rate": 2.361840345507683e-05, + "loss": 0.2757, + "num_input_tokens_seen": 31634864, + "step": 20705 + }, + { + "epoch": 63.92117465224111, + "grad_norm": 0.45078980922698975, + "learning_rate": 2.3608601088031073e-05, + "loss": 0.2865, + "num_input_tokens_seen": 31642384, + "step": 20710 + }, + { + "epoch": 63.936630602782074, + "grad_norm": 0.8828786611557007, + "learning_rate": 2.3598798935556516e-05, + "loss": 0.2534, + "num_input_tokens_seen": 31649648, + "step": 20715 + }, + { + "epoch": 63.95208655332303, + "grad_norm": 0.3939429521560669, + "learning_rate": 2.3588996999164784e-05, + "loss": 0.2315, + "num_input_tokens_seen": 31657776, + "step": 20720 + }, + { + "epoch": 63.96754250386399, + "grad_norm": 0.4771733582019806, + "learning_rate": 2.3579195280367434e-05, + "loss": 0.2091, + "num_input_tokens_seen": 31665232, + "step": 20725 + }, + { + "epoch": 63.98299845440495, + "grad_norm": 0.5053581595420837, + "learning_rate": 2.356939378067603e-05, + "loss": 0.2525, + "num_input_tokens_seen": 31673328, + "step": 20730 + }, + { + "epoch": 63.9984544049459, + "grad_norm": 0.5971755385398865, + "learning_rate": 2.3559592501602092e-05, + "loss": 0.3378, + "num_input_tokens_seen": 31680624, + "step": 20735 + }, + { + "epoch": 64.01236476043276, + "grad_norm": 0.6488777995109558, + "learning_rate": 2.3549791444657076e-05, + "loss": 0.2869, + "num_input_tokens_seen": 31687552, + "step": 20740 + }, + { + "epoch": 64.02782071097373, + "grad_norm": 0.4519585072994232, + "learning_rate": 2.353999061135246e-05, + "loss": 0.241, + "num_input_tokens_seen": 31695264, + "step": 20745 + }, + { + "epoch": 64.04327666151468, + "grad_norm": 0.2757170796394348, + "learning_rate": 2.3530190003199626e-05, + "loss": 0.2334, + "num_input_tokens_seen": 31702976, + "step": 20750 + }, + { + "epoch": 64.05873261205564, + "grad_norm": 0.5411752462387085, + "learning_rate": 2.3520389621709965e-05, + "loss": 0.2279, + "num_input_tokens_seen": 31710432, + "step": 20755 + }, + { + "epoch": 64.0741885625966, + "grad_norm": 0.6494357585906982, + "learning_rate": 2.351058946839483e-05, + "loss": 0.2405, + "num_input_tokens_seen": 31717280, + "step": 20760 + }, + { + "epoch": 64.08964451313756, + "grad_norm": 0.6309649348258972, + "learning_rate": 2.350078954476551e-05, + "loss": 0.1949, + "num_input_tokens_seen": 31724480, + "step": 20765 + }, + { + "epoch": 64.10510046367851, + "grad_norm": 0.3889094889163971, + "learning_rate": 2.3490989852333272e-05, + "loss": 0.2299, + "num_input_tokens_seen": 31732576, + "step": 20770 + }, + { + "epoch": 64.12055641421948, + "grad_norm": 0.8335018157958984, + "learning_rate": 2.3481190392609377e-05, + "loss": 0.327, + "num_input_tokens_seen": 31739872, + "step": 20775 + }, + { + "epoch": 64.13601236476043, + "grad_norm": 0.3422092795372009, + "learning_rate": 2.3471391167105e-05, + "loss": 0.2377, + "num_input_tokens_seen": 31748032, + "step": 20780 + }, + { + "epoch": 64.15146831530139, + "grad_norm": 0.2845865786075592, + "learning_rate": 2.3461592177331325e-05, + "loss": 0.2401, + "num_input_tokens_seen": 31755648, + "step": 20785 + }, + { + "epoch": 64.16692426584235, + "grad_norm": 0.38770604133605957, + "learning_rate": 2.345179342479946e-05, + "loss": 0.2154, + "num_input_tokens_seen": 31763520, + "step": 20790 + }, + { + "epoch": 64.18238021638331, + "grad_norm": 0.5551255941390991, + "learning_rate": 2.3441994911020503e-05, + "loss": 0.2637, + "num_input_tokens_seen": 31770784, + "step": 20795 + }, + { + "epoch": 64.19783616692426, + "grad_norm": 0.4326201379299164, + "learning_rate": 2.3432196637505522e-05, + "loss": 0.2659, + "num_input_tokens_seen": 31778496, + "step": 20800 + }, + { + "epoch": 64.19783616692426, + "eval_loss": 0.3087671995162964, + "eval_runtime": 6.3295, + "eval_samples_per_second": 90.844, + "eval_steps_per_second": 22.751, + "num_input_tokens_seen": 31778496, + "step": 20800 + }, + { + "epoch": 64.21329211746523, + "grad_norm": 0.6052542328834534, + "learning_rate": 2.3422398605765515e-05, + "loss": 0.2274, + "num_input_tokens_seen": 31786112, + "step": 20805 + }, + { + "epoch": 64.22874806800618, + "grad_norm": 0.5565476417541504, + "learning_rate": 2.3412600817311462e-05, + "loss": 0.2775, + "num_input_tokens_seen": 31793504, + "step": 20810 + }, + { + "epoch": 64.24420401854714, + "grad_norm": 0.5297768115997314, + "learning_rate": 2.3402803273654326e-05, + "loss": 0.2518, + "num_input_tokens_seen": 31801728, + "step": 20815 + }, + { + "epoch": 64.2596599690881, + "grad_norm": 0.4842531085014343, + "learning_rate": 2.3393005976304983e-05, + "loss": 0.2747, + "num_input_tokens_seen": 31809664, + "step": 20820 + }, + { + "epoch": 64.27511591962906, + "grad_norm": 0.6412750482559204, + "learning_rate": 2.338320892677432e-05, + "loss": 0.3644, + "num_input_tokens_seen": 31817248, + "step": 20825 + }, + { + "epoch": 64.29057187017001, + "grad_norm": 0.36325398087501526, + "learning_rate": 2.3373412126573155e-05, + "loss": 0.2624, + "num_input_tokens_seen": 31824736, + "step": 20830 + }, + { + "epoch": 64.30602782071098, + "grad_norm": 0.5743479132652283, + "learning_rate": 2.3363615577212285e-05, + "loss": 0.2398, + "num_input_tokens_seen": 31832352, + "step": 20835 + }, + { + "epoch": 64.32148377125193, + "grad_norm": 0.2982104420661926, + "learning_rate": 2.3353819280202455e-05, + "loss": 0.2427, + "num_input_tokens_seen": 31840064, + "step": 20840 + }, + { + "epoch": 64.33693972179289, + "grad_norm": 0.4819014072418213, + "learning_rate": 2.334402323705438e-05, + "loss": 0.2262, + "num_input_tokens_seen": 31848000, + "step": 20845 + }, + { + "epoch": 64.35239567233384, + "grad_norm": 0.5484129190444946, + "learning_rate": 2.3334227449278725e-05, + "loss": 0.2308, + "num_input_tokens_seen": 31855552, + "step": 20850 + }, + { + "epoch": 64.36785162287481, + "grad_norm": 0.3735593557357788, + "learning_rate": 2.3324431918386143e-05, + "loss": 0.2235, + "num_input_tokens_seen": 31863040, + "step": 20855 + }, + { + "epoch": 64.38330757341576, + "grad_norm": 0.5767542719841003, + "learning_rate": 2.3314636645887207e-05, + "loss": 0.2781, + "num_input_tokens_seen": 31870432, + "step": 20860 + }, + { + "epoch": 64.39876352395672, + "grad_norm": 0.7045870423316956, + "learning_rate": 2.3304841633292487e-05, + "loss": 0.2468, + "num_input_tokens_seen": 31878400, + "step": 20865 + }, + { + "epoch": 64.41421947449768, + "grad_norm": 0.4389835000038147, + "learning_rate": 2.329504688211248e-05, + "loss": 0.2255, + "num_input_tokens_seen": 31885888, + "step": 20870 + }, + { + "epoch": 64.42967542503864, + "grad_norm": 0.597791850566864, + "learning_rate": 2.3285252393857677e-05, + "loss": 0.2506, + "num_input_tokens_seen": 31894144, + "step": 20875 + }, + { + "epoch": 64.44513137557959, + "grad_norm": 0.34341123700141907, + "learning_rate": 2.327545817003851e-05, + "loss": 0.2347, + "num_input_tokens_seen": 31902048, + "step": 20880 + }, + { + "epoch": 64.46058732612056, + "grad_norm": 0.43336471915245056, + "learning_rate": 2.326566421216535e-05, + "loss": 0.2667, + "num_input_tokens_seen": 31909984, + "step": 20885 + }, + { + "epoch": 64.47604327666151, + "grad_norm": 0.3788301646709442, + "learning_rate": 2.3255870521748565e-05, + "loss": 0.26, + "num_input_tokens_seen": 31917696, + "step": 20890 + }, + { + "epoch": 64.49149922720247, + "grad_norm": 0.426573783159256, + "learning_rate": 2.3246077100298474e-05, + "loss": 0.1923, + "num_input_tokens_seen": 31926080, + "step": 20895 + }, + { + "epoch": 64.50695517774344, + "grad_norm": 0.6422605514526367, + "learning_rate": 2.3236283949325328e-05, + "loss": 0.2612, + "num_input_tokens_seen": 31933792, + "step": 20900 + }, + { + "epoch": 64.52241112828439, + "grad_norm": 0.38661035895347595, + "learning_rate": 2.3226491070339368e-05, + "loss": 0.257, + "num_input_tokens_seen": 31941664, + "step": 20905 + }, + { + "epoch": 64.53786707882534, + "grad_norm": 0.46580371260643005, + "learning_rate": 2.3216698464850762e-05, + "loss": 0.2975, + "num_input_tokens_seen": 31949088, + "step": 20910 + }, + { + "epoch": 64.55332302936631, + "grad_norm": 0.6361277103424072, + "learning_rate": 2.320690613436967e-05, + "loss": 0.2931, + "num_input_tokens_seen": 31956704, + "step": 20915 + }, + { + "epoch": 64.56877897990726, + "grad_norm": 0.44421836733818054, + "learning_rate": 2.3197114080406192e-05, + "loss": 0.2344, + "num_input_tokens_seen": 31964256, + "step": 20920 + }, + { + "epoch": 64.58423493044822, + "grad_norm": 0.3739317059516907, + "learning_rate": 2.3187322304470365e-05, + "loss": 0.2499, + "num_input_tokens_seen": 31971712, + "step": 20925 + }, + { + "epoch": 64.59969088098919, + "grad_norm": 0.40552669763565063, + "learning_rate": 2.3177530808072222e-05, + "loss": 0.2772, + "num_input_tokens_seen": 31979264, + "step": 20930 + }, + { + "epoch": 64.61514683153014, + "grad_norm": 0.45615166425704956, + "learning_rate": 2.316773959272174e-05, + "loss": 0.2297, + "num_input_tokens_seen": 31986976, + "step": 20935 + }, + { + "epoch": 64.6306027820711, + "grad_norm": 0.581922709941864, + "learning_rate": 2.3157948659928823e-05, + "loss": 0.2323, + "num_input_tokens_seen": 31994368, + "step": 20940 + }, + { + "epoch": 64.64605873261206, + "grad_norm": 0.7743876576423645, + "learning_rate": 2.3148158011203388e-05, + "loss": 0.2144, + "num_input_tokens_seen": 32001952, + "step": 20945 + }, + { + "epoch": 64.66151468315302, + "grad_norm": 0.38297390937805176, + "learning_rate": 2.3138367648055253e-05, + "loss": 0.2593, + "num_input_tokens_seen": 32009440, + "step": 20950 + }, + { + "epoch": 64.67697063369397, + "grad_norm": 0.5845410823822021, + "learning_rate": 2.312857757199422e-05, + "loss": 0.264, + "num_input_tokens_seen": 32017184, + "step": 20955 + }, + { + "epoch": 64.69242658423494, + "grad_norm": 0.556242048740387, + "learning_rate": 2.3118787784530048e-05, + "loss": 0.2226, + "num_input_tokens_seen": 32025120, + "step": 20960 + }, + { + "epoch": 64.70788253477589, + "grad_norm": 0.9506428241729736, + "learning_rate": 2.310899828717243e-05, + "loss": 0.2835, + "num_input_tokens_seen": 32032640, + "step": 20965 + }, + { + "epoch": 64.72333848531684, + "grad_norm": 0.4317498803138733, + "learning_rate": 2.309920908143104e-05, + "loss": 0.2579, + "num_input_tokens_seen": 32039776, + "step": 20970 + }, + { + "epoch": 64.7387944358578, + "grad_norm": 0.3812468349933624, + "learning_rate": 2.308942016881551e-05, + "loss": 0.2215, + "num_input_tokens_seen": 32047072, + "step": 20975 + }, + { + "epoch": 64.75425038639877, + "grad_norm": 0.4936833381652832, + "learning_rate": 2.307963155083539e-05, + "loss": 0.3061, + "num_input_tokens_seen": 32055392, + "step": 20980 + }, + { + "epoch": 64.76970633693972, + "grad_norm": 0.515932559967041, + "learning_rate": 2.306984322900022e-05, + "loss": 0.2566, + "num_input_tokens_seen": 32063168, + "step": 20985 + }, + { + "epoch": 64.78516228748067, + "grad_norm": 0.2831677198410034, + "learning_rate": 2.3060055204819482e-05, + "loss": 0.2797, + "num_input_tokens_seen": 32070912, + "step": 20990 + }, + { + "epoch": 64.80061823802164, + "grad_norm": 0.39801204204559326, + "learning_rate": 2.3050267479802604e-05, + "loss": 0.216, + "num_input_tokens_seen": 32078336, + "step": 20995 + }, + { + "epoch": 64.8160741885626, + "grad_norm": 0.3820362687110901, + "learning_rate": 2.304048005545899e-05, + "loss": 0.237, + "num_input_tokens_seen": 32086304, + "step": 21000 + }, + { + "epoch": 64.8160741885626, + "eval_loss": 0.308478981256485, + "eval_runtime": 6.3121, + "eval_samples_per_second": 91.095, + "eval_steps_per_second": 22.813, + "num_input_tokens_seen": 32086304, + "step": 21000 + }, + { + "epoch": 64.83153013910355, + "grad_norm": 0.6663228869438171, + "learning_rate": 2.3030692933297972e-05, + "loss": 0.2489, + "num_input_tokens_seen": 32093824, + "step": 21005 + }, + { + "epoch": 64.84698608964452, + "grad_norm": 0.4141896367073059, + "learning_rate": 2.3020906114828843e-05, + "loss": 0.2855, + "num_input_tokens_seen": 32101440, + "step": 21010 + }, + { + "epoch": 64.86244204018547, + "grad_norm": 0.998133659362793, + "learning_rate": 2.301111960156088e-05, + "loss": 0.3369, + "num_input_tokens_seen": 32109024, + "step": 21015 + }, + { + "epoch": 64.87789799072642, + "grad_norm": 0.39370185136795044, + "learning_rate": 2.300133339500326e-05, + "loss": 0.2677, + "num_input_tokens_seen": 32116672, + "step": 21020 + }, + { + "epoch": 64.89335394126739, + "grad_norm": 0.5730671882629395, + "learning_rate": 2.2991547496665148e-05, + "loss": 0.2042, + "num_input_tokens_seen": 32124320, + "step": 21025 + }, + { + "epoch": 64.90880989180835, + "grad_norm": 0.5898191332817078, + "learning_rate": 2.298176190805565e-05, + "loss": 0.22, + "num_input_tokens_seen": 32132192, + "step": 21030 + }, + { + "epoch": 64.9242658423493, + "grad_norm": 0.6424750089645386, + "learning_rate": 2.2971976630683826e-05, + "loss": 0.2991, + "num_input_tokens_seen": 32139840, + "step": 21035 + }, + { + "epoch": 64.93972179289027, + "grad_norm": 0.4962642788887024, + "learning_rate": 2.29621916660587e-05, + "loss": 0.2607, + "num_input_tokens_seen": 32147584, + "step": 21040 + }, + { + "epoch": 64.95517774343122, + "grad_norm": 0.5053713917732239, + "learning_rate": 2.295240701568922e-05, + "loss": 0.2853, + "num_input_tokens_seen": 32154912, + "step": 21045 + }, + { + "epoch": 64.97063369397218, + "grad_norm": 0.7218994498252869, + "learning_rate": 2.2942622681084312e-05, + "loss": 0.322, + "num_input_tokens_seen": 32162368, + "step": 21050 + }, + { + "epoch": 64.98608964451314, + "grad_norm": 0.4220591187477112, + "learning_rate": 2.293283866375284e-05, + "loss": 0.2123, + "num_input_tokens_seen": 32169824, + "step": 21055 + }, + { + "epoch": 65.0, + "grad_norm": 0.7401726245880127, + "learning_rate": 2.2923054965203627e-05, + "loss": 0.2699, + "num_input_tokens_seen": 32176528, + "step": 21060 + }, + { + "epoch": 65.01545595054095, + "grad_norm": 0.40385502576828003, + "learning_rate": 2.2913271586945443e-05, + "loss": 0.2513, + "num_input_tokens_seen": 32184048, + "step": 21065 + }, + { + "epoch": 65.03091190108192, + "grad_norm": 0.33603084087371826, + "learning_rate": 2.290348853048699e-05, + "loss": 0.259, + "num_input_tokens_seen": 32191696, + "step": 21070 + }, + { + "epoch": 65.04636785162288, + "grad_norm": 0.3293779492378235, + "learning_rate": 2.2893705797336956e-05, + "loss": 0.2378, + "num_input_tokens_seen": 32198864, + "step": 21075 + }, + { + "epoch": 65.06182380216383, + "grad_norm": 0.3530829846858978, + "learning_rate": 2.288392338900397e-05, + "loss": 0.2575, + "num_input_tokens_seen": 32206096, + "step": 21080 + }, + { + "epoch": 65.0772797527048, + "grad_norm": 0.3954606056213379, + "learning_rate": 2.2874141306996576e-05, + "loss": 0.2403, + "num_input_tokens_seen": 32213872, + "step": 21085 + }, + { + "epoch": 65.09273570324575, + "grad_norm": 0.3982163667678833, + "learning_rate": 2.2864359552823312e-05, + "loss": 0.2279, + "num_input_tokens_seen": 32221360, + "step": 21090 + }, + { + "epoch": 65.1081916537867, + "grad_norm": 0.39837533235549927, + "learning_rate": 2.2854578127992648e-05, + "loss": 0.2081, + "num_input_tokens_seen": 32229168, + "step": 21095 + }, + { + "epoch": 65.12364760432767, + "grad_norm": 0.4818834662437439, + "learning_rate": 2.2844797034012988e-05, + "loss": 0.2712, + "num_input_tokens_seen": 32236880, + "step": 21100 + }, + { + "epoch": 65.13910355486863, + "grad_norm": 0.40924277901649475, + "learning_rate": 2.2835016272392722e-05, + "loss": 0.2807, + "num_input_tokens_seen": 32244656, + "step": 21105 + }, + { + "epoch": 65.15455950540958, + "grad_norm": 0.7071319222450256, + "learning_rate": 2.2825235844640142e-05, + "loss": 0.2584, + "num_input_tokens_seen": 32252016, + "step": 21110 + }, + { + "epoch": 65.17001545595055, + "grad_norm": 0.3476217985153198, + "learning_rate": 2.2815455752263522e-05, + "loss": 0.19, + "num_input_tokens_seen": 32259568, + "step": 21115 + }, + { + "epoch": 65.1854714064915, + "grad_norm": 0.37263765931129456, + "learning_rate": 2.2805675996771092e-05, + "loss": 0.2533, + "num_input_tokens_seen": 32267568, + "step": 21120 + }, + { + "epoch": 65.20092735703246, + "grad_norm": 0.35292115807533264, + "learning_rate": 2.2795896579670987e-05, + "loss": 0.2292, + "num_input_tokens_seen": 32275248, + "step": 21125 + }, + { + "epoch": 65.21638330757341, + "grad_norm": 0.5884835720062256, + "learning_rate": 2.2786117502471337e-05, + "loss": 0.2425, + "num_input_tokens_seen": 32282576, + "step": 21130 + }, + { + "epoch": 65.23183925811438, + "grad_norm": 0.5826055407524109, + "learning_rate": 2.2776338766680185e-05, + "loss": 0.2279, + "num_input_tokens_seen": 32289968, + "step": 21135 + }, + { + "epoch": 65.24729520865533, + "grad_norm": 0.9393557906150818, + "learning_rate": 2.2766560373805533e-05, + "loss": 0.2681, + "num_input_tokens_seen": 32297776, + "step": 21140 + }, + { + "epoch": 65.26275115919628, + "grad_norm": 0.5230081677436829, + "learning_rate": 2.2756782325355353e-05, + "loss": 0.2867, + "num_input_tokens_seen": 32305808, + "step": 21145 + }, + { + "epoch": 65.27820710973725, + "grad_norm": 0.35019153356552124, + "learning_rate": 2.2747004622837514e-05, + "loss": 0.2275, + "num_input_tokens_seen": 32313552, + "step": 21150 + }, + { + "epoch": 65.2936630602782, + "grad_norm": 0.6483094096183777, + "learning_rate": 2.2737227267759878e-05, + "loss": 0.3107, + "num_input_tokens_seen": 32321040, + "step": 21155 + }, + { + "epoch": 65.30911901081916, + "grad_norm": 0.42852672934532166, + "learning_rate": 2.272745026163024e-05, + "loss": 0.2612, + "num_input_tokens_seen": 32328304, + "step": 21160 + }, + { + "epoch": 65.32457496136013, + "grad_norm": 0.40620312094688416, + "learning_rate": 2.271767360595633e-05, + "loss": 0.2352, + "num_input_tokens_seen": 32335664, + "step": 21165 + }, + { + "epoch": 65.34003091190108, + "grad_norm": 0.5437992811203003, + "learning_rate": 2.270789730224583e-05, + "loss": 0.2178, + "num_input_tokens_seen": 32342800, + "step": 21170 + }, + { + "epoch": 65.35548686244204, + "grad_norm": 0.5858874320983887, + "learning_rate": 2.2698121352006367e-05, + "loss": 0.2341, + "num_input_tokens_seen": 32350320, + "step": 21175 + }, + { + "epoch": 65.370942812983, + "grad_norm": 0.48103752732276917, + "learning_rate": 2.2688345756745517e-05, + "loss": 0.246, + "num_input_tokens_seen": 32358448, + "step": 21180 + }, + { + "epoch": 65.38639876352396, + "grad_norm": 0.4042814373970032, + "learning_rate": 2.267857051797081e-05, + "loss": 0.23, + "num_input_tokens_seen": 32366448, + "step": 21185 + }, + { + "epoch": 65.40185471406491, + "grad_norm": 0.5770852565765381, + "learning_rate": 2.2668795637189695e-05, + "loss": 0.3009, + "num_input_tokens_seen": 32374064, + "step": 21190 + }, + { + "epoch": 65.41731066460588, + "grad_norm": 0.6183199882507324, + "learning_rate": 2.2659021115909586e-05, + "loss": 0.2819, + "num_input_tokens_seen": 32381136, + "step": 21195 + }, + { + "epoch": 65.43276661514683, + "grad_norm": 0.7297003269195557, + "learning_rate": 2.2649246955637847e-05, + "loss": 0.3355, + "num_input_tokens_seen": 32389328, + "step": 21200 + }, + { + "epoch": 65.43276661514683, + "eval_loss": 0.30805814266204834, + "eval_runtime": 6.2708, + "eval_samples_per_second": 91.694, + "eval_steps_per_second": 22.963, + "num_input_tokens_seen": 32389328, + "step": 21200 + }, + { + "epoch": 65.44822256568779, + "grad_norm": 0.39595600962638855, + "learning_rate": 2.2639473157881766e-05, + "loss": 0.2605, + "num_input_tokens_seen": 32397264, + "step": 21205 + }, + { + "epoch": 65.46367851622875, + "grad_norm": 0.8109120726585388, + "learning_rate": 2.2629699724148594e-05, + "loss": 0.3145, + "num_input_tokens_seen": 32404720, + "step": 21210 + }, + { + "epoch": 65.47913446676971, + "grad_norm": 0.6535093188285828, + "learning_rate": 2.26199266559455e-05, + "loss": 0.1916, + "num_input_tokens_seen": 32412240, + "step": 21215 + }, + { + "epoch": 65.49459041731066, + "grad_norm": 0.9300081729888916, + "learning_rate": 2.2610153954779625e-05, + "loss": 0.2616, + "num_input_tokens_seen": 32419888, + "step": 21220 + }, + { + "epoch": 65.51004636785163, + "grad_norm": 0.5559262633323669, + "learning_rate": 2.2600381622158056e-05, + "loss": 0.2644, + "num_input_tokens_seen": 32427760, + "step": 21225 + }, + { + "epoch": 65.52550231839258, + "grad_norm": 0.2681616544723511, + "learning_rate": 2.2590609659587783e-05, + "loss": 0.2782, + "num_input_tokens_seen": 32435120, + "step": 21230 + }, + { + "epoch": 65.54095826893354, + "grad_norm": 0.3196113407611847, + "learning_rate": 2.2580838068575787e-05, + "loss": 0.2371, + "num_input_tokens_seen": 32442992, + "step": 21235 + }, + { + "epoch": 65.5564142194745, + "grad_norm": 0.34869176149368286, + "learning_rate": 2.257106685062896e-05, + "loss": 0.2934, + "num_input_tokens_seen": 32450896, + "step": 21240 + }, + { + "epoch": 65.57187017001546, + "grad_norm": 0.5217129588127136, + "learning_rate": 2.256129600725415e-05, + "loss": 0.1949, + "num_input_tokens_seen": 32458448, + "step": 21245 + }, + { + "epoch": 65.58732612055641, + "grad_norm": 0.41926008462905884, + "learning_rate": 2.2551525539958145e-05, + "loss": 0.2298, + "num_input_tokens_seen": 32465840, + "step": 21250 + }, + { + "epoch": 65.60278207109737, + "grad_norm": 0.420827716588974, + "learning_rate": 2.2541755450247663e-05, + "loss": 0.2346, + "num_input_tokens_seen": 32473488, + "step": 21255 + }, + { + "epoch": 65.61823802163833, + "grad_norm": 0.31078770756721497, + "learning_rate": 2.2531985739629382e-05, + "loss": 0.2326, + "num_input_tokens_seen": 32481264, + "step": 21260 + }, + { + "epoch": 65.63369397217929, + "grad_norm": 0.510520339012146, + "learning_rate": 2.2522216409609924e-05, + "loss": 0.2382, + "num_input_tokens_seen": 32488688, + "step": 21265 + }, + { + "epoch": 65.64914992272024, + "grad_norm": 0.6241837739944458, + "learning_rate": 2.2512447461695826e-05, + "loss": 0.254, + "num_input_tokens_seen": 32496432, + "step": 21270 + }, + { + "epoch": 65.66460587326121, + "grad_norm": 0.562235951423645, + "learning_rate": 2.2502678897393593e-05, + "loss": 0.2263, + "num_input_tokens_seen": 32503408, + "step": 21275 + }, + { + "epoch": 65.68006182380216, + "grad_norm": 0.4181927442550659, + "learning_rate": 2.2492910718209665e-05, + "loss": 0.3325, + "num_input_tokens_seen": 32510736, + "step": 21280 + }, + { + "epoch": 65.69551777434312, + "grad_norm": 0.6460524797439575, + "learning_rate": 2.2483142925650398e-05, + "loss": 0.2673, + "num_input_tokens_seen": 32518160, + "step": 21285 + }, + { + "epoch": 65.71097372488408, + "grad_norm": 0.8803573250770569, + "learning_rate": 2.247337552122213e-05, + "loss": 0.268, + "num_input_tokens_seen": 32525840, + "step": 21290 + }, + { + "epoch": 65.72642967542504, + "grad_norm": 0.3636034429073334, + "learning_rate": 2.24636085064311e-05, + "loss": 0.223, + "num_input_tokens_seen": 32533552, + "step": 21295 + }, + { + "epoch": 65.74188562596599, + "grad_norm": 0.4294278025627136, + "learning_rate": 2.245384188278351e-05, + "loss": 0.2653, + "num_input_tokens_seen": 32540880, + "step": 21300 + }, + { + "epoch": 65.75734157650696, + "grad_norm": 0.39363789558410645, + "learning_rate": 2.2444075651785513e-05, + "loss": 0.2615, + "num_input_tokens_seen": 32548592, + "step": 21305 + }, + { + "epoch": 65.77279752704791, + "grad_norm": 0.534835159778595, + "learning_rate": 2.243430981494316e-05, + "loss": 0.355, + "num_input_tokens_seen": 32556016, + "step": 21310 + }, + { + "epoch": 65.78825347758887, + "grad_norm": 0.3990272283554077, + "learning_rate": 2.2424544373762475e-05, + "loss": 0.2117, + "num_input_tokens_seen": 32564528, + "step": 21315 + }, + { + "epoch": 65.80370942812984, + "grad_norm": 0.3028179407119751, + "learning_rate": 2.2414779329749418e-05, + "loss": 0.2146, + "num_input_tokens_seen": 32572560, + "step": 21320 + }, + { + "epoch": 65.81916537867079, + "grad_norm": 0.48781538009643555, + "learning_rate": 2.2405014684409873e-05, + "loss": 0.2437, + "num_input_tokens_seen": 32580016, + "step": 21325 + }, + { + "epoch": 65.83462132921174, + "grad_norm": 0.41801413893699646, + "learning_rate": 2.239525043924968e-05, + "loss": 0.2263, + "num_input_tokens_seen": 32588272, + "step": 21330 + }, + { + "epoch": 65.85007727975271, + "grad_norm": 0.5522680282592773, + "learning_rate": 2.2385486595774592e-05, + "loss": 0.2357, + "num_input_tokens_seen": 32595888, + "step": 21335 + }, + { + "epoch": 65.86553323029366, + "grad_norm": 0.5505148768424988, + "learning_rate": 2.237572315549033e-05, + "loss": 0.2272, + "num_input_tokens_seen": 32603504, + "step": 21340 + }, + { + "epoch": 65.88098918083462, + "grad_norm": 0.8260542154312134, + "learning_rate": 2.2365960119902545e-05, + "loss": 0.2904, + "num_input_tokens_seen": 32610672, + "step": 21345 + }, + { + "epoch": 65.89644513137559, + "grad_norm": 0.5562597513198853, + "learning_rate": 2.2356197490516806e-05, + "loss": 0.2751, + "num_input_tokens_seen": 32618352, + "step": 21350 + }, + { + "epoch": 65.91190108191654, + "grad_norm": 0.4061402380466461, + "learning_rate": 2.234643526883863e-05, + "loss": 0.2411, + "num_input_tokens_seen": 32626416, + "step": 21355 + }, + { + "epoch": 65.9273570324575, + "grad_norm": 0.7875488996505737, + "learning_rate": 2.2336673456373497e-05, + "loss": 0.2909, + "num_input_tokens_seen": 32633840, + "step": 21360 + }, + { + "epoch": 65.94281298299846, + "grad_norm": 0.46090462803840637, + "learning_rate": 2.2326912054626772e-05, + "loss": 0.2067, + "num_input_tokens_seen": 32641616, + "step": 21365 + }, + { + "epoch": 65.95826893353942, + "grad_norm": 0.4912562370300293, + "learning_rate": 2.2317151065103813e-05, + "loss": 0.2781, + "num_input_tokens_seen": 32649232, + "step": 21370 + }, + { + "epoch": 65.97372488408037, + "grad_norm": 0.8507387042045593, + "learning_rate": 2.2307390489309865e-05, + "loss": 0.2491, + "num_input_tokens_seen": 32656688, + "step": 21375 + }, + { + "epoch": 65.98918083462132, + "grad_norm": 0.5302955508232117, + "learning_rate": 2.2297630328750146e-05, + "loss": 0.2964, + "num_input_tokens_seen": 32664592, + "step": 21380 + }, + { + "epoch": 66.0030911901082, + "grad_norm": 0.5304604172706604, + "learning_rate": 2.228787058492979e-05, + "loss": 0.2527, + "num_input_tokens_seen": 32671664, + "step": 21385 + }, + { + "epoch": 66.01854714064915, + "grad_norm": 0.439972460269928, + "learning_rate": 2.2278111259353875e-05, + "loss": 0.3152, + "num_input_tokens_seen": 32679920, + "step": 21390 + }, + { + "epoch": 66.03400309119012, + "grad_norm": 0.4975886344909668, + "learning_rate": 2.2268352353527395e-05, + "loss": 0.234, + "num_input_tokens_seen": 32688752, + "step": 21395 + }, + { + "epoch": 66.04945904173107, + "grad_norm": 0.5058537721633911, + "learning_rate": 2.225859386895533e-05, + "loss": 0.3007, + "num_input_tokens_seen": 32696656, + "step": 21400 + }, + { + "epoch": 66.04945904173107, + "eval_loss": 0.30797278881073, + "eval_runtime": 6.2832, + "eval_samples_per_second": 91.514, + "eval_steps_per_second": 22.918, + "num_input_tokens_seen": 32696656, + "step": 21400 + }, + { + "epoch": 66.06491499227202, + "grad_norm": 0.5083351731300354, + "learning_rate": 2.2248835807142525e-05, + "loss": 0.2309, + "num_input_tokens_seen": 32703536, + "step": 21405 + }, + { + "epoch": 66.08037094281298, + "grad_norm": 0.5371934771537781, + "learning_rate": 2.2239078169593826e-05, + "loss": 0.2461, + "num_input_tokens_seen": 32711280, + "step": 21410 + }, + { + "epoch": 66.09582689335394, + "grad_norm": 0.607243537902832, + "learning_rate": 2.222932095781396e-05, + "loss": 0.2219, + "num_input_tokens_seen": 32719216, + "step": 21415 + }, + { + "epoch": 66.1112828438949, + "grad_norm": 0.4359864890575409, + "learning_rate": 2.221956417330762e-05, + "loss": 0.2312, + "num_input_tokens_seen": 32726768, + "step": 21420 + }, + { + "epoch": 66.12673879443585, + "grad_norm": 0.46243003010749817, + "learning_rate": 2.2209807817579438e-05, + "loss": 0.2343, + "num_input_tokens_seen": 32734864, + "step": 21425 + }, + { + "epoch": 66.14219474497682, + "grad_norm": 0.7663004994392395, + "learning_rate": 2.220005189213394e-05, + "loss": 0.2972, + "num_input_tokens_seen": 32742064, + "step": 21430 + }, + { + "epoch": 66.15765069551777, + "grad_norm": 0.6492801308631897, + "learning_rate": 2.2190296398475624e-05, + "loss": 0.2203, + "num_input_tokens_seen": 32749392, + "step": 21435 + }, + { + "epoch": 66.17310664605873, + "grad_norm": 0.8576779365539551, + "learning_rate": 2.2180541338108926e-05, + "loss": 0.2544, + "num_input_tokens_seen": 32757424, + "step": 21440 + }, + { + "epoch": 66.1885625965997, + "grad_norm": 0.7267796397209167, + "learning_rate": 2.2170786712538176e-05, + "loss": 0.2363, + "num_input_tokens_seen": 32765520, + "step": 21445 + }, + { + "epoch": 66.20401854714065, + "grad_norm": 0.34690529108047485, + "learning_rate": 2.216103252326768e-05, + "loss": 0.2405, + "num_input_tokens_seen": 32773392, + "step": 21450 + }, + { + "epoch": 66.2194744976816, + "grad_norm": 0.7393453121185303, + "learning_rate": 2.2151278771801635e-05, + "loss": 0.2774, + "num_input_tokens_seen": 32781520, + "step": 21455 + }, + { + "epoch": 66.23493044822257, + "grad_norm": 0.36905184388160706, + "learning_rate": 2.21415254596442e-05, + "loss": 0.2567, + "num_input_tokens_seen": 32788816, + "step": 21460 + }, + { + "epoch": 66.25038639876352, + "grad_norm": 0.4468071460723877, + "learning_rate": 2.213177258829947e-05, + "loss": 0.2971, + "num_input_tokens_seen": 32796112, + "step": 21465 + }, + { + "epoch": 66.26584234930448, + "grad_norm": 0.36515530943870544, + "learning_rate": 2.2122020159271445e-05, + "loss": 0.2895, + "num_input_tokens_seen": 32803504, + "step": 21470 + }, + { + "epoch": 66.28129829984545, + "grad_norm": 0.2844495177268982, + "learning_rate": 2.2112268174064075e-05, + "loss": 0.2476, + "num_input_tokens_seen": 32810768, + "step": 21475 + }, + { + "epoch": 66.2967542503864, + "grad_norm": 0.3419191837310791, + "learning_rate": 2.2102516634181253e-05, + "loss": 0.2808, + "num_input_tokens_seen": 32818384, + "step": 21480 + }, + { + "epoch": 66.31221020092735, + "grad_norm": 0.736711859703064, + "learning_rate": 2.209276554112677e-05, + "loss": 0.252, + "num_input_tokens_seen": 32826064, + "step": 21485 + }, + { + "epoch": 66.32766615146832, + "grad_norm": 0.3739466369152069, + "learning_rate": 2.2083014896404384e-05, + "loss": 0.2223, + "num_input_tokens_seen": 32834128, + "step": 21490 + }, + { + "epoch": 66.34312210200927, + "grad_norm": 0.4766245186328888, + "learning_rate": 2.207326470151775e-05, + "loss": 0.2221, + "num_input_tokens_seen": 32841392, + "step": 21495 + }, + { + "epoch": 66.35857805255023, + "grad_norm": 0.6416658163070679, + "learning_rate": 2.2063514957970477e-05, + "loss": 0.2279, + "num_input_tokens_seen": 32849360, + "step": 21500 + }, + { + "epoch": 66.3740340030912, + "grad_norm": 0.31082504987716675, + "learning_rate": 2.205376566726611e-05, + "loss": 0.2875, + "num_input_tokens_seen": 32856688, + "step": 21505 + }, + { + "epoch": 66.38948995363215, + "grad_norm": 0.4553896486759186, + "learning_rate": 2.204401683090809e-05, + "loss": 0.2045, + "num_input_tokens_seen": 32864272, + "step": 21510 + }, + { + "epoch": 66.4049459041731, + "grad_norm": 0.39170634746551514, + "learning_rate": 2.203426845039982e-05, + "loss": 0.2357, + "num_input_tokens_seen": 32872048, + "step": 21515 + }, + { + "epoch": 66.42040185471407, + "grad_norm": 0.49259600043296814, + "learning_rate": 2.202452052724464e-05, + "loss": 0.2469, + "num_input_tokens_seen": 32879664, + "step": 21520 + }, + { + "epoch": 66.43585780525503, + "grad_norm": 0.44412243366241455, + "learning_rate": 2.2014773062945777e-05, + "loss": 0.2333, + "num_input_tokens_seen": 32887696, + "step": 21525 + }, + { + "epoch": 66.45131375579598, + "grad_norm": 0.849398672580719, + "learning_rate": 2.2005026059006427e-05, + "loss": 0.2386, + "num_input_tokens_seen": 32895728, + "step": 21530 + }, + { + "epoch": 66.46676970633693, + "grad_norm": 0.6861202716827393, + "learning_rate": 2.1995279516929695e-05, + "loss": 0.2532, + "num_input_tokens_seen": 32903152, + "step": 21535 + }, + { + "epoch": 66.4822256568779, + "grad_norm": 0.42327940464019775, + "learning_rate": 2.1985533438218613e-05, + "loss": 0.2123, + "num_input_tokens_seen": 32910544, + "step": 21540 + }, + { + "epoch": 66.49768160741885, + "grad_norm": 0.8432512283325195, + "learning_rate": 2.197578782437617e-05, + "loss": 0.2181, + "num_input_tokens_seen": 32918256, + "step": 21545 + }, + { + "epoch": 66.51313755795981, + "grad_norm": 0.48024195432662964, + "learning_rate": 2.196604267690524e-05, + "loss": 0.2317, + "num_input_tokens_seen": 32925808, + "step": 21550 + }, + { + "epoch": 66.52859350850078, + "grad_norm": 0.6739438772201538, + "learning_rate": 2.195629799730865e-05, + "loss": 0.226, + "num_input_tokens_seen": 32933168, + "step": 21555 + }, + { + "epoch": 66.54404945904173, + "grad_norm": 0.49459952116012573, + "learning_rate": 2.1946553787089173e-05, + "loss": 0.218, + "num_input_tokens_seen": 32940880, + "step": 21560 + }, + { + "epoch": 66.55950540958268, + "grad_norm": 0.5818195343017578, + "learning_rate": 2.193681004774947e-05, + "loss": 0.2981, + "num_input_tokens_seen": 32947984, + "step": 21565 + }, + { + "epoch": 66.57496136012365, + "grad_norm": 0.6288292407989502, + "learning_rate": 2.1927066780792154e-05, + "loss": 0.2576, + "num_input_tokens_seen": 32955632, + "step": 21570 + }, + { + "epoch": 66.5904173106646, + "grad_norm": 0.7272052764892578, + "learning_rate": 2.191732398771975e-05, + "loss": 0.2755, + "num_input_tokens_seen": 32963376, + "step": 21575 + }, + { + "epoch": 66.60587326120556, + "grad_norm": 0.4359034597873688, + "learning_rate": 2.1907581670034725e-05, + "loss": 0.2396, + "num_input_tokens_seen": 32970832, + "step": 21580 + }, + { + "epoch": 66.62132921174653, + "grad_norm": 0.3403743505477905, + "learning_rate": 2.189783982923948e-05, + "loss": 0.2616, + "num_input_tokens_seen": 32978256, + "step": 21585 + }, + { + "epoch": 66.63678516228748, + "grad_norm": 0.7228701114654541, + "learning_rate": 2.1888098466836303e-05, + "loss": 0.2172, + "num_input_tokens_seen": 32985936, + "step": 21590 + }, + { + "epoch": 66.65224111282843, + "grad_norm": 0.43685096502304077, + "learning_rate": 2.1878357584327457e-05, + "loss": 0.2544, + "num_input_tokens_seen": 32993200, + "step": 21595 + }, + { + "epoch": 66.6676970633694, + "grad_norm": 0.49060526490211487, + "learning_rate": 2.1868617183215103e-05, + "loss": 0.2563, + "num_input_tokens_seen": 33001008, + "step": 21600 + }, + { + "epoch": 66.6676970633694, + "eval_loss": 0.30794599652290344, + "eval_runtime": 6.3034, + "eval_samples_per_second": 91.221, + "eval_steps_per_second": 22.845, + "num_input_tokens_seen": 33001008, + "step": 21600 + }, + { + "epoch": 66.68315301391036, + "grad_norm": 0.5106200575828552, + "learning_rate": 2.1858877265001327e-05, + "loss": 0.3535, + "num_input_tokens_seen": 33008688, + "step": 21605 + }, + { + "epoch": 66.69860896445131, + "grad_norm": 0.5975707173347473, + "learning_rate": 2.184913783118816e-05, + "loss": 0.2465, + "num_input_tokens_seen": 33016816, + "step": 21610 + }, + { + "epoch": 66.71406491499228, + "grad_norm": 0.5166662931442261, + "learning_rate": 2.1839398883277522e-05, + "loss": 0.2755, + "num_input_tokens_seen": 33024784, + "step": 21615 + }, + { + "epoch": 66.72952086553323, + "grad_norm": 0.3796682357788086, + "learning_rate": 2.182966042277129e-05, + "loss": 0.2533, + "num_input_tokens_seen": 33032528, + "step": 21620 + }, + { + "epoch": 66.74497681607419, + "grad_norm": 0.8900421261787415, + "learning_rate": 2.181992245117128e-05, + "loss": 0.2519, + "num_input_tokens_seen": 33039888, + "step": 21625 + }, + { + "epoch": 66.76043276661515, + "grad_norm": 0.6125680208206177, + "learning_rate": 2.181018496997918e-05, + "loss": 0.2406, + "num_input_tokens_seen": 33047184, + "step": 21630 + }, + { + "epoch": 66.7758887171561, + "grad_norm": 0.46396544575691223, + "learning_rate": 2.1800447980696648e-05, + "loss": 0.2549, + "num_input_tokens_seen": 33054416, + "step": 21635 + }, + { + "epoch": 66.79134466769706, + "grad_norm": 0.6138590574264526, + "learning_rate": 2.1790711484825248e-05, + "loss": 0.2607, + "num_input_tokens_seen": 33061968, + "step": 21640 + }, + { + "epoch": 66.80680061823801, + "grad_norm": 0.507542610168457, + "learning_rate": 2.178097548386646e-05, + "loss": 0.3444, + "num_input_tokens_seen": 33070032, + "step": 21645 + }, + { + "epoch": 66.82225656877898, + "grad_norm": 0.6884363293647766, + "learning_rate": 2.1771239979321712e-05, + "loss": 0.2317, + "num_input_tokens_seen": 33077232, + "step": 21650 + }, + { + "epoch": 66.83771251931994, + "grad_norm": 0.5864202380180359, + "learning_rate": 2.1761504972692327e-05, + "loss": 0.2404, + "num_input_tokens_seen": 33084944, + "step": 21655 + }, + { + "epoch": 66.85316846986089, + "grad_norm": 0.4392865002155304, + "learning_rate": 2.1751770465479572e-05, + "loss": 0.3207, + "num_input_tokens_seen": 33092784, + "step": 21660 + }, + { + "epoch": 66.86862442040186, + "grad_norm": 0.6114606261253357, + "learning_rate": 2.174203645918464e-05, + "loss": 0.2087, + "num_input_tokens_seen": 33100528, + "step": 21665 + }, + { + "epoch": 66.88408037094281, + "grad_norm": 0.6211300492286682, + "learning_rate": 2.1732302955308624e-05, + "loss": 0.2019, + "num_input_tokens_seen": 33107600, + "step": 21670 + }, + { + "epoch": 66.89953632148377, + "grad_norm": 0.3431023955345154, + "learning_rate": 2.172256995535255e-05, + "loss": 0.2557, + "num_input_tokens_seen": 33114832, + "step": 21675 + }, + { + "epoch": 66.91499227202473, + "grad_norm": 0.572717010974884, + "learning_rate": 2.171283746081739e-05, + "loss": 0.2299, + "num_input_tokens_seen": 33122608, + "step": 21680 + }, + { + "epoch": 66.93044822256569, + "grad_norm": 0.352967232465744, + "learning_rate": 2.1703105473203988e-05, + "loss": 0.2225, + "num_input_tokens_seen": 33130256, + "step": 21685 + }, + { + "epoch": 66.94590417310664, + "grad_norm": 0.8835752606391907, + "learning_rate": 2.1693373994013168e-05, + "loss": 0.3128, + "num_input_tokens_seen": 33138032, + "step": 21690 + }, + { + "epoch": 66.96136012364761, + "grad_norm": 0.6887832880020142, + "learning_rate": 2.168364302474562e-05, + "loss": 0.2635, + "num_input_tokens_seen": 33145104, + "step": 21695 + }, + { + "epoch": 66.97681607418856, + "grad_norm": 0.540178120136261, + "learning_rate": 2.167391256690199e-05, + "loss": 0.2652, + "num_input_tokens_seen": 33152752, + "step": 21700 + }, + { + "epoch": 66.99227202472952, + "grad_norm": 0.346513032913208, + "learning_rate": 2.1664182621982855e-05, + "loss": 0.2336, + "num_input_tokens_seen": 33160944, + "step": 21705 + }, + { + "epoch": 67.00618238021639, + "grad_norm": 0.38791924715042114, + "learning_rate": 2.1654453191488673e-05, + "loss": 0.3359, + "num_input_tokens_seen": 33168016, + "step": 21710 + }, + { + "epoch": 67.02163833075734, + "grad_norm": 0.49442824721336365, + "learning_rate": 2.1644724276919846e-05, + "loss": 0.2809, + "num_input_tokens_seen": 33175504, + "step": 21715 + }, + { + "epoch": 67.0370942812983, + "grad_norm": 0.5064396262168884, + "learning_rate": 2.1634995879776715e-05, + "loss": 0.2092, + "num_input_tokens_seen": 33183024, + "step": 21720 + }, + { + "epoch": 67.05255023183926, + "grad_norm": 0.48445701599121094, + "learning_rate": 2.162526800155949e-05, + "loss": 0.3212, + "num_input_tokens_seen": 33190704, + "step": 21725 + }, + { + "epoch": 67.06800618238022, + "grad_norm": 0.6609929203987122, + "learning_rate": 2.1615540643768363e-05, + "loss": 0.2999, + "num_input_tokens_seen": 33199056, + "step": 21730 + }, + { + "epoch": 67.08346213292117, + "grad_norm": 0.4046471118927002, + "learning_rate": 2.160581380790339e-05, + "loss": 0.2317, + "num_input_tokens_seen": 33207088, + "step": 21735 + }, + { + "epoch": 67.09891808346214, + "grad_norm": 0.562280535697937, + "learning_rate": 2.1596087495464586e-05, + "loss": 0.2265, + "num_input_tokens_seen": 33215088, + "step": 21740 + }, + { + "epoch": 67.11437403400309, + "grad_norm": 0.48527252674102783, + "learning_rate": 2.1586361707951866e-05, + "loss": 0.2282, + "num_input_tokens_seen": 33222608, + "step": 21745 + }, + { + "epoch": 67.12982998454405, + "grad_norm": 0.3420516848564148, + "learning_rate": 2.157663644686507e-05, + "loss": 0.2249, + "num_input_tokens_seen": 33230288, + "step": 21750 + }, + { + "epoch": 67.14528593508501, + "grad_norm": 0.6721711754798889, + "learning_rate": 2.156691171370396e-05, + "loss": 0.2809, + "num_input_tokens_seen": 33238064, + "step": 21755 + }, + { + "epoch": 67.16074188562597, + "grad_norm": 0.4659823477268219, + "learning_rate": 2.1557187509968195e-05, + "loss": 0.2563, + "num_input_tokens_seen": 33245232, + "step": 21760 + }, + { + "epoch": 67.17619783616692, + "grad_norm": 0.6521710157394409, + "learning_rate": 2.1547463837157382e-05, + "loss": 0.2226, + "num_input_tokens_seen": 33253040, + "step": 21765 + }, + { + "epoch": 67.19165378670789, + "grad_norm": 0.4487874507904053, + "learning_rate": 2.1537740696771045e-05, + "loss": 0.225, + "num_input_tokens_seen": 33260528, + "step": 21770 + }, + { + "epoch": 67.20710973724884, + "grad_norm": 0.510473370552063, + "learning_rate": 2.1528018090308587e-05, + "loss": 0.2233, + "num_input_tokens_seen": 33268048, + "step": 21775 + }, + { + "epoch": 67.2225656877898, + "grad_norm": 0.6326645016670227, + "learning_rate": 2.151829601926938e-05, + "loss": 0.2277, + "num_input_tokens_seen": 33275536, + "step": 21780 + }, + { + "epoch": 67.23802163833076, + "grad_norm": 0.515310525894165, + "learning_rate": 2.1508574485152684e-05, + "loss": 0.2746, + "num_input_tokens_seen": 33283280, + "step": 21785 + }, + { + "epoch": 67.25347758887172, + "grad_norm": 0.47417476773262024, + "learning_rate": 2.1498853489457667e-05, + "loss": 0.2274, + "num_input_tokens_seen": 33290608, + "step": 21790 + }, + { + "epoch": 67.26893353941267, + "grad_norm": 0.5609796643257141, + "learning_rate": 2.1489133033683455e-05, + "loss": 0.2784, + "num_input_tokens_seen": 33298608, + "step": 21795 + }, + { + "epoch": 67.28438948995363, + "grad_norm": 0.4171265661716461, + "learning_rate": 2.1479413119329038e-05, + "loss": 0.233, + "num_input_tokens_seen": 33306288, + "step": 21800 + }, + { + "epoch": 67.28438948995363, + "eval_loss": 0.3067197799682617, + "eval_runtime": 6.2781, + "eval_samples_per_second": 91.588, + "eval_steps_per_second": 22.937, + "num_input_tokens_seen": 33306288, + "step": 21800 + }, + { + "epoch": 67.2998454404946, + "grad_norm": 0.5621266961097717, + "learning_rate": 2.1469693747893355e-05, + "loss": 0.2198, + "num_input_tokens_seen": 33314000, + "step": 21805 + }, + { + "epoch": 67.31530139103555, + "grad_norm": 0.28218141198158264, + "learning_rate": 2.1459974920875274e-05, + "loss": 0.2613, + "num_input_tokens_seen": 33321840, + "step": 21810 + }, + { + "epoch": 67.3307573415765, + "grad_norm": 0.6615131497383118, + "learning_rate": 2.145025663977354e-05, + "loss": 0.2315, + "num_input_tokens_seen": 33329392, + "step": 21815 + }, + { + "epoch": 67.34621329211747, + "grad_norm": 0.4753236770629883, + "learning_rate": 2.1440538906086844e-05, + "loss": 0.2049, + "num_input_tokens_seen": 33337808, + "step": 21820 + }, + { + "epoch": 67.36166924265842, + "grad_norm": 0.5072250962257385, + "learning_rate": 2.1430821721313782e-05, + "loss": 0.2841, + "num_input_tokens_seen": 33345744, + "step": 21825 + }, + { + "epoch": 67.37712519319938, + "grad_norm": 0.4012574255466461, + "learning_rate": 2.142110508695286e-05, + "loss": 0.2628, + "num_input_tokens_seen": 33353040, + "step": 21830 + }, + { + "epoch": 67.39258114374034, + "grad_norm": 0.7206489443778992, + "learning_rate": 2.1411389004502515e-05, + "loss": 0.2434, + "num_input_tokens_seen": 33360624, + "step": 21835 + }, + { + "epoch": 67.4080370942813, + "grad_norm": 0.5143715739250183, + "learning_rate": 2.140167347546107e-05, + "loss": 0.2927, + "num_input_tokens_seen": 33368336, + "step": 21840 + }, + { + "epoch": 67.42349304482225, + "grad_norm": 0.5665262937545776, + "learning_rate": 2.1391958501326793e-05, + "loss": 0.2043, + "num_input_tokens_seen": 33375408, + "step": 21845 + }, + { + "epoch": 67.43894899536322, + "grad_norm": 0.39679330587387085, + "learning_rate": 2.1382244083597873e-05, + "loss": 0.23, + "num_input_tokens_seen": 33383344, + "step": 21850 + }, + { + "epoch": 67.45440494590417, + "grad_norm": 0.5088476538658142, + "learning_rate": 2.137253022377237e-05, + "loss": 0.2968, + "num_input_tokens_seen": 33391024, + "step": 21855 + }, + { + "epoch": 67.46986089644513, + "grad_norm": 0.47711530327796936, + "learning_rate": 2.136281692334829e-05, + "loss": 0.2666, + "num_input_tokens_seen": 33398576, + "step": 21860 + }, + { + "epoch": 67.4853168469861, + "grad_norm": 0.39835160970687866, + "learning_rate": 2.135310418382356e-05, + "loss": 0.3308, + "num_input_tokens_seen": 33406256, + "step": 21865 + }, + { + "epoch": 67.50077279752705, + "grad_norm": 0.6067320704460144, + "learning_rate": 2.134339200669598e-05, + "loss": 0.2239, + "num_input_tokens_seen": 33413680, + "step": 21870 + }, + { + "epoch": 67.516228748068, + "grad_norm": 0.5102028250694275, + "learning_rate": 2.133368039346331e-05, + "loss": 0.249, + "num_input_tokens_seen": 33421232, + "step": 21875 + }, + { + "epoch": 67.53168469860897, + "grad_norm": 0.6904842853546143, + "learning_rate": 2.1323969345623195e-05, + "loss": 0.223, + "num_input_tokens_seen": 33428592, + "step": 21880 + }, + { + "epoch": 67.54714064914992, + "grad_norm": 0.38664624094963074, + "learning_rate": 2.1314258864673207e-05, + "loss": 0.2998, + "num_input_tokens_seen": 33436784, + "step": 21885 + }, + { + "epoch": 67.56259659969088, + "grad_norm": 0.38175439834594727, + "learning_rate": 2.130454895211082e-05, + "loss": 0.2499, + "num_input_tokens_seen": 33444784, + "step": 21890 + }, + { + "epoch": 67.57805255023185, + "grad_norm": 0.8670171499252319, + "learning_rate": 2.129483960943342e-05, + "loss": 0.2551, + "num_input_tokens_seen": 33452624, + "step": 21895 + }, + { + "epoch": 67.5935085007728, + "grad_norm": 0.5024891495704651, + "learning_rate": 2.128513083813831e-05, + "loss": 0.2088, + "num_input_tokens_seen": 33460336, + "step": 21900 + }, + { + "epoch": 67.60896445131375, + "grad_norm": 0.44466039538383484, + "learning_rate": 2.1275422639722724e-05, + "loss": 0.2497, + "num_input_tokens_seen": 33467984, + "step": 21905 + }, + { + "epoch": 67.62442040185472, + "grad_norm": 0.43690019845962524, + "learning_rate": 2.126571501568376e-05, + "loss": 0.2131, + "num_input_tokens_seen": 33475536, + "step": 21910 + }, + { + "epoch": 67.63987635239567, + "grad_norm": 0.5336515307426453, + "learning_rate": 2.1256007967518478e-05, + "loss": 0.2307, + "num_input_tokens_seen": 33482992, + "step": 21915 + }, + { + "epoch": 67.65533230293663, + "grad_norm": 0.5172221660614014, + "learning_rate": 2.124630149672381e-05, + "loss": 0.2347, + "num_input_tokens_seen": 33490416, + "step": 21920 + }, + { + "epoch": 67.67078825347758, + "grad_norm": 0.40115129947662354, + "learning_rate": 2.1236595604796624e-05, + "loss": 0.2276, + "num_input_tokens_seen": 33498256, + "step": 21925 + }, + { + "epoch": 67.68624420401855, + "grad_norm": 0.8006030917167664, + "learning_rate": 2.1226890293233693e-05, + "loss": 0.2885, + "num_input_tokens_seen": 33506096, + "step": 21930 + }, + { + "epoch": 67.7017001545595, + "grad_norm": 0.42424362897872925, + "learning_rate": 2.1217185563531694e-05, + "loss": 0.2257, + "num_input_tokens_seen": 33513168, + "step": 21935 + }, + { + "epoch": 67.71715610510046, + "grad_norm": 0.6166151762008667, + "learning_rate": 2.120748141718721e-05, + "loss": 0.2606, + "num_input_tokens_seen": 33520784, + "step": 21940 + }, + { + "epoch": 67.73261205564143, + "grad_norm": 0.47113728523254395, + "learning_rate": 2.1197777855696765e-05, + "loss": 0.2595, + "num_input_tokens_seen": 33528400, + "step": 21945 + }, + { + "epoch": 67.74806800618238, + "grad_norm": 0.3241330683231354, + "learning_rate": 2.1188074880556746e-05, + "loss": 0.2399, + "num_input_tokens_seen": 33536304, + "step": 21950 + }, + { + "epoch": 67.76352395672333, + "grad_norm": 0.47320428490638733, + "learning_rate": 2.1178372493263495e-05, + "loss": 0.2873, + "num_input_tokens_seen": 33543824, + "step": 21955 + }, + { + "epoch": 67.7789799072643, + "grad_norm": 0.3996713161468506, + "learning_rate": 2.116867069531322e-05, + "loss": 0.2892, + "num_input_tokens_seen": 33551440, + "step": 21960 + }, + { + "epoch": 67.79443585780525, + "grad_norm": 0.48172685503959656, + "learning_rate": 2.1158969488202073e-05, + "loss": 0.2452, + "num_input_tokens_seen": 33559280, + "step": 21965 + }, + { + "epoch": 67.80989180834621, + "grad_norm": 0.8578282594680786, + "learning_rate": 2.114926887342611e-05, + "loss": 0.3418, + "num_input_tokens_seen": 33566768, + "step": 21970 + }, + { + "epoch": 67.82534775888718, + "grad_norm": 0.45842549204826355, + "learning_rate": 2.113956885248127e-05, + "loss": 0.2263, + "num_input_tokens_seen": 33574672, + "step": 21975 + }, + { + "epoch": 67.84080370942813, + "grad_norm": 0.39436209201812744, + "learning_rate": 2.112986942686342e-05, + "loss": 0.2396, + "num_input_tokens_seen": 33582192, + "step": 21980 + }, + { + "epoch": 67.85625965996908, + "grad_norm": 0.5584468245506287, + "learning_rate": 2.112017059806835e-05, + "loss": 0.2675, + "num_input_tokens_seen": 33589744, + "step": 21985 + }, + { + "epoch": 67.87171561051005, + "grad_norm": 0.4395412504673004, + "learning_rate": 2.1110472367591724e-05, + "loss": 0.2584, + "num_input_tokens_seen": 33597200, + "step": 21990 + }, + { + "epoch": 67.887171561051, + "grad_norm": 0.6393595933914185, + "learning_rate": 2.1100774736929145e-05, + "loss": 0.2204, + "num_input_tokens_seen": 33604816, + "step": 21995 + }, + { + "epoch": 67.90262751159196, + "grad_norm": 0.45821458101272583, + "learning_rate": 2.10910777075761e-05, + "loss": 0.3712, + "num_input_tokens_seen": 33612592, + "step": 22000 + }, + { + "epoch": 67.90262751159196, + "eval_loss": 0.3072393238544464, + "eval_runtime": 6.318, + "eval_samples_per_second": 91.01, + "eval_steps_per_second": 22.792, + "num_input_tokens_seen": 33612592, + "step": 22000 + }, + { + "epoch": 67.91808346213293, + "grad_norm": 0.5389094352722168, + "learning_rate": 2.108138128102799e-05, + "loss": 0.2876, + "num_input_tokens_seen": 33619984, + "step": 22005 + }, + { + "epoch": 67.93353941267388, + "grad_norm": 0.49718576669692993, + "learning_rate": 2.107168545878014e-05, + "loss": 0.3122, + "num_input_tokens_seen": 33627504, + "step": 22010 + }, + { + "epoch": 67.94899536321483, + "grad_norm": 0.46952807903289795, + "learning_rate": 2.106199024232775e-05, + "loss": 0.2685, + "num_input_tokens_seen": 33634864, + "step": 22015 + }, + { + "epoch": 67.9644513137558, + "grad_norm": 0.3145512342453003, + "learning_rate": 2.105229563316595e-05, + "loss": 0.2004, + "num_input_tokens_seen": 33642544, + "step": 22020 + }, + { + "epoch": 67.97990726429676, + "grad_norm": 0.49626678228378296, + "learning_rate": 2.1042601632789784e-05, + "loss": 0.2533, + "num_input_tokens_seen": 33649712, + "step": 22025 + }, + { + "epoch": 67.99536321483771, + "grad_norm": 0.5059766173362732, + "learning_rate": 2.103290824269417e-05, + "loss": 0.1949, + "num_input_tokens_seen": 33657168, + "step": 22030 + }, + { + "epoch": 68.00927357032458, + "grad_norm": 0.43899041414260864, + "learning_rate": 2.1023215464373965e-05, + "loss": 0.255, + "num_input_tokens_seen": 33663952, + "step": 22035 + }, + { + "epoch": 68.02472952086553, + "grad_norm": 0.6277148127555847, + "learning_rate": 2.1013523299323908e-05, + "loss": 0.2838, + "num_input_tokens_seen": 33671344, + "step": 22040 + }, + { + "epoch": 68.04018547140649, + "grad_norm": 0.3300027549266815, + "learning_rate": 2.1003831749038654e-05, + "loss": 0.2181, + "num_input_tokens_seen": 33678832, + "step": 22045 + }, + { + "epoch": 68.05564142194746, + "grad_norm": 0.480415016412735, + "learning_rate": 2.099414081501277e-05, + "loss": 0.2021, + "num_input_tokens_seen": 33686704, + "step": 22050 + }, + { + "epoch": 68.07109737248841, + "grad_norm": 0.5254292488098145, + "learning_rate": 2.09844504987407e-05, + "loss": 0.267, + "num_input_tokens_seen": 33695024, + "step": 22055 + }, + { + "epoch": 68.08655332302936, + "grad_norm": 0.5473738312721252, + "learning_rate": 2.097476080171683e-05, + "loss": 0.2928, + "num_input_tokens_seen": 33702192, + "step": 22060 + }, + { + "epoch": 68.10200927357033, + "grad_norm": 0.6171761751174927, + "learning_rate": 2.0965071725435436e-05, + "loss": 0.2383, + "num_input_tokens_seen": 33710480, + "step": 22065 + }, + { + "epoch": 68.11746522411129, + "grad_norm": 1.2106804847717285, + "learning_rate": 2.0955383271390684e-05, + "loss": 0.2671, + "num_input_tokens_seen": 33718096, + "step": 22070 + }, + { + "epoch": 68.13292117465224, + "grad_norm": 0.4685452878475189, + "learning_rate": 2.094569544107666e-05, + "loss": 0.2559, + "num_input_tokens_seen": 33726128, + "step": 22075 + }, + { + "epoch": 68.14837712519319, + "grad_norm": 0.26549044251441956, + "learning_rate": 2.093600823598735e-05, + "loss": 0.2657, + "num_input_tokens_seen": 33733744, + "step": 22080 + }, + { + "epoch": 68.16383307573416, + "grad_norm": 0.3757427930831909, + "learning_rate": 2.092632165761663e-05, + "loss": 0.2257, + "num_input_tokens_seen": 33741712, + "step": 22085 + }, + { + "epoch": 68.17928902627511, + "grad_norm": 0.5196652412414551, + "learning_rate": 2.091663570745832e-05, + "loss": 0.2101, + "num_input_tokens_seen": 33749008, + "step": 22090 + }, + { + "epoch": 68.19474497681607, + "grad_norm": 0.4656255543231964, + "learning_rate": 2.0906950387006086e-05, + "loss": 0.273, + "num_input_tokens_seen": 33756432, + "step": 22095 + }, + { + "epoch": 68.21020092735704, + "grad_norm": 0.6356040835380554, + "learning_rate": 2.0897265697753543e-05, + "loss": 0.2447, + "num_input_tokens_seen": 33764336, + "step": 22100 + }, + { + "epoch": 68.22565687789799, + "grad_norm": 0.684299886226654, + "learning_rate": 2.088758164119419e-05, + "loss": 0.2212, + "num_input_tokens_seen": 33771888, + "step": 22105 + }, + { + "epoch": 68.24111282843894, + "grad_norm": 0.3582121729850769, + "learning_rate": 2.0877898218821428e-05, + "loss": 0.3053, + "num_input_tokens_seen": 33779664, + "step": 22110 + }, + { + "epoch": 68.25656877897991, + "grad_norm": 0.6439127922058105, + "learning_rate": 2.0868215432128565e-05, + "loss": 0.2506, + "num_input_tokens_seen": 33787664, + "step": 22115 + }, + { + "epoch": 68.27202472952087, + "grad_norm": 0.4942111670970917, + "learning_rate": 2.0858533282608796e-05, + "loss": 0.2549, + "num_input_tokens_seen": 33795504, + "step": 22120 + }, + { + "epoch": 68.28748068006182, + "grad_norm": 0.7184171080589294, + "learning_rate": 2.084885177175524e-05, + "loss": 0.2316, + "num_input_tokens_seen": 33802832, + "step": 22125 + }, + { + "epoch": 68.30293663060279, + "grad_norm": 0.4203801155090332, + "learning_rate": 2.0839170901060917e-05, + "loss": 0.2703, + "num_input_tokens_seen": 33810448, + "step": 22130 + }, + { + "epoch": 68.31839258114374, + "grad_norm": 0.5085709095001221, + "learning_rate": 2.082949067201872e-05, + "loss": 0.2718, + "num_input_tokens_seen": 33817808, + "step": 22135 + }, + { + "epoch": 68.3338485316847, + "grad_norm": 0.3432433307170868, + "learning_rate": 2.0819811086121475e-05, + "loss": 0.3172, + "num_input_tokens_seen": 33825264, + "step": 22140 + }, + { + "epoch": 68.34930448222566, + "grad_norm": 0.355916827917099, + "learning_rate": 2.08101321448619e-05, + "loss": 0.1894, + "num_input_tokens_seen": 33832976, + "step": 22145 + }, + { + "epoch": 68.36476043276662, + "grad_norm": 0.523736298084259, + "learning_rate": 2.080045384973259e-05, + "loss": 0.3467, + "num_input_tokens_seen": 33839728, + "step": 22150 + }, + { + "epoch": 68.38021638330757, + "grad_norm": 0.46175968647003174, + "learning_rate": 2.0790776202226082e-05, + "loss": 0.1937, + "num_input_tokens_seen": 33846832, + "step": 22155 + }, + { + "epoch": 68.39567233384854, + "grad_norm": 0.40457022190093994, + "learning_rate": 2.078109920383477e-05, + "loss": 0.2567, + "num_input_tokens_seen": 33854480, + "step": 22160 + }, + { + "epoch": 68.41112828438949, + "grad_norm": 0.5139554738998413, + "learning_rate": 2.0771422856050978e-05, + "loss": 0.2759, + "num_input_tokens_seen": 33862032, + "step": 22165 + }, + { + "epoch": 68.42658423493044, + "grad_norm": 0.7894652485847473, + "learning_rate": 2.076174716036693e-05, + "loss": 0.1952, + "num_input_tokens_seen": 33869680, + "step": 22170 + }, + { + "epoch": 68.44204018547141, + "grad_norm": 0.76076740026474, + "learning_rate": 2.075207211827472e-05, + "loss": 0.2885, + "num_input_tokens_seen": 33877072, + "step": 22175 + }, + { + "epoch": 68.45749613601237, + "grad_norm": 0.5217140316963196, + "learning_rate": 2.074239773126638e-05, + "loss": 0.3212, + "num_input_tokens_seen": 33884464, + "step": 22180 + }, + { + "epoch": 68.47295208655332, + "grad_norm": 0.41720324754714966, + "learning_rate": 2.073272400083382e-05, + "loss": 0.1921, + "num_input_tokens_seen": 33892080, + "step": 22185 + }, + { + "epoch": 68.48840803709429, + "grad_norm": 0.6167252659797668, + "learning_rate": 2.072305092846883e-05, + "loss": 0.2385, + "num_input_tokens_seen": 33899568, + "step": 22190 + }, + { + "epoch": 68.50386398763524, + "grad_norm": 0.4688585698604584, + "learning_rate": 2.0713378515663152e-05, + "loss": 0.2452, + "num_input_tokens_seen": 33907248, + "step": 22195 + }, + { + "epoch": 68.5193199381762, + "grad_norm": 0.4874158799648285, + "learning_rate": 2.070370676390836e-05, + "loss": 0.24, + "num_input_tokens_seen": 33914992, + "step": 22200 + }, + { + "epoch": 68.5193199381762, + "eval_loss": 0.30695226788520813, + "eval_runtime": 6.3233, + "eval_samples_per_second": 90.934, + "eval_steps_per_second": 22.773, + "num_input_tokens_seen": 33914992, + "step": 22200 + }, + { + "epoch": 68.53477588871715, + "grad_norm": 0.42589685320854187, + "learning_rate": 2.0694035674695974e-05, + "loss": 0.2945, + "num_input_tokens_seen": 33922992, + "step": 22205 + }, + { + "epoch": 68.55023183925812, + "grad_norm": 0.7772510647773743, + "learning_rate": 2.0684365249517416e-05, + "loss": 0.2413, + "num_input_tokens_seen": 33930512, + "step": 22210 + }, + { + "epoch": 68.56568778979907, + "grad_norm": 0.6200753450393677, + "learning_rate": 2.067469548986396e-05, + "loss": 0.244, + "num_input_tokens_seen": 33938096, + "step": 22215 + }, + { + "epoch": 68.58114374034002, + "grad_norm": 0.5956021547317505, + "learning_rate": 2.066502639722681e-05, + "loss": 0.3269, + "num_input_tokens_seen": 33945968, + "step": 22220 + }, + { + "epoch": 68.59659969088099, + "grad_norm": 0.5172000527381897, + "learning_rate": 2.065535797309708e-05, + "loss": 0.2421, + "num_input_tokens_seen": 33953968, + "step": 22225 + }, + { + "epoch": 68.61205564142195, + "grad_norm": 0.5438131093978882, + "learning_rate": 2.0645690218965736e-05, + "loss": 0.2215, + "num_input_tokens_seen": 33961552, + "step": 22230 + }, + { + "epoch": 68.6275115919629, + "grad_norm": 0.5751809477806091, + "learning_rate": 2.063602313632369e-05, + "loss": 0.2327, + "num_input_tokens_seen": 33969328, + "step": 22235 + }, + { + "epoch": 68.64296754250387, + "grad_norm": 0.4000431299209595, + "learning_rate": 2.0626356726661704e-05, + "loss": 0.2732, + "num_input_tokens_seen": 33977008, + "step": 22240 + }, + { + "epoch": 68.65842349304482, + "grad_norm": 0.35194480419158936, + "learning_rate": 2.0616690991470477e-05, + "loss": 0.2168, + "num_input_tokens_seen": 33984240, + "step": 22245 + }, + { + "epoch": 68.67387944358578, + "grad_norm": 0.33268633484840393, + "learning_rate": 2.0607025932240595e-05, + "loss": 0.2413, + "num_input_tokens_seen": 33992016, + "step": 22250 + }, + { + "epoch": 68.68933539412674, + "grad_norm": 0.7043076157569885, + "learning_rate": 2.059736155046251e-05, + "loss": 0.2582, + "num_input_tokens_seen": 33999472, + "step": 22255 + }, + { + "epoch": 68.7047913446677, + "grad_norm": 0.4051560163497925, + "learning_rate": 2.0587697847626603e-05, + "loss": 0.24, + "num_input_tokens_seen": 34007088, + "step": 22260 + }, + { + "epoch": 68.72024729520865, + "grad_norm": 0.5026947855949402, + "learning_rate": 2.057803482522314e-05, + "loss": 0.2318, + "num_input_tokens_seen": 34014736, + "step": 22265 + }, + { + "epoch": 68.73570324574962, + "grad_norm": 0.5765253305435181, + "learning_rate": 2.056837248474227e-05, + "loss": 0.2345, + "num_input_tokens_seen": 34022288, + "step": 22270 + }, + { + "epoch": 68.75115919629057, + "grad_norm": 0.6266161799430847, + "learning_rate": 2.0558710827674064e-05, + "loss": 0.263, + "num_input_tokens_seen": 34029584, + "step": 22275 + }, + { + "epoch": 68.76661514683153, + "grad_norm": 0.5141224265098572, + "learning_rate": 2.054904985550845e-05, + "loss": 0.2424, + "num_input_tokens_seen": 34036752, + "step": 22280 + }, + { + "epoch": 68.7820710973725, + "grad_norm": 0.4618760645389557, + "learning_rate": 2.0539389569735287e-05, + "loss": 0.24, + "num_input_tokens_seen": 34044240, + "step": 22285 + }, + { + "epoch": 68.79752704791345, + "grad_norm": 0.6864967346191406, + "learning_rate": 2.052972997184431e-05, + "loss": 0.2432, + "num_input_tokens_seen": 34052272, + "step": 22290 + }, + { + "epoch": 68.8129829984544, + "grad_norm": 0.5170074105262756, + "learning_rate": 2.0520071063325146e-05, + "loss": 0.2322, + "num_input_tokens_seen": 34060336, + "step": 22295 + }, + { + "epoch": 68.82843894899537, + "grad_norm": 0.45526304841041565, + "learning_rate": 2.051041284566732e-05, + "loss": 0.223, + "num_input_tokens_seen": 34067664, + "step": 22300 + }, + { + "epoch": 68.84389489953632, + "grad_norm": 0.8029200434684753, + "learning_rate": 2.050075532036026e-05, + "loss": 0.2587, + "num_input_tokens_seen": 34075792, + "step": 22305 + }, + { + "epoch": 68.85935085007728, + "grad_norm": 0.4146057367324829, + "learning_rate": 2.0491098488893264e-05, + "loss": 0.2349, + "num_input_tokens_seen": 34084048, + "step": 22310 + }, + { + "epoch": 68.87480680061825, + "grad_norm": 0.27295807003974915, + "learning_rate": 2.0481442352755546e-05, + "loss": 0.277, + "num_input_tokens_seen": 34091408, + "step": 22315 + }, + { + "epoch": 68.8902627511592, + "grad_norm": 0.8080443739891052, + "learning_rate": 2.0471786913436198e-05, + "loss": 0.2523, + "num_input_tokens_seen": 34098864, + "step": 22320 + }, + { + "epoch": 68.90571870170015, + "grad_norm": 0.5165835022926331, + "learning_rate": 2.0462132172424218e-05, + "loss": 0.2666, + "num_input_tokens_seen": 34106448, + "step": 22325 + }, + { + "epoch": 68.9211746522411, + "grad_norm": 0.7652693390846252, + "learning_rate": 2.0452478131208484e-05, + "loss": 0.2861, + "num_input_tokens_seen": 34114064, + "step": 22330 + }, + { + "epoch": 68.93663060278207, + "grad_norm": 0.372360497713089, + "learning_rate": 2.0442824791277765e-05, + "loss": 0.2392, + "num_input_tokens_seen": 34121872, + "step": 22335 + }, + { + "epoch": 68.95208655332303, + "grad_norm": 0.33387818932533264, + "learning_rate": 2.0433172154120727e-05, + "loss": 0.2523, + "num_input_tokens_seen": 34129616, + "step": 22340 + }, + { + "epoch": 68.96754250386398, + "grad_norm": 0.33057281374931335, + "learning_rate": 2.0423520221225947e-05, + "loss": 0.2108, + "num_input_tokens_seen": 34136944, + "step": 22345 + }, + { + "epoch": 68.98299845440495, + "grad_norm": 0.4528294801712036, + "learning_rate": 2.0413868994081848e-05, + "loss": 0.279, + "num_input_tokens_seen": 34144816, + "step": 22350 + }, + { + "epoch": 68.9984544049459, + "grad_norm": 0.45853644609451294, + "learning_rate": 2.0404218474176795e-05, + "loss": 0.2927, + "num_input_tokens_seen": 34152400, + "step": 22355 + }, + { + "epoch": 69.01236476043276, + "grad_norm": 0.7628101110458374, + "learning_rate": 2.0394568662999002e-05, + "loss": 0.2488, + "num_input_tokens_seen": 34159072, + "step": 22360 + }, + { + "epoch": 69.02782071097373, + "grad_norm": 0.43076011538505554, + "learning_rate": 2.0384919562036593e-05, + "loss": 0.294, + "num_input_tokens_seen": 34166560, + "step": 22365 + }, + { + "epoch": 69.04327666151468, + "grad_norm": 0.388843297958374, + "learning_rate": 2.0375271172777593e-05, + "loss": 0.2226, + "num_input_tokens_seen": 34173920, + "step": 22370 + }, + { + "epoch": 69.05873261205564, + "grad_norm": 0.7141405940055847, + "learning_rate": 2.0365623496709885e-05, + "loss": 0.2943, + "num_input_tokens_seen": 34181280, + "step": 22375 + }, + { + "epoch": 69.0741885625966, + "grad_norm": 0.5416387915611267, + "learning_rate": 2.0355976535321283e-05, + "loss": 0.2672, + "num_input_tokens_seen": 34188736, + "step": 22380 + }, + { + "epoch": 69.08964451313756, + "grad_norm": 0.465236634016037, + "learning_rate": 2.034633029009945e-05, + "loss": 0.2771, + "num_input_tokens_seen": 34196736, + "step": 22385 + }, + { + "epoch": 69.10510046367851, + "grad_norm": 0.3948393762111664, + "learning_rate": 2.0336684762531972e-05, + "loss": 0.259, + "num_input_tokens_seen": 34204384, + "step": 22390 + }, + { + "epoch": 69.12055641421948, + "grad_norm": 0.4634822905063629, + "learning_rate": 2.032703995410631e-05, + "loss": 0.1897, + "num_input_tokens_seen": 34212288, + "step": 22395 + }, + { + "epoch": 69.13601236476043, + "grad_norm": 0.5539835691452026, + "learning_rate": 2.031739586630981e-05, + "loss": 0.3087, + "num_input_tokens_seen": 34219808, + "step": 22400 + }, + { + "epoch": 69.13601236476043, + "eval_loss": 0.3067536950111389, + "eval_runtime": 6.3084, + "eval_samples_per_second": 91.148, + "eval_steps_per_second": 22.827, + "num_input_tokens_seen": 34219808, + "step": 22400 + }, + { + "epoch": 69.15146831530139, + "grad_norm": 0.4259827733039856, + "learning_rate": 2.0307752500629707e-05, + "loss": 0.2323, + "num_input_tokens_seen": 34227392, + "step": 22405 + }, + { + "epoch": 69.16692426584235, + "grad_norm": 0.4321039617061615, + "learning_rate": 2.0298109858553144e-05, + "loss": 0.2414, + "num_input_tokens_seen": 34235744, + "step": 22410 + }, + { + "epoch": 69.18238021638331, + "grad_norm": 0.5366625189781189, + "learning_rate": 2.028846794156712e-05, + "loss": 0.2419, + "num_input_tokens_seen": 34243072, + "step": 22415 + }, + { + "epoch": 69.19783616692426, + "grad_norm": 0.36613136529922485, + "learning_rate": 2.027882675115856e-05, + "loss": 0.227, + "num_input_tokens_seen": 34250496, + "step": 22420 + }, + { + "epoch": 69.21329211746523, + "grad_norm": 0.6429983377456665, + "learning_rate": 2.026918628881423e-05, + "loss": 0.2635, + "num_input_tokens_seen": 34258368, + "step": 22425 + }, + { + "epoch": 69.22874806800618, + "grad_norm": 0.41689297556877136, + "learning_rate": 2.0259546556020833e-05, + "loss": 0.2075, + "num_input_tokens_seen": 34266592, + "step": 22430 + }, + { + "epoch": 69.24420401854714, + "grad_norm": 0.4212048649787903, + "learning_rate": 2.024990755426493e-05, + "loss": 0.2437, + "num_input_tokens_seen": 34274208, + "step": 22435 + }, + { + "epoch": 69.2596599690881, + "grad_norm": 0.7556365132331848, + "learning_rate": 2.0240269285032975e-05, + "loss": 0.1975, + "num_input_tokens_seen": 34281824, + "step": 22440 + }, + { + "epoch": 69.27511591962906, + "grad_norm": 0.33030277490615845, + "learning_rate": 2.0230631749811306e-05, + "loss": 0.2277, + "num_input_tokens_seen": 34289696, + "step": 22445 + }, + { + "epoch": 69.29057187017001, + "grad_norm": 1.3486486673355103, + "learning_rate": 2.0220994950086162e-05, + "loss": 0.2811, + "num_input_tokens_seen": 34297824, + "step": 22450 + }, + { + "epoch": 69.30602782071098, + "grad_norm": 0.39127758145332336, + "learning_rate": 2.021135888734365e-05, + "loss": 0.2188, + "num_input_tokens_seen": 34305120, + "step": 22455 + }, + { + "epoch": 69.32148377125193, + "grad_norm": 0.31380924582481384, + "learning_rate": 2.0201723563069783e-05, + "loss": 0.3134, + "num_input_tokens_seen": 34312896, + "step": 22460 + }, + { + "epoch": 69.33693972179289, + "grad_norm": 0.6058773994445801, + "learning_rate": 2.0192088978750433e-05, + "loss": 0.1949, + "num_input_tokens_seen": 34320704, + "step": 22465 + }, + { + "epoch": 69.35239567233384, + "grad_norm": 0.270275741815567, + "learning_rate": 2.0182455135871385e-05, + "loss": 0.2871, + "num_input_tokens_seen": 34328672, + "step": 22470 + }, + { + "epoch": 69.36785162287481, + "grad_norm": 0.502102255821228, + "learning_rate": 2.0172822035918305e-05, + "loss": 0.235, + "num_input_tokens_seen": 34336128, + "step": 22475 + }, + { + "epoch": 69.38330757341576, + "grad_norm": 0.9508520364761353, + "learning_rate": 2.016318968037671e-05, + "loss": 0.2822, + "num_input_tokens_seen": 34343936, + "step": 22480 + }, + { + "epoch": 69.39876352395672, + "grad_norm": 0.5207989811897278, + "learning_rate": 2.015355807073206e-05, + "loss": 0.2358, + "num_input_tokens_seen": 34351136, + "step": 22485 + }, + { + "epoch": 69.41421947449768, + "grad_norm": 0.22876426577568054, + "learning_rate": 2.0143927208469664e-05, + "loss": 0.2035, + "num_input_tokens_seen": 34358336, + "step": 22490 + }, + { + "epoch": 69.42967542503864, + "grad_norm": 0.38017088174819946, + "learning_rate": 2.0134297095074708e-05, + "loss": 0.2866, + "num_input_tokens_seen": 34366048, + "step": 22495 + }, + { + "epoch": 69.44513137557959, + "grad_norm": 0.28861314058303833, + "learning_rate": 2.0124667732032297e-05, + "loss": 0.252, + "num_input_tokens_seen": 34373440, + "step": 22500 + }, + { + "epoch": 69.46058732612056, + "grad_norm": 0.6083528995513916, + "learning_rate": 2.011503912082738e-05, + "loss": 0.2926, + "num_input_tokens_seen": 34381536, + "step": 22505 + }, + { + "epoch": 69.47604327666151, + "grad_norm": 0.40744733810424805, + "learning_rate": 2.0105411262944823e-05, + "loss": 0.2118, + "num_input_tokens_seen": 34389056, + "step": 22510 + }, + { + "epoch": 69.49149922720247, + "grad_norm": 0.463365763425827, + "learning_rate": 2.0095784159869366e-05, + "loss": 0.2598, + "num_input_tokens_seen": 34397152, + "step": 22515 + }, + { + "epoch": 69.50695517774344, + "grad_norm": 0.4984365999698639, + "learning_rate": 2.0086157813085608e-05, + "loss": 0.2621, + "num_input_tokens_seen": 34404480, + "step": 22520 + }, + { + "epoch": 69.52241112828439, + "grad_norm": 0.5156427621841431, + "learning_rate": 2.0076532224078068e-05, + "loss": 0.2613, + "num_input_tokens_seen": 34412512, + "step": 22525 + }, + { + "epoch": 69.53786707882534, + "grad_norm": 0.31068509817123413, + "learning_rate": 2.0066907394331142e-05, + "loss": 0.2514, + "num_input_tokens_seen": 34419520, + "step": 22530 + }, + { + "epoch": 69.55332302936631, + "grad_norm": 0.3550248444080353, + "learning_rate": 2.0057283325329077e-05, + "loss": 0.2405, + "num_input_tokens_seen": 34427328, + "step": 22535 + }, + { + "epoch": 69.56877897990726, + "grad_norm": 0.5296316742897034, + "learning_rate": 2.0047660018556047e-05, + "loss": 0.2333, + "num_input_tokens_seen": 34434752, + "step": 22540 + }, + { + "epoch": 69.58423493044822, + "grad_norm": 0.5923834443092346, + "learning_rate": 2.0038037475496075e-05, + "loss": 0.2834, + "num_input_tokens_seen": 34442144, + "step": 22545 + }, + { + "epoch": 69.59969088098919, + "grad_norm": 0.4667277932167053, + "learning_rate": 2.0028415697633073e-05, + "loss": 0.3103, + "num_input_tokens_seen": 34448992, + "step": 22550 + }, + { + "epoch": 69.61514683153014, + "grad_norm": 0.3627246916294098, + "learning_rate": 2.0018794686450858e-05, + "loss": 0.2398, + "num_input_tokens_seen": 34456896, + "step": 22555 + }, + { + "epoch": 69.6306027820711, + "grad_norm": 0.6092913746833801, + "learning_rate": 2.0009174443433088e-05, + "loss": 0.3597, + "num_input_tokens_seen": 34464352, + "step": 22560 + }, + { + "epoch": 69.64605873261206, + "grad_norm": 0.6334140300750732, + "learning_rate": 1.999955497006334e-05, + "loss": 0.2834, + "num_input_tokens_seen": 34472032, + "step": 22565 + }, + { + "epoch": 69.66151468315302, + "grad_norm": 0.334611177444458, + "learning_rate": 1.9989936267825067e-05, + "loss": 0.221, + "num_input_tokens_seen": 34479296, + "step": 22570 + }, + { + "epoch": 69.67697063369397, + "grad_norm": 0.34433186054229736, + "learning_rate": 1.9980318338201572e-05, + "loss": 0.2601, + "num_input_tokens_seen": 34487488, + "step": 22575 + }, + { + "epoch": 69.69242658423494, + "grad_norm": 0.4191255271434784, + "learning_rate": 1.997070118267607e-05, + "loss": 0.2369, + "num_input_tokens_seen": 34495296, + "step": 22580 + }, + { + "epoch": 69.70788253477589, + "grad_norm": 0.5513067245483398, + "learning_rate": 1.9961084802731654e-05, + "loss": 0.2589, + "num_input_tokens_seen": 34502656, + "step": 22585 + }, + { + "epoch": 69.72333848531684, + "grad_norm": 0.5787686109542847, + "learning_rate": 1.9951469199851273e-05, + "loss": 0.2517, + "num_input_tokens_seen": 34510624, + "step": 22590 + }, + { + "epoch": 69.7387944358578, + "grad_norm": 0.493731826543808, + "learning_rate": 1.99418543755178e-05, + "loss": 0.307, + "num_input_tokens_seen": 34518400, + "step": 22595 + }, + { + "epoch": 69.75425038639877, + "grad_norm": 0.2864319980144501, + "learning_rate": 1.9932240331213936e-05, + "loss": 0.2091, + "num_input_tokens_seen": 34525536, + "step": 22600 + }, + { + "epoch": 69.75425038639877, + "eval_loss": 0.306668221950531, + "eval_runtime": 6.3069, + "eval_samples_per_second": 91.17, + "eval_steps_per_second": 22.832, + "num_input_tokens_seen": 34525536, + "step": 22600 + }, + { + "epoch": 69.76970633693972, + "grad_norm": 0.5585412979125977, + "learning_rate": 1.9922627068422297e-05, + "loss": 0.2259, + "num_input_tokens_seen": 34533600, + "step": 22605 + }, + { + "epoch": 69.78516228748067, + "grad_norm": 0.4593772888183594, + "learning_rate": 1.991301458862538e-05, + "loss": 0.2229, + "num_input_tokens_seen": 34540896, + "step": 22610 + }, + { + "epoch": 69.80061823802164, + "grad_norm": 0.7190529704093933, + "learning_rate": 1.9903402893305536e-05, + "loss": 0.2219, + "num_input_tokens_seen": 34548704, + "step": 22615 + }, + { + "epoch": 69.8160741885626, + "grad_norm": 0.5175474882125854, + "learning_rate": 1.9893791983945016e-05, + "loss": 0.2672, + "num_input_tokens_seen": 34556192, + "step": 22620 + }, + { + "epoch": 69.83153013910355, + "grad_norm": 0.4346683919429779, + "learning_rate": 1.988418186202594e-05, + "loss": 0.2398, + "num_input_tokens_seen": 34563456, + "step": 22625 + }, + { + "epoch": 69.84698608964452, + "grad_norm": 0.45923593640327454, + "learning_rate": 1.98745725290303e-05, + "loss": 0.2713, + "num_input_tokens_seen": 34571328, + "step": 22630 + }, + { + "epoch": 69.86244204018547, + "grad_norm": 0.3236231207847595, + "learning_rate": 1.986496398644e-05, + "loss": 0.2629, + "num_input_tokens_seen": 34578784, + "step": 22635 + }, + { + "epoch": 69.87789799072642, + "grad_norm": 0.3938983976840973, + "learning_rate": 1.9855356235736777e-05, + "loss": 0.2994, + "num_input_tokens_seen": 34586624, + "step": 22640 + }, + { + "epoch": 69.89335394126739, + "grad_norm": 0.6012716889381409, + "learning_rate": 1.9845749278402277e-05, + "loss": 0.236, + "num_input_tokens_seen": 34594176, + "step": 22645 + }, + { + "epoch": 69.90880989180835, + "grad_norm": 0.4233875572681427, + "learning_rate": 1.9836143115918006e-05, + "loss": 0.2416, + "num_input_tokens_seen": 34601824, + "step": 22650 + }, + { + "epoch": 69.9242658423493, + "grad_norm": 0.46061691641807556, + "learning_rate": 1.9826537749765367e-05, + "loss": 0.2018, + "num_input_tokens_seen": 34609472, + "step": 22655 + }, + { + "epoch": 69.93972179289027, + "grad_norm": 0.3063866198062897, + "learning_rate": 1.9816933181425625e-05, + "loss": 0.2155, + "num_input_tokens_seen": 34617248, + "step": 22660 + }, + { + "epoch": 69.95517774343122, + "grad_norm": 0.6599643230438232, + "learning_rate": 1.9807329412379903e-05, + "loss": 0.3074, + "num_input_tokens_seen": 34624800, + "step": 22665 + }, + { + "epoch": 69.97063369397218, + "grad_norm": 0.4867657423019409, + "learning_rate": 1.9797726444109247e-05, + "loss": 0.2551, + "num_input_tokens_seen": 34632480, + "step": 22670 + }, + { + "epoch": 69.98608964451314, + "grad_norm": 0.539709746837616, + "learning_rate": 1.9788124278094557e-05, + "loss": 0.2349, + "num_input_tokens_seen": 34639968, + "step": 22675 + }, + { + "epoch": 70.0, + "grad_norm": 0.6932908296585083, + "learning_rate": 1.9778522915816594e-05, + "loss": 0.2579, + "num_input_tokens_seen": 34646816, + "step": 22680 + }, + { + "epoch": 70.01545595054095, + "grad_norm": 0.40601804852485657, + "learning_rate": 1.9768922358756014e-05, + "loss": 0.2215, + "num_input_tokens_seen": 34654944, + "step": 22685 + }, + { + "epoch": 70.03091190108192, + "grad_norm": 0.29956501722335815, + "learning_rate": 1.9759322608393353e-05, + "loss": 0.2536, + "num_input_tokens_seen": 34662304, + "step": 22690 + }, + { + "epoch": 70.04636785162288, + "grad_norm": 0.5449848175048828, + "learning_rate": 1.9749723666208992e-05, + "loss": 0.2341, + "num_input_tokens_seen": 34669920, + "step": 22695 + }, + { + "epoch": 70.06182380216383, + "grad_norm": 0.43082308769226074, + "learning_rate": 1.9740125533683235e-05, + "loss": 0.2425, + "num_input_tokens_seen": 34677696, + "step": 22700 + }, + { + "epoch": 70.0772797527048, + "grad_norm": 0.6186437010765076, + "learning_rate": 1.9730528212296208e-05, + "loss": 0.252, + "num_input_tokens_seen": 34685120, + "step": 22705 + }, + { + "epoch": 70.09273570324575, + "grad_norm": 0.38108956813812256, + "learning_rate": 1.9720931703527945e-05, + "loss": 0.2686, + "num_input_tokens_seen": 34693088, + "step": 22710 + }, + { + "epoch": 70.1081916537867, + "grad_norm": 0.39042773842811584, + "learning_rate": 1.9711336008858373e-05, + "loss": 0.2632, + "num_input_tokens_seen": 34700960, + "step": 22715 + }, + { + "epoch": 70.12364760432767, + "grad_norm": 0.5040457844734192, + "learning_rate": 1.9701741129767233e-05, + "loss": 0.239, + "num_input_tokens_seen": 34708608, + "step": 22720 + }, + { + "epoch": 70.13910355486863, + "grad_norm": 0.5315229296684265, + "learning_rate": 1.9692147067734202e-05, + "loss": 0.2502, + "num_input_tokens_seen": 34715776, + "step": 22725 + }, + { + "epoch": 70.15455950540958, + "grad_norm": 0.5665891170501709, + "learning_rate": 1.96825538242388e-05, + "loss": 0.2056, + "num_input_tokens_seen": 34723200, + "step": 22730 + }, + { + "epoch": 70.17001545595055, + "grad_norm": 0.30262526869773865, + "learning_rate": 1.967296140076041e-05, + "loss": 0.264, + "num_input_tokens_seen": 34730880, + "step": 22735 + }, + { + "epoch": 70.1854714064915, + "grad_norm": 0.9670436978340149, + "learning_rate": 1.966336979877833e-05, + "loss": 0.2719, + "num_input_tokens_seen": 34737728, + "step": 22740 + }, + { + "epoch": 70.20092735703246, + "grad_norm": 0.5027326941490173, + "learning_rate": 1.9653779019771678e-05, + "loss": 0.2437, + "num_input_tokens_seen": 34745728, + "step": 22745 + }, + { + "epoch": 70.21638330757341, + "grad_norm": 0.7164501547813416, + "learning_rate": 1.9644189065219488e-05, + "loss": 0.281, + "num_input_tokens_seen": 34753568, + "step": 22750 + }, + { + "epoch": 70.23183925811438, + "grad_norm": 0.6499317288398743, + "learning_rate": 1.9634599936600655e-05, + "loss": 0.3783, + "num_input_tokens_seen": 34761056, + "step": 22755 + }, + { + "epoch": 70.24729520865533, + "grad_norm": 0.4131605327129364, + "learning_rate": 1.9625011635393935e-05, + "loss": 0.2356, + "num_input_tokens_seen": 34768896, + "step": 22760 + }, + { + "epoch": 70.26275115919628, + "grad_norm": 0.6183690428733826, + "learning_rate": 1.9615424163077963e-05, + "loss": 0.246, + "num_input_tokens_seen": 34776128, + "step": 22765 + }, + { + "epoch": 70.27820710973725, + "grad_norm": 0.3791883885860443, + "learning_rate": 1.9605837521131263e-05, + "loss": 0.2297, + "num_input_tokens_seen": 34783936, + "step": 22770 + }, + { + "epoch": 70.2936630602782, + "grad_norm": 0.3996848464012146, + "learning_rate": 1.9596251711032192e-05, + "loss": 0.207, + "num_input_tokens_seen": 34791552, + "step": 22775 + }, + { + "epoch": 70.30911901081916, + "grad_norm": 0.5628897547721863, + "learning_rate": 1.958666673425903e-05, + "loss": 0.351, + "num_input_tokens_seen": 34798976, + "step": 22780 + }, + { + "epoch": 70.32457496136013, + "grad_norm": 0.9341474771499634, + "learning_rate": 1.957708259228987e-05, + "loss": 0.226, + "num_input_tokens_seen": 34806688, + "step": 22785 + }, + { + "epoch": 70.34003091190108, + "grad_norm": 0.5294299721717834, + "learning_rate": 1.956749928660273e-05, + "loss": 0.2668, + "num_input_tokens_seen": 34814336, + "step": 22790 + }, + { + "epoch": 70.35548686244204, + "grad_norm": 0.684058666229248, + "learning_rate": 1.955791681867547e-05, + "loss": 0.2906, + "num_input_tokens_seen": 34822304, + "step": 22795 + }, + { + "epoch": 70.370942812983, + "grad_norm": 0.5184778571128845, + "learning_rate": 1.9548335189985824e-05, + "loss": 0.2151, + "num_input_tokens_seen": 34829856, + "step": 22800 + }, + { + "epoch": 70.370942812983, + "eval_loss": 0.3071548342704773, + "eval_runtime": 6.3169, + "eval_samples_per_second": 91.025, + "eval_steps_per_second": 22.796, + "num_input_tokens_seen": 34829856, + "step": 22800 + }, + { + "epoch": 70.38639876352396, + "grad_norm": 0.34830817580223083, + "learning_rate": 1.9538754402011396e-05, + "loss": 0.2543, + "num_input_tokens_seen": 34837856, + "step": 22805 + }, + { + "epoch": 70.40185471406491, + "grad_norm": 0.5441434383392334, + "learning_rate": 1.952917445622968e-05, + "loss": 0.2641, + "num_input_tokens_seen": 34845984, + "step": 22810 + }, + { + "epoch": 70.41731066460588, + "grad_norm": 0.4178452789783478, + "learning_rate": 1.9519595354118005e-05, + "loss": 0.2286, + "num_input_tokens_seen": 34853632, + "step": 22815 + }, + { + "epoch": 70.43276661514683, + "grad_norm": 0.41061538457870483, + "learning_rate": 1.951001709715361e-05, + "loss": 0.307, + "num_input_tokens_seen": 34861312, + "step": 22820 + }, + { + "epoch": 70.44822256568779, + "grad_norm": 0.5239942669868469, + "learning_rate": 1.9500439686813556e-05, + "loss": 0.2175, + "num_input_tokens_seen": 34868640, + "step": 22825 + }, + { + "epoch": 70.46367851622875, + "grad_norm": 0.7252335548400879, + "learning_rate": 1.949086312457482e-05, + "loss": 0.2556, + "num_input_tokens_seen": 34876096, + "step": 22830 + }, + { + "epoch": 70.47913446676971, + "grad_norm": 0.29009559750556946, + "learning_rate": 1.9481287411914223e-05, + "loss": 0.2637, + "num_input_tokens_seen": 34883616, + "step": 22835 + }, + { + "epoch": 70.49459041731066, + "grad_norm": 0.4007693827152252, + "learning_rate": 1.9471712550308457e-05, + "loss": 0.2586, + "num_input_tokens_seen": 34890944, + "step": 22840 + }, + { + "epoch": 70.51004636785163, + "grad_norm": 0.2956197261810303, + "learning_rate": 1.946213854123409e-05, + "loss": 0.2542, + "num_input_tokens_seen": 34899040, + "step": 22845 + }, + { + "epoch": 70.52550231839258, + "grad_norm": 0.41662803292274475, + "learning_rate": 1.9452565386167554e-05, + "loss": 0.2539, + "num_input_tokens_seen": 34906880, + "step": 22850 + }, + { + "epoch": 70.54095826893354, + "grad_norm": 0.6605352163314819, + "learning_rate": 1.9442993086585142e-05, + "loss": 0.3099, + "num_input_tokens_seen": 34914656, + "step": 22855 + }, + { + "epoch": 70.5564142194745, + "grad_norm": 0.7297142148017883, + "learning_rate": 1.9433421643963043e-05, + "loss": 0.2569, + "num_input_tokens_seen": 34922560, + "step": 22860 + }, + { + "epoch": 70.57187017001546, + "grad_norm": 0.41290298104286194, + "learning_rate": 1.942385105977727e-05, + "loss": 0.2055, + "num_input_tokens_seen": 34930720, + "step": 22865 + }, + { + "epoch": 70.58732612055641, + "grad_norm": 0.8150274753570557, + "learning_rate": 1.9414281335503743e-05, + "loss": 0.2603, + "num_input_tokens_seen": 34938464, + "step": 22870 + }, + { + "epoch": 70.60278207109737, + "grad_norm": 0.33468925952911377, + "learning_rate": 1.9404712472618232e-05, + "loss": 0.2535, + "num_input_tokens_seen": 34946176, + "step": 22875 + }, + { + "epoch": 70.61823802163833, + "grad_norm": 0.6882083415985107, + "learning_rate": 1.939514447259636e-05, + "loss": 0.2787, + "num_input_tokens_seen": 34953216, + "step": 22880 + }, + { + "epoch": 70.63369397217929, + "grad_norm": 0.42755240201950073, + "learning_rate": 1.938557733691365e-05, + "loss": 0.2305, + "num_input_tokens_seen": 34960544, + "step": 22885 + }, + { + "epoch": 70.64914992272024, + "grad_norm": 0.5400654077529907, + "learning_rate": 1.9376011067045476e-05, + "loss": 0.2752, + "num_input_tokens_seen": 34968256, + "step": 22890 + }, + { + "epoch": 70.66460587326121, + "grad_norm": 0.6286571621894836, + "learning_rate": 1.9366445664467065e-05, + "loss": 0.2498, + "num_input_tokens_seen": 34975456, + "step": 22895 + }, + { + "epoch": 70.68006182380216, + "grad_norm": 0.5401946902275085, + "learning_rate": 1.9356881130653533e-05, + "loss": 0.2395, + "num_input_tokens_seen": 34982528, + "step": 22900 + }, + { + "epoch": 70.69551777434312, + "grad_norm": 0.5219168066978455, + "learning_rate": 1.9347317467079846e-05, + "loss": 0.2436, + "num_input_tokens_seen": 34990336, + "step": 22905 + }, + { + "epoch": 70.71097372488408, + "grad_norm": 0.3584766685962677, + "learning_rate": 1.9337754675220836e-05, + "loss": 0.2421, + "num_input_tokens_seen": 34997664, + "step": 22910 + }, + { + "epoch": 70.72642967542504, + "grad_norm": 0.7719721794128418, + "learning_rate": 1.9328192756551218e-05, + "loss": 0.2527, + "num_input_tokens_seen": 35005088, + "step": 22915 + }, + { + "epoch": 70.74188562596599, + "grad_norm": 0.5716461539268494, + "learning_rate": 1.931863171254555e-05, + "loss": 0.2346, + "num_input_tokens_seen": 35012544, + "step": 22920 + }, + { + "epoch": 70.75734157650696, + "grad_norm": 0.5273721218109131, + "learning_rate": 1.930907154467826e-05, + "loss": 0.2321, + "num_input_tokens_seen": 35019616, + "step": 22925 + }, + { + "epoch": 70.77279752704791, + "grad_norm": 0.46929675340652466, + "learning_rate": 1.9299512254423673e-05, + "loss": 0.2688, + "num_input_tokens_seen": 35027552, + "step": 22930 + }, + { + "epoch": 70.78825347758887, + "grad_norm": 0.5982005596160889, + "learning_rate": 1.9289953843255914e-05, + "loss": 0.21, + "num_input_tokens_seen": 35035008, + "step": 22935 + }, + { + "epoch": 70.80370942812984, + "grad_norm": 0.8988177180290222, + "learning_rate": 1.9280396312649048e-05, + "loss": 0.2848, + "num_input_tokens_seen": 35042496, + "step": 22940 + }, + { + "epoch": 70.81916537867079, + "grad_norm": 0.5323253870010376, + "learning_rate": 1.9270839664076936e-05, + "loss": 0.2399, + "num_input_tokens_seen": 35050464, + "step": 22945 + }, + { + "epoch": 70.83462132921174, + "grad_norm": 0.6653112769126892, + "learning_rate": 1.9261283899013345e-05, + "loss": 0.2315, + "num_input_tokens_seen": 35058400, + "step": 22950 + }, + { + "epoch": 70.85007727975271, + "grad_norm": 0.3508927524089813, + "learning_rate": 1.92517290189319e-05, + "loss": 0.2089, + "num_input_tokens_seen": 35065984, + "step": 22955 + }, + { + "epoch": 70.86553323029366, + "grad_norm": 0.6143345236778259, + "learning_rate": 1.924217502530607e-05, + "loss": 0.2423, + "num_input_tokens_seen": 35073632, + "step": 22960 + }, + { + "epoch": 70.88098918083462, + "grad_norm": 0.7788504958152771, + "learning_rate": 1.9232621919609207e-05, + "loss": 0.2702, + "num_input_tokens_seen": 35080960, + "step": 22965 + }, + { + "epoch": 70.89644513137559, + "grad_norm": 0.5480562448501587, + "learning_rate": 1.9223069703314534e-05, + "loss": 0.2499, + "num_input_tokens_seen": 35089120, + "step": 22970 + }, + { + "epoch": 70.91190108191654, + "grad_norm": 0.3629317581653595, + "learning_rate": 1.92135183778951e-05, + "loss": 0.2943, + "num_input_tokens_seen": 35096640, + "step": 22975 + }, + { + "epoch": 70.9273570324575, + "grad_norm": 0.5440725088119507, + "learning_rate": 1.9203967944823857e-05, + "loss": 0.232, + "num_input_tokens_seen": 35104032, + "step": 22980 + }, + { + "epoch": 70.94281298299846, + "grad_norm": 0.5224255919456482, + "learning_rate": 1.9194418405573588e-05, + "loss": 0.2479, + "num_input_tokens_seen": 35111904, + "step": 22985 + }, + { + "epoch": 70.95826893353942, + "grad_norm": 0.31497249007225037, + "learning_rate": 1.9184869761616954e-05, + "loss": 0.2011, + "num_input_tokens_seen": 35119456, + "step": 22990 + }, + { + "epoch": 70.97372488408037, + "grad_norm": 0.5151953101158142, + "learning_rate": 1.9175322014426495e-05, + "loss": 0.2976, + "num_input_tokens_seen": 35126912, + "step": 22995 + }, + { + "epoch": 70.98918083462132, + "grad_norm": 0.3982318639755249, + "learning_rate": 1.9165775165474565e-05, + "loss": 0.2072, + "num_input_tokens_seen": 35134560, + "step": 23000 + }, + { + "epoch": 70.98918083462132, + "eval_loss": 0.3063207268714905, + "eval_runtime": 6.3024, + "eval_samples_per_second": 91.235, + "eval_steps_per_second": 22.848, + "num_input_tokens_seen": 35134560, + "step": 23000 + }, + { + "epoch": 71.0030911901082, + "grad_norm": 0.6947663426399231, + "learning_rate": 1.9156229216233434e-05, + "loss": 0.3372, + "num_input_tokens_seen": 35141216, + "step": 23005 + }, + { + "epoch": 71.01854714064915, + "grad_norm": 0.40748798847198486, + "learning_rate": 1.9146684168175184e-05, + "loss": 0.3482, + "num_input_tokens_seen": 35148672, + "step": 23010 + }, + { + "epoch": 71.03400309119012, + "grad_norm": 0.44807490706443787, + "learning_rate": 1.9137140022771796e-05, + "loss": 0.2283, + "num_input_tokens_seen": 35156192, + "step": 23015 + }, + { + "epoch": 71.04945904173107, + "grad_norm": 0.44845983386039734, + "learning_rate": 1.9127596781495103e-05, + "loss": 0.2421, + "num_input_tokens_seen": 35163968, + "step": 23020 + }, + { + "epoch": 71.06491499227202, + "grad_norm": 0.48415160179138184, + "learning_rate": 1.9118054445816767e-05, + "loss": 0.2564, + "num_input_tokens_seen": 35171872, + "step": 23025 + }, + { + "epoch": 71.08037094281298, + "grad_norm": 0.5762768387794495, + "learning_rate": 1.9108513017208356e-05, + "loss": 0.2849, + "num_input_tokens_seen": 35179072, + "step": 23030 + }, + { + "epoch": 71.09582689335394, + "grad_norm": 0.5407233834266663, + "learning_rate": 1.9098972497141287e-05, + "loss": 0.246, + "num_input_tokens_seen": 35185984, + "step": 23035 + }, + { + "epoch": 71.1112828438949, + "grad_norm": 0.5345772504806519, + "learning_rate": 1.9089432887086806e-05, + "loss": 0.2202, + "num_input_tokens_seen": 35193920, + "step": 23040 + }, + { + "epoch": 71.12673879443585, + "grad_norm": 0.4638163447380066, + "learning_rate": 1.9079894188516056e-05, + "loss": 0.2592, + "num_input_tokens_seen": 35200800, + "step": 23045 + }, + { + "epoch": 71.14219474497682, + "grad_norm": 0.4640327990055084, + "learning_rate": 1.907035640290002e-05, + "loss": 0.2611, + "num_input_tokens_seen": 35208192, + "step": 23050 + }, + { + "epoch": 71.15765069551777, + "grad_norm": 0.6326900124549866, + "learning_rate": 1.9060819531709534e-05, + "loss": 0.2404, + "num_input_tokens_seen": 35216576, + "step": 23055 + }, + { + "epoch": 71.17310664605873, + "grad_norm": 0.43667370080947876, + "learning_rate": 1.9051283576415325e-05, + "loss": 0.2631, + "num_input_tokens_seen": 35224512, + "step": 23060 + }, + { + "epoch": 71.1885625965997, + "grad_norm": 0.5362464189529419, + "learning_rate": 1.904174853848793e-05, + "loss": 0.2671, + "num_input_tokens_seen": 35232512, + "step": 23065 + }, + { + "epoch": 71.20401854714065, + "grad_norm": 1.010508418083191, + "learning_rate": 1.903221441939779e-05, + "loss": 0.2349, + "num_input_tokens_seen": 35239872, + "step": 23070 + }, + { + "epoch": 71.2194744976816, + "grad_norm": 0.46132004261016846, + "learning_rate": 1.9022681220615194e-05, + "loss": 0.2744, + "num_input_tokens_seen": 35247744, + "step": 23075 + }, + { + "epoch": 71.23493044822257, + "grad_norm": 0.3835689425468445, + "learning_rate": 1.9013148943610255e-05, + "loss": 0.2374, + "num_input_tokens_seen": 35255680, + "step": 23080 + }, + { + "epoch": 71.25038639876352, + "grad_norm": 0.48954010009765625, + "learning_rate": 1.9003617589852998e-05, + "loss": 0.2983, + "num_input_tokens_seen": 35264224, + "step": 23085 + }, + { + "epoch": 71.26584234930448, + "grad_norm": 0.7904359102249146, + "learning_rate": 1.899408716081326e-05, + "loss": 0.2623, + "num_input_tokens_seen": 35272000, + "step": 23090 + }, + { + "epoch": 71.28129829984545, + "grad_norm": 0.42716893553733826, + "learning_rate": 1.898455765796075e-05, + "loss": 0.2121, + "num_input_tokens_seen": 35279872, + "step": 23095 + }, + { + "epoch": 71.2967542503864, + "grad_norm": 0.4568197429180145, + "learning_rate": 1.8975029082765053e-05, + "loss": 0.1973, + "num_input_tokens_seen": 35287456, + "step": 23100 + }, + { + "epoch": 71.31221020092735, + "grad_norm": 0.502801775932312, + "learning_rate": 1.8965501436695577e-05, + "loss": 0.255, + "num_input_tokens_seen": 35295072, + "step": 23105 + }, + { + "epoch": 71.32766615146832, + "grad_norm": 0.5198385715484619, + "learning_rate": 1.895597472122161e-05, + "loss": 0.2199, + "num_input_tokens_seen": 35302592, + "step": 23110 + }, + { + "epoch": 71.34312210200927, + "grad_norm": 0.8080894351005554, + "learning_rate": 1.894644893781231e-05, + "loss": 0.2612, + "num_input_tokens_seen": 35309952, + "step": 23115 + }, + { + "epoch": 71.35857805255023, + "grad_norm": 0.34514522552490234, + "learning_rate": 1.893692408793665e-05, + "loss": 0.2326, + "num_input_tokens_seen": 35317088, + "step": 23120 + }, + { + "epoch": 71.3740340030912, + "grad_norm": 0.5370644927024841, + "learning_rate": 1.8927400173063493e-05, + "loss": 0.2801, + "num_input_tokens_seen": 35324576, + "step": 23125 + }, + { + "epoch": 71.38948995363215, + "grad_norm": 0.5735016465187073, + "learning_rate": 1.891787719466154e-05, + "loss": 0.2283, + "num_input_tokens_seen": 35331840, + "step": 23130 + }, + { + "epoch": 71.4049459041731, + "grad_norm": 0.3539670705795288, + "learning_rate": 1.8908355154199346e-05, + "loss": 0.2094, + "num_input_tokens_seen": 35339424, + "step": 23135 + }, + { + "epoch": 71.42040185471407, + "grad_norm": 0.3780730068683624, + "learning_rate": 1.8898834053145357e-05, + "loss": 0.2773, + "num_input_tokens_seen": 35347072, + "step": 23140 + }, + { + "epoch": 71.43585780525503, + "grad_norm": 0.8213280439376831, + "learning_rate": 1.8889313892967813e-05, + "loss": 0.2719, + "num_input_tokens_seen": 35355072, + "step": 23145 + }, + { + "epoch": 71.45131375579598, + "grad_norm": 0.3707258999347687, + "learning_rate": 1.8879794675134863e-05, + "loss": 0.2277, + "num_input_tokens_seen": 35362400, + "step": 23150 + }, + { + "epoch": 71.46676970633693, + "grad_norm": 0.8530495166778564, + "learning_rate": 1.8870276401114494e-05, + "loss": 0.2273, + "num_input_tokens_seen": 35370048, + "step": 23155 + }, + { + "epoch": 71.4822256568779, + "grad_norm": 0.35084518790245056, + "learning_rate": 1.886075907237453e-05, + "loss": 0.2455, + "num_input_tokens_seen": 35378080, + "step": 23160 + }, + { + "epoch": 71.49768160741885, + "grad_norm": 0.5239251852035522, + "learning_rate": 1.8851242690382672e-05, + "loss": 0.2189, + "num_input_tokens_seen": 35385984, + "step": 23165 + }, + { + "epoch": 71.51313755795981, + "grad_norm": 0.3565666675567627, + "learning_rate": 1.884172725660645e-05, + "loss": 0.2961, + "num_input_tokens_seen": 35393120, + "step": 23170 + }, + { + "epoch": 71.52859350850078, + "grad_norm": 0.34458714723587036, + "learning_rate": 1.8832212772513277e-05, + "loss": 0.2828, + "num_input_tokens_seen": 35400352, + "step": 23175 + }, + { + "epoch": 71.54404945904173, + "grad_norm": 0.37284669280052185, + "learning_rate": 1.8822699239570414e-05, + "loss": 0.2181, + "num_input_tokens_seen": 35407904, + "step": 23180 + }, + { + "epoch": 71.55950540958268, + "grad_norm": 0.4512544870376587, + "learning_rate": 1.8813186659244943e-05, + "loss": 0.2209, + "num_input_tokens_seen": 35415936, + "step": 23185 + }, + { + "epoch": 71.57496136012365, + "grad_norm": 0.785151481628418, + "learning_rate": 1.880367503300385e-05, + "loss": 0.2575, + "num_input_tokens_seen": 35423296, + "step": 23190 + }, + { + "epoch": 71.5904173106646, + "grad_norm": 0.4815164804458618, + "learning_rate": 1.8794164362313927e-05, + "loss": 0.2691, + "num_input_tokens_seen": 35431360, + "step": 23195 + }, + { + "epoch": 71.60587326120556, + "grad_norm": 0.4313836097717285, + "learning_rate": 1.878465464864185e-05, + "loss": 0.2173, + "num_input_tokens_seen": 35439168, + "step": 23200 + }, + { + "epoch": 71.60587326120556, + "eval_loss": 0.30679360032081604, + "eval_runtime": 6.3277, + "eval_samples_per_second": 90.871, + "eval_steps_per_second": 22.757, + "num_input_tokens_seen": 35439168, + "step": 23200 + }, + { + "epoch": 71.62132921174653, + "grad_norm": 0.34028691053390503, + "learning_rate": 1.877514589345414e-05, + "loss": 0.2114, + "num_input_tokens_seen": 35446976, + "step": 23205 + }, + { + "epoch": 71.63678516228748, + "grad_norm": 0.3900725543498993, + "learning_rate": 1.876563809821715e-05, + "loss": 0.226, + "num_input_tokens_seen": 35455072, + "step": 23210 + }, + { + "epoch": 71.65224111282843, + "grad_norm": 0.6162948608398438, + "learning_rate": 1.8756131264397106e-05, + "loss": 0.2157, + "num_input_tokens_seen": 35462528, + "step": 23215 + }, + { + "epoch": 71.6676970633694, + "grad_norm": 0.4606819748878479, + "learning_rate": 1.87466253934601e-05, + "loss": 0.2167, + "num_input_tokens_seen": 35470336, + "step": 23220 + }, + { + "epoch": 71.68315301391036, + "grad_norm": 0.4605596363544464, + "learning_rate": 1.8737120486872033e-05, + "loss": 0.2944, + "num_input_tokens_seen": 35477824, + "step": 23225 + }, + { + "epoch": 71.69860896445131, + "grad_norm": 0.6801949143409729, + "learning_rate": 1.8727616546098696e-05, + "loss": 0.1967, + "num_input_tokens_seen": 35485376, + "step": 23230 + }, + { + "epoch": 71.71406491499228, + "grad_norm": 0.5928366780281067, + "learning_rate": 1.8718113572605716e-05, + "loss": 0.3258, + "num_input_tokens_seen": 35493600, + "step": 23235 + }, + { + "epoch": 71.72952086553323, + "grad_norm": 0.5192357897758484, + "learning_rate": 1.8708611567858554e-05, + "loss": 0.2672, + "num_input_tokens_seen": 35500992, + "step": 23240 + }, + { + "epoch": 71.74497681607419, + "grad_norm": 0.8168883323669434, + "learning_rate": 1.8699110533322565e-05, + "loss": 0.264, + "num_input_tokens_seen": 35508416, + "step": 23245 + }, + { + "epoch": 71.76043276661515, + "grad_norm": 0.5084102749824524, + "learning_rate": 1.8689610470462897e-05, + "loss": 0.2448, + "num_input_tokens_seen": 35516000, + "step": 23250 + }, + { + "epoch": 71.7758887171561, + "grad_norm": 1.2029513120651245, + "learning_rate": 1.8680111380744604e-05, + "loss": 0.3084, + "num_input_tokens_seen": 35523456, + "step": 23255 + }, + { + "epoch": 71.79134466769706, + "grad_norm": 0.34382951259613037, + "learning_rate": 1.8670613265632564e-05, + "loss": 0.2001, + "num_input_tokens_seen": 35531072, + "step": 23260 + }, + { + "epoch": 71.80680061823801, + "grad_norm": 0.6702537536621094, + "learning_rate": 1.866111612659149e-05, + "loss": 0.2593, + "num_input_tokens_seen": 35538976, + "step": 23265 + }, + { + "epoch": 71.82225656877898, + "grad_norm": 0.6118903756141663, + "learning_rate": 1.8651619965085967e-05, + "loss": 0.2502, + "num_input_tokens_seen": 35546208, + "step": 23270 + }, + { + "epoch": 71.83771251931994, + "grad_norm": 0.7244486808776855, + "learning_rate": 1.8642124782580433e-05, + "loss": 0.2772, + "num_input_tokens_seen": 35553376, + "step": 23275 + }, + { + "epoch": 71.85316846986089, + "grad_norm": 0.6755731105804443, + "learning_rate": 1.8632630580539144e-05, + "loss": 0.2346, + "num_input_tokens_seen": 35561888, + "step": 23280 + }, + { + "epoch": 71.86862442040186, + "grad_norm": 0.42072439193725586, + "learning_rate": 1.862313736042625e-05, + "loss": 0.2618, + "num_input_tokens_seen": 35569312, + "step": 23285 + }, + { + "epoch": 71.88408037094281, + "grad_norm": 0.8066227436065674, + "learning_rate": 1.8613645123705703e-05, + "loss": 0.2455, + "num_input_tokens_seen": 35576864, + "step": 23290 + }, + { + "epoch": 71.89953632148377, + "grad_norm": 0.2856321632862091, + "learning_rate": 1.8604153871841328e-05, + "loss": 0.2611, + "num_input_tokens_seen": 35584320, + "step": 23295 + }, + { + "epoch": 71.91499227202473, + "grad_norm": 0.3864315450191498, + "learning_rate": 1.859466360629682e-05, + "loss": 0.225, + "num_input_tokens_seen": 35591648, + "step": 23300 + }, + { + "epoch": 71.93044822256569, + "grad_norm": 0.6349111795425415, + "learning_rate": 1.8585174328535666e-05, + "loss": 0.2898, + "num_input_tokens_seen": 35599744, + "step": 23305 + }, + { + "epoch": 71.94590417310664, + "grad_norm": 0.3558352291584015, + "learning_rate": 1.857568604002124e-05, + "loss": 0.2822, + "num_input_tokens_seen": 35607168, + "step": 23310 + }, + { + "epoch": 71.96136012364761, + "grad_norm": 0.7477486729621887, + "learning_rate": 1.8566198742216774e-05, + "loss": 0.3014, + "num_input_tokens_seen": 35614880, + "step": 23315 + }, + { + "epoch": 71.97681607418856, + "grad_norm": 0.569242537021637, + "learning_rate": 1.85567124365853e-05, + "loss": 0.248, + "num_input_tokens_seen": 35622336, + "step": 23320 + }, + { + "epoch": 71.99227202472952, + "grad_norm": 0.5293457508087158, + "learning_rate": 1.854722712458975e-05, + "loss": 0.2265, + "num_input_tokens_seen": 35629568, + "step": 23325 + }, + { + "epoch": 72.00618238021639, + "grad_norm": 0.33639004826545715, + "learning_rate": 1.853774280769286e-05, + "loss": 0.2419, + "num_input_tokens_seen": 35636608, + "step": 23330 + }, + { + "epoch": 72.02163833075734, + "grad_norm": 0.29008936882019043, + "learning_rate": 1.852825948735724e-05, + "loss": 0.2697, + "num_input_tokens_seen": 35644736, + "step": 23335 + }, + { + "epoch": 72.0370942812983, + "grad_norm": 0.5805440545082092, + "learning_rate": 1.851877716504534e-05, + "loss": 0.2811, + "num_input_tokens_seen": 35652768, + "step": 23340 + }, + { + "epoch": 72.05255023183926, + "grad_norm": 0.31914055347442627, + "learning_rate": 1.8509295842219448e-05, + "loss": 0.2226, + "num_input_tokens_seen": 35660672, + "step": 23345 + }, + { + "epoch": 72.06800618238022, + "grad_norm": 0.5796032547950745, + "learning_rate": 1.8499815520341697e-05, + "loss": 0.28, + "num_input_tokens_seen": 35668192, + "step": 23350 + }, + { + "epoch": 72.08346213292117, + "grad_norm": 0.3542090058326721, + "learning_rate": 1.8490336200874094e-05, + "loss": 0.225, + "num_input_tokens_seen": 35675904, + "step": 23355 + }, + { + "epoch": 72.09891808346214, + "grad_norm": 0.3127102255821228, + "learning_rate": 1.848085788527844e-05, + "loss": 0.263, + "num_input_tokens_seen": 35683648, + "step": 23360 + }, + { + "epoch": 72.11437403400309, + "grad_norm": 0.5603237152099609, + "learning_rate": 1.847138057501644e-05, + "loss": 0.3068, + "num_input_tokens_seen": 35691520, + "step": 23365 + }, + { + "epoch": 72.12982998454405, + "grad_norm": 0.8516455888748169, + "learning_rate": 1.8461904271549582e-05, + "loss": 0.2175, + "num_input_tokens_seen": 35698720, + "step": 23370 + }, + { + "epoch": 72.14528593508501, + "grad_norm": 0.5169196724891663, + "learning_rate": 1.845242897633926e-05, + "loss": 0.2984, + "num_input_tokens_seen": 35706304, + "step": 23375 + }, + { + "epoch": 72.16074188562597, + "grad_norm": 0.45674020051956177, + "learning_rate": 1.844295469084667e-05, + "loss": 0.2086, + "num_input_tokens_seen": 35713824, + "step": 23380 + }, + { + "epoch": 72.17619783616692, + "grad_norm": 0.6253563761711121, + "learning_rate": 1.843348141653286e-05, + "loss": 0.2761, + "num_input_tokens_seen": 35721344, + "step": 23385 + }, + { + "epoch": 72.19165378670789, + "grad_norm": 0.3343159854412079, + "learning_rate": 1.842400915485874e-05, + "loss": 0.2544, + "num_input_tokens_seen": 35728736, + "step": 23390 + }, + { + "epoch": 72.20710973724884, + "grad_norm": 0.5392755270004272, + "learning_rate": 1.8414537907285053e-05, + "loss": 0.258, + "num_input_tokens_seen": 35736640, + "step": 23395 + }, + { + "epoch": 72.2225656877898, + "grad_norm": 0.6325563788414001, + "learning_rate": 1.840506767527237e-05, + "loss": 0.3593, + "num_input_tokens_seen": 35744608, + "step": 23400 + }, + { + "epoch": 72.2225656877898, + "eval_loss": 0.30698806047439575, + "eval_runtime": 6.3131, + "eval_samples_per_second": 91.08, + "eval_steps_per_second": 22.81, + "num_input_tokens_seen": 35744608, + "step": 23400 + }, + { + "epoch": 72.23802163833076, + "grad_norm": 0.44577085971832275, + "learning_rate": 1.8395598460281137e-05, + "loss": 0.2933, + "num_input_tokens_seen": 35752512, + "step": 23405 + }, + { + "epoch": 72.25347758887172, + "grad_norm": 0.42868489027023315, + "learning_rate": 1.838613026377161e-05, + "loss": 0.2374, + "num_input_tokens_seen": 35760032, + "step": 23410 + }, + { + "epoch": 72.26893353941267, + "grad_norm": 0.41679176688194275, + "learning_rate": 1.8376663087203917e-05, + "loss": 0.2736, + "num_input_tokens_seen": 35768064, + "step": 23415 + }, + { + "epoch": 72.28438948995363, + "grad_norm": 0.3867667615413666, + "learning_rate": 1.8367196932038014e-05, + "loss": 0.2486, + "num_input_tokens_seen": 35775392, + "step": 23420 + }, + { + "epoch": 72.2998454404946, + "grad_norm": 0.6565195322036743, + "learning_rate": 1.8357731799733686e-05, + "loss": 0.2744, + "num_input_tokens_seen": 35783328, + "step": 23425 + }, + { + "epoch": 72.31530139103555, + "grad_norm": 0.338356614112854, + "learning_rate": 1.8348267691750586e-05, + "loss": 0.2394, + "num_input_tokens_seen": 35791360, + "step": 23430 + }, + { + "epoch": 72.3307573415765, + "grad_norm": 0.5433232188224792, + "learning_rate": 1.833880460954821e-05, + "loss": 0.2484, + "num_input_tokens_seen": 35799168, + "step": 23435 + }, + { + "epoch": 72.34621329211747, + "grad_norm": 0.9536648392677307, + "learning_rate": 1.8329342554585866e-05, + "loss": 0.2626, + "num_input_tokens_seen": 35806272, + "step": 23440 + }, + { + "epoch": 72.36166924265842, + "grad_norm": 0.4834897220134735, + "learning_rate": 1.8319881528322735e-05, + "loss": 0.2374, + "num_input_tokens_seen": 35813984, + "step": 23445 + }, + { + "epoch": 72.37712519319938, + "grad_norm": 0.6738618016242981, + "learning_rate": 1.8310421532217815e-05, + "loss": 0.2653, + "num_input_tokens_seen": 35821472, + "step": 23450 + }, + { + "epoch": 72.39258114374034, + "grad_norm": 0.42505380511283875, + "learning_rate": 1.8300962567729958e-05, + "loss": 0.24, + "num_input_tokens_seen": 35829280, + "step": 23455 + }, + { + "epoch": 72.4080370942813, + "grad_norm": 0.9398479461669922, + "learning_rate": 1.8291504636317866e-05, + "loss": 0.2479, + "num_input_tokens_seen": 35837344, + "step": 23460 + }, + { + "epoch": 72.42349304482225, + "grad_norm": 0.46019190549850464, + "learning_rate": 1.8282047739440055e-05, + "loss": 0.2472, + "num_input_tokens_seen": 35844480, + "step": 23465 + }, + { + "epoch": 72.43894899536322, + "grad_norm": 1.0591470003128052, + "learning_rate": 1.8272591878554903e-05, + "loss": 0.2609, + "num_input_tokens_seen": 35852128, + "step": 23470 + }, + { + "epoch": 72.45440494590417, + "grad_norm": 0.6994818449020386, + "learning_rate": 1.8263137055120638e-05, + "loss": 0.2492, + "num_input_tokens_seen": 35860032, + "step": 23475 + }, + { + "epoch": 72.46986089644513, + "grad_norm": 0.4463517367839813, + "learning_rate": 1.8253683270595295e-05, + "loss": 0.247, + "num_input_tokens_seen": 35867648, + "step": 23480 + }, + { + "epoch": 72.4853168469861, + "grad_norm": 0.6064504384994507, + "learning_rate": 1.824423052643677e-05, + "loss": 0.295, + "num_input_tokens_seen": 35875520, + "step": 23485 + }, + { + "epoch": 72.50077279752705, + "grad_norm": 0.4195713698863983, + "learning_rate": 1.82347788241028e-05, + "loss": 0.2133, + "num_input_tokens_seen": 35883520, + "step": 23490 + }, + { + "epoch": 72.516228748068, + "grad_norm": 0.398124635219574, + "learning_rate": 1.8225328165050942e-05, + "loss": 0.2854, + "num_input_tokens_seen": 35891424, + "step": 23495 + }, + { + "epoch": 72.53168469860897, + "grad_norm": 0.705731987953186, + "learning_rate": 1.821587855073863e-05, + "loss": 0.2304, + "num_input_tokens_seen": 35898944, + "step": 23500 + }, + { + "epoch": 72.54714064914992, + "grad_norm": 0.6190503239631653, + "learning_rate": 1.8206429982623086e-05, + "loss": 0.267, + "num_input_tokens_seen": 35906304, + "step": 23505 + }, + { + "epoch": 72.56259659969088, + "grad_norm": 0.5383731126785278, + "learning_rate": 1.8196982462161416e-05, + "loss": 0.2615, + "num_input_tokens_seen": 35913760, + "step": 23510 + }, + { + "epoch": 72.57805255023185, + "grad_norm": 0.5019688010215759, + "learning_rate": 1.818753599081055e-05, + "loss": 0.287, + "num_input_tokens_seen": 35921504, + "step": 23515 + }, + { + "epoch": 72.5935085007728, + "grad_norm": 0.4588824510574341, + "learning_rate": 1.817809057002724e-05, + "loss": 0.25, + "num_input_tokens_seen": 35928992, + "step": 23520 + }, + { + "epoch": 72.60896445131375, + "grad_norm": 0.416710764169693, + "learning_rate": 1.8168646201268096e-05, + "loss": 0.2185, + "num_input_tokens_seen": 35936512, + "step": 23525 + }, + { + "epoch": 72.62442040185472, + "grad_norm": 0.8519783020019531, + "learning_rate": 1.8159202885989557e-05, + "loss": 0.2697, + "num_input_tokens_seen": 35944352, + "step": 23530 + }, + { + "epoch": 72.63987635239567, + "grad_norm": 0.8084420561790466, + "learning_rate": 1.814976062564789e-05, + "loss": 0.2731, + "num_input_tokens_seen": 35952128, + "step": 23535 + }, + { + "epoch": 72.65533230293663, + "grad_norm": 0.40908676385879517, + "learning_rate": 1.8140319421699234e-05, + "loss": 0.2692, + "num_input_tokens_seen": 35960064, + "step": 23540 + }, + { + "epoch": 72.67078825347758, + "grad_norm": 0.40004962682724, + "learning_rate": 1.8130879275599515e-05, + "loss": 0.2479, + "num_input_tokens_seen": 35967456, + "step": 23545 + }, + { + "epoch": 72.68624420401855, + "grad_norm": 0.4517582654953003, + "learning_rate": 1.8121440188804544e-05, + "loss": 0.2033, + "num_input_tokens_seen": 35975040, + "step": 23550 + }, + { + "epoch": 72.7017001545595, + "grad_norm": 0.6404550075531006, + "learning_rate": 1.811200216276993e-05, + "loss": 0.2909, + "num_input_tokens_seen": 35982464, + "step": 23555 + }, + { + "epoch": 72.71715610510046, + "grad_norm": 0.4926714599132538, + "learning_rate": 1.810256519895115e-05, + "loss": 0.2168, + "num_input_tokens_seen": 35989952, + "step": 23560 + }, + { + "epoch": 72.73261205564143, + "grad_norm": 0.8974810242652893, + "learning_rate": 1.8093129298803494e-05, + "loss": 0.247, + "num_input_tokens_seen": 35997664, + "step": 23565 + }, + { + "epoch": 72.74806800618238, + "grad_norm": 0.5418897271156311, + "learning_rate": 1.808369446378209e-05, + "loss": 0.2448, + "num_input_tokens_seen": 36005024, + "step": 23570 + }, + { + "epoch": 72.76352395672333, + "grad_norm": 0.6537132859230042, + "learning_rate": 1.8074260695341914e-05, + "loss": 0.223, + "num_input_tokens_seen": 36013216, + "step": 23575 + }, + { + "epoch": 72.7789799072643, + "grad_norm": 0.5414034128189087, + "learning_rate": 1.8064827994937782e-05, + "loss": 0.2241, + "num_input_tokens_seen": 36021152, + "step": 23580 + }, + { + "epoch": 72.79443585780525, + "grad_norm": 0.6164546608924866, + "learning_rate": 1.8055396364024317e-05, + "loss": 0.2565, + "num_input_tokens_seen": 36028320, + "step": 23585 + }, + { + "epoch": 72.80989180834621, + "grad_norm": 0.6175724864006042, + "learning_rate": 1.804596580405601e-05, + "loss": 0.2367, + "num_input_tokens_seen": 36035744, + "step": 23590 + }, + { + "epoch": 72.82534775888718, + "grad_norm": 0.9639675617218018, + "learning_rate": 1.8036536316487174e-05, + "loss": 0.2926, + "num_input_tokens_seen": 36043424, + "step": 23595 + }, + { + "epoch": 72.84080370942813, + "grad_norm": 0.6375603079795837, + "learning_rate": 1.802710790277193e-05, + "loss": 0.2151, + "num_input_tokens_seen": 36050688, + "step": 23600 + }, + { + "epoch": 72.84080370942813, + "eval_loss": 0.3069213032722473, + "eval_runtime": 6.3183, + "eval_samples_per_second": 91.005, + "eval_steps_per_second": 22.791, + "num_input_tokens_seen": 36050688, + "step": 23600 + }, + { + "epoch": 72.85625965996908, + "grad_norm": 0.6946887373924255, + "learning_rate": 1.801768056436429e-05, + "loss": 0.2192, + "num_input_tokens_seen": 36058080, + "step": 23605 + }, + { + "epoch": 72.87171561051005, + "grad_norm": 0.3187466263771057, + "learning_rate": 1.8008254302718035e-05, + "loss": 0.2059, + "num_input_tokens_seen": 36065920, + "step": 23610 + }, + { + "epoch": 72.887171561051, + "grad_norm": 0.5756940245628357, + "learning_rate": 1.7998829119286837e-05, + "loss": 0.2094, + "num_input_tokens_seen": 36073536, + "step": 23615 + }, + { + "epoch": 72.90262751159196, + "grad_norm": 0.4688720405101776, + "learning_rate": 1.798940501552418e-05, + "loss": 0.252, + "num_input_tokens_seen": 36081216, + "step": 23620 + }, + { + "epoch": 72.91808346213293, + "grad_norm": 0.6473387479782104, + "learning_rate": 1.797998199288336e-05, + "loss": 0.3133, + "num_input_tokens_seen": 36088928, + "step": 23625 + }, + { + "epoch": 72.93353941267388, + "grad_norm": 0.5454524159431458, + "learning_rate": 1.7970560052817543e-05, + "loss": 0.2518, + "num_input_tokens_seen": 36096480, + "step": 23630 + }, + { + "epoch": 72.94899536321483, + "grad_norm": 0.4188137650489807, + "learning_rate": 1.7961139196779702e-05, + "loss": 0.2588, + "num_input_tokens_seen": 36104096, + "step": 23635 + }, + { + "epoch": 72.9644513137558, + "grad_norm": 0.4924335777759552, + "learning_rate": 1.7951719426222647e-05, + "loss": 0.1931, + "num_input_tokens_seen": 36111584, + "step": 23640 + }, + { + "epoch": 72.97990726429676, + "grad_norm": 0.7100754976272583, + "learning_rate": 1.794230074259904e-05, + "loss": 0.213, + "num_input_tokens_seen": 36118880, + "step": 23645 + }, + { + "epoch": 72.99536321483771, + "grad_norm": 0.46746838092803955, + "learning_rate": 1.7932883147361336e-05, + "loss": 0.2227, + "num_input_tokens_seen": 36126048, + "step": 23650 + }, + { + "epoch": 73.00927357032458, + "grad_norm": 0.7326673269271851, + "learning_rate": 1.7923466641961865e-05, + "loss": 0.2337, + "num_input_tokens_seen": 36132528, + "step": 23655 + }, + { + "epoch": 73.02472952086553, + "grad_norm": 0.5676479339599609, + "learning_rate": 1.791405122785278e-05, + "loss": 0.2504, + "num_input_tokens_seen": 36139728, + "step": 23660 + }, + { + "epoch": 73.04018547140649, + "grad_norm": 0.5909138321876526, + "learning_rate": 1.7904636906486037e-05, + "loss": 0.225, + "num_input_tokens_seen": 36147344, + "step": 23665 + }, + { + "epoch": 73.05564142194746, + "grad_norm": 0.4249404966831207, + "learning_rate": 1.7895223679313448e-05, + "loss": 0.2344, + "num_input_tokens_seen": 36154832, + "step": 23670 + }, + { + "epoch": 73.07109737248841, + "grad_norm": 0.5632539391517639, + "learning_rate": 1.7885811547786653e-05, + "loss": 0.2888, + "num_input_tokens_seen": 36162704, + "step": 23675 + }, + { + "epoch": 73.08655332302936, + "grad_norm": 0.3886469900608063, + "learning_rate": 1.7876400513357115e-05, + "loss": 0.2384, + "num_input_tokens_seen": 36170512, + "step": 23680 + }, + { + "epoch": 73.10200927357033, + "grad_norm": 0.40153953433036804, + "learning_rate": 1.7866990577476146e-05, + "loss": 0.2026, + "num_input_tokens_seen": 36177936, + "step": 23685 + }, + { + "epoch": 73.11746522411129, + "grad_norm": 0.4825916886329651, + "learning_rate": 1.7857581741594863e-05, + "loss": 0.2452, + "num_input_tokens_seen": 36185232, + "step": 23690 + }, + { + "epoch": 73.13292117465224, + "grad_norm": 0.6779016852378845, + "learning_rate": 1.7848174007164237e-05, + "loss": 0.2297, + "num_input_tokens_seen": 36192624, + "step": 23695 + }, + { + "epoch": 73.14837712519319, + "grad_norm": 0.6004745364189148, + "learning_rate": 1.7838767375635052e-05, + "loss": 0.3441, + "num_input_tokens_seen": 36200112, + "step": 23700 + }, + { + "epoch": 73.16383307573416, + "grad_norm": 0.3946051001548767, + "learning_rate": 1.782936184845793e-05, + "loss": 0.2046, + "num_input_tokens_seen": 36208240, + "step": 23705 + }, + { + "epoch": 73.17928902627511, + "grad_norm": 0.3758413791656494, + "learning_rate": 1.7819957427083334e-05, + "loss": 0.2241, + "num_input_tokens_seen": 36216624, + "step": 23710 + }, + { + "epoch": 73.19474497681607, + "grad_norm": 0.746444046497345, + "learning_rate": 1.7810554112961516e-05, + "loss": 0.2059, + "num_input_tokens_seen": 36224080, + "step": 23715 + }, + { + "epoch": 73.21020092735704, + "grad_norm": 0.4342808425426483, + "learning_rate": 1.7801151907542607e-05, + "loss": 0.2552, + "num_input_tokens_seen": 36231728, + "step": 23720 + }, + { + "epoch": 73.22565687789799, + "grad_norm": 0.4879346191883087, + "learning_rate": 1.7791750812276547e-05, + "loss": 0.2287, + "num_input_tokens_seen": 36239056, + "step": 23725 + }, + { + "epoch": 73.24111282843894, + "grad_norm": 0.836516797542572, + "learning_rate": 1.778235082861309e-05, + "loss": 0.3589, + "num_input_tokens_seen": 36247088, + "step": 23730 + }, + { + "epoch": 73.25656877897991, + "grad_norm": 0.5443766117095947, + "learning_rate": 1.777295195800184e-05, + "loss": 0.2058, + "num_input_tokens_seen": 36254704, + "step": 23735 + }, + { + "epoch": 73.27202472952087, + "grad_norm": 0.5446268320083618, + "learning_rate": 1.7763554201892215e-05, + "loss": 0.2796, + "num_input_tokens_seen": 36262320, + "step": 23740 + }, + { + "epoch": 73.28748068006182, + "grad_norm": 0.7097078561782837, + "learning_rate": 1.7754157561733476e-05, + "loss": 0.3512, + "num_input_tokens_seen": 36269840, + "step": 23745 + }, + { + "epoch": 73.30293663060279, + "grad_norm": 0.5306058526039124, + "learning_rate": 1.7744762038974702e-05, + "loss": 0.3365, + "num_input_tokens_seen": 36277616, + "step": 23750 + }, + { + "epoch": 73.31839258114374, + "grad_norm": 0.6499126553535461, + "learning_rate": 1.7735367635064788e-05, + "loss": 0.2925, + "num_input_tokens_seen": 36285296, + "step": 23755 + }, + { + "epoch": 73.3338485316847, + "grad_norm": 0.7336291670799255, + "learning_rate": 1.7725974351452474e-05, + "loss": 0.2445, + "num_input_tokens_seen": 36293072, + "step": 23760 + }, + { + "epoch": 73.34930448222566, + "grad_norm": 0.6209560036659241, + "learning_rate": 1.771658218958634e-05, + "loss": 0.2097, + "num_input_tokens_seen": 36300368, + "step": 23765 + }, + { + "epoch": 73.36476043276662, + "grad_norm": 0.4360598623752594, + "learning_rate": 1.770719115091475e-05, + "loss": 0.2401, + "num_input_tokens_seen": 36308112, + "step": 23770 + }, + { + "epoch": 73.38021638330757, + "grad_norm": 0.5656053423881531, + "learning_rate": 1.7697801236885935e-05, + "loss": 0.2826, + "num_input_tokens_seen": 36315696, + "step": 23775 + }, + { + "epoch": 73.39567233384854, + "grad_norm": 0.6244055032730103, + "learning_rate": 1.7688412448947944e-05, + "loss": 0.2939, + "num_input_tokens_seen": 36322992, + "step": 23780 + }, + { + "epoch": 73.41112828438949, + "grad_norm": 0.43464428186416626, + "learning_rate": 1.767902478854862e-05, + "loss": 0.2406, + "num_input_tokens_seen": 36330512, + "step": 23785 + }, + { + "epoch": 73.42658423493044, + "grad_norm": 0.5929484963417053, + "learning_rate": 1.766963825713569e-05, + "loss": 0.2322, + "num_input_tokens_seen": 36338896, + "step": 23790 + }, + { + "epoch": 73.44204018547141, + "grad_norm": 0.6658332347869873, + "learning_rate": 1.766025285615665e-05, + "loss": 0.2384, + "num_input_tokens_seen": 36346352, + "step": 23795 + }, + { + "epoch": 73.45749613601237, + "grad_norm": 0.4721815586090088, + "learning_rate": 1.7650868587058854e-05, + "loss": 0.2393, + "num_input_tokens_seen": 36353808, + "step": 23800 + }, + { + "epoch": 73.45749613601237, + "eval_loss": 0.305495947599411, + "eval_runtime": 6.3122, + "eval_samples_per_second": 91.093, + "eval_steps_per_second": 22.813, + "num_input_tokens_seen": 36353808, + "step": 23800 + }, + { + "epoch": 73.47295208655332, + "grad_norm": 0.4762929081916809, + "learning_rate": 1.7641485451289484e-05, + "loss": 0.246, + "num_input_tokens_seen": 36361392, + "step": 23805 + }, + { + "epoch": 73.48840803709429, + "grad_norm": 0.3871055245399475, + "learning_rate": 1.7632103450295534e-05, + "loss": 0.2691, + "num_input_tokens_seen": 36368272, + "step": 23810 + }, + { + "epoch": 73.50386398763524, + "grad_norm": 0.6991453170776367, + "learning_rate": 1.762272258552381e-05, + "loss": 0.2784, + "num_input_tokens_seen": 36376176, + "step": 23815 + }, + { + "epoch": 73.5193199381762, + "grad_norm": 0.5819641351699829, + "learning_rate": 1.7613342858420988e-05, + "loss": 0.2155, + "num_input_tokens_seen": 36383760, + "step": 23820 + }, + { + "epoch": 73.53477588871715, + "grad_norm": 0.58077472448349, + "learning_rate": 1.760396427043351e-05, + "loss": 0.2191, + "num_input_tokens_seen": 36391376, + "step": 23825 + }, + { + "epoch": 73.55023183925812, + "grad_norm": 0.3678111732006073, + "learning_rate": 1.7594586823007696e-05, + "loss": 0.2008, + "num_input_tokens_seen": 36399408, + "step": 23830 + }, + { + "epoch": 73.56568778979907, + "grad_norm": 0.6223919987678528, + "learning_rate": 1.7585210517589646e-05, + "loss": 0.2619, + "num_input_tokens_seen": 36407184, + "step": 23835 + }, + { + "epoch": 73.58114374034002, + "grad_norm": 0.5749629139900208, + "learning_rate": 1.7575835355625314e-05, + "loss": 0.2452, + "num_input_tokens_seen": 36414896, + "step": 23840 + }, + { + "epoch": 73.59659969088099, + "grad_norm": 0.2888401448726654, + "learning_rate": 1.756646133856048e-05, + "loss": 0.2588, + "num_input_tokens_seen": 36422800, + "step": 23845 + }, + { + "epoch": 73.61205564142195, + "grad_norm": 0.416856050491333, + "learning_rate": 1.7557088467840714e-05, + "loss": 0.2379, + "num_input_tokens_seen": 36430960, + "step": 23850 + }, + { + "epoch": 73.6275115919629, + "grad_norm": 0.960756778717041, + "learning_rate": 1.7547716744911438e-05, + "loss": 0.2299, + "num_input_tokens_seen": 36438672, + "step": 23855 + }, + { + "epoch": 73.64296754250387, + "grad_norm": 0.6019672751426697, + "learning_rate": 1.7538346171217902e-05, + "loss": 0.2658, + "num_input_tokens_seen": 36446160, + "step": 23860 + }, + { + "epoch": 73.65842349304482, + "grad_norm": 0.6907082796096802, + "learning_rate": 1.7528976748205146e-05, + "loss": 0.2518, + "num_input_tokens_seen": 36453648, + "step": 23865 + }, + { + "epoch": 73.67387944358578, + "grad_norm": 0.3092101812362671, + "learning_rate": 1.751960847731807e-05, + "loss": 0.267, + "num_input_tokens_seen": 36461008, + "step": 23870 + }, + { + "epoch": 73.68933539412674, + "grad_norm": 0.6143779754638672, + "learning_rate": 1.7510241360001362e-05, + "loss": 0.2344, + "num_input_tokens_seen": 36468560, + "step": 23875 + }, + { + "epoch": 73.7047913446677, + "grad_norm": 0.5181480050086975, + "learning_rate": 1.7500875397699562e-05, + "loss": 0.255, + "num_input_tokens_seen": 36476304, + "step": 23880 + }, + { + "epoch": 73.72024729520865, + "grad_norm": 0.3895101547241211, + "learning_rate": 1.7491510591857015e-05, + "loss": 0.2376, + "num_input_tokens_seen": 36484048, + "step": 23885 + }, + { + "epoch": 73.73570324574962, + "grad_norm": 0.3903208374977112, + "learning_rate": 1.7482146943917896e-05, + "loss": 0.2007, + "num_input_tokens_seen": 36491600, + "step": 23890 + }, + { + "epoch": 73.75115919629057, + "grad_norm": 0.43143391609191895, + "learning_rate": 1.7472784455326185e-05, + "loss": 0.2538, + "num_input_tokens_seen": 36499056, + "step": 23895 + }, + { + "epoch": 73.76661514683153, + "grad_norm": 0.5568698048591614, + "learning_rate": 1.746342312752572e-05, + "loss": 0.254, + "num_input_tokens_seen": 36506576, + "step": 23900 + }, + { + "epoch": 73.7820710973725, + "grad_norm": 0.40347081422805786, + "learning_rate": 1.74540629619601e-05, + "loss": 0.2017, + "num_input_tokens_seen": 36514576, + "step": 23905 + }, + { + "epoch": 73.79752704791345, + "grad_norm": 0.576567530632019, + "learning_rate": 1.7444703960072815e-05, + "loss": 0.2548, + "num_input_tokens_seen": 36522448, + "step": 23910 + }, + { + "epoch": 73.8129829984544, + "grad_norm": 0.546377420425415, + "learning_rate": 1.7435346123307118e-05, + "loss": 0.2391, + "num_input_tokens_seen": 36530000, + "step": 23915 + }, + { + "epoch": 73.82843894899537, + "grad_norm": 0.6257656812667847, + "learning_rate": 1.742598945310611e-05, + "loss": 0.2304, + "num_input_tokens_seen": 36537776, + "step": 23920 + }, + { + "epoch": 73.84389489953632, + "grad_norm": 0.47811415791511536, + "learning_rate": 1.741663395091272e-05, + "loss": 0.3017, + "num_input_tokens_seen": 36545904, + "step": 23925 + }, + { + "epoch": 73.85935085007728, + "grad_norm": 0.40091672539711, + "learning_rate": 1.7407279618169657e-05, + "loss": 0.2363, + "num_input_tokens_seen": 36553872, + "step": 23930 + }, + { + "epoch": 73.87480680061825, + "grad_norm": 0.5424801111221313, + "learning_rate": 1.73979264563195e-05, + "loss": 0.2259, + "num_input_tokens_seen": 36561392, + "step": 23935 + }, + { + "epoch": 73.8902627511592, + "grad_norm": 0.42946842312812805, + "learning_rate": 1.7388574466804625e-05, + "loss": 0.2134, + "num_input_tokens_seen": 36569296, + "step": 23940 + }, + { + "epoch": 73.90571870170015, + "grad_norm": 0.490125834941864, + "learning_rate": 1.7379223651067207e-05, + "loss": 0.2625, + "num_input_tokens_seen": 36576656, + "step": 23945 + }, + { + "epoch": 73.9211746522411, + "grad_norm": 0.5724729299545288, + "learning_rate": 1.736987401054928e-05, + "loss": 0.2587, + "num_input_tokens_seen": 36584656, + "step": 23950 + }, + { + "epoch": 73.93663060278207, + "grad_norm": 0.6594762206077576, + "learning_rate": 1.736052554669266e-05, + "loss": 0.2183, + "num_input_tokens_seen": 36592432, + "step": 23955 + }, + { + "epoch": 73.95208655332303, + "grad_norm": 0.5345464944839478, + "learning_rate": 1.7351178260939007e-05, + "loss": 0.2587, + "num_input_tokens_seen": 36600080, + "step": 23960 + }, + { + "epoch": 73.96754250386398, + "grad_norm": 0.5690375566482544, + "learning_rate": 1.7341832154729794e-05, + "loss": 0.3293, + "num_input_tokens_seen": 36608048, + "step": 23965 + }, + { + "epoch": 73.98299845440495, + "grad_norm": 0.9856212139129639, + "learning_rate": 1.7332487229506286e-05, + "loss": 0.2385, + "num_input_tokens_seen": 36615632, + "step": 23970 + }, + { + "epoch": 73.9984544049459, + "grad_norm": 0.7155455350875854, + "learning_rate": 1.732314348670961e-05, + "loss": 0.2859, + "num_input_tokens_seen": 36623248, + "step": 23975 + }, + { + "epoch": 74.01236476043276, + "grad_norm": 0.440483421087265, + "learning_rate": 1.7313800927780686e-05, + "loss": 0.1935, + "num_input_tokens_seen": 36629904, + "step": 23980 + }, + { + "epoch": 74.02782071097373, + "grad_norm": 0.6012100577354431, + "learning_rate": 1.7304459554160245e-05, + "loss": 0.2531, + "num_input_tokens_seen": 36637904, + "step": 23985 + }, + { + "epoch": 74.04327666151468, + "grad_norm": 0.745444118976593, + "learning_rate": 1.7295119367288853e-05, + "loss": 0.2722, + "num_input_tokens_seen": 36645360, + "step": 23990 + }, + { + "epoch": 74.05873261205564, + "grad_norm": 0.6655027866363525, + "learning_rate": 1.728578036860688e-05, + "loss": 0.2072, + "num_input_tokens_seen": 36652880, + "step": 23995 + }, + { + "epoch": 74.0741885625966, + "grad_norm": 0.692054033279419, + "learning_rate": 1.7276442559554513e-05, + "loss": 0.2728, + "num_input_tokens_seen": 36660560, + "step": 24000 + }, + { + "epoch": 74.0741885625966, + "eval_loss": 0.3063768744468689, + "eval_runtime": 6.296, + "eval_samples_per_second": 91.328, + "eval_steps_per_second": 22.872, + "num_input_tokens_seen": 36660560, + "step": 24000 + }, + { + "epoch": 74.08964451313756, + "grad_norm": 0.9546911716461182, + "learning_rate": 1.726710594157177e-05, + "loss": 0.271, + "num_input_tokens_seen": 36668592, + "step": 24005 + }, + { + "epoch": 74.10510046367851, + "grad_norm": 0.28345057368278503, + "learning_rate": 1.725777051609846e-05, + "loss": 0.2232, + "num_input_tokens_seen": 36676592, + "step": 24010 + }, + { + "epoch": 74.12055641421948, + "grad_norm": 0.44053223729133606, + "learning_rate": 1.7248436284574228e-05, + "loss": 0.278, + "num_input_tokens_seen": 36684496, + "step": 24015 + }, + { + "epoch": 74.13601236476043, + "grad_norm": 0.3379259705543518, + "learning_rate": 1.723910324843855e-05, + "loss": 0.2735, + "num_input_tokens_seen": 36691888, + "step": 24020 + }, + { + "epoch": 74.15146831530139, + "grad_norm": 1.1092630624771118, + "learning_rate": 1.722977140913067e-05, + "loss": 0.2549, + "num_input_tokens_seen": 36699728, + "step": 24025 + }, + { + "epoch": 74.16692426584235, + "grad_norm": 0.31825917959213257, + "learning_rate": 1.7220440768089688e-05, + "loss": 0.2655, + "num_input_tokens_seen": 36707440, + "step": 24030 + }, + { + "epoch": 74.18238021638331, + "grad_norm": 0.581413984298706, + "learning_rate": 1.7211111326754505e-05, + "loss": 0.219, + "num_input_tokens_seen": 36714800, + "step": 24035 + }, + { + "epoch": 74.19783616692426, + "grad_norm": 0.5505969524383545, + "learning_rate": 1.720178308656383e-05, + "loss": 0.2278, + "num_input_tokens_seen": 36723024, + "step": 24040 + }, + { + "epoch": 74.21329211746523, + "grad_norm": 0.4545193910598755, + "learning_rate": 1.719245604895621e-05, + "loss": 0.2592, + "num_input_tokens_seen": 36730704, + "step": 24045 + }, + { + "epoch": 74.22874806800618, + "grad_norm": 0.4190310835838318, + "learning_rate": 1.7183130215369972e-05, + "loss": 0.2251, + "num_input_tokens_seen": 36738480, + "step": 24050 + }, + { + "epoch": 74.24420401854714, + "grad_norm": 0.6643664836883545, + "learning_rate": 1.7173805587243292e-05, + "loss": 0.2864, + "num_input_tokens_seen": 36745744, + "step": 24055 + }, + { + "epoch": 74.2596599690881, + "grad_norm": 0.4879671335220337, + "learning_rate": 1.7164482166014147e-05, + "loss": 0.2305, + "num_input_tokens_seen": 36753424, + "step": 24060 + }, + { + "epoch": 74.27511591962906, + "grad_norm": 0.7362693548202515, + "learning_rate": 1.7155159953120313e-05, + "loss": 0.2769, + "num_input_tokens_seen": 36761392, + "step": 24065 + }, + { + "epoch": 74.29057187017001, + "grad_norm": 0.5100319385528564, + "learning_rate": 1.714583894999941e-05, + "loss": 0.2312, + "num_input_tokens_seen": 36768816, + "step": 24070 + }, + { + "epoch": 74.30602782071098, + "grad_norm": 0.523606538772583, + "learning_rate": 1.7136519158088826e-05, + "loss": 0.201, + "num_input_tokens_seen": 36776656, + "step": 24075 + }, + { + "epoch": 74.32148377125193, + "grad_norm": 0.36157673597335815, + "learning_rate": 1.712720057882581e-05, + "loss": 0.2221, + "num_input_tokens_seen": 36784432, + "step": 24080 + }, + { + "epoch": 74.33693972179289, + "grad_norm": 0.3613783121109009, + "learning_rate": 1.7117883213647413e-05, + "loss": 0.2305, + "num_input_tokens_seen": 36792752, + "step": 24085 + }, + { + "epoch": 74.35239567233384, + "grad_norm": 0.4700733721256256, + "learning_rate": 1.710856706399046e-05, + "loss": 0.2124, + "num_input_tokens_seen": 36800720, + "step": 24090 + }, + { + "epoch": 74.36785162287481, + "grad_norm": 0.5567167401313782, + "learning_rate": 1.7099252131291648e-05, + "loss": 0.1974, + "num_input_tokens_seen": 36808016, + "step": 24095 + }, + { + "epoch": 74.38330757341576, + "grad_norm": 0.5010637640953064, + "learning_rate": 1.708993841698744e-05, + "loss": 0.299, + "num_input_tokens_seen": 36815888, + "step": 24100 + }, + { + "epoch": 74.39876352395672, + "grad_norm": 0.48602423071861267, + "learning_rate": 1.7080625922514132e-05, + "loss": 0.3043, + "num_input_tokens_seen": 36823920, + "step": 24105 + }, + { + "epoch": 74.41421947449768, + "grad_norm": 0.49205702543258667, + "learning_rate": 1.7071314649307836e-05, + "loss": 0.2627, + "num_input_tokens_seen": 36831728, + "step": 24110 + }, + { + "epoch": 74.42967542503864, + "grad_norm": 0.6427831053733826, + "learning_rate": 1.7062004598804448e-05, + "loss": 0.2453, + "num_input_tokens_seen": 36839440, + "step": 24115 + }, + { + "epoch": 74.44513137557959, + "grad_norm": 0.428708553314209, + "learning_rate": 1.7052695772439702e-05, + "loss": 0.2536, + "num_input_tokens_seen": 36846992, + "step": 24120 + }, + { + "epoch": 74.46058732612056, + "grad_norm": 0.5512096285820007, + "learning_rate": 1.7043388171649154e-05, + "loss": 0.3355, + "num_input_tokens_seen": 36854544, + "step": 24125 + }, + { + "epoch": 74.47604327666151, + "grad_norm": 0.5539300441741943, + "learning_rate": 1.7034081797868127e-05, + "loss": 0.2342, + "num_input_tokens_seen": 36862416, + "step": 24130 + }, + { + "epoch": 74.49149922720247, + "grad_norm": 0.3157656192779541, + "learning_rate": 1.70247766525318e-05, + "loss": 0.2123, + "num_input_tokens_seen": 36870672, + "step": 24135 + }, + { + "epoch": 74.50695517774344, + "grad_norm": 0.4145858585834503, + "learning_rate": 1.701547273707514e-05, + "loss": 0.2563, + "num_input_tokens_seen": 36878192, + "step": 24140 + }, + { + "epoch": 74.52241112828439, + "grad_norm": 0.44974884390830994, + "learning_rate": 1.7006170052932916e-05, + "loss": 0.2093, + "num_input_tokens_seen": 36885840, + "step": 24145 + }, + { + "epoch": 74.53786707882534, + "grad_norm": 0.46878674626350403, + "learning_rate": 1.6996868601539735e-05, + "loss": 0.3154, + "num_input_tokens_seen": 36893392, + "step": 24150 + }, + { + "epoch": 74.55332302936631, + "grad_norm": 0.4970663785934448, + "learning_rate": 1.6987568384329977e-05, + "loss": 0.2211, + "num_input_tokens_seen": 36901168, + "step": 24155 + }, + { + "epoch": 74.56877897990726, + "grad_norm": 0.603671669960022, + "learning_rate": 1.6978269402737866e-05, + "loss": 0.2256, + "num_input_tokens_seen": 36908272, + "step": 24160 + }, + { + "epoch": 74.58423493044822, + "grad_norm": 0.4218437373638153, + "learning_rate": 1.696897165819743e-05, + "loss": 0.2801, + "num_input_tokens_seen": 36916016, + "step": 24165 + }, + { + "epoch": 74.59969088098919, + "grad_norm": 0.5621456503868103, + "learning_rate": 1.6959675152142487e-05, + "loss": 0.2308, + "num_input_tokens_seen": 36923440, + "step": 24170 + }, + { + "epoch": 74.61514683153014, + "grad_norm": 0.4669077694416046, + "learning_rate": 1.6950379886006667e-05, + "loss": 0.2021, + "num_input_tokens_seen": 36930640, + "step": 24175 + }, + { + "epoch": 74.6306027820711, + "grad_norm": 0.4034515619277954, + "learning_rate": 1.6941085861223438e-05, + "loss": 0.249, + "num_input_tokens_seen": 36938128, + "step": 24180 + }, + { + "epoch": 74.64605873261206, + "grad_norm": 0.39460137486457825, + "learning_rate": 1.6931793079226034e-05, + "loss": 0.2346, + "num_input_tokens_seen": 36946160, + "step": 24185 + }, + { + "epoch": 74.66151468315302, + "grad_norm": 0.4555318355560303, + "learning_rate": 1.692250154144754e-05, + "loss": 0.2181, + "num_input_tokens_seen": 36953712, + "step": 24190 + }, + { + "epoch": 74.67697063369397, + "grad_norm": 0.41687843203544617, + "learning_rate": 1.6913211249320807e-05, + "loss": 0.2218, + "num_input_tokens_seen": 36960976, + "step": 24195 + }, + { + "epoch": 74.69242658423494, + "grad_norm": 0.41272610425949097, + "learning_rate": 1.6903922204278522e-05, + "loss": 0.2662, + "num_input_tokens_seen": 36968464, + "step": 24200 + }, + { + "epoch": 74.69242658423494, + "eval_loss": 0.3074624240398407, + "eval_runtime": 6.3068, + "eval_samples_per_second": 91.171, + "eval_steps_per_second": 22.832, + "num_input_tokens_seen": 36968464, + "step": 24200 + }, + { + "epoch": 74.70788253477589, + "grad_norm": 0.6340409517288208, + "learning_rate": 1.6894634407753186e-05, + "loss": 0.2811, + "num_input_tokens_seen": 36976144, + "step": 24205 + }, + { + "epoch": 74.72333848531684, + "grad_norm": 0.3362922668457031, + "learning_rate": 1.6885347861177077e-05, + "loss": 0.2953, + "num_input_tokens_seen": 36983696, + "step": 24210 + }, + { + "epoch": 74.7387944358578, + "grad_norm": 0.593023419380188, + "learning_rate": 1.6876062565982298e-05, + "loss": 0.287, + "num_input_tokens_seen": 36990800, + "step": 24215 + }, + { + "epoch": 74.75425038639877, + "grad_norm": 0.5338794589042664, + "learning_rate": 1.6866778523600774e-05, + "loss": 0.2573, + "num_input_tokens_seen": 36998064, + "step": 24220 + }, + { + "epoch": 74.76970633693972, + "grad_norm": 0.49575600028038025, + "learning_rate": 1.6857495735464195e-05, + "loss": 0.2162, + "num_input_tokens_seen": 37006160, + "step": 24225 + }, + { + "epoch": 74.78516228748067, + "grad_norm": 0.5857529640197754, + "learning_rate": 1.6848214203004115e-05, + "loss": 0.2359, + "num_input_tokens_seen": 37014032, + "step": 24230 + }, + { + "epoch": 74.80061823802164, + "grad_norm": 0.6639948487281799, + "learning_rate": 1.6838933927651835e-05, + "loss": 0.2125, + "num_input_tokens_seen": 37021776, + "step": 24235 + }, + { + "epoch": 74.8160741885626, + "grad_norm": 0.4958662688732147, + "learning_rate": 1.6829654910838506e-05, + "loss": 0.2857, + "num_input_tokens_seen": 37029168, + "step": 24240 + }, + { + "epoch": 74.83153013910355, + "grad_norm": 0.8273560404777527, + "learning_rate": 1.6820377153995065e-05, + "loss": 0.2831, + "num_input_tokens_seen": 37036880, + "step": 24245 + }, + { + "epoch": 74.84698608964452, + "grad_norm": 0.5287346839904785, + "learning_rate": 1.681110065855226e-05, + "loss": 0.2673, + "num_input_tokens_seen": 37045168, + "step": 24250 + }, + { + "epoch": 74.86244204018547, + "grad_norm": 0.29660189151763916, + "learning_rate": 1.6801825425940642e-05, + "loss": 0.3464, + "num_input_tokens_seen": 37053552, + "step": 24255 + }, + { + "epoch": 74.87789799072642, + "grad_norm": 0.3601396977901459, + "learning_rate": 1.679255145759056e-05, + "loss": 0.2297, + "num_input_tokens_seen": 37060528, + "step": 24260 + }, + { + "epoch": 74.89335394126739, + "grad_norm": 0.5305078029632568, + "learning_rate": 1.6783278754932187e-05, + "loss": 0.2213, + "num_input_tokens_seen": 37067792, + "step": 24265 + }, + { + "epoch": 74.90880989180835, + "grad_norm": 0.5115076899528503, + "learning_rate": 1.6774007319395496e-05, + "loss": 0.3259, + "num_input_tokens_seen": 37075472, + "step": 24270 + }, + { + "epoch": 74.9242658423493, + "grad_norm": 0.5244728922843933, + "learning_rate": 1.6764737152410243e-05, + "loss": 0.2555, + "num_input_tokens_seen": 37083056, + "step": 24275 + }, + { + "epoch": 74.93972179289027, + "grad_norm": 0.34727954864501953, + "learning_rate": 1.6755468255406016e-05, + "loss": 0.2794, + "num_input_tokens_seen": 37090192, + "step": 24280 + }, + { + "epoch": 74.95517774343122, + "grad_norm": 0.6615221500396729, + "learning_rate": 1.674620062981219e-05, + "loss": 0.2501, + "num_input_tokens_seen": 37097968, + "step": 24285 + }, + { + "epoch": 74.97063369397218, + "grad_norm": 0.48460307717323303, + "learning_rate": 1.6736934277057947e-05, + "loss": 0.2546, + "num_input_tokens_seen": 37105232, + "step": 24290 + }, + { + "epoch": 74.98608964451314, + "grad_norm": 0.39496272802352905, + "learning_rate": 1.6727669198572286e-05, + "loss": 0.2039, + "num_input_tokens_seen": 37112944, + "step": 24295 + }, + { + "epoch": 75.0, + "grad_norm": 1.26399827003479, + "learning_rate": 1.6718405395783984e-05, + "loss": 0.269, + "num_input_tokens_seen": 37119728, + "step": 24300 + }, + { + "epoch": 75.01545595054095, + "grad_norm": 0.5346540808677673, + "learning_rate": 1.6709142870121643e-05, + "loss": 0.2178, + "num_input_tokens_seen": 37127088, + "step": 24305 + }, + { + "epoch": 75.03091190108192, + "grad_norm": 0.5794314742088318, + "learning_rate": 1.669988162301367e-05, + "loss": 0.24, + "num_input_tokens_seen": 37134992, + "step": 24310 + }, + { + "epoch": 75.04636785162288, + "grad_norm": 0.6130228042602539, + "learning_rate": 1.6690621655888243e-05, + "loss": 0.2728, + "num_input_tokens_seen": 37143184, + "step": 24315 + }, + { + "epoch": 75.06182380216383, + "grad_norm": 0.4233796298503876, + "learning_rate": 1.6681362970173386e-05, + "loss": 0.2344, + "num_input_tokens_seen": 37151184, + "step": 24320 + }, + { + "epoch": 75.0772797527048, + "grad_norm": 0.4224173426628113, + "learning_rate": 1.6672105567296904e-05, + "loss": 0.2302, + "num_input_tokens_seen": 37158672, + "step": 24325 + }, + { + "epoch": 75.09273570324575, + "grad_norm": 0.6084365248680115, + "learning_rate": 1.666284944868639e-05, + "loss": 0.3179, + "num_input_tokens_seen": 37166544, + "step": 24330 + }, + { + "epoch": 75.1081916537867, + "grad_norm": 0.6291736960411072, + "learning_rate": 1.665359461576927e-05, + "loss": 0.2344, + "num_input_tokens_seen": 37173968, + "step": 24335 + }, + { + "epoch": 75.12364760432767, + "grad_norm": 0.2939152419567108, + "learning_rate": 1.6644341069972736e-05, + "loss": 0.2617, + "num_input_tokens_seen": 37181264, + "step": 24340 + }, + { + "epoch": 75.13910355486863, + "grad_norm": 0.4251011610031128, + "learning_rate": 1.6635088812723813e-05, + "loss": 0.2327, + "num_input_tokens_seen": 37188848, + "step": 24345 + }, + { + "epoch": 75.15455950540958, + "grad_norm": 0.70240318775177, + "learning_rate": 1.6625837845449328e-05, + "loss": 0.2237, + "num_input_tokens_seen": 37196304, + "step": 24350 + }, + { + "epoch": 75.17001545595055, + "grad_norm": 0.4514673054218292, + "learning_rate": 1.6616588169575874e-05, + "loss": 0.2916, + "num_input_tokens_seen": 37204048, + "step": 24355 + }, + { + "epoch": 75.1854714064915, + "grad_norm": 0.5898910164833069, + "learning_rate": 1.6607339786529878e-05, + "loss": 0.25, + "num_input_tokens_seen": 37212304, + "step": 24360 + }, + { + "epoch": 75.20092735703246, + "grad_norm": 0.48809707164764404, + "learning_rate": 1.659809269773756e-05, + "loss": 0.2418, + "num_input_tokens_seen": 37220016, + "step": 24365 + }, + { + "epoch": 75.21638330757341, + "grad_norm": 0.4688980281352997, + "learning_rate": 1.658884690462493e-05, + "loss": 0.2442, + "num_input_tokens_seen": 37227248, + "step": 24370 + }, + { + "epoch": 75.23183925811438, + "grad_norm": 0.2644396722316742, + "learning_rate": 1.6579602408617813e-05, + "loss": 0.2876, + "num_input_tokens_seen": 37234768, + "step": 24375 + }, + { + "epoch": 75.24729520865533, + "grad_norm": 0.3695697486400604, + "learning_rate": 1.657035921114181e-05, + "loss": 0.2405, + "num_input_tokens_seen": 37242864, + "step": 24380 + }, + { + "epoch": 75.26275115919628, + "grad_norm": 0.572852373123169, + "learning_rate": 1.656111731362236e-05, + "loss": 0.2396, + "num_input_tokens_seen": 37250608, + "step": 24385 + }, + { + "epoch": 75.27820710973725, + "grad_norm": 0.42267081141471863, + "learning_rate": 1.6551876717484666e-05, + "loss": 0.1894, + "num_input_tokens_seen": 37258000, + "step": 24390 + }, + { + "epoch": 75.2936630602782, + "grad_norm": 0.7012901902198792, + "learning_rate": 1.6542637424153752e-05, + "loss": 0.2754, + "num_input_tokens_seen": 37265360, + "step": 24395 + }, + { + "epoch": 75.30911901081916, + "grad_norm": 0.6612425446510315, + "learning_rate": 1.6533399435054418e-05, + "loss": 0.2926, + "num_input_tokens_seen": 37273264, + "step": 24400 + }, + { + "epoch": 75.30911901081916, + "eval_loss": 0.3048773407936096, + "eval_runtime": 6.3136, + "eval_samples_per_second": 91.074, + "eval_steps_per_second": 22.808, + "num_input_tokens_seen": 37273264, + "step": 24400 + }, + { + "epoch": 75.32457496136013, + "grad_norm": 0.6593853831291199, + "learning_rate": 1.6524162751611304e-05, + "loss": 0.2564, + "num_input_tokens_seen": 37280720, + "step": 24405 + }, + { + "epoch": 75.34003091190108, + "grad_norm": 0.293259859085083, + "learning_rate": 1.6514927375248796e-05, + "loss": 0.2823, + "num_input_tokens_seen": 37288144, + "step": 24410 + }, + { + "epoch": 75.35548686244204, + "grad_norm": 0.5149055123329163, + "learning_rate": 1.6505693307391127e-05, + "loss": 0.2632, + "num_input_tokens_seen": 37295888, + "step": 24415 + }, + { + "epoch": 75.370942812983, + "grad_norm": 0.3532363772392273, + "learning_rate": 1.6496460549462288e-05, + "loss": 0.2038, + "num_input_tokens_seen": 37303376, + "step": 24420 + }, + { + "epoch": 75.38639876352396, + "grad_norm": 0.5233265161514282, + "learning_rate": 1.6487229102886097e-05, + "loss": 0.1893, + "num_input_tokens_seen": 37310608, + "step": 24425 + }, + { + "epoch": 75.40185471406491, + "grad_norm": 0.4452745318412781, + "learning_rate": 1.6477998969086155e-05, + "loss": 0.2632, + "num_input_tokens_seen": 37318576, + "step": 24430 + }, + { + "epoch": 75.41731066460588, + "grad_norm": 0.4065243601799011, + "learning_rate": 1.646877014948587e-05, + "loss": 0.2498, + "num_input_tokens_seen": 37326096, + "step": 24435 + }, + { + "epoch": 75.43276661514683, + "grad_norm": 0.37811508774757385, + "learning_rate": 1.6459542645508433e-05, + "loss": 0.2547, + "num_input_tokens_seen": 37333584, + "step": 24440 + }, + { + "epoch": 75.44822256568779, + "grad_norm": 0.42128682136535645, + "learning_rate": 1.6450316458576852e-05, + "loss": 0.2437, + "num_input_tokens_seen": 37341936, + "step": 24445 + }, + { + "epoch": 75.46367851622875, + "grad_norm": 0.5133332014083862, + "learning_rate": 1.6441091590113912e-05, + "loss": 0.2442, + "num_input_tokens_seen": 37349488, + "step": 24450 + }, + { + "epoch": 75.47913446676971, + "grad_norm": 0.4565753638744354, + "learning_rate": 1.6431868041542213e-05, + "loss": 0.2373, + "num_input_tokens_seen": 37357136, + "step": 24455 + }, + { + "epoch": 75.49459041731066, + "grad_norm": 0.6727443933486938, + "learning_rate": 1.6422645814284123e-05, + "loss": 0.2275, + "num_input_tokens_seen": 37364240, + "step": 24460 + }, + { + "epoch": 75.51004636785163, + "grad_norm": 0.4170835018157959, + "learning_rate": 1.6413424909761846e-05, + "loss": 0.2968, + "num_input_tokens_seen": 37371728, + "step": 24465 + }, + { + "epoch": 75.52550231839258, + "grad_norm": 0.6567391753196716, + "learning_rate": 1.640420532939736e-05, + "loss": 0.2526, + "num_input_tokens_seen": 37379248, + "step": 24470 + }, + { + "epoch": 75.54095826893354, + "grad_norm": 0.6550217866897583, + "learning_rate": 1.639498707461242e-05, + "loss": 0.3216, + "num_input_tokens_seen": 37387120, + "step": 24475 + }, + { + "epoch": 75.5564142194745, + "grad_norm": 1.0170778036117554, + "learning_rate": 1.6385770146828614e-05, + "loss": 0.281, + "num_input_tokens_seen": 37394992, + "step": 24480 + }, + { + "epoch": 75.57187017001546, + "grad_norm": 0.5484082102775574, + "learning_rate": 1.637655454746731e-05, + "loss": 0.3009, + "num_input_tokens_seen": 37402704, + "step": 24485 + }, + { + "epoch": 75.58732612055641, + "grad_norm": 0.3882492780685425, + "learning_rate": 1.6367340277949658e-05, + "loss": 0.2268, + "num_input_tokens_seen": 37410160, + "step": 24490 + }, + { + "epoch": 75.60278207109737, + "grad_norm": 0.27243852615356445, + "learning_rate": 1.635812733969663e-05, + "loss": 0.2297, + "num_input_tokens_seen": 37417904, + "step": 24495 + }, + { + "epoch": 75.61823802163833, + "grad_norm": 0.6461927890777588, + "learning_rate": 1.634891573412896e-05, + "loss": 0.2427, + "num_input_tokens_seen": 37425040, + "step": 24500 + }, + { + "epoch": 75.63369397217929, + "grad_norm": 0.3364003896713257, + "learning_rate": 1.6339705462667196e-05, + "loss": 0.214, + "num_input_tokens_seen": 37432624, + "step": 24505 + }, + { + "epoch": 75.64914992272024, + "grad_norm": 0.41560742259025574, + "learning_rate": 1.633049652673169e-05, + "loss": 0.2681, + "num_input_tokens_seen": 37440176, + "step": 24510 + }, + { + "epoch": 75.66460587326121, + "grad_norm": 0.7718905210494995, + "learning_rate": 1.632128892774256e-05, + "loss": 0.2636, + "num_input_tokens_seen": 37448080, + "step": 24515 + }, + { + "epoch": 75.68006182380216, + "grad_norm": 0.58321213722229, + "learning_rate": 1.6312082667119737e-05, + "loss": 0.2433, + "num_input_tokens_seen": 37455248, + "step": 24520 + }, + { + "epoch": 75.69551777434312, + "grad_norm": 0.3823283016681671, + "learning_rate": 1.630287774628296e-05, + "loss": 0.2118, + "num_input_tokens_seen": 37462704, + "step": 24525 + }, + { + "epoch": 75.71097372488408, + "grad_norm": 0.6004847884178162, + "learning_rate": 1.6293674166651718e-05, + "loss": 0.2485, + "num_input_tokens_seen": 37470768, + "step": 24530 + }, + { + "epoch": 75.72642967542504, + "grad_norm": 0.3978109061717987, + "learning_rate": 1.6284471929645338e-05, + "loss": 0.279, + "num_input_tokens_seen": 37478160, + "step": 24535 + }, + { + "epoch": 75.74188562596599, + "grad_norm": 0.45491957664489746, + "learning_rate": 1.627527103668291e-05, + "loss": 0.2086, + "num_input_tokens_seen": 37486192, + "step": 24540 + }, + { + "epoch": 75.75734157650696, + "grad_norm": 0.5311907529830933, + "learning_rate": 1.6266071489183327e-05, + "loss": 0.2871, + "num_input_tokens_seen": 37494096, + "step": 24545 + }, + { + "epoch": 75.77279752704791, + "grad_norm": 0.766846776008606, + "learning_rate": 1.6256873288565283e-05, + "loss": 0.2278, + "num_input_tokens_seen": 37502320, + "step": 24550 + }, + { + "epoch": 75.78825347758887, + "grad_norm": 0.4884554147720337, + "learning_rate": 1.6247676436247245e-05, + "loss": 0.3017, + "num_input_tokens_seen": 37510320, + "step": 24555 + }, + { + "epoch": 75.80370942812984, + "grad_norm": 0.5192103385925293, + "learning_rate": 1.6238480933647486e-05, + "loss": 0.2255, + "num_input_tokens_seen": 37518064, + "step": 24560 + }, + { + "epoch": 75.81916537867079, + "grad_norm": 0.6128157377243042, + "learning_rate": 1.6229286782184083e-05, + "loss": 0.2317, + "num_input_tokens_seen": 37525616, + "step": 24565 + }, + { + "epoch": 75.83462132921174, + "grad_norm": 0.9141660332679749, + "learning_rate": 1.622009398327487e-05, + "loss": 0.2498, + "num_input_tokens_seen": 37533328, + "step": 24570 + }, + { + "epoch": 75.85007727975271, + "grad_norm": 0.6092883348464966, + "learning_rate": 1.6210902538337502e-05, + "loss": 0.2495, + "num_input_tokens_seen": 37540784, + "step": 24575 + }, + { + "epoch": 75.86553323029366, + "grad_norm": 0.4272138178348541, + "learning_rate": 1.6201712448789413e-05, + "loss": 0.2534, + "num_input_tokens_seen": 37548592, + "step": 24580 + }, + { + "epoch": 75.88098918083462, + "grad_norm": 0.45964479446411133, + "learning_rate": 1.6192523716047827e-05, + "loss": 0.238, + "num_input_tokens_seen": 37556144, + "step": 24585 + }, + { + "epoch": 75.89644513137559, + "grad_norm": 0.288462370634079, + "learning_rate": 1.6183336341529776e-05, + "loss": 0.2245, + "num_input_tokens_seen": 37563664, + "step": 24590 + }, + { + "epoch": 75.91190108191654, + "grad_norm": 0.2763141989707947, + "learning_rate": 1.6174150326652047e-05, + "loss": 0.1779, + "num_input_tokens_seen": 37571376, + "step": 24595 + }, + { + "epoch": 75.9273570324575, + "grad_norm": 0.5384166836738586, + "learning_rate": 1.6164965672831256e-05, + "loss": 0.2, + "num_input_tokens_seen": 37578896, + "step": 24600 + }, + { + "epoch": 75.9273570324575, + "eval_loss": 0.30473682284355164, + "eval_runtime": 6.3145, + "eval_samples_per_second": 91.06, + "eval_steps_per_second": 22.804, + "num_input_tokens_seen": 37578896, + "step": 24600 + }, + { + "epoch": 75.94281298299846, + "grad_norm": 0.3573700189590454, + "learning_rate": 1.6155782381483784e-05, + "loss": 0.2666, + "num_input_tokens_seen": 37586384, + "step": 24605 + }, + { + "epoch": 75.95826893353942, + "grad_norm": 0.5487555265426636, + "learning_rate": 1.6146600454025813e-05, + "loss": 0.303, + "num_input_tokens_seen": 37593648, + "step": 24610 + }, + { + "epoch": 75.97372488408037, + "grad_norm": 0.9809728860855103, + "learning_rate": 1.6137419891873317e-05, + "loss": 0.2727, + "num_input_tokens_seen": 37601296, + "step": 24615 + }, + { + "epoch": 75.98918083462132, + "grad_norm": 0.7332598567008972, + "learning_rate": 1.6128240696442038e-05, + "loss": 0.2574, + "num_input_tokens_seen": 37609744, + "step": 24620 + }, + { + "epoch": 76.0030911901082, + "grad_norm": 0.6711234450340271, + "learning_rate": 1.611906286914753e-05, + "loss": 0.3049, + "num_input_tokens_seen": 37616016, + "step": 24625 + }, + { + "epoch": 76.01854714064915, + "grad_norm": 0.6263713240623474, + "learning_rate": 1.6109886411405144e-05, + "loss": 0.3318, + "num_input_tokens_seen": 37623760, + "step": 24630 + }, + { + "epoch": 76.03400309119012, + "grad_norm": 0.43879038095474243, + "learning_rate": 1.6100711324629985e-05, + "loss": 0.2215, + "num_input_tokens_seen": 37631216, + "step": 24635 + }, + { + "epoch": 76.04945904173107, + "grad_norm": 0.7045154571533203, + "learning_rate": 1.609153761023698e-05, + "loss": 0.2505, + "num_input_tokens_seen": 37638800, + "step": 24640 + }, + { + "epoch": 76.06491499227202, + "grad_norm": 0.6252254843711853, + "learning_rate": 1.608236526964083e-05, + "loss": 0.2333, + "num_input_tokens_seen": 37646640, + "step": 24645 + }, + { + "epoch": 76.08037094281298, + "grad_norm": 0.36040356755256653, + "learning_rate": 1.607319430425601e-05, + "loss": 0.193, + "num_input_tokens_seen": 37654544, + "step": 24650 + }, + { + "epoch": 76.09582689335394, + "grad_norm": 0.8152958154678345, + "learning_rate": 1.606402471549682e-05, + "loss": 0.2729, + "num_input_tokens_seen": 37662288, + "step": 24655 + }, + { + "epoch": 76.1112828438949, + "grad_norm": 0.4200970232486725, + "learning_rate": 1.6054856504777312e-05, + "loss": 0.2573, + "num_input_tokens_seen": 37669936, + "step": 24660 + }, + { + "epoch": 76.12673879443585, + "grad_norm": 0.5085294246673584, + "learning_rate": 1.6045689673511334e-05, + "loss": 0.2373, + "num_input_tokens_seen": 37677296, + "step": 24665 + }, + { + "epoch": 76.14219474497682, + "grad_norm": 0.554395854473114, + "learning_rate": 1.6036524223112548e-05, + "loss": 0.2609, + "num_input_tokens_seen": 37684624, + "step": 24670 + }, + { + "epoch": 76.15765069551777, + "grad_norm": 0.6750978231430054, + "learning_rate": 1.602736015499436e-05, + "loss": 0.2698, + "num_input_tokens_seen": 37692176, + "step": 24675 + }, + { + "epoch": 76.17310664605873, + "grad_norm": 0.5466421246528625, + "learning_rate": 1.601819747057e-05, + "loss": 0.2879, + "num_input_tokens_seen": 37699536, + "step": 24680 + }, + { + "epoch": 76.1885625965997, + "grad_norm": 0.5513625144958496, + "learning_rate": 1.6009036171252465e-05, + "loss": 0.3515, + "num_input_tokens_seen": 37706960, + "step": 24685 + }, + { + "epoch": 76.20401854714065, + "grad_norm": 0.34923285245895386, + "learning_rate": 1.599987625845453e-05, + "loss": 0.2002, + "num_input_tokens_seen": 37714704, + "step": 24690 + }, + { + "epoch": 76.2194744976816, + "grad_norm": 0.5431675314903259, + "learning_rate": 1.599071773358879e-05, + "loss": 0.2194, + "num_input_tokens_seen": 37722608, + "step": 24695 + }, + { + "epoch": 76.23493044822257, + "grad_norm": 0.41518065333366394, + "learning_rate": 1.598156059806758e-05, + "loss": 0.2794, + "num_input_tokens_seen": 37729872, + "step": 24700 + }, + { + "epoch": 76.25038639876352, + "grad_norm": 0.5786216855049133, + "learning_rate": 1.5972404853303062e-05, + "loss": 0.2419, + "num_input_tokens_seen": 37737552, + "step": 24705 + }, + { + "epoch": 76.26584234930448, + "grad_norm": 0.37731480598449707, + "learning_rate": 1.5963250500707172e-05, + "loss": 0.203, + "num_input_tokens_seen": 37745488, + "step": 24710 + }, + { + "epoch": 76.28129829984545, + "grad_norm": 0.3108454644680023, + "learning_rate": 1.5954097541691612e-05, + "loss": 0.2757, + "num_input_tokens_seen": 37753104, + "step": 24715 + }, + { + "epoch": 76.2967542503864, + "grad_norm": 0.3859815001487732, + "learning_rate": 1.5944945977667884e-05, + "loss": 0.2169, + "num_input_tokens_seen": 37760400, + "step": 24720 + }, + { + "epoch": 76.31221020092735, + "grad_norm": 0.7401391863822937, + "learning_rate": 1.593579581004729e-05, + "loss": 0.2894, + "num_input_tokens_seen": 37768208, + "step": 24725 + }, + { + "epoch": 76.32766615146832, + "grad_norm": 0.6639695167541504, + "learning_rate": 1.592664704024088e-05, + "loss": 0.2524, + "num_input_tokens_seen": 37775536, + "step": 24730 + }, + { + "epoch": 76.34312210200927, + "grad_norm": 0.24069754779338837, + "learning_rate": 1.591749966965953e-05, + "loss": 0.2234, + "num_input_tokens_seen": 37783376, + "step": 24735 + }, + { + "epoch": 76.35857805255023, + "grad_norm": 0.8997962474822998, + "learning_rate": 1.5908353699713856e-05, + "loss": 0.2095, + "num_input_tokens_seen": 37790352, + "step": 24740 + }, + { + "epoch": 76.3740340030912, + "grad_norm": 0.40863433480262756, + "learning_rate": 1.5899209131814298e-05, + "loss": 0.2413, + "num_input_tokens_seen": 37798032, + "step": 24745 + }, + { + "epoch": 76.38948995363215, + "grad_norm": 0.35796231031417847, + "learning_rate": 1.5890065967371067e-05, + "loss": 0.2103, + "num_input_tokens_seen": 37805680, + "step": 24750 + }, + { + "epoch": 76.4049459041731, + "grad_norm": 0.7333918809890747, + "learning_rate": 1.5880924207794144e-05, + "loss": 0.2985, + "num_input_tokens_seen": 37814096, + "step": 24755 + }, + { + "epoch": 76.42040185471407, + "grad_norm": 0.3883785307407379, + "learning_rate": 1.5871783854493298e-05, + "loss": 0.2397, + "num_input_tokens_seen": 37821520, + "step": 24760 + }, + { + "epoch": 76.43585780525503, + "grad_norm": 1.0417040586471558, + "learning_rate": 1.5862644908878106e-05, + "loss": 0.2326, + "num_input_tokens_seen": 37829520, + "step": 24765 + }, + { + "epoch": 76.45131375579598, + "grad_norm": 0.5010568499565125, + "learning_rate": 1.5853507372357885e-05, + "loss": 0.1971, + "num_input_tokens_seen": 37837328, + "step": 24770 + }, + { + "epoch": 76.46676970633693, + "grad_norm": 0.4343417286872864, + "learning_rate": 1.5844371246341776e-05, + "loss": 0.228, + "num_input_tokens_seen": 37844976, + "step": 24775 + }, + { + "epoch": 76.4822256568779, + "grad_norm": 0.5915930867195129, + "learning_rate": 1.5835236532238674e-05, + "loss": 0.2829, + "num_input_tokens_seen": 37852208, + "step": 24780 + }, + { + "epoch": 76.49768160741885, + "grad_norm": 0.4900747239589691, + "learning_rate": 1.582610323145727e-05, + "loss": 0.2527, + "num_input_tokens_seen": 37860400, + "step": 24785 + }, + { + "epoch": 76.51313755795981, + "grad_norm": 0.3693520128726959, + "learning_rate": 1.5816971345406035e-05, + "loss": 0.2587, + "num_input_tokens_seen": 37867760, + "step": 24790 + }, + { + "epoch": 76.52859350850078, + "grad_norm": 0.546149730682373, + "learning_rate": 1.5807840875493225e-05, + "loss": 0.2397, + "num_input_tokens_seen": 37875344, + "step": 24795 + }, + { + "epoch": 76.54404945904173, + "grad_norm": 0.6176754832267761, + "learning_rate": 1.5798711823126854e-05, + "loss": 0.2329, + "num_input_tokens_seen": 37882832, + "step": 24800 + }, + { + "epoch": 76.54404945904173, + "eval_loss": 0.30656400322914124, + "eval_runtime": 6.3062, + "eval_samples_per_second": 91.18, + "eval_steps_per_second": 22.835, + "num_input_tokens_seen": 37882832, + "step": 24800 + }, + { + "epoch": 76.55950540958268, + "grad_norm": 0.35379618406295776, + "learning_rate": 1.578958418971477e-05, + "loss": 0.2377, + "num_input_tokens_seen": 37890704, + "step": 24805 + }, + { + "epoch": 76.57496136012365, + "grad_norm": 0.42106252908706665, + "learning_rate": 1.578045797666453e-05, + "loss": 0.2141, + "num_input_tokens_seen": 37898256, + "step": 24810 + }, + { + "epoch": 76.5904173106646, + "grad_norm": 0.6624546647071838, + "learning_rate": 1.5771333185383548e-05, + "loss": 0.291, + "num_input_tokens_seen": 37905968, + "step": 24815 + }, + { + "epoch": 76.60587326120556, + "grad_norm": 0.5865904092788696, + "learning_rate": 1.576220981727895e-05, + "loss": 0.2463, + "num_input_tokens_seen": 37913264, + "step": 24820 + }, + { + "epoch": 76.62132921174653, + "grad_norm": 0.6094203591346741, + "learning_rate": 1.575308787375769e-05, + "loss": 0.1952, + "num_input_tokens_seen": 37921168, + "step": 24825 + }, + { + "epoch": 76.63678516228748, + "grad_norm": 0.6933367252349854, + "learning_rate": 1.5743967356226492e-05, + "loss": 0.2851, + "num_input_tokens_seen": 37928624, + "step": 24830 + }, + { + "epoch": 76.65224111282843, + "grad_norm": 0.5693715214729309, + "learning_rate": 1.5734848266091835e-05, + "loss": 0.22, + "num_input_tokens_seen": 37936944, + "step": 24835 + }, + { + "epoch": 76.6676970633694, + "grad_norm": 0.9356279373168945, + "learning_rate": 1.572573060476001e-05, + "loss": 0.2848, + "num_input_tokens_seen": 37944848, + "step": 24840 + }, + { + "epoch": 76.68315301391036, + "grad_norm": 0.5206984877586365, + "learning_rate": 1.5716614373637085e-05, + "loss": 0.3107, + "num_input_tokens_seen": 37952496, + "step": 24845 + }, + { + "epoch": 76.69860896445131, + "grad_norm": 0.44216567277908325, + "learning_rate": 1.570749957412887e-05, + "loss": 0.2434, + "num_input_tokens_seen": 37960432, + "step": 24850 + }, + { + "epoch": 76.71406491499228, + "grad_norm": 0.820314884185791, + "learning_rate": 1.5698386207641013e-05, + "loss": 0.2181, + "num_input_tokens_seen": 37968208, + "step": 24855 + }, + { + "epoch": 76.72952086553323, + "grad_norm": 0.605814516544342, + "learning_rate": 1.5689274275578884e-05, + "loss": 0.3111, + "num_input_tokens_seen": 37976112, + "step": 24860 + }, + { + "epoch": 76.74497681607419, + "grad_norm": 0.7826926708221436, + "learning_rate": 1.5680163779347667e-05, + "loss": 0.2249, + "num_input_tokens_seen": 37983088, + "step": 24865 + }, + { + "epoch": 76.76043276661515, + "grad_norm": 0.3922808766365051, + "learning_rate": 1.5671054720352327e-05, + "loss": 0.2787, + "num_input_tokens_seen": 37990256, + "step": 24870 + }, + { + "epoch": 76.7758887171561, + "grad_norm": 0.37634536623954773, + "learning_rate": 1.566194709999757e-05, + "loss": 0.2645, + "num_input_tokens_seen": 37997840, + "step": 24875 + }, + { + "epoch": 76.79134466769706, + "grad_norm": 0.4796171188354492, + "learning_rate": 1.5652840919687933e-05, + "loss": 0.2289, + "num_input_tokens_seen": 38005104, + "step": 24880 + }, + { + "epoch": 76.80680061823801, + "grad_norm": 0.5104261040687561, + "learning_rate": 1.5643736180827676e-05, + "loss": 0.2332, + "num_input_tokens_seen": 38012912, + "step": 24885 + }, + { + "epoch": 76.82225656877898, + "grad_norm": 0.7041598558425903, + "learning_rate": 1.5634632884820878e-05, + "loss": 0.2376, + "num_input_tokens_seen": 38020624, + "step": 24890 + }, + { + "epoch": 76.83771251931994, + "grad_norm": 0.32933294773101807, + "learning_rate": 1.5625531033071395e-05, + "loss": 0.2363, + "num_input_tokens_seen": 38027984, + "step": 24895 + }, + { + "epoch": 76.85316846986089, + "grad_norm": 0.4884193241596222, + "learning_rate": 1.5616430626982828e-05, + "loss": 0.2449, + "num_input_tokens_seen": 38035504, + "step": 24900 + }, + { + "epoch": 76.86862442040186, + "grad_norm": 0.35283783078193665, + "learning_rate": 1.5607331667958575e-05, + "loss": 0.2878, + "num_input_tokens_seen": 38042864, + "step": 24905 + }, + { + "epoch": 76.88408037094281, + "grad_norm": 0.2942243814468384, + "learning_rate": 1.5598234157401824e-05, + "loss": 0.2527, + "num_input_tokens_seen": 38050928, + "step": 24910 + }, + { + "epoch": 76.89953632148377, + "grad_norm": 0.6415638327598572, + "learning_rate": 1.5589138096715503e-05, + "loss": 0.2664, + "num_input_tokens_seen": 38058768, + "step": 24915 + }, + { + "epoch": 76.91499227202473, + "grad_norm": 0.6409512758255005, + "learning_rate": 1.5580043487302365e-05, + "loss": 0.2861, + "num_input_tokens_seen": 38066320, + "step": 24920 + }, + { + "epoch": 76.93044822256569, + "grad_norm": 0.4946555495262146, + "learning_rate": 1.5570950330564888e-05, + "loss": 0.238, + "num_input_tokens_seen": 38073904, + "step": 24925 + }, + { + "epoch": 76.94590417310664, + "grad_norm": 0.3574916124343872, + "learning_rate": 1.5561858627905367e-05, + "loss": 0.2617, + "num_input_tokens_seen": 38081776, + "step": 24930 + }, + { + "epoch": 76.96136012364761, + "grad_norm": 0.6225712299346924, + "learning_rate": 1.5552768380725857e-05, + "loss": 0.2538, + "num_input_tokens_seen": 38089712, + "step": 24935 + }, + { + "epoch": 76.97681607418856, + "grad_norm": 0.4068385362625122, + "learning_rate": 1.5543679590428183e-05, + "loss": 0.2521, + "num_input_tokens_seen": 38097136, + "step": 24940 + }, + { + "epoch": 76.99227202472952, + "grad_norm": 0.31843626499176025, + "learning_rate": 1.5534592258413943e-05, + "loss": 0.2731, + "num_input_tokens_seen": 38104848, + "step": 24945 + }, + { + "epoch": 77.00618238021639, + "grad_norm": 0.469350665807724, + "learning_rate": 1.5525506386084538e-05, + "loss": 0.2648, + "num_input_tokens_seen": 38111216, + "step": 24950 + }, + { + "epoch": 77.02163833075734, + "grad_norm": 0.6208364963531494, + "learning_rate": 1.55164219748411e-05, + "loss": 0.2429, + "num_input_tokens_seen": 38118992, + "step": 24955 + }, + { + "epoch": 77.0370942812983, + "grad_norm": 0.4779422879219055, + "learning_rate": 1.550733902608459e-05, + "loss": 0.2757, + "num_input_tokens_seen": 38126544, + "step": 24960 + }, + { + "epoch": 77.05255023183926, + "grad_norm": 0.36120375990867615, + "learning_rate": 1.549825754121568e-05, + "loss": 0.2423, + "num_input_tokens_seen": 38133904, + "step": 24965 + }, + { + "epoch": 77.06800618238022, + "grad_norm": 0.4852805733680725, + "learning_rate": 1.5489177521634864e-05, + "loss": 0.3133, + "num_input_tokens_seen": 38141424, + "step": 24970 + }, + { + "epoch": 77.08346213292117, + "grad_norm": 0.5179075598716736, + "learning_rate": 1.5480098968742402e-05, + "loss": 0.2859, + "num_input_tokens_seen": 38148784, + "step": 24975 + }, + { + "epoch": 77.09891808346214, + "grad_norm": 0.4987069070339203, + "learning_rate": 1.5471021883938304e-05, + "loss": 0.2944, + "num_input_tokens_seen": 38156080, + "step": 24980 + }, + { + "epoch": 77.11437403400309, + "grad_norm": 0.35291776061058044, + "learning_rate": 1.546194626862238e-05, + "loss": 0.2247, + "num_input_tokens_seen": 38164112, + "step": 24985 + }, + { + "epoch": 77.12982998454405, + "grad_norm": 0.7234706282615662, + "learning_rate": 1.5452872124194216e-05, + "loss": 0.2678, + "num_input_tokens_seen": 38171696, + "step": 24990 + }, + { + "epoch": 77.14528593508501, + "grad_norm": 0.4132424592971802, + "learning_rate": 1.5443799452053136e-05, + "loss": 0.2856, + "num_input_tokens_seen": 38179376, + "step": 24995 + }, + { + "epoch": 77.16074188562597, + "grad_norm": 0.422102153301239, + "learning_rate": 1.543472825359828e-05, + "loss": 0.2172, + "num_input_tokens_seen": 38187312, + "step": 25000 + }, + { + "epoch": 77.16074188562597, + "eval_loss": 0.3037336766719818, + "eval_runtime": 6.3268, + "eval_samples_per_second": 90.883, + "eval_steps_per_second": 22.76, + "num_input_tokens_seen": 38187312, + "step": 25000 + }, + { + "epoch": 77.17619783616692, + "grad_norm": 0.29048043489456177, + "learning_rate": 1.5425658530228522e-05, + "loss": 0.2207, + "num_input_tokens_seen": 38194704, + "step": 25005 + }, + { + "epoch": 77.19165378670789, + "grad_norm": 0.3519541025161743, + "learning_rate": 1.5416590283342546e-05, + "loss": 0.265, + "num_input_tokens_seen": 38202416, + "step": 25010 + }, + { + "epoch": 77.20710973724884, + "grad_norm": 0.6312552094459534, + "learning_rate": 1.5407523514338783e-05, + "loss": 0.2754, + "num_input_tokens_seen": 38209840, + "step": 25015 + }, + { + "epoch": 77.2225656877898, + "grad_norm": 0.6982485055923462, + "learning_rate": 1.539845822461543e-05, + "loss": 0.2115, + "num_input_tokens_seen": 38217552, + "step": 25020 + }, + { + "epoch": 77.23802163833076, + "grad_norm": 0.8819343447685242, + "learning_rate": 1.538939441557048e-05, + "loss": 0.2594, + "num_input_tokens_seen": 38225456, + "step": 25025 + }, + { + "epoch": 77.25347758887172, + "grad_norm": 0.41078323125839233, + "learning_rate": 1.5380332088601696e-05, + "loss": 0.2786, + "num_input_tokens_seen": 38232976, + "step": 25030 + }, + { + "epoch": 77.26893353941267, + "grad_norm": 0.5027745366096497, + "learning_rate": 1.537127124510658e-05, + "loss": 0.2313, + "num_input_tokens_seen": 38240720, + "step": 25035 + }, + { + "epoch": 77.28438948995363, + "grad_norm": 0.4913721978664398, + "learning_rate": 1.5362211886482457e-05, + "loss": 0.2408, + "num_input_tokens_seen": 38248240, + "step": 25040 + }, + { + "epoch": 77.2998454404946, + "grad_norm": 0.6827644109725952, + "learning_rate": 1.5353154014126363e-05, + "loss": 0.2215, + "num_input_tokens_seen": 38255888, + "step": 25045 + }, + { + "epoch": 77.31530139103555, + "grad_norm": 0.8074898719787598, + "learning_rate": 1.534409762943515e-05, + "loss": 0.2375, + "num_input_tokens_seen": 38263440, + "step": 25050 + }, + { + "epoch": 77.3307573415765, + "grad_norm": 0.5920524597167969, + "learning_rate": 1.5335042733805438e-05, + "loss": 0.2777, + "num_input_tokens_seen": 38271280, + "step": 25055 + }, + { + "epoch": 77.34621329211747, + "grad_norm": 0.5118272304534912, + "learning_rate": 1.532598932863358e-05, + "loss": 0.2147, + "num_input_tokens_seen": 38278704, + "step": 25060 + }, + { + "epoch": 77.36166924265842, + "grad_norm": 0.5220451354980469, + "learning_rate": 1.531693741531574e-05, + "loss": 0.2374, + "num_input_tokens_seen": 38286320, + "step": 25065 + }, + { + "epoch": 77.37712519319938, + "grad_norm": 0.6429589986801147, + "learning_rate": 1.5307886995247844e-05, + "loss": 0.2859, + "num_input_tokens_seen": 38293808, + "step": 25070 + }, + { + "epoch": 77.39258114374034, + "grad_norm": 0.42822006344795227, + "learning_rate": 1.529883806982557e-05, + "loss": 0.2122, + "num_input_tokens_seen": 38301200, + "step": 25075 + }, + { + "epoch": 77.4080370942813, + "grad_norm": 0.8770368099212646, + "learning_rate": 1.5289790640444376e-05, + "loss": 0.2159, + "num_input_tokens_seen": 38309392, + "step": 25080 + }, + { + "epoch": 77.42349304482225, + "grad_norm": 0.5926796793937683, + "learning_rate": 1.5280744708499494e-05, + "loss": 0.2853, + "num_input_tokens_seen": 38317072, + "step": 25085 + }, + { + "epoch": 77.43894899536322, + "grad_norm": 0.3859569728374481, + "learning_rate": 1.527170027538591e-05, + "loss": 0.2104, + "num_input_tokens_seen": 38324688, + "step": 25090 + }, + { + "epoch": 77.45440494590417, + "grad_norm": 0.5021330714225769, + "learning_rate": 1.5262657342498407e-05, + "loss": 0.2165, + "num_input_tokens_seen": 38331920, + "step": 25095 + }, + { + "epoch": 77.46986089644513, + "grad_norm": 0.4747294783592224, + "learning_rate": 1.52536159112315e-05, + "loss": 0.2519, + "num_input_tokens_seen": 38339824, + "step": 25100 + }, + { + "epoch": 77.4853168469861, + "grad_norm": 0.8236152529716492, + "learning_rate": 1.5244575982979497e-05, + "loss": 0.2302, + "num_input_tokens_seen": 38347408, + "step": 25105 + }, + { + "epoch": 77.50077279752705, + "grad_norm": 0.4620729684829712, + "learning_rate": 1.5235537559136487e-05, + "loss": 0.2386, + "num_input_tokens_seen": 38355248, + "step": 25110 + }, + { + "epoch": 77.516228748068, + "grad_norm": 0.5459260940551758, + "learning_rate": 1.5226500641096286e-05, + "loss": 0.2893, + "num_input_tokens_seen": 38363504, + "step": 25115 + }, + { + "epoch": 77.53168469860897, + "grad_norm": 0.39224112033843994, + "learning_rate": 1.5217465230252509e-05, + "loss": 0.1994, + "num_input_tokens_seen": 38370896, + "step": 25120 + }, + { + "epoch": 77.54714064914992, + "grad_norm": 0.4714636206626892, + "learning_rate": 1.5208431327998523e-05, + "loss": 0.2368, + "num_input_tokens_seen": 38378800, + "step": 25125 + }, + { + "epoch": 77.56259659969088, + "grad_norm": 0.5523361563682556, + "learning_rate": 1.5199398935727477e-05, + "loss": 0.2799, + "num_input_tokens_seen": 38386288, + "step": 25130 + }, + { + "epoch": 77.57805255023185, + "grad_norm": 0.5433787107467651, + "learning_rate": 1.5190368054832282e-05, + "loss": 0.2796, + "num_input_tokens_seen": 38393904, + "step": 25135 + }, + { + "epoch": 77.5935085007728, + "grad_norm": 0.6385858654975891, + "learning_rate": 1.5181338686705601e-05, + "loss": 0.2033, + "num_input_tokens_seen": 38401488, + "step": 25140 + }, + { + "epoch": 77.60896445131375, + "grad_norm": 0.37139010429382324, + "learning_rate": 1.5172310832739889e-05, + "loss": 0.2114, + "num_input_tokens_seen": 38409104, + "step": 25145 + }, + { + "epoch": 77.62442040185472, + "grad_norm": 0.4902770519256592, + "learning_rate": 1.5163284494327346e-05, + "loss": 0.2167, + "num_input_tokens_seen": 38417200, + "step": 25150 + }, + { + "epoch": 77.63987635239567, + "grad_norm": 0.3622719943523407, + "learning_rate": 1.5154259672859952e-05, + "loss": 0.2906, + "num_input_tokens_seen": 38424720, + "step": 25155 + }, + { + "epoch": 77.65533230293663, + "grad_norm": 0.7697594165802002, + "learning_rate": 1.5145236369729452e-05, + "loss": 0.2157, + "num_input_tokens_seen": 38432208, + "step": 25160 + }, + { + "epoch": 77.67078825347758, + "grad_norm": 0.41772785782814026, + "learning_rate": 1.5136214586327335e-05, + "loss": 0.2971, + "num_input_tokens_seen": 38439856, + "step": 25165 + }, + { + "epoch": 77.68624420401855, + "grad_norm": 0.4873794615268707, + "learning_rate": 1.5127194324044885e-05, + "loss": 0.2124, + "num_input_tokens_seen": 38446800, + "step": 25170 + }, + { + "epoch": 77.7017001545595, + "grad_norm": 0.6970942616462708, + "learning_rate": 1.5118175584273148e-05, + "loss": 0.2267, + "num_input_tokens_seen": 38454032, + "step": 25175 + }, + { + "epoch": 77.71715610510046, + "grad_norm": 0.3282184898853302, + "learning_rate": 1.5109158368402909e-05, + "loss": 0.2689, + "num_input_tokens_seen": 38462192, + "step": 25180 + }, + { + "epoch": 77.73261205564143, + "grad_norm": 0.3246099650859833, + "learning_rate": 1.5100142677824753e-05, + "loss": 0.2725, + "num_input_tokens_seen": 38469904, + "step": 25185 + }, + { + "epoch": 77.74806800618238, + "grad_norm": 0.47320881485939026, + "learning_rate": 1.509112851392901e-05, + "loss": 0.2653, + "num_input_tokens_seen": 38477680, + "step": 25190 + }, + { + "epoch": 77.76352395672333, + "grad_norm": 0.6329721212387085, + "learning_rate": 1.5082115878105763e-05, + "loss": 0.2547, + "num_input_tokens_seen": 38485008, + "step": 25195 + }, + { + "epoch": 77.7789799072643, + "grad_norm": 0.38857734203338623, + "learning_rate": 1.5073104771744892e-05, + "loss": 0.2493, + "num_input_tokens_seen": 38492720, + "step": 25200 + }, + { + "epoch": 77.7789799072643, + "eval_loss": 0.3038293123245239, + "eval_runtime": 6.3123, + "eval_samples_per_second": 91.092, + "eval_steps_per_second": 22.813, + "num_input_tokens_seen": 38492720, + "step": 25200 + }, + { + "epoch": 77.79443585780525, + "grad_norm": 0.45352768898010254, + "learning_rate": 1.5064095196236006e-05, + "loss": 0.2314, + "num_input_tokens_seen": 38500176, + "step": 25205 + }, + { + "epoch": 77.80989180834621, + "grad_norm": 0.394794762134552, + "learning_rate": 1.50550871529685e-05, + "loss": 0.2372, + "num_input_tokens_seen": 38508336, + "step": 25210 + }, + { + "epoch": 77.82534775888718, + "grad_norm": 0.7281161546707153, + "learning_rate": 1.5046080643331546e-05, + "loss": 0.1924, + "num_input_tokens_seen": 38516112, + "step": 25215 + }, + { + "epoch": 77.84080370942813, + "grad_norm": 0.4130757749080658, + "learning_rate": 1.5037075668714028e-05, + "loss": 0.1962, + "num_input_tokens_seen": 38523984, + "step": 25220 + }, + { + "epoch": 77.85625965996908, + "grad_norm": 0.45192480087280273, + "learning_rate": 1.5028072230504656e-05, + "loss": 0.2816, + "num_input_tokens_seen": 38531312, + "step": 25225 + }, + { + "epoch": 77.87171561051005, + "grad_norm": 0.5855618119239807, + "learning_rate": 1.5019070330091861e-05, + "loss": 0.269, + "num_input_tokens_seen": 38539568, + "step": 25230 + }, + { + "epoch": 77.887171561051, + "grad_norm": 0.6448275446891785, + "learning_rate": 1.5010069968863843e-05, + "loss": 0.2911, + "num_input_tokens_seen": 38547088, + "step": 25235 + }, + { + "epoch": 77.90262751159196, + "grad_norm": 0.4053785502910614, + "learning_rate": 1.5001071148208584e-05, + "loss": 0.2964, + "num_input_tokens_seen": 38554896, + "step": 25240 + }, + { + "epoch": 77.91808346213293, + "grad_norm": 0.6194754838943481, + "learning_rate": 1.49920738695138e-05, + "loss": 0.239, + "num_input_tokens_seen": 38562032, + "step": 25245 + }, + { + "epoch": 77.93353941267388, + "grad_norm": 0.32716089487075806, + "learning_rate": 1.4983078134166995e-05, + "loss": 0.2394, + "num_input_tokens_seen": 38570064, + "step": 25250 + }, + { + "epoch": 77.94899536321483, + "grad_norm": 0.5254384279251099, + "learning_rate": 1.4974083943555428e-05, + "loss": 0.2285, + "num_input_tokens_seen": 38578128, + "step": 25255 + }, + { + "epoch": 77.9644513137558, + "grad_norm": 0.6341760158538818, + "learning_rate": 1.496509129906611e-05, + "loss": 0.3383, + "num_input_tokens_seen": 38586192, + "step": 25260 + }, + { + "epoch": 77.97990726429676, + "grad_norm": 0.5409407615661621, + "learning_rate": 1.4956100202085809e-05, + "loss": 0.2624, + "num_input_tokens_seen": 38593936, + "step": 25265 + }, + { + "epoch": 77.99536321483771, + "grad_norm": 0.5206388235092163, + "learning_rate": 1.4947110654001093e-05, + "loss": 0.2843, + "num_input_tokens_seen": 38601424, + "step": 25270 + }, + { + "epoch": 78.00927357032458, + "grad_norm": 0.31650465726852417, + "learning_rate": 1.4938122656198234e-05, + "loss": 0.3848, + "num_input_tokens_seen": 38608064, + "step": 25275 + }, + { + "epoch": 78.02472952086553, + "grad_norm": 0.4844062030315399, + "learning_rate": 1.4929136210063316e-05, + "loss": 0.2127, + "num_input_tokens_seen": 38615488, + "step": 25280 + }, + { + "epoch": 78.04018547140649, + "grad_norm": 0.5811097621917725, + "learning_rate": 1.4920151316982146e-05, + "loss": 0.2712, + "num_input_tokens_seen": 38623008, + "step": 25285 + }, + { + "epoch": 78.05564142194746, + "grad_norm": 0.6093802452087402, + "learning_rate": 1.4911167978340312e-05, + "loss": 0.2771, + "num_input_tokens_seen": 38630560, + "step": 25290 + }, + { + "epoch": 78.07109737248841, + "grad_norm": 0.8294676542282104, + "learning_rate": 1.4902186195523166e-05, + "loss": 0.2797, + "num_input_tokens_seen": 38638176, + "step": 25295 + }, + { + "epoch": 78.08655332302936, + "grad_norm": 0.5059984922409058, + "learning_rate": 1.4893205969915805e-05, + "loss": 0.2308, + "num_input_tokens_seen": 38645376, + "step": 25300 + }, + { + "epoch": 78.10200927357033, + "grad_norm": 0.4975740313529968, + "learning_rate": 1.4884227302903086e-05, + "loss": 0.2481, + "num_input_tokens_seen": 38653120, + "step": 25305 + }, + { + "epoch": 78.11746522411129, + "grad_norm": 0.6189994812011719, + "learning_rate": 1.4875250195869653e-05, + "loss": 0.2486, + "num_input_tokens_seen": 38660832, + "step": 25310 + }, + { + "epoch": 78.13292117465224, + "grad_norm": 0.7088874578475952, + "learning_rate": 1.4866274650199862e-05, + "loss": 0.2438, + "num_input_tokens_seen": 38668448, + "step": 25315 + }, + { + "epoch": 78.14837712519319, + "grad_norm": 0.5389425754547119, + "learning_rate": 1.485730066727788e-05, + "loss": 0.2682, + "num_input_tokens_seen": 38676608, + "step": 25320 + }, + { + "epoch": 78.16383307573416, + "grad_norm": 0.7180792689323425, + "learning_rate": 1.4848328248487586e-05, + "loss": 0.2774, + "num_input_tokens_seen": 38683936, + "step": 25325 + }, + { + "epoch": 78.17928902627511, + "grad_norm": 0.552670419216156, + "learning_rate": 1.4839357395212656e-05, + "loss": 0.2747, + "num_input_tokens_seen": 38690848, + "step": 25330 + }, + { + "epoch": 78.19474497681607, + "grad_norm": 0.48866304755210876, + "learning_rate": 1.4830388108836502e-05, + "loss": 0.2839, + "num_input_tokens_seen": 38698496, + "step": 25335 + }, + { + "epoch": 78.21020092735704, + "grad_norm": 0.5732927918434143, + "learning_rate": 1.4821420390742299e-05, + "loss": 0.1898, + "num_input_tokens_seen": 38705600, + "step": 25340 + }, + { + "epoch": 78.22565687789799, + "grad_norm": 0.39480844140052795, + "learning_rate": 1.4812454242312979e-05, + "loss": 0.2543, + "num_input_tokens_seen": 38713312, + "step": 25345 + }, + { + "epoch": 78.24111282843894, + "grad_norm": 0.39069226384162903, + "learning_rate": 1.4803489664931253e-05, + "loss": 0.268, + "num_input_tokens_seen": 38720864, + "step": 25350 + }, + { + "epoch": 78.25656877897991, + "grad_norm": 0.5102551579475403, + "learning_rate": 1.4794526659979544e-05, + "loss": 0.2555, + "num_input_tokens_seen": 38728288, + "step": 25355 + }, + { + "epoch": 78.27202472952087, + "grad_norm": 0.3614768981933594, + "learning_rate": 1.4785565228840086e-05, + "loss": 0.262, + "num_input_tokens_seen": 38735904, + "step": 25360 + }, + { + "epoch": 78.28748068006182, + "grad_norm": 0.3430122137069702, + "learning_rate": 1.4776605372894819e-05, + "loss": 0.2614, + "num_input_tokens_seen": 38744256, + "step": 25365 + }, + { + "epoch": 78.30293663060279, + "grad_norm": 0.2884925305843353, + "learning_rate": 1.4767647093525488e-05, + "loss": 0.2004, + "num_input_tokens_seen": 38751840, + "step": 25370 + }, + { + "epoch": 78.31839258114374, + "grad_norm": 0.5370197892189026, + "learning_rate": 1.4758690392113566e-05, + "loss": 0.2599, + "num_input_tokens_seen": 38759232, + "step": 25375 + }, + { + "epoch": 78.3338485316847, + "grad_norm": 0.8781554102897644, + "learning_rate": 1.4749735270040276e-05, + "loss": 0.3029, + "num_input_tokens_seen": 38766656, + "step": 25380 + }, + { + "epoch": 78.34930448222566, + "grad_norm": 0.6621625423431396, + "learning_rate": 1.4740781728686623e-05, + "loss": 0.2674, + "num_input_tokens_seen": 38774144, + "step": 25385 + }, + { + "epoch": 78.36476043276662, + "grad_norm": 0.8871240019798279, + "learning_rate": 1.4731829769433358e-05, + "loss": 0.2352, + "num_input_tokens_seen": 38781088, + "step": 25390 + }, + { + "epoch": 78.38021638330757, + "grad_norm": 0.3147250711917877, + "learning_rate": 1.4722879393660976e-05, + "loss": 0.1984, + "num_input_tokens_seen": 38788608, + "step": 25395 + }, + { + "epoch": 78.39567233384854, + "grad_norm": 0.4382149875164032, + "learning_rate": 1.4713930602749748e-05, + "loss": 0.332, + "num_input_tokens_seen": 38796864, + "step": 25400 + }, + { + "epoch": 78.39567233384854, + "eval_loss": 0.30387449264526367, + "eval_runtime": 6.32, + "eval_samples_per_second": 90.981, + "eval_steps_per_second": 22.785, + "num_input_tokens_seen": 38796864, + "step": 25400 + }, + { + "epoch": 78.41112828438949, + "grad_norm": 0.6334665417671204, + "learning_rate": 1.470498339807968e-05, + "loss": 0.2008, + "num_input_tokens_seen": 38804448, + "step": 25405 + }, + { + "epoch": 78.42658423493044, + "grad_norm": 0.38316476345062256, + "learning_rate": 1.4696037781030542e-05, + "loss": 0.2449, + "num_input_tokens_seen": 38811840, + "step": 25410 + }, + { + "epoch": 78.44204018547141, + "grad_norm": 0.5096942782402039, + "learning_rate": 1.4687093752981876e-05, + "loss": 0.2637, + "num_input_tokens_seen": 38819648, + "step": 25415 + }, + { + "epoch": 78.45749613601237, + "grad_norm": 0.5887839198112488, + "learning_rate": 1.4678151315312943e-05, + "loss": 0.2511, + "num_input_tokens_seen": 38827552, + "step": 25420 + }, + { + "epoch": 78.47295208655332, + "grad_norm": 0.38750535249710083, + "learning_rate": 1.4669210469402789e-05, + "loss": 0.2155, + "num_input_tokens_seen": 38834880, + "step": 25425 + }, + { + "epoch": 78.48840803709429, + "grad_norm": 0.3679249584674835, + "learning_rate": 1.4660271216630218e-05, + "loss": 0.207, + "num_input_tokens_seen": 38842656, + "step": 25430 + }, + { + "epoch": 78.50386398763524, + "grad_norm": 0.665695309638977, + "learning_rate": 1.4651333558373748e-05, + "loss": 0.2542, + "num_input_tokens_seen": 38850400, + "step": 25435 + }, + { + "epoch": 78.5193199381762, + "grad_norm": 0.4154652953147888, + "learning_rate": 1.4642397496011707e-05, + "loss": 0.2935, + "num_input_tokens_seen": 38857824, + "step": 25440 + }, + { + "epoch": 78.53477588871715, + "grad_norm": 0.6764383316040039, + "learning_rate": 1.4633463030922129e-05, + "loss": 0.2657, + "num_input_tokens_seen": 38865696, + "step": 25445 + }, + { + "epoch": 78.55023183925812, + "grad_norm": 0.36536139249801636, + "learning_rate": 1.462453016448282e-05, + "loss": 0.2351, + "num_input_tokens_seen": 38873184, + "step": 25450 + }, + { + "epoch": 78.56568778979907, + "grad_norm": 0.3932318389415741, + "learning_rate": 1.4615598898071354e-05, + "loss": 0.2662, + "num_input_tokens_seen": 38880704, + "step": 25455 + }, + { + "epoch": 78.58114374034002, + "grad_norm": 0.7482423782348633, + "learning_rate": 1.4606669233065026e-05, + "loss": 0.212, + "num_input_tokens_seen": 38888416, + "step": 25460 + }, + { + "epoch": 78.59659969088099, + "grad_norm": 0.5013781189918518, + "learning_rate": 1.4597741170840914e-05, + "loss": 0.2807, + "num_input_tokens_seen": 38896032, + "step": 25465 + }, + { + "epoch": 78.61205564142195, + "grad_norm": 0.4446191191673279, + "learning_rate": 1.4588814712775853e-05, + "loss": 0.2322, + "num_input_tokens_seen": 38903424, + "step": 25470 + }, + { + "epoch": 78.6275115919629, + "grad_norm": 0.5043695569038391, + "learning_rate": 1.4579889860246382e-05, + "loss": 0.2246, + "num_input_tokens_seen": 38911168, + "step": 25475 + }, + { + "epoch": 78.64296754250387, + "grad_norm": 0.5847644209861755, + "learning_rate": 1.457096661462885e-05, + "loss": 0.2926, + "num_input_tokens_seen": 38919296, + "step": 25480 + }, + { + "epoch": 78.65842349304482, + "grad_norm": 0.4560871422290802, + "learning_rate": 1.4562044977299322e-05, + "loss": 0.2453, + "num_input_tokens_seen": 38927456, + "step": 25485 + }, + { + "epoch": 78.67387944358578, + "grad_norm": 0.41941726207733154, + "learning_rate": 1.4553124949633623e-05, + "loss": 0.2121, + "num_input_tokens_seen": 38935232, + "step": 25490 + }, + { + "epoch": 78.68933539412674, + "grad_norm": 0.5285674333572388, + "learning_rate": 1.4544206533007354e-05, + "loss": 0.214, + "num_input_tokens_seen": 38942752, + "step": 25495 + }, + { + "epoch": 78.7047913446677, + "grad_norm": 0.41718432307243347, + "learning_rate": 1.4535289728795821e-05, + "loss": 0.2388, + "num_input_tokens_seen": 38950176, + "step": 25500 + }, + { + "epoch": 78.72024729520865, + "grad_norm": 0.6176150441169739, + "learning_rate": 1.4526374538374132e-05, + "loss": 0.353, + "num_input_tokens_seen": 38957664, + "step": 25505 + }, + { + "epoch": 78.73570324574962, + "grad_norm": 0.7775565981864929, + "learning_rate": 1.4517460963117097e-05, + "loss": 0.2747, + "num_input_tokens_seen": 38965600, + "step": 25510 + }, + { + "epoch": 78.75115919629057, + "grad_norm": 0.5825920701026917, + "learning_rate": 1.4508549004399314e-05, + "loss": 0.2661, + "num_input_tokens_seen": 38972576, + "step": 25515 + }, + { + "epoch": 78.76661514683153, + "grad_norm": 0.7039743661880493, + "learning_rate": 1.449963866359513e-05, + "loss": 0.2433, + "num_input_tokens_seen": 38980384, + "step": 25520 + }, + { + "epoch": 78.7820710973725, + "grad_norm": 0.4905712306499481, + "learning_rate": 1.4490729942078607e-05, + "loss": 0.2224, + "num_input_tokens_seen": 38987968, + "step": 25525 + }, + { + "epoch": 78.79752704791345, + "grad_norm": 0.6312170624732971, + "learning_rate": 1.4481822841223608e-05, + "loss": 0.2229, + "num_input_tokens_seen": 38995072, + "step": 25530 + }, + { + "epoch": 78.8129829984544, + "grad_norm": 0.48128175735473633, + "learning_rate": 1.4472917362403704e-05, + "loss": 0.2474, + "num_input_tokens_seen": 39002688, + "step": 25535 + }, + { + "epoch": 78.82843894899537, + "grad_norm": 0.38551056385040283, + "learning_rate": 1.4464013506992224e-05, + "loss": 0.2346, + "num_input_tokens_seen": 39010912, + "step": 25540 + }, + { + "epoch": 78.84389489953632, + "grad_norm": 0.6352313756942749, + "learning_rate": 1.4455111276362277e-05, + "loss": 0.2357, + "num_input_tokens_seen": 39018976, + "step": 25545 + }, + { + "epoch": 78.85935085007728, + "grad_norm": 0.3327806293964386, + "learning_rate": 1.4446210671886676e-05, + "loss": 0.2507, + "num_input_tokens_seen": 39026720, + "step": 25550 + }, + { + "epoch": 78.87480680061825, + "grad_norm": 0.47328516840934753, + "learning_rate": 1.4437311694938015e-05, + "loss": 0.2216, + "num_input_tokens_seen": 39034400, + "step": 25555 + }, + { + "epoch": 78.8902627511592, + "grad_norm": 0.4068593382835388, + "learning_rate": 1.442841434688864e-05, + "loss": 0.2233, + "num_input_tokens_seen": 39041568, + "step": 25560 + }, + { + "epoch": 78.90571870170015, + "grad_norm": 0.47270143032073975, + "learning_rate": 1.4419518629110615e-05, + "loss": 0.2213, + "num_input_tokens_seen": 39050208, + "step": 25565 + }, + { + "epoch": 78.9211746522411, + "grad_norm": 0.47761011123657227, + "learning_rate": 1.4410624542975778e-05, + "loss": 0.2282, + "num_input_tokens_seen": 39057664, + "step": 25570 + }, + { + "epoch": 78.93663060278207, + "grad_norm": 0.5989673733711243, + "learning_rate": 1.4401732089855724e-05, + "loss": 0.244, + "num_input_tokens_seen": 39065632, + "step": 25575 + }, + { + "epoch": 78.95208655332303, + "grad_norm": 0.6894962787628174, + "learning_rate": 1.4392841271121754e-05, + "loss": 0.2528, + "num_input_tokens_seen": 39073280, + "step": 25580 + }, + { + "epoch": 78.96754250386398, + "grad_norm": 0.5460954308509827, + "learning_rate": 1.438395208814497e-05, + "loss": 0.2351, + "num_input_tokens_seen": 39081216, + "step": 25585 + }, + { + "epoch": 78.98299845440495, + "grad_norm": 0.459560751914978, + "learning_rate": 1.4375064542296174e-05, + "loss": 0.2686, + "num_input_tokens_seen": 39089312, + "step": 25590 + }, + { + "epoch": 78.9984544049459, + "grad_norm": 0.32400432229042053, + "learning_rate": 1.4366178634945946e-05, + "loss": 0.2161, + "num_input_tokens_seen": 39097312, + "step": 25595 + }, + { + "epoch": 79.01236476043276, + "grad_norm": 0.3931751251220703, + "learning_rate": 1.4357294367464616e-05, + "loss": 0.2746, + "num_input_tokens_seen": 39103824, + "step": 25600 + }, + { + "epoch": 79.01236476043276, + "eval_loss": 0.3050310015678406, + "eval_runtime": 6.3146, + "eval_samples_per_second": 91.059, + "eval_steps_per_second": 22.804, + "num_input_tokens_seen": 39103824, + "step": 25600 + }, + { + "epoch": 79.02782071097373, + "grad_norm": 0.4178129732608795, + "learning_rate": 1.434841174122224e-05, + "loss": 0.2394, + "num_input_tokens_seen": 39111216, + "step": 25605 + }, + { + "epoch": 79.04327666151468, + "grad_norm": 1.0535807609558105, + "learning_rate": 1.4339530757588615e-05, + "loss": 0.2983, + "num_input_tokens_seen": 39119056, + "step": 25610 + }, + { + "epoch": 79.05873261205564, + "grad_norm": 0.4613521695137024, + "learning_rate": 1.433065141793333e-05, + "loss": 0.2367, + "num_input_tokens_seen": 39126896, + "step": 25615 + }, + { + "epoch": 79.0741885625966, + "grad_norm": 0.7663372755050659, + "learning_rate": 1.4321773723625665e-05, + "loss": 0.2942, + "num_input_tokens_seen": 39134544, + "step": 25620 + }, + { + "epoch": 79.08964451313756, + "grad_norm": 0.507489025592804, + "learning_rate": 1.4312897676034693e-05, + "loss": 0.2503, + "num_input_tokens_seen": 39141968, + "step": 25625 + }, + { + "epoch": 79.10510046367851, + "grad_norm": 0.5555543303489685, + "learning_rate": 1.4304023276529188e-05, + "loss": 0.3235, + "num_input_tokens_seen": 39149200, + "step": 25630 + }, + { + "epoch": 79.12055641421948, + "grad_norm": 0.44611942768096924, + "learning_rate": 1.4295150526477712e-05, + "loss": 0.2528, + "num_input_tokens_seen": 39157328, + "step": 25635 + }, + { + "epoch": 79.13601236476043, + "grad_norm": 0.58600914478302, + "learning_rate": 1.4286279427248562e-05, + "loss": 0.2283, + "num_input_tokens_seen": 39165040, + "step": 25640 + }, + { + "epoch": 79.15146831530139, + "grad_norm": 0.3731849491596222, + "learning_rate": 1.4277409980209747e-05, + "loss": 0.229, + "num_input_tokens_seen": 39173040, + "step": 25645 + }, + { + "epoch": 79.16692426584235, + "grad_norm": 0.5532472133636475, + "learning_rate": 1.4268542186729061e-05, + "loss": 0.2247, + "num_input_tokens_seen": 39180944, + "step": 25650 + }, + { + "epoch": 79.18238021638331, + "grad_norm": 0.602742075920105, + "learning_rate": 1.4259676048174043e-05, + "loss": 0.2169, + "num_input_tokens_seen": 39188240, + "step": 25655 + }, + { + "epoch": 79.19783616692426, + "grad_norm": 0.3964477777481079, + "learning_rate": 1.4250811565911937e-05, + "loss": 0.2683, + "num_input_tokens_seen": 39196176, + "step": 25660 + }, + { + "epoch": 79.21329211746523, + "grad_norm": 0.4011674225330353, + "learning_rate": 1.4241948741309782e-05, + "loss": 0.321, + "num_input_tokens_seen": 39204400, + "step": 25665 + }, + { + "epoch": 79.22874806800618, + "grad_norm": 0.5911960601806641, + "learning_rate": 1.4233087575734317e-05, + "loss": 0.226, + "num_input_tokens_seen": 39211792, + "step": 25670 + }, + { + "epoch": 79.24420401854714, + "grad_norm": 0.40478402376174927, + "learning_rate": 1.422422807055206e-05, + "loss": 0.2092, + "num_input_tokens_seen": 39219152, + "step": 25675 + }, + { + "epoch": 79.2596599690881, + "grad_norm": 0.43476563692092896, + "learning_rate": 1.4215370227129243e-05, + "loss": 0.2872, + "num_input_tokens_seen": 39226448, + "step": 25680 + }, + { + "epoch": 79.27511591962906, + "grad_norm": 0.4726409316062927, + "learning_rate": 1.4206514046831876e-05, + "loss": 0.2459, + "num_input_tokens_seen": 39233968, + "step": 25685 + }, + { + "epoch": 79.29057187017001, + "grad_norm": 0.2991446554660797, + "learning_rate": 1.419765953102567e-05, + "loss": 0.2378, + "num_input_tokens_seen": 39241840, + "step": 25690 + }, + { + "epoch": 79.30602782071098, + "grad_norm": 0.7298049926757812, + "learning_rate": 1.4188806681076125e-05, + "loss": 0.2025, + "num_input_tokens_seen": 39249296, + "step": 25695 + }, + { + "epoch": 79.32148377125193, + "grad_norm": 0.6122454404830933, + "learning_rate": 1.4179955498348443e-05, + "loss": 0.2134, + "num_input_tokens_seen": 39257104, + "step": 25700 + }, + { + "epoch": 79.33693972179289, + "grad_norm": 0.4925153851509094, + "learning_rate": 1.4171105984207605e-05, + "loss": 0.2419, + "num_input_tokens_seen": 39264272, + "step": 25705 + }, + { + "epoch": 79.35239567233384, + "grad_norm": 0.37028199434280396, + "learning_rate": 1.4162258140018304e-05, + "loss": 0.2713, + "num_input_tokens_seen": 39272592, + "step": 25710 + }, + { + "epoch": 79.36785162287481, + "grad_norm": 0.48106107115745544, + "learning_rate": 1.4153411967144986e-05, + "loss": 0.2585, + "num_input_tokens_seen": 39280080, + "step": 25715 + }, + { + "epoch": 79.38330757341576, + "grad_norm": 0.41841623187065125, + "learning_rate": 1.4144567466951864e-05, + "loss": 0.2502, + "num_input_tokens_seen": 39287664, + "step": 25720 + }, + { + "epoch": 79.39876352395672, + "grad_norm": 0.7183982133865356, + "learning_rate": 1.4135724640802844e-05, + "loss": 0.2916, + "num_input_tokens_seen": 39295504, + "step": 25725 + }, + { + "epoch": 79.41421947449768, + "grad_norm": 0.5168997645378113, + "learning_rate": 1.4126883490061615e-05, + "loss": 0.2424, + "num_input_tokens_seen": 39302960, + "step": 25730 + }, + { + "epoch": 79.42967542503864, + "grad_norm": 0.4473364055156708, + "learning_rate": 1.4118044016091603e-05, + "loss": 0.2503, + "num_input_tokens_seen": 39310544, + "step": 25735 + }, + { + "epoch": 79.44513137557959, + "grad_norm": 0.4086301922798157, + "learning_rate": 1.410920622025594e-05, + "loss": 0.2478, + "num_input_tokens_seen": 39318896, + "step": 25740 + }, + { + "epoch": 79.46058732612056, + "grad_norm": 0.600592315196991, + "learning_rate": 1.4100370103917554e-05, + "loss": 0.1989, + "num_input_tokens_seen": 39326704, + "step": 25745 + }, + { + "epoch": 79.47604327666151, + "grad_norm": 0.454607754945755, + "learning_rate": 1.409153566843907e-05, + "loss": 0.2506, + "num_input_tokens_seen": 39334192, + "step": 25750 + }, + { + "epoch": 79.49149922720247, + "grad_norm": 0.3671788275241852, + "learning_rate": 1.408270291518286e-05, + "loss": 0.2643, + "num_input_tokens_seen": 39341840, + "step": 25755 + }, + { + "epoch": 79.50695517774344, + "grad_norm": 0.2669127285480499, + "learning_rate": 1.407387184551107e-05, + "loss": 0.2468, + "num_input_tokens_seen": 39349872, + "step": 25760 + }, + { + "epoch": 79.52241112828439, + "grad_norm": 0.6512647867202759, + "learning_rate": 1.4065042460785532e-05, + "loss": 0.2841, + "num_input_tokens_seen": 39357808, + "step": 25765 + }, + { + "epoch": 79.53786707882534, + "grad_norm": 0.5544313788414001, + "learning_rate": 1.405621476236787e-05, + "loss": 0.2352, + "num_input_tokens_seen": 39365520, + "step": 25770 + }, + { + "epoch": 79.55332302936631, + "grad_norm": 0.6153936386108398, + "learning_rate": 1.4047388751619423e-05, + "loss": 0.2723, + "num_input_tokens_seen": 39373136, + "step": 25775 + }, + { + "epoch": 79.56877897990726, + "grad_norm": 0.3874678313732147, + "learning_rate": 1.4038564429901264e-05, + "loss": 0.273, + "num_input_tokens_seen": 39380816, + "step": 25780 + }, + { + "epoch": 79.58423493044822, + "grad_norm": 0.6883248090744019, + "learning_rate": 1.4029741798574227e-05, + "loss": 0.2391, + "num_input_tokens_seen": 39388240, + "step": 25785 + }, + { + "epoch": 79.59969088098919, + "grad_norm": 0.6787912249565125, + "learning_rate": 1.402092085899886e-05, + "loss": 0.2054, + "num_input_tokens_seen": 39395536, + "step": 25790 + }, + { + "epoch": 79.61514683153014, + "grad_norm": 0.731341540813446, + "learning_rate": 1.4012101612535464e-05, + "loss": 0.2358, + "num_input_tokens_seen": 39402832, + "step": 25795 + }, + { + "epoch": 79.6306027820711, + "grad_norm": 0.8109884858131409, + "learning_rate": 1.4003284060544092e-05, + "loss": 0.2415, + "num_input_tokens_seen": 39410448, + "step": 25800 + }, + { + "epoch": 79.6306027820711, + "eval_loss": 0.30373069643974304, + "eval_runtime": 6.3216, + "eval_samples_per_second": 90.957, + "eval_steps_per_second": 22.779, + "num_input_tokens_seen": 39410448, + "step": 25800 + }, + { + "epoch": 79.64605873261206, + "grad_norm": 0.5551446080207825, + "learning_rate": 1.3994468204384504e-05, + "loss": 0.2466, + "num_input_tokens_seen": 39418320, + "step": 25805 + }, + { + "epoch": 79.66151468315302, + "grad_norm": 0.5900956392288208, + "learning_rate": 1.398565404541622e-05, + "loss": 0.2813, + "num_input_tokens_seen": 39425680, + "step": 25810 + }, + { + "epoch": 79.67697063369397, + "grad_norm": 0.5365244150161743, + "learning_rate": 1.3976841584998513e-05, + "loss": 0.2237, + "num_input_tokens_seen": 39433904, + "step": 25815 + }, + { + "epoch": 79.69242658423494, + "grad_norm": 0.4875124990940094, + "learning_rate": 1.3968030824490352e-05, + "loss": 0.2113, + "num_input_tokens_seen": 39441680, + "step": 25820 + }, + { + "epoch": 79.70788253477589, + "grad_norm": 0.5004512667655945, + "learning_rate": 1.3959221765250469e-05, + "loss": 0.2675, + "num_input_tokens_seen": 39448944, + "step": 25825 + }, + { + "epoch": 79.72333848531684, + "grad_norm": 0.6008651256561279, + "learning_rate": 1.3950414408637343e-05, + "loss": 0.3225, + "num_input_tokens_seen": 39457008, + "step": 25830 + }, + { + "epoch": 79.7387944358578, + "grad_norm": 0.6096850633621216, + "learning_rate": 1.3941608756009166e-05, + "loss": 0.2328, + "num_input_tokens_seen": 39464592, + "step": 25835 + }, + { + "epoch": 79.75425038639877, + "grad_norm": 0.46021801233291626, + "learning_rate": 1.3932804808723898e-05, + "loss": 0.232, + "num_input_tokens_seen": 39471696, + "step": 25840 + }, + { + "epoch": 79.76970633693972, + "grad_norm": 0.4861118197441101, + "learning_rate": 1.3924002568139194e-05, + "loss": 0.2318, + "num_input_tokens_seen": 39479120, + "step": 25845 + }, + { + "epoch": 79.78516228748067, + "grad_norm": 0.4080955386161804, + "learning_rate": 1.3915202035612485e-05, + "loss": 0.3293, + "num_input_tokens_seen": 39486832, + "step": 25850 + }, + { + "epoch": 79.80061823802164, + "grad_norm": 0.4151894748210907, + "learning_rate": 1.3906403212500935e-05, + "loss": 0.2146, + "num_input_tokens_seen": 39494224, + "step": 25855 + }, + { + "epoch": 79.8160741885626, + "grad_norm": 0.36067119240760803, + "learning_rate": 1.3897606100161409e-05, + "loss": 0.2668, + "num_input_tokens_seen": 39502384, + "step": 25860 + }, + { + "epoch": 79.83153013910355, + "grad_norm": 0.7458221316337585, + "learning_rate": 1.388881069995055e-05, + "loss": 0.249, + "num_input_tokens_seen": 39509968, + "step": 25865 + }, + { + "epoch": 79.84698608964452, + "grad_norm": 0.4473022520542145, + "learning_rate": 1.3880017013224708e-05, + "loss": 0.2602, + "num_input_tokens_seen": 39517328, + "step": 25870 + }, + { + "epoch": 79.86244204018547, + "grad_norm": 0.46948060393333435, + "learning_rate": 1.3871225041339984e-05, + "loss": 0.2335, + "num_input_tokens_seen": 39525232, + "step": 25875 + }, + { + "epoch": 79.87789799072642, + "grad_norm": 0.6534764170646667, + "learning_rate": 1.386243478565222e-05, + "loss": 0.2223, + "num_input_tokens_seen": 39533104, + "step": 25880 + }, + { + "epoch": 79.89335394126739, + "grad_norm": 0.28709402680397034, + "learning_rate": 1.3853646247516966e-05, + "loss": 0.2618, + "num_input_tokens_seen": 39540976, + "step": 25885 + }, + { + "epoch": 79.90880989180835, + "grad_norm": 0.45303234457969666, + "learning_rate": 1.3844859428289545e-05, + "loss": 0.1942, + "num_input_tokens_seen": 39548592, + "step": 25890 + }, + { + "epoch": 79.9242658423493, + "grad_norm": 0.6128506660461426, + "learning_rate": 1.3836074329324984e-05, + "loss": 0.2561, + "num_input_tokens_seen": 39556080, + "step": 25895 + }, + { + "epoch": 79.93972179289027, + "grad_norm": 0.4346185326576233, + "learning_rate": 1.3827290951978044e-05, + "loss": 0.2576, + "num_input_tokens_seen": 39563568, + "step": 25900 + }, + { + "epoch": 79.95517774343122, + "grad_norm": 0.3044913411140442, + "learning_rate": 1.381850929760326e-05, + "loss": 0.2147, + "num_input_tokens_seen": 39571184, + "step": 25905 + }, + { + "epoch": 79.97063369397218, + "grad_norm": 0.44563165307044983, + "learning_rate": 1.3809729367554842e-05, + "loss": 0.2089, + "num_input_tokens_seen": 39578960, + "step": 25910 + }, + { + "epoch": 79.98608964451314, + "grad_norm": 0.32322415709495544, + "learning_rate": 1.3800951163186784e-05, + "loss": 0.3026, + "num_input_tokens_seen": 39586416, + "step": 25915 + }, + { + "epoch": 80.0, + "grad_norm": 0.600984513759613, + "learning_rate": 1.3792174685852801e-05, + "loss": 0.1939, + "num_input_tokens_seen": 39592368, + "step": 25920 + }, + { + "epoch": 80.01545595054095, + "grad_norm": 0.5428879857063293, + "learning_rate": 1.378339993690632e-05, + "loss": 0.3142, + "num_input_tokens_seen": 39600464, + "step": 25925 + }, + { + "epoch": 80.03091190108192, + "grad_norm": 0.3831782042980194, + "learning_rate": 1.3774626917700523e-05, + "loss": 0.264, + "num_input_tokens_seen": 39608112, + "step": 25930 + }, + { + "epoch": 80.04636785162288, + "grad_norm": 0.4930684268474579, + "learning_rate": 1.3765855629588334e-05, + "loss": 0.2217, + "num_input_tokens_seen": 39615664, + "step": 25935 + }, + { + "epoch": 80.06182380216383, + "grad_norm": 0.6340920925140381, + "learning_rate": 1.3757086073922374e-05, + "loss": 0.227, + "num_input_tokens_seen": 39623728, + "step": 25940 + }, + { + "epoch": 80.0772797527048, + "grad_norm": 0.9087247848510742, + "learning_rate": 1.3748318252055038e-05, + "loss": 0.1921, + "num_input_tokens_seen": 39630960, + "step": 25945 + }, + { + "epoch": 80.09273570324575, + "grad_norm": 0.35803675651550293, + "learning_rate": 1.3739552165338416e-05, + "loss": 0.2155, + "num_input_tokens_seen": 39638608, + "step": 25950 + }, + { + "epoch": 80.1081916537867, + "grad_norm": 0.46406498551368713, + "learning_rate": 1.3730787815124354e-05, + "loss": 0.2348, + "num_input_tokens_seen": 39646288, + "step": 25955 + }, + { + "epoch": 80.12364760432767, + "grad_norm": 0.7720661759376526, + "learning_rate": 1.3722025202764443e-05, + "loss": 0.2347, + "num_input_tokens_seen": 39653776, + "step": 25960 + }, + { + "epoch": 80.13910355486863, + "grad_norm": 0.49792999029159546, + "learning_rate": 1.371326432960997e-05, + "loss": 0.2216, + "num_input_tokens_seen": 39662128, + "step": 25965 + }, + { + "epoch": 80.15455950540958, + "grad_norm": 0.42709845304489136, + "learning_rate": 1.3704505197011969e-05, + "loss": 0.2074, + "num_input_tokens_seen": 39669136, + "step": 25970 + }, + { + "epoch": 80.17001545595055, + "grad_norm": 0.6198345422744751, + "learning_rate": 1.3695747806321224e-05, + "loss": 0.2714, + "num_input_tokens_seen": 39676656, + "step": 25975 + }, + { + "epoch": 80.1854714064915, + "grad_norm": 0.5323808789253235, + "learning_rate": 1.3686992158888212e-05, + "loss": 0.2748, + "num_input_tokens_seen": 39684144, + "step": 25980 + }, + { + "epoch": 80.20092735703246, + "grad_norm": 0.5285797715187073, + "learning_rate": 1.367823825606319e-05, + "loss": 0.2953, + "num_input_tokens_seen": 39692400, + "step": 25985 + }, + { + "epoch": 80.21638330757341, + "grad_norm": 0.584017276763916, + "learning_rate": 1.36694860991961e-05, + "loss": 0.2087, + "num_input_tokens_seen": 39700208, + "step": 25990 + }, + { + "epoch": 80.23183925811438, + "grad_norm": 0.3597901463508606, + "learning_rate": 1.3660735689636636e-05, + "loss": 0.2269, + "num_input_tokens_seen": 39708048, + "step": 25995 + }, + { + "epoch": 80.24729520865533, + "grad_norm": 0.7635351419448853, + "learning_rate": 1.365198702873424e-05, + "loss": 0.3165, + "num_input_tokens_seen": 39715280, + "step": 26000 + }, + { + "epoch": 80.24729520865533, + "eval_loss": 0.3044487535953522, + "eval_runtime": 6.307, + "eval_samples_per_second": 91.169, + "eval_steps_per_second": 22.832, + "num_input_tokens_seen": 39715280, + "step": 26000 + }, + { + "epoch": 80.26275115919628, + "grad_norm": 0.4569092392921448, + "learning_rate": 1.364324011783804e-05, + "loss": 0.2235, + "num_input_tokens_seen": 39722576, + "step": 26005 + }, + { + "epoch": 80.27820710973725, + "grad_norm": 0.38267698884010315, + "learning_rate": 1.3634494958296934e-05, + "loss": 0.2306, + "num_input_tokens_seen": 39730352, + "step": 26010 + }, + { + "epoch": 80.2936630602782, + "grad_norm": 0.3959808349609375, + "learning_rate": 1.3625751551459542e-05, + "loss": 0.2173, + "num_input_tokens_seen": 39738384, + "step": 26015 + }, + { + "epoch": 80.30911901081916, + "grad_norm": 0.36447229981422424, + "learning_rate": 1.3617009898674188e-05, + "loss": 0.3088, + "num_input_tokens_seen": 39745744, + "step": 26020 + }, + { + "epoch": 80.32457496136013, + "grad_norm": 0.35553959012031555, + "learning_rate": 1.3608270001288967e-05, + "loss": 0.206, + "num_input_tokens_seen": 39753040, + "step": 26025 + }, + { + "epoch": 80.34003091190108, + "grad_norm": 1.025750756263733, + "learning_rate": 1.359953186065166e-05, + "loss": 0.2522, + "num_input_tokens_seen": 39759856, + "step": 26030 + }, + { + "epoch": 80.35548686244204, + "grad_norm": 0.712436318397522, + "learning_rate": 1.3590795478109814e-05, + "loss": 0.3145, + "num_input_tokens_seen": 39767312, + "step": 26035 + }, + { + "epoch": 80.370942812983, + "grad_norm": 0.5843535661697388, + "learning_rate": 1.3582060855010675e-05, + "loss": 0.2716, + "num_input_tokens_seen": 39775280, + "step": 26040 + }, + { + "epoch": 80.38639876352396, + "grad_norm": 0.375337690114975, + "learning_rate": 1.3573327992701245e-05, + "loss": 0.3395, + "num_input_tokens_seen": 39782960, + "step": 26045 + }, + { + "epoch": 80.40185471406491, + "grad_norm": 0.4452996551990509, + "learning_rate": 1.356459689252823e-05, + "loss": 0.1786, + "num_input_tokens_seen": 39790288, + "step": 26050 + }, + { + "epoch": 80.41731066460588, + "grad_norm": 0.3456845283508301, + "learning_rate": 1.3555867555838087e-05, + "loss": 0.2252, + "num_input_tokens_seen": 39798192, + "step": 26055 + }, + { + "epoch": 80.43276661514683, + "grad_norm": 1.1931535005569458, + "learning_rate": 1.3547139983976975e-05, + "loss": 0.2541, + "num_input_tokens_seen": 39806192, + "step": 26060 + }, + { + "epoch": 80.44822256568779, + "grad_norm": 0.3372664749622345, + "learning_rate": 1.3538414178290815e-05, + "loss": 0.2452, + "num_input_tokens_seen": 39813744, + "step": 26065 + }, + { + "epoch": 80.46367851622875, + "grad_norm": 0.722578763961792, + "learning_rate": 1.3529690140125209e-05, + "loss": 0.2522, + "num_input_tokens_seen": 39821360, + "step": 26070 + }, + { + "epoch": 80.47913446676971, + "grad_norm": 0.45293745398521423, + "learning_rate": 1.352096787082553e-05, + "loss": 0.24, + "num_input_tokens_seen": 39829072, + "step": 26075 + }, + { + "epoch": 80.49459041731066, + "grad_norm": 0.5422620177268982, + "learning_rate": 1.3512247371736871e-05, + "loss": 0.2746, + "num_input_tokens_seen": 39836496, + "step": 26080 + }, + { + "epoch": 80.51004636785163, + "grad_norm": 0.5214555859565735, + "learning_rate": 1.3503528644204022e-05, + "loss": 0.2672, + "num_input_tokens_seen": 39843824, + "step": 26085 + }, + { + "epoch": 80.52550231839258, + "grad_norm": 0.5186354517936707, + "learning_rate": 1.349481168957153e-05, + "loss": 0.2275, + "num_input_tokens_seen": 39851632, + "step": 26090 + }, + { + "epoch": 80.54095826893354, + "grad_norm": 0.403985857963562, + "learning_rate": 1.3486096509183665e-05, + "loss": 0.2429, + "num_input_tokens_seen": 39858992, + "step": 26095 + }, + { + "epoch": 80.5564142194745, + "grad_norm": 0.37322553992271423, + "learning_rate": 1.3477383104384406e-05, + "loss": 0.2205, + "num_input_tokens_seen": 39866928, + "step": 26100 + }, + { + "epoch": 80.57187017001546, + "grad_norm": 0.49777951836586, + "learning_rate": 1.3468671476517481e-05, + "loss": 0.2639, + "num_input_tokens_seen": 39874032, + "step": 26105 + }, + { + "epoch": 80.58732612055641, + "grad_norm": 0.4323778748512268, + "learning_rate": 1.3459961626926326e-05, + "loss": 0.2071, + "num_input_tokens_seen": 39881584, + "step": 26110 + }, + { + "epoch": 80.60278207109737, + "grad_norm": 0.362641304731369, + "learning_rate": 1.3451253556954101e-05, + "loss": 0.2422, + "num_input_tokens_seen": 39889104, + "step": 26115 + }, + { + "epoch": 80.61823802163833, + "grad_norm": 0.5948364734649658, + "learning_rate": 1.3442547267943717e-05, + "loss": 0.262, + "num_input_tokens_seen": 39896624, + "step": 26120 + }, + { + "epoch": 80.63369397217929, + "grad_norm": 0.5070391297340393, + "learning_rate": 1.3433842761237774e-05, + "loss": 0.2696, + "num_input_tokens_seen": 39904784, + "step": 26125 + }, + { + "epoch": 80.64914992272024, + "grad_norm": 0.32022303342819214, + "learning_rate": 1.3425140038178639e-05, + "loss": 0.2254, + "num_input_tokens_seen": 39912400, + "step": 26130 + }, + { + "epoch": 80.66460587326121, + "grad_norm": 0.5136694312095642, + "learning_rate": 1.3416439100108358e-05, + "loss": 0.2318, + "num_input_tokens_seen": 39920112, + "step": 26135 + }, + { + "epoch": 80.68006182380216, + "grad_norm": 0.37690746784210205, + "learning_rate": 1.3407739948368734e-05, + "loss": 0.2269, + "num_input_tokens_seen": 39927984, + "step": 26140 + }, + { + "epoch": 80.69551777434312, + "grad_norm": 0.41796934604644775, + "learning_rate": 1.3399042584301298e-05, + "loss": 0.3373, + "num_input_tokens_seen": 39935664, + "step": 26145 + }, + { + "epoch": 80.71097372488408, + "grad_norm": 0.6343073844909668, + "learning_rate": 1.3390347009247272e-05, + "loss": 0.2289, + "num_input_tokens_seen": 39943120, + "step": 26150 + }, + { + "epoch": 80.72642967542504, + "grad_norm": 0.43763649463653564, + "learning_rate": 1.3381653224547635e-05, + "loss": 0.2131, + "num_input_tokens_seen": 39950672, + "step": 26155 + }, + { + "epoch": 80.74188562596599, + "grad_norm": 0.8229809403419495, + "learning_rate": 1.3372961231543086e-05, + "loss": 0.2658, + "num_input_tokens_seen": 39958544, + "step": 26160 + }, + { + "epoch": 80.75734157650696, + "grad_norm": 0.39464491605758667, + "learning_rate": 1.3364271031574016e-05, + "loss": 0.2016, + "num_input_tokens_seen": 39966320, + "step": 26165 + }, + { + "epoch": 80.77279752704791, + "grad_norm": 0.6193063259124756, + "learning_rate": 1.335558262598059e-05, + "loss": 0.2305, + "num_input_tokens_seen": 39974096, + "step": 26170 + }, + { + "epoch": 80.78825347758887, + "grad_norm": 0.8039327263832092, + "learning_rate": 1.3346896016102645e-05, + "loss": 0.2864, + "num_input_tokens_seen": 39982608, + "step": 26175 + }, + { + "epoch": 80.80370942812984, + "grad_norm": 0.89588463306427, + "learning_rate": 1.3338211203279788e-05, + "loss": 0.3002, + "num_input_tokens_seen": 39990960, + "step": 26180 + }, + { + "epoch": 80.81916537867079, + "grad_norm": 0.5384249091148376, + "learning_rate": 1.3329528188851303e-05, + "loss": 0.2458, + "num_input_tokens_seen": 39998800, + "step": 26185 + }, + { + "epoch": 80.83462132921174, + "grad_norm": 0.3892299234867096, + "learning_rate": 1.3320846974156242e-05, + "loss": 0.2053, + "num_input_tokens_seen": 40006160, + "step": 26190 + }, + { + "epoch": 80.85007727975271, + "grad_norm": 0.4574582576751709, + "learning_rate": 1.3312167560533337e-05, + "loss": 0.2211, + "num_input_tokens_seen": 40014320, + "step": 26195 + }, + { + "epoch": 80.86553323029366, + "grad_norm": 0.4714636206626892, + "learning_rate": 1.3303489949321082e-05, + "loss": 0.2209, + "num_input_tokens_seen": 40021520, + "step": 26200 + }, + { + "epoch": 80.86553323029366, + "eval_loss": 0.30503982305526733, + "eval_runtime": 6.2975, + "eval_samples_per_second": 91.306, + "eval_steps_per_second": 22.866, + "num_input_tokens_seen": 40021520, + "step": 26200 + }, + { + "epoch": 80.88098918083462, + "grad_norm": 0.5393342971801758, + "learning_rate": 1.3294814141857653e-05, + "loss": 0.2391, + "num_input_tokens_seen": 40028880, + "step": 26205 + }, + { + "epoch": 80.89644513137559, + "grad_norm": 0.5985392332077026, + "learning_rate": 1.3286140139480992e-05, + "loss": 0.2631, + "num_input_tokens_seen": 40036624, + "step": 26210 + }, + { + "epoch": 80.91190108191654, + "grad_norm": 0.5573476552963257, + "learning_rate": 1.3277467943528719e-05, + "loss": 0.3227, + "num_input_tokens_seen": 40044656, + "step": 26215 + }, + { + "epoch": 80.9273570324575, + "grad_norm": 0.32592499256134033, + "learning_rate": 1.3268797555338203e-05, + "loss": 0.2888, + "num_input_tokens_seen": 40052464, + "step": 26220 + }, + { + "epoch": 80.94281298299846, + "grad_norm": 0.4681345224380493, + "learning_rate": 1.3260128976246533e-05, + "loss": 0.2705, + "num_input_tokens_seen": 40059984, + "step": 26225 + }, + { + "epoch": 80.95826893353942, + "grad_norm": 0.41728222370147705, + "learning_rate": 1.32514622075905e-05, + "loss": 0.2834, + "num_input_tokens_seen": 40067856, + "step": 26230 + }, + { + "epoch": 80.97372488408037, + "grad_norm": 0.6787484288215637, + "learning_rate": 1.3242797250706638e-05, + "loss": 0.3141, + "num_input_tokens_seen": 40075696, + "step": 26235 + }, + { + "epoch": 80.98918083462132, + "grad_norm": 0.4217672348022461, + "learning_rate": 1.3234134106931195e-05, + "loss": 0.2546, + "num_input_tokens_seen": 40082992, + "step": 26240 + }, + { + "epoch": 81.0030911901082, + "grad_norm": 0.3965408504009247, + "learning_rate": 1.322547277760013e-05, + "loss": 0.2087, + "num_input_tokens_seen": 40089664, + "step": 26245 + }, + { + "epoch": 81.01854714064915, + "grad_norm": 0.9564039707183838, + "learning_rate": 1.3216813264049132e-05, + "loss": 0.2144, + "num_input_tokens_seen": 40096864, + "step": 26250 + }, + { + "epoch": 81.03400309119012, + "grad_norm": 0.44595351815223694, + "learning_rate": 1.32081555676136e-05, + "loss": 0.2777, + "num_input_tokens_seen": 40104224, + "step": 26255 + }, + { + "epoch": 81.04945904173107, + "grad_norm": 0.3986407220363617, + "learning_rate": 1.3199499689628674e-05, + "loss": 0.2866, + "num_input_tokens_seen": 40111968, + "step": 26260 + }, + { + "epoch": 81.06491499227202, + "grad_norm": 0.8816087245941162, + "learning_rate": 1.3190845631429192e-05, + "loss": 0.199, + "num_input_tokens_seen": 40119360, + "step": 26265 + }, + { + "epoch": 81.08037094281298, + "grad_norm": 0.37374165654182434, + "learning_rate": 1.3182193394349704e-05, + "loss": 0.2464, + "num_input_tokens_seen": 40127168, + "step": 26270 + }, + { + "epoch": 81.09582689335394, + "grad_norm": 0.48701804876327515, + "learning_rate": 1.3173542979724507e-05, + "loss": 0.2705, + "num_input_tokens_seen": 40134432, + "step": 26275 + }, + { + "epoch": 81.1112828438949, + "grad_norm": 0.42675885558128357, + "learning_rate": 1.3164894388887617e-05, + "loss": 0.2186, + "num_input_tokens_seen": 40142176, + "step": 26280 + }, + { + "epoch": 81.12673879443585, + "grad_norm": 0.2877981662750244, + "learning_rate": 1.3156247623172727e-05, + "loss": 0.2473, + "num_input_tokens_seen": 40149952, + "step": 26285 + }, + { + "epoch": 81.14219474497682, + "grad_norm": 0.4571559727191925, + "learning_rate": 1.3147602683913302e-05, + "loss": 0.262, + "num_input_tokens_seen": 40157280, + "step": 26290 + }, + { + "epoch": 81.15765069551777, + "grad_norm": 0.5910922884941101, + "learning_rate": 1.3138959572442481e-05, + "loss": 0.3032, + "num_input_tokens_seen": 40165088, + "step": 26295 + }, + { + "epoch": 81.17310664605873, + "grad_norm": 0.4549030065536499, + "learning_rate": 1.3130318290093146e-05, + "loss": 0.2155, + "num_input_tokens_seen": 40172448, + "step": 26300 + }, + { + "epoch": 81.1885625965997, + "grad_norm": 0.43949368596076965, + "learning_rate": 1.3121678838197909e-05, + "loss": 0.251, + "num_input_tokens_seen": 40180064, + "step": 26305 + }, + { + "epoch": 81.20401854714065, + "grad_norm": 0.8663647770881653, + "learning_rate": 1.3113041218089056e-05, + "loss": 0.2718, + "num_input_tokens_seen": 40187680, + "step": 26310 + }, + { + "epoch": 81.2194744976816, + "grad_norm": 0.38716036081314087, + "learning_rate": 1.3104405431098626e-05, + "loss": 0.2175, + "num_input_tokens_seen": 40195584, + "step": 26315 + }, + { + "epoch": 81.23493044822257, + "grad_norm": 0.5889010429382324, + "learning_rate": 1.3095771478558377e-05, + "loss": 0.3141, + "num_input_tokens_seen": 40203520, + "step": 26320 + }, + { + "epoch": 81.25038639876352, + "grad_norm": 0.5232728719711304, + "learning_rate": 1.3087139361799766e-05, + "loss": 0.3434, + "num_input_tokens_seen": 40211584, + "step": 26325 + }, + { + "epoch": 81.26584234930448, + "grad_norm": 0.5727049708366394, + "learning_rate": 1.3078509082153964e-05, + "loss": 0.2624, + "num_input_tokens_seen": 40219296, + "step": 26330 + }, + { + "epoch": 81.28129829984545, + "grad_norm": 0.3926084637641907, + "learning_rate": 1.3069880640951885e-05, + "loss": 0.2179, + "num_input_tokens_seen": 40227360, + "step": 26335 + }, + { + "epoch": 81.2967542503864, + "grad_norm": 0.39319080114364624, + "learning_rate": 1.3061254039524123e-05, + "loss": 0.2152, + "num_input_tokens_seen": 40235200, + "step": 26340 + }, + { + "epoch": 81.31221020092735, + "grad_norm": 0.6305240988731384, + "learning_rate": 1.3052629279201028e-05, + "loss": 0.2576, + "num_input_tokens_seen": 40242976, + "step": 26345 + }, + { + "epoch": 81.32766615146832, + "grad_norm": 0.3699406385421753, + "learning_rate": 1.3044006361312633e-05, + "loss": 0.1937, + "num_input_tokens_seen": 40249952, + "step": 26350 + }, + { + "epoch": 81.34312210200927, + "grad_norm": 0.328660786151886, + "learning_rate": 1.30353852871887e-05, + "loss": 0.2692, + "num_input_tokens_seen": 40257440, + "step": 26355 + }, + { + "epoch": 81.35857805255023, + "grad_norm": 0.6246643662452698, + "learning_rate": 1.302676605815873e-05, + "loss": 0.2572, + "num_input_tokens_seen": 40265120, + "step": 26360 + }, + { + "epoch": 81.3740340030912, + "grad_norm": 0.605009913444519, + "learning_rate": 1.3018148675551884e-05, + "loss": 0.2234, + "num_input_tokens_seen": 40272864, + "step": 26365 + }, + { + "epoch": 81.38948995363215, + "grad_norm": 0.36079710721969604, + "learning_rate": 1.3009533140697094e-05, + "loss": 0.2015, + "num_input_tokens_seen": 40280448, + "step": 26370 + }, + { + "epoch": 81.4049459041731, + "grad_norm": 0.786505401134491, + "learning_rate": 1.3000919454922966e-05, + "loss": 0.2124, + "num_input_tokens_seen": 40287936, + "step": 26375 + }, + { + "epoch": 81.42040185471407, + "grad_norm": 0.4812209904193878, + "learning_rate": 1.299230761955785e-05, + "loss": 0.2575, + "num_input_tokens_seen": 40295296, + "step": 26380 + }, + { + "epoch": 81.43585780525503, + "grad_norm": 0.3535204827785492, + "learning_rate": 1.2983697635929807e-05, + "loss": 0.3019, + "num_input_tokens_seen": 40302496, + "step": 26385 + }, + { + "epoch": 81.45131375579598, + "grad_norm": 0.575851559638977, + "learning_rate": 1.2975089505366584e-05, + "loss": 0.2193, + "num_input_tokens_seen": 40309664, + "step": 26390 + }, + { + "epoch": 81.46676970633693, + "grad_norm": 0.4596392810344696, + "learning_rate": 1.2966483229195683e-05, + "loss": 0.244, + "num_input_tokens_seen": 40317728, + "step": 26395 + }, + { + "epoch": 81.4822256568779, + "grad_norm": 0.9419782757759094, + "learning_rate": 1.2957878808744283e-05, + "loss": 0.2747, + "num_input_tokens_seen": 40325376, + "step": 26400 + }, + { + "epoch": 81.4822256568779, + "eval_loss": 0.30357325077056885, + "eval_runtime": 6.3133, + "eval_samples_per_second": 91.077, + "eval_steps_per_second": 22.809, + "num_input_tokens_seen": 40325376, + "step": 26400 + }, + { + "epoch": 81.49768160741885, + "grad_norm": 0.5178597569465637, + "learning_rate": 1.294927624533931e-05, + "loss": 0.2839, + "num_input_tokens_seen": 40333728, + "step": 26405 + }, + { + "epoch": 81.51313755795981, + "grad_norm": 0.3166300654411316, + "learning_rate": 1.2940675540307378e-05, + "loss": 0.2601, + "num_input_tokens_seen": 40341632, + "step": 26410 + }, + { + "epoch": 81.52859350850078, + "grad_norm": 0.49064451456069946, + "learning_rate": 1.2932076694974814e-05, + "loss": 0.212, + "num_input_tokens_seen": 40349056, + "step": 26415 + }, + { + "epoch": 81.54404945904173, + "grad_norm": 0.4612979590892792, + "learning_rate": 1.2923479710667682e-05, + "loss": 0.2321, + "num_input_tokens_seen": 40356736, + "step": 26420 + }, + { + "epoch": 81.55950540958268, + "grad_norm": 0.5057898163795471, + "learning_rate": 1.2914884588711751e-05, + "loss": 0.2091, + "num_input_tokens_seen": 40363808, + "step": 26425 + }, + { + "epoch": 81.57496136012365, + "grad_norm": 0.4463055431842804, + "learning_rate": 1.2906291330432475e-05, + "loss": 0.2635, + "num_input_tokens_seen": 40370880, + "step": 26430 + }, + { + "epoch": 81.5904173106646, + "grad_norm": 0.3569948077201843, + "learning_rate": 1.2897699937155055e-05, + "loss": 0.2963, + "num_input_tokens_seen": 40378400, + "step": 26435 + }, + { + "epoch": 81.60587326120556, + "grad_norm": 0.34054306149482727, + "learning_rate": 1.2889110410204403e-05, + "loss": 0.2517, + "num_input_tokens_seen": 40385952, + "step": 26440 + }, + { + "epoch": 81.62132921174653, + "grad_norm": 0.6847313642501831, + "learning_rate": 1.2880522750905111e-05, + "loss": 0.2495, + "num_input_tokens_seen": 40393408, + "step": 26445 + }, + { + "epoch": 81.63678516228748, + "grad_norm": 0.4477197229862213, + "learning_rate": 1.2871936960581523e-05, + "loss": 0.2005, + "num_input_tokens_seen": 40401120, + "step": 26450 + }, + { + "epoch": 81.65224111282843, + "grad_norm": 0.46766188740730286, + "learning_rate": 1.2863353040557658e-05, + "loss": 0.254, + "num_input_tokens_seen": 40408736, + "step": 26455 + }, + { + "epoch": 81.6676970633694, + "grad_norm": 0.42809242010116577, + "learning_rate": 1.2854770992157273e-05, + "loss": 0.252, + "num_input_tokens_seen": 40416416, + "step": 26460 + }, + { + "epoch": 81.68315301391036, + "grad_norm": 0.5105990767478943, + "learning_rate": 1.2846190816703835e-05, + "loss": 0.2218, + "num_input_tokens_seen": 40423840, + "step": 26465 + }, + { + "epoch": 81.69860896445131, + "grad_norm": 0.513450026512146, + "learning_rate": 1.2837612515520498e-05, + "loss": 0.2542, + "num_input_tokens_seen": 40431552, + "step": 26470 + }, + { + "epoch": 81.71406491499228, + "grad_norm": 0.6487897634506226, + "learning_rate": 1.2829036089930163e-05, + "loss": 0.2278, + "num_input_tokens_seen": 40439680, + "step": 26475 + }, + { + "epoch": 81.72952086553323, + "grad_norm": 0.7043088674545288, + "learning_rate": 1.2820461541255412e-05, + "loss": 0.24, + "num_input_tokens_seen": 40447008, + "step": 26480 + }, + { + "epoch": 81.74497681607419, + "grad_norm": 0.30971550941467285, + "learning_rate": 1.2811888870818543e-05, + "loss": 0.3071, + "num_input_tokens_seen": 40454208, + "step": 26485 + }, + { + "epoch": 81.76043276661515, + "grad_norm": 0.6514818072319031, + "learning_rate": 1.2803318079941581e-05, + "loss": 0.2168, + "num_input_tokens_seen": 40461536, + "step": 26490 + }, + { + "epoch": 81.7758887171561, + "grad_norm": 0.5188510417938232, + "learning_rate": 1.2794749169946235e-05, + "loss": 0.2769, + "num_input_tokens_seen": 40469824, + "step": 26495 + }, + { + "epoch": 81.79134466769706, + "grad_norm": 0.4034996032714844, + "learning_rate": 1.2786182142153952e-05, + "loss": 0.2474, + "num_input_tokens_seen": 40477600, + "step": 26500 + }, + { + "epoch": 81.80680061823801, + "grad_norm": 0.7337327003479004, + "learning_rate": 1.2777616997885878e-05, + "loss": 0.2913, + "num_input_tokens_seen": 40485920, + "step": 26505 + }, + { + "epoch": 81.82225656877898, + "grad_norm": 0.4864504933357239, + "learning_rate": 1.2769053738462847e-05, + "loss": 0.2007, + "num_input_tokens_seen": 40493600, + "step": 26510 + }, + { + "epoch": 81.83771251931994, + "grad_norm": 0.6359832882881165, + "learning_rate": 1.2760492365205434e-05, + "loss": 0.2458, + "num_input_tokens_seen": 40501600, + "step": 26515 + }, + { + "epoch": 81.85316846986089, + "grad_norm": 0.5617509484291077, + "learning_rate": 1.2751932879433919e-05, + "loss": 0.2561, + "num_input_tokens_seen": 40509312, + "step": 26520 + }, + { + "epoch": 81.86862442040186, + "grad_norm": 0.7166659235954285, + "learning_rate": 1.2743375282468267e-05, + "loss": 0.3273, + "num_input_tokens_seen": 40516768, + "step": 26525 + }, + { + "epoch": 81.88408037094281, + "grad_norm": 0.5556740760803223, + "learning_rate": 1.2734819575628182e-05, + "loss": 0.2676, + "num_input_tokens_seen": 40525024, + "step": 26530 + }, + { + "epoch": 81.89953632148377, + "grad_norm": 0.39837804436683655, + "learning_rate": 1.2726265760233039e-05, + "loss": 0.2558, + "num_input_tokens_seen": 40532800, + "step": 26535 + }, + { + "epoch": 81.91499227202473, + "grad_norm": 0.42389824986457825, + "learning_rate": 1.271771383760197e-05, + "loss": 0.2617, + "num_input_tokens_seen": 40540128, + "step": 26540 + }, + { + "epoch": 81.93044822256569, + "grad_norm": 0.41964974999427795, + "learning_rate": 1.2709163809053764e-05, + "loss": 0.2326, + "num_input_tokens_seen": 40548032, + "step": 26545 + }, + { + "epoch": 81.94590417310664, + "grad_norm": 0.5457534193992615, + "learning_rate": 1.2700615675906963e-05, + "loss": 0.2192, + "num_input_tokens_seen": 40555424, + "step": 26550 + }, + { + "epoch": 81.96136012364761, + "grad_norm": 0.42933931946754456, + "learning_rate": 1.269206943947978e-05, + "loss": 0.2366, + "num_input_tokens_seen": 40562752, + "step": 26555 + }, + { + "epoch": 81.97681607418856, + "grad_norm": 0.7360703945159912, + "learning_rate": 1.2683525101090177e-05, + "loss": 0.2905, + "num_input_tokens_seen": 40570304, + "step": 26560 + }, + { + "epoch": 81.99227202472952, + "grad_norm": 0.5018593668937683, + "learning_rate": 1.2674982662055765e-05, + "loss": 0.2228, + "num_input_tokens_seen": 40578592, + "step": 26565 + }, + { + "epoch": 82.00618238021639, + "grad_norm": 0.40827006101608276, + "learning_rate": 1.2666442123693922e-05, + "loss": 0.2725, + "num_input_tokens_seen": 40585024, + "step": 26570 + }, + { + "epoch": 82.02163833075734, + "grad_norm": 0.7627209424972534, + "learning_rate": 1.265790348732169e-05, + "loss": 0.2664, + "num_input_tokens_seen": 40592640, + "step": 26575 + }, + { + "epoch": 82.0370942812983, + "grad_norm": 0.6134704351425171, + "learning_rate": 1.264936675425584e-05, + "loss": 0.232, + "num_input_tokens_seen": 40600416, + "step": 26580 + }, + { + "epoch": 82.05255023183926, + "grad_norm": 0.4228672981262207, + "learning_rate": 1.2640831925812852e-05, + "loss": 0.2287, + "num_input_tokens_seen": 40608064, + "step": 26585 + }, + { + "epoch": 82.06800618238022, + "grad_norm": 0.7019554376602173, + "learning_rate": 1.263229900330889e-05, + "loss": 0.2516, + "num_input_tokens_seen": 40615872, + "step": 26590 + }, + { + "epoch": 82.08346213292117, + "grad_norm": 0.5426173806190491, + "learning_rate": 1.2623767988059843e-05, + "loss": 0.2303, + "num_input_tokens_seen": 40623936, + "step": 26595 + }, + { + "epoch": 82.09891808346214, + "grad_norm": 0.5219367742538452, + "learning_rate": 1.2615238881381309e-05, + "loss": 0.267, + "num_input_tokens_seen": 40631296, + "step": 26600 + }, + { + "epoch": 82.09891808346214, + "eval_loss": 0.30460676550865173, + "eval_runtime": 6.3105, + "eval_samples_per_second": 91.118, + "eval_steps_per_second": 22.819, + "num_input_tokens_seen": 40631296, + "step": 26600 + }, + { + "epoch": 82.11437403400309, + "grad_norm": 0.8191511631011963, + "learning_rate": 1.2606711684588568e-05, + "loss": 0.2732, + "num_input_tokens_seen": 40638816, + "step": 26605 + }, + { + "epoch": 82.12982998454405, + "grad_norm": 0.4607643187046051, + "learning_rate": 1.2598186398996636e-05, + "loss": 0.2957, + "num_input_tokens_seen": 40646400, + "step": 26610 + }, + { + "epoch": 82.14528593508501, + "grad_norm": 0.6259880065917969, + "learning_rate": 1.2589663025920207e-05, + "loss": 0.3192, + "num_input_tokens_seen": 40653984, + "step": 26615 + }, + { + "epoch": 82.16074188562597, + "grad_norm": 0.49893835186958313, + "learning_rate": 1.2581141566673705e-05, + "loss": 0.3001, + "num_input_tokens_seen": 40662144, + "step": 26620 + }, + { + "epoch": 82.17619783616692, + "grad_norm": 0.4388258755207062, + "learning_rate": 1.257262202257124e-05, + "loss": 0.2573, + "num_input_tokens_seen": 40670496, + "step": 26625 + }, + { + "epoch": 82.19165378670789, + "grad_norm": 0.3656001687049866, + "learning_rate": 1.2564104394926618e-05, + "loss": 0.2743, + "num_input_tokens_seen": 40678176, + "step": 26630 + }, + { + "epoch": 82.20710973724884, + "grad_norm": 0.6389029026031494, + "learning_rate": 1.2555588685053383e-05, + "loss": 0.2668, + "num_input_tokens_seen": 40685952, + "step": 26635 + }, + { + "epoch": 82.2225656877898, + "grad_norm": 0.3191850781440735, + "learning_rate": 1.2547074894264762e-05, + "loss": 0.228, + "num_input_tokens_seen": 40693376, + "step": 26640 + }, + { + "epoch": 82.23802163833076, + "grad_norm": 0.650680422782898, + "learning_rate": 1.2538563023873679e-05, + "loss": 0.2265, + "num_input_tokens_seen": 40700672, + "step": 26645 + }, + { + "epoch": 82.25347758887172, + "grad_norm": 0.44979774951934814, + "learning_rate": 1.2530053075192789e-05, + "loss": 0.2143, + "num_input_tokens_seen": 40708064, + "step": 26650 + }, + { + "epoch": 82.26893353941267, + "grad_norm": 0.7377551794052124, + "learning_rate": 1.252154504953441e-05, + "loss": 0.2312, + "num_input_tokens_seen": 40716064, + "step": 26655 + }, + { + "epoch": 82.28438948995363, + "grad_norm": 0.41903215646743774, + "learning_rate": 1.25130389482106e-05, + "loss": 0.2184, + "num_input_tokens_seen": 40723936, + "step": 26660 + }, + { + "epoch": 82.2998454404946, + "grad_norm": 0.36712804436683655, + "learning_rate": 1.2504534772533116e-05, + "loss": 0.3412, + "num_input_tokens_seen": 40731360, + "step": 26665 + }, + { + "epoch": 82.31530139103555, + "grad_norm": 0.34368255734443665, + "learning_rate": 1.2496032523813387e-05, + "loss": 0.2255, + "num_input_tokens_seen": 40738656, + "step": 26670 + }, + { + "epoch": 82.3307573415765, + "grad_norm": 0.7213471531867981, + "learning_rate": 1.2487532203362576e-05, + "loss": 0.2283, + "num_input_tokens_seen": 40745952, + "step": 26675 + }, + { + "epoch": 82.34621329211747, + "grad_norm": 0.33536699414253235, + "learning_rate": 1.247903381249155e-05, + "loss": 0.2454, + "num_input_tokens_seen": 40753344, + "step": 26680 + }, + { + "epoch": 82.36166924265842, + "grad_norm": 0.47209426760673523, + "learning_rate": 1.2470537352510853e-05, + "loss": 0.2627, + "num_input_tokens_seen": 40761536, + "step": 26685 + }, + { + "epoch": 82.37712519319938, + "grad_norm": 0.7747596502304077, + "learning_rate": 1.2462042824730758e-05, + "loss": 0.2165, + "num_input_tokens_seen": 40768800, + "step": 26690 + }, + { + "epoch": 82.39258114374034, + "grad_norm": 0.6926923990249634, + "learning_rate": 1.245355023046122e-05, + "loss": 0.2731, + "num_input_tokens_seen": 40776576, + "step": 26695 + }, + { + "epoch": 82.4080370942813, + "grad_norm": 0.7898098826408386, + "learning_rate": 1.2445059571011896e-05, + "loss": 0.3254, + "num_input_tokens_seen": 40783968, + "step": 26700 + }, + { + "epoch": 82.42349304482225, + "grad_norm": 0.5974790453910828, + "learning_rate": 1.2436570847692173e-05, + "loss": 0.331, + "num_input_tokens_seen": 40791904, + "step": 26705 + }, + { + "epoch": 82.43894899536322, + "grad_norm": 0.5869937539100647, + "learning_rate": 1.2428084061811096e-05, + "loss": 0.2274, + "num_input_tokens_seen": 40799552, + "step": 26710 + }, + { + "epoch": 82.45440494590417, + "grad_norm": 0.3133825361728668, + "learning_rate": 1.2419599214677447e-05, + "loss": 0.2195, + "num_input_tokens_seen": 40807808, + "step": 26715 + }, + { + "epoch": 82.46986089644513, + "grad_norm": 0.8317719101905823, + "learning_rate": 1.2411116307599702e-05, + "loss": 0.3107, + "num_input_tokens_seen": 40815328, + "step": 26720 + }, + { + "epoch": 82.4853168469861, + "grad_norm": 0.4984034299850464, + "learning_rate": 1.2402635341886016e-05, + "loss": 0.2668, + "num_input_tokens_seen": 40823776, + "step": 26725 + }, + { + "epoch": 82.50077279752705, + "grad_norm": 0.4236953556537628, + "learning_rate": 1.2394156318844278e-05, + "loss": 0.2311, + "num_input_tokens_seen": 40831360, + "step": 26730 + }, + { + "epoch": 82.516228748068, + "grad_norm": 0.40819990634918213, + "learning_rate": 1.2385679239782039e-05, + "loss": 0.1988, + "num_input_tokens_seen": 40838592, + "step": 26735 + }, + { + "epoch": 82.53168469860897, + "grad_norm": 0.4962233304977417, + "learning_rate": 1.2377204106006585e-05, + "loss": 0.235, + "num_input_tokens_seen": 40845824, + "step": 26740 + }, + { + "epoch": 82.54714064914992, + "grad_norm": 0.4280478060245514, + "learning_rate": 1.2368730918824891e-05, + "loss": 0.24, + "num_input_tokens_seen": 40853376, + "step": 26745 + }, + { + "epoch": 82.56259659969088, + "grad_norm": 0.3115095794200897, + "learning_rate": 1.236025967954362e-05, + "loss": 0.2096, + "num_input_tokens_seen": 40860736, + "step": 26750 + }, + { + "epoch": 82.57805255023185, + "grad_norm": 0.3105193078517914, + "learning_rate": 1.2351790389469153e-05, + "loss": 0.2841, + "num_input_tokens_seen": 40867968, + "step": 26755 + }, + { + "epoch": 82.5935085007728, + "grad_norm": 0.6957253217697144, + "learning_rate": 1.234332304990755e-05, + "loss": 0.2898, + "num_input_tokens_seen": 40875520, + "step": 26760 + }, + { + "epoch": 82.60896445131375, + "grad_norm": 0.9398914575576782, + "learning_rate": 1.2334857662164593e-05, + "loss": 0.2526, + "num_input_tokens_seen": 40883008, + "step": 26765 + }, + { + "epoch": 82.62442040185472, + "grad_norm": 0.5844948291778564, + "learning_rate": 1.2326394227545743e-05, + "loss": 0.221, + "num_input_tokens_seen": 40890784, + "step": 26770 + }, + { + "epoch": 82.63987635239567, + "grad_norm": 0.6184459328651428, + "learning_rate": 1.2317932747356162e-05, + "loss": 0.2623, + "num_input_tokens_seen": 40898720, + "step": 26775 + }, + { + "epoch": 82.65533230293663, + "grad_norm": 0.43953484296798706, + "learning_rate": 1.2309473222900726e-05, + "loss": 0.2126, + "num_input_tokens_seen": 40906400, + "step": 26780 + }, + { + "epoch": 82.67078825347758, + "grad_norm": 0.7737885117530823, + "learning_rate": 1.2301015655484006e-05, + "loss": 0.2448, + "num_input_tokens_seen": 40914176, + "step": 26785 + }, + { + "epoch": 82.68624420401855, + "grad_norm": 0.4987028241157532, + "learning_rate": 1.2292560046410245e-05, + "loss": 0.2084, + "num_input_tokens_seen": 40922112, + "step": 26790 + }, + { + "epoch": 82.7017001545595, + "grad_norm": 0.4130164086818695, + "learning_rate": 1.228410639698343e-05, + "loss": 0.2353, + "num_input_tokens_seen": 40929920, + "step": 26795 + }, + { + "epoch": 82.71715610510046, + "grad_norm": 0.5930743217468262, + "learning_rate": 1.2275654708507195e-05, + "loss": 0.2835, + "num_input_tokens_seen": 40937312, + "step": 26800 + }, + { + "epoch": 82.71715610510046, + "eval_loss": 0.3045356571674347, + "eval_runtime": 6.2891, + "eval_samples_per_second": 91.428, + "eval_steps_per_second": 22.897, + "num_input_tokens_seen": 40937312, + "step": 26800 + }, + { + "epoch": 82.73261205564143, + "grad_norm": 0.9879082441329956, + "learning_rate": 1.2267204982284908e-05, + "loss": 0.2431, + "num_input_tokens_seen": 40944640, + "step": 26805 + }, + { + "epoch": 82.74806800618238, + "grad_norm": 0.2792445123195648, + "learning_rate": 1.2258757219619635e-05, + "loss": 0.1982, + "num_input_tokens_seen": 40952576, + "step": 26810 + }, + { + "epoch": 82.76352395672333, + "grad_norm": 0.4075752794742584, + "learning_rate": 1.2250311421814104e-05, + "loss": 0.2574, + "num_input_tokens_seen": 40960640, + "step": 26815 + }, + { + "epoch": 82.7789799072643, + "grad_norm": 0.6411494612693787, + "learning_rate": 1.2241867590170772e-05, + "loss": 0.2629, + "num_input_tokens_seen": 40968128, + "step": 26820 + }, + { + "epoch": 82.79443585780525, + "grad_norm": 0.4152558445930481, + "learning_rate": 1.2233425725991799e-05, + "loss": 0.2466, + "num_input_tokens_seen": 40975232, + "step": 26825 + }, + { + "epoch": 82.80989180834621, + "grad_norm": 0.4909226894378662, + "learning_rate": 1.2224985830579003e-05, + "loss": 0.265, + "num_input_tokens_seen": 40983200, + "step": 26830 + }, + { + "epoch": 82.82534775888718, + "grad_norm": 0.2809932231903076, + "learning_rate": 1.2216547905233944e-05, + "loss": 0.2229, + "num_input_tokens_seen": 40991744, + "step": 26835 + }, + { + "epoch": 82.84080370942813, + "grad_norm": 0.45727530121803284, + "learning_rate": 1.2208111951257842e-05, + "loss": 0.2002, + "num_input_tokens_seen": 40999200, + "step": 26840 + }, + { + "epoch": 82.85625965996908, + "grad_norm": 0.3406575322151184, + "learning_rate": 1.2199677969951622e-05, + "loss": 0.211, + "num_input_tokens_seen": 41006784, + "step": 26845 + }, + { + "epoch": 82.87171561051005, + "grad_norm": 0.39659997820854187, + "learning_rate": 1.2191245962615927e-05, + "loss": 0.1862, + "num_input_tokens_seen": 41014304, + "step": 26850 + }, + { + "epoch": 82.887171561051, + "grad_norm": 0.427328884601593, + "learning_rate": 1.218281593055106e-05, + "loss": 0.225, + "num_input_tokens_seen": 41021728, + "step": 26855 + }, + { + "epoch": 82.90262751159196, + "grad_norm": 0.5560395121574402, + "learning_rate": 1.217438787505705e-05, + "loss": 0.2071, + "num_input_tokens_seen": 41029472, + "step": 26860 + }, + { + "epoch": 82.91808346213293, + "grad_norm": 0.5699031352996826, + "learning_rate": 1.2165961797433615e-05, + "loss": 0.2022, + "num_input_tokens_seen": 41037056, + "step": 26865 + }, + { + "epoch": 82.93353941267388, + "grad_norm": 0.2931666970252991, + "learning_rate": 1.215753769898014e-05, + "loss": 0.2596, + "num_input_tokens_seen": 41044928, + "step": 26870 + }, + { + "epoch": 82.94899536321483, + "grad_norm": 0.5136330127716064, + "learning_rate": 1.2149115580995755e-05, + "loss": 0.2681, + "num_input_tokens_seen": 41052128, + "step": 26875 + }, + { + "epoch": 82.9644513137558, + "grad_norm": 0.5368669629096985, + "learning_rate": 1.2140695444779227e-05, + "loss": 0.2446, + "num_input_tokens_seen": 41059904, + "step": 26880 + }, + { + "epoch": 82.97990726429676, + "grad_norm": 0.496685266494751, + "learning_rate": 1.2132277291629066e-05, + "loss": 0.2461, + "num_input_tokens_seen": 41067872, + "step": 26885 + }, + { + "epoch": 82.99536321483771, + "grad_norm": 0.7069861888885498, + "learning_rate": 1.2123861122843458e-05, + "loss": 0.2949, + "num_input_tokens_seen": 41076288, + "step": 26890 + }, + { + "epoch": 83.00927357032458, + "grad_norm": 0.3980322480201721, + "learning_rate": 1.2115446939720271e-05, + "loss": 0.322, + "num_input_tokens_seen": 41083088, + "step": 26895 + }, + { + "epoch": 83.02472952086553, + "grad_norm": 0.5986178517341614, + "learning_rate": 1.210703474355708e-05, + "loss": 0.2125, + "num_input_tokens_seen": 41090128, + "step": 26900 + }, + { + "epoch": 83.04018547140649, + "grad_norm": 0.6632575392723083, + "learning_rate": 1.2098624535651164e-05, + "loss": 0.2452, + "num_input_tokens_seen": 41097744, + "step": 26905 + }, + { + "epoch": 83.05564142194746, + "grad_norm": 0.3833131194114685, + "learning_rate": 1.2090216317299477e-05, + "loss": 0.2404, + "num_input_tokens_seen": 41105392, + "step": 26910 + }, + { + "epoch": 83.07109737248841, + "grad_norm": 0.4913733899593353, + "learning_rate": 1.2081810089798668e-05, + "loss": 0.2943, + "num_input_tokens_seen": 41112304, + "step": 26915 + }, + { + "epoch": 83.08655332302936, + "grad_norm": 0.5926149487495422, + "learning_rate": 1.2073405854445072e-05, + "loss": 0.2172, + "num_input_tokens_seen": 41120048, + "step": 26920 + }, + { + "epoch": 83.10200927357033, + "grad_norm": 0.48697859048843384, + "learning_rate": 1.206500361253474e-05, + "loss": 0.309, + "num_input_tokens_seen": 41127504, + "step": 26925 + }, + { + "epoch": 83.11746522411129, + "grad_norm": 0.4618781805038452, + "learning_rate": 1.2056603365363409e-05, + "loss": 0.2343, + "num_input_tokens_seen": 41134768, + "step": 26930 + }, + { + "epoch": 83.13292117465224, + "grad_norm": 0.44035694003105164, + "learning_rate": 1.2048205114226487e-05, + "loss": 0.2273, + "num_input_tokens_seen": 41142128, + "step": 26935 + }, + { + "epoch": 83.14837712519319, + "grad_norm": 0.5068046450614929, + "learning_rate": 1.2039808860419102e-05, + "loss": 0.235, + "num_input_tokens_seen": 41149744, + "step": 26940 + }, + { + "epoch": 83.16383307573416, + "grad_norm": 0.4400373101234436, + "learning_rate": 1.2031414605236066e-05, + "loss": 0.3435, + "num_input_tokens_seen": 41157008, + "step": 26945 + }, + { + "epoch": 83.17928902627511, + "grad_norm": 0.3828267753124237, + "learning_rate": 1.2023022349971862e-05, + "loss": 0.2175, + "num_input_tokens_seen": 41164656, + "step": 26950 + }, + { + "epoch": 83.19474497681607, + "grad_norm": 0.5168089866638184, + "learning_rate": 1.20146320959207e-05, + "loss": 0.284, + "num_input_tokens_seen": 41172080, + "step": 26955 + }, + { + "epoch": 83.21020092735704, + "grad_norm": 0.5699959993362427, + "learning_rate": 1.2006243844376445e-05, + "loss": 0.1986, + "num_input_tokens_seen": 41179344, + "step": 26960 + }, + { + "epoch": 83.22565687789799, + "grad_norm": 0.47054949402809143, + "learning_rate": 1.1997857596632678e-05, + "loss": 0.2458, + "num_input_tokens_seen": 41186736, + "step": 26965 + }, + { + "epoch": 83.24111282843894, + "grad_norm": 0.6943028569221497, + "learning_rate": 1.1989473353982672e-05, + "loss": 0.2557, + "num_input_tokens_seen": 41194448, + "step": 26970 + }, + { + "epoch": 83.25656877897991, + "grad_norm": 0.5569959878921509, + "learning_rate": 1.198109111771937e-05, + "loss": 0.2568, + "num_input_tokens_seen": 41202384, + "step": 26975 + }, + { + "epoch": 83.27202472952087, + "grad_norm": 0.5483672618865967, + "learning_rate": 1.197271088913543e-05, + "loss": 0.2396, + "num_input_tokens_seen": 41210096, + "step": 26980 + }, + { + "epoch": 83.28748068006182, + "grad_norm": 0.4112490713596344, + "learning_rate": 1.1964332669523182e-05, + "loss": 0.2851, + "num_input_tokens_seen": 41217872, + "step": 26985 + }, + { + "epoch": 83.30293663060279, + "grad_norm": 0.5305684804916382, + "learning_rate": 1.1955956460174645e-05, + "loss": 0.27, + "num_input_tokens_seen": 41226000, + "step": 26990 + }, + { + "epoch": 83.31839258114374, + "grad_norm": 0.3598931133747101, + "learning_rate": 1.1947582262381552e-05, + "loss": 0.2468, + "num_input_tokens_seen": 41233168, + "step": 26995 + }, + { + "epoch": 83.3338485316847, + "grad_norm": 0.3960656523704529, + "learning_rate": 1.1939210077435293e-05, + "loss": 0.2254, + "num_input_tokens_seen": 41240464, + "step": 27000 + }, + { + "epoch": 83.3338485316847, + "eval_loss": 0.3038424849510193, + "eval_runtime": 6.3114, + "eval_samples_per_second": 91.105, + "eval_steps_per_second": 22.816, + "num_input_tokens_seen": 41240464, + "step": 27000 + }, + { + "epoch": 83.34930448222566, + "grad_norm": 0.42435693740844727, + "learning_rate": 1.193083990662697e-05, + "loss": 0.2634, + "num_input_tokens_seen": 41248400, + "step": 27005 + }, + { + "epoch": 83.36476043276662, + "grad_norm": 0.39556893706321716, + "learning_rate": 1.192247175124738e-05, + "loss": 0.2163, + "num_input_tokens_seen": 41256368, + "step": 27010 + }, + { + "epoch": 83.38021638330757, + "grad_norm": 0.5382022857666016, + "learning_rate": 1.191410561258698e-05, + "loss": 0.2358, + "num_input_tokens_seen": 41264112, + "step": 27015 + }, + { + "epoch": 83.39567233384854, + "grad_norm": 0.37443962693214417, + "learning_rate": 1.1905741491935944e-05, + "loss": 0.272, + "num_input_tokens_seen": 41272336, + "step": 27020 + }, + { + "epoch": 83.41112828438949, + "grad_norm": 0.3478115200996399, + "learning_rate": 1.1897379390584129e-05, + "loss": 0.2008, + "num_input_tokens_seen": 41280208, + "step": 27025 + }, + { + "epoch": 83.42658423493044, + "grad_norm": 0.7188317775726318, + "learning_rate": 1.1889019309821062e-05, + "loss": 0.2348, + "num_input_tokens_seen": 41288464, + "step": 27030 + }, + { + "epoch": 83.44204018547141, + "grad_norm": 0.6071372628211975, + "learning_rate": 1.188066125093599e-05, + "loss": 0.2539, + "num_input_tokens_seen": 41296432, + "step": 27035 + }, + { + "epoch": 83.45749613601237, + "grad_norm": 0.42964786291122437, + "learning_rate": 1.1872305215217811e-05, + "loss": 0.2767, + "num_input_tokens_seen": 41304656, + "step": 27040 + }, + { + "epoch": 83.47295208655332, + "grad_norm": 0.6593500971794128, + "learning_rate": 1.186395120395514e-05, + "loss": 0.2532, + "num_input_tokens_seen": 41312272, + "step": 27045 + }, + { + "epoch": 83.48840803709429, + "grad_norm": 0.5347891449928284, + "learning_rate": 1.1855599218436283e-05, + "loss": 0.315, + "num_input_tokens_seen": 41319856, + "step": 27050 + }, + { + "epoch": 83.50386398763524, + "grad_norm": 0.3677418828010559, + "learning_rate": 1.1847249259949209e-05, + "loss": 0.2429, + "num_input_tokens_seen": 41327856, + "step": 27055 + }, + { + "epoch": 83.5193199381762, + "grad_norm": 0.3901311159133911, + "learning_rate": 1.1838901329781574e-05, + "loss": 0.3027, + "num_input_tokens_seen": 41336016, + "step": 27060 + }, + { + "epoch": 83.53477588871715, + "grad_norm": 0.3468918204307556, + "learning_rate": 1.1830555429220758e-05, + "loss": 0.2479, + "num_input_tokens_seen": 41343536, + "step": 27065 + }, + { + "epoch": 83.55023183925812, + "grad_norm": 0.6388643980026245, + "learning_rate": 1.1822211559553784e-05, + "loss": 0.2856, + "num_input_tokens_seen": 41351312, + "step": 27070 + }, + { + "epoch": 83.56568778979907, + "grad_norm": 0.7859109044075012, + "learning_rate": 1.18138697220674e-05, + "loss": 0.2635, + "num_input_tokens_seen": 41358672, + "step": 27075 + }, + { + "epoch": 83.58114374034002, + "grad_norm": 0.4243195652961731, + "learning_rate": 1.1805529918048e-05, + "loss": 0.2243, + "num_input_tokens_seen": 41366096, + "step": 27080 + }, + { + "epoch": 83.59659969088099, + "grad_norm": 0.35936984419822693, + "learning_rate": 1.1797192148781702e-05, + "loss": 0.2165, + "num_input_tokens_seen": 41373744, + "step": 27085 + }, + { + "epoch": 83.61205564142195, + "grad_norm": 1.065285563468933, + "learning_rate": 1.1788856415554297e-05, + "loss": 0.2283, + "num_input_tokens_seen": 41381552, + "step": 27090 + }, + { + "epoch": 83.6275115919629, + "grad_norm": 0.529249906539917, + "learning_rate": 1.1780522719651249e-05, + "loss": 0.3172, + "num_input_tokens_seen": 41389232, + "step": 27095 + }, + { + "epoch": 83.64296754250387, + "grad_norm": 0.6594669818878174, + "learning_rate": 1.1772191062357721e-05, + "loss": 0.229, + "num_input_tokens_seen": 41396848, + "step": 27100 + }, + { + "epoch": 83.65842349304482, + "grad_norm": 0.33327361941337585, + "learning_rate": 1.1763861444958573e-05, + "loss": 0.2119, + "num_input_tokens_seen": 41404144, + "step": 27105 + }, + { + "epoch": 83.67387944358578, + "grad_norm": 0.7111627459526062, + "learning_rate": 1.1755533868738317e-05, + "loss": 0.2626, + "num_input_tokens_seen": 41411568, + "step": 27110 + }, + { + "epoch": 83.68933539412674, + "grad_norm": 0.5822504758834839, + "learning_rate": 1.1747208334981185e-05, + "loss": 0.3022, + "num_input_tokens_seen": 41419216, + "step": 27115 + }, + { + "epoch": 83.7047913446677, + "grad_norm": 0.3643776476383209, + "learning_rate": 1.1738884844971067e-05, + "loss": 0.2447, + "num_input_tokens_seen": 41426864, + "step": 27120 + }, + { + "epoch": 83.72024729520865, + "grad_norm": 0.4904348850250244, + "learning_rate": 1.1730563399991563e-05, + "loss": 0.2647, + "num_input_tokens_seen": 41434800, + "step": 27125 + }, + { + "epoch": 83.73570324574962, + "grad_norm": 0.5704101920127869, + "learning_rate": 1.1722244001325938e-05, + "loss": 0.2119, + "num_input_tokens_seen": 41442384, + "step": 27130 + }, + { + "epoch": 83.75115919629057, + "grad_norm": 0.8551609516143799, + "learning_rate": 1.1713926650257137e-05, + "loss": 0.2374, + "num_input_tokens_seen": 41449552, + "step": 27135 + }, + { + "epoch": 83.76661514683153, + "grad_norm": 0.3350535035133362, + "learning_rate": 1.170561134806781e-05, + "loss": 0.2691, + "num_input_tokens_seen": 41457616, + "step": 27140 + }, + { + "epoch": 83.7820710973725, + "grad_norm": 0.42882558703422546, + "learning_rate": 1.1697298096040287e-05, + "loss": 0.2739, + "num_input_tokens_seen": 41465392, + "step": 27145 + }, + { + "epoch": 83.79752704791345, + "grad_norm": 0.7726675868034363, + "learning_rate": 1.1688986895456567e-05, + "loss": 0.2102, + "num_input_tokens_seen": 41473200, + "step": 27150 + }, + { + "epoch": 83.8129829984544, + "grad_norm": 0.5224388241767883, + "learning_rate": 1.1680677747598349e-05, + "loss": 0.2399, + "num_input_tokens_seen": 41480432, + "step": 27155 + }, + { + "epoch": 83.82843894899537, + "grad_norm": 0.6872227787971497, + "learning_rate": 1.1672370653746995e-05, + "loss": 0.2218, + "num_input_tokens_seen": 41488016, + "step": 27160 + }, + { + "epoch": 83.84389489953632, + "grad_norm": 0.37522533535957336, + "learning_rate": 1.166406561518357e-05, + "loss": 0.2425, + "num_input_tokens_seen": 41495568, + "step": 27165 + }, + { + "epoch": 83.85935085007728, + "grad_norm": 0.45772862434387207, + "learning_rate": 1.1655762633188826e-05, + "loss": 0.2356, + "num_input_tokens_seen": 41503312, + "step": 27170 + }, + { + "epoch": 83.87480680061825, + "grad_norm": 0.3477891981601715, + "learning_rate": 1.1647461709043172e-05, + "loss": 0.3214, + "num_input_tokens_seen": 41510832, + "step": 27175 + }, + { + "epoch": 83.8902627511592, + "grad_norm": 0.49020814895629883, + "learning_rate": 1.1639162844026722e-05, + "loss": 0.2344, + "num_input_tokens_seen": 41518640, + "step": 27180 + }, + { + "epoch": 83.90571870170015, + "grad_norm": 0.41944804787635803, + "learning_rate": 1.163086603941927e-05, + "loss": 0.2003, + "num_input_tokens_seen": 41526544, + "step": 27185 + }, + { + "epoch": 83.9211746522411, + "grad_norm": 0.5738982558250427, + "learning_rate": 1.1622571296500273e-05, + "loss": 0.2447, + "num_input_tokens_seen": 41535024, + "step": 27190 + }, + { + "epoch": 83.93663060278207, + "grad_norm": 0.5756977796554565, + "learning_rate": 1.1614278616548904e-05, + "loss": 0.2397, + "num_input_tokens_seen": 41542896, + "step": 27195 + }, + { + "epoch": 83.95208655332303, + "grad_norm": 0.6567556858062744, + "learning_rate": 1.1605988000843986e-05, + "loss": 0.2325, + "num_input_tokens_seen": 41550128, + "step": 27200 + }, + { + "epoch": 83.95208655332303, + "eval_loss": 0.3017323315143585, + "eval_runtime": 6.3031, + "eval_samples_per_second": 91.224, + "eval_steps_per_second": 22.846, + "num_input_tokens_seen": 41550128, + "step": 27200 + }, + { + "epoch": 83.96754250386398, + "grad_norm": 0.5938810706138611, + "learning_rate": 1.1597699450664028e-05, + "loss": 0.2939, + "num_input_tokens_seen": 41557744, + "step": 27205 + }, + { + "epoch": 83.98299845440495, + "grad_norm": 0.37854939699172974, + "learning_rate": 1.1589412967287252e-05, + "loss": 0.2382, + "num_input_tokens_seen": 41565360, + "step": 27210 + }, + { + "epoch": 83.9984544049459, + "grad_norm": 0.6129202246665955, + "learning_rate": 1.1581128551991514e-05, + "loss": 0.2268, + "num_input_tokens_seen": 41573328, + "step": 27215 + }, + { + "epoch": 84.01236476043276, + "grad_norm": 0.44392722845077515, + "learning_rate": 1.1572846206054383e-05, + "loss": 0.2337, + "num_input_tokens_seen": 41579952, + "step": 27220 + }, + { + "epoch": 84.02782071097373, + "grad_norm": 0.4329356849193573, + "learning_rate": 1.1564565930753113e-05, + "loss": 0.2329, + "num_input_tokens_seen": 41587856, + "step": 27225 + }, + { + "epoch": 84.04327666151468, + "grad_norm": 0.4175267219543457, + "learning_rate": 1.1556287727364606e-05, + "loss": 0.2341, + "num_input_tokens_seen": 41595120, + "step": 27230 + }, + { + "epoch": 84.05873261205564, + "grad_norm": 0.6889358162879944, + "learning_rate": 1.1548011597165489e-05, + "loss": 0.2506, + "num_input_tokens_seen": 41602960, + "step": 27235 + }, + { + "epoch": 84.0741885625966, + "grad_norm": 0.7737910747528076, + "learning_rate": 1.1539737541432019e-05, + "loss": 0.239, + "num_input_tokens_seen": 41610192, + "step": 27240 + }, + { + "epoch": 84.08964451313756, + "grad_norm": 0.5885764360427856, + "learning_rate": 1.1531465561440174e-05, + "loss": 0.2019, + "num_input_tokens_seen": 41617232, + "step": 27245 + }, + { + "epoch": 84.10510046367851, + "grad_norm": 0.30073511600494385, + "learning_rate": 1.1523195658465605e-05, + "loss": 0.2419, + "num_input_tokens_seen": 41625296, + "step": 27250 + }, + { + "epoch": 84.12055641421948, + "grad_norm": 0.6527672410011292, + "learning_rate": 1.1514927833783618e-05, + "loss": 0.221, + "num_input_tokens_seen": 41632784, + "step": 27255 + }, + { + "epoch": 84.13601236476043, + "grad_norm": 0.4333227574825287, + "learning_rate": 1.150666208866922e-05, + "loss": 0.2818, + "num_input_tokens_seen": 41640016, + "step": 27260 + }, + { + "epoch": 84.15146831530139, + "grad_norm": 0.2763580083847046, + "learning_rate": 1.1498398424397106e-05, + "loss": 0.2219, + "num_input_tokens_seen": 41647696, + "step": 27265 + }, + { + "epoch": 84.16692426584235, + "grad_norm": 0.6108129024505615, + "learning_rate": 1.1490136842241628e-05, + "loss": 0.2071, + "num_input_tokens_seen": 41655536, + "step": 27270 + }, + { + "epoch": 84.18238021638331, + "grad_norm": 0.47499191761016846, + "learning_rate": 1.1481877343476813e-05, + "loss": 0.245, + "num_input_tokens_seen": 41662896, + "step": 27275 + }, + { + "epoch": 84.19783616692426, + "grad_norm": 0.6363943815231323, + "learning_rate": 1.14736199293764e-05, + "loss": 0.2732, + "num_input_tokens_seen": 41671216, + "step": 27280 + }, + { + "epoch": 84.21329211746523, + "grad_norm": 0.7473030090332031, + "learning_rate": 1.1465364601213771e-05, + "loss": 0.2669, + "num_input_tokens_seen": 41679504, + "step": 27285 + }, + { + "epoch": 84.22874806800618, + "grad_norm": 0.8088611364364624, + "learning_rate": 1.1457111360262012e-05, + "loss": 0.2845, + "num_input_tokens_seen": 41687440, + "step": 27290 + }, + { + "epoch": 84.24420401854714, + "grad_norm": 0.6362254619598389, + "learning_rate": 1.1448860207793869e-05, + "loss": 0.2808, + "num_input_tokens_seen": 41694672, + "step": 27295 + }, + { + "epoch": 84.2596599690881, + "grad_norm": 0.4947842061519623, + "learning_rate": 1.144061114508177e-05, + "loss": 0.2318, + "num_input_tokens_seen": 41702288, + "step": 27300 + }, + { + "epoch": 84.27511591962906, + "grad_norm": 0.357893705368042, + "learning_rate": 1.1432364173397842e-05, + "loss": 0.1995, + "num_input_tokens_seen": 41709936, + "step": 27305 + }, + { + "epoch": 84.29057187017001, + "grad_norm": 0.6320204734802246, + "learning_rate": 1.1424119294013852e-05, + "loss": 0.2555, + "num_input_tokens_seen": 41717680, + "step": 27310 + }, + { + "epoch": 84.30602782071098, + "grad_norm": 0.44789862632751465, + "learning_rate": 1.1415876508201279e-05, + "loss": 0.2876, + "num_input_tokens_seen": 41725840, + "step": 27315 + }, + { + "epoch": 84.32148377125193, + "grad_norm": 0.478190541267395, + "learning_rate": 1.140763581723125e-05, + "loss": 0.288, + "num_input_tokens_seen": 41733168, + "step": 27320 + }, + { + "epoch": 84.33693972179289, + "grad_norm": 0.8320364356040955, + "learning_rate": 1.1399397222374588e-05, + "loss": 0.2415, + "num_input_tokens_seen": 41740592, + "step": 27325 + }, + { + "epoch": 84.35239567233384, + "grad_norm": 0.6162146925926208, + "learning_rate": 1.1391160724901804e-05, + "loss": 0.2074, + "num_input_tokens_seen": 41747632, + "step": 27330 + }, + { + "epoch": 84.36785162287481, + "grad_norm": 0.3742061257362366, + "learning_rate": 1.138292632608304e-05, + "loss": 0.2073, + "num_input_tokens_seen": 41755248, + "step": 27335 + }, + { + "epoch": 84.38330757341576, + "grad_norm": 0.42647671699523926, + "learning_rate": 1.1374694027188174e-05, + "loss": 0.2118, + "num_input_tokens_seen": 41763280, + "step": 27340 + }, + { + "epoch": 84.39876352395672, + "grad_norm": 0.5405092239379883, + "learning_rate": 1.1366463829486711e-05, + "loss": 0.2666, + "num_input_tokens_seen": 41770640, + "step": 27345 + }, + { + "epoch": 84.41421947449768, + "grad_norm": 0.5854699015617371, + "learning_rate": 1.1358235734247849e-05, + "loss": 0.2755, + "num_input_tokens_seen": 41778384, + "step": 27350 + }, + { + "epoch": 84.42967542503864, + "grad_norm": 0.38666659593582153, + "learning_rate": 1.1350009742740478e-05, + "loss": 0.2274, + "num_input_tokens_seen": 41786448, + "step": 27355 + }, + { + "epoch": 84.44513137557959, + "grad_norm": 0.4403802454471588, + "learning_rate": 1.134178585623313e-05, + "loss": 0.2046, + "num_input_tokens_seen": 41794544, + "step": 27360 + }, + { + "epoch": 84.46058732612056, + "grad_norm": 0.8283993005752563, + "learning_rate": 1.1333564075994047e-05, + "loss": 0.255, + "num_input_tokens_seen": 41802128, + "step": 27365 + }, + { + "epoch": 84.47604327666151, + "grad_norm": 1.0906509160995483, + "learning_rate": 1.1325344403291133e-05, + "loss": 0.3532, + "num_input_tokens_seen": 41809360, + "step": 27370 + }, + { + "epoch": 84.49149922720247, + "grad_norm": 0.41251498460769653, + "learning_rate": 1.1317126839391951e-05, + "loss": 0.2675, + "num_input_tokens_seen": 41816400, + "step": 27375 + }, + { + "epoch": 84.50695517774344, + "grad_norm": 0.8096650838851929, + "learning_rate": 1.1308911385563766e-05, + "loss": 0.2489, + "num_input_tokens_seen": 41823824, + "step": 27380 + }, + { + "epoch": 84.52241112828439, + "grad_norm": 0.4415666460990906, + "learning_rate": 1.1300698043073494e-05, + "loss": 0.2248, + "num_input_tokens_seen": 41831760, + "step": 27385 + }, + { + "epoch": 84.53786707882534, + "grad_norm": 0.957149863243103, + "learning_rate": 1.1292486813187736e-05, + "loss": 0.2029, + "num_input_tokens_seen": 41839600, + "step": 27390 + }, + { + "epoch": 84.55332302936631, + "grad_norm": 0.3959501385688782, + "learning_rate": 1.1284277697172782e-05, + "loss": 0.3062, + "num_input_tokens_seen": 41847152, + "step": 27395 + }, + { + "epoch": 84.56877897990726, + "grad_norm": 0.5997223258018494, + "learning_rate": 1.127607069629456e-05, + "loss": 0.2357, + "num_input_tokens_seen": 41855152, + "step": 27400 + }, + { + "epoch": 84.56877897990726, + "eval_loss": 0.3031654357910156, + "eval_runtime": 6.3256, + "eval_samples_per_second": 90.9, + "eval_steps_per_second": 22.765, + "num_input_tokens_seen": 41855152, + "step": 27400 + }, + { + "epoch": 84.58423493044822, + "grad_norm": 0.3140384256839752, + "learning_rate": 1.1267865811818701e-05, + "loss": 0.2295, + "num_input_tokens_seen": 41862192, + "step": 27405 + }, + { + "epoch": 84.59969088098919, + "grad_norm": 0.46424970030784607, + "learning_rate": 1.1259663045010513e-05, + "loss": 0.255, + "num_input_tokens_seen": 41870256, + "step": 27410 + }, + { + "epoch": 84.61514683153014, + "grad_norm": 0.3573237359523773, + "learning_rate": 1.1251462397134957e-05, + "loss": 0.2114, + "num_input_tokens_seen": 41878256, + "step": 27415 + }, + { + "epoch": 84.6306027820711, + "grad_norm": 0.517570436000824, + "learning_rate": 1.1243263869456664e-05, + "loss": 0.2921, + "num_input_tokens_seen": 41885200, + "step": 27420 + }, + { + "epoch": 84.64605873261206, + "grad_norm": 0.43602246046066284, + "learning_rate": 1.1235067463239967e-05, + "loss": 0.2677, + "num_input_tokens_seen": 41892688, + "step": 27425 + }, + { + "epoch": 84.66151468315302, + "grad_norm": 0.5144816040992737, + "learning_rate": 1.122687317974884e-05, + "loss": 0.2466, + "num_input_tokens_seen": 41900528, + "step": 27430 + }, + { + "epoch": 84.67697063369397, + "grad_norm": 0.48329290747642517, + "learning_rate": 1.1218681020246963e-05, + "loss": 0.2464, + "num_input_tokens_seen": 41907888, + "step": 27435 + }, + { + "epoch": 84.69242658423494, + "grad_norm": 0.5438981652259827, + "learning_rate": 1.1210490985997652e-05, + "loss": 0.2348, + "num_input_tokens_seen": 41915600, + "step": 27440 + }, + { + "epoch": 84.70788253477589, + "grad_norm": 0.5563170313835144, + "learning_rate": 1.1202303078263917e-05, + "loss": 0.2232, + "num_input_tokens_seen": 41923152, + "step": 27445 + }, + { + "epoch": 84.72333848531684, + "grad_norm": 0.4815210998058319, + "learning_rate": 1.1194117298308451e-05, + "loss": 0.2716, + "num_input_tokens_seen": 41931216, + "step": 27450 + }, + { + "epoch": 84.7387944358578, + "grad_norm": 0.49395087361335754, + "learning_rate": 1.1185933647393585e-05, + "loss": 0.2334, + "num_input_tokens_seen": 41938992, + "step": 27455 + }, + { + "epoch": 84.75425038639877, + "grad_norm": 0.4915381968021393, + "learning_rate": 1.1177752126781354e-05, + "loss": 0.2312, + "num_input_tokens_seen": 41946384, + "step": 27460 + }, + { + "epoch": 84.76970633693972, + "grad_norm": 0.34147730469703674, + "learning_rate": 1.1169572737733441e-05, + "loss": 0.2362, + "num_input_tokens_seen": 41953936, + "step": 27465 + }, + { + "epoch": 84.78516228748067, + "grad_norm": 0.37894025444984436, + "learning_rate": 1.1161395481511216e-05, + "loss": 0.2818, + "num_input_tokens_seen": 41961200, + "step": 27470 + }, + { + "epoch": 84.80061823802164, + "grad_norm": 0.6384143233299255, + "learning_rate": 1.1153220359375722e-05, + "loss": 0.2629, + "num_input_tokens_seen": 41968976, + "step": 27475 + }, + { + "epoch": 84.8160741885626, + "grad_norm": 0.43266433477401733, + "learning_rate": 1.114504737258765e-05, + "loss": 0.2258, + "num_input_tokens_seen": 41976560, + "step": 27480 + }, + { + "epoch": 84.83153013910355, + "grad_norm": 0.6426129341125488, + "learning_rate": 1.1136876522407393e-05, + "loss": 0.2629, + "num_input_tokens_seen": 41984432, + "step": 27485 + }, + { + "epoch": 84.84698608964452, + "grad_norm": 0.4656246602535248, + "learning_rate": 1.1128707810094985e-05, + "loss": 0.2358, + "num_input_tokens_seen": 41991792, + "step": 27490 + }, + { + "epoch": 84.86244204018547, + "grad_norm": 0.8200754523277283, + "learning_rate": 1.1120541236910157e-05, + "loss": 0.2553, + "num_input_tokens_seen": 41999344, + "step": 27495 + }, + { + "epoch": 84.87789799072642, + "grad_norm": 0.43771737813949585, + "learning_rate": 1.111237680411229e-05, + "loss": 0.2371, + "num_input_tokens_seen": 42006672, + "step": 27500 + }, + { + "epoch": 84.89335394126739, + "grad_norm": 0.3922138810157776, + "learning_rate": 1.1104214512960433e-05, + "loss": 0.2795, + "num_input_tokens_seen": 42014128, + "step": 27505 + }, + { + "epoch": 84.90880989180835, + "grad_norm": 0.4448891580104828, + "learning_rate": 1.1096054364713327e-05, + "loss": 0.2751, + "num_input_tokens_seen": 42021552, + "step": 27510 + }, + { + "epoch": 84.9242658423493, + "grad_norm": 0.39800921082496643, + "learning_rate": 1.1087896360629371e-05, + "loss": 0.2957, + "num_input_tokens_seen": 42029712, + "step": 27515 + }, + { + "epoch": 84.93972179289027, + "grad_norm": 0.4231564700603485, + "learning_rate": 1.107974050196662e-05, + "loss": 0.3081, + "num_input_tokens_seen": 42037584, + "step": 27520 + }, + { + "epoch": 84.95517774343122, + "grad_norm": 0.3513517677783966, + "learning_rate": 1.1071586789982816e-05, + "loss": 0.2266, + "num_input_tokens_seen": 42044976, + "step": 27525 + }, + { + "epoch": 84.97063369397218, + "grad_norm": 0.5644052624702454, + "learning_rate": 1.1063435225935373e-05, + "loss": 0.2804, + "num_input_tokens_seen": 42053232, + "step": 27530 + }, + { + "epoch": 84.98608964451314, + "grad_norm": 0.4391343593597412, + "learning_rate": 1.1055285811081348e-05, + "loss": 0.2114, + "num_input_tokens_seen": 42060656, + "step": 27535 + }, + { + "epoch": 85.0, + "grad_norm": 1.270263671875, + "learning_rate": 1.1047138546677499e-05, + "loss": 0.1942, + "num_input_tokens_seen": 42067008, + "step": 27540 + }, + { + "epoch": 85.01545595054095, + "grad_norm": 0.4588533043861389, + "learning_rate": 1.1038993433980219e-05, + "loss": 0.2568, + "num_input_tokens_seen": 42074528, + "step": 27545 + }, + { + "epoch": 85.03091190108192, + "grad_norm": 0.5338744521141052, + "learning_rate": 1.1030850474245597e-05, + "loss": 0.2799, + "num_input_tokens_seen": 42082336, + "step": 27550 + }, + { + "epoch": 85.04636785162288, + "grad_norm": 0.7365648746490479, + "learning_rate": 1.102270966872939e-05, + "loss": 0.2552, + "num_input_tokens_seen": 42089888, + "step": 27555 + }, + { + "epoch": 85.06182380216383, + "grad_norm": 0.5008007287979126, + "learning_rate": 1.1014571018687e-05, + "loss": 0.1989, + "num_input_tokens_seen": 42097696, + "step": 27560 + }, + { + "epoch": 85.0772797527048, + "grad_norm": 0.8637479543685913, + "learning_rate": 1.1006434525373502e-05, + "loss": 0.2591, + "num_input_tokens_seen": 42105600, + "step": 27565 + }, + { + "epoch": 85.09273570324575, + "grad_norm": 0.6900137066841125, + "learning_rate": 1.0998300190043664e-05, + "loss": 0.2414, + "num_input_tokens_seen": 42112960, + "step": 27570 + }, + { + "epoch": 85.1081916537867, + "grad_norm": 0.4553575813770294, + "learning_rate": 1.0990168013951882e-05, + "loss": 0.3102, + "num_input_tokens_seen": 42120352, + "step": 27575 + }, + { + "epoch": 85.12364760432767, + "grad_norm": 0.46201327443122864, + "learning_rate": 1.0982037998352263e-05, + "loss": 0.2086, + "num_input_tokens_seen": 42127680, + "step": 27580 + }, + { + "epoch": 85.13910355486863, + "grad_norm": 0.40259498357772827, + "learning_rate": 1.0973910144498534e-05, + "loss": 0.2669, + "num_input_tokens_seen": 42135680, + "step": 27585 + }, + { + "epoch": 85.15455950540958, + "grad_norm": 0.4756637513637543, + "learning_rate": 1.0965784453644123e-05, + "loss": 0.2292, + "num_input_tokens_seen": 42143616, + "step": 27590 + }, + { + "epoch": 85.17001545595055, + "grad_norm": 0.5743431448936462, + "learning_rate": 1.0957660927042127e-05, + "loss": 0.2389, + "num_input_tokens_seen": 42151040, + "step": 27595 + }, + { + "epoch": 85.1854714064915, + "grad_norm": 0.4031464755535126, + "learning_rate": 1.094953956594527e-05, + "loss": 0.2491, + "num_input_tokens_seen": 42158912, + "step": 27600 + }, + { + "epoch": 85.1854714064915, + "eval_loss": 0.30252891778945923, + "eval_runtime": 6.2656, + "eval_samples_per_second": 91.771, + "eval_steps_per_second": 22.983, + "num_input_tokens_seen": 42158912, + "step": 27600 + }, + { + "epoch": 85.20092735703246, + "grad_norm": 0.44674021005630493, + "learning_rate": 1.0941420371605981e-05, + "loss": 0.2398, + "num_input_tokens_seen": 42166368, + "step": 27605 + }, + { + "epoch": 85.21638330757341, + "grad_norm": 0.4706546366214752, + "learning_rate": 1.0933303345276354e-05, + "loss": 0.2301, + "num_input_tokens_seen": 42174240, + "step": 27610 + }, + { + "epoch": 85.23183925811438, + "grad_norm": 0.4223633408546448, + "learning_rate": 1.0925188488208112e-05, + "loss": 0.2478, + "num_input_tokens_seen": 42181536, + "step": 27615 + }, + { + "epoch": 85.24729520865533, + "grad_norm": 0.5232880115509033, + "learning_rate": 1.0917075801652694e-05, + "loss": 0.2004, + "num_input_tokens_seen": 42189152, + "step": 27620 + }, + { + "epoch": 85.26275115919628, + "grad_norm": 0.37008756399154663, + "learning_rate": 1.0908965286861151e-05, + "loss": 0.1856, + "num_input_tokens_seen": 42197056, + "step": 27625 + }, + { + "epoch": 85.27820710973725, + "grad_norm": 0.5117804408073425, + "learning_rate": 1.090085694508425e-05, + "loss": 0.2678, + "num_input_tokens_seen": 42204512, + "step": 27630 + }, + { + "epoch": 85.2936630602782, + "grad_norm": 0.29567277431488037, + "learning_rate": 1.089275077757238e-05, + "loss": 0.2186, + "num_input_tokens_seen": 42211904, + "step": 27635 + }, + { + "epoch": 85.30911901081916, + "grad_norm": 0.40613919496536255, + "learning_rate": 1.0884646785575633e-05, + "loss": 0.2644, + "num_input_tokens_seen": 42219168, + "step": 27640 + }, + { + "epoch": 85.32457496136013, + "grad_norm": 0.401033490896225, + "learning_rate": 1.0876544970343728e-05, + "loss": 0.2878, + "num_input_tokens_seen": 42226784, + "step": 27645 + }, + { + "epoch": 85.34003091190108, + "grad_norm": 0.6549006700515747, + "learning_rate": 1.0868445333126082e-05, + "loss": 0.2605, + "num_input_tokens_seen": 42234176, + "step": 27650 + }, + { + "epoch": 85.35548686244204, + "grad_norm": 0.5507638454437256, + "learning_rate": 1.0860347875171745e-05, + "loss": 0.2562, + "num_input_tokens_seen": 42241728, + "step": 27655 + }, + { + "epoch": 85.370942812983, + "grad_norm": 0.4826849102973938, + "learning_rate": 1.0852252597729465e-05, + "loss": 0.2233, + "num_input_tokens_seen": 42248992, + "step": 27660 + }, + { + "epoch": 85.38639876352396, + "grad_norm": 1.1409260034561157, + "learning_rate": 1.0844159502047615e-05, + "loss": 0.3011, + "num_input_tokens_seen": 42256832, + "step": 27665 + }, + { + "epoch": 85.40185471406491, + "grad_norm": 0.6845898032188416, + "learning_rate": 1.0836068589374265e-05, + "loss": 0.2555, + "num_input_tokens_seen": 42264768, + "step": 27670 + }, + { + "epoch": 85.41731066460588, + "grad_norm": 1.0311365127563477, + "learning_rate": 1.0827979860957144e-05, + "loss": 0.2905, + "num_input_tokens_seen": 42272480, + "step": 27675 + }, + { + "epoch": 85.43276661514683, + "grad_norm": 0.6485891342163086, + "learning_rate": 1.0819893318043615e-05, + "loss": 0.2647, + "num_input_tokens_seen": 42279904, + "step": 27680 + }, + { + "epoch": 85.44822256568779, + "grad_norm": 0.3662322163581848, + "learning_rate": 1.0811808961880734e-05, + "loss": 0.2765, + "num_input_tokens_seen": 42287424, + "step": 27685 + }, + { + "epoch": 85.46367851622875, + "grad_norm": 0.3794974684715271, + "learning_rate": 1.080372679371522e-05, + "loss": 0.2389, + "num_input_tokens_seen": 42294944, + "step": 27690 + }, + { + "epoch": 85.47913446676971, + "grad_norm": 0.614750325679779, + "learning_rate": 1.0795646814793428e-05, + "loss": 0.3295, + "num_input_tokens_seen": 42301952, + "step": 27695 + }, + { + "epoch": 85.49459041731066, + "grad_norm": 0.3437339663505554, + "learning_rate": 1.078756902636141e-05, + "loss": 0.228, + "num_input_tokens_seen": 42309504, + "step": 27700 + }, + { + "epoch": 85.51004636785163, + "grad_norm": 0.4003326892852783, + "learning_rate": 1.077949342966485e-05, + "loss": 0.2212, + "num_input_tokens_seen": 42317152, + "step": 27705 + }, + { + "epoch": 85.52550231839258, + "grad_norm": 0.5934181809425354, + "learning_rate": 1.0771420025949103e-05, + "loss": 0.2812, + "num_input_tokens_seen": 42324864, + "step": 27710 + }, + { + "epoch": 85.54095826893354, + "grad_norm": 0.4944312870502472, + "learning_rate": 1.0763348816459204e-05, + "loss": 0.2336, + "num_input_tokens_seen": 42332544, + "step": 27715 + }, + { + "epoch": 85.5564142194745, + "grad_norm": 0.41583186388015747, + "learning_rate": 1.0755279802439816e-05, + "loss": 0.196, + "num_input_tokens_seen": 42339872, + "step": 27720 + }, + { + "epoch": 85.57187017001546, + "grad_norm": 0.5403656363487244, + "learning_rate": 1.0747212985135293e-05, + "loss": 0.2155, + "num_input_tokens_seen": 42347424, + "step": 27725 + }, + { + "epoch": 85.58732612055641, + "grad_norm": 0.38549456000328064, + "learning_rate": 1.073914836578965e-05, + "loss": 0.1837, + "num_input_tokens_seen": 42354912, + "step": 27730 + }, + { + "epoch": 85.60278207109737, + "grad_norm": 0.5940226316452026, + "learning_rate": 1.0731085945646529e-05, + "loss": 0.2193, + "num_input_tokens_seen": 42362688, + "step": 27735 + }, + { + "epoch": 85.61823802163833, + "grad_norm": 0.47855743765830994, + "learning_rate": 1.0723025725949285e-05, + "loss": 0.2421, + "num_input_tokens_seen": 42370464, + "step": 27740 + }, + { + "epoch": 85.63369397217929, + "grad_norm": 0.4468066692352295, + "learning_rate": 1.0714967707940875e-05, + "loss": 0.239, + "num_input_tokens_seen": 42378464, + "step": 27745 + }, + { + "epoch": 85.64914992272024, + "grad_norm": 0.6119871735572815, + "learning_rate": 1.0706911892863963e-05, + "loss": 0.2287, + "num_input_tokens_seen": 42385696, + "step": 27750 + }, + { + "epoch": 85.66460587326121, + "grad_norm": 0.7819129228591919, + "learning_rate": 1.0698858281960866e-05, + "loss": 0.2505, + "num_input_tokens_seen": 42392928, + "step": 27755 + }, + { + "epoch": 85.68006182380216, + "grad_norm": 0.46078062057495117, + "learning_rate": 1.069080687647353e-05, + "loss": 0.3214, + "num_input_tokens_seen": 42400000, + "step": 27760 + }, + { + "epoch": 85.69551777434312, + "grad_norm": 0.46951863169670105, + "learning_rate": 1.0682757677643596e-05, + "loss": 0.2483, + "num_input_tokens_seen": 42407872, + "step": 27765 + }, + { + "epoch": 85.71097372488408, + "grad_norm": 0.551030158996582, + "learning_rate": 1.0674710686712359e-05, + "loss": 0.2515, + "num_input_tokens_seen": 42415712, + "step": 27770 + }, + { + "epoch": 85.72642967542504, + "grad_norm": 0.5356042981147766, + "learning_rate": 1.0666665904920756e-05, + "loss": 0.2762, + "num_input_tokens_seen": 42423168, + "step": 27775 + }, + { + "epoch": 85.74188562596599, + "grad_norm": 0.492279589176178, + "learning_rate": 1.0658623333509385e-05, + "loss": 0.2138, + "num_input_tokens_seen": 42431264, + "step": 27780 + }, + { + "epoch": 85.75734157650696, + "grad_norm": 0.40615421533584595, + "learning_rate": 1.0650582973718532e-05, + "loss": 0.2253, + "num_input_tokens_seen": 42439008, + "step": 27785 + }, + { + "epoch": 85.77279752704791, + "grad_norm": 0.6723787784576416, + "learning_rate": 1.0642544826788098e-05, + "loss": 0.2753, + "num_input_tokens_seen": 42446784, + "step": 27790 + }, + { + "epoch": 85.78825347758887, + "grad_norm": 0.7758018970489502, + "learning_rate": 1.063450889395769e-05, + "loss": 0.2172, + "num_input_tokens_seen": 42453888, + "step": 27795 + }, + { + "epoch": 85.80370942812984, + "grad_norm": 0.6367888450622559, + "learning_rate": 1.062647517646653e-05, + "loss": 0.2423, + "num_input_tokens_seen": 42461856, + "step": 27800 + }, + { + "epoch": 85.80370942812984, + "eval_loss": 0.30351221561431885, + "eval_runtime": 6.3123, + "eval_samples_per_second": 91.092, + "eval_steps_per_second": 22.813, + "num_input_tokens_seen": 42461856, + "step": 27800 + }, + { + "epoch": 85.81916537867079, + "grad_norm": 0.43842053413391113, + "learning_rate": 1.0618443675553527e-05, + "loss": 0.2399, + "num_input_tokens_seen": 42469568, + "step": 27805 + }, + { + "epoch": 85.83462132921174, + "grad_norm": 0.5181204676628113, + "learning_rate": 1.0610414392457247e-05, + "loss": 0.2396, + "num_input_tokens_seen": 42477344, + "step": 27810 + }, + { + "epoch": 85.85007727975271, + "grad_norm": 0.37861862778663635, + "learning_rate": 1.0602387328415888e-05, + "loss": 0.2866, + "num_input_tokens_seen": 42484576, + "step": 27815 + }, + { + "epoch": 85.86553323029366, + "grad_norm": 0.651793360710144, + "learning_rate": 1.0594362484667347e-05, + "loss": 0.273, + "num_input_tokens_seen": 42492832, + "step": 27820 + }, + { + "epoch": 85.88098918083462, + "grad_norm": 0.7174116969108582, + "learning_rate": 1.0586339862449132e-05, + "loss": 0.2177, + "num_input_tokens_seen": 42500992, + "step": 27825 + }, + { + "epoch": 85.89644513137559, + "grad_norm": 0.634909987449646, + "learning_rate": 1.0578319462998445e-05, + "loss": 0.2066, + "num_input_tokens_seen": 42508640, + "step": 27830 + }, + { + "epoch": 85.91190108191654, + "grad_norm": 0.5243695378303528, + "learning_rate": 1.057030128755214e-05, + "loss": 0.2561, + "num_input_tokens_seen": 42516416, + "step": 27835 + }, + { + "epoch": 85.9273570324575, + "grad_norm": 0.6794880628585815, + "learning_rate": 1.0562285337346703e-05, + "loss": 0.2335, + "num_input_tokens_seen": 42524320, + "step": 27840 + }, + { + "epoch": 85.94281298299846, + "grad_norm": 0.6116589307785034, + "learning_rate": 1.0554271613618308e-05, + "loss": 0.2419, + "num_input_tokens_seen": 42532480, + "step": 27845 + }, + { + "epoch": 85.95826893353942, + "grad_norm": 0.30625638365745544, + "learning_rate": 1.054626011760276e-05, + "loss": 0.2247, + "num_input_tokens_seen": 42540000, + "step": 27850 + }, + { + "epoch": 85.97372488408037, + "grad_norm": 0.7637845277786255, + "learning_rate": 1.0538250850535549e-05, + "loss": 0.2629, + "num_input_tokens_seen": 42547296, + "step": 27855 + }, + { + "epoch": 85.98918083462132, + "grad_norm": 0.7035704851150513, + "learning_rate": 1.0530243813651794e-05, + "loss": 0.3019, + "num_input_tokens_seen": 42555232, + "step": 27860 + }, + { + "epoch": 86.0030911901082, + "grad_norm": 0.8161876797676086, + "learning_rate": 1.0522239008186271e-05, + "loss": 0.2753, + "num_input_tokens_seen": 42562720, + "step": 27865 + }, + { + "epoch": 86.01854714064915, + "grad_norm": 0.41419684886932373, + "learning_rate": 1.0514236435373434e-05, + "loss": 0.25, + "num_input_tokens_seen": 42570208, + "step": 27870 + }, + { + "epoch": 86.03400309119012, + "grad_norm": 0.5533746480941772, + "learning_rate": 1.0506236096447386e-05, + "loss": 0.2576, + "num_input_tokens_seen": 42577856, + "step": 27875 + }, + { + "epoch": 86.04945904173107, + "grad_norm": 0.4425630569458008, + "learning_rate": 1.049823799264186e-05, + "loss": 0.2363, + "num_input_tokens_seen": 42585408, + "step": 27880 + }, + { + "epoch": 86.06491499227202, + "grad_norm": 0.8399210572242737, + "learning_rate": 1.049024212519028e-05, + "loss": 0.2352, + "num_input_tokens_seen": 42592640, + "step": 27885 + }, + { + "epoch": 86.08037094281298, + "grad_norm": 0.6193274259567261, + "learning_rate": 1.0482248495325713e-05, + "loss": 0.2135, + "num_input_tokens_seen": 42600608, + "step": 27890 + }, + { + "epoch": 86.09582689335394, + "grad_norm": 0.6591106653213501, + "learning_rate": 1.047425710428086e-05, + "loss": 0.2288, + "num_input_tokens_seen": 42608000, + "step": 27895 + }, + { + "epoch": 86.1112828438949, + "grad_norm": 0.4908786416053772, + "learning_rate": 1.0466267953288114e-05, + "loss": 0.2218, + "num_input_tokens_seen": 42615552, + "step": 27900 + }, + { + "epoch": 86.12673879443585, + "grad_norm": 0.7498488426208496, + "learning_rate": 1.0458281043579482e-05, + "loss": 0.1972, + "num_input_tokens_seen": 42623040, + "step": 27905 + }, + { + "epoch": 86.14219474497682, + "grad_norm": 0.3910141587257385, + "learning_rate": 1.0450296376386657e-05, + "loss": 0.2474, + "num_input_tokens_seen": 42630784, + "step": 27910 + }, + { + "epoch": 86.15765069551777, + "grad_norm": 0.43151578307151794, + "learning_rate": 1.044231395294098e-05, + "loss": 0.233, + "num_input_tokens_seen": 42638496, + "step": 27915 + }, + { + "epoch": 86.17310664605873, + "grad_norm": 0.5677752494812012, + "learning_rate": 1.0434333774473435e-05, + "loss": 0.2385, + "num_input_tokens_seen": 42645888, + "step": 27920 + }, + { + "epoch": 86.1885625965997, + "grad_norm": 0.5322011113166809, + "learning_rate": 1.0426355842214657e-05, + "loss": 0.2336, + "num_input_tokens_seen": 42653536, + "step": 27925 + }, + { + "epoch": 86.20401854714065, + "grad_norm": 0.552646279335022, + "learning_rate": 1.0418380157394963e-05, + "loss": 0.293, + "num_input_tokens_seen": 42660864, + "step": 27930 + }, + { + "epoch": 86.2194744976816, + "grad_norm": 0.8040180802345276, + "learning_rate": 1.0410406721244281e-05, + "loss": 0.3004, + "num_input_tokens_seen": 42668480, + "step": 27935 + }, + { + "epoch": 86.23493044822257, + "grad_norm": 0.8381541967391968, + "learning_rate": 1.0402435534992238e-05, + "loss": 0.2763, + "num_input_tokens_seen": 42676480, + "step": 27940 + }, + { + "epoch": 86.25038639876352, + "grad_norm": 0.8144832849502563, + "learning_rate": 1.0394466599868071e-05, + "loss": 0.3021, + "num_input_tokens_seen": 42684192, + "step": 27945 + }, + { + "epoch": 86.26584234930448, + "grad_norm": 0.6446875333786011, + "learning_rate": 1.0386499917100697e-05, + "loss": 0.2811, + "num_input_tokens_seen": 42691936, + "step": 27950 + }, + { + "epoch": 86.28129829984545, + "grad_norm": 0.32327258586883545, + "learning_rate": 1.0378535487918692e-05, + "loss": 0.2646, + "num_input_tokens_seen": 42699712, + "step": 27955 + }, + { + "epoch": 86.2967542503864, + "grad_norm": 0.49063435196876526, + "learning_rate": 1.037057331355025e-05, + "loss": 0.2076, + "num_input_tokens_seen": 42707648, + "step": 27960 + }, + { + "epoch": 86.31221020092735, + "grad_norm": 0.45978617668151855, + "learning_rate": 1.0362613395223247e-05, + "loss": 0.2499, + "num_input_tokens_seen": 42715328, + "step": 27965 + }, + { + "epoch": 86.32766615146832, + "grad_norm": 0.5816802978515625, + "learning_rate": 1.0354655734165212e-05, + "loss": 0.2192, + "num_input_tokens_seen": 42723872, + "step": 27970 + }, + { + "epoch": 86.34312210200927, + "grad_norm": 0.8846196532249451, + "learning_rate": 1.03467003316033e-05, + "loss": 0.3078, + "num_input_tokens_seen": 42731488, + "step": 27975 + }, + { + "epoch": 86.35857805255023, + "grad_norm": 0.4376341700553894, + "learning_rate": 1.033874718876435e-05, + "loss": 0.2561, + "num_input_tokens_seen": 42739168, + "step": 27980 + }, + { + "epoch": 86.3740340030912, + "grad_norm": 0.36155620217323303, + "learning_rate": 1.0330796306874818e-05, + "loss": 0.2763, + "num_input_tokens_seen": 42746848, + "step": 27985 + }, + { + "epoch": 86.38948995363215, + "grad_norm": 0.46231260895729065, + "learning_rate": 1.032284768716085e-05, + "loss": 0.2341, + "num_input_tokens_seen": 42754208, + "step": 27990 + }, + { + "epoch": 86.4049459041731, + "grad_norm": 0.34237077832221985, + "learning_rate": 1.0314901330848206e-05, + "loss": 0.2401, + "num_input_tokens_seen": 42761856, + "step": 27995 + }, + { + "epoch": 86.42040185471407, + "grad_norm": 0.5624198317527771, + "learning_rate": 1.030695723916233e-05, + "loss": 0.2651, + "num_input_tokens_seen": 42769760, + "step": 28000 + }, + { + "epoch": 86.42040185471407, + "eval_loss": 0.30351731181144714, + "eval_runtime": 6.3212, + "eval_samples_per_second": 90.963, + "eval_steps_per_second": 22.78, + "num_input_tokens_seen": 42769760, + "step": 28000 + }, + { + "epoch": 86.43585780525503, + "grad_norm": 0.4575471878051758, + "learning_rate": 1.0299015413328289e-05, + "loss": 0.2558, + "num_input_tokens_seen": 42777696, + "step": 28005 + }, + { + "epoch": 86.45131375579598, + "grad_norm": 0.6511774063110352, + "learning_rate": 1.0291075854570809e-05, + "loss": 0.2241, + "num_input_tokens_seen": 42785152, + "step": 28010 + }, + { + "epoch": 86.46676970633693, + "grad_norm": 0.5059083104133606, + "learning_rate": 1.0283138564114275e-05, + "loss": 0.225, + "num_input_tokens_seen": 42792416, + "step": 28015 + }, + { + "epoch": 86.4822256568779, + "grad_norm": 0.5235575437545776, + "learning_rate": 1.027520354318273e-05, + "loss": 0.2777, + "num_input_tokens_seen": 42800352, + "step": 28020 + }, + { + "epoch": 86.49768160741885, + "grad_norm": 0.49653810262680054, + "learning_rate": 1.0267270792999828e-05, + "loss": 0.2078, + "num_input_tokens_seen": 42807936, + "step": 28025 + }, + { + "epoch": 86.51313755795981, + "grad_norm": 0.6484488248825073, + "learning_rate": 1.0259340314788919e-05, + "loss": 0.2618, + "num_input_tokens_seen": 42816320, + "step": 28030 + }, + { + "epoch": 86.52859350850078, + "grad_norm": 0.3027002215385437, + "learning_rate": 1.0251412109772979e-05, + "loss": 0.2708, + "num_input_tokens_seen": 42823616, + "step": 28035 + }, + { + "epoch": 86.54404945904173, + "grad_norm": 0.3784297704696655, + "learning_rate": 1.0243486179174627e-05, + "loss": 0.2675, + "num_input_tokens_seen": 42832192, + "step": 28040 + }, + { + "epoch": 86.55950540958268, + "grad_norm": 0.543332040309906, + "learning_rate": 1.0235562524216158e-05, + "loss": 0.2548, + "num_input_tokens_seen": 42840000, + "step": 28045 + }, + { + "epoch": 86.57496136012365, + "grad_norm": 0.4592035412788391, + "learning_rate": 1.022764114611948e-05, + "loss": 0.216, + "num_input_tokens_seen": 42847776, + "step": 28050 + }, + { + "epoch": 86.5904173106646, + "grad_norm": 0.3597687780857086, + "learning_rate": 1.0219722046106178e-05, + "loss": 0.2173, + "num_input_tokens_seen": 42855200, + "step": 28055 + }, + { + "epoch": 86.60587326120556, + "grad_norm": 0.6473430395126343, + "learning_rate": 1.0211805225397486e-05, + "loss": 0.2666, + "num_input_tokens_seen": 42862560, + "step": 28060 + }, + { + "epoch": 86.62132921174653, + "grad_norm": 1.0116230249404907, + "learning_rate": 1.020389068521426e-05, + "loss": 0.317, + "num_input_tokens_seen": 42870016, + "step": 28065 + }, + { + "epoch": 86.63678516228748, + "grad_norm": 0.40845584869384766, + "learning_rate": 1.0195978426777039e-05, + "loss": 0.2935, + "num_input_tokens_seen": 42877536, + "step": 28070 + }, + { + "epoch": 86.65224111282843, + "grad_norm": 0.6636371612548828, + "learning_rate": 1.0188068451305982e-05, + "loss": 0.265, + "num_input_tokens_seen": 42885248, + "step": 28075 + }, + { + "epoch": 86.6676970633694, + "grad_norm": 0.4612802267074585, + "learning_rate": 1.0180160760020902e-05, + "loss": 0.2331, + "num_input_tokens_seen": 42892864, + "step": 28080 + }, + { + "epoch": 86.68315301391036, + "grad_norm": 0.5102769136428833, + "learning_rate": 1.0172255354141278e-05, + "loss": 0.2472, + "num_input_tokens_seen": 42900224, + "step": 28085 + }, + { + "epoch": 86.69860896445131, + "grad_norm": 0.49195486307144165, + "learning_rate": 1.0164352234886205e-05, + "loss": 0.2469, + "num_input_tokens_seen": 42908384, + "step": 28090 + }, + { + "epoch": 86.71406491499228, + "grad_norm": 0.6112313866615295, + "learning_rate": 1.0156451403474454e-05, + "loss": 0.2206, + "num_input_tokens_seen": 42916000, + "step": 28095 + }, + { + "epoch": 86.72952086553323, + "grad_norm": 0.44309139251708984, + "learning_rate": 1.0148552861124443e-05, + "loss": 0.2355, + "num_input_tokens_seen": 42923552, + "step": 28100 + }, + { + "epoch": 86.74497681607419, + "grad_norm": 0.5386139154434204, + "learning_rate": 1.0140656609054205e-05, + "loss": 0.2262, + "num_input_tokens_seen": 42931328, + "step": 28105 + }, + { + "epoch": 86.76043276661515, + "grad_norm": 0.45874977111816406, + "learning_rate": 1.0132762648481455e-05, + "loss": 0.2376, + "num_input_tokens_seen": 42938880, + "step": 28110 + }, + { + "epoch": 86.7758887171561, + "grad_norm": 0.44894981384277344, + "learning_rate": 1.0124870980623543e-05, + "loss": 0.2779, + "num_input_tokens_seen": 42946016, + "step": 28115 + }, + { + "epoch": 86.79134466769706, + "grad_norm": 0.39314916729927063, + "learning_rate": 1.0116981606697453e-05, + "loss": 0.2607, + "num_input_tokens_seen": 42954208, + "step": 28120 + }, + { + "epoch": 86.80680061823801, + "grad_norm": 0.5779237747192383, + "learning_rate": 1.0109094527919838e-05, + "loss": 0.2224, + "num_input_tokens_seen": 42961856, + "step": 28125 + }, + { + "epoch": 86.82225656877898, + "grad_norm": 0.4631180763244629, + "learning_rate": 1.010120974550697e-05, + "loss": 0.3135, + "num_input_tokens_seen": 42969600, + "step": 28130 + }, + { + "epoch": 86.83771251931994, + "grad_norm": 0.42045992612838745, + "learning_rate": 1.0093327260674795e-05, + "loss": 0.2424, + "num_input_tokens_seen": 42977344, + "step": 28135 + }, + { + "epoch": 86.85316846986089, + "grad_norm": 0.6633922457695007, + "learning_rate": 1.0085447074638878e-05, + "loss": 0.2142, + "num_input_tokens_seen": 42984800, + "step": 28140 + }, + { + "epoch": 86.86862442040186, + "grad_norm": 0.42536166310310364, + "learning_rate": 1.0077569188614461e-05, + "loss": 0.2209, + "num_input_tokens_seen": 42992064, + "step": 28145 + }, + { + "epoch": 86.88408037094281, + "grad_norm": 0.41133642196655273, + "learning_rate": 1.0069693603816393e-05, + "loss": 0.1938, + "num_input_tokens_seen": 42999552, + "step": 28150 + }, + { + "epoch": 86.89953632148377, + "grad_norm": 0.3527090847492218, + "learning_rate": 1.0061820321459204e-05, + "loss": 0.2651, + "num_input_tokens_seen": 43007392, + "step": 28155 + }, + { + "epoch": 86.91499227202473, + "grad_norm": 0.6590564250946045, + "learning_rate": 1.0053949342757038e-05, + "loss": 0.2332, + "num_input_tokens_seen": 43014688, + "step": 28160 + }, + { + "epoch": 86.93044822256569, + "grad_norm": 0.8975959420204163, + "learning_rate": 1.0046080668923717e-05, + "loss": 0.2009, + "num_input_tokens_seen": 43022496, + "step": 28165 + }, + { + "epoch": 86.94590417310664, + "grad_norm": 0.43642503023147583, + "learning_rate": 1.003821430117267e-05, + "loss": 0.2245, + "num_input_tokens_seen": 43030016, + "step": 28170 + }, + { + "epoch": 86.96136012364761, + "grad_norm": 0.3712984621524811, + "learning_rate": 1.0030350240716999e-05, + "loss": 0.2155, + "num_input_tokens_seen": 43037952, + "step": 28175 + }, + { + "epoch": 86.97681607418856, + "grad_norm": 0.5707288980484009, + "learning_rate": 1.0022488488769449e-05, + "loss": 0.2679, + "num_input_tokens_seen": 43045600, + "step": 28180 + }, + { + "epoch": 86.99227202472952, + "grad_norm": 0.7516317367553711, + "learning_rate": 1.0014629046542387e-05, + "loss": 0.3506, + "num_input_tokens_seen": 43052608, + "step": 28185 + }, + { + "epoch": 87.00618238021639, + "grad_norm": 0.47716063261032104, + "learning_rate": 1.0006771915247842e-05, + "loss": 0.2439, + "num_input_tokens_seen": 43059760, + "step": 28190 + }, + { + "epoch": 87.02163833075734, + "grad_norm": 0.36040839552879333, + "learning_rate": 9.998917096097495e-06, + "loss": 0.2704, + "num_input_tokens_seen": 43067056, + "step": 28195 + }, + { + "epoch": 87.0370942812983, + "grad_norm": 0.5695573091506958, + "learning_rate": 9.991064590302638e-06, + "loss": 0.2752, + "num_input_tokens_seen": 43074800, + "step": 28200 + }, + { + "epoch": 87.0370942812983, + "eval_loss": 0.3035561144351959, + "eval_runtime": 6.28, + "eval_samples_per_second": 91.56, + "eval_steps_per_second": 22.93, + "num_input_tokens_seen": 43074800, + "step": 28200 + }, + { + "epoch": 87.05255023183926, + "grad_norm": 0.5543006658554077, + "learning_rate": 9.983214399074241e-06, + "loss": 0.2221, + "num_input_tokens_seen": 43082448, + "step": 28205 + }, + { + "epoch": 87.06800618238022, + "grad_norm": 0.44083112478256226, + "learning_rate": 9.975366523622893e-06, + "loss": 0.2927, + "num_input_tokens_seen": 43090096, + "step": 28210 + }, + { + "epoch": 87.08346213292117, + "grad_norm": 0.5157335996627808, + "learning_rate": 9.967520965158841e-06, + "loss": 0.229, + "num_input_tokens_seen": 43097488, + "step": 28215 + }, + { + "epoch": 87.09891808346214, + "grad_norm": 0.39803266525268555, + "learning_rate": 9.95967772489197e-06, + "loss": 0.2716, + "num_input_tokens_seen": 43105296, + "step": 28220 + }, + { + "epoch": 87.11437403400309, + "grad_norm": 0.41385963559150696, + "learning_rate": 9.951836804031794e-06, + "loss": 0.2175, + "num_input_tokens_seen": 43112688, + "step": 28225 + }, + { + "epoch": 87.12982998454405, + "grad_norm": 0.6378418803215027, + "learning_rate": 9.943998203787489e-06, + "loss": 0.2356, + "num_input_tokens_seen": 43119920, + "step": 28230 + }, + { + "epoch": 87.14528593508501, + "grad_norm": 0.8365742564201355, + "learning_rate": 9.936161925367874e-06, + "loss": 0.2627, + "num_input_tokens_seen": 43127984, + "step": 28235 + }, + { + "epoch": 87.16074188562597, + "grad_norm": 0.45749104022979736, + "learning_rate": 9.928327969981386e-06, + "loss": 0.2831, + "num_input_tokens_seen": 43135536, + "step": 28240 + }, + { + "epoch": 87.17619783616692, + "grad_norm": 0.5543023347854614, + "learning_rate": 9.920496338836135e-06, + "loss": 0.2464, + "num_input_tokens_seen": 43143184, + "step": 28245 + }, + { + "epoch": 87.19165378670789, + "grad_norm": 0.541142463684082, + "learning_rate": 9.912667033139844e-06, + "loss": 0.217, + "num_input_tokens_seen": 43151056, + "step": 28250 + }, + { + "epoch": 87.20710973724884, + "grad_norm": 0.6719357371330261, + "learning_rate": 9.904840054099893e-06, + "loss": 0.2671, + "num_input_tokens_seen": 43158672, + "step": 28255 + }, + { + "epoch": 87.2225656877898, + "grad_norm": 0.4497307240962982, + "learning_rate": 9.897015402923312e-06, + "loss": 0.2379, + "num_input_tokens_seen": 43166416, + "step": 28260 + }, + { + "epoch": 87.23802163833076, + "grad_norm": 0.582970380783081, + "learning_rate": 9.889193080816744e-06, + "loss": 0.3239, + "num_input_tokens_seen": 43174192, + "step": 28265 + }, + { + "epoch": 87.25347758887172, + "grad_norm": 0.4104613959789276, + "learning_rate": 9.881373088986498e-06, + "loss": 0.2194, + "num_input_tokens_seen": 43181712, + "step": 28270 + }, + { + "epoch": 87.26893353941267, + "grad_norm": 0.7269062995910645, + "learning_rate": 9.873555428638523e-06, + "loss": 0.2536, + "num_input_tokens_seen": 43189008, + "step": 28275 + }, + { + "epoch": 87.28438948995363, + "grad_norm": 0.4839138388633728, + "learning_rate": 9.865740100978383e-06, + "loss": 0.2299, + "num_input_tokens_seen": 43196944, + "step": 28280 + }, + { + "epoch": 87.2998454404946, + "grad_norm": 0.3870372176170349, + "learning_rate": 9.857927107211315e-06, + "loss": 0.2152, + "num_input_tokens_seen": 43204368, + "step": 28285 + }, + { + "epoch": 87.31530139103555, + "grad_norm": 0.6527467370033264, + "learning_rate": 9.850116448542177e-06, + "loss": 0.3343, + "num_input_tokens_seen": 43211952, + "step": 28290 + }, + { + "epoch": 87.3307573415765, + "grad_norm": 0.34698542952537537, + "learning_rate": 9.842308126175457e-06, + "loss": 0.219, + "num_input_tokens_seen": 43219312, + "step": 28295 + }, + { + "epoch": 87.34621329211747, + "grad_norm": 0.39485859870910645, + "learning_rate": 9.834502141315315e-06, + "loss": 0.2873, + "num_input_tokens_seen": 43226736, + "step": 28300 + }, + { + "epoch": 87.36166924265842, + "grad_norm": 0.3655512034893036, + "learning_rate": 9.82669849516552e-06, + "loss": 0.2323, + "num_input_tokens_seen": 43233872, + "step": 28305 + }, + { + "epoch": 87.37712519319938, + "grad_norm": 0.6034566760063171, + "learning_rate": 9.818897188929493e-06, + "loss": 0.2279, + "num_input_tokens_seen": 43241168, + "step": 28310 + }, + { + "epoch": 87.39258114374034, + "grad_norm": 0.45497843623161316, + "learning_rate": 9.811098223810309e-06, + "loss": 0.2556, + "num_input_tokens_seen": 43248624, + "step": 28315 + }, + { + "epoch": 87.4080370942813, + "grad_norm": 0.41176798939704895, + "learning_rate": 9.803301601010641e-06, + "loss": 0.2545, + "num_input_tokens_seen": 43256176, + "step": 28320 + }, + { + "epoch": 87.42349304482225, + "grad_norm": 0.37140271067619324, + "learning_rate": 9.795507321732853e-06, + "loss": 0.234, + "num_input_tokens_seen": 43263952, + "step": 28325 + }, + { + "epoch": 87.43894899536322, + "grad_norm": 0.5638495683670044, + "learning_rate": 9.787715387178898e-06, + "loss": 0.2492, + "num_input_tokens_seen": 43271696, + "step": 28330 + }, + { + "epoch": 87.45440494590417, + "grad_norm": 1.1945308446884155, + "learning_rate": 9.779925798550399e-06, + "loss": 0.275, + "num_input_tokens_seen": 43279664, + "step": 28335 + }, + { + "epoch": 87.46986089644513, + "grad_norm": 0.38476547598838806, + "learning_rate": 9.772138557048619e-06, + "loss": 0.2443, + "num_input_tokens_seen": 43287056, + "step": 28340 + }, + { + "epoch": 87.4853168469861, + "grad_norm": 0.33331164717674255, + "learning_rate": 9.764353663874426e-06, + "loss": 0.2734, + "num_input_tokens_seen": 43294288, + "step": 28345 + }, + { + "epoch": 87.50077279752705, + "grad_norm": 0.5683243274688721, + "learning_rate": 9.756571120228375e-06, + "loss": 0.2647, + "num_input_tokens_seen": 43302320, + "step": 28350 + }, + { + "epoch": 87.516228748068, + "grad_norm": 0.6034025549888611, + "learning_rate": 9.748790927310605e-06, + "loss": 0.2128, + "num_input_tokens_seen": 43309968, + "step": 28355 + }, + { + "epoch": 87.53168469860897, + "grad_norm": 0.41369205713272095, + "learning_rate": 9.741013086320946e-06, + "loss": 0.2135, + "num_input_tokens_seen": 43317648, + "step": 28360 + }, + { + "epoch": 87.54714064914992, + "grad_norm": 1.118666410446167, + "learning_rate": 9.733237598458821e-06, + "loss": 0.2394, + "num_input_tokens_seen": 43325232, + "step": 28365 + }, + { + "epoch": 87.56259659969088, + "grad_norm": 0.6043142080307007, + "learning_rate": 9.725464464923308e-06, + "loss": 0.2255, + "num_input_tokens_seen": 43332720, + "step": 28370 + }, + { + "epoch": 87.57805255023185, + "grad_norm": 0.40870940685272217, + "learning_rate": 9.717693686913123e-06, + "loss": 0.2107, + "num_input_tokens_seen": 43340432, + "step": 28375 + }, + { + "epoch": 87.5935085007728, + "grad_norm": 0.5471151471138, + "learning_rate": 9.709925265626632e-06, + "loss": 0.2023, + "num_input_tokens_seen": 43348144, + "step": 28380 + }, + { + "epoch": 87.60896445131375, + "grad_norm": 0.8271473050117493, + "learning_rate": 9.702159202261801e-06, + "loss": 0.1932, + "num_input_tokens_seen": 43355888, + "step": 28385 + }, + { + "epoch": 87.62442040185472, + "grad_norm": 0.39087626338005066, + "learning_rate": 9.694395498016268e-06, + "loss": 0.2032, + "num_input_tokens_seen": 43363152, + "step": 28390 + }, + { + "epoch": 87.63987635239567, + "grad_norm": 0.4141581356525421, + "learning_rate": 9.686634154087298e-06, + "loss": 0.2505, + "num_input_tokens_seen": 43370736, + "step": 28395 + }, + { + "epoch": 87.65533230293663, + "grad_norm": 0.6232531666755676, + "learning_rate": 9.678875171671776e-06, + "loss": 0.25, + "num_input_tokens_seen": 43378640, + "step": 28400 + }, + { + "epoch": 87.65533230293663, + "eval_loss": 0.30358967185020447, + "eval_runtime": 6.3031, + "eval_samples_per_second": 91.226, + "eval_steps_per_second": 22.846, + "num_input_tokens_seen": 43378640, + "step": 28400 + }, + { + "epoch": 87.67078825347758, + "grad_norm": 0.8536402583122253, + "learning_rate": 9.671118551966246e-06, + "loss": 0.2578, + "num_input_tokens_seen": 43386064, + "step": 28405 + }, + { + "epoch": 87.68624420401855, + "grad_norm": 0.3040732145309448, + "learning_rate": 9.66336429616686e-06, + "loss": 0.1943, + "num_input_tokens_seen": 43394064, + "step": 28410 + }, + { + "epoch": 87.7017001545595, + "grad_norm": 0.6226955056190491, + "learning_rate": 9.655612405469436e-06, + "loss": 0.1935, + "num_input_tokens_seen": 43400880, + "step": 28415 + }, + { + "epoch": 87.71715610510046, + "grad_norm": 0.3846195638179779, + "learning_rate": 9.647862881069413e-06, + "loss": 0.2225, + "num_input_tokens_seen": 43407888, + "step": 28420 + }, + { + "epoch": 87.73261205564143, + "grad_norm": 0.5079991221427917, + "learning_rate": 9.640115724161855e-06, + "loss": 0.2949, + "num_input_tokens_seen": 43415408, + "step": 28425 + }, + { + "epoch": 87.74806800618238, + "grad_norm": 0.8068957924842834, + "learning_rate": 9.632370935941483e-06, + "loss": 0.2718, + "num_input_tokens_seen": 43423216, + "step": 28430 + }, + { + "epoch": 87.76352395672333, + "grad_norm": 0.5664185881614685, + "learning_rate": 9.624628517602634e-06, + "loss": 0.26, + "num_input_tokens_seen": 43430992, + "step": 28435 + }, + { + "epoch": 87.7789799072643, + "grad_norm": 0.4318919777870178, + "learning_rate": 9.61688847033928e-06, + "loss": 0.2285, + "num_input_tokens_seen": 43438704, + "step": 28440 + }, + { + "epoch": 87.79443585780525, + "grad_norm": 0.6406595706939697, + "learning_rate": 9.609150795345051e-06, + "loss": 0.2688, + "num_input_tokens_seen": 43446224, + "step": 28445 + }, + { + "epoch": 87.80989180834621, + "grad_norm": 0.6716864705085754, + "learning_rate": 9.601415493813171e-06, + "loss": 0.2794, + "num_input_tokens_seen": 43453776, + "step": 28450 + }, + { + "epoch": 87.82534775888718, + "grad_norm": 0.7629016041755676, + "learning_rate": 9.593682566936533e-06, + "loss": 0.2667, + "num_input_tokens_seen": 43461840, + "step": 28455 + }, + { + "epoch": 87.84080370942813, + "grad_norm": 0.8804354667663574, + "learning_rate": 9.58595201590766e-06, + "loss": 0.2642, + "num_input_tokens_seen": 43470096, + "step": 28460 + }, + { + "epoch": 87.85625965996908, + "grad_norm": 0.6456589102745056, + "learning_rate": 9.578223841918681e-06, + "loss": 0.2578, + "num_input_tokens_seen": 43477712, + "step": 28465 + }, + { + "epoch": 87.87171561051005, + "grad_norm": 0.4412784278392792, + "learning_rate": 9.570498046161389e-06, + "loss": 0.2762, + "num_input_tokens_seen": 43485168, + "step": 28470 + }, + { + "epoch": 87.887171561051, + "grad_norm": 0.7424514293670654, + "learning_rate": 9.562774629827206e-06, + "loss": 0.2449, + "num_input_tokens_seen": 43492912, + "step": 28475 + }, + { + "epoch": 87.90262751159196, + "grad_norm": 0.49115896224975586, + "learning_rate": 9.555053594107163e-06, + "loss": 0.2317, + "num_input_tokens_seen": 43500368, + "step": 28480 + }, + { + "epoch": 87.91808346213293, + "grad_norm": 0.507375955581665, + "learning_rate": 9.547334940191957e-06, + "loss": 0.2748, + "num_input_tokens_seen": 43508240, + "step": 28485 + }, + { + "epoch": 87.93353941267388, + "grad_norm": 0.7094714641571045, + "learning_rate": 9.539618669271886e-06, + "loss": 0.2385, + "num_input_tokens_seen": 43515696, + "step": 28490 + }, + { + "epoch": 87.94899536321483, + "grad_norm": 0.4729014039039612, + "learning_rate": 9.531904782536904e-06, + "loss": 0.2094, + "num_input_tokens_seen": 43523248, + "step": 28495 + }, + { + "epoch": 87.9644513137558, + "grad_norm": 0.5838273167610168, + "learning_rate": 9.524193281176597e-06, + "loss": 0.2577, + "num_input_tokens_seen": 43530608, + "step": 28500 + }, + { + "epoch": 87.97990726429676, + "grad_norm": 0.7336195707321167, + "learning_rate": 9.516484166380165e-06, + "loss": 0.2763, + "num_input_tokens_seen": 43538608, + "step": 28505 + }, + { + "epoch": 87.99536321483771, + "grad_norm": 0.7760328650474548, + "learning_rate": 9.508777439336447e-06, + "loss": 0.265, + "num_input_tokens_seen": 43546384, + "step": 28510 + }, + { + "epoch": 88.00927357032458, + "grad_norm": 0.5142660737037659, + "learning_rate": 9.50107310123393e-06, + "loss": 0.3423, + "num_input_tokens_seen": 43553376, + "step": 28515 + }, + { + "epoch": 88.02472952086553, + "grad_norm": 0.6252831220626831, + "learning_rate": 9.493371153260702e-06, + "loss": 0.2485, + "num_input_tokens_seen": 43561216, + "step": 28520 + }, + { + "epoch": 88.04018547140649, + "grad_norm": 0.43210700154304504, + "learning_rate": 9.485671596604523e-06, + "loss": 0.2668, + "num_input_tokens_seen": 43569344, + "step": 28525 + }, + { + "epoch": 88.05564142194746, + "grad_norm": 0.5454025864601135, + "learning_rate": 9.477974432452738e-06, + "loss": 0.2452, + "num_input_tokens_seen": 43577632, + "step": 28530 + }, + { + "epoch": 88.07109737248841, + "grad_norm": 0.5789111852645874, + "learning_rate": 9.470279661992356e-06, + "loss": 0.2766, + "num_input_tokens_seen": 43585216, + "step": 28535 + }, + { + "epoch": 88.08655332302936, + "grad_norm": 0.3467225134372711, + "learning_rate": 9.462587286410021e-06, + "loss": 0.2428, + "num_input_tokens_seen": 43592576, + "step": 28540 + }, + { + "epoch": 88.10200927357033, + "grad_norm": 0.6099010705947876, + "learning_rate": 9.454897306891972e-06, + "loss": 0.2687, + "num_input_tokens_seen": 43599808, + "step": 28545 + }, + { + "epoch": 88.11746522411129, + "grad_norm": 0.40780603885650635, + "learning_rate": 9.44720972462411e-06, + "loss": 0.2802, + "num_input_tokens_seen": 43606944, + "step": 28550 + }, + { + "epoch": 88.13292117465224, + "grad_norm": 0.6100894808769226, + "learning_rate": 9.439524540791964e-06, + "loss": 0.2241, + "num_input_tokens_seen": 43615392, + "step": 28555 + }, + { + "epoch": 88.14837712519319, + "grad_norm": 0.3779960870742798, + "learning_rate": 9.431841756580673e-06, + "loss": 0.2393, + "num_input_tokens_seen": 43622816, + "step": 28560 + }, + { + "epoch": 88.16383307573416, + "grad_norm": 0.33574941754341125, + "learning_rate": 9.42416137317503e-06, + "loss": 0.2536, + "num_input_tokens_seen": 43629984, + "step": 28565 + }, + { + "epoch": 88.17928902627511, + "grad_norm": 0.4371608793735504, + "learning_rate": 9.416483391759437e-06, + "loss": 0.2193, + "num_input_tokens_seen": 43637856, + "step": 28570 + }, + { + "epoch": 88.19474497681607, + "grad_norm": 0.5165972113609314, + "learning_rate": 9.408807813517945e-06, + "loss": 0.2409, + "num_input_tokens_seen": 43645728, + "step": 28575 + }, + { + "epoch": 88.21020092735704, + "grad_norm": 0.4443740248680115, + "learning_rate": 9.401134639634221e-06, + "loss": 0.2031, + "num_input_tokens_seen": 43653376, + "step": 28580 + }, + { + "epoch": 88.22565687789799, + "grad_norm": 0.6683840155601501, + "learning_rate": 9.393463871291555e-06, + "loss": 0.2152, + "num_input_tokens_seen": 43660928, + "step": 28585 + }, + { + "epoch": 88.24111282843894, + "grad_norm": 0.4604256749153137, + "learning_rate": 9.385795509672881e-06, + "loss": 0.2115, + "num_input_tokens_seen": 43668640, + "step": 28590 + }, + { + "epoch": 88.25656877897991, + "grad_norm": 0.43013203144073486, + "learning_rate": 9.378129555960771e-06, + "loss": 0.2482, + "num_input_tokens_seen": 43676672, + "step": 28595 + }, + { + "epoch": 88.27202472952087, + "grad_norm": 0.5471272468566895, + "learning_rate": 9.370466011337392e-06, + "loss": 0.2937, + "num_input_tokens_seen": 43683840, + "step": 28600 + }, + { + "epoch": 88.27202472952087, + "eval_loss": 0.3019525706768036, + "eval_runtime": 6.3114, + "eval_samples_per_second": 91.104, + "eval_steps_per_second": 22.816, + "num_input_tokens_seen": 43683840, + "step": 28600 + }, + { + "epoch": 88.28748068006182, + "grad_norm": 0.5577519536018372, + "learning_rate": 9.362804876984573e-06, + "loss": 0.2262, + "num_input_tokens_seen": 43691296, + "step": 28605 + }, + { + "epoch": 88.30293663060279, + "grad_norm": 0.3822959363460541, + "learning_rate": 9.355146154083747e-06, + "loss": 0.218, + "num_input_tokens_seen": 43698464, + "step": 28610 + }, + { + "epoch": 88.31839258114374, + "grad_norm": 0.4437844455242157, + "learning_rate": 9.347489843815987e-06, + "loss": 0.1957, + "num_input_tokens_seen": 43706272, + "step": 28615 + }, + { + "epoch": 88.3338485316847, + "grad_norm": 0.4376005530357361, + "learning_rate": 9.339835947362002e-06, + "loss": 0.2742, + "num_input_tokens_seen": 43713920, + "step": 28620 + }, + { + "epoch": 88.34930448222566, + "grad_norm": 0.6851513981819153, + "learning_rate": 9.332184465902105e-06, + "loss": 0.2831, + "num_input_tokens_seen": 43721568, + "step": 28625 + }, + { + "epoch": 88.36476043276662, + "grad_norm": 0.46622782945632935, + "learning_rate": 9.324535400616266e-06, + "loss": 0.2218, + "num_input_tokens_seen": 43728480, + "step": 28630 + }, + { + "epoch": 88.38021638330757, + "grad_norm": 0.5917273759841919, + "learning_rate": 9.31688875268405e-06, + "loss": 0.2244, + "num_input_tokens_seen": 43736224, + "step": 28635 + }, + { + "epoch": 88.39567233384854, + "grad_norm": 0.45074036717414856, + "learning_rate": 9.309244523284674e-06, + "loss": 0.23, + "num_input_tokens_seen": 43743616, + "step": 28640 + }, + { + "epoch": 88.41112828438949, + "grad_norm": 0.5948356986045837, + "learning_rate": 9.301602713596982e-06, + "loss": 0.2652, + "num_input_tokens_seen": 43751072, + "step": 28645 + }, + { + "epoch": 88.42658423493044, + "grad_norm": 0.4777241051197052, + "learning_rate": 9.293963324799432e-06, + "loss": 0.2657, + "num_input_tokens_seen": 43758560, + "step": 28650 + }, + { + "epoch": 88.44204018547141, + "grad_norm": 0.60769122838974, + "learning_rate": 9.286326358070104e-06, + "loss": 0.2567, + "num_input_tokens_seen": 43766080, + "step": 28655 + }, + { + "epoch": 88.45749613601237, + "grad_norm": 0.3588170111179352, + "learning_rate": 9.278691814586729e-06, + "loss": 0.2941, + "num_input_tokens_seen": 43773472, + "step": 28660 + }, + { + "epoch": 88.47295208655332, + "grad_norm": 0.5822659134864807, + "learning_rate": 9.271059695526635e-06, + "loss": 0.2317, + "num_input_tokens_seen": 43781472, + "step": 28665 + }, + { + "epoch": 88.48840803709429, + "grad_norm": 0.4151764512062073, + "learning_rate": 9.263430002066805e-06, + "loss": 0.2279, + "num_input_tokens_seen": 43789056, + "step": 28670 + }, + { + "epoch": 88.50386398763524, + "grad_norm": 0.40164443850517273, + "learning_rate": 9.25580273538382e-06, + "loss": 0.2722, + "num_input_tokens_seen": 43796480, + "step": 28675 + }, + { + "epoch": 88.5193199381762, + "grad_norm": 0.4510716199874878, + "learning_rate": 9.248177896653907e-06, + "loss": 0.2989, + "num_input_tokens_seen": 43804800, + "step": 28680 + }, + { + "epoch": 88.53477588871715, + "grad_norm": 0.47675731778144836, + "learning_rate": 9.240555487052918e-06, + "loss": 0.2516, + "num_input_tokens_seen": 43812512, + "step": 28685 + }, + { + "epoch": 88.55023183925812, + "grad_norm": 0.692221999168396, + "learning_rate": 9.232935507756313e-06, + "loss": 0.2044, + "num_input_tokens_seen": 43820064, + "step": 28690 + }, + { + "epoch": 88.56568778979907, + "grad_norm": 0.6453453302383423, + "learning_rate": 9.225317959939193e-06, + "loss": 0.3104, + "num_input_tokens_seen": 43827712, + "step": 28695 + }, + { + "epoch": 88.58114374034002, + "grad_norm": 0.721460223197937, + "learning_rate": 9.217702844776287e-06, + "loss": 0.2552, + "num_input_tokens_seen": 43834912, + "step": 28700 + }, + { + "epoch": 88.59659969088099, + "grad_norm": 0.43861955404281616, + "learning_rate": 9.210090163441929e-06, + "loss": 0.2732, + "num_input_tokens_seen": 43842624, + "step": 28705 + }, + { + "epoch": 88.61205564142195, + "grad_norm": 0.559405505657196, + "learning_rate": 9.202479917110105e-06, + "loss": 0.2198, + "num_input_tokens_seen": 43849888, + "step": 28710 + }, + { + "epoch": 88.6275115919629, + "grad_norm": 0.4174163341522217, + "learning_rate": 9.194872106954392e-06, + "loss": 0.2548, + "num_input_tokens_seen": 43857120, + "step": 28715 + }, + { + "epoch": 88.64296754250387, + "grad_norm": 0.650650680065155, + "learning_rate": 9.187266734148029e-06, + "loss": 0.2232, + "num_input_tokens_seen": 43864736, + "step": 28720 + }, + { + "epoch": 88.65842349304482, + "grad_norm": 0.4826023280620575, + "learning_rate": 9.179663799863849e-06, + "loss": 0.2249, + "num_input_tokens_seen": 43872576, + "step": 28725 + }, + { + "epoch": 88.67387944358578, + "grad_norm": 0.4857155382633209, + "learning_rate": 9.172063305274317e-06, + "loss": 0.25, + "num_input_tokens_seen": 43880544, + "step": 28730 + }, + { + "epoch": 88.68933539412674, + "grad_norm": 0.5598483085632324, + "learning_rate": 9.164465251551527e-06, + "loss": 0.273, + "num_input_tokens_seen": 43888704, + "step": 28735 + }, + { + "epoch": 88.7047913446677, + "grad_norm": 0.9118428230285645, + "learning_rate": 9.156869639867205e-06, + "loss": 0.2717, + "num_input_tokens_seen": 43896832, + "step": 28740 + }, + { + "epoch": 88.72024729520865, + "grad_norm": 0.5860568881034851, + "learning_rate": 9.149276471392677e-06, + "loss": 0.2258, + "num_input_tokens_seen": 43904256, + "step": 28745 + }, + { + "epoch": 88.73570324574962, + "grad_norm": 0.3937710225582123, + "learning_rate": 9.141685747298914e-06, + "loss": 0.2245, + "num_input_tokens_seen": 43912576, + "step": 28750 + }, + { + "epoch": 88.75115919629057, + "grad_norm": 0.4947090744972229, + "learning_rate": 9.13409746875649e-06, + "loss": 0.2556, + "num_input_tokens_seen": 43919840, + "step": 28755 + }, + { + "epoch": 88.76661514683153, + "grad_norm": 0.44946780800819397, + "learning_rate": 9.12651163693562e-06, + "loss": 0.2837, + "num_input_tokens_seen": 43927296, + "step": 28760 + }, + { + "epoch": 88.7820710973725, + "grad_norm": 0.7154698967933655, + "learning_rate": 9.11892825300614e-06, + "loss": 0.2153, + "num_input_tokens_seen": 43935008, + "step": 28765 + }, + { + "epoch": 88.79752704791345, + "grad_norm": 0.5291147828102112, + "learning_rate": 9.111347318137491e-06, + "loss": 0.2401, + "num_input_tokens_seen": 43942720, + "step": 28770 + }, + { + "epoch": 88.8129829984544, + "grad_norm": 0.5008749961853027, + "learning_rate": 9.103768833498755e-06, + "loss": 0.2556, + "num_input_tokens_seen": 43950720, + "step": 28775 + }, + { + "epoch": 88.82843894899537, + "grad_norm": 0.4139895737171173, + "learning_rate": 9.096192800258639e-06, + "loss": 0.2385, + "num_input_tokens_seen": 43958080, + "step": 28780 + }, + { + "epoch": 88.84389489953632, + "grad_norm": 0.45842891931533813, + "learning_rate": 9.088619219585443e-06, + "loss": 0.21, + "num_input_tokens_seen": 43965504, + "step": 28785 + }, + { + "epoch": 88.85935085007728, + "grad_norm": 0.4856281578540802, + "learning_rate": 9.081048092647127e-06, + "loss": 0.2348, + "num_input_tokens_seen": 43973472, + "step": 28790 + }, + { + "epoch": 88.87480680061825, + "grad_norm": 0.30532464385032654, + "learning_rate": 9.073479420611245e-06, + "loss": 0.22, + "num_input_tokens_seen": 43980576, + "step": 28795 + }, + { + "epoch": 88.8902627511592, + "grad_norm": 0.6682021617889404, + "learning_rate": 9.065913204644974e-06, + "loss": 0.2567, + "num_input_tokens_seen": 43988256, + "step": 28800 + }, + { + "epoch": 88.8902627511592, + "eval_loss": 0.3020457625389099, + "eval_runtime": 6.3026, + "eval_samples_per_second": 91.232, + "eval_steps_per_second": 22.848, + "num_input_tokens_seen": 43988256, + "step": 28800 + }, + { + "epoch": 88.90571870170015, + "grad_norm": 0.4708431363105774, + "learning_rate": 9.058349445915135e-06, + "loss": 0.3035, + "num_input_tokens_seen": 43996320, + "step": 28805 + }, + { + "epoch": 88.9211746522411, + "grad_norm": 0.6028380393981934, + "learning_rate": 9.050788145588138e-06, + "loss": 0.2877, + "num_input_tokens_seen": 44003584, + "step": 28810 + }, + { + "epoch": 88.93663060278207, + "grad_norm": 0.4119549095630646, + "learning_rate": 9.043229304830039e-06, + "loss": 0.2091, + "num_input_tokens_seen": 44011360, + "step": 28815 + }, + { + "epoch": 88.95208655332303, + "grad_norm": 0.5040522813796997, + "learning_rate": 9.035672924806515e-06, + "loss": 0.2229, + "num_input_tokens_seen": 44019168, + "step": 28820 + }, + { + "epoch": 88.96754250386398, + "grad_norm": 0.6093661189079285, + "learning_rate": 9.028119006682839e-06, + "loss": 0.2336, + "num_input_tokens_seen": 44026848, + "step": 28825 + }, + { + "epoch": 88.98299845440495, + "grad_norm": 0.48338013887405396, + "learning_rate": 9.020567551623935e-06, + "loss": 0.2657, + "num_input_tokens_seen": 44034336, + "step": 28830 + }, + { + "epoch": 88.9984544049459, + "grad_norm": 0.42407071590423584, + "learning_rate": 9.013018560794318e-06, + "loss": 0.2096, + "num_input_tokens_seen": 44042144, + "step": 28835 + }, + { + "epoch": 89.01236476043276, + "grad_norm": 0.7108706831932068, + "learning_rate": 9.005472035358139e-06, + "loss": 0.2734, + "num_input_tokens_seen": 44049072, + "step": 28840 + }, + { + "epoch": 89.02782071097373, + "grad_norm": 0.4217952489852905, + "learning_rate": 8.997927976479185e-06, + "loss": 0.2128, + "num_input_tokens_seen": 44057008, + "step": 28845 + }, + { + "epoch": 89.04327666151468, + "grad_norm": 0.5372127294540405, + "learning_rate": 8.99038638532082e-06, + "loss": 0.2691, + "num_input_tokens_seen": 44064624, + "step": 28850 + }, + { + "epoch": 89.05873261205564, + "grad_norm": 0.5166469216346741, + "learning_rate": 8.982847263046065e-06, + "loss": 0.2886, + "num_input_tokens_seen": 44072464, + "step": 28855 + }, + { + "epoch": 89.0741885625966, + "grad_norm": 0.5662027597427368, + "learning_rate": 8.975310610817555e-06, + "loss": 0.2149, + "num_input_tokens_seen": 44080400, + "step": 28860 + }, + { + "epoch": 89.08964451313756, + "grad_norm": 0.4213014245033264, + "learning_rate": 8.967776429797528e-06, + "loss": 0.211, + "num_input_tokens_seen": 44088432, + "step": 28865 + }, + { + "epoch": 89.10510046367851, + "grad_norm": 0.5020546317100525, + "learning_rate": 8.960244721147842e-06, + "loss": 0.2574, + "num_input_tokens_seen": 44095920, + "step": 28870 + }, + { + "epoch": 89.12055641421948, + "grad_norm": 0.5780847668647766, + "learning_rate": 8.952715486029995e-06, + "loss": 0.2281, + "num_input_tokens_seen": 44103440, + "step": 28875 + }, + { + "epoch": 89.13601236476043, + "grad_norm": 0.46011853218078613, + "learning_rate": 8.945188725605075e-06, + "loss": 0.2277, + "num_input_tokens_seen": 44111344, + "step": 28880 + }, + { + "epoch": 89.15146831530139, + "grad_norm": 0.4759959578514099, + "learning_rate": 8.937664441033817e-06, + "loss": 0.2681, + "num_input_tokens_seen": 44118288, + "step": 28885 + }, + { + "epoch": 89.16692426584235, + "grad_norm": 0.5592358708381653, + "learning_rate": 8.930142633476549e-06, + "loss": 0.2653, + "num_input_tokens_seen": 44126128, + "step": 28890 + }, + { + "epoch": 89.18238021638331, + "grad_norm": 0.7288039922714233, + "learning_rate": 8.92262330409323e-06, + "loss": 0.2118, + "num_input_tokens_seen": 44133904, + "step": 28895 + }, + { + "epoch": 89.19783616692426, + "grad_norm": 0.8151293992996216, + "learning_rate": 8.915106454043448e-06, + "loss": 0.2654, + "num_input_tokens_seen": 44141392, + "step": 28900 + }, + { + "epoch": 89.21329211746523, + "grad_norm": 0.555464506149292, + "learning_rate": 8.90759208448638e-06, + "loss": 0.2924, + "num_input_tokens_seen": 44149392, + "step": 28905 + }, + { + "epoch": 89.22874806800618, + "grad_norm": 0.5377168655395508, + "learning_rate": 8.900080196580848e-06, + "loss": 0.2538, + "num_input_tokens_seen": 44156944, + "step": 28910 + }, + { + "epoch": 89.24420401854714, + "grad_norm": 0.3962329626083374, + "learning_rate": 8.892570791485267e-06, + "loss": 0.2445, + "num_input_tokens_seen": 44164944, + "step": 28915 + }, + { + "epoch": 89.2596599690881, + "grad_norm": 0.3761639893054962, + "learning_rate": 8.885063870357688e-06, + "loss": 0.2276, + "num_input_tokens_seen": 44172624, + "step": 28920 + }, + { + "epoch": 89.27511591962906, + "grad_norm": 0.4826631546020508, + "learning_rate": 8.87755943435578e-06, + "loss": 0.2515, + "num_input_tokens_seen": 44180368, + "step": 28925 + }, + { + "epoch": 89.29057187017001, + "grad_norm": 0.6093316674232483, + "learning_rate": 8.87005748463681e-06, + "loss": 0.231, + "num_input_tokens_seen": 44188208, + "step": 28930 + }, + { + "epoch": 89.30602782071098, + "grad_norm": 0.5855954885482788, + "learning_rate": 8.862558022357681e-06, + "loss": 0.2476, + "num_input_tokens_seen": 44195440, + "step": 28935 + }, + { + "epoch": 89.32148377125193, + "grad_norm": 0.378889262676239, + "learning_rate": 8.855061048674903e-06, + "loss": 0.2312, + "num_input_tokens_seen": 44202864, + "step": 28940 + }, + { + "epoch": 89.33693972179289, + "grad_norm": 0.6213170886039734, + "learning_rate": 8.847566564744595e-06, + "loss": 0.3082, + "num_input_tokens_seen": 44210576, + "step": 28945 + }, + { + "epoch": 89.35239567233384, + "grad_norm": 0.35212600231170654, + "learning_rate": 8.840074571722512e-06, + "loss": 0.2127, + "num_input_tokens_seen": 44218352, + "step": 28950 + }, + { + "epoch": 89.36785162287481, + "grad_norm": 0.7420196533203125, + "learning_rate": 8.832585070764002e-06, + "loss": 0.266, + "num_input_tokens_seen": 44226256, + "step": 28955 + }, + { + "epoch": 89.38330757341576, + "grad_norm": 0.42499423027038574, + "learning_rate": 8.825098063024045e-06, + "loss": 0.3439, + "num_input_tokens_seen": 44234128, + "step": 28960 + }, + { + "epoch": 89.39876352395672, + "grad_norm": 0.49822497367858887, + "learning_rate": 8.817613549657244e-06, + "loss": 0.2502, + "num_input_tokens_seen": 44241680, + "step": 28965 + }, + { + "epoch": 89.41421947449768, + "grad_norm": 0.6155866980552673, + "learning_rate": 8.810131531817783e-06, + "loss": 0.248, + "num_input_tokens_seen": 44248912, + "step": 28970 + }, + { + "epoch": 89.42967542503864, + "grad_norm": 0.37737971544265747, + "learning_rate": 8.802652010659496e-06, + "loss": 0.2702, + "num_input_tokens_seen": 44256176, + "step": 28975 + }, + { + "epoch": 89.44513137557959, + "grad_norm": 0.7394519448280334, + "learning_rate": 8.795174987335827e-06, + "loss": 0.1991, + "num_input_tokens_seen": 44263728, + "step": 28980 + }, + { + "epoch": 89.46058732612056, + "grad_norm": 0.44710448384284973, + "learning_rate": 8.787700462999807e-06, + "loss": 0.3223, + "num_input_tokens_seen": 44271024, + "step": 28985 + }, + { + "epoch": 89.47604327666151, + "grad_norm": 0.47165513038635254, + "learning_rate": 8.780228438804122e-06, + "loss": 0.2133, + "num_input_tokens_seen": 44278992, + "step": 28990 + }, + { + "epoch": 89.49149922720247, + "grad_norm": 0.6534603834152222, + "learning_rate": 8.772758915901032e-06, + "loss": 0.216, + "num_input_tokens_seen": 44286768, + "step": 28995 + }, + { + "epoch": 89.50695517774344, + "grad_norm": 0.3224707841873169, + "learning_rate": 8.765291895442443e-06, + "loss": 0.2193, + "num_input_tokens_seen": 44294256, + "step": 29000 + }, + { + "epoch": 89.50695517774344, + "eval_loss": 0.30310919880867004, + "eval_runtime": 6.3207, + "eval_samples_per_second": 90.97, + "eval_steps_per_second": 22.782, + "num_input_tokens_seen": 44294256, + "step": 29000 + }, + { + "epoch": 89.52241112828439, + "grad_norm": 0.5090948939323425, + "learning_rate": 8.75782737857987e-06, + "loss": 0.3298, + "num_input_tokens_seen": 44301936, + "step": 29005 + }, + { + "epoch": 89.53786707882534, + "grad_norm": 0.4226423501968384, + "learning_rate": 8.750365366464425e-06, + "loss": 0.2464, + "num_input_tokens_seen": 44309200, + "step": 29010 + }, + { + "epoch": 89.55332302936631, + "grad_norm": 0.7922381162643433, + "learning_rate": 8.742905860246838e-06, + "loss": 0.3451, + "num_input_tokens_seen": 44316688, + "step": 29015 + }, + { + "epoch": 89.56877897990726, + "grad_norm": 0.5515689253807068, + "learning_rate": 8.735448861077478e-06, + "loss": 0.2836, + "num_input_tokens_seen": 44324560, + "step": 29020 + }, + { + "epoch": 89.58423493044822, + "grad_norm": 0.583073616027832, + "learning_rate": 8.727994370106288e-06, + "loss": 0.2506, + "num_input_tokens_seen": 44332464, + "step": 29025 + }, + { + "epoch": 89.59969088098919, + "grad_norm": 0.6482725739479065, + "learning_rate": 8.720542388482861e-06, + "loss": 0.2155, + "num_input_tokens_seen": 44339920, + "step": 29030 + }, + { + "epoch": 89.61514683153014, + "grad_norm": 0.3851865530014038, + "learning_rate": 8.71309291735637e-06, + "loss": 0.221, + "num_input_tokens_seen": 44347824, + "step": 29035 + }, + { + "epoch": 89.6306027820711, + "grad_norm": 0.4445726275444031, + "learning_rate": 8.705645957875621e-06, + "loss": 0.2537, + "num_input_tokens_seen": 44355696, + "step": 29040 + }, + { + "epoch": 89.64605873261206, + "grad_norm": 0.5556478500366211, + "learning_rate": 8.698201511189048e-06, + "loss": 0.2305, + "num_input_tokens_seen": 44363632, + "step": 29045 + }, + { + "epoch": 89.66151468315302, + "grad_norm": 0.3894475996494293, + "learning_rate": 8.690759578444649e-06, + "loss": 0.1996, + "num_input_tokens_seen": 44371440, + "step": 29050 + }, + { + "epoch": 89.67697063369397, + "grad_norm": 0.34200912714004517, + "learning_rate": 8.68332016079008e-06, + "loss": 0.1985, + "num_input_tokens_seen": 44379152, + "step": 29055 + }, + { + "epoch": 89.69242658423494, + "grad_norm": 0.6356738209724426, + "learning_rate": 8.6758832593726e-06, + "loss": 0.2261, + "num_input_tokens_seen": 44386704, + "step": 29060 + }, + { + "epoch": 89.70788253477589, + "grad_norm": 0.6261863708496094, + "learning_rate": 8.668448875339053e-06, + "loss": 0.2931, + "num_input_tokens_seen": 44394352, + "step": 29065 + }, + { + "epoch": 89.72333848531684, + "grad_norm": 0.6936354041099548, + "learning_rate": 8.661017009835933e-06, + "loss": 0.2129, + "num_input_tokens_seen": 44401680, + "step": 29070 + }, + { + "epoch": 89.7387944358578, + "grad_norm": 0.7223072052001953, + "learning_rate": 8.653587664009311e-06, + "loss": 0.2394, + "num_input_tokens_seen": 44409360, + "step": 29075 + }, + { + "epoch": 89.75425038639877, + "grad_norm": 0.5832447409629822, + "learning_rate": 8.646160839004902e-06, + "loss": 0.2702, + "num_input_tokens_seen": 44417264, + "step": 29080 + }, + { + "epoch": 89.76970633693972, + "grad_norm": 0.678854763507843, + "learning_rate": 8.638736535967998e-06, + "loss": 0.215, + "num_input_tokens_seen": 44425200, + "step": 29085 + }, + { + "epoch": 89.78516228748067, + "grad_norm": 0.374032586812973, + "learning_rate": 8.631314756043535e-06, + "loss": 0.2124, + "num_input_tokens_seen": 44432624, + "step": 29090 + }, + { + "epoch": 89.80061823802164, + "grad_norm": 0.43317681550979614, + "learning_rate": 8.62389550037603e-06, + "loss": 0.2623, + "num_input_tokens_seen": 44440560, + "step": 29095 + }, + { + "epoch": 89.8160741885626, + "grad_norm": 0.36580267548561096, + "learning_rate": 8.616478770109646e-06, + "loss": 0.2128, + "num_input_tokens_seen": 44447920, + "step": 29100 + }, + { + "epoch": 89.83153013910355, + "grad_norm": 0.715548038482666, + "learning_rate": 8.609064566388111e-06, + "loss": 0.2192, + "num_input_tokens_seen": 44454672, + "step": 29105 + }, + { + "epoch": 89.84698608964452, + "grad_norm": 0.5241458415985107, + "learning_rate": 8.601652890354815e-06, + "loss": 0.2712, + "num_input_tokens_seen": 44462672, + "step": 29110 + }, + { + "epoch": 89.86244204018547, + "grad_norm": 0.43972301483154297, + "learning_rate": 8.594243743152705e-06, + "loss": 0.2609, + "num_input_tokens_seen": 44470352, + "step": 29115 + }, + { + "epoch": 89.87789799072642, + "grad_norm": 0.6595968008041382, + "learning_rate": 8.58683712592438e-06, + "loss": 0.2483, + "num_input_tokens_seen": 44478000, + "step": 29120 + }, + { + "epoch": 89.89335394126739, + "grad_norm": 0.6043790578842163, + "learning_rate": 8.579433039812037e-06, + "loss": 0.2302, + "num_input_tokens_seen": 44485296, + "step": 29125 + }, + { + "epoch": 89.90880989180835, + "grad_norm": 0.8016046285629272, + "learning_rate": 8.572031485957466e-06, + "loss": 0.2149, + "num_input_tokens_seen": 44492624, + "step": 29130 + }, + { + "epoch": 89.9242658423493, + "grad_norm": 0.5241984724998474, + "learning_rate": 8.564632465502084e-06, + "loss": 0.2584, + "num_input_tokens_seen": 44500080, + "step": 29135 + }, + { + "epoch": 89.93972179289027, + "grad_norm": 0.6176918148994446, + "learning_rate": 8.557235979586928e-06, + "loss": 0.289, + "num_input_tokens_seen": 44507632, + "step": 29140 + }, + { + "epoch": 89.95517774343122, + "grad_norm": 0.3590410649776459, + "learning_rate": 8.549842029352606e-06, + "loss": 0.2342, + "num_input_tokens_seen": 44515472, + "step": 29145 + }, + { + "epoch": 89.97063369397218, + "grad_norm": 0.38277649879455566, + "learning_rate": 8.542450615939376e-06, + "loss": 0.2901, + "num_input_tokens_seen": 44523088, + "step": 29150 + }, + { + "epoch": 89.98608964451314, + "grad_norm": 0.3288668692111969, + "learning_rate": 8.535061740487082e-06, + "loss": 0.2086, + "num_input_tokens_seen": 44530544, + "step": 29155 + }, + { + "epoch": 90.0, + "grad_norm": 0.5006799697875977, + "learning_rate": 8.527675404135168e-06, + "loss": 0.1841, + "num_input_tokens_seen": 44536928, + "step": 29160 + }, + { + "epoch": 90.01545595054095, + "grad_norm": 0.45984190702438354, + "learning_rate": 8.520291608022724e-06, + "loss": 0.206, + "num_input_tokens_seen": 44544864, + "step": 29165 + }, + { + "epoch": 90.03091190108192, + "grad_norm": 0.35473909974098206, + "learning_rate": 8.512910353288398e-06, + "loss": 0.2378, + "num_input_tokens_seen": 44552288, + "step": 29170 + }, + { + "epoch": 90.04636785162288, + "grad_norm": 0.6141825914382935, + "learning_rate": 8.505531641070486e-06, + "loss": 0.2806, + "num_input_tokens_seen": 44559840, + "step": 29175 + }, + { + "epoch": 90.06182380216383, + "grad_norm": 0.5017266273498535, + "learning_rate": 8.498155472506885e-06, + "loss": 0.2332, + "num_input_tokens_seen": 44567552, + "step": 29180 + }, + { + "epoch": 90.0772797527048, + "grad_norm": 0.48102837800979614, + "learning_rate": 8.49078184873508e-06, + "loss": 0.2354, + "num_input_tokens_seen": 44575264, + "step": 29185 + }, + { + "epoch": 90.09273570324575, + "grad_norm": 0.589320957660675, + "learning_rate": 8.483410770892188e-06, + "loss": 0.2843, + "num_input_tokens_seen": 44582848, + "step": 29190 + }, + { + "epoch": 90.1081916537867, + "grad_norm": 0.3667062819004059, + "learning_rate": 8.476042240114909e-06, + "loss": 0.197, + "num_input_tokens_seen": 44590400, + "step": 29195 + }, + { + "epoch": 90.12364760432767, + "grad_norm": 0.35431385040283203, + "learning_rate": 8.468676257539568e-06, + "loss": 0.216, + "num_input_tokens_seen": 44598464, + "step": 29200 + }, + { + "epoch": 90.12364760432767, + "eval_loss": 0.3019578754901886, + "eval_runtime": 6.3292, + "eval_samples_per_second": 90.848, + "eval_steps_per_second": 22.752, + "num_input_tokens_seen": 44598464, + "step": 29200 + }, + { + "epoch": 90.13910355486863, + "grad_norm": 0.44499078392982483, + "learning_rate": 8.4613128243021e-06, + "loss": 0.3073, + "num_input_tokens_seen": 44605920, + "step": 29205 + }, + { + "epoch": 90.15455950540958, + "grad_norm": 0.75246262550354, + "learning_rate": 8.453951941538028e-06, + "loss": 0.261, + "num_input_tokens_seen": 44613248, + "step": 29210 + }, + { + "epoch": 90.17001545595055, + "grad_norm": 0.6418164968490601, + "learning_rate": 8.446593610382495e-06, + "loss": 0.221, + "num_input_tokens_seen": 44620832, + "step": 29215 + }, + { + "epoch": 90.1854714064915, + "grad_norm": 0.6443812251091003, + "learning_rate": 8.439237831970259e-06, + "loss": 0.3106, + "num_input_tokens_seen": 44628256, + "step": 29220 + }, + { + "epoch": 90.20092735703246, + "grad_norm": 0.28422456979751587, + "learning_rate": 8.431884607435667e-06, + "loss": 0.2709, + "num_input_tokens_seen": 44636192, + "step": 29225 + }, + { + "epoch": 90.21638330757341, + "grad_norm": 0.5205686688423157, + "learning_rate": 8.424533937912665e-06, + "loss": 0.2288, + "num_input_tokens_seen": 44644448, + "step": 29230 + }, + { + "epoch": 90.23183925811438, + "grad_norm": 0.5597392916679382, + "learning_rate": 8.41718582453484e-06, + "loss": 0.2766, + "num_input_tokens_seen": 44652352, + "step": 29235 + }, + { + "epoch": 90.24729520865533, + "grad_norm": 0.2817009389400482, + "learning_rate": 8.409840268435346e-06, + "loss": 0.299, + "num_input_tokens_seen": 44660192, + "step": 29240 + }, + { + "epoch": 90.26275115919628, + "grad_norm": 0.4045141637325287, + "learning_rate": 8.402497270746976e-06, + "loss": 0.219, + "num_input_tokens_seen": 44668192, + "step": 29245 + }, + { + "epoch": 90.27820710973725, + "grad_norm": 0.6289852857589722, + "learning_rate": 8.395156832602095e-06, + "loss": 0.289, + "num_input_tokens_seen": 44675840, + "step": 29250 + }, + { + "epoch": 90.2936630602782, + "grad_norm": 0.8077979683876038, + "learning_rate": 8.387818955132707e-06, + "loss": 0.252, + "num_input_tokens_seen": 44683200, + "step": 29255 + }, + { + "epoch": 90.30911901081916, + "grad_norm": 0.5897994041442871, + "learning_rate": 8.38048363947039e-06, + "loss": 0.2367, + "num_input_tokens_seen": 44691232, + "step": 29260 + }, + { + "epoch": 90.32457496136013, + "grad_norm": 0.6859399676322937, + "learning_rate": 8.373150886746351e-06, + "loss": 0.2913, + "num_input_tokens_seen": 44698880, + "step": 29265 + }, + { + "epoch": 90.34003091190108, + "grad_norm": 0.5497391223907471, + "learning_rate": 8.365820698091397e-06, + "loss": 0.2642, + "num_input_tokens_seen": 44706400, + "step": 29270 + }, + { + "epoch": 90.35548686244204, + "grad_norm": 0.48740360140800476, + "learning_rate": 8.358493074635922e-06, + "loss": 0.2348, + "num_input_tokens_seen": 44714176, + "step": 29275 + }, + { + "epoch": 90.370942812983, + "grad_norm": 0.6586455702781677, + "learning_rate": 8.351168017509948e-06, + "loss": 0.2826, + "num_input_tokens_seen": 44721888, + "step": 29280 + }, + { + "epoch": 90.38639876352396, + "grad_norm": 0.5395870804786682, + "learning_rate": 8.343845527843094e-06, + "loss": 0.2351, + "num_input_tokens_seen": 44729088, + "step": 29285 + }, + { + "epoch": 90.40185471406491, + "grad_norm": 0.5228132009506226, + "learning_rate": 8.336525606764566e-06, + "loss": 0.2025, + "num_input_tokens_seen": 44736704, + "step": 29290 + }, + { + "epoch": 90.41731066460588, + "grad_norm": 0.6005445718765259, + "learning_rate": 8.329208255403204e-06, + "loss": 0.2255, + "num_input_tokens_seen": 44744544, + "step": 29295 + }, + { + "epoch": 90.43276661514683, + "grad_norm": 0.41849684715270996, + "learning_rate": 8.321893474887426e-06, + "loss": 0.2566, + "num_input_tokens_seen": 44751904, + "step": 29300 + }, + { + "epoch": 90.44822256568779, + "grad_norm": 0.3064611852169037, + "learning_rate": 8.31458126634526e-06, + "loss": 0.2467, + "num_input_tokens_seen": 44759328, + "step": 29305 + }, + { + "epoch": 90.46367851622875, + "grad_norm": 0.7335488200187683, + "learning_rate": 8.30727163090435e-06, + "loss": 0.223, + "num_input_tokens_seen": 44767168, + "step": 29310 + }, + { + "epoch": 90.47913446676971, + "grad_norm": 0.7882587909698486, + "learning_rate": 8.29996456969192e-06, + "loss": 0.2159, + "num_input_tokens_seen": 44775424, + "step": 29315 + }, + { + "epoch": 90.49459041731066, + "grad_norm": 0.5198348164558411, + "learning_rate": 8.292660083834818e-06, + "loss": 0.2198, + "num_input_tokens_seen": 44782272, + "step": 29320 + }, + { + "epoch": 90.51004636785163, + "grad_norm": 0.8223177790641785, + "learning_rate": 8.2853581744595e-06, + "loss": 0.2531, + "num_input_tokens_seen": 44789760, + "step": 29325 + }, + { + "epoch": 90.52550231839258, + "grad_norm": 0.4947661757469177, + "learning_rate": 8.278058842691991e-06, + "loss": 0.2766, + "num_input_tokens_seen": 44797248, + "step": 29330 + }, + { + "epoch": 90.54095826893354, + "grad_norm": 0.3827954828739166, + "learning_rate": 8.27076208965796e-06, + "loss": 0.2494, + "num_input_tokens_seen": 44804960, + "step": 29335 + }, + { + "epoch": 90.5564142194745, + "grad_norm": 0.47468116879463196, + "learning_rate": 8.263467916482637e-06, + "loss": 0.2694, + "num_input_tokens_seen": 44812832, + "step": 29340 + }, + { + "epoch": 90.57187017001546, + "grad_norm": 0.6194696426391602, + "learning_rate": 8.256176324290885e-06, + "loss": 0.2458, + "num_input_tokens_seen": 44820224, + "step": 29345 + }, + { + "epoch": 90.58732612055641, + "grad_norm": 0.4163975715637207, + "learning_rate": 8.248887314207168e-06, + "loss": 0.3044, + "num_input_tokens_seen": 44828128, + "step": 29350 + }, + { + "epoch": 90.60278207109737, + "grad_norm": 0.6921218633651733, + "learning_rate": 8.24160088735553e-06, + "loss": 0.2616, + "num_input_tokens_seen": 44836064, + "step": 29355 + }, + { + "epoch": 90.61823802163833, + "grad_norm": 0.6494745016098022, + "learning_rate": 8.234317044859629e-06, + "loss": 0.2328, + "num_input_tokens_seen": 44843424, + "step": 29360 + }, + { + "epoch": 90.63369397217929, + "grad_norm": 0.625029981136322, + "learning_rate": 8.227035787842744e-06, + "loss": 0.2916, + "num_input_tokens_seen": 44851424, + "step": 29365 + }, + { + "epoch": 90.64914992272024, + "grad_norm": 0.3996470868587494, + "learning_rate": 8.219757117427721e-06, + "loss": 0.215, + "num_input_tokens_seen": 44858560, + "step": 29370 + }, + { + "epoch": 90.66460587326121, + "grad_norm": 0.6671704053878784, + "learning_rate": 8.212481034737014e-06, + "loss": 0.2959, + "num_input_tokens_seen": 44865824, + "step": 29375 + }, + { + "epoch": 90.68006182380216, + "grad_norm": 0.5132991075515747, + "learning_rate": 8.205207540892707e-06, + "loss": 0.2407, + "num_input_tokens_seen": 44873472, + "step": 29380 + }, + { + "epoch": 90.69551777434312, + "grad_norm": 0.5107837319374084, + "learning_rate": 8.197936637016442e-06, + "loss": 0.2854, + "num_input_tokens_seen": 44881312, + "step": 29385 + }, + { + "epoch": 90.71097372488408, + "grad_norm": 0.3151988089084625, + "learning_rate": 8.190668324229508e-06, + "loss": 0.2816, + "num_input_tokens_seen": 44889184, + "step": 29390 + }, + { + "epoch": 90.72642967542504, + "grad_norm": 0.6708422899246216, + "learning_rate": 8.183402603652749e-06, + "loss": 0.2335, + "num_input_tokens_seen": 44897184, + "step": 29395 + }, + { + "epoch": 90.74188562596599, + "grad_norm": 0.4002018868923187, + "learning_rate": 8.176139476406635e-06, + "loss": 0.2053, + "num_input_tokens_seen": 44904928, + "step": 29400 + }, + { + "epoch": 90.74188562596599, + "eval_loss": 0.3031764626502991, + "eval_runtime": 6.319, + "eval_samples_per_second": 90.996, + "eval_steps_per_second": 22.789, + "num_input_tokens_seen": 44904928, + "step": 29400 + }, + { + "epoch": 90.75734157650696, + "grad_norm": 0.34798699617385864, + "learning_rate": 8.16887894361125e-06, + "loss": 0.1915, + "num_input_tokens_seen": 44913056, + "step": 29405 + }, + { + "epoch": 90.77279752704791, + "grad_norm": 0.33146560192108154, + "learning_rate": 8.161621006386233e-06, + "loss": 0.2283, + "num_input_tokens_seen": 44920640, + "step": 29410 + }, + { + "epoch": 90.78825347758887, + "grad_norm": 0.6062893271446228, + "learning_rate": 8.154365665850869e-06, + "loss": 0.2878, + "num_input_tokens_seen": 44927968, + "step": 29415 + }, + { + "epoch": 90.80370942812984, + "grad_norm": 0.6809675693511963, + "learning_rate": 8.147112923124005e-06, + "loss": 0.2237, + "num_input_tokens_seen": 44935520, + "step": 29420 + }, + { + "epoch": 90.81916537867079, + "grad_norm": 0.33692365884780884, + "learning_rate": 8.13986277932412e-06, + "loss": 0.2084, + "num_input_tokens_seen": 44943360, + "step": 29425 + }, + { + "epoch": 90.83462132921174, + "grad_norm": 0.3691738247871399, + "learning_rate": 8.132615235569277e-06, + "loss": 0.2122, + "num_input_tokens_seen": 44951360, + "step": 29430 + }, + { + "epoch": 90.85007727975271, + "grad_norm": 0.3539089560508728, + "learning_rate": 8.125370292977124e-06, + "loss": 0.2281, + "num_input_tokens_seen": 44958560, + "step": 29435 + }, + { + "epoch": 90.86553323029366, + "grad_norm": 0.6076382994651794, + "learning_rate": 8.118127952664944e-06, + "loss": 0.2204, + "num_input_tokens_seen": 44966176, + "step": 29440 + }, + { + "epoch": 90.88098918083462, + "grad_norm": 0.4678049683570862, + "learning_rate": 8.110888215749574e-06, + "loss": 0.229, + "num_input_tokens_seen": 44973632, + "step": 29445 + }, + { + "epoch": 90.89644513137559, + "grad_norm": 0.458100825548172, + "learning_rate": 8.10365108334749e-06, + "loss": 0.2053, + "num_input_tokens_seen": 44981888, + "step": 29450 + }, + { + "epoch": 90.91190108191654, + "grad_norm": 0.5087770223617554, + "learning_rate": 8.096416556574743e-06, + "loss": 0.2571, + "num_input_tokens_seen": 44989408, + "step": 29455 + }, + { + "epoch": 90.9273570324575, + "grad_norm": 0.6999450325965881, + "learning_rate": 8.08918463654698e-06, + "loss": 0.2354, + "num_input_tokens_seen": 44996608, + "step": 29460 + }, + { + "epoch": 90.94281298299846, + "grad_norm": 0.4470808804035187, + "learning_rate": 8.081955324379458e-06, + "loss": 0.2294, + "num_input_tokens_seen": 45004448, + "step": 29465 + }, + { + "epoch": 90.95826893353942, + "grad_norm": 0.6344714164733887, + "learning_rate": 8.074728621187039e-06, + "loss": 0.2469, + "num_input_tokens_seen": 45011872, + "step": 29470 + }, + { + "epoch": 90.97372488408037, + "grad_norm": 0.9492040276527405, + "learning_rate": 8.067504528084158e-06, + "loss": 0.2313, + "num_input_tokens_seen": 45019072, + "step": 29475 + }, + { + "epoch": 90.98918083462132, + "grad_norm": 0.5881673097610474, + "learning_rate": 8.060283046184861e-06, + "loss": 0.2795, + "num_input_tokens_seen": 45026560, + "step": 29480 + }, + { + "epoch": 91.0030911901082, + "grad_norm": 0.7292839288711548, + "learning_rate": 8.053064176602806e-06, + "loss": 0.2606, + "num_input_tokens_seen": 45032784, + "step": 29485 + }, + { + "epoch": 91.01854714064915, + "grad_norm": 0.7191352844238281, + "learning_rate": 8.045847920451216e-06, + "loss": 0.2505, + "num_input_tokens_seen": 45040592, + "step": 29490 + }, + { + "epoch": 91.03400309119012, + "grad_norm": 0.39998793601989746, + "learning_rate": 8.038634278842944e-06, + "loss": 0.2259, + "num_input_tokens_seen": 45048048, + "step": 29495 + }, + { + "epoch": 91.04945904173107, + "grad_norm": 0.5855808258056641, + "learning_rate": 8.031423252890408e-06, + "loss": 0.2476, + "num_input_tokens_seen": 45055440, + "step": 29500 + }, + { + "epoch": 91.06491499227202, + "grad_norm": 0.4760558009147644, + "learning_rate": 8.024214843705646e-06, + "loss": 0.2039, + "num_input_tokens_seen": 45062672, + "step": 29505 + }, + { + "epoch": 91.08037094281298, + "grad_norm": 0.4266144335269928, + "learning_rate": 8.017009052400295e-06, + "loss": 0.232, + "num_input_tokens_seen": 45070544, + "step": 29510 + }, + { + "epoch": 91.09582689335394, + "grad_norm": 0.4909985363483429, + "learning_rate": 8.00980588008557e-06, + "loss": 0.2308, + "num_input_tokens_seen": 45078416, + "step": 29515 + }, + { + "epoch": 91.1112828438949, + "grad_norm": 0.5220217108726501, + "learning_rate": 8.002605327872282e-06, + "loss": 0.2163, + "num_input_tokens_seen": 45086032, + "step": 29520 + }, + { + "epoch": 91.12673879443585, + "grad_norm": 0.44657784700393677, + "learning_rate": 7.995407396870862e-06, + "loss": 0.2302, + "num_input_tokens_seen": 45093520, + "step": 29525 + }, + { + "epoch": 91.14219474497682, + "grad_norm": 0.6254831552505493, + "learning_rate": 7.988212088191307e-06, + "loss": 0.256, + "num_input_tokens_seen": 45100880, + "step": 29530 + }, + { + "epoch": 91.15765069551777, + "grad_norm": 0.45295077562332153, + "learning_rate": 7.98101940294324e-06, + "loss": 0.2638, + "num_input_tokens_seen": 45108560, + "step": 29535 + }, + { + "epoch": 91.17310664605873, + "grad_norm": 0.3703729510307312, + "learning_rate": 7.973829342235847e-06, + "loss": 0.2382, + "num_input_tokens_seen": 45115856, + "step": 29540 + }, + { + "epoch": 91.1885625965997, + "grad_norm": 0.6681313514709473, + "learning_rate": 7.966641907177936e-06, + "loss": 0.3161, + "num_input_tokens_seen": 45123664, + "step": 29545 + }, + { + "epoch": 91.20401854714065, + "grad_norm": 0.4254791736602783, + "learning_rate": 7.959457098877901e-06, + "loss": 0.2471, + "num_input_tokens_seen": 45131504, + "step": 29550 + }, + { + "epoch": 91.2194744976816, + "grad_norm": 1.0027682781219482, + "learning_rate": 7.952274918443719e-06, + "loss": 0.2718, + "num_input_tokens_seen": 45139152, + "step": 29555 + }, + { + "epoch": 91.23493044822257, + "grad_norm": 0.6162398457527161, + "learning_rate": 7.945095366982983e-06, + "loss": 0.2143, + "num_input_tokens_seen": 45147088, + "step": 29560 + }, + { + "epoch": 91.25038639876352, + "grad_norm": 0.4019787311553955, + "learning_rate": 7.937918445602871e-06, + "loss": 0.2213, + "num_input_tokens_seen": 45154928, + "step": 29565 + }, + { + "epoch": 91.26584234930448, + "grad_norm": 0.2863685190677643, + "learning_rate": 7.930744155410145e-06, + "loss": 0.2923, + "num_input_tokens_seen": 45162896, + "step": 29570 + }, + { + "epoch": 91.28129829984545, + "grad_norm": 0.38746729493141174, + "learning_rate": 7.923572497511181e-06, + "loss": 0.2626, + "num_input_tokens_seen": 45170800, + "step": 29575 + }, + { + "epoch": 91.2967542503864, + "grad_norm": 0.4291417896747589, + "learning_rate": 7.916403473011927e-06, + "loss": 0.2699, + "num_input_tokens_seen": 45178544, + "step": 29580 + }, + { + "epoch": 91.31221020092735, + "grad_norm": 0.7273932099342346, + "learning_rate": 7.909237083017953e-06, + "loss": 0.234, + "num_input_tokens_seen": 45185904, + "step": 29585 + }, + { + "epoch": 91.32766615146832, + "grad_norm": 0.3581875264644623, + "learning_rate": 7.902073328634389e-06, + "loss": 0.2098, + "num_input_tokens_seen": 45193328, + "step": 29590 + }, + { + "epoch": 91.34312210200927, + "grad_norm": 0.41584962606430054, + "learning_rate": 7.894912210965987e-06, + "loss": 0.309, + "num_input_tokens_seen": 45200944, + "step": 29595 + }, + { + "epoch": 91.35857805255023, + "grad_norm": 0.41508418321609497, + "learning_rate": 7.887753731117075e-06, + "loss": 0.1961, + "num_input_tokens_seen": 45208784, + "step": 29600 + }, + { + "epoch": 91.35857805255023, + "eval_loss": 0.3038986027240753, + "eval_runtime": 6.3143, + "eval_samples_per_second": 91.063, + "eval_steps_per_second": 22.805, + "num_input_tokens_seen": 45208784, + "step": 29600 + }, + { + "epoch": 91.3740340030912, + "grad_norm": 0.30479753017425537, + "learning_rate": 7.880597890191587e-06, + "loss": 0.2741, + "num_input_tokens_seen": 45216720, + "step": 29605 + }, + { + "epoch": 91.38948995363215, + "grad_norm": 0.36678746342658997, + "learning_rate": 7.873444689293036e-06, + "loss": 0.2479, + "num_input_tokens_seen": 45224528, + "step": 29610 + }, + { + "epoch": 91.4049459041731, + "grad_norm": 0.43357744812965393, + "learning_rate": 7.866294129524548e-06, + "loss": 0.24, + "num_input_tokens_seen": 45231600, + "step": 29615 + }, + { + "epoch": 91.42040185471407, + "grad_norm": 0.9057990908622742, + "learning_rate": 7.859146211988811e-06, + "loss": 0.2026, + "num_input_tokens_seen": 45238960, + "step": 29620 + }, + { + "epoch": 91.43585780525503, + "grad_norm": 0.4212260842323303, + "learning_rate": 7.852000937788134e-06, + "loss": 0.2445, + "num_input_tokens_seen": 45247024, + "step": 29625 + }, + { + "epoch": 91.45131375579598, + "grad_norm": 0.3298800587654114, + "learning_rate": 7.844858308024416e-06, + "loss": 0.2773, + "num_input_tokens_seen": 45254480, + "step": 29630 + }, + { + "epoch": 91.46676970633693, + "grad_norm": 0.4343000054359436, + "learning_rate": 7.837718323799122e-06, + "loss": 0.277, + "num_input_tokens_seen": 45261936, + "step": 29635 + }, + { + "epoch": 91.4822256568779, + "grad_norm": 0.4769318401813507, + "learning_rate": 7.83058098621334e-06, + "loss": 0.2121, + "num_input_tokens_seen": 45269552, + "step": 29640 + }, + { + "epoch": 91.49768160741885, + "grad_norm": 0.7548100352287292, + "learning_rate": 7.823446296367739e-06, + "loss": 0.2873, + "num_input_tokens_seen": 45277424, + "step": 29645 + }, + { + "epoch": 91.51313755795981, + "grad_norm": 0.5912042260169983, + "learning_rate": 7.81631425536257e-06, + "loss": 0.2135, + "num_input_tokens_seen": 45284816, + "step": 29650 + }, + { + "epoch": 91.52859350850078, + "grad_norm": 0.7857062816619873, + "learning_rate": 7.809184864297689e-06, + "loss": 0.2197, + "num_input_tokens_seen": 45292720, + "step": 29655 + }, + { + "epoch": 91.54404945904173, + "grad_norm": 0.466460645198822, + "learning_rate": 7.802058124272532e-06, + "loss": 0.3176, + "num_input_tokens_seen": 45300496, + "step": 29660 + }, + { + "epoch": 91.55950540958268, + "grad_norm": 0.4090646803379059, + "learning_rate": 7.79493403638614e-06, + "loss": 0.2102, + "num_input_tokens_seen": 45308016, + "step": 29665 + }, + { + "epoch": 91.57496136012365, + "grad_norm": 0.48266080021858215, + "learning_rate": 7.787812601737132e-06, + "loss": 0.2002, + "num_input_tokens_seen": 45315856, + "step": 29670 + }, + { + "epoch": 91.5904173106646, + "grad_norm": 0.4210183024406433, + "learning_rate": 7.780693821423715e-06, + "loss": 0.2108, + "num_input_tokens_seen": 45323120, + "step": 29675 + }, + { + "epoch": 91.60587326120556, + "grad_norm": 0.3564063012599945, + "learning_rate": 7.773577696543705e-06, + "loss": 0.2595, + "num_input_tokens_seen": 45330480, + "step": 29680 + }, + { + "epoch": 91.62132921174653, + "grad_norm": 0.5005021095275879, + "learning_rate": 7.7664642281945e-06, + "loss": 0.3546, + "num_input_tokens_seen": 45337776, + "step": 29685 + }, + { + "epoch": 91.63678516228748, + "grad_norm": 0.45631811022758484, + "learning_rate": 7.759353417473072e-06, + "loss": 0.2052, + "num_input_tokens_seen": 45345200, + "step": 29690 + }, + { + "epoch": 91.65224111282843, + "grad_norm": 0.7265158891677856, + "learning_rate": 7.752245265476016e-06, + "loss": 0.2629, + "num_input_tokens_seen": 45352688, + "step": 29695 + }, + { + "epoch": 91.6676970633694, + "grad_norm": 0.3284403085708618, + "learning_rate": 7.745139773299481e-06, + "loss": 0.2611, + "num_input_tokens_seen": 45360592, + "step": 29700 + }, + { + "epoch": 91.68315301391036, + "grad_norm": 0.7367317080497742, + "learning_rate": 7.738036942039232e-06, + "loss": 0.2642, + "num_input_tokens_seen": 45368496, + "step": 29705 + }, + { + "epoch": 91.69860896445131, + "grad_norm": 0.5506697297096252, + "learning_rate": 7.73093677279062e-06, + "loss": 0.2905, + "num_input_tokens_seen": 45376304, + "step": 29710 + }, + { + "epoch": 91.71406491499228, + "grad_norm": 0.6771460771560669, + "learning_rate": 7.72383926664857e-06, + "loss": 0.2795, + "num_input_tokens_seen": 45384496, + "step": 29715 + }, + { + "epoch": 91.72952086553323, + "grad_norm": 0.5448882579803467, + "learning_rate": 7.716744424707606e-06, + "loss": 0.2794, + "num_input_tokens_seen": 45392656, + "step": 29720 + }, + { + "epoch": 91.74497681607419, + "grad_norm": 0.6145354509353638, + "learning_rate": 7.709652248061858e-06, + "loss": 0.2756, + "num_input_tokens_seen": 45400496, + "step": 29725 + }, + { + "epoch": 91.76043276661515, + "grad_norm": 0.3658173680305481, + "learning_rate": 7.702562737805017e-06, + "loss": 0.2808, + "num_input_tokens_seen": 45408048, + "step": 29730 + }, + { + "epoch": 91.7758887171561, + "grad_norm": 0.5387558937072754, + "learning_rate": 7.695475895030365e-06, + "loss": 0.2477, + "num_input_tokens_seen": 45416080, + "step": 29735 + }, + { + "epoch": 91.79134466769706, + "grad_norm": 0.7130991220474243, + "learning_rate": 7.6883917208308e-06, + "loss": 0.2389, + "num_input_tokens_seen": 45423984, + "step": 29740 + }, + { + "epoch": 91.80680061823801, + "grad_norm": 0.3720863163471222, + "learning_rate": 7.681310216298778e-06, + "loss": 0.212, + "num_input_tokens_seen": 45431312, + "step": 29745 + }, + { + "epoch": 91.82225656877898, + "grad_norm": 0.37059497833251953, + "learning_rate": 7.674231382526367e-06, + "loss": 0.1982, + "num_input_tokens_seen": 45439280, + "step": 29750 + }, + { + "epoch": 91.83771251931994, + "grad_norm": 0.4705576002597809, + "learning_rate": 7.667155220605198e-06, + "loss": 0.1999, + "num_input_tokens_seen": 45446896, + "step": 29755 + }, + { + "epoch": 91.85316846986089, + "grad_norm": 0.5189156532287598, + "learning_rate": 7.660081731626515e-06, + "loss": 0.2041, + "num_input_tokens_seen": 45454928, + "step": 29760 + }, + { + "epoch": 91.86862442040186, + "grad_norm": 0.625336229801178, + "learning_rate": 7.653010916681141e-06, + "loss": 0.2378, + "num_input_tokens_seen": 45462480, + "step": 29765 + }, + { + "epoch": 91.88408037094281, + "grad_norm": 0.7791847586631775, + "learning_rate": 7.645942776859472e-06, + "loss": 0.262, + "num_input_tokens_seen": 45470032, + "step": 29770 + }, + { + "epoch": 91.89953632148377, + "grad_norm": 0.4276624321937561, + "learning_rate": 7.63887731325152e-06, + "loss": 0.2244, + "num_input_tokens_seen": 45477840, + "step": 29775 + }, + { + "epoch": 91.91499227202473, + "grad_norm": 0.40624406933784485, + "learning_rate": 7.63181452694685e-06, + "loss": 0.2079, + "num_input_tokens_seen": 45485776, + "step": 29780 + }, + { + "epoch": 91.93044822256569, + "grad_norm": 0.6487457156181335, + "learning_rate": 7.624754419034644e-06, + "loss": 0.2997, + "num_input_tokens_seen": 45493840, + "step": 29785 + }, + { + "epoch": 91.94590417310664, + "grad_norm": 0.7544581294059753, + "learning_rate": 7.6176969906036645e-06, + "loss": 0.3128, + "num_input_tokens_seen": 45501392, + "step": 29790 + }, + { + "epoch": 91.96136012364761, + "grad_norm": 0.42866775393486023, + "learning_rate": 7.610642242742242e-06, + "loss": 0.2345, + "num_input_tokens_seen": 45508848, + "step": 29795 + }, + { + "epoch": 91.97681607418856, + "grad_norm": 0.46992191672325134, + "learning_rate": 7.603590176538322e-06, + "loss": 0.2677, + "num_input_tokens_seen": 45516336, + "step": 29800 + }, + { + "epoch": 91.97681607418856, + "eval_loss": 0.3022017478942871, + "eval_runtime": 6.3267, + "eval_samples_per_second": 90.884, + "eval_steps_per_second": 22.761, + "num_input_tokens_seen": 45516336, + "step": 29800 + }, + { + "epoch": 91.99227202472952, + "grad_norm": 0.5734893679618835, + "learning_rate": 7.596540793079404e-06, + "loss": 0.2248, + "num_input_tokens_seen": 45524048, + "step": 29805 + }, + { + "epoch": 92.00618238021639, + "grad_norm": 0.8228649497032166, + "learning_rate": 7.5894940934526125e-06, + "loss": 0.225, + "num_input_tokens_seen": 45530544, + "step": 29810 + }, + { + "epoch": 92.02163833075734, + "grad_norm": 0.6430031061172485, + "learning_rate": 7.582450078744621e-06, + "loss": 0.2581, + "num_input_tokens_seen": 45538224, + "step": 29815 + }, + { + "epoch": 92.0370942812983, + "grad_norm": 0.34730294346809387, + "learning_rate": 7.575408750041707e-06, + "loss": 0.2709, + "num_input_tokens_seen": 45545968, + "step": 29820 + }, + { + "epoch": 92.05255023183926, + "grad_norm": 0.3912573456764221, + "learning_rate": 7.568370108429732e-06, + "loss": 0.1989, + "num_input_tokens_seen": 45554224, + "step": 29825 + }, + { + "epoch": 92.06800618238022, + "grad_norm": 0.47538331151008606, + "learning_rate": 7.561334154994154e-06, + "loss": 0.2295, + "num_input_tokens_seen": 45562032, + "step": 29830 + }, + { + "epoch": 92.08346213292117, + "grad_norm": 0.4849514663219452, + "learning_rate": 7.55430089081999e-06, + "loss": 0.2433, + "num_input_tokens_seen": 45569584, + "step": 29835 + }, + { + "epoch": 92.09891808346214, + "grad_norm": 0.6003970503807068, + "learning_rate": 7.547270316991864e-06, + "loss": 0.2752, + "num_input_tokens_seen": 45576912, + "step": 29840 + }, + { + "epoch": 92.11437403400309, + "grad_norm": 0.37847617268562317, + "learning_rate": 7.5402424345939884e-06, + "loss": 0.2361, + "num_input_tokens_seen": 45584688, + "step": 29845 + }, + { + "epoch": 92.12982998454405, + "grad_norm": 0.5779397487640381, + "learning_rate": 7.533217244710133e-06, + "loss": 0.2552, + "num_input_tokens_seen": 45592048, + "step": 29850 + }, + { + "epoch": 92.14528593508501, + "grad_norm": 0.42919883131980896, + "learning_rate": 7.52619474842369e-06, + "loss": 0.2524, + "num_input_tokens_seen": 45599280, + "step": 29855 + }, + { + "epoch": 92.16074188562597, + "grad_norm": 0.7464612126350403, + "learning_rate": 7.519174946817597e-06, + "loss": 0.3143, + "num_input_tokens_seen": 45606896, + "step": 29860 + }, + { + "epoch": 92.17619783616692, + "grad_norm": 0.5000373721122742, + "learning_rate": 7.512157840974407e-06, + "loss": 0.2464, + "num_input_tokens_seen": 45614352, + "step": 29865 + }, + { + "epoch": 92.19165378670789, + "grad_norm": 0.7761164903640747, + "learning_rate": 7.5051434319762496e-06, + "loss": 0.2594, + "num_input_tokens_seen": 45622256, + "step": 29870 + }, + { + "epoch": 92.20710973724884, + "grad_norm": 0.6071208119392395, + "learning_rate": 7.498131720904822e-06, + "loss": 0.2819, + "num_input_tokens_seen": 45629712, + "step": 29875 + }, + { + "epoch": 92.2225656877898, + "grad_norm": 0.4985024034976959, + "learning_rate": 7.491122708841433e-06, + "loss": 0.1875, + "num_input_tokens_seen": 45637872, + "step": 29880 + }, + { + "epoch": 92.23802163833076, + "grad_norm": 0.6916691064834595, + "learning_rate": 7.4841163968669524e-06, + "loss": 0.2595, + "num_input_tokens_seen": 45645424, + "step": 29885 + }, + { + "epoch": 92.25347758887172, + "grad_norm": 0.40038490295410156, + "learning_rate": 7.4771127860618355e-06, + "loss": 0.201, + "num_input_tokens_seen": 45653360, + "step": 29890 + }, + { + "epoch": 92.26893353941267, + "grad_norm": 0.3341025710105896, + "learning_rate": 7.470111877506139e-06, + "loss": 0.2527, + "num_input_tokens_seen": 45660976, + "step": 29895 + }, + { + "epoch": 92.28438948995363, + "grad_norm": 0.31430646777153015, + "learning_rate": 7.463113672279479e-06, + "loss": 0.2867, + "num_input_tokens_seen": 45669168, + "step": 29900 + }, + { + "epoch": 92.2998454404946, + "grad_norm": 0.5429894328117371, + "learning_rate": 7.456118171461071e-06, + "loss": 0.2335, + "num_input_tokens_seen": 45676880, + "step": 29905 + }, + { + "epoch": 92.31530139103555, + "grad_norm": 0.35476112365722656, + "learning_rate": 7.449125376129721e-06, + "loss": 0.2106, + "num_input_tokens_seen": 45684464, + "step": 29910 + }, + { + "epoch": 92.3307573415765, + "grad_norm": 1.2723485231399536, + "learning_rate": 7.442135287363788e-06, + "loss": 0.2449, + "num_input_tokens_seen": 45691376, + "step": 29915 + }, + { + "epoch": 92.34621329211747, + "grad_norm": 0.5784487128257751, + "learning_rate": 7.435147906241247e-06, + "loss": 0.2712, + "num_input_tokens_seen": 45698896, + "step": 29920 + }, + { + "epoch": 92.36166924265842, + "grad_norm": 0.6711474657058716, + "learning_rate": 7.428163233839624e-06, + "loss": 0.2181, + "num_input_tokens_seen": 45706512, + "step": 29925 + }, + { + "epoch": 92.37712519319938, + "grad_norm": 0.3655756711959839, + "learning_rate": 7.4211812712360525e-06, + "loss": 0.1985, + "num_input_tokens_seen": 45714064, + "step": 29930 + }, + { + "epoch": 92.39258114374034, + "grad_norm": 0.6191118359565735, + "learning_rate": 7.4142020195072464e-06, + "loss": 0.2536, + "num_input_tokens_seen": 45721520, + "step": 29935 + }, + { + "epoch": 92.4080370942813, + "grad_norm": 0.5598530769348145, + "learning_rate": 7.407225479729479e-06, + "loss": 0.2978, + "num_input_tokens_seen": 45728912, + "step": 29940 + }, + { + "epoch": 92.42349304482225, + "grad_norm": 0.535807192325592, + "learning_rate": 7.400251652978632e-06, + "loss": 0.2277, + "num_input_tokens_seen": 45736304, + "step": 29945 + }, + { + "epoch": 92.43894899536322, + "grad_norm": 0.5204641819000244, + "learning_rate": 7.393280540330147e-06, + "loss": 0.2281, + "num_input_tokens_seen": 45744144, + "step": 29950 + }, + { + "epoch": 92.45440494590417, + "grad_norm": 0.3921407163143158, + "learning_rate": 7.386312142859069e-06, + "loss": 0.229, + "num_input_tokens_seen": 45752016, + "step": 29955 + }, + { + "epoch": 92.46986089644513, + "grad_norm": 0.6898938417434692, + "learning_rate": 7.379346461640008e-06, + "loss": 0.2185, + "num_input_tokens_seen": 45759600, + "step": 29960 + }, + { + "epoch": 92.4853168469861, + "grad_norm": 0.5734711289405823, + "learning_rate": 7.372383497747149e-06, + "loss": 0.3414, + "num_input_tokens_seen": 45766864, + "step": 29965 + }, + { + "epoch": 92.50077279752705, + "grad_norm": 0.5345205068588257, + "learning_rate": 7.3654232522542775e-06, + "loss": 0.2158, + "num_input_tokens_seen": 45774800, + "step": 29970 + }, + { + "epoch": 92.516228748068, + "grad_norm": 0.5611756443977356, + "learning_rate": 7.358465726234756e-06, + "loss": 0.2395, + "num_input_tokens_seen": 45782160, + "step": 29975 + }, + { + "epoch": 92.53168469860897, + "grad_norm": 0.5097535848617554, + "learning_rate": 7.351510920761512e-06, + "loss": 0.293, + "num_input_tokens_seen": 45789776, + "step": 29980 + }, + { + "epoch": 92.54714064914992, + "grad_norm": 0.43939101696014404, + "learning_rate": 7.344558836907067e-06, + "loss": 0.2186, + "num_input_tokens_seen": 45797136, + "step": 29985 + }, + { + "epoch": 92.56259659969088, + "grad_norm": 0.41446834802627563, + "learning_rate": 7.3376094757435285e-06, + "loss": 0.2428, + "num_input_tokens_seen": 45804688, + "step": 29990 + }, + { + "epoch": 92.57805255023185, + "grad_norm": 0.4330303966999054, + "learning_rate": 7.330662838342561e-06, + "loss": 0.2146, + "num_input_tokens_seen": 45813072, + "step": 29995 + }, + { + "epoch": 92.5935085007728, + "grad_norm": 0.47160303592681885, + "learning_rate": 7.323718925775438e-06, + "loss": 0.2594, + "num_input_tokens_seen": 45820432, + "step": 30000 + }, + { + "epoch": 92.5935085007728, + "eval_loss": 0.30243119597435, + "eval_runtime": 6.3111, + "eval_samples_per_second": 91.109, + "eval_steps_per_second": 22.817, + "num_input_tokens_seen": 45820432, + "step": 30000 + }, + { + "epoch": 92.60896445131375, + "grad_norm": 0.6208270192146301, + "learning_rate": 7.316777739112985e-06, + "loss": 0.2421, + "num_input_tokens_seen": 45827952, + "step": 30005 + }, + { + "epoch": 92.62442040185472, + "grad_norm": 0.5695608854293823, + "learning_rate": 7.309839279425626e-06, + "loss": 0.2218, + "num_input_tokens_seen": 45835376, + "step": 30010 + }, + { + "epoch": 92.63987635239567, + "grad_norm": 0.8918057680130005, + "learning_rate": 7.302903547783366e-06, + "loss": 0.3239, + "num_input_tokens_seen": 45842608, + "step": 30015 + }, + { + "epoch": 92.65533230293663, + "grad_norm": 0.5296716094017029, + "learning_rate": 7.2959705452557644e-06, + "loss": 0.2582, + "num_input_tokens_seen": 45850448, + "step": 30020 + }, + { + "epoch": 92.67078825347758, + "grad_norm": 0.664189338684082, + "learning_rate": 7.289040272911996e-06, + "loss": 0.2738, + "num_input_tokens_seen": 45858384, + "step": 30025 + }, + { + "epoch": 92.68624420401855, + "grad_norm": 0.6844856142997742, + "learning_rate": 7.282112731820789e-06, + "loss": 0.2658, + "num_input_tokens_seen": 45866384, + "step": 30030 + }, + { + "epoch": 92.7017001545595, + "grad_norm": 0.668536365032196, + "learning_rate": 7.275187923050447e-06, + "loss": 0.2759, + "num_input_tokens_seen": 45874448, + "step": 30035 + }, + { + "epoch": 92.71715610510046, + "grad_norm": 0.35388603806495667, + "learning_rate": 7.268265847668879e-06, + "loss": 0.2262, + "num_input_tokens_seen": 45882096, + "step": 30040 + }, + { + "epoch": 92.73261205564143, + "grad_norm": 0.48105624318122864, + "learning_rate": 7.261346506743538e-06, + "loss": 0.2089, + "num_input_tokens_seen": 45889072, + "step": 30045 + }, + { + "epoch": 92.74806800618238, + "grad_norm": 0.626754641532898, + "learning_rate": 7.254429901341486e-06, + "loss": 0.2297, + "num_input_tokens_seen": 45896016, + "step": 30050 + }, + { + "epoch": 92.76352395672333, + "grad_norm": 0.44512543082237244, + "learning_rate": 7.247516032529356e-06, + "loss": 0.2152, + "num_input_tokens_seen": 45903952, + "step": 30055 + }, + { + "epoch": 92.7789799072643, + "grad_norm": 0.41299253702163696, + "learning_rate": 7.240604901373338e-06, + "loss": 0.2685, + "num_input_tokens_seen": 45911568, + "step": 30060 + }, + { + "epoch": 92.79443585780525, + "grad_norm": 0.38087671995162964, + "learning_rate": 7.233696508939223e-06, + "loss": 0.1774, + "num_input_tokens_seen": 45918992, + "step": 30065 + }, + { + "epoch": 92.80989180834621, + "grad_norm": 0.8007519841194153, + "learning_rate": 7.226790856292376e-06, + "loss": 0.3195, + "num_input_tokens_seen": 45927152, + "step": 30070 + }, + { + "epoch": 92.82534775888718, + "grad_norm": 0.6680240035057068, + "learning_rate": 7.219887944497727e-06, + "loss": 0.2342, + "num_input_tokens_seen": 45935120, + "step": 30075 + }, + { + "epoch": 92.84080370942813, + "grad_norm": 0.8804259300231934, + "learning_rate": 7.2129877746198e-06, + "loss": 0.264, + "num_input_tokens_seen": 45943088, + "step": 30080 + }, + { + "epoch": 92.85625965996908, + "grad_norm": 0.6298076510429382, + "learning_rate": 7.20609034772268e-06, + "loss": 0.2539, + "num_input_tokens_seen": 45950352, + "step": 30085 + }, + { + "epoch": 92.87171561051005, + "grad_norm": 0.5143224000930786, + "learning_rate": 7.19919566487004e-06, + "loss": 0.2771, + "num_input_tokens_seen": 45958544, + "step": 30090 + }, + { + "epoch": 92.887171561051, + "grad_norm": 0.43094953894615173, + "learning_rate": 7.192303727125132e-06, + "loss": 0.2968, + "num_input_tokens_seen": 45966064, + "step": 30095 + }, + { + "epoch": 92.90262751159196, + "grad_norm": 0.8074979186058044, + "learning_rate": 7.185414535550777e-06, + "loss": 0.2386, + "num_input_tokens_seen": 45973552, + "step": 30100 + }, + { + "epoch": 92.91808346213293, + "grad_norm": 0.6899589896202087, + "learning_rate": 7.178528091209363e-06, + "loss": 0.257, + "num_input_tokens_seen": 45981200, + "step": 30105 + }, + { + "epoch": 92.93353941267388, + "grad_norm": 0.5885576009750366, + "learning_rate": 7.171644395162888e-06, + "loss": 0.2513, + "num_input_tokens_seen": 45988912, + "step": 30110 + }, + { + "epoch": 92.94899536321483, + "grad_norm": 0.6384811997413635, + "learning_rate": 7.164763448472881e-06, + "loss": 0.2163, + "num_input_tokens_seen": 45996432, + "step": 30115 + }, + { + "epoch": 92.9644513137558, + "grad_norm": 0.43699535727500916, + "learning_rate": 7.157885252200491e-06, + "loss": 0.2036, + "num_input_tokens_seen": 46004560, + "step": 30120 + }, + { + "epoch": 92.97990726429676, + "grad_norm": 0.3960891366004944, + "learning_rate": 7.151009807406403e-06, + "loss": 0.2685, + "num_input_tokens_seen": 46012720, + "step": 30125 + }, + { + "epoch": 92.99536321483771, + "grad_norm": 0.5959576368331909, + "learning_rate": 7.144137115150909e-06, + "loss": 0.2007, + "num_input_tokens_seen": 46020720, + "step": 30130 + }, + { + "epoch": 93.00927357032458, + "grad_norm": 0.44417405128479004, + "learning_rate": 7.1372671764938725e-06, + "loss": 0.2496, + "num_input_tokens_seen": 46027248, + "step": 30135 + }, + { + "epoch": 93.02472952086553, + "grad_norm": 0.6866132616996765, + "learning_rate": 7.130399992494705e-06, + "loss": 0.3201, + "num_input_tokens_seen": 46035600, + "step": 30140 + }, + { + "epoch": 93.04018547140649, + "grad_norm": 0.45641204714775085, + "learning_rate": 7.123535564212419e-06, + "loss": 0.2492, + "num_input_tokens_seen": 46043408, + "step": 30145 + }, + { + "epoch": 93.05564142194746, + "grad_norm": 0.489249050617218, + "learning_rate": 7.116673892705611e-06, + "loss": 0.2034, + "num_input_tokens_seen": 46050864, + "step": 30150 + }, + { + "epoch": 93.07109737248841, + "grad_norm": 0.2945634126663208, + "learning_rate": 7.109814979032415e-06, + "loss": 0.1877, + "num_input_tokens_seen": 46058032, + "step": 30155 + }, + { + "epoch": 93.08655332302936, + "grad_norm": 0.4866640567779541, + "learning_rate": 7.102958824250577e-06, + "loss": 0.2387, + "num_input_tokens_seen": 46065584, + "step": 30160 + }, + { + "epoch": 93.10200927357033, + "grad_norm": 0.5520482063293457, + "learning_rate": 7.096105429417393e-06, + "loss": 0.2594, + "num_input_tokens_seen": 46073200, + "step": 30165 + }, + { + "epoch": 93.11746522411129, + "grad_norm": 0.39064323902130127, + "learning_rate": 7.0892547955897506e-06, + "loss": 0.2626, + "num_input_tokens_seen": 46081136, + "step": 30170 + }, + { + "epoch": 93.13292117465224, + "grad_norm": 0.8227396607398987, + "learning_rate": 7.0824069238241e-06, + "loss": 0.2723, + "num_input_tokens_seen": 46088880, + "step": 30175 + }, + { + "epoch": 93.14837712519319, + "grad_norm": 0.40842491388320923, + "learning_rate": 7.075561815176462e-06, + "loss": 0.2776, + "num_input_tokens_seen": 46096208, + "step": 30180 + }, + { + "epoch": 93.16383307573416, + "grad_norm": 0.45923903584480286, + "learning_rate": 7.068719470702445e-06, + "loss": 0.2172, + "num_input_tokens_seen": 46104368, + "step": 30185 + }, + { + "epoch": 93.17928902627511, + "grad_norm": 0.7473905086517334, + "learning_rate": 7.061879891457229e-06, + "loss": 0.2308, + "num_input_tokens_seen": 46112336, + "step": 30190 + }, + { + "epoch": 93.19474497681607, + "grad_norm": 0.30583930015563965, + "learning_rate": 7.0550430784955515e-06, + "loss": 0.2232, + "num_input_tokens_seen": 46119888, + "step": 30195 + }, + { + "epoch": 93.21020092735704, + "grad_norm": 0.7301666140556335, + "learning_rate": 7.048209032871752e-06, + "loss": 0.2166, + "num_input_tokens_seen": 46127408, + "step": 30200 + }, + { + "epoch": 93.21020092735704, + "eval_loss": 0.3032008707523346, + "eval_runtime": 6.2772, + "eval_samples_per_second": 91.601, + "eval_steps_per_second": 22.94, + "num_input_tokens_seen": 46127408, + "step": 30200 + }, + { + "epoch": 93.22565687789799, + "grad_norm": 0.41598883271217346, + "learning_rate": 7.0413777556397055e-06, + "loss": 0.243, + "num_input_tokens_seen": 46134864, + "step": 30205 + }, + { + "epoch": 93.24111282843894, + "grad_norm": 0.5675224661827087, + "learning_rate": 7.0345492478528925e-06, + "loss": 0.2601, + "num_input_tokens_seen": 46142416, + "step": 30210 + }, + { + "epoch": 93.25656877897991, + "grad_norm": 0.49072906374931335, + "learning_rate": 7.02772351056436e-06, + "loss": 0.2224, + "num_input_tokens_seen": 46149840, + "step": 30215 + }, + { + "epoch": 93.27202472952087, + "grad_norm": 0.5948920249938965, + "learning_rate": 7.020900544826709e-06, + "loss": 0.2513, + "num_input_tokens_seen": 46157840, + "step": 30220 + }, + { + "epoch": 93.28748068006182, + "grad_norm": 0.7906067371368408, + "learning_rate": 7.014080351692134e-06, + "loss": 0.2231, + "num_input_tokens_seen": 46165392, + "step": 30225 + }, + { + "epoch": 93.30293663060279, + "grad_norm": 0.569977879524231, + "learning_rate": 7.0072629322124024e-06, + "loss": 0.2604, + "num_input_tokens_seen": 46173040, + "step": 30230 + }, + { + "epoch": 93.31839258114374, + "grad_norm": 0.4157654941082001, + "learning_rate": 7.000448287438827e-06, + "loss": 0.2647, + "num_input_tokens_seen": 46181776, + "step": 30235 + }, + { + "epoch": 93.3338485316847, + "grad_norm": 0.6278927326202393, + "learning_rate": 6.993636418422331e-06, + "loss": 0.2171, + "num_input_tokens_seen": 46189424, + "step": 30240 + }, + { + "epoch": 93.34930448222566, + "grad_norm": 0.5300338268280029, + "learning_rate": 6.986827326213383e-06, + "loss": 0.2131, + "num_input_tokens_seen": 46197008, + "step": 30245 + }, + { + "epoch": 93.36476043276662, + "grad_norm": 0.5613789558410645, + "learning_rate": 6.9800210118620205e-06, + "loss": 0.2071, + "num_input_tokens_seen": 46204720, + "step": 30250 + }, + { + "epoch": 93.38021638330757, + "grad_norm": 0.4904351830482483, + "learning_rate": 6.973217476417876e-06, + "loss": 0.2533, + "num_input_tokens_seen": 46212464, + "step": 30255 + }, + { + "epoch": 93.39567233384854, + "grad_norm": 0.47716808319091797, + "learning_rate": 6.96641672093013e-06, + "loss": 0.2684, + "num_input_tokens_seen": 46220176, + "step": 30260 + }, + { + "epoch": 93.41112828438949, + "grad_norm": 0.6834133267402649, + "learning_rate": 6.95961874644755e-06, + "loss": 0.2651, + "num_input_tokens_seen": 46228176, + "step": 30265 + }, + { + "epoch": 93.42658423493044, + "grad_norm": 0.5715587735176086, + "learning_rate": 6.952823554018476e-06, + "loss": 0.2298, + "num_input_tokens_seen": 46235856, + "step": 30270 + }, + { + "epoch": 93.44204018547141, + "grad_norm": 0.42537617683410645, + "learning_rate": 6.946031144690798e-06, + "loss": 0.2772, + "num_input_tokens_seen": 46243056, + "step": 30275 + }, + { + "epoch": 93.45749613601237, + "grad_norm": 0.7887798547744751, + "learning_rate": 6.939241519512005e-06, + "loss": 0.2113, + "num_input_tokens_seen": 46250864, + "step": 30280 + }, + { + "epoch": 93.47295208655332, + "grad_norm": 0.5123666524887085, + "learning_rate": 6.932454679529129e-06, + "loss": 0.2026, + "num_input_tokens_seen": 46258960, + "step": 30285 + }, + { + "epoch": 93.48840803709429, + "grad_norm": 0.5058121681213379, + "learning_rate": 6.925670625788791e-06, + "loss": 0.2623, + "num_input_tokens_seen": 46266512, + "step": 30290 + }, + { + "epoch": 93.50386398763524, + "grad_norm": 0.5344847440719604, + "learning_rate": 6.918889359337186e-06, + "loss": 0.2519, + "num_input_tokens_seen": 46274000, + "step": 30295 + }, + { + "epoch": 93.5193199381762, + "grad_norm": 0.5819957256317139, + "learning_rate": 6.912110881220058e-06, + "loss": 0.2795, + "num_input_tokens_seen": 46282000, + "step": 30300 + }, + { + "epoch": 93.53477588871715, + "grad_norm": 0.518407940864563, + "learning_rate": 6.905335192482735e-06, + "loss": 0.233, + "num_input_tokens_seen": 46289648, + "step": 30305 + }, + { + "epoch": 93.55023183925812, + "grad_norm": 0.4420775771141052, + "learning_rate": 6.8985622941701275e-06, + "loss": 0.2799, + "num_input_tokens_seen": 46297360, + "step": 30310 + }, + { + "epoch": 93.56568778979907, + "grad_norm": 0.560071587562561, + "learning_rate": 6.89179218732669e-06, + "loss": 0.2518, + "num_input_tokens_seen": 46304912, + "step": 30315 + }, + { + "epoch": 93.58114374034002, + "grad_norm": 0.7509660720825195, + "learning_rate": 6.8850248729964595e-06, + "loss": 0.2335, + "num_input_tokens_seen": 46312144, + "step": 30320 + }, + { + "epoch": 93.59659969088099, + "grad_norm": 0.7752385139465332, + "learning_rate": 6.8782603522230314e-06, + "loss": 0.2562, + "num_input_tokens_seen": 46319536, + "step": 30325 + }, + { + "epoch": 93.61205564142195, + "grad_norm": 0.6797152161598206, + "learning_rate": 6.871498626049591e-06, + "loss": 0.24, + "num_input_tokens_seen": 46326608, + "step": 30330 + }, + { + "epoch": 93.6275115919629, + "grad_norm": 0.8462241888046265, + "learning_rate": 6.8647396955188875e-06, + "loss": 0.2716, + "num_input_tokens_seen": 46334032, + "step": 30335 + }, + { + "epoch": 93.64296754250387, + "grad_norm": 0.5019630193710327, + "learning_rate": 6.857983561673218e-06, + "loss": 0.1982, + "num_input_tokens_seen": 46341552, + "step": 30340 + }, + { + "epoch": 93.65842349304482, + "grad_norm": 0.8564046621322632, + "learning_rate": 6.851230225554467e-06, + "loss": 0.276, + "num_input_tokens_seen": 46349296, + "step": 30345 + }, + { + "epoch": 93.67387944358578, + "grad_norm": 0.6758881211280823, + "learning_rate": 6.8444796882040946e-06, + "loss": 0.2329, + "num_input_tokens_seen": 46356944, + "step": 30350 + }, + { + "epoch": 93.68933539412674, + "grad_norm": 0.8823022842407227, + "learning_rate": 6.837731950663106e-06, + "loss": 0.2529, + "num_input_tokens_seen": 46365040, + "step": 30355 + }, + { + "epoch": 93.7047913446677, + "grad_norm": 1.0183768272399902, + "learning_rate": 6.830987013972098e-06, + "loss": 0.2933, + "num_input_tokens_seen": 46372592, + "step": 30360 + }, + { + "epoch": 93.72024729520865, + "grad_norm": 0.6167237162590027, + "learning_rate": 6.82424487917121e-06, + "loss": 0.2317, + "num_input_tokens_seen": 46380368, + "step": 30365 + }, + { + "epoch": 93.73570324574962, + "grad_norm": 0.7656148672103882, + "learning_rate": 6.8175055473001735e-06, + "loss": 0.2595, + "num_input_tokens_seen": 46387632, + "step": 30370 + }, + { + "epoch": 93.75115919629057, + "grad_norm": 0.6623209714889526, + "learning_rate": 6.8107690193982855e-06, + "loss": 0.2652, + "num_input_tokens_seen": 46395056, + "step": 30375 + }, + { + "epoch": 93.76661514683153, + "grad_norm": 0.4643712043762207, + "learning_rate": 6.804035296504385e-06, + "loss": 0.2408, + "num_input_tokens_seen": 46402000, + "step": 30380 + }, + { + "epoch": 93.7820710973725, + "grad_norm": 0.4058256447315216, + "learning_rate": 6.797304379656916e-06, + "loss": 0.2385, + "num_input_tokens_seen": 46409616, + "step": 30385 + }, + { + "epoch": 93.79752704791345, + "grad_norm": 0.4865945875644684, + "learning_rate": 6.790576269893861e-06, + "loss": 0.2462, + "num_input_tokens_seen": 46417456, + "step": 30390 + }, + { + "epoch": 93.8129829984544, + "grad_norm": 0.6571665406227112, + "learning_rate": 6.783850968252772e-06, + "loss": 0.2262, + "num_input_tokens_seen": 46424720, + "step": 30395 + }, + { + "epoch": 93.82843894899537, + "grad_norm": 1.1074615716934204, + "learning_rate": 6.777128475770789e-06, + "loss": 0.2801, + "num_input_tokens_seen": 46431888, + "step": 30400 + }, + { + "epoch": 93.82843894899537, + "eval_loss": 0.3027218282222748, + "eval_runtime": 6.3127, + "eval_samples_per_second": 91.086, + "eval_steps_per_second": 22.811, + "num_input_tokens_seen": 46431888, + "step": 30400 + }, + { + "epoch": 93.84389489953632, + "grad_norm": 0.6363179087638855, + "learning_rate": 6.77040879348459e-06, + "loss": 0.2727, + "num_input_tokens_seen": 46439536, + "step": 30405 + }, + { + "epoch": 93.85935085007728, + "grad_norm": 0.4512923061847687, + "learning_rate": 6.763691922430443e-06, + "loss": 0.2283, + "num_input_tokens_seen": 46447344, + "step": 30410 + }, + { + "epoch": 93.87480680061825, + "grad_norm": 0.5440972447395325, + "learning_rate": 6.756977863644178e-06, + "loss": 0.2237, + "num_input_tokens_seen": 46455280, + "step": 30415 + }, + { + "epoch": 93.8902627511592, + "grad_norm": 0.31151509284973145, + "learning_rate": 6.7502666181611804e-06, + "loss": 0.2344, + "num_input_tokens_seen": 46462928, + "step": 30420 + }, + { + "epoch": 93.90571870170015, + "grad_norm": 0.7336382269859314, + "learning_rate": 6.743558187016405e-06, + "loss": 0.3176, + "num_input_tokens_seen": 46470832, + "step": 30425 + }, + { + "epoch": 93.9211746522411, + "grad_norm": 0.7597355246543884, + "learning_rate": 6.7368525712443925e-06, + "loss": 0.2952, + "num_input_tokens_seen": 46478320, + "step": 30430 + }, + { + "epoch": 93.93663060278207, + "grad_norm": 0.43927788734436035, + "learning_rate": 6.7301497718792155e-06, + "loss": 0.2475, + "num_input_tokens_seen": 46486160, + "step": 30435 + }, + { + "epoch": 93.95208655332303, + "grad_norm": 0.5776520371437073, + "learning_rate": 6.723449789954544e-06, + "loss": 0.2287, + "num_input_tokens_seen": 46493584, + "step": 30440 + }, + { + "epoch": 93.96754250386398, + "grad_norm": 0.42037901282310486, + "learning_rate": 6.716752626503586e-06, + "loss": 0.1912, + "num_input_tokens_seen": 46501136, + "step": 30445 + }, + { + "epoch": 93.98299845440495, + "grad_norm": 0.44252893328666687, + "learning_rate": 6.710058282559131e-06, + "loss": 0.2463, + "num_input_tokens_seen": 46509040, + "step": 30450 + }, + { + "epoch": 93.9984544049459, + "grad_norm": 0.41322365403175354, + "learning_rate": 6.703366759153545e-06, + "loss": 0.2408, + "num_input_tokens_seen": 46516752, + "step": 30455 + }, + { + "epoch": 94.01236476043276, + "grad_norm": 0.7353025078773499, + "learning_rate": 6.6966780573187335e-06, + "loss": 0.2913, + "num_input_tokens_seen": 46523696, + "step": 30460 + }, + { + "epoch": 94.02782071097373, + "grad_norm": 0.3904307782649994, + "learning_rate": 6.689992178086174e-06, + "loss": 0.3081, + "num_input_tokens_seen": 46531760, + "step": 30465 + }, + { + "epoch": 94.04327666151468, + "grad_norm": 0.8659180998802185, + "learning_rate": 6.683309122486925e-06, + "loss": 0.2598, + "num_input_tokens_seen": 46539920, + "step": 30470 + }, + { + "epoch": 94.05873261205564, + "grad_norm": 0.3470292091369629, + "learning_rate": 6.676628891551584e-06, + "loss": 0.2138, + "num_input_tokens_seen": 46547504, + "step": 30475 + }, + { + "epoch": 94.0741885625966, + "grad_norm": 0.7373965382575989, + "learning_rate": 6.6699514863103385e-06, + "loss": 0.2499, + "num_input_tokens_seen": 46555184, + "step": 30480 + }, + { + "epoch": 94.08964451313756, + "grad_norm": 0.4642457067966461, + "learning_rate": 6.663276907792921e-06, + "loss": 0.243, + "num_input_tokens_seen": 46563024, + "step": 30485 + }, + { + "epoch": 94.10510046367851, + "grad_norm": 0.8302521109580994, + "learning_rate": 6.656605157028634e-06, + "loss": 0.22, + "num_input_tokens_seen": 46570160, + "step": 30490 + }, + { + "epoch": 94.12055641421948, + "grad_norm": 0.3594073951244354, + "learning_rate": 6.649936235046358e-06, + "loss": 0.2246, + "num_input_tokens_seen": 46578256, + "step": 30495 + }, + { + "epoch": 94.13601236476043, + "grad_norm": 0.7892304062843323, + "learning_rate": 6.643270142874508e-06, + "loss": 0.211, + "num_input_tokens_seen": 46585776, + "step": 30500 + }, + { + "epoch": 94.15146831530139, + "grad_norm": 0.5556277632713318, + "learning_rate": 6.636606881541094e-06, + "loss": 0.248, + "num_input_tokens_seen": 46592976, + "step": 30505 + }, + { + "epoch": 94.16692426584235, + "grad_norm": 0.6862470507621765, + "learning_rate": 6.629946452073662e-06, + "loss": 0.2503, + "num_input_tokens_seen": 46600784, + "step": 30510 + }, + { + "epoch": 94.18238021638331, + "grad_norm": 0.4691910445690155, + "learning_rate": 6.6232888554993375e-06, + "loss": 0.2178, + "num_input_tokens_seen": 46608848, + "step": 30515 + }, + { + "epoch": 94.19783616692426, + "grad_norm": 0.5834534764289856, + "learning_rate": 6.616634092844817e-06, + "loss": 0.2154, + "num_input_tokens_seen": 46616048, + "step": 30520 + }, + { + "epoch": 94.21329211746523, + "grad_norm": 0.748288094997406, + "learning_rate": 6.609982165136331e-06, + "loss": 0.2396, + "num_input_tokens_seen": 46623504, + "step": 30525 + }, + { + "epoch": 94.22874806800618, + "grad_norm": 0.3223646879196167, + "learning_rate": 6.603333073399706e-06, + "loss": 0.2339, + "num_input_tokens_seen": 46630864, + "step": 30530 + }, + { + "epoch": 94.24420401854714, + "grad_norm": 0.4381754696369171, + "learning_rate": 6.596686818660308e-06, + "loss": 0.2259, + "num_input_tokens_seen": 46638096, + "step": 30535 + }, + { + "epoch": 94.2596599690881, + "grad_norm": 0.359617680311203, + "learning_rate": 6.590043401943066e-06, + "loss": 0.2587, + "num_input_tokens_seen": 46645680, + "step": 30540 + }, + { + "epoch": 94.27511591962906, + "grad_norm": 0.39509329199790955, + "learning_rate": 6.583402824272494e-06, + "loss": 0.2218, + "num_input_tokens_seen": 46652720, + "step": 30545 + }, + { + "epoch": 94.29057187017001, + "grad_norm": 0.4114348292350769, + "learning_rate": 6.576765086672634e-06, + "loss": 0.2074, + "num_input_tokens_seen": 46660592, + "step": 30550 + }, + { + "epoch": 94.30602782071098, + "grad_norm": 0.42940863966941833, + "learning_rate": 6.57013019016712e-06, + "loss": 0.2323, + "num_input_tokens_seen": 46668272, + "step": 30555 + }, + { + "epoch": 94.32148377125193, + "grad_norm": 0.43417277932167053, + "learning_rate": 6.563498135779142e-06, + "loss": 0.2596, + "num_input_tokens_seen": 46675792, + "step": 30560 + }, + { + "epoch": 94.33693972179289, + "grad_norm": 0.4126754105091095, + "learning_rate": 6.556868924531431e-06, + "loss": 0.2233, + "num_input_tokens_seen": 46683504, + "step": 30565 + }, + { + "epoch": 94.35239567233384, + "grad_norm": 0.4036075472831726, + "learning_rate": 6.550242557446304e-06, + "loss": 0.2514, + "num_input_tokens_seen": 46691184, + "step": 30570 + }, + { + "epoch": 94.36785162287481, + "grad_norm": 0.6169818043708801, + "learning_rate": 6.543619035545634e-06, + "loss": 0.2511, + "num_input_tokens_seen": 46698832, + "step": 30575 + }, + { + "epoch": 94.38330757341576, + "grad_norm": 0.5600112080574036, + "learning_rate": 6.53699835985084e-06, + "loss": 0.2627, + "num_input_tokens_seen": 46706352, + "step": 30580 + }, + { + "epoch": 94.39876352395672, + "grad_norm": 0.44947656989097595, + "learning_rate": 6.530380531382927e-06, + "loss": 0.2296, + "num_input_tokens_seen": 46713840, + "step": 30585 + }, + { + "epoch": 94.41421947449768, + "grad_norm": 0.6496559381484985, + "learning_rate": 6.523765551162433e-06, + "loss": 0.2997, + "num_input_tokens_seen": 46721296, + "step": 30590 + }, + { + "epoch": 94.42967542503864, + "grad_norm": 0.41388553380966187, + "learning_rate": 6.517153420209476e-06, + "loss": 0.2572, + "num_input_tokens_seen": 46728944, + "step": 30595 + }, + { + "epoch": 94.44513137557959, + "grad_norm": 0.47756174206733704, + "learning_rate": 6.510544139543739e-06, + "loss": 0.2391, + "num_input_tokens_seen": 46736368, + "step": 30600 + }, + { + "epoch": 94.44513137557959, + "eval_loss": 0.30273857712745667, + "eval_runtime": 6.3071, + "eval_samples_per_second": 91.168, + "eval_steps_per_second": 22.832, + "num_input_tokens_seen": 46736368, + "step": 30600 + }, + { + "epoch": 94.46058732612056, + "grad_norm": 0.39046400785446167, + "learning_rate": 6.503937710184452e-06, + "loss": 0.2436, + "num_input_tokens_seen": 46744656, + "step": 30605 + }, + { + "epoch": 94.47604327666151, + "grad_norm": 0.39376407861709595, + "learning_rate": 6.4973341331503954e-06, + "loss": 0.2209, + "num_input_tokens_seen": 46752368, + "step": 30610 + }, + { + "epoch": 94.49149922720247, + "grad_norm": 0.6503193974494934, + "learning_rate": 6.490733409459942e-06, + "loss": 0.2174, + "num_input_tokens_seen": 46760272, + "step": 30615 + }, + { + "epoch": 94.50695517774344, + "grad_norm": 0.370194673538208, + "learning_rate": 6.484135540130995e-06, + "loss": 0.2369, + "num_input_tokens_seen": 46767888, + "step": 30620 + }, + { + "epoch": 94.52241112828439, + "grad_norm": 0.550024151802063, + "learning_rate": 6.4775405261810364e-06, + "loss": 0.2371, + "num_input_tokens_seen": 46775888, + "step": 30625 + }, + { + "epoch": 94.53786707882534, + "grad_norm": 0.5905160903930664, + "learning_rate": 6.470948368627092e-06, + "loss": 0.2278, + "num_input_tokens_seen": 46783344, + "step": 30630 + }, + { + "epoch": 94.55332302936631, + "grad_norm": 0.47447893023490906, + "learning_rate": 6.464359068485756e-06, + "loss": 0.2518, + "num_input_tokens_seen": 46791056, + "step": 30635 + }, + { + "epoch": 94.56877897990726, + "grad_norm": 0.39778971672058105, + "learning_rate": 6.457772626773195e-06, + "loss": 0.3519, + "num_input_tokens_seen": 46798800, + "step": 30640 + }, + { + "epoch": 94.58423493044822, + "grad_norm": 0.4836573600769043, + "learning_rate": 6.451189044505104e-06, + "loss": 0.2051, + "num_input_tokens_seen": 46806512, + "step": 30645 + }, + { + "epoch": 94.59969088098919, + "grad_norm": 0.48783230781555176, + "learning_rate": 6.44460832269676e-06, + "loss": 0.2024, + "num_input_tokens_seen": 46813968, + "step": 30650 + }, + { + "epoch": 94.61514683153014, + "grad_norm": 0.4773997962474823, + "learning_rate": 6.438030462363001e-06, + "loss": 0.2082, + "num_input_tokens_seen": 46822064, + "step": 30655 + }, + { + "epoch": 94.6306027820711, + "grad_norm": 0.674354076385498, + "learning_rate": 6.431455464518205e-06, + "loss": 0.3311, + "num_input_tokens_seen": 46830000, + "step": 30660 + }, + { + "epoch": 94.64605873261206, + "grad_norm": 0.4704815149307251, + "learning_rate": 6.424883330176326e-06, + "loss": 0.1951, + "num_input_tokens_seen": 46837712, + "step": 30665 + }, + { + "epoch": 94.66151468315302, + "grad_norm": 0.5458249449729919, + "learning_rate": 6.418314060350864e-06, + "loss": 0.2062, + "num_input_tokens_seen": 46845488, + "step": 30670 + }, + { + "epoch": 94.67697063369397, + "grad_norm": 0.5268712043762207, + "learning_rate": 6.4117476560548895e-06, + "loss": 0.2313, + "num_input_tokens_seen": 46853392, + "step": 30675 + }, + { + "epoch": 94.69242658423494, + "grad_norm": 0.6737940311431885, + "learning_rate": 6.405184118301016e-06, + "loss": 0.331, + "num_input_tokens_seen": 46860720, + "step": 30680 + }, + { + "epoch": 94.70788253477589, + "grad_norm": 0.4297029972076416, + "learning_rate": 6.398623448101434e-06, + "loss": 0.2389, + "num_input_tokens_seen": 46868144, + "step": 30685 + }, + { + "epoch": 94.72333848531684, + "grad_norm": 0.6228651404380798, + "learning_rate": 6.392065646467871e-06, + "loss": 0.2468, + "num_input_tokens_seen": 46875728, + "step": 30690 + }, + { + "epoch": 94.7387944358578, + "grad_norm": 0.48882657289505005, + "learning_rate": 6.385510714411632e-06, + "loss": 0.2542, + "num_input_tokens_seen": 46883504, + "step": 30695 + }, + { + "epoch": 94.75425038639877, + "grad_norm": 0.9977812170982361, + "learning_rate": 6.378958652943559e-06, + "loss": 0.3702, + "num_input_tokens_seen": 46891440, + "step": 30700 + }, + { + "epoch": 94.76970633693972, + "grad_norm": 0.44653618335723877, + "learning_rate": 6.3724094630740776e-06, + "loss": 0.2866, + "num_input_tokens_seen": 46898608, + "step": 30705 + }, + { + "epoch": 94.78516228748067, + "grad_norm": 0.2854514420032501, + "learning_rate": 6.365863145813136e-06, + "loss": 0.2829, + "num_input_tokens_seen": 46905968, + "step": 30710 + }, + { + "epoch": 94.80061823802164, + "grad_norm": 0.5949681401252747, + "learning_rate": 6.359319702170269e-06, + "loss": 0.2323, + "num_input_tokens_seen": 46913456, + "step": 30715 + }, + { + "epoch": 94.8160741885626, + "grad_norm": 0.6608742475509644, + "learning_rate": 6.352779133154566e-06, + "loss": 0.2856, + "num_input_tokens_seen": 46920912, + "step": 30720 + }, + { + "epoch": 94.83153013910355, + "grad_norm": 0.5523222088813782, + "learning_rate": 6.346241439774648e-06, + "loss": 0.2499, + "num_input_tokens_seen": 46928528, + "step": 30725 + }, + { + "epoch": 94.84698608964452, + "grad_norm": 0.4961026906967163, + "learning_rate": 6.339706623038716e-06, + "loss": 0.2286, + "num_input_tokens_seen": 46936048, + "step": 30730 + }, + { + "epoch": 94.86244204018547, + "grad_norm": 0.5062846541404724, + "learning_rate": 6.333174683954532e-06, + "loss": 0.2692, + "num_input_tokens_seen": 46943408, + "step": 30735 + }, + { + "epoch": 94.87789799072642, + "grad_norm": 0.7471227645874023, + "learning_rate": 6.326645623529387e-06, + "loss": 0.255, + "num_input_tokens_seen": 46951536, + "step": 30740 + }, + { + "epoch": 94.89335394126739, + "grad_norm": 0.2872883677482605, + "learning_rate": 6.320119442770156e-06, + "loss": 0.208, + "num_input_tokens_seen": 46958960, + "step": 30745 + }, + { + "epoch": 94.90880989180835, + "grad_norm": 0.9200299382209778, + "learning_rate": 6.313596142683254e-06, + "loss": 0.2845, + "num_input_tokens_seen": 46967088, + "step": 30750 + }, + { + "epoch": 94.9242658423493, + "grad_norm": 0.6940195560455322, + "learning_rate": 6.307075724274647e-06, + "loss": 0.2816, + "num_input_tokens_seen": 46975024, + "step": 30755 + }, + { + "epoch": 94.93972179289027, + "grad_norm": 0.4399118721485138, + "learning_rate": 6.300558188549882e-06, + "loss": 0.2206, + "num_input_tokens_seen": 46982480, + "step": 30760 + }, + { + "epoch": 94.95517774343122, + "grad_norm": 0.4042028784751892, + "learning_rate": 6.29404353651403e-06, + "loss": 0.2132, + "num_input_tokens_seen": 46990160, + "step": 30765 + }, + { + "epoch": 94.97063369397218, + "grad_norm": 0.4362073540687561, + "learning_rate": 6.287531769171737e-06, + "loss": 0.2478, + "num_input_tokens_seen": 46998096, + "step": 30770 + }, + { + "epoch": 94.98608964451314, + "grad_norm": 0.4044071435928345, + "learning_rate": 6.2810228875272045e-06, + "loss": 0.2791, + "num_input_tokens_seen": 47005808, + "step": 30775 + }, + { + "epoch": 95.0, + "grad_norm": 0.5108433365821838, + "learning_rate": 6.274516892584179e-06, + "loss": 0.1891, + "num_input_tokens_seen": 47012688, + "step": 30780 + }, + { + "epoch": 95.01545595054095, + "grad_norm": 0.31021636724472046, + "learning_rate": 6.268013785345969e-06, + "loss": 0.2906, + "num_input_tokens_seen": 47020720, + "step": 30785 + }, + { + "epoch": 95.03091190108192, + "grad_norm": 0.46651384234428406, + "learning_rate": 6.26151356681543e-06, + "loss": 0.2463, + "num_input_tokens_seen": 47028112, + "step": 30790 + }, + { + "epoch": 95.04636785162288, + "grad_norm": 0.3413243591785431, + "learning_rate": 6.255016237994981e-06, + "loss": 0.2184, + "num_input_tokens_seen": 47035632, + "step": 30795 + }, + { + "epoch": 95.06182380216383, + "grad_norm": 0.40324199199676514, + "learning_rate": 6.248521799886603e-06, + "loss": 0.2193, + "num_input_tokens_seen": 47043472, + "step": 30800 + }, + { + "epoch": 95.06182380216383, + "eval_loss": 0.3018428385257721, + "eval_runtime": 6.3089, + "eval_samples_per_second": 91.142, + "eval_steps_per_second": 22.825, + "num_input_tokens_seen": 47043472, + "step": 30800 + }, + { + "epoch": 95.0772797527048, + "grad_norm": 0.44856294989585876, + "learning_rate": 6.242030253491798e-06, + "loss": 0.2205, + "num_input_tokens_seen": 47051664, + "step": 30805 + }, + { + "epoch": 95.09273570324575, + "grad_norm": 0.7762630581855774, + "learning_rate": 6.235541599811656e-06, + "loss": 0.2525, + "num_input_tokens_seen": 47058672, + "step": 30810 + }, + { + "epoch": 95.1081916537867, + "grad_norm": 0.41390854120254517, + "learning_rate": 6.229055839846814e-06, + "loss": 0.2148, + "num_input_tokens_seen": 47066352, + "step": 30815 + }, + { + "epoch": 95.12364760432767, + "grad_norm": 0.45625877380371094, + "learning_rate": 6.222572974597455e-06, + "loss": 0.2334, + "num_input_tokens_seen": 47075088, + "step": 30820 + }, + { + "epoch": 95.13910355486863, + "grad_norm": 0.5894721150398254, + "learning_rate": 6.216093005063306e-06, + "loss": 0.2378, + "num_input_tokens_seen": 47082800, + "step": 30825 + }, + { + "epoch": 95.15455950540958, + "grad_norm": 0.6362146139144897, + "learning_rate": 6.209615932243678e-06, + "loss": 0.316, + "num_input_tokens_seen": 47090576, + "step": 30830 + }, + { + "epoch": 95.17001545595055, + "grad_norm": 0.9547118544578552, + "learning_rate": 6.203141757137399e-06, + "loss": 0.218, + "num_input_tokens_seen": 47098224, + "step": 30835 + }, + { + "epoch": 95.1854714064915, + "grad_norm": 0.47271156311035156, + "learning_rate": 6.196670480742886e-06, + "loss": 0.2211, + "num_input_tokens_seen": 47105616, + "step": 30840 + }, + { + "epoch": 95.20092735703246, + "grad_norm": 0.8188210725784302, + "learning_rate": 6.190202104058074e-06, + "loss": 0.2601, + "num_input_tokens_seen": 47113200, + "step": 30845 + }, + { + "epoch": 95.21638330757341, + "grad_norm": 0.5350376963615417, + "learning_rate": 6.183736628080475e-06, + "loss": 0.2996, + "num_input_tokens_seen": 47120912, + "step": 30850 + }, + { + "epoch": 95.23183925811438, + "grad_norm": 0.472674697637558, + "learning_rate": 6.177274053807155e-06, + "loss": 0.3156, + "num_input_tokens_seen": 47128560, + "step": 30855 + }, + { + "epoch": 95.24729520865533, + "grad_norm": 0.5664703249931335, + "learning_rate": 6.170814382234713e-06, + "loss": 0.2192, + "num_input_tokens_seen": 47136336, + "step": 30860 + }, + { + "epoch": 95.26275115919628, + "grad_norm": 0.3685034513473511, + "learning_rate": 6.16435761435932e-06, + "loss": 0.2616, + "num_input_tokens_seen": 47143344, + "step": 30865 + }, + { + "epoch": 95.27820710973725, + "grad_norm": 0.5297368764877319, + "learning_rate": 6.157903751176681e-06, + "loss": 0.2279, + "num_input_tokens_seen": 47151280, + "step": 30870 + }, + { + "epoch": 95.2936630602782, + "grad_norm": 0.5560827255249023, + "learning_rate": 6.151452793682066e-06, + "loss": 0.2321, + "num_input_tokens_seen": 47158928, + "step": 30875 + }, + { + "epoch": 95.30911901081916, + "grad_norm": 0.42337197065353394, + "learning_rate": 6.145004742870305e-06, + "loss": 0.3098, + "num_input_tokens_seen": 47166736, + "step": 30880 + }, + { + "epoch": 95.32457496136013, + "grad_norm": 0.4563445746898651, + "learning_rate": 6.138559599735752e-06, + "loss": 0.1901, + "num_input_tokens_seen": 47174032, + "step": 30885 + }, + { + "epoch": 95.34003091190108, + "grad_norm": 0.47485819458961487, + "learning_rate": 6.132117365272344e-06, + "loss": 0.3151, + "num_input_tokens_seen": 47181232, + "step": 30890 + }, + { + "epoch": 95.35548686244204, + "grad_norm": 0.8766730427742004, + "learning_rate": 6.125678040473545e-06, + "loss": 0.2068, + "num_input_tokens_seen": 47188976, + "step": 30895 + }, + { + "epoch": 95.370942812983, + "grad_norm": 0.5438593626022339, + "learning_rate": 6.1192416263323755e-06, + "loss": 0.2625, + "num_input_tokens_seen": 47196912, + "step": 30900 + }, + { + "epoch": 95.38639876352396, + "grad_norm": 0.41508564352989197, + "learning_rate": 6.112808123841424e-06, + "loss": 0.2734, + "num_input_tokens_seen": 47204880, + "step": 30905 + }, + { + "epoch": 95.40185471406491, + "grad_norm": 0.48653310537338257, + "learning_rate": 6.106377533992805e-06, + "loss": 0.2473, + "num_input_tokens_seen": 47213232, + "step": 30910 + }, + { + "epoch": 95.41731066460588, + "grad_norm": 0.5665993690490723, + "learning_rate": 6.099949857778204e-06, + "loss": 0.2274, + "num_input_tokens_seen": 47220432, + "step": 30915 + }, + { + "epoch": 95.43276661514683, + "grad_norm": 0.4089028239250183, + "learning_rate": 6.093525096188852e-06, + "loss": 0.2005, + "num_input_tokens_seen": 47227408, + "step": 30920 + }, + { + "epoch": 95.44822256568779, + "grad_norm": 0.30686116218566895, + "learning_rate": 6.087103250215518e-06, + "loss": 0.2506, + "num_input_tokens_seen": 47234768, + "step": 30925 + }, + { + "epoch": 95.46367851622875, + "grad_norm": 0.6373357176780701, + "learning_rate": 6.080684320848537e-06, + "loss": 0.2169, + "num_input_tokens_seen": 47242256, + "step": 30930 + }, + { + "epoch": 95.47913446676971, + "grad_norm": 0.4151831269264221, + "learning_rate": 6.074268309077794e-06, + "loss": 0.2453, + "num_input_tokens_seen": 47249872, + "step": 30935 + }, + { + "epoch": 95.49459041731066, + "grad_norm": 0.303619384765625, + "learning_rate": 6.067855215892709e-06, + "loss": 0.1861, + "num_input_tokens_seen": 47257168, + "step": 30940 + }, + { + "epoch": 95.51004636785163, + "grad_norm": 0.3809833526611328, + "learning_rate": 6.061445042282271e-06, + "loss": 0.2815, + "num_input_tokens_seen": 47265008, + "step": 30945 + }, + { + "epoch": 95.52550231839258, + "grad_norm": 0.5698578953742981, + "learning_rate": 6.055037789234999e-06, + "loss": 0.2826, + "num_input_tokens_seen": 47272624, + "step": 30950 + }, + { + "epoch": 95.54095826893354, + "grad_norm": 0.4486117959022522, + "learning_rate": 6.048633457738975e-06, + "loss": 0.2813, + "num_input_tokens_seen": 47280240, + "step": 30955 + }, + { + "epoch": 95.5564142194745, + "grad_norm": 0.7088447213172913, + "learning_rate": 6.042232048781837e-06, + "loss": 0.2751, + "num_input_tokens_seen": 47288016, + "step": 30960 + }, + { + "epoch": 95.57187017001546, + "grad_norm": 0.3538661003112793, + "learning_rate": 6.035833563350757e-06, + "loss": 0.2357, + "num_input_tokens_seen": 47295696, + "step": 30965 + }, + { + "epoch": 95.58732612055641, + "grad_norm": 0.41158971190452576, + "learning_rate": 6.0294380024324525e-06, + "loss": 0.2383, + "num_input_tokens_seen": 47302896, + "step": 30970 + }, + { + "epoch": 95.60278207109737, + "grad_norm": 0.5945173501968384, + "learning_rate": 6.023045367013213e-06, + "loss": 0.2814, + "num_input_tokens_seen": 47310736, + "step": 30975 + }, + { + "epoch": 95.61823802163833, + "grad_norm": 0.5564904808998108, + "learning_rate": 6.016655658078851e-06, + "loss": 0.2378, + "num_input_tokens_seen": 47318640, + "step": 30980 + }, + { + "epoch": 95.63369397217929, + "grad_norm": 0.5311793684959412, + "learning_rate": 6.010268876614753e-06, + "loss": 0.2375, + "num_input_tokens_seen": 47326768, + "step": 30985 + }, + { + "epoch": 95.64914992272024, + "grad_norm": 0.6466048955917358, + "learning_rate": 6.0038850236058266e-06, + "loss": 0.2298, + "num_input_tokens_seen": 47334160, + "step": 30990 + }, + { + "epoch": 95.66460587326121, + "grad_norm": 0.35159194469451904, + "learning_rate": 5.997504100036549e-06, + "loss": 0.2484, + "num_input_tokens_seen": 47341456, + "step": 30995 + }, + { + "epoch": 95.68006182380216, + "grad_norm": 0.37939244508743286, + "learning_rate": 5.991126106890949e-06, + "loss": 0.2231, + "num_input_tokens_seen": 47348976, + "step": 31000 + }, + { + "epoch": 95.68006182380216, + "eval_loss": 0.30241087079048157, + "eval_runtime": 6.3118, + "eval_samples_per_second": 91.1, + "eval_steps_per_second": 22.815, + "num_input_tokens_seen": 47348976, + "step": 31000 + }, + { + "epoch": 95.69551777434312, + "grad_norm": 0.44206923246383667, + "learning_rate": 5.984751045152576e-06, + "loss": 0.257, + "num_input_tokens_seen": 47356848, + "step": 31005 + }, + { + "epoch": 95.71097372488408, + "grad_norm": 0.6786791682243347, + "learning_rate": 5.978378915804553e-06, + "loss": 0.3339, + "num_input_tokens_seen": 47364528, + "step": 31010 + }, + { + "epoch": 95.72642967542504, + "grad_norm": 0.49219071865081787, + "learning_rate": 5.972009719829547e-06, + "loss": 0.214, + "num_input_tokens_seen": 47372208, + "step": 31015 + }, + { + "epoch": 95.74188562596599, + "grad_norm": 0.5967442393302917, + "learning_rate": 5.965643458209755e-06, + "loss": 0.2158, + "num_input_tokens_seen": 47380048, + "step": 31020 + }, + { + "epoch": 95.75734157650696, + "grad_norm": 0.3699418008327484, + "learning_rate": 5.95928013192695e-06, + "loss": 0.2358, + "num_input_tokens_seen": 47387312, + "step": 31025 + }, + { + "epoch": 95.77279752704791, + "grad_norm": 0.4475533962249756, + "learning_rate": 5.952919741962423e-06, + "loss": 0.2623, + "num_input_tokens_seen": 47395472, + "step": 31030 + }, + { + "epoch": 95.78825347758887, + "grad_norm": 0.5407943725585938, + "learning_rate": 5.946562289297042e-06, + "loss": 0.2725, + "num_input_tokens_seen": 47403408, + "step": 31035 + }, + { + "epoch": 95.80370942812984, + "grad_norm": 0.43146389722824097, + "learning_rate": 5.9402077749111855e-06, + "loss": 0.198, + "num_input_tokens_seen": 47411120, + "step": 31040 + }, + { + "epoch": 95.81916537867079, + "grad_norm": 0.5050118565559387, + "learning_rate": 5.933856199784821e-06, + "loss": 0.237, + "num_input_tokens_seen": 47418672, + "step": 31045 + }, + { + "epoch": 95.83462132921174, + "grad_norm": 0.4830307960510254, + "learning_rate": 5.927507564897419e-06, + "loss": 0.2182, + "num_input_tokens_seen": 47425808, + "step": 31050 + }, + { + "epoch": 95.85007727975271, + "grad_norm": 0.668388307094574, + "learning_rate": 5.9211618712280395e-06, + "loss": 0.2827, + "num_input_tokens_seen": 47433488, + "step": 31055 + }, + { + "epoch": 95.86553323029366, + "grad_norm": 0.6691716909408569, + "learning_rate": 5.914819119755255e-06, + "loss": 0.2537, + "num_input_tokens_seen": 47440912, + "step": 31060 + }, + { + "epoch": 95.88098918083462, + "grad_norm": 0.40663260221481323, + "learning_rate": 5.908479311457205e-06, + "loss": 0.1964, + "num_input_tokens_seen": 47448176, + "step": 31065 + }, + { + "epoch": 95.89644513137559, + "grad_norm": 0.5066598653793335, + "learning_rate": 5.902142447311559e-06, + "loss": 0.2722, + "num_input_tokens_seen": 47455600, + "step": 31070 + }, + { + "epoch": 95.91190108191654, + "grad_norm": 0.38328710198402405, + "learning_rate": 5.895808528295546e-06, + "loss": 0.2488, + "num_input_tokens_seen": 47463984, + "step": 31075 + }, + { + "epoch": 95.9273570324575, + "grad_norm": 0.3986482322216034, + "learning_rate": 5.889477555385941e-06, + "loss": 0.237, + "num_input_tokens_seen": 47471568, + "step": 31080 + }, + { + "epoch": 95.94281298299846, + "grad_norm": 0.7141881585121155, + "learning_rate": 5.883149529559051e-06, + "loss": 0.2385, + "num_input_tokens_seen": 47478960, + "step": 31085 + }, + { + "epoch": 95.95826893353942, + "grad_norm": 0.3654693067073822, + "learning_rate": 5.876824451790738e-06, + "loss": 0.2524, + "num_input_tokens_seen": 47486736, + "step": 31090 + }, + { + "epoch": 95.97372488408037, + "grad_norm": 0.5562013983726501, + "learning_rate": 5.87050232305642e-06, + "loss": 0.2274, + "num_input_tokens_seen": 47494128, + "step": 31095 + }, + { + "epoch": 95.98918083462132, + "grad_norm": 0.3259187936782837, + "learning_rate": 5.864183144331034e-06, + "loss": 0.2232, + "num_input_tokens_seen": 47501520, + "step": 31100 + }, + { + "epoch": 96.0030911901082, + "grad_norm": 0.6950200796127319, + "learning_rate": 5.857866916589089e-06, + "loss": 0.2185, + "num_input_tokens_seen": 47508256, + "step": 31105 + }, + { + "epoch": 96.01854714064915, + "grad_norm": 0.5099040269851685, + "learning_rate": 5.8515536408046216e-06, + "loss": 0.2029, + "num_input_tokens_seen": 47516096, + "step": 31110 + }, + { + "epoch": 96.03400309119012, + "grad_norm": 0.4967260956764221, + "learning_rate": 5.845243317951208e-06, + "loss": 0.2431, + "num_input_tokens_seen": 47523616, + "step": 31115 + }, + { + "epoch": 96.04945904173107, + "grad_norm": 0.6053420901298523, + "learning_rate": 5.838935949001997e-06, + "loss": 0.2465, + "num_input_tokens_seen": 47531264, + "step": 31120 + }, + { + "epoch": 96.06491499227202, + "grad_norm": 0.7895645499229431, + "learning_rate": 5.8326315349296476e-06, + "loss": 0.3476, + "num_input_tokens_seen": 47539520, + "step": 31125 + }, + { + "epoch": 96.08037094281298, + "grad_norm": 0.4987969398498535, + "learning_rate": 5.826330076706396e-06, + "loss": 0.334, + "num_input_tokens_seen": 47547072, + "step": 31130 + }, + { + "epoch": 96.09582689335394, + "grad_norm": 0.6060331463813782, + "learning_rate": 5.820031575303988e-06, + "loss": 0.2018, + "num_input_tokens_seen": 47554976, + "step": 31135 + }, + { + "epoch": 96.1112828438949, + "grad_norm": 0.5932754874229431, + "learning_rate": 5.813736031693745e-06, + "loss": 0.2464, + "num_input_tokens_seen": 47562464, + "step": 31140 + }, + { + "epoch": 96.12673879443585, + "grad_norm": 0.6812340021133423, + "learning_rate": 5.807443446846522e-06, + "loss": 0.2794, + "num_input_tokens_seen": 47569824, + "step": 31145 + }, + { + "epoch": 96.14219474497682, + "grad_norm": 0.3853297829627991, + "learning_rate": 5.801153821732699e-06, + "loss": 0.2167, + "num_input_tokens_seen": 47576960, + "step": 31150 + }, + { + "epoch": 96.15765069551777, + "grad_norm": 0.48109546303749084, + "learning_rate": 5.794867157322229e-06, + "loss": 0.2041, + "num_input_tokens_seen": 47584512, + "step": 31155 + }, + { + "epoch": 96.17310664605873, + "grad_norm": 0.7122852206230164, + "learning_rate": 5.788583454584593e-06, + "loss": 0.2084, + "num_input_tokens_seen": 47591808, + "step": 31160 + }, + { + "epoch": 96.1885625965997, + "grad_norm": 0.7008673548698425, + "learning_rate": 5.7823027144888075e-06, + "loss": 0.2708, + "num_input_tokens_seen": 47599008, + "step": 31165 + }, + { + "epoch": 96.20401854714065, + "grad_norm": 0.3837071359157562, + "learning_rate": 5.776024938003455e-06, + "loss": 0.242, + "num_input_tokens_seen": 47607136, + "step": 31170 + }, + { + "epoch": 96.2194744976816, + "grad_norm": 0.582919180393219, + "learning_rate": 5.7697501260966345e-06, + "loss": 0.2872, + "num_input_tokens_seen": 47614912, + "step": 31175 + }, + { + "epoch": 96.23493044822257, + "grad_norm": 0.5263403058052063, + "learning_rate": 5.7634782797360145e-06, + "loss": 0.194, + "num_input_tokens_seen": 47622592, + "step": 31180 + }, + { + "epoch": 96.25038639876352, + "grad_norm": 0.37828198075294495, + "learning_rate": 5.757209399888777e-06, + "loss": 0.2059, + "num_input_tokens_seen": 47630400, + "step": 31185 + }, + { + "epoch": 96.26584234930448, + "grad_norm": 0.40872886776924133, + "learning_rate": 5.750943487521679e-06, + "loss": 0.2953, + "num_input_tokens_seen": 47637856, + "step": 31190 + }, + { + "epoch": 96.28129829984545, + "grad_norm": 0.6088994741439819, + "learning_rate": 5.744680543600986e-06, + "loss": 0.2411, + "num_input_tokens_seen": 47644864, + "step": 31195 + }, + { + "epoch": 96.2967542503864, + "grad_norm": 0.41016337275505066, + "learning_rate": 5.738420569092537e-06, + "loss": 0.271, + "num_input_tokens_seen": 47652864, + "step": 31200 + }, + { + "epoch": 96.2967542503864, + "eval_loss": 0.3021477460861206, + "eval_runtime": 6.3279, + "eval_samples_per_second": 90.867, + "eval_steps_per_second": 22.756, + "num_input_tokens_seen": 47652864, + "step": 31200 + }, + { + "epoch": 96.31221020092735, + "grad_norm": 0.4593863785266876, + "learning_rate": 5.732163564961684e-06, + "loss": 0.1999, + "num_input_tokens_seen": 47660640, + "step": 31205 + }, + { + "epoch": 96.32766615146832, + "grad_norm": 0.2738485038280487, + "learning_rate": 5.725909532173354e-06, + "loss": 0.2208, + "num_input_tokens_seen": 47669056, + "step": 31210 + }, + { + "epoch": 96.34312210200927, + "grad_norm": 0.7520812749862671, + "learning_rate": 5.719658471691977e-06, + "loss": 0.2191, + "num_input_tokens_seen": 47676384, + "step": 31215 + }, + { + "epoch": 96.35857805255023, + "grad_norm": 0.5665197372436523, + "learning_rate": 5.71341038448156e-06, + "loss": 0.296, + "num_input_tokens_seen": 47684096, + "step": 31220 + }, + { + "epoch": 96.3740340030912, + "grad_norm": 0.5048315525054932, + "learning_rate": 5.707165271505635e-06, + "loss": 0.2755, + "num_input_tokens_seen": 47692192, + "step": 31225 + }, + { + "epoch": 96.38948995363215, + "grad_norm": 0.5965669751167297, + "learning_rate": 5.700923133727271e-06, + "loss": 0.2588, + "num_input_tokens_seen": 47700192, + "step": 31230 + }, + { + "epoch": 96.4049459041731, + "grad_norm": 0.46098020672798157, + "learning_rate": 5.694683972109083e-06, + "loss": 0.1982, + "num_input_tokens_seen": 47707520, + "step": 31235 + }, + { + "epoch": 96.42040185471407, + "grad_norm": 0.3400077223777771, + "learning_rate": 5.688447787613241e-06, + "loss": 0.2365, + "num_input_tokens_seen": 47715040, + "step": 31240 + }, + { + "epoch": 96.43585780525503, + "grad_norm": 0.28524383902549744, + "learning_rate": 5.6822145812014285e-06, + "loss": 0.2482, + "num_input_tokens_seen": 47722656, + "step": 31245 + }, + { + "epoch": 96.45131375579598, + "grad_norm": 0.34722936153411865, + "learning_rate": 5.675984353834896e-06, + "loss": 0.3484, + "num_input_tokens_seen": 47729888, + "step": 31250 + }, + { + "epoch": 96.46676970633693, + "grad_norm": 0.5601996183395386, + "learning_rate": 5.66975710647441e-06, + "loss": 0.2188, + "num_input_tokens_seen": 47737504, + "step": 31255 + }, + { + "epoch": 96.4822256568779, + "grad_norm": 0.62555992603302, + "learning_rate": 5.663532840080304e-06, + "loss": 0.2323, + "num_input_tokens_seen": 47744864, + "step": 31260 + }, + { + "epoch": 96.49768160741885, + "grad_norm": 0.4326164126396179, + "learning_rate": 5.6573115556124325e-06, + "loss": 0.2124, + "num_input_tokens_seen": 47753280, + "step": 31265 + }, + { + "epoch": 96.51313755795981, + "grad_norm": 0.5476487874984741, + "learning_rate": 5.651093254030185e-06, + "loss": 0.2494, + "num_input_tokens_seen": 47761184, + "step": 31270 + }, + { + "epoch": 96.52859350850078, + "grad_norm": 0.49683159589767456, + "learning_rate": 5.644877936292514e-06, + "loss": 0.2156, + "num_input_tokens_seen": 47768704, + "step": 31275 + }, + { + "epoch": 96.54404945904173, + "grad_norm": 1.208444356918335, + "learning_rate": 5.638665603357901e-06, + "loss": 0.2815, + "num_input_tokens_seen": 47776352, + "step": 31280 + }, + { + "epoch": 96.55950540958268, + "grad_norm": 0.3996089696884155, + "learning_rate": 5.632456256184357e-06, + "loss": 0.3082, + "num_input_tokens_seen": 47784128, + "step": 31285 + }, + { + "epoch": 96.57496136012365, + "grad_norm": 0.6069238781929016, + "learning_rate": 5.626249895729452e-06, + "loss": 0.2528, + "num_input_tokens_seen": 47792576, + "step": 31290 + }, + { + "epoch": 96.5904173106646, + "grad_norm": 0.44667547941207886, + "learning_rate": 5.620046522950273e-06, + "loss": 0.318, + "num_input_tokens_seen": 47799776, + "step": 31295 + }, + { + "epoch": 96.60587326120556, + "grad_norm": 0.2947489321231842, + "learning_rate": 5.613846138803464e-06, + "loss": 0.2624, + "num_input_tokens_seen": 47807552, + "step": 31300 + }, + { + "epoch": 96.62132921174653, + "grad_norm": 0.6289663910865784, + "learning_rate": 5.607648744245206e-06, + "loss": 0.2456, + "num_input_tokens_seen": 47815008, + "step": 31305 + }, + { + "epoch": 96.63678516228748, + "grad_norm": 0.43866202235221863, + "learning_rate": 5.601454340231207e-06, + "loss": 0.3039, + "num_input_tokens_seen": 47822240, + "step": 31310 + }, + { + "epoch": 96.65224111282843, + "grad_norm": 0.708176851272583, + "learning_rate": 5.595262927716724e-06, + "loss": 0.2391, + "num_input_tokens_seen": 47829600, + "step": 31315 + }, + { + "epoch": 96.6676970633694, + "grad_norm": 0.5999800562858582, + "learning_rate": 5.589074507656561e-06, + "loss": 0.2534, + "num_input_tokens_seen": 47837600, + "step": 31320 + }, + { + "epoch": 96.68315301391036, + "grad_norm": 0.3300793170928955, + "learning_rate": 5.582889081005044e-06, + "loss": 0.2159, + "num_input_tokens_seen": 47845760, + "step": 31325 + }, + { + "epoch": 96.69860896445131, + "grad_norm": 0.35990363359451294, + "learning_rate": 5.5767066487160316e-06, + "loss": 0.2248, + "num_input_tokens_seen": 47853312, + "step": 31330 + }, + { + "epoch": 96.71406491499228, + "grad_norm": 0.6642647385597229, + "learning_rate": 5.570527211742949e-06, + "loss": 0.2509, + "num_input_tokens_seen": 47860832, + "step": 31335 + }, + { + "epoch": 96.72952086553323, + "grad_norm": 0.6369528770446777, + "learning_rate": 5.564350771038731e-06, + "loss": 0.3059, + "num_input_tokens_seen": 47868640, + "step": 31340 + }, + { + "epoch": 96.74497681607419, + "grad_norm": 0.6846590638160706, + "learning_rate": 5.558177327555875e-06, + "loss": 0.2542, + "num_input_tokens_seen": 47875904, + "step": 31345 + }, + { + "epoch": 96.76043276661515, + "grad_norm": 0.5639842748641968, + "learning_rate": 5.552006882246388e-06, + "loss": 0.1953, + "num_input_tokens_seen": 47883488, + "step": 31350 + }, + { + "epoch": 96.7758887171561, + "grad_norm": 0.4118680953979492, + "learning_rate": 5.545839436061839e-06, + "loss": 0.262, + "num_input_tokens_seen": 47891104, + "step": 31355 + }, + { + "epoch": 96.79134466769706, + "grad_norm": 0.39462029933929443, + "learning_rate": 5.539674989953331e-06, + "loss": 0.2051, + "num_input_tokens_seen": 47899168, + "step": 31360 + }, + { + "epoch": 96.80680061823801, + "grad_norm": 0.3714091181755066, + "learning_rate": 5.533513544871488e-06, + "loss": 0.2008, + "num_input_tokens_seen": 47907264, + "step": 31365 + }, + { + "epoch": 96.82225656877898, + "grad_norm": 0.6341313719749451, + "learning_rate": 5.527355101766493e-06, + "loss": 0.2161, + "num_input_tokens_seen": 47914592, + "step": 31370 + }, + { + "epoch": 96.83771251931994, + "grad_norm": 0.70749431848526, + "learning_rate": 5.521199661588044e-06, + "loss": 0.26, + "num_input_tokens_seen": 47922080, + "step": 31375 + }, + { + "epoch": 96.85316846986089, + "grad_norm": 0.37076547741889954, + "learning_rate": 5.5150472252853944e-06, + "loss": 0.2329, + "num_input_tokens_seen": 47928928, + "step": 31380 + }, + { + "epoch": 96.86862442040186, + "grad_norm": 0.6215193867683411, + "learning_rate": 5.50889779380733e-06, + "loss": 0.314, + "num_input_tokens_seen": 47937056, + "step": 31385 + }, + { + "epoch": 96.88408037094281, + "grad_norm": 0.44603806734085083, + "learning_rate": 5.5027513681021605e-06, + "loss": 0.2189, + "num_input_tokens_seen": 47944224, + "step": 31390 + }, + { + "epoch": 96.89953632148377, + "grad_norm": 0.5582305192947388, + "learning_rate": 5.4966079491177545e-06, + "loss": 0.2428, + "num_input_tokens_seen": 47951776, + "step": 31395 + }, + { + "epoch": 96.91499227202473, + "grad_norm": 0.5018426179885864, + "learning_rate": 5.490467537801491e-06, + "loss": 0.2043, + "num_input_tokens_seen": 47959872, + "step": 31400 + }, + { + "epoch": 96.91499227202473, + "eval_loss": 0.30289357900619507, + "eval_runtime": 6.329, + "eval_samples_per_second": 90.851, + "eval_steps_per_second": 22.752, + "num_input_tokens_seen": 47959872, + "step": 31400 + }, + { + "epoch": 96.93044822256569, + "grad_norm": 0.6423179507255554, + "learning_rate": 5.484330135100313e-06, + "loss": 0.1882, + "num_input_tokens_seen": 47967296, + "step": 31405 + }, + { + "epoch": 96.94590417310664, + "grad_norm": 0.7184274792671204, + "learning_rate": 5.4781957419606785e-06, + "loss": 0.2189, + "num_input_tokens_seen": 47974624, + "step": 31410 + }, + { + "epoch": 96.96136012364761, + "grad_norm": 0.5350795388221741, + "learning_rate": 5.472064359328577e-06, + "loss": 0.2532, + "num_input_tokens_seen": 47982336, + "step": 31415 + }, + { + "epoch": 96.97681607418856, + "grad_norm": 0.5017014741897583, + "learning_rate": 5.4659359881495565e-06, + "loss": 0.2453, + "num_input_tokens_seen": 47989984, + "step": 31420 + }, + { + "epoch": 96.99227202472952, + "grad_norm": 0.5361764430999756, + "learning_rate": 5.4598106293686916e-06, + "loss": 0.2322, + "num_input_tokens_seen": 47997824, + "step": 31425 + }, + { + "epoch": 97.00618238021639, + "grad_norm": 0.45538529753685, + "learning_rate": 5.45368828393058e-06, + "loss": 0.2187, + "num_input_tokens_seen": 48004848, + "step": 31430 + }, + { + "epoch": 97.02163833075734, + "grad_norm": 0.381543904542923, + "learning_rate": 5.44756895277937e-06, + "loss": 0.2685, + "num_input_tokens_seen": 48012272, + "step": 31435 + }, + { + "epoch": 97.0370942812983, + "grad_norm": 0.540128231048584, + "learning_rate": 5.441452636858746e-06, + "loss": 0.2687, + "num_input_tokens_seen": 48020144, + "step": 31440 + }, + { + "epoch": 97.05255023183926, + "grad_norm": 0.7329239845275879, + "learning_rate": 5.435339337111905e-06, + "loss": 0.2651, + "num_input_tokens_seen": 48027952, + "step": 31445 + }, + { + "epoch": 97.06800618238022, + "grad_norm": 0.516598105430603, + "learning_rate": 5.42922905448161e-06, + "loss": 0.272, + "num_input_tokens_seen": 48035408, + "step": 31450 + }, + { + "epoch": 97.08346213292117, + "grad_norm": 0.5183907747268677, + "learning_rate": 5.423121789910129e-06, + "loss": 0.2476, + "num_input_tokens_seen": 48042672, + "step": 31455 + }, + { + "epoch": 97.09891808346214, + "grad_norm": 0.598972737789154, + "learning_rate": 5.417017544339287e-06, + "loss": 0.2153, + "num_input_tokens_seen": 48050448, + "step": 31460 + }, + { + "epoch": 97.11437403400309, + "grad_norm": 0.6616159081459045, + "learning_rate": 5.410916318710443e-06, + "loss": 0.2296, + "num_input_tokens_seen": 48058288, + "step": 31465 + }, + { + "epoch": 97.12982998454405, + "grad_norm": 0.438798189163208, + "learning_rate": 5.404818113964466e-06, + "loss": 0.2452, + "num_input_tokens_seen": 48066928, + "step": 31470 + }, + { + "epoch": 97.14528593508501, + "grad_norm": 0.3641869127750397, + "learning_rate": 5.398722931041792e-06, + "loss": 0.2253, + "num_input_tokens_seen": 48074160, + "step": 31475 + }, + { + "epoch": 97.16074188562597, + "grad_norm": 0.6079098582267761, + "learning_rate": 5.392630770882367e-06, + "loss": 0.2446, + "num_input_tokens_seen": 48082576, + "step": 31480 + }, + { + "epoch": 97.17619783616692, + "grad_norm": 0.5512700080871582, + "learning_rate": 5.3865416344256705e-06, + "loss": 0.2241, + "num_input_tokens_seen": 48090096, + "step": 31485 + }, + { + "epoch": 97.19165378670789, + "grad_norm": 0.548893928527832, + "learning_rate": 5.380455522610742e-06, + "loss": 0.2546, + "num_input_tokens_seen": 48097232, + "step": 31490 + }, + { + "epoch": 97.20710973724884, + "grad_norm": 0.6069579720497131, + "learning_rate": 5.374372436376116e-06, + "loss": 0.2107, + "num_input_tokens_seen": 48104720, + "step": 31495 + }, + { + "epoch": 97.2225656877898, + "grad_norm": 0.7530271410942078, + "learning_rate": 5.368292376659895e-06, + "loss": 0.215, + "num_input_tokens_seen": 48112464, + "step": 31500 + }, + { + "epoch": 97.23802163833076, + "grad_norm": 0.41442981362342834, + "learning_rate": 5.362215344399701e-06, + "loss": 0.2944, + "num_input_tokens_seen": 48120240, + "step": 31505 + }, + { + "epoch": 97.25347758887172, + "grad_norm": 0.4899432063102722, + "learning_rate": 5.356141340532678e-06, + "loss": 0.2698, + "num_input_tokens_seen": 48127536, + "step": 31510 + }, + { + "epoch": 97.26893353941267, + "grad_norm": 0.5178877711296082, + "learning_rate": 5.350070365995522e-06, + "loss": 0.2307, + "num_input_tokens_seen": 48134960, + "step": 31515 + }, + { + "epoch": 97.28438948995363, + "grad_norm": 0.46409162878990173, + "learning_rate": 5.344002421724459e-06, + "loss": 0.2452, + "num_input_tokens_seen": 48142032, + "step": 31520 + }, + { + "epoch": 97.2998454404946, + "grad_norm": 0.5602829456329346, + "learning_rate": 5.337937508655228e-06, + "loss": 0.2759, + "num_input_tokens_seen": 48149616, + "step": 31525 + }, + { + "epoch": 97.31530139103555, + "grad_norm": 0.42697951197624207, + "learning_rate": 5.331875627723126e-06, + "loss": 0.2531, + "num_input_tokens_seen": 48157360, + "step": 31530 + }, + { + "epoch": 97.3307573415765, + "grad_norm": 0.5382289290428162, + "learning_rate": 5.325816779862963e-06, + "loss": 0.2676, + "num_input_tokens_seen": 48164528, + "step": 31535 + }, + { + "epoch": 97.34621329211747, + "grad_norm": 0.4100892245769501, + "learning_rate": 5.319760966009102e-06, + "loss": 0.2881, + "num_input_tokens_seen": 48172336, + "step": 31540 + }, + { + "epoch": 97.36166924265842, + "grad_norm": 0.5600048899650574, + "learning_rate": 5.3137081870954096e-06, + "loss": 0.2617, + "num_input_tokens_seen": 48179984, + "step": 31545 + }, + { + "epoch": 97.37712519319938, + "grad_norm": 0.35742855072021484, + "learning_rate": 5.307658444055313e-06, + "loss": 0.2973, + "num_input_tokens_seen": 48187472, + "step": 31550 + }, + { + "epoch": 97.39258114374034, + "grad_norm": 0.6068450808525085, + "learning_rate": 5.301611737821749e-06, + "loss": 0.3072, + "num_input_tokens_seen": 48195696, + "step": 31555 + }, + { + "epoch": 97.4080370942813, + "grad_norm": 0.4124838709831238, + "learning_rate": 5.295568069327206e-06, + "loss": 0.2529, + "num_input_tokens_seen": 48203632, + "step": 31560 + }, + { + "epoch": 97.42349304482225, + "grad_norm": 0.6819404363632202, + "learning_rate": 5.289527439503683e-06, + "loss": 0.2286, + "num_input_tokens_seen": 48210896, + "step": 31565 + }, + { + "epoch": 97.43894899536322, + "grad_norm": 0.5240316390991211, + "learning_rate": 5.28348984928273e-06, + "loss": 0.2366, + "num_input_tokens_seen": 48218288, + "step": 31570 + }, + { + "epoch": 97.45440494590417, + "grad_norm": 0.4531111419200897, + "learning_rate": 5.27745529959541e-06, + "loss": 0.2193, + "num_input_tokens_seen": 48226384, + "step": 31575 + }, + { + "epoch": 97.46986089644513, + "grad_norm": 0.31483057141304016, + "learning_rate": 5.271423791372335e-06, + "loss": 0.2189, + "num_input_tokens_seen": 48235024, + "step": 31580 + }, + { + "epoch": 97.4853168469861, + "grad_norm": 0.37947872281074524, + "learning_rate": 5.26539532554364e-06, + "loss": 0.2108, + "num_input_tokens_seen": 48242832, + "step": 31585 + }, + { + "epoch": 97.50077279752705, + "grad_norm": 0.4849662482738495, + "learning_rate": 5.25936990303898e-06, + "loss": 0.2371, + "num_input_tokens_seen": 48249488, + "step": 31590 + }, + { + "epoch": 97.516228748068, + "grad_norm": 0.3578939735889435, + "learning_rate": 5.253347524787555e-06, + "loss": 0.2077, + "num_input_tokens_seen": 48257328, + "step": 31595 + }, + { + "epoch": 97.53168469860897, + "grad_norm": 0.3832333981990814, + "learning_rate": 5.2473281917181035e-06, + "loss": 0.2605, + "num_input_tokens_seen": 48265392, + "step": 31600 + }, + { + "epoch": 97.53168469860897, + "eval_loss": 0.3007649779319763, + "eval_runtime": 6.3174, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 22.794, + "num_input_tokens_seen": 48265392, + "step": 31600 + }, + { + "epoch": 97.54714064914992, + "grad_norm": 0.3458702862262726, + "learning_rate": 5.241311904758864e-06, + "loss": 0.2321, + "num_input_tokens_seen": 48273264, + "step": 31605 + }, + { + "epoch": 97.56259659969088, + "grad_norm": 0.5908074378967285, + "learning_rate": 5.23529866483764e-06, + "loss": 0.3155, + "num_input_tokens_seen": 48280592, + "step": 31610 + }, + { + "epoch": 97.57805255023185, + "grad_norm": 1.2166637182235718, + "learning_rate": 5.229288472881732e-06, + "loss": 0.283, + "num_input_tokens_seen": 48288016, + "step": 31615 + }, + { + "epoch": 97.5935085007728, + "grad_norm": 0.6977247595787048, + "learning_rate": 5.2232813298180025e-06, + "loss": 0.2692, + "num_input_tokens_seen": 48295280, + "step": 31620 + }, + { + "epoch": 97.60896445131375, + "grad_norm": 0.5153554677963257, + "learning_rate": 5.217277236572824e-06, + "loss": 0.2367, + "num_input_tokens_seen": 48303216, + "step": 31625 + }, + { + "epoch": 97.62442040185472, + "grad_norm": 0.6874595880508423, + "learning_rate": 5.211276194072093e-06, + "loss": 0.2689, + "num_input_tokens_seen": 48310352, + "step": 31630 + }, + { + "epoch": 97.63987635239567, + "grad_norm": 0.869360625743866, + "learning_rate": 5.205278203241254e-06, + "loss": 0.2394, + "num_input_tokens_seen": 48317936, + "step": 31635 + }, + { + "epoch": 97.65533230293663, + "grad_norm": 0.5189098119735718, + "learning_rate": 5.199283265005278e-06, + "loss": 0.2068, + "num_input_tokens_seen": 48325104, + "step": 31640 + }, + { + "epoch": 97.67078825347758, + "grad_norm": 0.5040066838264465, + "learning_rate": 5.193291380288648e-06, + "loss": 0.2798, + "num_input_tokens_seen": 48332752, + "step": 31645 + }, + { + "epoch": 97.68624420401855, + "grad_norm": 0.31652534008026123, + "learning_rate": 5.1873025500153995e-06, + "loss": 0.2054, + "num_input_tokens_seen": 48339888, + "step": 31650 + }, + { + "epoch": 97.7017001545595, + "grad_norm": 0.3555551767349243, + "learning_rate": 5.181316775109071e-06, + "loss": 0.2447, + "num_input_tokens_seen": 48347760, + "step": 31655 + }, + { + "epoch": 97.71715610510046, + "grad_norm": 0.41133663058280945, + "learning_rate": 5.1753340564927564e-06, + "loss": 0.2846, + "num_input_tokens_seen": 48354928, + "step": 31660 + }, + { + "epoch": 97.73261205564143, + "grad_norm": 0.4991222023963928, + "learning_rate": 5.169354395089068e-06, + "loss": 0.2173, + "num_input_tokens_seen": 48362672, + "step": 31665 + }, + { + "epoch": 97.74806800618238, + "grad_norm": 0.5872287750244141, + "learning_rate": 5.1633777918201346e-06, + "loss": 0.2634, + "num_input_tokens_seen": 48370896, + "step": 31670 + }, + { + "epoch": 97.76352395672333, + "grad_norm": 0.6550438404083252, + "learning_rate": 5.157404247607625e-06, + "loss": 0.2414, + "num_input_tokens_seen": 48378992, + "step": 31675 + }, + { + "epoch": 97.7789799072643, + "grad_norm": 0.5046446919441223, + "learning_rate": 5.1514337633727454e-06, + "loss": 0.2141, + "num_input_tokens_seen": 48386896, + "step": 31680 + }, + { + "epoch": 97.79443585780525, + "grad_norm": 0.47292613983154297, + "learning_rate": 5.145466340036206e-06, + "loss": 0.2522, + "num_input_tokens_seen": 48394544, + "step": 31685 + }, + { + "epoch": 97.80989180834621, + "grad_norm": 0.5049138069152832, + "learning_rate": 5.139501978518274e-06, + "loss": 0.2498, + "num_input_tokens_seen": 48401744, + "step": 31690 + }, + { + "epoch": 97.82534775888718, + "grad_norm": 0.5385206341743469, + "learning_rate": 5.133540679738716e-06, + "loss": 0.1887, + "num_input_tokens_seen": 48409456, + "step": 31695 + }, + { + "epoch": 97.84080370942813, + "grad_norm": 0.2548004686832428, + "learning_rate": 5.127582444616838e-06, + "loss": 0.218, + "num_input_tokens_seen": 48416688, + "step": 31700 + }, + { + "epoch": 97.85625965996908, + "grad_norm": 0.48092150688171387, + "learning_rate": 5.121627274071486e-06, + "loss": 0.3168, + "num_input_tokens_seen": 48424816, + "step": 31705 + }, + { + "epoch": 97.87171561051005, + "grad_norm": 0.4170447587966919, + "learning_rate": 5.115675169021009e-06, + "loss": 0.2363, + "num_input_tokens_seen": 48432528, + "step": 31710 + }, + { + "epoch": 97.887171561051, + "grad_norm": 0.7244011759757996, + "learning_rate": 5.1097261303832994e-06, + "loss": 0.2632, + "num_input_tokens_seen": 48439760, + "step": 31715 + }, + { + "epoch": 97.90262751159196, + "grad_norm": 0.48019513487815857, + "learning_rate": 5.103780159075788e-06, + "loss": 0.2316, + "num_input_tokens_seen": 48447856, + "step": 31720 + }, + { + "epoch": 97.91808346213293, + "grad_norm": 0.40673407912254333, + "learning_rate": 5.0978372560154e-06, + "loss": 0.2246, + "num_input_tokens_seen": 48455184, + "step": 31725 + }, + { + "epoch": 97.93353941267388, + "grad_norm": 0.3880172371864319, + "learning_rate": 5.091897422118619e-06, + "loss": 0.1851, + "num_input_tokens_seen": 48463056, + "step": 31730 + }, + { + "epoch": 97.94899536321483, + "grad_norm": 0.4102140963077545, + "learning_rate": 5.0859606583014305e-06, + "loss": 0.2567, + "num_input_tokens_seen": 48470544, + "step": 31735 + }, + { + "epoch": 97.9644513137558, + "grad_norm": 0.7407744526863098, + "learning_rate": 5.080026965479365e-06, + "loss": 0.2718, + "num_input_tokens_seen": 48478160, + "step": 31740 + }, + { + "epoch": 97.97990726429676, + "grad_norm": 0.3211164176464081, + "learning_rate": 5.074096344567475e-06, + "loss": 0.2888, + "num_input_tokens_seen": 48485840, + "step": 31745 + }, + { + "epoch": 97.99536321483771, + "grad_norm": 0.6602587103843689, + "learning_rate": 5.0681687964803294e-06, + "loss": 0.2507, + "num_input_tokens_seen": 48493488, + "step": 31750 + }, + { + "epoch": 98.00927357032458, + "grad_norm": 0.4041113555431366, + "learning_rate": 5.06224432213204e-06, + "loss": 0.2042, + "num_input_tokens_seen": 48500160, + "step": 31755 + }, + { + "epoch": 98.02472952086553, + "grad_norm": 0.6388854384422302, + "learning_rate": 5.056322922436224e-06, + "loss": 0.2137, + "num_input_tokens_seen": 48508160, + "step": 31760 + }, + { + "epoch": 98.04018547140649, + "grad_norm": 0.50752854347229, + "learning_rate": 5.0504045983060465e-06, + "loss": 0.3297, + "num_input_tokens_seen": 48516448, + "step": 31765 + }, + { + "epoch": 98.05564142194746, + "grad_norm": 0.720289945602417, + "learning_rate": 5.044489350654183e-06, + "loss": 0.2669, + "num_input_tokens_seen": 48524192, + "step": 31770 + }, + { + "epoch": 98.07109737248841, + "grad_norm": 0.630598247051239, + "learning_rate": 5.038577180392831e-06, + "loss": 0.2302, + "num_input_tokens_seen": 48531872, + "step": 31775 + }, + { + "epoch": 98.08655332302936, + "grad_norm": 0.525938093662262, + "learning_rate": 5.032668088433729e-06, + "loss": 0.2467, + "num_input_tokens_seen": 48539104, + "step": 31780 + }, + { + "epoch": 98.10200927357033, + "grad_norm": 0.4835854470729828, + "learning_rate": 5.02676207568814e-06, + "loss": 0.2354, + "num_input_tokens_seen": 48546784, + "step": 31785 + }, + { + "epoch": 98.11746522411129, + "grad_norm": 0.44384971261024475, + "learning_rate": 5.02085914306683e-06, + "loss": 0.2216, + "num_input_tokens_seen": 48554528, + "step": 31790 + }, + { + "epoch": 98.13292117465224, + "grad_norm": 0.43111202120780945, + "learning_rate": 5.014959291480123e-06, + "loss": 0.2592, + "num_input_tokens_seen": 48562528, + "step": 31795 + }, + { + "epoch": 98.14837712519319, + "grad_norm": 0.4137697219848633, + "learning_rate": 5.009062521837835e-06, + "loss": 0.2414, + "num_input_tokens_seen": 48569984, + "step": 31800 + }, + { + "epoch": 98.14837712519319, + "eval_loss": 0.3015042841434479, + "eval_runtime": 6.3184, + "eval_samples_per_second": 91.003, + "eval_steps_per_second": 22.79, + "num_input_tokens_seen": 48569984, + "step": 31800 + }, + { + "epoch": 98.16383307573416, + "grad_norm": 0.3961060047149658, + "learning_rate": 5.003168835049324e-06, + "loss": 0.3002, + "num_input_tokens_seen": 48577408, + "step": 31805 + }, + { + "epoch": 98.17928902627511, + "grad_norm": 0.3907219469547272, + "learning_rate": 4.997278232023483e-06, + "loss": 0.2515, + "num_input_tokens_seen": 48585184, + "step": 31810 + }, + { + "epoch": 98.19474497681607, + "grad_norm": 0.516057550907135, + "learning_rate": 4.9913907136687036e-06, + "loss": 0.2837, + "num_input_tokens_seen": 48593312, + "step": 31815 + }, + { + "epoch": 98.21020092735704, + "grad_norm": 0.5178554654121399, + "learning_rate": 4.985506280892918e-06, + "loss": 0.2502, + "num_input_tokens_seen": 48601312, + "step": 31820 + }, + { + "epoch": 98.22565687789799, + "grad_norm": 0.5556032061576843, + "learning_rate": 4.979624934603589e-06, + "loss": 0.2776, + "num_input_tokens_seen": 48609120, + "step": 31825 + }, + { + "epoch": 98.24111282843894, + "grad_norm": 0.43636423349380493, + "learning_rate": 4.97374667570768e-06, + "loss": 0.25, + "num_input_tokens_seen": 48616832, + "step": 31830 + }, + { + "epoch": 98.25656877897991, + "grad_norm": 0.47047290205955505, + "learning_rate": 4.967871505111704e-06, + "loss": 0.2759, + "num_input_tokens_seen": 48624384, + "step": 31835 + }, + { + "epoch": 98.27202472952087, + "grad_norm": 0.3485344648361206, + "learning_rate": 4.961999423721686e-06, + "loss": 0.2568, + "num_input_tokens_seen": 48631776, + "step": 31840 + }, + { + "epoch": 98.28748068006182, + "grad_norm": 0.5323132276535034, + "learning_rate": 4.956130432443159e-06, + "loss": 0.219, + "num_input_tokens_seen": 48639680, + "step": 31845 + }, + { + "epoch": 98.30293663060279, + "grad_norm": 0.27397722005844116, + "learning_rate": 4.950264532181215e-06, + "loss": 0.2559, + "num_input_tokens_seen": 48647520, + "step": 31850 + }, + { + "epoch": 98.31839258114374, + "grad_norm": 0.7602635622024536, + "learning_rate": 4.944401723840433e-06, + "loss": 0.2155, + "num_input_tokens_seen": 48655680, + "step": 31855 + }, + { + "epoch": 98.3338485316847, + "grad_norm": 0.3112543821334839, + "learning_rate": 4.938542008324942e-06, + "loss": 0.2551, + "num_input_tokens_seen": 48663904, + "step": 31860 + }, + { + "epoch": 98.34930448222566, + "grad_norm": 0.8395365476608276, + "learning_rate": 4.9326853865383855e-06, + "loss": 0.3286, + "num_input_tokens_seen": 48671136, + "step": 31865 + }, + { + "epoch": 98.36476043276662, + "grad_norm": 0.5721968412399292, + "learning_rate": 4.926831859383918e-06, + "loss": 0.2227, + "num_input_tokens_seen": 48678560, + "step": 31870 + }, + { + "epoch": 98.38021638330757, + "grad_norm": 0.3142179548740387, + "learning_rate": 4.92098142776424e-06, + "loss": 0.2327, + "num_input_tokens_seen": 48686304, + "step": 31875 + }, + { + "epoch": 98.39567233384854, + "grad_norm": 0.6700940132141113, + "learning_rate": 4.91513409258155e-06, + "loss": 0.2375, + "num_input_tokens_seen": 48693952, + "step": 31880 + }, + { + "epoch": 98.41112828438949, + "grad_norm": 0.7944446206092834, + "learning_rate": 4.909289854737581e-06, + "loss": 0.2641, + "num_input_tokens_seen": 48701376, + "step": 31885 + }, + { + "epoch": 98.42658423493044, + "grad_norm": 0.5724947452545166, + "learning_rate": 4.903448715133602e-06, + "loss": 0.3129, + "num_input_tokens_seen": 48709024, + "step": 31890 + }, + { + "epoch": 98.44204018547141, + "grad_norm": 0.6012621521949768, + "learning_rate": 4.897610674670372e-06, + "loss": 0.2248, + "num_input_tokens_seen": 48716608, + "step": 31895 + }, + { + "epoch": 98.45749613601237, + "grad_norm": 0.8815328478813171, + "learning_rate": 4.8917757342482e-06, + "loss": 0.2697, + "num_input_tokens_seen": 48723968, + "step": 31900 + }, + { + "epoch": 98.47295208655332, + "grad_norm": 0.5159397721290588, + "learning_rate": 4.885943894766909e-06, + "loss": 0.2689, + "num_input_tokens_seen": 48731392, + "step": 31905 + }, + { + "epoch": 98.48840803709429, + "grad_norm": 0.4032024145126343, + "learning_rate": 4.880115157125842e-06, + "loss": 0.2512, + "num_input_tokens_seen": 48739168, + "step": 31910 + }, + { + "epoch": 98.50386398763524, + "grad_norm": 0.5066783428192139, + "learning_rate": 4.874289522223857e-06, + "loss": 0.1964, + "num_input_tokens_seen": 48746752, + "step": 31915 + }, + { + "epoch": 98.5193199381762, + "grad_norm": 0.5777072906494141, + "learning_rate": 4.868466990959339e-06, + "loss": 0.2252, + "num_input_tokens_seen": 48754432, + "step": 31920 + }, + { + "epoch": 98.53477588871715, + "grad_norm": 0.43663039803504944, + "learning_rate": 4.8626475642301964e-06, + "loss": 0.1895, + "num_input_tokens_seen": 48762176, + "step": 31925 + }, + { + "epoch": 98.55023183925812, + "grad_norm": 0.5042808055877686, + "learning_rate": 4.856831242933871e-06, + "loss": 0.2087, + "num_input_tokens_seen": 48769888, + "step": 31930 + }, + { + "epoch": 98.56568778979907, + "grad_norm": 0.6709396839141846, + "learning_rate": 4.851018027967294e-06, + "loss": 0.2548, + "num_input_tokens_seen": 48777568, + "step": 31935 + }, + { + "epoch": 98.58114374034002, + "grad_norm": 0.4875859320163727, + "learning_rate": 4.845207920226946e-06, + "loss": 0.2288, + "num_input_tokens_seen": 48785472, + "step": 31940 + }, + { + "epoch": 98.59659969088099, + "grad_norm": 0.48755788803100586, + "learning_rate": 4.839400920608825e-06, + "loss": 0.2154, + "num_input_tokens_seen": 48793184, + "step": 31945 + }, + { + "epoch": 98.61205564142195, + "grad_norm": 0.4776366353034973, + "learning_rate": 4.83359703000843e-06, + "loss": 0.2031, + "num_input_tokens_seen": 48800096, + "step": 31950 + }, + { + "epoch": 98.6275115919629, + "grad_norm": 0.745410680770874, + "learning_rate": 4.827796249320804e-06, + "loss": 0.2443, + "num_input_tokens_seen": 48807744, + "step": 31955 + }, + { + "epoch": 98.64296754250387, + "grad_norm": 0.45470723509788513, + "learning_rate": 4.82199857944049e-06, + "loss": 0.2339, + "num_input_tokens_seen": 48815328, + "step": 31960 + }, + { + "epoch": 98.65842349304482, + "grad_norm": 0.5070059895515442, + "learning_rate": 4.8162040212615695e-06, + "loss": 0.2024, + "num_input_tokens_seen": 48822432, + "step": 31965 + }, + { + "epoch": 98.67387944358578, + "grad_norm": 0.31507211923599243, + "learning_rate": 4.810412575677639e-06, + "loss": 0.2589, + "num_input_tokens_seen": 48830592, + "step": 31970 + }, + { + "epoch": 98.68933539412674, + "grad_norm": 0.9105979204177856, + "learning_rate": 4.804624243581801e-06, + "loss": 0.297, + "num_input_tokens_seen": 48837248, + "step": 31975 + }, + { + "epoch": 98.7047913446677, + "grad_norm": 0.39924535155296326, + "learning_rate": 4.798839025866703e-06, + "loss": 0.2391, + "num_input_tokens_seen": 48844704, + "step": 31980 + }, + { + "epoch": 98.72024729520865, + "grad_norm": 0.5671884417533875, + "learning_rate": 4.793056923424491e-06, + "loss": 0.293, + "num_input_tokens_seen": 48851968, + "step": 31985 + }, + { + "epoch": 98.73570324574962, + "grad_norm": 0.7524290680885315, + "learning_rate": 4.78727793714683e-06, + "loss": 0.2337, + "num_input_tokens_seen": 48859104, + "step": 31990 + }, + { + "epoch": 98.75115919629057, + "grad_norm": 0.3891954720020294, + "learning_rate": 4.7815020679249285e-06, + "loss": 0.2054, + "num_input_tokens_seen": 48866528, + "step": 31995 + }, + { + "epoch": 98.76661514683153, + "grad_norm": 0.4137503206729889, + "learning_rate": 4.775729316649483e-06, + "loss": 0.2337, + "num_input_tokens_seen": 48874016, + "step": 32000 + }, + { + "epoch": 98.76661514683153, + "eval_loss": 0.30133330821990967, + "eval_runtime": 6.3936, + "eval_samples_per_second": 89.934, + "eval_steps_per_second": 22.523, + "num_input_tokens_seen": 48874016, + "step": 32000 + }, + { + "epoch": 98.7820710973725, + "grad_norm": 0.5736216306686401, + "learning_rate": 4.769959684210728e-06, + "loss": 0.2643, + "num_input_tokens_seen": 48881792, + "step": 32005 + }, + { + "epoch": 98.79752704791345, + "grad_norm": 0.48532217741012573, + "learning_rate": 4.764193171498426e-06, + "loss": 0.2421, + "num_input_tokens_seen": 48889984, + "step": 32010 + }, + { + "epoch": 98.8129829984544, + "grad_norm": 0.5794306397438049, + "learning_rate": 4.75842977940183e-06, + "loss": 0.2, + "num_input_tokens_seen": 48897408, + "step": 32015 + }, + { + "epoch": 98.82843894899537, + "grad_norm": 0.37071478366851807, + "learning_rate": 4.752669508809729e-06, + "loss": 0.2196, + "num_input_tokens_seen": 48905280, + "step": 32020 + }, + { + "epoch": 98.84389489953632, + "grad_norm": 0.3400077223777771, + "learning_rate": 4.746912360610445e-06, + "loss": 0.2303, + "num_input_tokens_seen": 48913344, + "step": 32025 + }, + { + "epoch": 98.85935085007728, + "grad_norm": 0.5101634860038757, + "learning_rate": 4.741158335691781e-06, + "loss": 0.2143, + "num_input_tokens_seen": 48921056, + "step": 32030 + }, + { + "epoch": 98.87480680061825, + "grad_norm": 0.4249114394187927, + "learning_rate": 4.7354074349410994e-06, + "loss": 0.2419, + "num_input_tokens_seen": 48928448, + "step": 32035 + }, + { + "epoch": 98.8902627511592, + "grad_norm": 0.3121612071990967, + "learning_rate": 4.729659659245245e-06, + "loss": 0.2754, + "num_input_tokens_seen": 48935648, + "step": 32040 + }, + { + "epoch": 98.90571870170015, + "grad_norm": 0.4061265289783478, + "learning_rate": 4.723915009490601e-06, + "loss": 0.258, + "num_input_tokens_seen": 48942816, + "step": 32045 + }, + { + "epoch": 98.9211746522411, + "grad_norm": 0.3975181579589844, + "learning_rate": 4.718173486563077e-06, + "loss": 0.2271, + "num_input_tokens_seen": 48950304, + "step": 32050 + }, + { + "epoch": 98.93663060278207, + "grad_norm": 0.3764186501502991, + "learning_rate": 4.71243509134808e-06, + "loss": 0.3019, + "num_input_tokens_seen": 48958048, + "step": 32055 + }, + { + "epoch": 98.95208655332303, + "grad_norm": 0.5447747111320496, + "learning_rate": 4.706699824730532e-06, + "loss": 0.2193, + "num_input_tokens_seen": 48965632, + "step": 32060 + }, + { + "epoch": 98.96754250386398, + "grad_norm": 0.38395437598228455, + "learning_rate": 4.700967687594901e-06, + "loss": 0.2197, + "num_input_tokens_seen": 48973632, + "step": 32065 + }, + { + "epoch": 98.98299845440495, + "grad_norm": 0.3922845721244812, + "learning_rate": 4.69523868082514e-06, + "loss": 0.2656, + "num_input_tokens_seen": 48981376, + "step": 32070 + }, + { + "epoch": 98.9984544049459, + "grad_norm": 0.6823888421058655, + "learning_rate": 4.689512805304747e-06, + "loss": 0.2908, + "num_input_tokens_seen": 48989312, + "step": 32075 + }, + { + "epoch": 99.01236476043276, + "grad_norm": 0.46701282262802124, + "learning_rate": 4.683790061916707e-06, + "loss": 0.1734, + "num_input_tokens_seen": 48996096, + "step": 32080 + }, + { + "epoch": 99.02782071097373, + "grad_norm": 0.6872003674507141, + "learning_rate": 4.678070451543551e-06, + "loss": 0.2214, + "num_input_tokens_seen": 49004416, + "step": 32085 + }, + { + "epoch": 99.04327666151468, + "grad_norm": 0.5488923788070679, + "learning_rate": 4.6723539750673204e-06, + "loss": 0.2425, + "num_input_tokens_seen": 49011936, + "step": 32090 + }, + { + "epoch": 99.05873261205564, + "grad_norm": 0.39110124111175537, + "learning_rate": 4.666640633369551e-06, + "loss": 0.266, + "num_input_tokens_seen": 49019008, + "step": 32095 + }, + { + "epoch": 99.0741885625966, + "grad_norm": 0.42686668038368225, + "learning_rate": 4.660930427331323e-06, + "loss": 0.2092, + "num_input_tokens_seen": 49026816, + "step": 32100 + }, + { + "epoch": 99.08964451313756, + "grad_norm": 0.7518803477287292, + "learning_rate": 4.6552233578332244e-06, + "loss": 0.2398, + "num_input_tokens_seen": 49034624, + "step": 32105 + }, + { + "epoch": 99.10510046367851, + "grad_norm": 0.34272095561027527, + "learning_rate": 4.649519425755347e-06, + "loss": 0.2115, + "num_input_tokens_seen": 49042496, + "step": 32110 + }, + { + "epoch": 99.12055641421948, + "grad_norm": 0.6227372288703918, + "learning_rate": 4.64381863197732e-06, + "loss": 0.224, + "num_input_tokens_seen": 49050016, + "step": 32115 + }, + { + "epoch": 99.13601236476043, + "grad_norm": 0.4444291889667511, + "learning_rate": 4.638120977378269e-06, + "loss": 0.2479, + "num_input_tokens_seen": 49057568, + "step": 32120 + }, + { + "epoch": 99.15146831530139, + "grad_norm": 0.4049142301082611, + "learning_rate": 4.632426462836848e-06, + "loss": 0.2682, + "num_input_tokens_seen": 49065344, + "step": 32125 + }, + { + "epoch": 99.16692426584235, + "grad_norm": 0.43961697816848755, + "learning_rate": 4.626735089231224e-06, + "loss": 0.2419, + "num_input_tokens_seen": 49073056, + "step": 32130 + }, + { + "epoch": 99.18238021638331, + "grad_norm": 0.5533018112182617, + "learning_rate": 4.621046857439068e-06, + "loss": 0.3174, + "num_input_tokens_seen": 49080416, + "step": 32135 + }, + { + "epoch": 99.19783616692426, + "grad_norm": 1.017263412475586, + "learning_rate": 4.615361768337587e-06, + "loss": 0.2451, + "num_input_tokens_seen": 49088064, + "step": 32140 + }, + { + "epoch": 99.21329211746523, + "grad_norm": 0.4387090802192688, + "learning_rate": 4.6096798228034946e-06, + "loss": 0.2728, + "num_input_tokens_seen": 49096352, + "step": 32145 + }, + { + "epoch": 99.22874806800618, + "grad_norm": 0.49978718161582947, + "learning_rate": 4.604001021713008e-06, + "loss": 0.2271, + "num_input_tokens_seen": 49104064, + "step": 32150 + }, + { + "epoch": 99.24420401854714, + "grad_norm": 0.6576821208000183, + "learning_rate": 4.598325365941883e-06, + "loss": 0.2313, + "num_input_tokens_seen": 49111648, + "step": 32155 + }, + { + "epoch": 99.2596599690881, + "grad_norm": 0.5321937203407288, + "learning_rate": 4.5926528563653645e-06, + "loss": 0.2476, + "num_input_tokens_seen": 49119168, + "step": 32160 + }, + { + "epoch": 99.27511591962906, + "grad_norm": 0.7262979745864868, + "learning_rate": 4.5869834938582295e-06, + "loss": 0.2247, + "num_input_tokens_seen": 49126688, + "step": 32165 + }, + { + "epoch": 99.29057187017001, + "grad_norm": 0.3570484519004822, + "learning_rate": 4.581317279294772e-06, + "loss": 0.2827, + "num_input_tokens_seen": 49134368, + "step": 32170 + }, + { + "epoch": 99.30602782071098, + "grad_norm": 0.4119720458984375, + "learning_rate": 4.57565421354878e-06, + "loss": 0.2346, + "num_input_tokens_seen": 49142080, + "step": 32175 + }, + { + "epoch": 99.32148377125193, + "grad_norm": 0.3906199336051941, + "learning_rate": 4.569994297493579e-06, + "loss": 0.2557, + "num_input_tokens_seen": 49149728, + "step": 32180 + }, + { + "epoch": 99.33693972179289, + "grad_norm": 0.44886597990989685, + "learning_rate": 4.564337532002002e-06, + "loss": 0.2365, + "num_input_tokens_seen": 49157792, + "step": 32185 + }, + { + "epoch": 99.35239567233384, + "grad_norm": 0.43962833285331726, + "learning_rate": 4.55868391794638e-06, + "loss": 0.2338, + "num_input_tokens_seen": 49165536, + "step": 32190 + }, + { + "epoch": 99.36785162287481, + "grad_norm": 0.964920163154602, + "learning_rate": 4.553033456198588e-06, + "loss": 0.2247, + "num_input_tokens_seen": 49173376, + "step": 32195 + }, + { + "epoch": 99.38330757341576, + "grad_norm": 0.4547809660434723, + "learning_rate": 4.54738614762999e-06, + "loss": 0.2581, + "num_input_tokens_seen": 49181056, + "step": 32200 + }, + { + "epoch": 99.38330757341576, + "eval_loss": 0.30226659774780273, + "eval_runtime": 6.2801, + "eval_samples_per_second": 91.559, + "eval_steps_per_second": 22.93, + "num_input_tokens_seen": 49181056, + "step": 32200 + }, + { + "epoch": 99.39876352395672, + "grad_norm": 0.935551643371582, + "learning_rate": 4.541741993111465e-06, + "loss": 0.227, + "num_input_tokens_seen": 49188992, + "step": 32205 + }, + { + "epoch": 99.41421947449768, + "grad_norm": 0.6031968593597412, + "learning_rate": 4.536100993513423e-06, + "loss": 0.2401, + "num_input_tokens_seen": 49197184, + "step": 32210 + }, + { + "epoch": 99.42967542503864, + "grad_norm": 0.5280326008796692, + "learning_rate": 4.530463149705768e-06, + "loss": 0.2515, + "num_input_tokens_seen": 49204800, + "step": 32215 + }, + { + "epoch": 99.44513137557959, + "grad_norm": 0.6793011426925659, + "learning_rate": 4.524828462557934e-06, + "loss": 0.2552, + "num_input_tokens_seen": 49212864, + "step": 32220 + }, + { + "epoch": 99.46058732612056, + "grad_norm": 1.1080665588378906, + "learning_rate": 4.5191969329388625e-06, + "loss": 0.203, + "num_input_tokens_seen": 49220288, + "step": 32225 + }, + { + "epoch": 99.47604327666151, + "grad_norm": 0.41602757573127747, + "learning_rate": 4.5135685617169965e-06, + "loss": 0.2147, + "num_input_tokens_seen": 49228384, + "step": 32230 + }, + { + "epoch": 99.49149922720247, + "grad_norm": 0.6804381012916565, + "learning_rate": 4.507943349760313e-06, + "loss": 0.3224, + "num_input_tokens_seen": 49235424, + "step": 32235 + }, + { + "epoch": 99.50695517774344, + "grad_norm": 0.43347039818763733, + "learning_rate": 4.502321297936277e-06, + "loss": 0.2116, + "num_input_tokens_seen": 49242880, + "step": 32240 + }, + { + "epoch": 99.52241112828439, + "grad_norm": 0.454521507024765, + "learning_rate": 4.496702407111888e-06, + "loss": 0.2887, + "num_input_tokens_seen": 49250208, + "step": 32245 + }, + { + "epoch": 99.53786707882534, + "grad_norm": 0.5076784491539001, + "learning_rate": 4.491086678153653e-06, + "loss": 0.2313, + "num_input_tokens_seen": 49257792, + "step": 32250 + }, + { + "epoch": 99.55332302936631, + "grad_norm": 0.4087354242801666, + "learning_rate": 4.485474111927579e-06, + "loss": 0.2004, + "num_input_tokens_seen": 49264896, + "step": 32255 + }, + { + "epoch": 99.56877897990726, + "grad_norm": 0.41328802704811096, + "learning_rate": 4.479864709299197e-06, + "loss": 0.2381, + "num_input_tokens_seen": 49272992, + "step": 32260 + }, + { + "epoch": 99.58423493044822, + "grad_norm": 0.5675731301307678, + "learning_rate": 4.474258471133555e-06, + "loss": 0.2958, + "num_input_tokens_seen": 49280352, + "step": 32265 + }, + { + "epoch": 99.59969088098919, + "grad_norm": 0.4650726318359375, + "learning_rate": 4.4686553982952014e-06, + "loss": 0.2486, + "num_input_tokens_seen": 49288480, + "step": 32270 + }, + { + "epoch": 99.61514683153014, + "grad_norm": 0.342403769493103, + "learning_rate": 4.463055491648191e-06, + "loss": 0.2431, + "num_input_tokens_seen": 49296416, + "step": 32275 + }, + { + "epoch": 99.6306027820711, + "grad_norm": 0.38672083616256714, + "learning_rate": 4.457458752056112e-06, + "loss": 0.2435, + "num_input_tokens_seen": 49303392, + "step": 32280 + }, + { + "epoch": 99.64605873261206, + "grad_norm": 0.5178338289260864, + "learning_rate": 4.451865180382042e-06, + "loss": 0.2636, + "num_input_tokens_seen": 49310880, + "step": 32285 + }, + { + "epoch": 99.66151468315302, + "grad_norm": 0.74361252784729, + "learning_rate": 4.4462747774885936e-06, + "loss": 0.2171, + "num_input_tokens_seen": 49318528, + "step": 32290 + }, + { + "epoch": 99.67697063369397, + "grad_norm": 0.4103875756263733, + "learning_rate": 4.440687544237859e-06, + "loss": 0.2549, + "num_input_tokens_seen": 49326464, + "step": 32295 + }, + { + "epoch": 99.69242658423494, + "grad_norm": 0.5576472282409668, + "learning_rate": 4.435103481491471e-06, + "loss": 0.1969, + "num_input_tokens_seen": 49334016, + "step": 32300 + }, + { + "epoch": 99.70788253477589, + "grad_norm": 0.5278209447860718, + "learning_rate": 4.429522590110569e-06, + "loss": 0.2743, + "num_input_tokens_seen": 49341920, + "step": 32305 + }, + { + "epoch": 99.72333848531684, + "grad_norm": 0.2832231819629669, + "learning_rate": 4.423944870955779e-06, + "loss": 0.2574, + "num_input_tokens_seen": 49349344, + "step": 32310 + }, + { + "epoch": 99.7387944358578, + "grad_norm": 0.7213377356529236, + "learning_rate": 4.418370324887272e-06, + "loss": 0.2772, + "num_input_tokens_seen": 49356832, + "step": 32315 + }, + { + "epoch": 99.75425038639877, + "grad_norm": 0.4088996648788452, + "learning_rate": 4.412798952764699e-06, + "loss": 0.2877, + "num_input_tokens_seen": 49364480, + "step": 32320 + }, + { + "epoch": 99.76970633693972, + "grad_norm": 0.6414746046066284, + "learning_rate": 4.407230755447245e-06, + "loss": 0.2655, + "num_input_tokens_seen": 49371904, + "step": 32325 + }, + { + "epoch": 99.78516228748067, + "grad_norm": 0.4629502594470978, + "learning_rate": 4.401665733793598e-06, + "loss": 0.2355, + "num_input_tokens_seen": 49378976, + "step": 32330 + }, + { + "epoch": 99.80061823802164, + "grad_norm": 0.7647600769996643, + "learning_rate": 4.3961038886619425e-06, + "loss": 0.3011, + "num_input_tokens_seen": 49386752, + "step": 32335 + }, + { + "epoch": 99.8160741885626, + "grad_norm": 0.4745276868343353, + "learning_rate": 4.39054522091e-06, + "loss": 0.2154, + "num_input_tokens_seen": 49393824, + "step": 32340 + }, + { + "epoch": 99.83153013910355, + "grad_norm": 0.41544705629348755, + "learning_rate": 4.384989731394979e-06, + "loss": 0.2754, + "num_input_tokens_seen": 49401248, + "step": 32345 + }, + { + "epoch": 99.84698608964452, + "grad_norm": 0.5558002591133118, + "learning_rate": 4.379437420973598e-06, + "loss": 0.2482, + "num_input_tokens_seen": 49409056, + "step": 32350 + }, + { + "epoch": 99.86244204018547, + "grad_norm": 0.300307035446167, + "learning_rate": 4.373888290502107e-06, + "loss": 0.2057, + "num_input_tokens_seen": 49416480, + "step": 32355 + }, + { + "epoch": 99.87789799072642, + "grad_norm": 0.4394280016422272, + "learning_rate": 4.36834234083624e-06, + "loss": 0.3164, + "num_input_tokens_seen": 49424064, + "step": 32360 + }, + { + "epoch": 99.89335394126739, + "grad_norm": 0.8721095323562622, + "learning_rate": 4.362799572831258e-06, + "loss": 0.1981, + "num_input_tokens_seen": 49432064, + "step": 32365 + }, + { + "epoch": 99.90880989180835, + "grad_norm": 0.48538294434547424, + "learning_rate": 4.35725998734193e-06, + "loss": 0.2353, + "num_input_tokens_seen": 49439712, + "step": 32370 + }, + { + "epoch": 99.9242658423493, + "grad_norm": 0.4632115364074707, + "learning_rate": 4.3517235852225195e-06, + "loss": 0.2842, + "num_input_tokens_seen": 49447296, + "step": 32375 + }, + { + "epoch": 99.93972179289027, + "grad_norm": 0.3194144666194916, + "learning_rate": 4.346190367326822e-06, + "loss": 0.2538, + "num_input_tokens_seen": 49454752, + "step": 32380 + }, + { + "epoch": 99.95517774343122, + "grad_norm": 0.43149155378341675, + "learning_rate": 4.340660334508115e-06, + "loss": 0.2255, + "num_input_tokens_seen": 49462272, + "step": 32385 + }, + { + "epoch": 99.97063369397218, + "grad_norm": 0.564211905002594, + "learning_rate": 4.335133487619206e-06, + "loss": 0.2606, + "num_input_tokens_seen": 49469792, + "step": 32390 + }, + { + "epoch": 99.98608964451314, + "grad_norm": 0.4616422653198242, + "learning_rate": 4.329609827512409e-06, + "loss": 0.2597, + "num_input_tokens_seen": 49478176, + "step": 32395 + }, + { + "epoch": 100.0, + "grad_norm": 0.5184189081192017, + "learning_rate": 4.324089355039531e-06, + "loss": 0.2602, + "num_input_tokens_seen": 49485120, + "step": 32400 + }, + { + "epoch": 100.0, + "eval_loss": 0.3036465048789978, + "eval_runtime": 6.2804, + "eval_samples_per_second": 91.555, + "eval_steps_per_second": 22.929, + "num_input_tokens_seen": 49485120, + "step": 32400 + }, + { + "epoch": 100.01545595054095, + "grad_norm": 0.5928211808204651, + "learning_rate": 4.3185720710519075e-06, + "loss": 0.2471, + "num_input_tokens_seen": 49492928, + "step": 32405 + }, + { + "epoch": 100.03091190108192, + "grad_norm": 0.31383439898490906, + "learning_rate": 4.3130579764003724e-06, + "loss": 0.2034, + "num_input_tokens_seen": 49500544, + "step": 32410 + }, + { + "epoch": 100.04636785162288, + "grad_norm": 0.5440360903739929, + "learning_rate": 4.307547071935267e-06, + "loss": 0.2632, + "num_input_tokens_seen": 49508448, + "step": 32415 + }, + { + "epoch": 100.06182380216383, + "grad_norm": 0.45927566289901733, + "learning_rate": 4.302039358506435e-06, + "loss": 0.2357, + "num_input_tokens_seen": 49515776, + "step": 32420 + }, + { + "epoch": 100.0772797527048, + "grad_norm": 0.39449357986450195, + "learning_rate": 4.296534836963245e-06, + "loss": 0.2201, + "num_input_tokens_seen": 49523328, + "step": 32425 + }, + { + "epoch": 100.09273570324575, + "grad_norm": 0.576935887336731, + "learning_rate": 4.291033508154555e-06, + "loss": 0.3165, + "num_input_tokens_seen": 49531008, + "step": 32430 + }, + { + "epoch": 100.1081916537867, + "grad_norm": 0.30052152276039124, + "learning_rate": 4.285535372928748e-06, + "loss": 0.266, + "num_input_tokens_seen": 49538944, + "step": 32435 + }, + { + "epoch": 100.12364760432767, + "grad_norm": 0.6756371259689331, + "learning_rate": 4.280040432133695e-06, + "loss": 0.2429, + "num_input_tokens_seen": 49546176, + "step": 32440 + }, + { + "epoch": 100.13910355486863, + "grad_norm": 0.5805705189704895, + "learning_rate": 4.274548686616789e-06, + "loss": 0.2417, + "num_input_tokens_seen": 49554496, + "step": 32445 + }, + { + "epoch": 100.15455950540958, + "grad_norm": 0.36327847838401794, + "learning_rate": 4.2690601372249364e-06, + "loss": 0.2017, + "num_input_tokens_seen": 49562240, + "step": 32450 + }, + { + "epoch": 100.17001545595055, + "grad_norm": 0.4734560251235962, + "learning_rate": 4.263574784804525e-06, + "loss": 0.2064, + "num_input_tokens_seen": 49569888, + "step": 32455 + }, + { + "epoch": 100.1854714064915, + "grad_norm": 0.4410291612148285, + "learning_rate": 4.258092630201479e-06, + "loss": 0.2654, + "num_input_tokens_seen": 49577216, + "step": 32460 + }, + { + "epoch": 100.20092735703246, + "grad_norm": 0.5690504312515259, + "learning_rate": 4.252613674261202e-06, + "loss": 0.2059, + "num_input_tokens_seen": 49584320, + "step": 32465 + }, + { + "epoch": 100.21638330757341, + "grad_norm": 0.569034993648529, + "learning_rate": 4.2471379178286224e-06, + "loss": 0.272, + "num_input_tokens_seen": 49592128, + "step": 32470 + }, + { + "epoch": 100.23183925811438, + "grad_norm": 0.5467929244041443, + "learning_rate": 4.241665361748181e-06, + "loss": 0.2632, + "num_input_tokens_seen": 49599808, + "step": 32475 + }, + { + "epoch": 100.24729520865533, + "grad_norm": 0.460746169090271, + "learning_rate": 4.2361960068637994e-06, + "loss": 0.2738, + "num_input_tokens_seen": 49607872, + "step": 32480 + }, + { + "epoch": 100.26275115919628, + "grad_norm": 0.45665907859802246, + "learning_rate": 4.230729854018933e-06, + "loss": 0.2475, + "num_input_tokens_seen": 49615392, + "step": 32485 + }, + { + "epoch": 100.27820710973725, + "grad_norm": 0.807305634021759, + "learning_rate": 4.225266904056521e-06, + "loss": 0.2606, + "num_input_tokens_seen": 49623040, + "step": 32490 + }, + { + "epoch": 100.2936630602782, + "grad_norm": 0.7013992071151733, + "learning_rate": 4.21980715781903e-06, + "loss": 0.2298, + "num_input_tokens_seen": 49630784, + "step": 32495 + }, + { + "epoch": 100.30911901081916, + "grad_norm": 0.33515727519989014, + "learning_rate": 4.214350616148416e-06, + "loss": 0.2423, + "num_input_tokens_seen": 49638112, + "step": 32500 + }, + { + "epoch": 100.32457496136013, + "grad_norm": 0.3171323537826538, + "learning_rate": 4.20889727988614e-06, + "loss": 0.1986, + "num_input_tokens_seen": 49645056, + "step": 32505 + }, + { + "epoch": 100.34003091190108, + "grad_norm": 0.5294604897499084, + "learning_rate": 4.20344714987318e-06, + "loss": 0.2456, + "num_input_tokens_seen": 49652960, + "step": 32510 + }, + { + "epoch": 100.35548686244204, + "grad_norm": 0.5784464478492737, + "learning_rate": 4.198000226950022e-06, + "loss": 0.2304, + "num_input_tokens_seen": 49660992, + "step": 32515 + }, + { + "epoch": 100.370942812983, + "grad_norm": 0.7640151977539062, + "learning_rate": 4.192556511956635e-06, + "loss": 0.2627, + "num_input_tokens_seen": 49668640, + "step": 32520 + }, + { + "epoch": 100.38639876352396, + "grad_norm": 0.5342705249786377, + "learning_rate": 4.18711600573252e-06, + "loss": 0.3141, + "num_input_tokens_seen": 49676608, + "step": 32525 + }, + { + "epoch": 100.40185471406491, + "grad_norm": 0.4603467583656311, + "learning_rate": 4.181678709116671e-06, + "loss": 0.2784, + "num_input_tokens_seen": 49684512, + "step": 32530 + }, + { + "epoch": 100.41731066460588, + "grad_norm": 0.5671128630638123, + "learning_rate": 4.1762446229475785e-06, + "loss": 0.2688, + "num_input_tokens_seen": 49691872, + "step": 32535 + }, + { + "epoch": 100.43276661514683, + "grad_norm": 0.43005889654159546, + "learning_rate": 4.17081374806326e-06, + "loss": 0.2217, + "num_input_tokens_seen": 49699424, + "step": 32540 + }, + { + "epoch": 100.44822256568779, + "grad_norm": 0.5572022795677185, + "learning_rate": 4.165386085301212e-06, + "loss": 0.3099, + "num_input_tokens_seen": 49707232, + "step": 32545 + }, + { + "epoch": 100.46367851622875, + "grad_norm": 0.36802077293395996, + "learning_rate": 4.1599616354984525e-06, + "loss": 0.2396, + "num_input_tokens_seen": 49714848, + "step": 32550 + }, + { + "epoch": 100.47913446676971, + "grad_norm": 0.4500170052051544, + "learning_rate": 4.154540399491508e-06, + "loss": 0.2262, + "num_input_tokens_seen": 49722400, + "step": 32555 + }, + { + "epoch": 100.49459041731066, + "grad_norm": 0.3702827990055084, + "learning_rate": 4.149122378116394e-06, + "loss": 0.2455, + "num_input_tokens_seen": 49729888, + "step": 32560 + }, + { + "epoch": 100.51004636785163, + "grad_norm": 0.2904905378818512, + "learning_rate": 4.14370757220863e-06, + "loss": 0.2472, + "num_input_tokens_seen": 49737120, + "step": 32565 + }, + { + "epoch": 100.52550231839258, + "grad_norm": 0.7020426988601685, + "learning_rate": 4.138295982603263e-06, + "loss": 0.2044, + "num_input_tokens_seen": 49745472, + "step": 32570 + }, + { + "epoch": 100.54095826893354, + "grad_norm": 0.396381139755249, + "learning_rate": 4.132887610134814e-06, + "loss": 0.2228, + "num_input_tokens_seen": 49752832, + "step": 32575 + }, + { + "epoch": 100.5564142194745, + "grad_norm": 0.5362563133239746, + "learning_rate": 4.127482455637335e-06, + "loss": 0.2826, + "num_input_tokens_seen": 49760352, + "step": 32580 + }, + { + "epoch": 100.57187017001546, + "grad_norm": 0.6700633764266968, + "learning_rate": 4.1220805199443545e-06, + "loss": 0.2977, + "num_input_tokens_seen": 49767808, + "step": 32585 + }, + { + "epoch": 100.58732612055641, + "grad_norm": 0.6537600755691528, + "learning_rate": 4.116681803888925e-06, + "loss": 0.2944, + "num_input_tokens_seen": 49775072, + "step": 32590 + }, + { + "epoch": 100.60278207109737, + "grad_norm": 0.37083443999290466, + "learning_rate": 4.111286308303605e-06, + "loss": 0.2085, + "num_input_tokens_seen": 49783040, + "step": 32595 + }, + { + "epoch": 100.61823802163833, + "grad_norm": 0.4289321303367615, + "learning_rate": 4.105894034020433e-06, + "loss": 0.2153, + "num_input_tokens_seen": 49790304, + "step": 32600 + }, + { + "epoch": 100.61823802163833, + "eval_loss": 0.3017272353172302, + "eval_runtime": 6.3291, + "eval_samples_per_second": 90.851, + "eval_steps_per_second": 22.752, + "num_input_tokens_seen": 49790304, + "step": 32600 + }, + { + "epoch": 100.63369397217929, + "grad_norm": 0.39949020743370056, + "learning_rate": 4.100504981870975e-06, + "loss": 0.1833, + "num_input_tokens_seen": 49798432, + "step": 32605 + }, + { + "epoch": 100.64914992272024, + "grad_norm": 0.6873458027839661, + "learning_rate": 4.0951191526862915e-06, + "loss": 0.2478, + "num_input_tokens_seen": 49805664, + "step": 32610 + }, + { + "epoch": 100.66460587326121, + "grad_norm": 0.3917701542377472, + "learning_rate": 4.089736547296938e-06, + "loss": 0.2432, + "num_input_tokens_seen": 49813056, + "step": 32615 + }, + { + "epoch": 100.68006182380216, + "grad_norm": 0.4574105441570282, + "learning_rate": 4.08435716653299e-06, + "loss": 0.2091, + "num_input_tokens_seen": 49819904, + "step": 32620 + }, + { + "epoch": 100.69551777434312, + "grad_norm": 0.5851573348045349, + "learning_rate": 4.0789810112240005e-06, + "loss": 0.2298, + "num_input_tokens_seen": 49827360, + "step": 32625 + }, + { + "epoch": 100.71097372488408, + "grad_norm": 0.535963773727417, + "learning_rate": 4.073608082199057e-06, + "loss": 0.262, + "num_input_tokens_seen": 49835456, + "step": 32630 + }, + { + "epoch": 100.72642967542504, + "grad_norm": 0.3967670798301697, + "learning_rate": 4.068238380286718e-06, + "loss": 0.3185, + "num_input_tokens_seen": 49843680, + "step": 32635 + }, + { + "epoch": 100.74188562596599, + "grad_norm": 0.4990377128124237, + "learning_rate": 4.062871906315072e-06, + "loss": 0.2579, + "num_input_tokens_seen": 49851296, + "step": 32640 + }, + { + "epoch": 100.75734157650696, + "grad_norm": 0.4614027738571167, + "learning_rate": 4.057508661111686e-06, + "loss": 0.2695, + "num_input_tokens_seen": 49859360, + "step": 32645 + }, + { + "epoch": 100.77279752704791, + "grad_norm": 0.6934947967529297, + "learning_rate": 4.052148645503648e-06, + "loss": 0.2511, + "num_input_tokens_seen": 49866944, + "step": 32650 + }, + { + "epoch": 100.78825347758887, + "grad_norm": 0.5230453610420227, + "learning_rate": 4.046791860317531e-06, + "loss": 0.2198, + "num_input_tokens_seen": 49874976, + "step": 32655 + }, + { + "epoch": 100.80370942812984, + "grad_norm": 0.32344475388526917, + "learning_rate": 4.041438306379431e-06, + "loss": 0.2528, + "num_input_tokens_seen": 49883200, + "step": 32660 + }, + { + "epoch": 100.81916537867079, + "grad_norm": 0.42894938588142395, + "learning_rate": 4.036087984514916e-06, + "loss": 0.2415, + "num_input_tokens_seen": 49891040, + "step": 32665 + }, + { + "epoch": 100.83462132921174, + "grad_norm": 0.48330190777778625, + "learning_rate": 4.030740895549084e-06, + "loss": 0.2359, + "num_input_tokens_seen": 49898592, + "step": 32670 + }, + { + "epoch": 100.85007727975271, + "grad_norm": 0.46479859948158264, + "learning_rate": 4.025397040306531e-06, + "loss": 0.2411, + "num_input_tokens_seen": 49905792, + "step": 32675 + }, + { + "epoch": 100.86553323029366, + "grad_norm": 0.6400618553161621, + "learning_rate": 4.0200564196113285e-06, + "loss": 0.2843, + "num_input_tokens_seen": 49912864, + "step": 32680 + }, + { + "epoch": 100.88098918083462, + "grad_norm": 0.5620918869972229, + "learning_rate": 4.014719034287079e-06, + "loss": 0.2565, + "num_input_tokens_seen": 49920640, + "step": 32685 + }, + { + "epoch": 100.89644513137559, + "grad_norm": 0.6549969911575317, + "learning_rate": 4.0093848851568775e-06, + "loss": 0.2376, + "num_input_tokens_seen": 49928640, + "step": 32690 + }, + { + "epoch": 100.91190108191654, + "grad_norm": 0.5267703533172607, + "learning_rate": 4.004053973043304e-06, + "loss": 0.1842, + "num_input_tokens_seen": 49936256, + "step": 32695 + }, + { + "epoch": 100.9273570324575, + "grad_norm": 0.4541674852371216, + "learning_rate": 3.998726298768465e-06, + "loss": 0.2323, + "num_input_tokens_seen": 49944672, + "step": 32700 + }, + { + "epoch": 100.94281298299846, + "grad_norm": 0.3998953700065613, + "learning_rate": 3.99340186315395e-06, + "loss": 0.2034, + "num_input_tokens_seen": 49952192, + "step": 32705 + }, + { + "epoch": 100.95826893353942, + "grad_norm": 0.41574662923812866, + "learning_rate": 3.988080667020849e-06, + "loss": 0.2272, + "num_input_tokens_seen": 49959808, + "step": 32710 + }, + { + "epoch": 100.97372488408037, + "grad_norm": 0.5759938955307007, + "learning_rate": 3.982762711189766e-06, + "loss": 0.3168, + "num_input_tokens_seen": 49967456, + "step": 32715 + }, + { + "epoch": 100.98918083462132, + "grad_norm": 0.506329357624054, + "learning_rate": 3.977447996480785e-06, + "loss": 0.201, + "num_input_tokens_seen": 49975232, + "step": 32720 + }, + { + "epoch": 101.0030911901082, + "grad_norm": 0.3452344834804535, + "learning_rate": 3.97213652371351e-06, + "loss": 0.2963, + "num_input_tokens_seen": 49981840, + "step": 32725 + }, + { + "epoch": 101.01854714064915, + "grad_norm": 0.5616006255149841, + "learning_rate": 3.966828293707042e-06, + "loss": 0.2414, + "num_input_tokens_seen": 49989840, + "step": 32730 + }, + { + "epoch": 101.03400309119012, + "grad_norm": 0.5161609053611755, + "learning_rate": 3.961523307279963e-06, + "loss": 0.1995, + "num_input_tokens_seen": 49997040, + "step": 32735 + }, + { + "epoch": 101.04945904173107, + "grad_norm": 0.4905989468097687, + "learning_rate": 3.956221565250382e-06, + "loss": 0.2286, + "num_input_tokens_seen": 50004784, + "step": 32740 + }, + { + "epoch": 101.06491499227202, + "grad_norm": 0.5659801363945007, + "learning_rate": 3.950923068435883e-06, + "loss": 0.1898, + "num_input_tokens_seen": 50012336, + "step": 32745 + }, + { + "epoch": 101.08037094281298, + "grad_norm": 0.7890764474868774, + "learning_rate": 3.945627817653566e-06, + "loss": 0.2543, + "num_input_tokens_seen": 50020400, + "step": 32750 + }, + { + "epoch": 101.09582689335394, + "grad_norm": 0.887534499168396, + "learning_rate": 3.9403358137200335e-06, + "loss": 0.263, + "num_input_tokens_seen": 50027824, + "step": 32755 + }, + { + "epoch": 101.1112828438949, + "grad_norm": 0.5214781165122986, + "learning_rate": 3.9350470574513605e-06, + "loss": 0.2215, + "num_input_tokens_seen": 50035248, + "step": 32760 + }, + { + "epoch": 101.12673879443585, + "grad_norm": 0.5671672224998474, + "learning_rate": 3.9297615496631525e-06, + "loss": 0.2568, + "num_input_tokens_seen": 50042480, + "step": 32765 + }, + { + "epoch": 101.14219474497682, + "grad_norm": 0.5655852556228638, + "learning_rate": 3.924479291170505e-06, + "loss": 0.2612, + "num_input_tokens_seen": 50049840, + "step": 32770 + }, + { + "epoch": 101.15765069551777, + "grad_norm": 0.444044828414917, + "learning_rate": 3.919200282788002e-06, + "loss": 0.2584, + "num_input_tokens_seen": 50058096, + "step": 32775 + }, + { + "epoch": 101.17310664605873, + "grad_norm": 0.3126656115055084, + "learning_rate": 3.913924525329726e-06, + "loss": 0.2621, + "num_input_tokens_seen": 50066384, + "step": 32780 + }, + { + "epoch": 101.1885625965997, + "grad_norm": 0.5896692872047424, + "learning_rate": 3.908652019609279e-06, + "loss": 0.2225, + "num_input_tokens_seen": 50073680, + "step": 32785 + }, + { + "epoch": 101.20401854714065, + "grad_norm": 0.46291548013687134, + "learning_rate": 3.9033827664397364e-06, + "loss": 0.259, + "num_input_tokens_seen": 50081936, + "step": 32790 + }, + { + "epoch": 101.2194744976816, + "grad_norm": 0.5629589557647705, + "learning_rate": 3.898116766633694e-06, + "loss": 0.229, + "num_input_tokens_seen": 50089648, + "step": 32795 + }, + { + "epoch": 101.23493044822257, + "grad_norm": 0.465371698141098, + "learning_rate": 3.8928540210032225e-06, + "loss": 0.2732, + "num_input_tokens_seen": 50097008, + "step": 32800 + }, + { + "epoch": 101.23493044822257, + "eval_loss": 0.3024393320083618, + "eval_runtime": 6.2729, + "eval_samples_per_second": 91.664, + "eval_steps_per_second": 22.956, + "num_input_tokens_seen": 50097008, + "step": 32800 + }, + { + "epoch": 101.25038639876352, + "grad_norm": 0.5247994661331177, + "learning_rate": 3.887594530359909e-06, + "loss": 0.2507, + "num_input_tokens_seen": 50104688, + "step": 32805 + }, + { + "epoch": 101.26584234930448, + "grad_norm": 0.5290411114692688, + "learning_rate": 3.88233829551484e-06, + "loss": 0.2856, + "num_input_tokens_seen": 50112336, + "step": 32810 + }, + { + "epoch": 101.28129829984545, + "grad_norm": 0.5641283988952637, + "learning_rate": 3.877085317278581e-06, + "loss": 0.2185, + "num_input_tokens_seen": 50119856, + "step": 32815 + }, + { + "epoch": 101.2967542503864, + "grad_norm": 0.41963064670562744, + "learning_rate": 3.87183559646122e-06, + "loss": 0.2196, + "num_input_tokens_seen": 50127184, + "step": 32820 + }, + { + "epoch": 101.31221020092735, + "grad_norm": 0.48484331369400024, + "learning_rate": 3.866589133872317e-06, + "loss": 0.2198, + "num_input_tokens_seen": 50135024, + "step": 32825 + }, + { + "epoch": 101.32766615146832, + "grad_norm": 0.8107137084007263, + "learning_rate": 3.861345930320948e-06, + "loss": 0.2703, + "num_input_tokens_seen": 50142064, + "step": 32830 + }, + { + "epoch": 101.34312210200927, + "grad_norm": 0.6031244397163391, + "learning_rate": 3.856105986615688e-06, + "loss": 0.2203, + "num_input_tokens_seen": 50149776, + "step": 32835 + }, + { + "epoch": 101.35857805255023, + "grad_norm": 0.5312060117721558, + "learning_rate": 3.850869303564589e-06, + "loss": 0.216, + "num_input_tokens_seen": 50157328, + "step": 32840 + }, + { + "epoch": 101.3740340030912, + "grad_norm": 0.4303360879421234, + "learning_rate": 3.845635881975226e-06, + "loss": 0.2169, + "num_input_tokens_seen": 50164592, + "step": 32845 + }, + { + "epoch": 101.38948995363215, + "grad_norm": 0.48135852813720703, + "learning_rate": 3.840405722654647e-06, + "loss": 0.2355, + "num_input_tokens_seen": 50171824, + "step": 32850 + }, + { + "epoch": 101.4049459041731, + "grad_norm": 0.32475876808166504, + "learning_rate": 3.835178826409419e-06, + "loss": 0.2336, + "num_input_tokens_seen": 50180432, + "step": 32855 + }, + { + "epoch": 101.42040185471407, + "grad_norm": 0.5166984796524048, + "learning_rate": 3.8299551940455895e-06, + "loss": 0.1904, + "num_input_tokens_seen": 50188240, + "step": 32860 + }, + { + "epoch": 101.43585780525503, + "grad_norm": 0.5604356527328491, + "learning_rate": 3.824734826368703e-06, + "loss": 0.269, + "num_input_tokens_seen": 50196048, + "step": 32865 + }, + { + "epoch": 101.45131375579598, + "grad_norm": 0.5661957859992981, + "learning_rate": 3.819517724183813e-06, + "loss": 0.222, + "num_input_tokens_seen": 50203472, + "step": 32870 + }, + { + "epoch": 101.46676970633693, + "grad_norm": 0.4830678701400757, + "learning_rate": 3.8143038882954648e-06, + "loss": 0.2633, + "num_input_tokens_seen": 50211280, + "step": 32875 + }, + { + "epoch": 101.4822256568779, + "grad_norm": 0.3860786259174347, + "learning_rate": 3.8090933195076867e-06, + "loss": 0.3186, + "num_input_tokens_seen": 50219152, + "step": 32880 + }, + { + "epoch": 101.49768160741885, + "grad_norm": 0.40262940526008606, + "learning_rate": 3.8038860186240198e-06, + "loss": 0.2554, + "num_input_tokens_seen": 50226768, + "step": 32885 + }, + { + "epoch": 101.51313755795981, + "grad_norm": 0.38150662183761597, + "learning_rate": 3.7986819864475026e-06, + "loss": 0.2224, + "num_input_tokens_seen": 50234416, + "step": 32890 + }, + { + "epoch": 101.52859350850078, + "grad_norm": 0.7064465284347534, + "learning_rate": 3.793481223780651e-06, + "loss": 0.264, + "num_input_tokens_seen": 50241968, + "step": 32895 + }, + { + "epoch": 101.54404945904173, + "grad_norm": 0.5002114176750183, + "learning_rate": 3.788283731425496e-06, + "loss": 0.2399, + "num_input_tokens_seen": 50250064, + "step": 32900 + }, + { + "epoch": 101.55950540958268, + "grad_norm": 0.4544620215892792, + "learning_rate": 3.7830895101835488e-06, + "loss": 0.2225, + "num_input_tokens_seen": 50257616, + "step": 32905 + }, + { + "epoch": 101.57496136012365, + "grad_norm": 0.3429126441478729, + "learning_rate": 3.7778985608558274e-06, + "loss": 0.2393, + "num_input_tokens_seen": 50265808, + "step": 32910 + }, + { + "epoch": 101.5904173106646, + "grad_norm": 0.34063974022865295, + "learning_rate": 3.7727108842428443e-06, + "loss": 0.2953, + "num_input_tokens_seen": 50273616, + "step": 32915 + }, + { + "epoch": 101.60587326120556, + "grad_norm": 0.4753231406211853, + "learning_rate": 3.7675264811446065e-06, + "loss": 0.2547, + "num_input_tokens_seen": 50280912, + "step": 32920 + }, + { + "epoch": 101.62132921174653, + "grad_norm": 0.43773216009140015, + "learning_rate": 3.7623453523605994e-06, + "loss": 0.2232, + "num_input_tokens_seen": 50288816, + "step": 32925 + }, + { + "epoch": 101.63678516228748, + "grad_norm": 0.6398486495018005, + "learning_rate": 3.757167498689834e-06, + "loss": 0.3047, + "num_input_tokens_seen": 50296176, + "step": 32930 + }, + { + "epoch": 101.65224111282843, + "grad_norm": 0.4613582193851471, + "learning_rate": 3.7519929209307914e-06, + "loss": 0.2935, + "num_input_tokens_seen": 50303376, + "step": 32935 + }, + { + "epoch": 101.6676970633694, + "grad_norm": 0.7749642729759216, + "learning_rate": 3.746821619881463e-06, + "loss": 0.2775, + "num_input_tokens_seen": 50311280, + "step": 32940 + }, + { + "epoch": 101.68315301391036, + "grad_norm": 0.6666228771209717, + "learning_rate": 3.74165359633932e-06, + "loss": 0.2719, + "num_input_tokens_seen": 50318864, + "step": 32945 + }, + { + "epoch": 101.69860896445131, + "grad_norm": 0.37505266070365906, + "learning_rate": 3.736488851101341e-06, + "loss": 0.1902, + "num_input_tokens_seen": 50326320, + "step": 32950 + }, + { + "epoch": 101.71406491499228, + "grad_norm": 0.7435512542724609, + "learning_rate": 3.7313273849640035e-06, + "loss": 0.2405, + "num_input_tokens_seen": 50334320, + "step": 32955 + }, + { + "epoch": 101.72952086553323, + "grad_norm": 0.3207756280899048, + "learning_rate": 3.7261691987232533e-06, + "loss": 0.2276, + "num_input_tokens_seen": 50341776, + "step": 32960 + }, + { + "epoch": 101.74497681607419, + "grad_norm": 0.44737645983695984, + "learning_rate": 3.7210142931745575e-06, + "loss": 0.291, + "num_input_tokens_seen": 50349616, + "step": 32965 + }, + { + "epoch": 101.76043276661515, + "grad_norm": 0.951321005821228, + "learning_rate": 3.7158626691128712e-06, + "loss": 0.2621, + "num_input_tokens_seen": 50357424, + "step": 32970 + }, + { + "epoch": 101.7758887171561, + "grad_norm": 0.5997331738471985, + "learning_rate": 3.710714327332629e-06, + "loss": 0.2032, + "num_input_tokens_seen": 50364688, + "step": 32975 + }, + { + "epoch": 101.79134466769706, + "grad_norm": 0.32385697960853577, + "learning_rate": 3.7055692686277815e-06, + "loss": 0.1942, + "num_input_tokens_seen": 50372272, + "step": 32980 + }, + { + "epoch": 101.80680061823801, + "grad_norm": 0.446539968252182, + "learning_rate": 3.70042749379175e-06, + "loss": 0.2409, + "num_input_tokens_seen": 50379952, + "step": 32985 + }, + { + "epoch": 101.82225656877898, + "grad_norm": 0.44619590044021606, + "learning_rate": 3.6952890036174693e-06, + "loss": 0.2954, + "num_input_tokens_seen": 50387952, + "step": 32990 + }, + { + "epoch": 101.83771251931994, + "grad_norm": 0.4702449142932892, + "learning_rate": 3.690153798897353e-06, + "loss": 0.2472, + "num_input_tokens_seen": 50395248, + "step": 32995 + }, + { + "epoch": 101.85316846986089, + "grad_norm": 0.5959342122077942, + "learning_rate": 3.6850218804233225e-06, + "loss": 0.2764, + "num_input_tokens_seen": 50403088, + "step": 33000 + }, + { + "epoch": 101.85316846986089, + "eval_loss": 0.30161264538764954, + "eval_runtime": 6.3271, + "eval_samples_per_second": 90.878, + "eval_steps_per_second": 22.759, + "num_input_tokens_seen": 50403088, + "step": 33000 + }, + { + "epoch": 101.86862442040186, + "grad_norm": 0.6691633462905884, + "learning_rate": 3.679893248986779e-06, + "loss": 0.2842, + "num_input_tokens_seen": 50410320, + "step": 33005 + }, + { + "epoch": 101.88408037094281, + "grad_norm": 0.5929972529411316, + "learning_rate": 3.6747679053786147e-06, + "loss": 0.264, + "num_input_tokens_seen": 50417872, + "step": 33010 + }, + { + "epoch": 101.89953632148377, + "grad_norm": 0.3467199504375458, + "learning_rate": 3.669645850389228e-06, + "loss": 0.2456, + "num_input_tokens_seen": 50425232, + "step": 33015 + }, + { + "epoch": 101.91499227202473, + "grad_norm": 0.48025113344192505, + "learning_rate": 3.664527084808514e-06, + "loss": 0.2283, + "num_input_tokens_seen": 50433136, + "step": 33020 + }, + { + "epoch": 101.93044822256569, + "grad_norm": 0.4099641740322113, + "learning_rate": 3.6594116094258337e-06, + "loss": 0.2489, + "num_input_tokens_seen": 50440816, + "step": 33025 + }, + { + "epoch": 101.94590417310664, + "grad_norm": 0.4043547511100769, + "learning_rate": 3.6542994250300665e-06, + "loss": 0.2054, + "num_input_tokens_seen": 50449136, + "step": 33030 + }, + { + "epoch": 101.96136012364761, + "grad_norm": 0.4686160981655121, + "learning_rate": 3.6491905324095825e-06, + "loss": 0.279, + "num_input_tokens_seen": 50456272, + "step": 33035 + }, + { + "epoch": 101.97681607418856, + "grad_norm": 0.36518394947052, + "learning_rate": 3.644084932352221e-06, + "loss": 0.2848, + "num_input_tokens_seen": 50463632, + "step": 33040 + }, + { + "epoch": 101.99227202472952, + "grad_norm": 0.7400506138801575, + "learning_rate": 3.6389826256453457e-06, + "loss": 0.2399, + "num_input_tokens_seen": 50470992, + "step": 33045 + }, + { + "epoch": 102.00618238021639, + "grad_norm": 0.49947935342788696, + "learning_rate": 3.633883613075781e-06, + "loss": 0.284, + "num_input_tokens_seen": 50477936, + "step": 33050 + }, + { + "epoch": 102.02163833075734, + "grad_norm": 0.3752979636192322, + "learning_rate": 3.6287878954298693e-06, + "loss": 0.19, + "num_input_tokens_seen": 50485744, + "step": 33055 + }, + { + "epoch": 102.0370942812983, + "grad_norm": 0.6559798717498779, + "learning_rate": 3.6236954734934354e-06, + "loss": 0.2908, + "num_input_tokens_seen": 50493264, + "step": 33060 + }, + { + "epoch": 102.05255023183926, + "grad_norm": 0.36502936482429504, + "learning_rate": 3.618606348051784e-06, + "loss": 0.2728, + "num_input_tokens_seen": 50500720, + "step": 33065 + }, + { + "epoch": 102.06800618238022, + "grad_norm": 0.5014691948890686, + "learning_rate": 3.6135205198897376e-06, + "loss": 0.2327, + "num_input_tokens_seen": 50508400, + "step": 33070 + }, + { + "epoch": 102.08346213292117, + "grad_norm": 0.38742026686668396, + "learning_rate": 3.6084379897915854e-06, + "loss": 0.2637, + "num_input_tokens_seen": 50515952, + "step": 33075 + }, + { + "epoch": 102.09891808346214, + "grad_norm": 0.3715677261352539, + "learning_rate": 3.6033587585411115e-06, + "loss": 0.2855, + "num_input_tokens_seen": 50523024, + "step": 33080 + }, + { + "epoch": 102.11437403400309, + "grad_norm": 0.6149278879165649, + "learning_rate": 3.5982828269216117e-06, + "loss": 0.2174, + "num_input_tokens_seen": 50530608, + "step": 33085 + }, + { + "epoch": 102.12982998454405, + "grad_norm": 0.3688269555568695, + "learning_rate": 3.593210195715843e-06, + "loss": 0.2091, + "num_input_tokens_seen": 50538480, + "step": 33090 + }, + { + "epoch": 102.14528593508501, + "grad_norm": 0.5754055380821228, + "learning_rate": 3.5881408657060773e-06, + "loss": 0.2808, + "num_input_tokens_seen": 50546192, + "step": 33095 + }, + { + "epoch": 102.16074188562597, + "grad_norm": 0.446468323469162, + "learning_rate": 3.583074837674075e-06, + "loss": 0.2537, + "num_input_tokens_seen": 50553808, + "step": 33100 + }, + { + "epoch": 102.17619783616692, + "grad_norm": 0.8430851697921753, + "learning_rate": 3.578012112401069e-06, + "loss": 0.3168, + "num_input_tokens_seen": 50561424, + "step": 33105 + }, + { + "epoch": 102.19165378670789, + "grad_norm": 0.7947986125946045, + "learning_rate": 3.5729526906677996e-06, + "loss": 0.2838, + "num_input_tokens_seen": 50568880, + "step": 33110 + }, + { + "epoch": 102.20710973724884, + "grad_norm": 0.5248599052429199, + "learning_rate": 3.5678965732545007e-06, + "loss": 0.2227, + "num_input_tokens_seen": 50576368, + "step": 33115 + }, + { + "epoch": 102.2225656877898, + "grad_norm": 0.57773357629776, + "learning_rate": 3.562843760940876e-06, + "loss": 0.2883, + "num_input_tokens_seen": 50584272, + "step": 33120 + }, + { + "epoch": 102.23802163833076, + "grad_norm": 0.43991488218307495, + "learning_rate": 3.5577942545061473e-06, + "loss": 0.2182, + "num_input_tokens_seen": 50591728, + "step": 33125 + }, + { + "epoch": 102.25347758887172, + "grad_norm": 0.4001513421535492, + "learning_rate": 3.5527480547289967e-06, + "loss": 0.2119, + "num_input_tokens_seen": 50599088, + "step": 33130 + }, + { + "epoch": 102.26893353941267, + "grad_norm": 0.5429539680480957, + "learning_rate": 3.547705162387624e-06, + "loss": 0.2202, + "num_input_tokens_seen": 50606384, + "step": 33135 + }, + { + "epoch": 102.28438948995363, + "grad_norm": 0.6498711109161377, + "learning_rate": 3.542665578259699e-06, + "loss": 0.252, + "num_input_tokens_seen": 50614064, + "step": 33140 + }, + { + "epoch": 102.2998454404946, + "grad_norm": 0.5029294490814209, + "learning_rate": 3.5376293031223945e-06, + "loss": 0.2257, + "num_input_tokens_seen": 50621680, + "step": 33145 + }, + { + "epoch": 102.31530139103555, + "grad_norm": 0.37202155590057373, + "learning_rate": 3.5325963377523614e-06, + "loss": 0.2094, + "num_input_tokens_seen": 50629584, + "step": 33150 + }, + { + "epoch": 102.3307573415765, + "grad_norm": 0.45826104283332825, + "learning_rate": 3.5275666829257536e-06, + "loss": 0.2164, + "num_input_tokens_seen": 50637168, + "step": 33155 + }, + { + "epoch": 102.34621329211747, + "grad_norm": 0.4359164237976074, + "learning_rate": 3.5225403394181955e-06, + "loss": 0.2568, + "num_input_tokens_seen": 50645200, + "step": 33160 + }, + { + "epoch": 102.36166924265842, + "grad_norm": 0.3814990222454071, + "learning_rate": 3.517517308004828e-06, + "loss": 0.2171, + "num_input_tokens_seen": 50652816, + "step": 33165 + }, + { + "epoch": 102.37712519319938, + "grad_norm": 0.49172642827033997, + "learning_rate": 3.512497589460251e-06, + "loss": 0.2214, + "num_input_tokens_seen": 50660304, + "step": 33170 + }, + { + "epoch": 102.39258114374034, + "grad_norm": 0.48557329177856445, + "learning_rate": 3.5074811845585727e-06, + "loss": 0.2285, + "num_input_tokens_seen": 50668208, + "step": 33175 + }, + { + "epoch": 102.4080370942813, + "grad_norm": 0.4013907313346863, + "learning_rate": 3.5024680940733937e-06, + "loss": 0.2539, + "num_input_tokens_seen": 50675696, + "step": 33180 + }, + { + "epoch": 102.42349304482225, + "grad_norm": 0.6871642470359802, + "learning_rate": 3.4974583187777852e-06, + "loss": 0.2791, + "num_input_tokens_seen": 50683344, + "step": 33185 + }, + { + "epoch": 102.43894899536322, + "grad_norm": 0.6197205185890198, + "learning_rate": 3.4924518594443204e-06, + "loss": 0.2436, + "num_input_tokens_seen": 50691440, + "step": 33190 + }, + { + "epoch": 102.45440494590417, + "grad_norm": 0.33762094378471375, + "learning_rate": 3.4874487168450682e-06, + "loss": 0.1983, + "num_input_tokens_seen": 50699504, + "step": 33195 + }, + { + "epoch": 102.46986089644513, + "grad_norm": 0.48293426632881165, + "learning_rate": 3.482448891751558e-06, + "loss": 0.2199, + "num_input_tokens_seen": 50707088, + "step": 33200 + }, + { + "epoch": 102.46986089644513, + "eval_loss": 0.3029390573501587, + "eval_runtime": 6.3393, + "eval_samples_per_second": 90.703, + "eval_steps_per_second": 22.715, + "num_input_tokens_seen": 50707088, + "step": 33200 + }, + { + "epoch": 102.4853168469861, + "grad_norm": 0.369296133518219, + "learning_rate": 3.477452384934843e-06, + "loss": 0.2461, + "num_input_tokens_seen": 50714992, + "step": 33205 + }, + { + "epoch": 102.50077279752705, + "grad_norm": 0.35676005482673645, + "learning_rate": 3.472459197165434e-06, + "loss": 0.2109, + "num_input_tokens_seen": 50722928, + "step": 33210 + }, + { + "epoch": 102.516228748068, + "grad_norm": 0.48206067085266113, + "learning_rate": 3.4674693292133518e-06, + "loss": 0.2826, + "num_input_tokens_seen": 50730160, + "step": 33215 + }, + { + "epoch": 102.53168469860897, + "grad_norm": 0.6042934060096741, + "learning_rate": 3.4624827818480977e-06, + "loss": 0.2724, + "num_input_tokens_seen": 50737840, + "step": 33220 + }, + { + "epoch": 102.54714064914992, + "grad_norm": 0.48370856046676636, + "learning_rate": 3.4574995558386474e-06, + "loss": 0.2063, + "num_input_tokens_seen": 50745872, + "step": 33225 + }, + { + "epoch": 102.56259659969088, + "grad_norm": 0.42570972442626953, + "learning_rate": 3.452519651953487e-06, + "loss": 0.255, + "num_input_tokens_seen": 50753680, + "step": 33230 + }, + { + "epoch": 102.57805255023185, + "grad_norm": 0.45265114307403564, + "learning_rate": 3.447543070960585e-06, + "loss": 0.2779, + "num_input_tokens_seen": 50761008, + "step": 33235 + }, + { + "epoch": 102.5935085007728, + "grad_norm": 0.5110713243484497, + "learning_rate": 3.4425698136273778e-06, + "loss": 0.3185, + "num_input_tokens_seen": 50768528, + "step": 33240 + }, + { + "epoch": 102.60896445131375, + "grad_norm": 0.7634619474411011, + "learning_rate": 3.437599880720821e-06, + "loss": 0.247, + "num_input_tokens_seen": 50776304, + "step": 33245 + }, + { + "epoch": 102.62442040185472, + "grad_norm": 0.4410085380077362, + "learning_rate": 3.4326332730073267e-06, + "loss": 0.2364, + "num_input_tokens_seen": 50783376, + "step": 33250 + }, + { + "epoch": 102.63987635239567, + "grad_norm": 0.4696482717990875, + "learning_rate": 3.427669991252813e-06, + "loss": 0.3399, + "num_input_tokens_seen": 50791280, + "step": 33255 + }, + { + "epoch": 102.65533230293663, + "grad_norm": 0.6144499182701111, + "learning_rate": 3.42271003622269e-06, + "loss": 0.2375, + "num_input_tokens_seen": 50798896, + "step": 33260 + }, + { + "epoch": 102.67078825347758, + "grad_norm": 0.5085341334342957, + "learning_rate": 3.4177534086818286e-06, + "loss": 0.2426, + "num_input_tokens_seen": 50806640, + "step": 33265 + }, + { + "epoch": 102.68624420401855, + "grad_norm": 0.5124855637550354, + "learning_rate": 3.412800109394612e-06, + "loss": 0.2076, + "num_input_tokens_seen": 50814096, + "step": 33270 + }, + { + "epoch": 102.7017001545595, + "grad_norm": 0.4224030673503876, + "learning_rate": 3.4078501391249044e-06, + "loss": 0.2225, + "num_input_tokens_seen": 50821872, + "step": 33275 + }, + { + "epoch": 102.71715610510046, + "grad_norm": 0.6221756339073181, + "learning_rate": 3.4029034986360453e-06, + "loss": 0.2171, + "num_input_tokens_seen": 50830000, + "step": 33280 + }, + { + "epoch": 102.73261205564143, + "grad_norm": 0.6192018985748291, + "learning_rate": 3.397960188690877e-06, + "loss": 0.2905, + "num_input_tokens_seen": 50837616, + "step": 33285 + }, + { + "epoch": 102.74806800618238, + "grad_norm": 0.4390241801738739, + "learning_rate": 3.393020210051717e-06, + "loss": 0.3078, + "num_input_tokens_seen": 50845360, + "step": 33290 + }, + { + "epoch": 102.76352395672333, + "grad_norm": 0.44428762793540955, + "learning_rate": 3.3880835634803655e-06, + "loss": 0.2693, + "num_input_tokens_seen": 50853040, + "step": 33295 + }, + { + "epoch": 102.7789799072643, + "grad_norm": 0.3003148138523102, + "learning_rate": 3.383150249738126e-06, + "loss": 0.2315, + "num_input_tokens_seen": 50860080, + "step": 33300 + }, + { + "epoch": 102.79443585780525, + "grad_norm": 0.5911772847175598, + "learning_rate": 3.3782202695857663e-06, + "loss": 0.2207, + "num_input_tokens_seen": 50867536, + "step": 33305 + }, + { + "epoch": 102.80989180834621, + "grad_norm": 0.5001242160797119, + "learning_rate": 3.373293623783558e-06, + "loss": 0.2099, + "num_input_tokens_seen": 50875472, + "step": 33310 + }, + { + "epoch": 102.82534775888718, + "grad_norm": 0.4854273498058319, + "learning_rate": 3.368370313091257e-06, + "loss": 0.2251, + "num_input_tokens_seen": 50882960, + "step": 33315 + }, + { + "epoch": 102.84080370942813, + "grad_norm": 0.7871544361114502, + "learning_rate": 3.363450338268087e-06, + "loss": 0.2506, + "num_input_tokens_seen": 50890672, + "step": 33320 + }, + { + "epoch": 102.85625965996908, + "grad_norm": 0.7260357737541199, + "learning_rate": 3.358533700072783e-06, + "loss": 0.2029, + "num_input_tokens_seen": 50897968, + "step": 33325 + }, + { + "epoch": 102.87171561051005, + "grad_norm": 0.5196611881256104, + "learning_rate": 3.3536203992635377e-06, + "loss": 0.2176, + "num_input_tokens_seen": 50905616, + "step": 33330 + }, + { + "epoch": 102.887171561051, + "grad_norm": 0.537696123123169, + "learning_rate": 3.348710436598057e-06, + "loss": 0.2443, + "num_input_tokens_seen": 50913328, + "step": 33335 + }, + { + "epoch": 102.90262751159196, + "grad_norm": 0.39502522349357605, + "learning_rate": 3.3438038128335155e-06, + "loss": 0.235, + "num_input_tokens_seen": 50921040, + "step": 33340 + }, + { + "epoch": 102.91808346213293, + "grad_norm": 0.5947027802467346, + "learning_rate": 3.338900528726571e-06, + "loss": 0.3826, + "num_input_tokens_seen": 50928272, + "step": 33345 + }, + { + "epoch": 102.93353941267388, + "grad_norm": 0.479120135307312, + "learning_rate": 3.3340005850333812e-06, + "loss": 0.2452, + "num_input_tokens_seen": 50936240, + "step": 33350 + }, + { + "epoch": 102.94899536321483, + "grad_norm": 0.5768097639083862, + "learning_rate": 3.329103982509568e-06, + "loss": 0.2353, + "num_input_tokens_seen": 50943952, + "step": 33355 + }, + { + "epoch": 102.9644513137558, + "grad_norm": 0.6778420805931091, + "learning_rate": 3.324210721910259e-06, + "loss": 0.2618, + "num_input_tokens_seen": 50951280, + "step": 33360 + }, + { + "epoch": 102.97990726429676, + "grad_norm": 0.36093050241470337, + "learning_rate": 3.319320803990053e-06, + "loss": 0.2538, + "num_input_tokens_seen": 50959312, + "step": 33365 + }, + { + "epoch": 102.99536321483771, + "grad_norm": 0.31176191568374634, + "learning_rate": 3.3144342295030274e-06, + "loss": 0.2015, + "num_input_tokens_seen": 50966544, + "step": 33370 + }, + { + "epoch": 103.00927357032458, + "grad_norm": 0.3085910677909851, + "learning_rate": 3.309550999202765e-06, + "loss": 0.2241, + "num_input_tokens_seen": 50973216, + "step": 33375 + }, + { + "epoch": 103.02472952086553, + "grad_norm": 0.3645504117012024, + "learning_rate": 3.3046711138423197e-06, + "loss": 0.2463, + "num_input_tokens_seen": 50979968, + "step": 33380 + }, + { + "epoch": 103.04018547140649, + "grad_norm": 0.9029706716537476, + "learning_rate": 3.2997945741742255e-06, + "loss": 0.3304, + "num_input_tokens_seen": 50987328, + "step": 33385 + }, + { + "epoch": 103.05564142194746, + "grad_norm": 0.5518814921379089, + "learning_rate": 3.2949213809505082e-06, + "loss": 0.2426, + "num_input_tokens_seen": 50994912, + "step": 33390 + }, + { + "epoch": 103.07109737248841, + "grad_norm": 0.5004714131355286, + "learning_rate": 3.2900515349226834e-06, + "loss": 0.2385, + "num_input_tokens_seen": 51002336, + "step": 33395 + }, + { + "epoch": 103.08655332302936, + "grad_norm": 0.46144068241119385, + "learning_rate": 3.285185036841731e-06, + "loss": 0.2872, + "num_input_tokens_seen": 51010144, + "step": 33400 + }, + { + "epoch": 103.08655332302936, + "eval_loss": 0.3008640706539154, + "eval_runtime": 6.3135, + "eval_samples_per_second": 91.075, + "eval_steps_per_second": 22.808, + "num_input_tokens_seen": 51010144, + "step": 33400 + }, + { + "epoch": 103.10200927357033, + "grad_norm": 0.42282459139823914, + "learning_rate": 3.2803218874581377e-06, + "loss": 0.214, + "num_input_tokens_seen": 51017888, + "step": 33405 + }, + { + "epoch": 103.11746522411129, + "grad_norm": 0.46848416328430176, + "learning_rate": 3.2754620875218494e-06, + "loss": 0.2071, + "num_input_tokens_seen": 51025504, + "step": 33410 + }, + { + "epoch": 103.13292117465224, + "grad_norm": 0.37250036001205444, + "learning_rate": 3.2706056377823146e-06, + "loss": 0.2555, + "num_input_tokens_seen": 51033280, + "step": 33415 + }, + { + "epoch": 103.14837712519319, + "grad_norm": 0.7603008151054382, + "learning_rate": 3.2657525389884647e-06, + "loss": 0.1956, + "num_input_tokens_seen": 51041248, + "step": 33420 + }, + { + "epoch": 103.16383307573416, + "grad_norm": 0.5298740267753601, + "learning_rate": 3.260902791888698e-06, + "loss": 0.2401, + "num_input_tokens_seen": 51049088, + "step": 33425 + }, + { + "epoch": 103.17928902627511, + "grad_norm": 0.3881337344646454, + "learning_rate": 3.2560563972309166e-06, + "loss": 0.2584, + "num_input_tokens_seen": 51056736, + "step": 33430 + }, + { + "epoch": 103.19474497681607, + "grad_norm": 0.809528648853302, + "learning_rate": 3.251213355762489e-06, + "loss": 0.2162, + "num_input_tokens_seen": 51064320, + "step": 33435 + }, + { + "epoch": 103.21020092735704, + "grad_norm": 0.4043140709400177, + "learning_rate": 3.2463736682302707e-06, + "loss": 0.2303, + "num_input_tokens_seen": 51071328, + "step": 33440 + }, + { + "epoch": 103.22565687789799, + "grad_norm": 0.46949583292007446, + "learning_rate": 3.2415373353806124e-06, + "loss": 0.2373, + "num_input_tokens_seen": 51079360, + "step": 33445 + }, + { + "epoch": 103.24111282843894, + "grad_norm": 0.5911230444908142, + "learning_rate": 3.236704357959322e-06, + "loss": 0.2, + "num_input_tokens_seen": 51086944, + "step": 33450 + }, + { + "epoch": 103.25656877897991, + "grad_norm": 0.788888156414032, + "learning_rate": 3.2318747367117154e-06, + "loss": 0.2408, + "num_input_tokens_seen": 51094720, + "step": 33455 + }, + { + "epoch": 103.27202472952087, + "grad_norm": 0.5967994928359985, + "learning_rate": 3.227048472382585e-06, + "loss": 0.2017, + "num_input_tokens_seen": 51102752, + "step": 33460 + }, + { + "epoch": 103.28748068006182, + "grad_norm": 0.7158678770065308, + "learning_rate": 3.2222255657161915e-06, + "loss": 0.2797, + "num_input_tokens_seen": 51110848, + "step": 33465 + }, + { + "epoch": 103.30293663060279, + "grad_norm": 0.5498484373092651, + "learning_rate": 3.2174060174562924e-06, + "loss": 0.2688, + "num_input_tokens_seen": 51118784, + "step": 33470 + }, + { + "epoch": 103.31839258114374, + "grad_norm": 0.27896904945373535, + "learning_rate": 3.2125898283461298e-06, + "loss": 0.2514, + "num_input_tokens_seen": 51126304, + "step": 33475 + }, + { + "epoch": 103.3338485316847, + "grad_norm": 0.4801897704601288, + "learning_rate": 3.207776999128406e-06, + "loss": 0.2493, + "num_input_tokens_seen": 51133856, + "step": 33480 + }, + { + "epoch": 103.34930448222566, + "grad_norm": 0.4363578259944916, + "learning_rate": 3.202967530545331e-06, + "loss": 0.3394, + "num_input_tokens_seen": 51142080, + "step": 33485 + }, + { + "epoch": 103.36476043276662, + "grad_norm": 0.35884442925453186, + "learning_rate": 3.1981614233385778e-06, + "loss": 0.1967, + "num_input_tokens_seen": 51149856, + "step": 33490 + }, + { + "epoch": 103.38021638330757, + "grad_norm": 0.4088520109653473, + "learning_rate": 3.1933586782493115e-06, + "loss": 0.293, + "num_input_tokens_seen": 51157792, + "step": 33495 + }, + { + "epoch": 103.39567233384854, + "grad_norm": 0.4757870137691498, + "learning_rate": 3.188559296018184e-06, + "loss": 0.3239, + "num_input_tokens_seen": 51165952, + "step": 33500 + }, + { + "epoch": 103.41112828438949, + "grad_norm": 0.43693551421165466, + "learning_rate": 3.1837632773853098e-06, + "loss": 0.2173, + "num_input_tokens_seen": 51173184, + "step": 33505 + }, + { + "epoch": 103.42658423493044, + "grad_norm": 0.526877760887146, + "learning_rate": 3.178970623090294e-06, + "loss": 0.2232, + "num_input_tokens_seen": 51180608, + "step": 33510 + }, + { + "epoch": 103.44204018547141, + "grad_norm": 0.4947860836982727, + "learning_rate": 3.174181333872234e-06, + "loss": 0.2282, + "num_input_tokens_seen": 51188128, + "step": 33515 + }, + { + "epoch": 103.45749613601237, + "grad_norm": 0.6841133236885071, + "learning_rate": 3.169395410469686e-06, + "loss": 0.2581, + "num_input_tokens_seen": 51196160, + "step": 33520 + }, + { + "epoch": 103.47295208655332, + "grad_norm": 0.572128415107727, + "learning_rate": 3.164612853620713e-06, + "loss": 0.2431, + "num_input_tokens_seen": 51204736, + "step": 33525 + }, + { + "epoch": 103.48840803709429, + "grad_norm": 0.5513541102409363, + "learning_rate": 3.1598336640628333e-06, + "loss": 0.2089, + "num_input_tokens_seen": 51212768, + "step": 33530 + }, + { + "epoch": 103.50386398763524, + "grad_norm": 0.7775075435638428, + "learning_rate": 3.155057842533063e-06, + "loss": 0.2812, + "num_input_tokens_seen": 51220000, + "step": 33535 + }, + { + "epoch": 103.5193199381762, + "grad_norm": 0.3830152153968811, + "learning_rate": 3.1502853897678984e-06, + "loss": 0.2336, + "num_input_tokens_seen": 51227616, + "step": 33540 + }, + { + "epoch": 103.53477588871715, + "grad_norm": 0.4680978059768677, + "learning_rate": 3.1455163065033017e-06, + "loss": 0.2146, + "num_input_tokens_seen": 51235296, + "step": 33545 + }, + { + "epoch": 103.55023183925812, + "grad_norm": 0.4958612024784088, + "learning_rate": 3.140750593474734e-06, + "loss": 0.2426, + "num_input_tokens_seen": 51243328, + "step": 33550 + }, + { + "epoch": 103.56568778979907, + "grad_norm": 0.464711457490921, + "learning_rate": 3.1359882514171294e-06, + "loss": 0.2317, + "num_input_tokens_seen": 51250848, + "step": 33555 + }, + { + "epoch": 103.58114374034002, + "grad_norm": 0.4058274030685425, + "learning_rate": 3.1312292810648903e-06, + "loss": 0.2801, + "num_input_tokens_seen": 51258592, + "step": 33560 + }, + { + "epoch": 103.59659969088099, + "grad_norm": 0.31329402327537537, + "learning_rate": 3.1264736831519204e-06, + "loss": 0.2248, + "num_input_tokens_seen": 51265984, + "step": 33565 + }, + { + "epoch": 103.61205564142195, + "grad_norm": 0.3923429548740387, + "learning_rate": 3.1217214584115863e-06, + "loss": 0.245, + "num_input_tokens_seen": 51273504, + "step": 33570 + }, + { + "epoch": 103.6275115919629, + "grad_norm": 0.6082834005355835, + "learning_rate": 3.116972607576746e-06, + "loss": 0.2344, + "num_input_tokens_seen": 51281280, + "step": 33575 + }, + { + "epoch": 103.64296754250387, + "grad_norm": 0.5733535289764404, + "learning_rate": 3.1122271313797303e-06, + "loss": 0.2111, + "num_input_tokens_seen": 51288800, + "step": 33580 + }, + { + "epoch": 103.65842349304482, + "grad_norm": 0.3946464955806732, + "learning_rate": 3.107485030552343e-06, + "loss": 0.2234, + "num_input_tokens_seen": 51296512, + "step": 33585 + }, + { + "epoch": 103.67387944358578, + "grad_norm": 0.6349407434463501, + "learning_rate": 3.1027463058258848e-06, + "loss": 0.2791, + "num_input_tokens_seen": 51304128, + "step": 33590 + }, + { + "epoch": 103.68933539412674, + "grad_norm": 0.5977966785430908, + "learning_rate": 3.0980109579311273e-06, + "loss": 0.2242, + "num_input_tokens_seen": 51311744, + "step": 33595 + }, + { + "epoch": 103.7047913446677, + "grad_norm": 0.4454321563243866, + "learning_rate": 3.093278987598314e-06, + "loss": 0.2565, + "num_input_tokens_seen": 51318976, + "step": 33600 + }, + { + "epoch": 103.7047913446677, + "eval_loss": 0.3013375997543335, + "eval_runtime": 6.3209, + "eval_samples_per_second": 90.968, + "eval_steps_per_second": 22.782, + "num_input_tokens_seen": 51318976, + "step": 33600 + }, + { + "epoch": 103.72024729520865, + "grad_norm": 0.6351287364959717, + "learning_rate": 3.0885503955571826e-06, + "loss": 0.2568, + "num_input_tokens_seen": 51326528, + "step": 33605 + }, + { + "epoch": 103.73570324574962, + "grad_norm": 0.8660024404525757, + "learning_rate": 3.0838251825369313e-06, + "loss": 0.3336, + "num_input_tokens_seen": 51334112, + "step": 33610 + }, + { + "epoch": 103.75115919629057, + "grad_norm": 0.48313820362091064, + "learning_rate": 3.0791033492662517e-06, + "loss": 0.2653, + "num_input_tokens_seen": 51341408, + "step": 33615 + }, + { + "epoch": 103.76661514683153, + "grad_norm": 0.5033913850784302, + "learning_rate": 3.0743848964733203e-06, + "loss": 0.2472, + "num_input_tokens_seen": 51348960, + "step": 33620 + }, + { + "epoch": 103.7820710973725, + "grad_norm": 0.8728715181350708, + "learning_rate": 3.0696698248857625e-06, + "loss": 0.2617, + "num_input_tokens_seen": 51356384, + "step": 33625 + }, + { + "epoch": 103.79752704791345, + "grad_norm": 0.3894610106945038, + "learning_rate": 3.0649581352307192e-06, + "loss": 0.2067, + "num_input_tokens_seen": 51364032, + "step": 33630 + }, + { + "epoch": 103.8129829984544, + "grad_norm": 0.37623530626296997, + "learning_rate": 3.060249828234776e-06, + "loss": 0.2047, + "num_input_tokens_seen": 51371872, + "step": 33635 + }, + { + "epoch": 103.82843894899537, + "grad_norm": 0.506638765335083, + "learning_rate": 3.055544904624025e-06, + "loss": 0.2648, + "num_input_tokens_seen": 51379808, + "step": 33640 + }, + { + "epoch": 103.84389489953632, + "grad_norm": 0.5074219107627869, + "learning_rate": 3.050843365124026e-06, + "loss": 0.2581, + "num_input_tokens_seen": 51388032, + "step": 33645 + }, + { + "epoch": 103.85935085007728, + "grad_norm": 0.45325732231140137, + "learning_rate": 3.0461452104598083e-06, + "loss": 0.3235, + "num_input_tokens_seen": 51395040, + "step": 33650 + }, + { + "epoch": 103.87480680061825, + "grad_norm": 0.5038722157478333, + "learning_rate": 3.0414504413558836e-06, + "loss": 0.2953, + "num_input_tokens_seen": 51402976, + "step": 33655 + }, + { + "epoch": 103.8902627511592, + "grad_norm": 0.4028841257095337, + "learning_rate": 3.0367590585362564e-06, + "loss": 0.2099, + "num_input_tokens_seen": 51410816, + "step": 33660 + }, + { + "epoch": 103.90571870170015, + "grad_norm": 0.5304909944534302, + "learning_rate": 3.0320710627243813e-06, + "loss": 0.2707, + "num_input_tokens_seen": 51418688, + "step": 33665 + }, + { + "epoch": 103.9211746522411, + "grad_norm": 0.44815608859062195, + "learning_rate": 3.027386454643222e-06, + "loss": 0.2317, + "num_input_tokens_seen": 51426016, + "step": 33670 + }, + { + "epoch": 103.93663060278207, + "grad_norm": 0.9482572674751282, + "learning_rate": 3.0227052350151914e-06, + "loss": 0.2669, + "num_input_tokens_seen": 51433280, + "step": 33675 + }, + { + "epoch": 103.95208655332303, + "grad_norm": 0.6075193881988525, + "learning_rate": 3.0180274045621957e-06, + "loss": 0.2148, + "num_input_tokens_seen": 51440768, + "step": 33680 + }, + { + "epoch": 103.96754250386398, + "grad_norm": 0.7659736275672913, + "learning_rate": 3.013352964005625e-06, + "loss": 0.2435, + "num_input_tokens_seen": 51448160, + "step": 33685 + }, + { + "epoch": 103.98299845440495, + "grad_norm": 0.506959080696106, + "learning_rate": 3.0086819140663218e-06, + "loss": 0.2387, + "num_input_tokens_seen": 51455712, + "step": 33690 + }, + { + "epoch": 103.9984544049459, + "grad_norm": 0.37075456976890564, + "learning_rate": 3.0040142554646265e-06, + "loss": 0.1926, + "num_input_tokens_seen": 51463680, + "step": 33695 + }, + { + "epoch": 104.01236476043276, + "grad_norm": 0.3982624411582947, + "learning_rate": 2.999349988920361e-06, + "loss": 0.2322, + "num_input_tokens_seen": 51470656, + "step": 33700 + }, + { + "epoch": 104.02782071097373, + "grad_norm": 0.49330446124076843, + "learning_rate": 2.994689115152796e-06, + "loss": 0.2113, + "num_input_tokens_seen": 51478080, + "step": 33705 + }, + { + "epoch": 104.04327666151468, + "grad_norm": 0.34455499053001404, + "learning_rate": 2.9900316348807105e-06, + "loss": 0.2256, + "num_input_tokens_seen": 51485696, + "step": 33710 + }, + { + "epoch": 104.05873261205564, + "grad_norm": 0.5371315479278564, + "learning_rate": 2.985377548822338e-06, + "loss": 0.2616, + "num_input_tokens_seen": 51493088, + "step": 33715 + }, + { + "epoch": 104.0741885625966, + "grad_norm": 0.39452770352363586, + "learning_rate": 2.980726857695404e-06, + "loss": 0.249, + "num_input_tokens_seen": 51501248, + "step": 33720 + }, + { + "epoch": 104.08964451313756, + "grad_norm": 0.5172643065452576, + "learning_rate": 2.9760795622171017e-06, + "loss": 0.2411, + "num_input_tokens_seen": 51509408, + "step": 33725 + }, + { + "epoch": 104.10510046367851, + "grad_norm": 0.48957523703575134, + "learning_rate": 2.971435663104094e-06, + "loss": 0.2385, + "num_input_tokens_seen": 51516960, + "step": 33730 + }, + { + "epoch": 104.12055641421948, + "grad_norm": 0.6038855910301208, + "learning_rate": 2.9667951610725385e-06, + "loss": 0.28, + "num_input_tokens_seen": 51524576, + "step": 33735 + }, + { + "epoch": 104.13601236476043, + "grad_norm": 0.41831478476524353, + "learning_rate": 2.9621580568380575e-06, + "loss": 0.2977, + "num_input_tokens_seen": 51532096, + "step": 33740 + }, + { + "epoch": 104.15146831530139, + "grad_norm": 0.48965588212013245, + "learning_rate": 2.9575243511157453e-06, + "loss": 0.2414, + "num_input_tokens_seen": 51539680, + "step": 33745 + }, + { + "epoch": 104.16692426584235, + "grad_norm": 0.2873556613922119, + "learning_rate": 2.952894044620186e-06, + "loss": 0.2321, + "num_input_tokens_seen": 51547232, + "step": 33750 + }, + { + "epoch": 104.18238021638331, + "grad_norm": 0.5126206874847412, + "learning_rate": 2.948267138065419e-06, + "loss": 0.2223, + "num_input_tokens_seen": 51554976, + "step": 33755 + }, + { + "epoch": 104.19783616692426, + "grad_norm": 0.6185243725776672, + "learning_rate": 2.943643632164983e-06, + "loss": 0.3329, + "num_input_tokens_seen": 51563136, + "step": 33760 + }, + { + "epoch": 104.21329211746523, + "grad_norm": 0.6473104953765869, + "learning_rate": 2.939023527631879e-06, + "loss": 0.2505, + "num_input_tokens_seen": 51570304, + "step": 33765 + }, + { + "epoch": 104.22874806800618, + "grad_norm": 0.5485879778862, + "learning_rate": 2.934406825178576e-06, + "loss": 0.2234, + "num_input_tokens_seen": 51577824, + "step": 33770 + }, + { + "epoch": 104.24420401854714, + "grad_norm": 0.536807656288147, + "learning_rate": 2.9297935255170357e-06, + "loss": 0.1973, + "num_input_tokens_seen": 51585728, + "step": 33775 + }, + { + "epoch": 104.2596599690881, + "grad_norm": 0.45126211643218994, + "learning_rate": 2.925183629358691e-06, + "loss": 0.2526, + "num_input_tokens_seen": 51593088, + "step": 33780 + }, + { + "epoch": 104.27511591962906, + "grad_norm": 0.45596247911453247, + "learning_rate": 2.9205771374144346e-06, + "loss": 0.2143, + "num_input_tokens_seen": 51600032, + "step": 33785 + }, + { + "epoch": 104.29057187017001, + "grad_norm": 0.6135831475257874, + "learning_rate": 2.915974050394657e-06, + "loss": 0.3331, + "num_input_tokens_seen": 51607456, + "step": 33790 + }, + { + "epoch": 104.30602782071098, + "grad_norm": 0.3972296714782715, + "learning_rate": 2.9113743690092067e-06, + "loss": 0.2543, + "num_input_tokens_seen": 51615232, + "step": 33795 + }, + { + "epoch": 104.32148377125193, + "grad_norm": 0.4802548587322235, + "learning_rate": 2.906778093967402e-06, + "loss": 0.2709, + "num_input_tokens_seen": 51623136, + "step": 33800 + }, + { + "epoch": 104.32148377125193, + "eval_loss": 0.30206945538520813, + "eval_runtime": 6.3367, + "eval_samples_per_second": 90.741, + "eval_steps_per_second": 22.725, + "num_input_tokens_seen": 51623136, + "step": 33800 + }, + { + "epoch": 104.33693972179289, + "grad_norm": 0.6112470030784607, + "learning_rate": 2.9021852259780656e-06, + "loss": 0.2542, + "num_input_tokens_seen": 51630784, + "step": 33805 + }, + { + "epoch": 104.35239567233384, + "grad_norm": 0.3978727459907532, + "learning_rate": 2.8975957657494583e-06, + "loss": 0.2356, + "num_input_tokens_seen": 51638464, + "step": 33810 + }, + { + "epoch": 104.36785162287481, + "grad_norm": 0.6188533306121826, + "learning_rate": 2.8930097139893417e-06, + "loss": 0.2068, + "num_input_tokens_seen": 51646144, + "step": 33815 + }, + { + "epoch": 104.38330757341576, + "grad_norm": 0.5246843099594116, + "learning_rate": 2.888427071404945e-06, + "loss": 0.2761, + "num_input_tokens_seen": 51654240, + "step": 33820 + }, + { + "epoch": 104.39876352395672, + "grad_norm": 0.5006309151649475, + "learning_rate": 2.8838478387029606e-06, + "loss": 0.2424, + "num_input_tokens_seen": 51662368, + "step": 33825 + }, + { + "epoch": 104.41421947449768, + "grad_norm": 0.5245916247367859, + "learning_rate": 2.8792720165895737e-06, + "loss": 0.2404, + "num_input_tokens_seen": 51669696, + "step": 33830 + }, + { + "epoch": 104.42967542503864, + "grad_norm": 0.5369617342948914, + "learning_rate": 2.874699605770423e-06, + "loss": 0.2472, + "num_input_tokens_seen": 51677312, + "step": 33835 + }, + { + "epoch": 104.44513137557959, + "grad_norm": 0.6321355700492859, + "learning_rate": 2.8701306069506383e-06, + "loss": 0.2291, + "num_input_tokens_seen": 51685248, + "step": 33840 + }, + { + "epoch": 104.46058732612056, + "grad_norm": 0.43793410062789917, + "learning_rate": 2.8655650208348178e-06, + "loss": 0.3038, + "num_input_tokens_seen": 51692736, + "step": 33845 + }, + { + "epoch": 104.47604327666151, + "grad_norm": 0.5616946816444397, + "learning_rate": 2.8610028481270257e-06, + "loss": 0.2462, + "num_input_tokens_seen": 51700448, + "step": 33850 + }, + { + "epoch": 104.49149922720247, + "grad_norm": 0.5795754194259644, + "learning_rate": 2.856444089530813e-06, + "loss": 0.3115, + "num_input_tokens_seen": 51708000, + "step": 33855 + }, + { + "epoch": 104.50695517774344, + "grad_norm": 0.45123574137687683, + "learning_rate": 2.8518887457491955e-06, + "loss": 0.2217, + "num_input_tokens_seen": 51715712, + "step": 33860 + }, + { + "epoch": 104.52241112828439, + "grad_norm": 0.4572630524635315, + "learning_rate": 2.8473368174846666e-06, + "loss": 0.2621, + "num_input_tokens_seen": 51723680, + "step": 33865 + }, + { + "epoch": 104.53786707882534, + "grad_norm": 0.4323078393936157, + "learning_rate": 2.842788305439184e-06, + "loss": 0.2384, + "num_input_tokens_seen": 51731488, + "step": 33870 + }, + { + "epoch": 104.55332302936631, + "grad_norm": 0.42882126569747925, + "learning_rate": 2.8382432103141925e-06, + "loss": 0.2874, + "num_input_tokens_seen": 51739424, + "step": 33875 + }, + { + "epoch": 104.56877897990726, + "grad_norm": 0.5943766832351685, + "learning_rate": 2.833701532810598e-06, + "loss": 0.3155, + "num_input_tokens_seen": 51746880, + "step": 33880 + }, + { + "epoch": 104.58423493044822, + "grad_norm": 0.3639163672924042, + "learning_rate": 2.8291632736287877e-06, + "loss": 0.217, + "num_input_tokens_seen": 51754080, + "step": 33885 + }, + { + "epoch": 104.59969088098919, + "grad_norm": 0.4521675109863281, + "learning_rate": 2.824628433468615e-06, + "loss": 0.257, + "num_input_tokens_seen": 51761408, + "step": 33890 + }, + { + "epoch": 104.61514683153014, + "grad_norm": 0.7561042904853821, + "learning_rate": 2.8200970130294073e-06, + "loss": 0.2492, + "num_input_tokens_seen": 51768992, + "step": 33895 + }, + { + "epoch": 104.6306027820711, + "grad_norm": 0.5771350860595703, + "learning_rate": 2.8155690130099775e-06, + "loss": 0.2224, + "num_input_tokens_seen": 51776128, + "step": 33900 + }, + { + "epoch": 104.64605873261206, + "grad_norm": 0.34044721722602844, + "learning_rate": 2.8110444341085895e-06, + "loss": 0.1951, + "num_input_tokens_seen": 51783840, + "step": 33905 + }, + { + "epoch": 104.66151468315302, + "grad_norm": 0.7565206289291382, + "learning_rate": 2.806523277022996e-06, + "loss": 0.3357, + "num_input_tokens_seen": 51791808, + "step": 33910 + }, + { + "epoch": 104.67697063369397, + "grad_norm": 0.45674240589141846, + "learning_rate": 2.802005542450409e-06, + "loss": 0.2381, + "num_input_tokens_seen": 51799424, + "step": 33915 + }, + { + "epoch": 104.69242658423494, + "grad_norm": 0.3574930429458618, + "learning_rate": 2.797491231087526e-06, + "loss": 0.253, + "num_input_tokens_seen": 51807392, + "step": 33920 + }, + { + "epoch": 104.70788253477589, + "grad_norm": 0.4644301235675812, + "learning_rate": 2.7929803436305137e-06, + "loss": 0.2312, + "num_input_tokens_seen": 51814976, + "step": 33925 + }, + { + "epoch": 104.72333848531684, + "grad_norm": 0.5821274518966675, + "learning_rate": 2.788472880774998e-06, + "loss": 0.2711, + "num_input_tokens_seen": 51822688, + "step": 33930 + }, + { + "epoch": 104.7387944358578, + "grad_norm": 0.4128824770450592, + "learning_rate": 2.7839688432160977e-06, + "loss": 0.2151, + "num_input_tokens_seen": 51830368, + "step": 33935 + }, + { + "epoch": 104.75425038639877, + "grad_norm": 0.7994541525840759, + "learning_rate": 2.779468231648383e-06, + "loss": 0.2624, + "num_input_tokens_seen": 51838528, + "step": 33940 + }, + { + "epoch": 104.76970633693972, + "grad_norm": 0.3391956686973572, + "learning_rate": 2.774971046765906e-06, + "loss": 0.2161, + "num_input_tokens_seen": 51845920, + "step": 33945 + }, + { + "epoch": 104.78516228748067, + "grad_norm": 0.41348797082901, + "learning_rate": 2.770477289262194e-06, + "loss": 0.2569, + "num_input_tokens_seen": 51853984, + "step": 33950 + }, + { + "epoch": 104.80061823802164, + "grad_norm": 0.2537861466407776, + "learning_rate": 2.765986959830233e-06, + "loss": 0.1995, + "num_input_tokens_seen": 51861152, + "step": 33955 + }, + { + "epoch": 104.8160741885626, + "grad_norm": 0.4398740231990814, + "learning_rate": 2.761500059162492e-06, + "loss": 0.2727, + "num_input_tokens_seen": 51869344, + "step": 33960 + }, + { + "epoch": 104.83153013910355, + "grad_norm": 0.523564338684082, + "learning_rate": 2.757016587950914e-06, + "loss": 0.198, + "num_input_tokens_seen": 51876800, + "step": 33965 + }, + { + "epoch": 104.84698608964452, + "grad_norm": 0.3965770900249481, + "learning_rate": 2.752536546886897e-06, + "loss": 0.2423, + "num_input_tokens_seen": 51884832, + "step": 33970 + }, + { + "epoch": 104.86244204018547, + "grad_norm": 0.5953988432884216, + "learning_rate": 2.7480599366613234e-06, + "loss": 0.3403, + "num_input_tokens_seen": 51892864, + "step": 33975 + }, + { + "epoch": 104.87789799072642, + "grad_norm": 0.4268450140953064, + "learning_rate": 2.7435867579645473e-06, + "loss": 0.2833, + "num_input_tokens_seen": 51900288, + "step": 33980 + }, + { + "epoch": 104.89335394126739, + "grad_norm": 0.39486485719680786, + "learning_rate": 2.739117011486378e-06, + "loss": 0.2243, + "num_input_tokens_seen": 51908128, + "step": 33985 + }, + { + "epoch": 104.90880989180835, + "grad_norm": 0.39357173442840576, + "learning_rate": 2.7346506979161216e-06, + "loss": 0.1822, + "num_input_tokens_seen": 51915360, + "step": 33990 + }, + { + "epoch": 104.9242658423493, + "grad_norm": 0.6417444348335266, + "learning_rate": 2.7301878179425227e-06, + "loss": 0.2138, + "num_input_tokens_seen": 51922784, + "step": 33995 + }, + { + "epoch": 104.93972179289027, + "grad_norm": 0.37946003675460815, + "learning_rate": 2.7257283722538244e-06, + "loss": 0.2471, + "num_input_tokens_seen": 51930240, + "step": 34000 + }, + { + "epoch": 104.93972179289027, + "eval_loss": 0.3022008240222931, + "eval_runtime": 6.328, + "eval_samples_per_second": 90.865, + "eval_steps_per_second": 22.756, + "num_input_tokens_seen": 51930240, + "step": 34000 + }, + { + "epoch": 104.95517774343122, + "grad_norm": 0.393149197101593, + "learning_rate": 2.7212723615377326e-06, + "loss": 0.2185, + "num_input_tokens_seen": 51937952, + "step": 34005 + }, + { + "epoch": 104.97063369397218, + "grad_norm": 0.34419360756874084, + "learning_rate": 2.7168197864814145e-06, + "loss": 0.2015, + "num_input_tokens_seen": 51945376, + "step": 34010 + }, + { + "epoch": 104.98608964451314, + "grad_norm": 0.44695085287094116, + "learning_rate": 2.712370647771509e-06, + "loss": 0.2508, + "num_input_tokens_seen": 51952672, + "step": 34015 + }, + { + "epoch": 105.0, + "grad_norm": 0.8375548720359802, + "learning_rate": 2.707924946094137e-06, + "loss": 0.1904, + "num_input_tokens_seen": 51959232, + "step": 34020 + }, + { + "epoch": 105.01545595054095, + "grad_norm": 0.5368375182151794, + "learning_rate": 2.7034826821348723e-06, + "loss": 0.2501, + "num_input_tokens_seen": 51966688, + "step": 34025 + }, + { + "epoch": 105.03091190108192, + "grad_norm": 0.3732016980648041, + "learning_rate": 2.6990438565787786e-06, + "loss": 0.2446, + "num_input_tokens_seen": 51974944, + "step": 34030 + }, + { + "epoch": 105.04636785162288, + "grad_norm": 0.7745949029922485, + "learning_rate": 2.6946084701103714e-06, + "loss": 0.2465, + "num_input_tokens_seen": 51982784, + "step": 34035 + }, + { + "epoch": 105.06182380216383, + "grad_norm": 0.3556444048881531, + "learning_rate": 2.6901765234136428e-06, + "loss": 0.2008, + "num_input_tokens_seen": 51989856, + "step": 34040 + }, + { + "epoch": 105.0772797527048, + "grad_norm": 0.9908781051635742, + "learning_rate": 2.685748017172063e-06, + "loss": 0.231, + "num_input_tokens_seen": 51997120, + "step": 34045 + }, + { + "epoch": 105.09273570324575, + "grad_norm": 1.000187873840332, + "learning_rate": 2.681322952068549e-06, + "loss": 0.3712, + "num_input_tokens_seen": 52004800, + "step": 34050 + }, + { + "epoch": 105.1081916537867, + "grad_norm": 0.6969144344329834, + "learning_rate": 2.6769013287855137e-06, + "loss": 0.2383, + "num_input_tokens_seen": 52012736, + "step": 34055 + }, + { + "epoch": 105.12364760432767, + "grad_norm": 0.3808014392852783, + "learning_rate": 2.6724831480048286e-06, + "loss": 0.237, + "num_input_tokens_seen": 52020896, + "step": 34060 + }, + { + "epoch": 105.13910355486863, + "grad_norm": 0.6386421322822571, + "learning_rate": 2.66806841040782e-06, + "loss": 0.253, + "num_input_tokens_seen": 52028704, + "step": 34065 + }, + { + "epoch": 105.15455950540958, + "grad_norm": 0.37981319427490234, + "learning_rate": 2.6636571166753083e-06, + "loss": 0.2544, + "num_input_tokens_seen": 52036000, + "step": 34070 + }, + { + "epoch": 105.17001545595055, + "grad_norm": 0.6607080698013306, + "learning_rate": 2.6592492674875598e-06, + "loss": 0.233, + "num_input_tokens_seen": 52043424, + "step": 34075 + }, + { + "epoch": 105.1854714064915, + "grad_norm": 0.44023507833480835, + "learning_rate": 2.6548448635243305e-06, + "loss": 0.2618, + "num_input_tokens_seen": 52050656, + "step": 34080 + }, + { + "epoch": 105.20092735703246, + "grad_norm": 0.4558001160621643, + "learning_rate": 2.650443905464828e-06, + "loss": 0.2312, + "num_input_tokens_seen": 52058208, + "step": 34085 + }, + { + "epoch": 105.21638330757341, + "grad_norm": 0.5027107000350952, + "learning_rate": 2.646046393987739e-06, + "loss": 0.2706, + "num_input_tokens_seen": 52065888, + "step": 34090 + }, + { + "epoch": 105.23183925811438, + "grad_norm": 0.48480939865112305, + "learning_rate": 2.64165232977121e-06, + "loss": 0.235, + "num_input_tokens_seen": 52073280, + "step": 34095 + }, + { + "epoch": 105.24729520865533, + "grad_norm": 0.4199996590614319, + "learning_rate": 2.6372617134928695e-06, + "loss": 0.2337, + "num_input_tokens_seen": 52080800, + "step": 34100 + }, + { + "epoch": 105.26275115919628, + "grad_norm": 0.5785274505615234, + "learning_rate": 2.6328745458297943e-06, + "loss": 0.2634, + "num_input_tokens_seen": 52088576, + "step": 34105 + }, + { + "epoch": 105.27820710973725, + "grad_norm": 0.616492509841919, + "learning_rate": 2.6284908274585546e-06, + "loss": 0.2645, + "num_input_tokens_seen": 52096384, + "step": 34110 + }, + { + "epoch": 105.2936630602782, + "grad_norm": 0.3106856346130371, + "learning_rate": 2.6241105590551595e-06, + "loss": 0.209, + "num_input_tokens_seen": 52104064, + "step": 34115 + }, + { + "epoch": 105.30911901081916, + "grad_norm": 0.5991312265396118, + "learning_rate": 2.6197337412951105e-06, + "loss": 0.2669, + "num_input_tokens_seen": 52111328, + "step": 34120 + }, + { + "epoch": 105.32457496136013, + "grad_norm": 0.4233664572238922, + "learning_rate": 2.6153603748533705e-06, + "loss": 0.2055, + "num_input_tokens_seen": 52119008, + "step": 34125 + }, + { + "epoch": 105.34003091190108, + "grad_norm": 0.513839840888977, + "learning_rate": 2.6109904604043585e-06, + "loss": 0.2147, + "num_input_tokens_seen": 52126752, + "step": 34130 + }, + { + "epoch": 105.35548686244204, + "grad_norm": 0.3046720325946808, + "learning_rate": 2.6066239986219765e-06, + "loss": 0.2366, + "num_input_tokens_seen": 52134496, + "step": 34135 + }, + { + "epoch": 105.370942812983, + "grad_norm": 0.500855565071106, + "learning_rate": 2.602260990179592e-06, + "loss": 0.2806, + "num_input_tokens_seen": 52142048, + "step": 34140 + }, + { + "epoch": 105.38639876352396, + "grad_norm": 0.43475326895713806, + "learning_rate": 2.5979014357500248e-06, + "loss": 0.2508, + "num_input_tokens_seen": 52149344, + "step": 34145 + }, + { + "epoch": 105.40185471406491, + "grad_norm": 0.48115089535713196, + "learning_rate": 2.5935453360055844e-06, + "loss": 0.2003, + "num_input_tokens_seen": 52156768, + "step": 34150 + }, + { + "epoch": 105.41731066460588, + "grad_norm": 0.5334502458572388, + "learning_rate": 2.5891926916180283e-06, + "loss": 0.3212, + "num_input_tokens_seen": 52164512, + "step": 34155 + }, + { + "epoch": 105.43276661514683, + "grad_norm": 0.3645380437374115, + "learning_rate": 2.5848435032585883e-06, + "loss": 0.2343, + "num_input_tokens_seen": 52172320, + "step": 34160 + }, + { + "epoch": 105.44822256568779, + "grad_norm": 0.5639675855636597, + "learning_rate": 2.58049777159797e-06, + "loss": 0.216, + "num_input_tokens_seen": 52179648, + "step": 34165 + }, + { + "epoch": 105.46367851622875, + "grad_norm": 0.48624691367149353, + "learning_rate": 2.576155497306332e-06, + "loss": 0.2056, + "num_input_tokens_seen": 52187168, + "step": 34170 + }, + { + "epoch": 105.47913446676971, + "grad_norm": 0.36620527505874634, + "learning_rate": 2.57181668105331e-06, + "loss": 0.2477, + "num_input_tokens_seen": 52195232, + "step": 34175 + }, + { + "epoch": 105.49459041731066, + "grad_norm": 0.5738275051116943, + "learning_rate": 2.567481323508014e-06, + "loss": 0.2956, + "num_input_tokens_seen": 52203232, + "step": 34180 + }, + { + "epoch": 105.51004636785163, + "grad_norm": 0.7295140624046326, + "learning_rate": 2.5631494253389954e-06, + "loss": 0.2682, + "num_input_tokens_seen": 52211424, + "step": 34185 + }, + { + "epoch": 105.52550231839258, + "grad_norm": 0.5889233946800232, + "learning_rate": 2.5588209872142997e-06, + "loss": 0.2152, + "num_input_tokens_seen": 52218656, + "step": 34190 + }, + { + "epoch": 105.54095826893354, + "grad_norm": 0.3876938223838806, + "learning_rate": 2.5544960098014186e-06, + "loss": 0.2183, + "num_input_tokens_seen": 52226016, + "step": 34195 + }, + { + "epoch": 105.5564142194745, + "grad_norm": 0.6150287985801697, + "learning_rate": 2.550174493767318e-06, + "loss": 0.2691, + "num_input_tokens_seen": 52233888, + "step": 34200 + }, + { + "epoch": 105.5564142194745, + "eval_loss": 0.30205705761909485, + "eval_runtime": 6.3198, + "eval_samples_per_second": 90.984, + "eval_steps_per_second": 22.786, + "num_input_tokens_seen": 52233888, + "step": 34200 + }, + { + "epoch": 105.57187017001546, + "grad_norm": 0.67551189661026, + "learning_rate": 2.545856439778438e-06, + "loss": 0.2094, + "num_input_tokens_seen": 52241984, + "step": 34205 + }, + { + "epoch": 105.58732612055641, + "grad_norm": 0.45362964272499084, + "learning_rate": 2.541541848500667e-06, + "loss": 0.2917, + "num_input_tokens_seen": 52249504, + "step": 34210 + }, + { + "epoch": 105.60278207109737, + "grad_norm": 0.5583823323249817, + "learning_rate": 2.5372307205993733e-06, + "loss": 0.2716, + "num_input_tokens_seen": 52257024, + "step": 34215 + }, + { + "epoch": 105.61823802163833, + "grad_norm": 0.3921528458595276, + "learning_rate": 2.5329230567393917e-06, + "loss": 0.2076, + "num_input_tokens_seen": 52265120, + "step": 34220 + }, + { + "epoch": 105.63369397217929, + "grad_norm": 0.6707494258880615, + "learning_rate": 2.5286188575850164e-06, + "loss": 0.2084, + "num_input_tokens_seen": 52273280, + "step": 34225 + }, + { + "epoch": 105.64914992272024, + "grad_norm": 0.28334304690361023, + "learning_rate": 2.5243181237999984e-06, + "loss": 0.2374, + "num_input_tokens_seen": 52280736, + "step": 34230 + }, + { + "epoch": 105.66460587326121, + "grad_norm": 0.3849351108074188, + "learning_rate": 2.520020856047578e-06, + "loss": 0.1995, + "num_input_tokens_seen": 52288832, + "step": 34235 + }, + { + "epoch": 105.68006182380216, + "grad_norm": 0.5251360535621643, + "learning_rate": 2.515727054990438e-06, + "loss": 0.2737, + "num_input_tokens_seen": 52296544, + "step": 34240 + }, + { + "epoch": 105.69551777434312, + "grad_norm": 0.4693096876144409, + "learning_rate": 2.511436721290747e-06, + "loss": 0.2492, + "num_input_tokens_seen": 52303680, + "step": 34245 + }, + { + "epoch": 105.71097372488408, + "grad_norm": 0.5535417199134827, + "learning_rate": 2.5071498556101164e-06, + "loss": 0.1934, + "num_input_tokens_seen": 52311552, + "step": 34250 + }, + { + "epoch": 105.72642967542504, + "grad_norm": 0.8918483853340149, + "learning_rate": 2.5028664586096485e-06, + "loss": 0.27, + "num_input_tokens_seen": 52319744, + "step": 34255 + }, + { + "epoch": 105.74188562596599, + "grad_norm": 0.6476888060569763, + "learning_rate": 2.498586530949881e-06, + "loss": 0.2134, + "num_input_tokens_seen": 52327424, + "step": 34260 + }, + { + "epoch": 105.75734157650696, + "grad_norm": 0.6706854701042175, + "learning_rate": 2.4943100732908427e-06, + "loss": 0.2588, + "num_input_tokens_seen": 52335008, + "step": 34265 + }, + { + "epoch": 105.77279752704791, + "grad_norm": 0.41605615615844727, + "learning_rate": 2.4900370862920188e-06, + "loss": 0.2284, + "num_input_tokens_seen": 52342400, + "step": 34270 + }, + { + "epoch": 105.78825347758887, + "grad_norm": 0.6049885749816895, + "learning_rate": 2.4857675706123518e-06, + "loss": 0.2481, + "num_input_tokens_seen": 52350048, + "step": 34275 + }, + { + "epoch": 105.80370942812984, + "grad_norm": 0.4254513680934906, + "learning_rate": 2.4815015269102543e-06, + "loss": 0.2257, + "num_input_tokens_seen": 52357856, + "step": 34280 + }, + { + "epoch": 105.81916537867079, + "grad_norm": 0.5291129350662231, + "learning_rate": 2.477238955843611e-06, + "loss": 0.2564, + "num_input_tokens_seen": 52365600, + "step": 34285 + }, + { + "epoch": 105.83462132921174, + "grad_norm": 0.3395911455154419, + "learning_rate": 2.4729798580697573e-06, + "loss": 0.2531, + "num_input_tokens_seen": 52372928, + "step": 34290 + }, + { + "epoch": 105.85007727975271, + "grad_norm": 0.6918345093727112, + "learning_rate": 2.4687242342455034e-06, + "loss": 0.2416, + "num_input_tokens_seen": 52380384, + "step": 34295 + }, + { + "epoch": 105.86553323029366, + "grad_norm": 0.42484110593795776, + "learning_rate": 2.4644720850271196e-06, + "loss": 0.287, + "num_input_tokens_seen": 52388256, + "step": 34300 + }, + { + "epoch": 105.88098918083462, + "grad_norm": 0.5580119490623474, + "learning_rate": 2.4602234110703364e-06, + "loss": 0.2492, + "num_input_tokens_seen": 52396224, + "step": 34305 + }, + { + "epoch": 105.89644513137559, + "grad_norm": 0.5041733980178833, + "learning_rate": 2.4559782130303576e-06, + "loss": 0.2162, + "num_input_tokens_seen": 52404096, + "step": 34310 + }, + { + "epoch": 105.91190108191654, + "grad_norm": 0.3938717246055603, + "learning_rate": 2.451736491561843e-06, + "loss": 0.3116, + "num_input_tokens_seen": 52411840, + "step": 34315 + }, + { + "epoch": 105.9273570324575, + "grad_norm": 0.3825080692768097, + "learning_rate": 2.4474982473189163e-06, + "loss": 0.2119, + "num_input_tokens_seen": 52419776, + "step": 34320 + }, + { + "epoch": 105.94281298299846, + "grad_norm": 0.35137778520584106, + "learning_rate": 2.4432634809551796e-06, + "loss": 0.2937, + "num_input_tokens_seen": 52426880, + "step": 34325 + }, + { + "epoch": 105.95826893353942, + "grad_norm": 0.5234748721122742, + "learning_rate": 2.439032193123675e-06, + "loss": 0.2725, + "num_input_tokens_seen": 52434208, + "step": 34330 + }, + { + "epoch": 105.97372488408037, + "grad_norm": 0.6589529514312744, + "learning_rate": 2.4348043844769297e-06, + "loss": 0.2725, + "num_input_tokens_seen": 52441920, + "step": 34335 + }, + { + "epoch": 105.98918083462132, + "grad_norm": 0.5196415781974792, + "learning_rate": 2.4305800556669146e-06, + "loss": 0.2433, + "num_input_tokens_seen": 52449824, + "step": 34340 + }, + { + "epoch": 106.0030911901082, + "grad_norm": 0.5022512078285217, + "learning_rate": 2.426359207345083e-06, + "loss": 0.2827, + "num_input_tokens_seen": 52456752, + "step": 34345 + }, + { + "epoch": 106.01854714064915, + "grad_norm": 0.6077545285224915, + "learning_rate": 2.4221418401623396e-06, + "loss": 0.2956, + "num_input_tokens_seen": 52464272, + "step": 34350 + }, + { + "epoch": 106.03400309119012, + "grad_norm": 0.41092509031295776, + "learning_rate": 2.4179279547690557e-06, + "loss": 0.2039, + "num_input_tokens_seen": 52471792, + "step": 34355 + }, + { + "epoch": 106.04945904173107, + "grad_norm": 0.5990587472915649, + "learning_rate": 2.413717551815062e-06, + "loss": 0.226, + "num_input_tokens_seen": 52479376, + "step": 34360 + }, + { + "epoch": 106.06491499227202, + "grad_norm": 0.47703808546066284, + "learning_rate": 2.409510631949666e-06, + "loss": 0.2711, + "num_input_tokens_seen": 52487216, + "step": 34365 + }, + { + "epoch": 106.08037094281298, + "grad_norm": 0.49033331871032715, + "learning_rate": 2.405307195821618e-06, + "loss": 0.2091, + "num_input_tokens_seen": 52495088, + "step": 34370 + }, + { + "epoch": 106.09582689335394, + "grad_norm": 0.5566762685775757, + "learning_rate": 2.4011072440791372e-06, + "loss": 0.2182, + "num_input_tokens_seen": 52502896, + "step": 34375 + }, + { + "epoch": 106.1112828438949, + "grad_norm": 0.3249191343784332, + "learning_rate": 2.3969107773699233e-06, + "loss": 0.2274, + "num_input_tokens_seen": 52510224, + "step": 34380 + }, + { + "epoch": 106.12673879443585, + "grad_norm": 0.4396032691001892, + "learning_rate": 2.3927177963411096e-06, + "loss": 0.2629, + "num_input_tokens_seen": 52518000, + "step": 34385 + }, + { + "epoch": 106.14219474497682, + "grad_norm": 0.48799529671669006, + "learning_rate": 2.3885283016393144e-06, + "loss": 0.2423, + "num_input_tokens_seen": 52525648, + "step": 34390 + }, + { + "epoch": 106.15765069551777, + "grad_norm": 0.7018665671348572, + "learning_rate": 2.3843422939106076e-06, + "loss": 0.2308, + "num_input_tokens_seen": 52533424, + "step": 34395 + }, + { + "epoch": 106.17310664605873, + "grad_norm": 0.3249553143978119, + "learning_rate": 2.380159773800525e-06, + "loss": 0.2033, + "num_input_tokens_seen": 52541008, + "step": 34400 + }, + { + "epoch": 106.17310664605873, + "eval_loss": 0.30035173892974854, + "eval_runtime": 6.3327, + "eval_samples_per_second": 90.799, + "eval_steps_per_second": 22.739, + "num_input_tokens_seen": 52541008, + "step": 34400 + }, + { + "epoch": 106.1885625965997, + "grad_norm": 0.5734151005744934, + "learning_rate": 2.3759807419540675e-06, + "loss": 0.2401, + "num_input_tokens_seen": 52548144, + "step": 34405 + }, + { + "epoch": 106.20401854714065, + "grad_norm": 0.3849896490573883, + "learning_rate": 2.3718051990156835e-06, + "loss": 0.2099, + "num_input_tokens_seen": 52555664, + "step": 34410 + }, + { + "epoch": 106.2194744976816, + "grad_norm": 0.5080630779266357, + "learning_rate": 2.367633145629311e-06, + "loss": 0.2117, + "num_input_tokens_seen": 52563280, + "step": 34415 + }, + { + "epoch": 106.23493044822257, + "grad_norm": 0.3425595164299011, + "learning_rate": 2.363464582438316e-06, + "loss": 0.212, + "num_input_tokens_seen": 52571184, + "step": 34420 + }, + { + "epoch": 106.25038639876352, + "grad_norm": 0.5643797516822815, + "learning_rate": 2.3592995100855526e-06, + "loss": 0.2808, + "num_input_tokens_seen": 52578768, + "step": 34425 + }, + { + "epoch": 106.26584234930448, + "grad_norm": 0.7590980529785156, + "learning_rate": 2.3551379292133273e-06, + "loss": 0.2361, + "num_input_tokens_seen": 52586096, + "step": 34430 + }, + { + "epoch": 106.28129829984545, + "grad_norm": 0.7587357759475708, + "learning_rate": 2.3509798404634047e-06, + "loss": 0.3015, + "num_input_tokens_seen": 52593648, + "step": 34435 + }, + { + "epoch": 106.2967542503864, + "grad_norm": 0.7133234739303589, + "learning_rate": 2.346825244477019e-06, + "loss": 0.1908, + "num_input_tokens_seen": 52601616, + "step": 34440 + }, + { + "epoch": 106.31221020092735, + "grad_norm": 0.43921905755996704, + "learning_rate": 2.3426741418948545e-06, + "loss": 0.2296, + "num_input_tokens_seen": 52609008, + "step": 34445 + }, + { + "epoch": 106.32766615146832, + "grad_norm": 0.5584828853607178, + "learning_rate": 2.3385265333570715e-06, + "loss": 0.2539, + "num_input_tokens_seen": 52616496, + "step": 34450 + }, + { + "epoch": 106.34312210200927, + "grad_norm": 0.49856507778167725, + "learning_rate": 2.334382419503278e-06, + "loss": 0.1994, + "num_input_tokens_seen": 52623792, + "step": 34455 + }, + { + "epoch": 106.35857805255023, + "grad_norm": 0.4944525957107544, + "learning_rate": 2.3302418009725465e-06, + "loss": 0.236, + "num_input_tokens_seen": 52631568, + "step": 34460 + }, + { + "epoch": 106.3740340030912, + "grad_norm": 0.5218174457550049, + "learning_rate": 2.326104678403415e-06, + "loss": 0.2268, + "num_input_tokens_seen": 52639440, + "step": 34465 + }, + { + "epoch": 106.38948995363215, + "grad_norm": 0.34524986147880554, + "learning_rate": 2.321971052433883e-06, + "loss": 0.2554, + "num_input_tokens_seen": 52646992, + "step": 34470 + }, + { + "epoch": 106.4049459041731, + "grad_norm": 0.6996511220932007, + "learning_rate": 2.3178409237014004e-06, + "loss": 0.3139, + "num_input_tokens_seen": 52654544, + "step": 34475 + }, + { + "epoch": 106.42040185471407, + "grad_norm": 0.8113186955451965, + "learning_rate": 2.313714292842889e-06, + "loss": 0.241, + "num_input_tokens_seen": 52662512, + "step": 34480 + }, + { + "epoch": 106.43585780525503, + "grad_norm": 0.5931443572044373, + "learning_rate": 2.309591160494734e-06, + "loss": 0.2252, + "num_input_tokens_seen": 52669968, + "step": 34485 + }, + { + "epoch": 106.45131375579598, + "grad_norm": 0.3985709249973297, + "learning_rate": 2.305471527292763e-06, + "loss": 0.2578, + "num_input_tokens_seen": 52678096, + "step": 34490 + }, + { + "epoch": 106.46676970633693, + "grad_norm": 0.6057313680648804, + "learning_rate": 2.3013553938722817e-06, + "loss": 0.2493, + "num_input_tokens_seen": 52685584, + "step": 34495 + }, + { + "epoch": 106.4822256568779, + "grad_norm": 0.5078456401824951, + "learning_rate": 2.297242760868043e-06, + "loss": 0.2155, + "num_input_tokens_seen": 52693296, + "step": 34500 + }, + { + "epoch": 106.49768160741885, + "grad_norm": 0.33153703808784485, + "learning_rate": 2.2931336289142735e-06, + "loss": 0.2116, + "num_input_tokens_seen": 52700912, + "step": 34505 + }, + { + "epoch": 106.51313755795981, + "grad_norm": 0.6570813655853271, + "learning_rate": 2.289027998644655e-06, + "loss": 0.3039, + "num_input_tokens_seen": 52708624, + "step": 34510 + }, + { + "epoch": 106.52859350850078, + "grad_norm": 0.632634162902832, + "learning_rate": 2.2849258706923228e-06, + "loss": 0.2799, + "num_input_tokens_seen": 52716656, + "step": 34515 + }, + { + "epoch": 106.54404945904173, + "grad_norm": 0.7795816659927368, + "learning_rate": 2.2808272456898705e-06, + "loss": 0.2775, + "num_input_tokens_seen": 52724368, + "step": 34520 + }, + { + "epoch": 106.55950540958268, + "grad_norm": 0.2860281467437744, + "learning_rate": 2.2767321242693707e-06, + "loss": 0.2947, + "num_input_tokens_seen": 52732208, + "step": 34525 + }, + { + "epoch": 106.57496136012365, + "grad_norm": 0.684747576713562, + "learning_rate": 2.272640507062329e-06, + "loss": 0.347, + "num_input_tokens_seen": 52739408, + "step": 34530 + }, + { + "epoch": 106.5904173106646, + "grad_norm": 0.5203025341033936, + "learning_rate": 2.2685523946997382e-06, + "loss": 0.2396, + "num_input_tokens_seen": 52747568, + "step": 34535 + }, + { + "epoch": 106.60587326120556, + "grad_norm": 0.46731916069984436, + "learning_rate": 2.2644677878120245e-06, + "loss": 0.2407, + "num_input_tokens_seen": 52755088, + "step": 34540 + }, + { + "epoch": 106.62132921174653, + "grad_norm": 0.488639235496521, + "learning_rate": 2.2603866870290897e-06, + "loss": 0.2409, + "num_input_tokens_seen": 52762672, + "step": 34545 + }, + { + "epoch": 106.63678516228748, + "grad_norm": 0.42432302236557007, + "learning_rate": 2.256309092980294e-06, + "loss": 0.24, + "num_input_tokens_seen": 52770512, + "step": 34550 + }, + { + "epoch": 106.65224111282843, + "grad_norm": 0.4266451597213745, + "learning_rate": 2.252235006294448e-06, + "loss": 0.2201, + "num_input_tokens_seen": 52778416, + "step": 34555 + }, + { + "epoch": 106.6676970633694, + "grad_norm": 0.5408986210823059, + "learning_rate": 2.2481644275998333e-06, + "loss": 0.2426, + "num_input_tokens_seen": 52786032, + "step": 34560 + }, + { + "epoch": 106.68315301391036, + "grad_norm": 0.45088472962379456, + "learning_rate": 2.2440973575241832e-06, + "loss": 0.2987, + "num_input_tokens_seen": 52793360, + "step": 34565 + }, + { + "epoch": 106.69860896445131, + "grad_norm": 0.3145276606082916, + "learning_rate": 2.240033796694685e-06, + "loss": 0.2543, + "num_input_tokens_seen": 52800880, + "step": 34570 + }, + { + "epoch": 106.71406491499228, + "grad_norm": 0.6441019177436829, + "learning_rate": 2.235973745737999e-06, + "loss": 0.2779, + "num_input_tokens_seen": 52807984, + "step": 34575 + }, + { + "epoch": 106.72952086553323, + "grad_norm": 0.457920104265213, + "learning_rate": 2.2319172052802263e-06, + "loss": 0.2595, + "num_input_tokens_seen": 52815984, + "step": 34580 + }, + { + "epoch": 106.74497681607419, + "grad_norm": 0.686439573764801, + "learning_rate": 2.2278641759469477e-06, + "loss": 0.2658, + "num_input_tokens_seen": 52823824, + "step": 34585 + }, + { + "epoch": 106.76043276661515, + "grad_norm": 0.4323807954788208, + "learning_rate": 2.2238146583631825e-06, + "loss": 0.2451, + "num_input_tokens_seen": 52831248, + "step": 34590 + }, + { + "epoch": 106.7758887171561, + "grad_norm": 0.46340176463127136, + "learning_rate": 2.2197686531534256e-06, + "loss": 0.2556, + "num_input_tokens_seen": 52838736, + "step": 34595 + }, + { + "epoch": 106.79134466769706, + "grad_norm": 0.46141305565834045, + "learning_rate": 2.2157261609416087e-06, + "loss": 0.243, + "num_input_tokens_seen": 52845904, + "step": 34600 + }, + { + "epoch": 106.79134466769706, + "eval_loss": 0.30159005522727966, + "eval_runtime": 6.3044, + "eval_samples_per_second": 91.206, + "eval_steps_per_second": 22.841, + "num_input_tokens_seen": 52845904, + "step": 34600 + }, + { + "epoch": 106.80680061823801, + "grad_norm": 0.5813551545143127, + "learning_rate": 2.211687182351149e-06, + "loss": 0.2417, + "num_input_tokens_seen": 52853616, + "step": 34605 + }, + { + "epoch": 106.82225656877898, + "grad_norm": 0.33824652433395386, + "learning_rate": 2.2076517180048993e-06, + "loss": 0.2254, + "num_input_tokens_seen": 52861264, + "step": 34610 + }, + { + "epoch": 106.83771251931994, + "grad_norm": 0.49106934666633606, + "learning_rate": 2.2036197685251834e-06, + "loss": 0.274, + "num_input_tokens_seen": 52869616, + "step": 34615 + }, + { + "epoch": 106.85316846986089, + "grad_norm": 0.3179728388786316, + "learning_rate": 2.199591334533771e-06, + "loss": 0.2025, + "num_input_tokens_seen": 52877168, + "step": 34620 + }, + { + "epoch": 106.86862442040186, + "grad_norm": 0.6294081807136536, + "learning_rate": 2.1955664166519036e-06, + "loss": 0.2089, + "num_input_tokens_seen": 52884656, + "step": 34625 + }, + { + "epoch": 106.88408037094281, + "grad_norm": 0.374787837266922, + "learning_rate": 2.1915450155002793e-06, + "loss": 0.237, + "num_input_tokens_seen": 52892336, + "step": 34630 + }, + { + "epoch": 106.89953632148377, + "grad_norm": 0.49088791012763977, + "learning_rate": 2.187527131699038e-06, + "loss": 0.2768, + "num_input_tokens_seen": 52900272, + "step": 34635 + }, + { + "epoch": 106.91499227202473, + "grad_norm": 0.40188488364219666, + "learning_rate": 2.18351276586779e-06, + "loss": 0.1896, + "num_input_tokens_seen": 52907920, + "step": 34640 + }, + { + "epoch": 106.93044822256569, + "grad_norm": 0.49814900755882263, + "learning_rate": 2.1795019186256092e-06, + "loss": 0.2595, + "num_input_tokens_seen": 52916016, + "step": 34645 + }, + { + "epoch": 106.94590417310664, + "grad_norm": 0.4712717831134796, + "learning_rate": 2.1754945905910094e-06, + "loss": 0.3106, + "num_input_tokens_seen": 52923920, + "step": 34650 + }, + { + "epoch": 106.96136012364761, + "grad_norm": 0.6784136891365051, + "learning_rate": 2.171490782381977e-06, + "loss": 0.2257, + "num_input_tokens_seen": 52931504, + "step": 34655 + }, + { + "epoch": 106.97681607418856, + "grad_norm": 0.8664469122886658, + "learning_rate": 2.1674904946159425e-06, + "loss": 0.2236, + "num_input_tokens_seen": 52938896, + "step": 34660 + }, + { + "epoch": 106.99227202472952, + "grad_norm": 0.6320236921310425, + "learning_rate": 2.16349372790981e-06, + "loss": 0.2505, + "num_input_tokens_seen": 52946576, + "step": 34665 + }, + { + "epoch": 107.00618238021639, + "grad_norm": 0.4476604759693146, + "learning_rate": 2.159500482879928e-06, + "loss": 0.2513, + "num_input_tokens_seen": 52953504, + "step": 34670 + }, + { + "epoch": 107.02163833075734, + "grad_norm": 0.47264644503593445, + "learning_rate": 2.155510760142096e-06, + "loss": 0.2522, + "num_input_tokens_seen": 52961248, + "step": 34675 + }, + { + "epoch": 107.0370942812983, + "grad_norm": 0.5187572240829468, + "learning_rate": 2.151524560311588e-06, + "loss": 0.2686, + "num_input_tokens_seen": 52968704, + "step": 34680 + }, + { + "epoch": 107.05255023183926, + "grad_norm": 0.5437662601470947, + "learning_rate": 2.147541884003129e-06, + "loss": 0.2683, + "num_input_tokens_seen": 52976160, + "step": 34685 + }, + { + "epoch": 107.06800618238022, + "grad_norm": 0.5639529824256897, + "learning_rate": 2.1435627318308895e-06, + "loss": 0.2443, + "num_input_tokens_seen": 52984064, + "step": 34690 + }, + { + "epoch": 107.08346213292117, + "grad_norm": 0.43180060386657715, + "learning_rate": 2.139587104408511e-06, + "loss": 0.3356, + "num_input_tokens_seen": 52991616, + "step": 34695 + }, + { + "epoch": 107.09891808346214, + "grad_norm": 0.38762974739074707, + "learning_rate": 2.1356150023490783e-06, + "loss": 0.2411, + "num_input_tokens_seen": 52999392, + "step": 34700 + }, + { + "epoch": 107.11437403400309, + "grad_norm": 0.4188445806503296, + "learning_rate": 2.1316464262651464e-06, + "loss": 0.2024, + "num_input_tokens_seen": 53006624, + "step": 34705 + }, + { + "epoch": 107.12982998454405, + "grad_norm": 1.0899710655212402, + "learning_rate": 2.1276813767687224e-06, + "loss": 0.3122, + "num_input_tokens_seen": 53014304, + "step": 34710 + }, + { + "epoch": 107.14528593508501, + "grad_norm": 0.4064508378505707, + "learning_rate": 2.123719854471254e-06, + "loss": 0.2885, + "num_input_tokens_seen": 53022528, + "step": 34715 + }, + { + "epoch": 107.16074188562597, + "grad_norm": 0.4991737902164459, + "learning_rate": 2.119761859983668e-06, + "loss": 0.2193, + "num_input_tokens_seen": 53030592, + "step": 34720 + }, + { + "epoch": 107.17619783616692, + "grad_norm": 0.5136814713478088, + "learning_rate": 2.1158073939163386e-06, + "loss": 0.268, + "num_input_tokens_seen": 53037856, + "step": 34725 + }, + { + "epoch": 107.19165378670789, + "grad_norm": 0.8907212615013123, + "learning_rate": 2.111856456879088e-06, + "loss": 0.3001, + "num_input_tokens_seen": 53045248, + "step": 34730 + }, + { + "epoch": 107.20710973724884, + "grad_norm": 0.9722672700881958, + "learning_rate": 2.1079090494811993e-06, + "loss": 0.2814, + "num_input_tokens_seen": 53052704, + "step": 34735 + }, + { + "epoch": 107.2225656877898, + "grad_norm": 0.7105987668037415, + "learning_rate": 2.103965172331418e-06, + "loss": 0.2388, + "num_input_tokens_seen": 53060224, + "step": 34740 + }, + { + "epoch": 107.23802163833076, + "grad_norm": 0.46654224395751953, + "learning_rate": 2.100024826037933e-06, + "loss": 0.2177, + "num_input_tokens_seen": 53068032, + "step": 34745 + }, + { + "epoch": 107.25347758887172, + "grad_norm": 0.7451262474060059, + "learning_rate": 2.0960880112084027e-06, + "loss": 0.2406, + "num_input_tokens_seen": 53075328, + "step": 34750 + }, + { + "epoch": 107.26893353941267, + "grad_norm": 0.6784075498580933, + "learning_rate": 2.092154728449927e-06, + "loss": 0.2809, + "num_input_tokens_seen": 53082816, + "step": 34755 + }, + { + "epoch": 107.28438948995363, + "grad_norm": 0.7083227038383484, + "learning_rate": 2.0882249783690687e-06, + "loss": 0.2513, + "num_input_tokens_seen": 53090080, + "step": 34760 + }, + { + "epoch": 107.2998454404946, + "grad_norm": 0.5213271975517273, + "learning_rate": 2.084298761571851e-06, + "loss": 0.2394, + "num_input_tokens_seen": 53097568, + "step": 34765 + }, + { + "epoch": 107.31530139103555, + "grad_norm": 0.3345949351787567, + "learning_rate": 2.080376078663737e-06, + "loss": 0.2098, + "num_input_tokens_seen": 53105504, + "step": 34770 + }, + { + "epoch": 107.3307573415765, + "grad_norm": 0.36843714118003845, + "learning_rate": 2.0764569302496593e-06, + "loss": 0.2168, + "num_input_tokens_seen": 53112992, + "step": 34775 + }, + { + "epoch": 107.34621329211747, + "grad_norm": 0.33050742745399475, + "learning_rate": 2.0725413169339957e-06, + "loss": 0.2404, + "num_input_tokens_seen": 53120480, + "step": 34780 + }, + { + "epoch": 107.36166924265842, + "grad_norm": 0.44873180985450745, + "learning_rate": 2.068629239320588e-06, + "loss": 0.2306, + "num_input_tokens_seen": 53128000, + "step": 34785 + }, + { + "epoch": 107.37712519319938, + "grad_norm": 0.5386741757392883, + "learning_rate": 2.064720698012726e-06, + "loss": 0.2966, + "num_input_tokens_seen": 53135584, + "step": 34790 + }, + { + "epoch": 107.39258114374034, + "grad_norm": 0.49759766459465027, + "learning_rate": 2.0608156936131522e-06, + "loss": 0.2261, + "num_input_tokens_seen": 53143200, + "step": 34795 + }, + { + "epoch": 107.4080370942813, + "grad_norm": 0.6327207088470459, + "learning_rate": 2.056914226724074e-06, + "loss": 0.1853, + "num_input_tokens_seen": 53150720, + "step": 34800 + }, + { + "epoch": 107.4080370942813, + "eval_loss": 0.30257660150527954, + "eval_runtime": 6.3373, + "eval_samples_per_second": 90.732, + "eval_steps_per_second": 22.722, + "num_input_tokens_seen": 53150720, + "step": 34800 + }, + { + "epoch": 107.42349304482225, + "grad_norm": 0.8970136642456055, + "learning_rate": 2.0530162979471385e-06, + "loss": 0.2235, + "num_input_tokens_seen": 53158272, + "step": 34805 + }, + { + "epoch": 107.43894899536322, + "grad_norm": 0.6317940950393677, + "learning_rate": 2.0491219078834667e-06, + "loss": 0.2446, + "num_input_tokens_seen": 53165952, + "step": 34810 + }, + { + "epoch": 107.45440494590417, + "grad_norm": 0.3142385184764862, + "learning_rate": 2.045231057133612e-06, + "loss": 0.2076, + "num_input_tokens_seen": 53173696, + "step": 34815 + }, + { + "epoch": 107.46986089644513, + "grad_norm": 0.532596230506897, + "learning_rate": 2.0413437462975944e-06, + "loss": 0.2393, + "num_input_tokens_seen": 53181536, + "step": 34820 + }, + { + "epoch": 107.4853168469861, + "grad_norm": 0.5660046935081482, + "learning_rate": 2.0374599759748843e-06, + "loss": 0.2105, + "num_input_tokens_seen": 53188896, + "step": 34825 + }, + { + "epoch": 107.50077279752705, + "grad_norm": 0.4197404980659485, + "learning_rate": 2.033579746764419e-06, + "loss": 0.2625, + "num_input_tokens_seen": 53196576, + "step": 34830 + }, + { + "epoch": 107.516228748068, + "grad_norm": 0.3279869854450226, + "learning_rate": 2.029703059264565e-06, + "loss": 0.1978, + "num_input_tokens_seen": 53204704, + "step": 34835 + }, + { + "epoch": 107.53168469860897, + "grad_norm": 0.8753597140312195, + "learning_rate": 2.02582991407316e-06, + "loss": 0.2701, + "num_input_tokens_seen": 53212672, + "step": 34840 + }, + { + "epoch": 107.54714064914992, + "grad_norm": 0.5063170790672302, + "learning_rate": 2.0219603117874992e-06, + "loss": 0.2753, + "num_input_tokens_seen": 53220448, + "step": 34845 + }, + { + "epoch": 107.56259659969088, + "grad_norm": 0.3883221447467804, + "learning_rate": 2.0180942530043156e-06, + "loss": 0.2307, + "num_input_tokens_seen": 53227968, + "step": 34850 + }, + { + "epoch": 107.57805255023185, + "grad_norm": 0.400847464799881, + "learning_rate": 2.0142317383198107e-06, + "loss": 0.2299, + "num_input_tokens_seen": 53235424, + "step": 34855 + }, + { + "epoch": 107.5935085007728, + "grad_norm": 0.637606143951416, + "learning_rate": 2.0103727683296243e-06, + "loss": 0.3068, + "num_input_tokens_seen": 53242752, + "step": 34860 + }, + { + "epoch": 107.60896445131375, + "grad_norm": 0.3459347188472748, + "learning_rate": 2.0065173436288636e-06, + "loss": 0.2419, + "num_input_tokens_seen": 53250240, + "step": 34865 + }, + { + "epoch": 107.62442040185472, + "grad_norm": 0.37819844484329224, + "learning_rate": 2.002665464812087e-06, + "loss": 0.2187, + "num_input_tokens_seen": 53258592, + "step": 34870 + }, + { + "epoch": 107.63987635239567, + "grad_norm": 0.4083689749240875, + "learning_rate": 1.998817132473291e-06, + "loss": 0.2262, + "num_input_tokens_seen": 53266208, + "step": 34875 + }, + { + "epoch": 107.65533230293663, + "grad_norm": 0.4554068148136139, + "learning_rate": 1.9949723472059507e-06, + "loss": 0.2439, + "num_input_tokens_seen": 53273920, + "step": 34880 + }, + { + "epoch": 107.67078825347758, + "grad_norm": 0.35993635654449463, + "learning_rate": 1.9911311096029726e-06, + "loss": 0.2488, + "num_input_tokens_seen": 53281472, + "step": 34885 + }, + { + "epoch": 107.68624420401855, + "grad_norm": 0.6798469424247742, + "learning_rate": 1.9872934202567224e-06, + "loss": 0.2408, + "num_input_tokens_seen": 53289312, + "step": 34890 + }, + { + "epoch": 107.7017001545595, + "grad_norm": 0.48549404740333557, + "learning_rate": 1.9834592797590257e-06, + "loss": 0.2446, + "num_input_tokens_seen": 53297344, + "step": 34895 + }, + { + "epoch": 107.71715610510046, + "grad_norm": 0.7112581729888916, + "learning_rate": 1.979628688701149e-06, + "loss": 0.2671, + "num_input_tokens_seen": 53304928, + "step": 34900 + }, + { + "epoch": 107.73261205564143, + "grad_norm": 0.4458460509777069, + "learning_rate": 1.9758016476738193e-06, + "loss": 0.2151, + "num_input_tokens_seen": 53312704, + "step": 34905 + }, + { + "epoch": 107.74806800618238, + "grad_norm": 0.3616657555103302, + "learning_rate": 1.971978157267221e-06, + "loss": 0.2082, + "num_input_tokens_seen": 53320288, + "step": 34910 + }, + { + "epoch": 107.76352395672333, + "grad_norm": 0.5256296396255493, + "learning_rate": 1.968158218070973e-06, + "loss": 0.2702, + "num_input_tokens_seen": 53327840, + "step": 34915 + }, + { + "epoch": 107.7789799072643, + "grad_norm": 0.43391165137290955, + "learning_rate": 1.9643418306741682e-06, + "loss": 0.2561, + "num_input_tokens_seen": 53336320, + "step": 34920 + }, + { + "epoch": 107.79443585780525, + "grad_norm": 0.6686109900474548, + "learning_rate": 1.9605289956653337e-06, + "loss": 0.2608, + "num_input_tokens_seen": 53343168, + "step": 34925 + }, + { + "epoch": 107.80989180834621, + "grad_norm": 0.8048394322395325, + "learning_rate": 1.9567197136324626e-06, + "loss": 0.2763, + "num_input_tokens_seen": 53351040, + "step": 34930 + }, + { + "epoch": 107.82534775888718, + "grad_norm": 0.4306463599205017, + "learning_rate": 1.9529139851629935e-06, + "loss": 0.2282, + "num_input_tokens_seen": 53358464, + "step": 34935 + }, + { + "epoch": 107.84080370942813, + "grad_norm": 0.8491923809051514, + "learning_rate": 1.949111810843812e-06, + "loss": 0.3041, + "num_input_tokens_seen": 53365984, + "step": 34940 + }, + { + "epoch": 107.85625965996908, + "grad_norm": 0.6064119338989258, + "learning_rate": 1.9453131912612694e-06, + "loss": 0.2755, + "num_input_tokens_seen": 53374080, + "step": 34945 + }, + { + "epoch": 107.87171561051005, + "grad_norm": 0.5336241126060486, + "learning_rate": 1.941518127001149e-06, + "loss": 0.2254, + "num_input_tokens_seen": 53382016, + "step": 34950 + }, + { + "epoch": 107.887171561051, + "grad_norm": 0.4233493506908417, + "learning_rate": 1.9377266186487107e-06, + "loss": 0.2098, + "num_input_tokens_seen": 53389088, + "step": 34955 + }, + { + "epoch": 107.90262751159196, + "grad_norm": 0.4740794003009796, + "learning_rate": 1.9339386667886483e-06, + "loss": 0.254, + "num_input_tokens_seen": 53396928, + "step": 34960 + }, + { + "epoch": 107.91808346213293, + "grad_norm": 0.512998104095459, + "learning_rate": 1.9301542720051024e-06, + "loss": 0.2674, + "num_input_tokens_seen": 53404384, + "step": 34965 + }, + { + "epoch": 107.93353941267388, + "grad_norm": 0.46442940831184387, + "learning_rate": 1.926373434881684e-06, + "loss": 0.2202, + "num_input_tokens_seen": 53411968, + "step": 34970 + }, + { + "epoch": 107.94899536321483, + "grad_norm": 0.4585355520248413, + "learning_rate": 1.9225961560014468e-06, + "loss": 0.2572, + "num_input_tokens_seen": 53419584, + "step": 34975 + }, + { + "epoch": 107.9644513137558, + "grad_norm": 0.6340644359588623, + "learning_rate": 1.918822435946885e-06, + "loss": 0.2304, + "num_input_tokens_seen": 53427232, + "step": 34980 + }, + { + "epoch": 107.97990726429676, + "grad_norm": 0.46024149656295776, + "learning_rate": 1.915052275299961e-06, + "loss": 0.2497, + "num_input_tokens_seen": 53434944, + "step": 34985 + }, + { + "epoch": 107.99536321483771, + "grad_norm": 0.6454667448997498, + "learning_rate": 1.9112856746420854e-06, + "loss": 0.1839, + "num_input_tokens_seen": 53442496, + "step": 34990 + }, + { + "epoch": 108.00927357032458, + "grad_norm": 0.5817062258720398, + "learning_rate": 1.907522634554104e-06, + "loss": 0.2343, + "num_input_tokens_seen": 53449104, + "step": 34995 + }, + { + "epoch": 108.02472952086553, + "grad_norm": 1.1308927536010742, + "learning_rate": 1.9037631556163337e-06, + "loss": 0.2903, + "num_input_tokens_seen": 53456816, + "step": 35000 + }, + { + "epoch": 108.02472952086553, + "eval_loss": 0.3012757897377014, + "eval_runtime": 6.3705, + "eval_samples_per_second": 90.26, + "eval_steps_per_second": 22.604, + "num_input_tokens_seen": 53456816, + "step": 35000 + }, + { + "epoch": 108.04018547140649, + "grad_norm": 0.47568538784980774, + "learning_rate": 1.9000072384085272e-06, + "loss": 0.2398, + "num_input_tokens_seen": 53464848, + "step": 35005 + }, + { + "epoch": 108.05564142194746, + "grad_norm": 0.7597646117210388, + "learning_rate": 1.8962548835098987e-06, + "loss": 0.2995, + "num_input_tokens_seen": 53472720, + "step": 35010 + }, + { + "epoch": 108.07109737248841, + "grad_norm": 0.659780740737915, + "learning_rate": 1.8925060914991077e-06, + "loss": 0.2339, + "num_input_tokens_seen": 53480880, + "step": 35015 + }, + { + "epoch": 108.08655332302936, + "grad_norm": 0.3659748136997223, + "learning_rate": 1.888760862954264e-06, + "loss": 0.2619, + "num_input_tokens_seen": 53488208, + "step": 35020 + }, + { + "epoch": 108.10200927357033, + "grad_norm": 0.8084350824356079, + "learning_rate": 1.8850191984529309e-06, + "loss": 0.2225, + "num_input_tokens_seen": 53495440, + "step": 35025 + }, + { + "epoch": 108.11746522411129, + "grad_norm": 0.9849970936775208, + "learning_rate": 1.8812810985721186e-06, + "loss": 0.3086, + "num_input_tokens_seen": 53502928, + "step": 35030 + }, + { + "epoch": 108.13292117465224, + "grad_norm": 0.3750026226043701, + "learning_rate": 1.8775465638882856e-06, + "loss": 0.2517, + "num_input_tokens_seen": 53510928, + "step": 35035 + }, + { + "epoch": 108.14837712519319, + "grad_norm": 0.6360068321228027, + "learning_rate": 1.8738155949773517e-06, + "loss": 0.2679, + "num_input_tokens_seen": 53518832, + "step": 35040 + }, + { + "epoch": 108.16383307573416, + "grad_norm": 0.577630341053009, + "learning_rate": 1.8700881924146707e-06, + "loss": 0.2002, + "num_input_tokens_seen": 53525840, + "step": 35045 + }, + { + "epoch": 108.17928902627511, + "grad_norm": 0.5287483930587769, + "learning_rate": 1.8663643567750577e-06, + "loss": 0.2905, + "num_input_tokens_seen": 53533296, + "step": 35050 + }, + { + "epoch": 108.19474497681607, + "grad_norm": 0.5997861623764038, + "learning_rate": 1.8626440886327813e-06, + "loss": 0.2419, + "num_input_tokens_seen": 53540720, + "step": 35055 + }, + { + "epoch": 108.21020092735704, + "grad_norm": 0.5712509155273438, + "learning_rate": 1.8589273885615432e-06, + "loss": 0.2431, + "num_input_tokens_seen": 53548464, + "step": 35060 + }, + { + "epoch": 108.22565687789799, + "grad_norm": 0.32955220341682434, + "learning_rate": 1.8552142571345133e-06, + "loss": 0.2552, + "num_input_tokens_seen": 53556080, + "step": 35065 + }, + { + "epoch": 108.24111282843894, + "grad_norm": 0.32865673303604126, + "learning_rate": 1.8515046949243025e-06, + "loss": 0.2013, + "num_input_tokens_seen": 53564240, + "step": 35070 + }, + { + "epoch": 108.25656877897991, + "grad_norm": 0.5047616958618164, + "learning_rate": 1.8477987025029674e-06, + "loss": 0.27, + "num_input_tokens_seen": 53571568, + "step": 35075 + }, + { + "epoch": 108.27202472952087, + "grad_norm": 0.39088746905326843, + "learning_rate": 1.8440962804420232e-06, + "loss": 0.2131, + "num_input_tokens_seen": 53578800, + "step": 35080 + }, + { + "epoch": 108.28748068006182, + "grad_norm": 0.6525124311447144, + "learning_rate": 1.8403974293124265e-06, + "loss": 0.2307, + "num_input_tokens_seen": 53586608, + "step": 35085 + }, + { + "epoch": 108.30293663060279, + "grad_norm": 0.6375552415847778, + "learning_rate": 1.8367021496845854e-06, + "loss": 0.2657, + "num_input_tokens_seen": 53593904, + "step": 35090 + }, + { + "epoch": 108.31839258114374, + "grad_norm": 0.5169235467910767, + "learning_rate": 1.8330104421283662e-06, + "loss": 0.2222, + "num_input_tokens_seen": 53601136, + "step": 35095 + }, + { + "epoch": 108.3338485316847, + "grad_norm": 0.3994070589542389, + "learning_rate": 1.8293223072130717e-06, + "loss": 0.2522, + "num_input_tokens_seen": 53608528, + "step": 35100 + }, + { + "epoch": 108.34930448222566, + "grad_norm": 0.38584646582603455, + "learning_rate": 1.8256377455074525e-06, + "loss": 0.2139, + "num_input_tokens_seen": 53615920, + "step": 35105 + }, + { + "epoch": 108.36476043276662, + "grad_norm": 0.40872922539711, + "learning_rate": 1.8219567575797263e-06, + "loss": 0.217, + "num_input_tokens_seen": 53623024, + "step": 35110 + }, + { + "epoch": 108.38021638330757, + "grad_norm": 0.4640132486820221, + "learning_rate": 1.8182793439975365e-06, + "loss": 0.2754, + "num_input_tokens_seen": 53630640, + "step": 35115 + }, + { + "epoch": 108.39567233384854, + "grad_norm": 0.3230783939361572, + "learning_rate": 1.8146055053279958e-06, + "loss": 0.2596, + "num_input_tokens_seen": 53638192, + "step": 35120 + }, + { + "epoch": 108.41112828438949, + "grad_norm": 0.3758191466331482, + "learning_rate": 1.8109352421376486e-06, + "loss": 0.2487, + "num_input_tokens_seen": 53645744, + "step": 35125 + }, + { + "epoch": 108.42658423493044, + "grad_norm": 0.47506478428840637, + "learning_rate": 1.8072685549924972e-06, + "loss": 0.2956, + "num_input_tokens_seen": 53653168, + "step": 35130 + }, + { + "epoch": 108.44204018547141, + "grad_norm": 0.32661178708076477, + "learning_rate": 1.8036054444579982e-06, + "loss": 0.2471, + "num_input_tokens_seen": 53660880, + "step": 35135 + }, + { + "epoch": 108.45749613601237, + "grad_norm": 0.5111494064331055, + "learning_rate": 1.7999459110990407e-06, + "loss": 0.2698, + "num_input_tokens_seen": 53668688, + "step": 35140 + }, + { + "epoch": 108.47295208655332, + "grad_norm": 0.467756986618042, + "learning_rate": 1.7962899554799712e-06, + "loss": 0.2345, + "num_input_tokens_seen": 53676336, + "step": 35145 + }, + { + "epoch": 108.48840803709429, + "grad_norm": 0.4727930426597595, + "learning_rate": 1.7926375781645937e-06, + "loss": 0.2735, + "num_input_tokens_seen": 53683952, + "step": 35150 + }, + { + "epoch": 108.50386398763524, + "grad_norm": 0.5325908660888672, + "learning_rate": 1.7889887797161359e-06, + "loss": 0.2534, + "num_input_tokens_seen": 53691312, + "step": 35155 + }, + { + "epoch": 108.5193199381762, + "grad_norm": 0.5223540663719177, + "learning_rate": 1.7853435606973028e-06, + "loss": 0.2573, + "num_input_tokens_seen": 53699568, + "step": 35160 + }, + { + "epoch": 108.53477588871715, + "grad_norm": 0.5729156136512756, + "learning_rate": 1.781701921670223e-06, + "loss": 0.2483, + "num_input_tokens_seen": 53707248, + "step": 35165 + }, + { + "epoch": 108.55023183925812, + "grad_norm": 0.3885067403316498, + "learning_rate": 1.7780638631964886e-06, + "loss": 0.2439, + "num_input_tokens_seen": 53714800, + "step": 35170 + }, + { + "epoch": 108.56568778979907, + "grad_norm": 0.49394485354423523, + "learning_rate": 1.7744293858371314e-06, + "loss": 0.2668, + "num_input_tokens_seen": 53722544, + "step": 35175 + }, + { + "epoch": 108.58114374034002, + "grad_norm": 0.5298072695732117, + "learning_rate": 1.770798490152631e-06, + "loss": 0.2187, + "num_input_tokens_seen": 53730512, + "step": 35180 + }, + { + "epoch": 108.59659969088099, + "grad_norm": 0.45977360010147095, + "learning_rate": 1.767171176702917e-06, + "loss": 0.256, + "num_input_tokens_seen": 53738000, + "step": 35185 + }, + { + "epoch": 108.61205564142195, + "grad_norm": 0.5960544943809509, + "learning_rate": 1.7635474460473755e-06, + "loss": 0.2796, + "num_input_tokens_seen": 53745584, + "step": 35190 + }, + { + "epoch": 108.6275115919629, + "grad_norm": 0.48013681173324585, + "learning_rate": 1.7599272987448206e-06, + "loss": 0.2547, + "num_input_tokens_seen": 53752848, + "step": 35195 + }, + { + "epoch": 108.64296754250387, + "grad_norm": 0.48967379331588745, + "learning_rate": 1.7563107353535362e-06, + "loss": 0.2021, + "num_input_tokens_seen": 53760816, + "step": 35200 + }, + { + "epoch": 108.64296754250387, + "eval_loss": 0.301018625497818, + "eval_runtime": 6.2803, + "eval_samples_per_second": 91.557, + "eval_steps_per_second": 22.929, + "num_input_tokens_seen": 53760816, + "step": 35200 + }, + { + "epoch": 108.65842349304482, + "grad_norm": 0.6385693550109863, + "learning_rate": 1.7526977564312263e-06, + "loss": 0.248, + "num_input_tokens_seen": 53769200, + "step": 35205 + }, + { + "epoch": 108.67387944358578, + "grad_norm": 0.5130825638771057, + "learning_rate": 1.7490883625350701e-06, + "loss": 0.2504, + "num_input_tokens_seen": 53776976, + "step": 35210 + }, + { + "epoch": 108.68933539412674, + "grad_norm": 0.5921401977539062, + "learning_rate": 1.7454825542216807e-06, + "loss": 0.2504, + "num_input_tokens_seen": 53784848, + "step": 35215 + }, + { + "epoch": 108.7047913446677, + "grad_norm": 0.5964892506599426, + "learning_rate": 1.7418803320471105e-06, + "loss": 0.2349, + "num_input_tokens_seen": 53792336, + "step": 35220 + }, + { + "epoch": 108.72024729520865, + "grad_norm": 0.7314035892486572, + "learning_rate": 1.7382816965668737e-06, + "loss": 0.2442, + "num_input_tokens_seen": 53800272, + "step": 35225 + }, + { + "epoch": 108.73570324574962, + "grad_norm": 0.3761426508426666, + "learning_rate": 1.7346866483359285e-06, + "loss": 0.2458, + "num_input_tokens_seen": 53807856, + "step": 35230 + }, + { + "epoch": 108.75115919629057, + "grad_norm": 0.6462298631668091, + "learning_rate": 1.7310951879086657e-06, + "loss": 0.2717, + "num_input_tokens_seen": 53815696, + "step": 35235 + }, + { + "epoch": 108.76661514683153, + "grad_norm": 0.851233720779419, + "learning_rate": 1.7275073158389471e-06, + "loss": 0.2348, + "num_input_tokens_seen": 53823888, + "step": 35240 + }, + { + "epoch": 108.7820710973725, + "grad_norm": 0.7943617105484009, + "learning_rate": 1.723923032680061e-06, + "loss": 0.2266, + "num_input_tokens_seen": 53831696, + "step": 35245 + }, + { + "epoch": 108.79752704791345, + "grad_norm": 0.39157354831695557, + "learning_rate": 1.7203423389847428e-06, + "loss": 0.2212, + "num_input_tokens_seen": 53838896, + "step": 35250 + }, + { + "epoch": 108.8129829984544, + "grad_norm": 0.44779232144355774, + "learning_rate": 1.7167652353051928e-06, + "loss": 0.222, + "num_input_tokens_seen": 53846864, + "step": 35255 + }, + { + "epoch": 108.82843894899537, + "grad_norm": 0.390694797039032, + "learning_rate": 1.7131917221930333e-06, + "loss": 0.219, + "num_input_tokens_seen": 53854352, + "step": 35260 + }, + { + "epoch": 108.84389489953632, + "grad_norm": 0.3332486152648926, + "learning_rate": 1.7096218001993513e-06, + "loss": 0.2238, + "num_input_tokens_seen": 53862000, + "step": 35265 + }, + { + "epoch": 108.85935085007728, + "grad_norm": 0.41739094257354736, + "learning_rate": 1.706055469874676e-06, + "loss": 0.271, + "num_input_tokens_seen": 53869968, + "step": 35270 + }, + { + "epoch": 108.87480680061825, + "grad_norm": 0.2498544156551361, + "learning_rate": 1.702492731768976e-06, + "loss": 0.2559, + "num_input_tokens_seen": 53877808, + "step": 35275 + }, + { + "epoch": 108.8902627511592, + "grad_norm": 0.36134400963783264, + "learning_rate": 1.6989335864316724e-06, + "loss": 0.2402, + "num_input_tokens_seen": 53885264, + "step": 35280 + }, + { + "epoch": 108.90571870170015, + "grad_norm": 0.2862444519996643, + "learning_rate": 1.6953780344116265e-06, + "loss": 0.2537, + "num_input_tokens_seen": 53892784, + "step": 35285 + }, + { + "epoch": 108.9211746522411, + "grad_norm": 0.49039149284362793, + "learning_rate": 1.6918260762571497e-06, + "loss": 0.2468, + "num_input_tokens_seen": 53900592, + "step": 35290 + }, + { + "epoch": 108.93663060278207, + "grad_norm": 0.3519963026046753, + "learning_rate": 1.6882777125160093e-06, + "loss": 0.2493, + "num_input_tokens_seen": 53908496, + "step": 35295 + }, + { + "epoch": 108.95208655332303, + "grad_norm": 0.5965585708618164, + "learning_rate": 1.6847329437353899e-06, + "loss": 0.2403, + "num_input_tokens_seen": 53916048, + "step": 35300 + }, + { + "epoch": 108.96754250386398, + "grad_norm": 0.47084841132164, + "learning_rate": 1.6811917704619511e-06, + "loss": 0.1995, + "num_input_tokens_seen": 53923216, + "step": 35305 + }, + { + "epoch": 108.98299845440495, + "grad_norm": 0.42924878001213074, + "learning_rate": 1.67765419324179e-06, + "loss": 0.2691, + "num_input_tokens_seen": 53930576, + "step": 35310 + }, + { + "epoch": 108.9984544049459, + "grad_norm": 0.43012750148773193, + "learning_rate": 1.6741202126204364e-06, + "loss": 0.185, + "num_input_tokens_seen": 53938064, + "step": 35315 + }, + { + "epoch": 109.01236476043276, + "grad_norm": 0.5485684275627136, + "learning_rate": 1.6705898291428767e-06, + "loss": 0.1931, + "num_input_tokens_seen": 53944656, + "step": 35320 + }, + { + "epoch": 109.02782071097373, + "grad_norm": 0.47623151540756226, + "learning_rate": 1.6670630433535395e-06, + "loss": 0.2306, + "num_input_tokens_seen": 53952112, + "step": 35325 + }, + { + "epoch": 109.04327666151468, + "grad_norm": 0.6562114357948303, + "learning_rate": 1.6635398557962979e-06, + "loss": 0.3243, + "num_input_tokens_seen": 53959888, + "step": 35330 + }, + { + "epoch": 109.05873261205564, + "grad_norm": 0.4569772481918335, + "learning_rate": 1.660020267014481e-06, + "loss": 0.2917, + "num_input_tokens_seen": 53967632, + "step": 35335 + }, + { + "epoch": 109.0741885625966, + "grad_norm": 0.4952171742916107, + "learning_rate": 1.6565042775508438e-06, + "loss": 0.1852, + "num_input_tokens_seen": 53974864, + "step": 35340 + }, + { + "epoch": 109.08964451313756, + "grad_norm": 0.5825137495994568, + "learning_rate": 1.6529918879475997e-06, + "loss": 0.2514, + "num_input_tokens_seen": 53982192, + "step": 35345 + }, + { + "epoch": 109.10510046367851, + "grad_norm": 0.4551703929901123, + "learning_rate": 1.6494830987464043e-06, + "loss": 0.2411, + "num_input_tokens_seen": 53989712, + "step": 35350 + }, + { + "epoch": 109.12055641421948, + "grad_norm": 0.5926213264465332, + "learning_rate": 1.6459779104883555e-06, + "loss": 0.2422, + "num_input_tokens_seen": 53997744, + "step": 35355 + }, + { + "epoch": 109.13601236476043, + "grad_norm": 0.3762698471546173, + "learning_rate": 1.6424763237140013e-06, + "loss": 0.2118, + "num_input_tokens_seen": 54005424, + "step": 35360 + }, + { + "epoch": 109.15146831530139, + "grad_norm": 0.32254284620285034, + "learning_rate": 1.6389783389633207e-06, + "loss": 0.2194, + "num_input_tokens_seen": 54012592, + "step": 35365 + }, + { + "epoch": 109.16692426584235, + "grad_norm": 0.4546253979206085, + "learning_rate": 1.6354839567757546e-06, + "loss": 0.2584, + "num_input_tokens_seen": 54020304, + "step": 35370 + }, + { + "epoch": 109.18238021638331, + "grad_norm": 0.6796672344207764, + "learning_rate": 1.6319931776901831e-06, + "loss": 0.2702, + "num_input_tokens_seen": 54027568, + "step": 35375 + }, + { + "epoch": 109.19783616692426, + "grad_norm": 0.3158354163169861, + "learning_rate": 1.6285060022449229e-06, + "loss": 0.211, + "num_input_tokens_seen": 54035056, + "step": 35380 + }, + { + "epoch": 109.21329211746523, + "grad_norm": 0.5020326972007751, + "learning_rate": 1.6250224309777434e-06, + "loss": 0.297, + "num_input_tokens_seen": 54042704, + "step": 35385 + }, + { + "epoch": 109.22874806800618, + "grad_norm": 0.6003766655921936, + "learning_rate": 1.6215424644258515e-06, + "loss": 0.2606, + "num_input_tokens_seen": 54050448, + "step": 35390 + }, + { + "epoch": 109.24420401854714, + "grad_norm": 0.4196203649044037, + "learning_rate": 1.6180661031259036e-06, + "loss": 0.2231, + "num_input_tokens_seen": 54058160, + "step": 35395 + }, + { + "epoch": 109.2596599690881, + "grad_norm": 0.2989703118801117, + "learning_rate": 1.614593347613999e-06, + "loss": 0.2481, + "num_input_tokens_seen": 54066160, + "step": 35400 + }, + { + "epoch": 109.2596599690881, + "eval_loss": 0.3016166090965271, + "eval_runtime": 6.3263, + "eval_samples_per_second": 90.891, + "eval_steps_per_second": 22.762, + "num_input_tokens_seen": 54066160, + "step": 35400 + }, + { + "epoch": 109.27511591962906, + "grad_norm": 0.32419949769973755, + "learning_rate": 1.6111241984256758e-06, + "loss": 0.2561, + "num_input_tokens_seen": 54073328, + "step": 35405 + }, + { + "epoch": 109.29057187017001, + "grad_norm": 0.8802986741065979, + "learning_rate": 1.6076586560959257e-06, + "loss": 0.2407, + "num_input_tokens_seen": 54080528, + "step": 35410 + }, + { + "epoch": 109.30602782071098, + "grad_norm": 0.5690697431564331, + "learning_rate": 1.604196721159182e-06, + "loss": 0.2295, + "num_input_tokens_seen": 54088112, + "step": 35415 + }, + { + "epoch": 109.32148377125193, + "grad_norm": 0.7810268402099609, + "learning_rate": 1.6007383941493092e-06, + "loss": 0.2709, + "num_input_tokens_seen": 54095408, + "step": 35420 + }, + { + "epoch": 109.33693972179289, + "grad_norm": 0.8089560270309448, + "learning_rate": 1.5972836755996285e-06, + "loss": 0.2157, + "num_input_tokens_seen": 54102896, + "step": 35425 + }, + { + "epoch": 109.35239567233384, + "grad_norm": 0.7160922288894653, + "learning_rate": 1.5938325660429076e-06, + "loss": 0.2102, + "num_input_tokens_seen": 54110640, + "step": 35430 + }, + { + "epoch": 109.36785162287481, + "grad_norm": 0.5115320682525635, + "learning_rate": 1.5903850660113378e-06, + "loss": 0.2245, + "num_input_tokens_seen": 54118160, + "step": 35435 + }, + { + "epoch": 109.38330757341576, + "grad_norm": 0.4401369094848633, + "learning_rate": 1.5869411760365826e-06, + "loss": 0.2078, + "num_input_tokens_seen": 54125904, + "step": 35440 + }, + { + "epoch": 109.39876352395672, + "grad_norm": 0.48530441522598267, + "learning_rate": 1.58350089664972e-06, + "loss": 0.2433, + "num_input_tokens_seen": 54133552, + "step": 35445 + }, + { + "epoch": 109.41421947449768, + "grad_norm": 0.5852329730987549, + "learning_rate": 1.5800642283812865e-06, + "loss": 0.2121, + "num_input_tokens_seen": 54141520, + "step": 35450 + }, + { + "epoch": 109.42967542503864, + "grad_norm": 0.3364708721637726, + "learning_rate": 1.5766311717612698e-06, + "loss": 0.1968, + "num_input_tokens_seen": 54148976, + "step": 35455 + }, + { + "epoch": 109.44513137557959, + "grad_norm": 0.63846355676651, + "learning_rate": 1.5732017273190818e-06, + "loss": 0.2336, + "num_input_tokens_seen": 54156240, + "step": 35460 + }, + { + "epoch": 109.46058732612056, + "grad_norm": 0.4718191921710968, + "learning_rate": 1.5697758955835806e-06, + "loss": 0.2987, + "num_input_tokens_seen": 54163760, + "step": 35465 + }, + { + "epoch": 109.47604327666151, + "grad_norm": 0.35675129294395447, + "learning_rate": 1.566353677083085e-06, + "loss": 0.287, + "num_input_tokens_seen": 54172080, + "step": 35470 + }, + { + "epoch": 109.49149922720247, + "grad_norm": 0.3783992528915405, + "learning_rate": 1.562935072345334e-06, + "loss": 0.2319, + "num_input_tokens_seen": 54180272, + "step": 35475 + }, + { + "epoch": 109.50695517774344, + "grad_norm": 0.578520655632019, + "learning_rate": 1.5595200818975281e-06, + "loss": 0.2135, + "num_input_tokens_seen": 54188688, + "step": 35480 + }, + { + "epoch": 109.52241112828439, + "grad_norm": 0.5186245441436768, + "learning_rate": 1.5561087062662905e-06, + "loss": 0.2372, + "num_input_tokens_seen": 54196624, + "step": 35485 + }, + { + "epoch": 109.53786707882534, + "grad_norm": 0.4387061297893524, + "learning_rate": 1.5527009459777087e-06, + "loss": 0.3029, + "num_input_tokens_seen": 54204528, + "step": 35490 + }, + { + "epoch": 109.55332302936631, + "grad_norm": 0.8211427330970764, + "learning_rate": 1.5492968015572984e-06, + "loss": 0.2598, + "num_input_tokens_seen": 54211792, + "step": 35495 + }, + { + "epoch": 109.56877897990726, + "grad_norm": 0.5320289134979248, + "learning_rate": 1.5458962735300203e-06, + "loss": 0.2609, + "num_input_tokens_seen": 54219376, + "step": 35500 + }, + { + "epoch": 109.58423493044822, + "grad_norm": 0.4131038188934326, + "learning_rate": 1.54249936242028e-06, + "loss": 0.2172, + "num_input_tokens_seen": 54227504, + "step": 35505 + }, + { + "epoch": 109.59969088098919, + "grad_norm": 0.3594679832458496, + "learning_rate": 1.5391060687519222e-06, + "loss": 0.2543, + "num_input_tokens_seen": 54234896, + "step": 35510 + }, + { + "epoch": 109.61514683153014, + "grad_norm": 0.5886713862419128, + "learning_rate": 1.5357163930482367e-06, + "loss": 0.2817, + "num_input_tokens_seen": 54242512, + "step": 35515 + }, + { + "epoch": 109.6306027820711, + "grad_norm": 0.7334082126617432, + "learning_rate": 1.532330335831955e-06, + "loss": 0.2629, + "num_input_tokens_seen": 54249872, + "step": 35520 + }, + { + "epoch": 109.64605873261206, + "grad_norm": 0.5415750741958618, + "learning_rate": 1.5289478976252491e-06, + "loss": 0.2818, + "num_input_tokens_seen": 54257360, + "step": 35525 + }, + { + "epoch": 109.66151468315302, + "grad_norm": 0.9857199192047119, + "learning_rate": 1.5255690789497345e-06, + "loss": 0.2407, + "num_input_tokens_seen": 54264848, + "step": 35530 + }, + { + "epoch": 109.67697063369397, + "grad_norm": 0.36963602900505066, + "learning_rate": 1.5221938803264641e-06, + "loss": 0.245, + "num_input_tokens_seen": 54272048, + "step": 35535 + }, + { + "epoch": 109.69242658423494, + "grad_norm": 0.3750379681587219, + "learning_rate": 1.518822302275938e-06, + "loss": 0.2384, + "num_input_tokens_seen": 54279440, + "step": 35540 + }, + { + "epoch": 109.70788253477589, + "grad_norm": 0.815563976764679, + "learning_rate": 1.5154543453180958e-06, + "loss": 0.2317, + "num_input_tokens_seen": 54286768, + "step": 35545 + }, + { + "epoch": 109.72333848531684, + "grad_norm": 0.6900240182876587, + "learning_rate": 1.5120900099723167e-06, + "loss": 0.2524, + "num_input_tokens_seen": 54294384, + "step": 35550 + }, + { + "epoch": 109.7387944358578, + "grad_norm": 1.0257843732833862, + "learning_rate": 1.5087292967574273e-06, + "loss": 0.2302, + "num_input_tokens_seen": 54301872, + "step": 35555 + }, + { + "epoch": 109.75425038639877, + "grad_norm": 0.5592150688171387, + "learning_rate": 1.5053722061916908e-06, + "loss": 0.2304, + "num_input_tokens_seen": 54309392, + "step": 35560 + }, + { + "epoch": 109.76970633693972, + "grad_norm": 0.809695839881897, + "learning_rate": 1.5020187387928124e-06, + "loss": 0.2782, + "num_input_tokens_seen": 54317104, + "step": 35565 + }, + { + "epoch": 109.78516228748067, + "grad_norm": 0.3802238404750824, + "learning_rate": 1.4986688950779343e-06, + "loss": 0.273, + "num_input_tokens_seen": 54325008, + "step": 35570 + }, + { + "epoch": 109.80061823802164, + "grad_norm": 0.38549917936325073, + "learning_rate": 1.495322675563654e-06, + "loss": 0.3002, + "num_input_tokens_seen": 54333168, + "step": 35575 + }, + { + "epoch": 109.8160741885626, + "grad_norm": 0.5405445694923401, + "learning_rate": 1.4919800807659922e-06, + "loss": 0.2782, + "num_input_tokens_seen": 54340720, + "step": 35580 + }, + { + "epoch": 109.83153013910355, + "grad_norm": 0.8703058958053589, + "learning_rate": 1.4886411112004255e-06, + "loss": 0.3088, + "num_input_tokens_seen": 54348560, + "step": 35585 + }, + { + "epoch": 109.84698608964452, + "grad_norm": 0.7835688591003418, + "learning_rate": 1.4853057673818588e-06, + "loss": 0.306, + "num_input_tokens_seen": 54356176, + "step": 35590 + }, + { + "epoch": 109.86244204018547, + "grad_norm": 0.6219282150268555, + "learning_rate": 1.481974049824647e-06, + "loss": 0.2054, + "num_input_tokens_seen": 54364304, + "step": 35595 + }, + { + "epoch": 109.87789799072642, + "grad_norm": 0.7208481431007385, + "learning_rate": 1.4786459590425849e-06, + "loss": 0.2505, + "num_input_tokens_seen": 54371888, + "step": 35600 + }, + { + "epoch": 109.87789799072642, + "eval_loss": 0.30178913474082947, + "eval_runtime": 6.3217, + "eval_samples_per_second": 90.957, + "eval_steps_per_second": 22.779, + "num_input_tokens_seen": 54371888, + "step": 35600 + }, + { + "epoch": 109.89335394126739, + "grad_norm": 0.4585874676704407, + "learning_rate": 1.4753214955489036e-06, + "loss": 0.1958, + "num_input_tokens_seen": 54379440, + "step": 35605 + }, + { + "epoch": 109.90880989180835, + "grad_norm": 0.45743831992149353, + "learning_rate": 1.4720006598562737e-06, + "loss": 0.2267, + "num_input_tokens_seen": 54387120, + "step": 35610 + }, + { + "epoch": 109.9242658423493, + "grad_norm": 0.3963320851325989, + "learning_rate": 1.4686834524768185e-06, + "loss": 0.2502, + "num_input_tokens_seen": 54395312, + "step": 35615 + }, + { + "epoch": 109.93972179289027, + "grad_norm": 0.5161160230636597, + "learning_rate": 1.4653698739220844e-06, + "loss": 0.2301, + "num_input_tokens_seen": 54402672, + "step": 35620 + }, + { + "epoch": 109.95517774343122, + "grad_norm": 0.2890452444553375, + "learning_rate": 1.4620599247030715e-06, + "loss": 0.2278, + "num_input_tokens_seen": 54410384, + "step": 35625 + }, + { + "epoch": 109.97063369397218, + "grad_norm": 0.6716316938400269, + "learning_rate": 1.4587536053302125e-06, + "loss": 0.2359, + "num_input_tokens_seen": 54418032, + "step": 35630 + }, + { + "epoch": 109.98608964451314, + "grad_norm": 0.6097226738929749, + "learning_rate": 1.4554509163133862e-06, + "loss": 0.292, + "num_input_tokens_seen": 54425616, + "step": 35635 + }, + { + "epoch": 110.0, + "grad_norm": 0.9905992746353149, + "learning_rate": 1.4521518581619098e-06, + "loss": 0.2191, + "num_input_tokens_seen": 54432448, + "step": 35640 + }, + { + "epoch": 110.01545595054095, + "grad_norm": 0.49974146485328674, + "learning_rate": 1.4488564313845348e-06, + "loss": 0.2345, + "num_input_tokens_seen": 54439584, + "step": 35645 + }, + { + "epoch": 110.03091190108192, + "grad_norm": 0.41264286637306213, + "learning_rate": 1.4455646364894603e-06, + "loss": 0.2071, + "num_input_tokens_seen": 54447104, + "step": 35650 + }, + { + "epoch": 110.04636785162288, + "grad_norm": 0.68043452501297, + "learning_rate": 1.4422764739843247e-06, + "loss": 0.2396, + "num_input_tokens_seen": 54454944, + "step": 35655 + }, + { + "epoch": 110.06182380216383, + "grad_norm": 0.49153435230255127, + "learning_rate": 1.4389919443762e-06, + "loss": 0.2387, + "num_input_tokens_seen": 54463168, + "step": 35660 + }, + { + "epoch": 110.0772797527048, + "grad_norm": 0.4613112211227417, + "learning_rate": 1.4357110481716063e-06, + "loss": 0.2226, + "num_input_tokens_seen": 54470048, + "step": 35665 + }, + { + "epoch": 110.09273570324575, + "grad_norm": 0.3447103798389435, + "learning_rate": 1.4324337858764941e-06, + "loss": 0.2111, + "num_input_tokens_seen": 54477568, + "step": 35670 + }, + { + "epoch": 110.1081916537867, + "grad_norm": 0.5729087591171265, + "learning_rate": 1.4291601579962622e-06, + "loss": 0.2304, + "num_input_tokens_seen": 54485536, + "step": 35675 + }, + { + "epoch": 110.12364760432767, + "grad_norm": 0.31664136052131653, + "learning_rate": 1.42589016503574e-06, + "loss": 0.271, + "num_input_tokens_seen": 54492832, + "step": 35680 + }, + { + "epoch": 110.13910355486863, + "grad_norm": 0.39916497468948364, + "learning_rate": 1.4226238074992099e-06, + "loss": 0.2578, + "num_input_tokens_seen": 54500672, + "step": 35685 + }, + { + "epoch": 110.15455950540958, + "grad_norm": 0.3704289197921753, + "learning_rate": 1.4193610858903778e-06, + "loss": 0.2065, + "num_input_tokens_seen": 54507744, + "step": 35690 + }, + { + "epoch": 110.17001545595055, + "grad_norm": 0.5063654780387878, + "learning_rate": 1.416102000712402e-06, + "loss": 0.2365, + "num_input_tokens_seen": 54515232, + "step": 35695 + }, + { + "epoch": 110.1854714064915, + "grad_norm": 0.3682444989681244, + "learning_rate": 1.4128465524678668e-06, + "loss": 0.2232, + "num_input_tokens_seen": 54523072, + "step": 35700 + }, + { + "epoch": 110.20092735703246, + "grad_norm": 1.2723525762557983, + "learning_rate": 1.4095947416588124e-06, + "loss": 0.2345, + "num_input_tokens_seen": 54530912, + "step": 35705 + }, + { + "epoch": 110.21638330757341, + "grad_norm": 0.36787545680999756, + "learning_rate": 1.4063465687866983e-06, + "loss": 0.205, + "num_input_tokens_seen": 54538880, + "step": 35710 + }, + { + "epoch": 110.23183925811438, + "grad_norm": 0.4603872299194336, + "learning_rate": 1.4031020343524438e-06, + "loss": 0.2887, + "num_input_tokens_seen": 54546368, + "step": 35715 + }, + { + "epoch": 110.24729520865533, + "grad_norm": 0.34854331612586975, + "learning_rate": 1.3998611388563926e-06, + "loss": 0.2499, + "num_input_tokens_seen": 54554464, + "step": 35720 + }, + { + "epoch": 110.26275115919628, + "grad_norm": 0.47909772396087646, + "learning_rate": 1.3966238827983314e-06, + "loss": 0.3501, + "num_input_tokens_seen": 54562688, + "step": 35725 + }, + { + "epoch": 110.27820710973725, + "grad_norm": 0.3868680000305176, + "learning_rate": 1.393390266677483e-06, + "loss": 0.2684, + "num_input_tokens_seen": 54569696, + "step": 35730 + }, + { + "epoch": 110.2936630602782, + "grad_norm": 0.4928896129131317, + "learning_rate": 1.3901602909925204e-06, + "loss": 0.2568, + "num_input_tokens_seen": 54577216, + "step": 35735 + }, + { + "epoch": 110.30911901081916, + "grad_norm": 0.4773978292942047, + "learning_rate": 1.3869339562415373e-06, + "loss": 0.2738, + "num_input_tokens_seen": 54585120, + "step": 35740 + }, + { + "epoch": 110.32457496136013, + "grad_norm": 0.30128365755081177, + "learning_rate": 1.38371126292208e-06, + "loss": 0.2125, + "num_input_tokens_seen": 54592544, + "step": 35745 + }, + { + "epoch": 110.34003091190108, + "grad_norm": 0.47470179200172424, + "learning_rate": 1.3804922115311286e-06, + "loss": 0.2892, + "num_input_tokens_seen": 54599968, + "step": 35750 + }, + { + "epoch": 110.35548686244204, + "grad_norm": 0.3620717525482178, + "learning_rate": 1.3772768025650945e-06, + "loss": 0.241, + "num_input_tokens_seen": 54607488, + "step": 35755 + }, + { + "epoch": 110.370942812983, + "grad_norm": 0.6536969542503357, + "learning_rate": 1.3740650365198448e-06, + "loss": 0.25, + "num_input_tokens_seen": 54614784, + "step": 35760 + }, + { + "epoch": 110.38639876352396, + "grad_norm": 0.8916097283363342, + "learning_rate": 1.3708569138906612e-06, + "loss": 0.2255, + "num_input_tokens_seen": 54622176, + "step": 35765 + }, + { + "epoch": 110.40185471406491, + "grad_norm": 0.45616909861564636, + "learning_rate": 1.367652435172287e-06, + "loss": 0.2974, + "num_input_tokens_seen": 54629792, + "step": 35770 + }, + { + "epoch": 110.41731066460588, + "grad_norm": 0.4232064485549927, + "learning_rate": 1.364451600858893e-06, + "loss": 0.2086, + "num_input_tokens_seen": 54637696, + "step": 35775 + }, + { + "epoch": 110.43276661514683, + "grad_norm": 0.41246089339256287, + "learning_rate": 1.3612544114440823e-06, + "loss": 0.2232, + "num_input_tokens_seen": 54644992, + "step": 35780 + }, + { + "epoch": 110.44822256568779, + "grad_norm": 0.5114409327507019, + "learning_rate": 1.3580608674209072e-06, + "loss": 0.2428, + "num_input_tokens_seen": 54652640, + "step": 35785 + }, + { + "epoch": 110.46367851622875, + "grad_norm": 0.5472604632377625, + "learning_rate": 1.3548709692818434e-06, + "loss": 0.286, + "num_input_tokens_seen": 54660128, + "step": 35790 + }, + { + "epoch": 110.47913446676971, + "grad_norm": 0.7317089438438416, + "learning_rate": 1.3516847175188223e-06, + "loss": 0.3275, + "num_input_tokens_seen": 54668864, + "step": 35795 + }, + { + "epoch": 110.49459041731066, + "grad_norm": 0.3604462742805481, + "learning_rate": 1.348502112623204e-06, + "loss": 0.225, + "num_input_tokens_seen": 54676672, + "step": 35800 + }, + { + "epoch": 110.49459041731066, + "eval_loss": 0.3014999032020569, + "eval_runtime": 6.2985, + "eval_samples_per_second": 91.292, + "eval_steps_per_second": 22.863, + "num_input_tokens_seen": 54676672, + "step": 35800 + }, + { + "epoch": 110.51004636785163, + "grad_norm": 0.5187234282493591, + "learning_rate": 1.3453231550857787e-06, + "loss": 0.2057, + "num_input_tokens_seen": 54684288, + "step": 35805 + }, + { + "epoch": 110.52550231839258, + "grad_norm": 0.5534575581550598, + "learning_rate": 1.3421478453967878e-06, + "loss": 0.2835, + "num_input_tokens_seen": 54691872, + "step": 35810 + }, + { + "epoch": 110.54095826893354, + "grad_norm": 0.5206355452537537, + "learning_rate": 1.3389761840459065e-06, + "loss": 0.2892, + "num_input_tokens_seen": 54699456, + "step": 35815 + }, + { + "epoch": 110.5564142194745, + "grad_norm": 0.32162827253341675, + "learning_rate": 1.3358081715222376e-06, + "loss": 0.2479, + "num_input_tokens_seen": 54706912, + "step": 35820 + }, + { + "epoch": 110.57187017001546, + "grad_norm": 0.5475916266441345, + "learning_rate": 1.3326438083143295e-06, + "loss": 0.2394, + "num_input_tokens_seen": 54714336, + "step": 35825 + }, + { + "epoch": 110.58732612055641, + "grad_norm": 0.44385942816734314, + "learning_rate": 1.3294830949101723e-06, + "loss": 0.2271, + "num_input_tokens_seen": 54721856, + "step": 35830 + }, + { + "epoch": 110.60278207109737, + "grad_norm": 0.5715579986572266, + "learning_rate": 1.3263260317971815e-06, + "loss": 0.2396, + "num_input_tokens_seen": 54729344, + "step": 35835 + }, + { + "epoch": 110.61823802163833, + "grad_norm": 0.4880164563655853, + "learning_rate": 1.3231726194622208e-06, + "loss": 0.28, + "num_input_tokens_seen": 54736960, + "step": 35840 + }, + { + "epoch": 110.63369397217929, + "grad_norm": 0.3343174159526825, + "learning_rate": 1.3200228583915814e-06, + "loss": 0.2477, + "num_input_tokens_seen": 54744256, + "step": 35845 + }, + { + "epoch": 110.64914992272024, + "grad_norm": 0.6629220247268677, + "learning_rate": 1.3168767490709971e-06, + "loss": 0.2457, + "num_input_tokens_seen": 54751904, + "step": 35850 + }, + { + "epoch": 110.66460587326121, + "grad_norm": 0.6834918260574341, + "learning_rate": 1.3137342919856437e-06, + "loss": 0.2501, + "num_input_tokens_seen": 54759264, + "step": 35855 + }, + { + "epoch": 110.68006182380216, + "grad_norm": 0.6191946864128113, + "learning_rate": 1.310595487620117e-06, + "loss": 0.2453, + "num_input_tokens_seen": 54766944, + "step": 35860 + }, + { + "epoch": 110.69551777434312, + "grad_norm": 0.6014384627342224, + "learning_rate": 1.3074603364584715e-06, + "loss": 0.2161, + "num_input_tokens_seen": 54773888, + "step": 35865 + }, + { + "epoch": 110.71097372488408, + "grad_norm": 0.40375664830207825, + "learning_rate": 1.3043288389841758e-06, + "loss": 0.3066, + "num_input_tokens_seen": 54781344, + "step": 35870 + }, + { + "epoch": 110.72642967542504, + "grad_norm": 0.4182307720184326, + "learning_rate": 1.3012009956801546e-06, + "loss": 0.3019, + "num_input_tokens_seen": 54789216, + "step": 35875 + }, + { + "epoch": 110.74188562596599, + "grad_norm": 0.5202481746673584, + "learning_rate": 1.2980768070287586e-06, + "loss": 0.2149, + "num_input_tokens_seen": 54796672, + "step": 35880 + }, + { + "epoch": 110.75734157650696, + "grad_norm": 0.5161909461021423, + "learning_rate": 1.2949562735117716e-06, + "loss": 0.2241, + "num_input_tokens_seen": 54804768, + "step": 35885 + }, + { + "epoch": 110.77279752704791, + "grad_norm": 0.7784649729728699, + "learning_rate": 1.291839395610428e-06, + "loss": 0.2542, + "num_input_tokens_seen": 54813152, + "step": 35890 + }, + { + "epoch": 110.78825347758887, + "grad_norm": 0.37447044253349304, + "learning_rate": 1.2887261738053852e-06, + "loss": 0.2338, + "num_input_tokens_seen": 54820736, + "step": 35895 + }, + { + "epoch": 110.80370942812984, + "grad_norm": 0.6079940795898438, + "learning_rate": 1.2856166085767396e-06, + "loss": 0.2174, + "num_input_tokens_seen": 54828480, + "step": 35900 + }, + { + "epoch": 110.81916537867079, + "grad_norm": 0.4589165449142456, + "learning_rate": 1.2825107004040272e-06, + "loss": 0.2071, + "num_input_tokens_seen": 54837376, + "step": 35905 + }, + { + "epoch": 110.83462132921174, + "grad_norm": 0.48362264037132263, + "learning_rate": 1.2794084497662146e-06, + "loss": 0.2339, + "num_input_tokens_seen": 54845184, + "step": 35910 + }, + { + "epoch": 110.85007727975271, + "grad_norm": 0.557406485080719, + "learning_rate": 1.276309857141711e-06, + "loss": 0.2375, + "num_input_tokens_seen": 54852608, + "step": 35915 + }, + { + "epoch": 110.86553323029366, + "grad_norm": 0.6986296772956848, + "learning_rate": 1.273214923008359e-06, + "loss": 0.3323, + "num_input_tokens_seen": 54860096, + "step": 35920 + }, + { + "epoch": 110.88098918083462, + "grad_norm": 0.36285239458084106, + "learning_rate": 1.2701236478434352e-06, + "loss": 0.1949, + "num_input_tokens_seen": 54867712, + "step": 35925 + }, + { + "epoch": 110.89644513137559, + "grad_norm": 0.39901092648506165, + "learning_rate": 1.2670360321236502e-06, + "loss": 0.2765, + "num_input_tokens_seen": 54875328, + "step": 35930 + }, + { + "epoch": 110.91190108191654, + "grad_norm": 0.4583507478237152, + "learning_rate": 1.2639520763251617e-06, + "loss": 0.2038, + "num_input_tokens_seen": 54882752, + "step": 35935 + }, + { + "epoch": 110.9273570324575, + "grad_norm": 0.5420776605606079, + "learning_rate": 1.2608717809235448e-06, + "loss": 0.2372, + "num_input_tokens_seen": 54890592, + "step": 35940 + }, + { + "epoch": 110.94281298299846, + "grad_norm": 0.5420188903808594, + "learning_rate": 1.2577951463938282e-06, + "loss": 0.2733, + "num_input_tokens_seen": 54898656, + "step": 35945 + }, + { + "epoch": 110.95826893353942, + "grad_norm": 0.42950350046157837, + "learning_rate": 1.2547221732104569e-06, + "loss": 0.2379, + "num_input_tokens_seen": 54906464, + "step": 35950 + }, + { + "epoch": 110.97372488408037, + "grad_norm": 0.7627288103103638, + "learning_rate": 1.25165286184733e-06, + "loss": 0.2287, + "num_input_tokens_seen": 54913856, + "step": 35955 + }, + { + "epoch": 110.98918083462132, + "grad_norm": 0.49860596656799316, + "learning_rate": 1.248587212777777e-06, + "loss": 0.2268, + "num_input_tokens_seen": 54922624, + "step": 35960 + }, + { + "epoch": 111.0030911901082, + "grad_norm": 0.4414052665233612, + "learning_rate": 1.2455252264745532e-06, + "loss": 0.2128, + "num_input_tokens_seen": 54929600, + "step": 35965 + }, + { + "epoch": 111.01854714064915, + "grad_norm": 0.43327268958091736, + "learning_rate": 1.2424669034098528e-06, + "loss": 0.2695, + "num_input_tokens_seen": 54937792, + "step": 35970 + }, + { + "epoch": 111.03400309119012, + "grad_norm": 0.47471052408218384, + "learning_rate": 1.2394122440553185e-06, + "loss": 0.2856, + "num_input_tokens_seen": 54945824, + "step": 35975 + }, + { + "epoch": 111.04945904173107, + "grad_norm": 0.8316190838813782, + "learning_rate": 1.2363612488820037e-06, + "loss": 0.2551, + "num_input_tokens_seen": 54952928, + "step": 35980 + }, + { + "epoch": 111.06491499227202, + "grad_norm": 0.7049325704574585, + "learning_rate": 1.2333139183604208e-06, + "loss": 0.2687, + "num_input_tokens_seen": 54960416, + "step": 35985 + }, + { + "epoch": 111.08037094281298, + "grad_norm": 0.5587902665138245, + "learning_rate": 1.2302702529604998e-06, + "loss": 0.2812, + "num_input_tokens_seen": 54968256, + "step": 35990 + }, + { + "epoch": 111.09582689335394, + "grad_norm": 0.4984731674194336, + "learning_rate": 1.227230253151615e-06, + "loss": 0.2434, + "num_input_tokens_seen": 54975744, + "step": 35995 + }, + { + "epoch": 111.1112828438949, + "grad_norm": 0.5147832632064819, + "learning_rate": 1.2241939194025748e-06, + "loss": 0.2609, + "num_input_tokens_seen": 54983008, + "step": 36000 + }, + { + "epoch": 111.1112828438949, + "eval_loss": 0.301506906747818, + "eval_runtime": 6.3134, + "eval_samples_per_second": 91.076, + "eval_steps_per_second": 22.809, + "num_input_tokens_seen": 54983008, + "step": 36000 + }, + { + "epoch": 111.12673879443585, + "grad_norm": 0.41458553075790405, + "learning_rate": 1.2211612521816156e-06, + "loss": 0.2061, + "num_input_tokens_seen": 54990528, + "step": 36005 + }, + { + "epoch": 111.14219474497682, + "grad_norm": 0.3849138021469116, + "learning_rate": 1.2181322519564137e-06, + "loss": 0.2096, + "num_input_tokens_seen": 54998400, + "step": 36010 + }, + { + "epoch": 111.15765069551777, + "grad_norm": 0.5336860418319702, + "learning_rate": 1.2151069191940839e-06, + "loss": 0.2409, + "num_input_tokens_seen": 55006176, + "step": 36015 + }, + { + "epoch": 111.17310664605873, + "grad_norm": 0.41601401567459106, + "learning_rate": 1.2120852543611644e-06, + "loss": 0.2578, + "num_input_tokens_seen": 55013376, + "step": 36020 + }, + { + "epoch": 111.1885625965997, + "grad_norm": 0.4771096408367157, + "learning_rate": 1.2090672579236379e-06, + "loss": 0.2595, + "num_input_tokens_seen": 55021280, + "step": 36025 + }, + { + "epoch": 111.20401854714065, + "grad_norm": 0.44838497042655945, + "learning_rate": 1.2060529303469126e-06, + "loss": 0.2202, + "num_input_tokens_seen": 55029248, + "step": 36030 + }, + { + "epoch": 111.2194744976816, + "grad_norm": 0.5275312066078186, + "learning_rate": 1.2030422720958445e-06, + "loss": 0.2825, + "num_input_tokens_seen": 55036448, + "step": 36035 + }, + { + "epoch": 111.23493044822257, + "grad_norm": 0.4926569163799286, + "learning_rate": 1.200035283634704e-06, + "loss": 0.2147, + "num_input_tokens_seen": 55044384, + "step": 36040 + }, + { + "epoch": 111.25038639876352, + "grad_norm": 0.29772940278053284, + "learning_rate": 1.1970319654272144e-06, + "loss": 0.2982, + "num_input_tokens_seen": 55051936, + "step": 36045 + }, + { + "epoch": 111.26584234930448, + "grad_norm": 0.525235652923584, + "learning_rate": 1.1940323179365192e-06, + "loss": 0.2679, + "num_input_tokens_seen": 55059168, + "step": 36050 + }, + { + "epoch": 111.28129829984545, + "grad_norm": 0.4385564625263214, + "learning_rate": 1.1910363416252095e-06, + "loss": 0.1767, + "num_input_tokens_seen": 55066880, + "step": 36055 + }, + { + "epoch": 111.2967542503864, + "grad_norm": 0.47551169991493225, + "learning_rate": 1.1880440369552964e-06, + "loss": 0.2929, + "num_input_tokens_seen": 55074272, + "step": 36060 + }, + { + "epoch": 111.31221020092735, + "grad_norm": 0.6640962958335876, + "learning_rate": 1.1850554043882328e-06, + "loss": 0.2724, + "num_input_tokens_seen": 55082240, + "step": 36065 + }, + { + "epoch": 111.32766615146832, + "grad_norm": 0.5934236645698547, + "learning_rate": 1.1820704443849028e-06, + "loss": 0.193, + "num_input_tokens_seen": 55089952, + "step": 36070 + }, + { + "epoch": 111.34312210200927, + "grad_norm": 0.5845664739608765, + "learning_rate": 1.1790891574056219e-06, + "loss": 0.2245, + "num_input_tokens_seen": 55097856, + "step": 36075 + }, + { + "epoch": 111.35857805255023, + "grad_norm": 0.869540274143219, + "learning_rate": 1.1761115439101523e-06, + "loss": 0.2356, + "num_input_tokens_seen": 55105792, + "step": 36080 + }, + { + "epoch": 111.3740340030912, + "grad_norm": 0.5921748280525208, + "learning_rate": 1.1731376043576659e-06, + "loss": 0.1945, + "num_input_tokens_seen": 55112992, + "step": 36085 + }, + { + "epoch": 111.38948995363215, + "grad_norm": 0.6224775314331055, + "learning_rate": 1.1701673392067875e-06, + "loss": 0.2404, + "num_input_tokens_seen": 55120384, + "step": 36090 + }, + { + "epoch": 111.4049459041731, + "grad_norm": 0.5357508659362793, + "learning_rate": 1.1672007489155757e-06, + "loss": 0.2236, + "num_input_tokens_seen": 55127968, + "step": 36095 + }, + { + "epoch": 111.42040185471407, + "grad_norm": 0.5548082590103149, + "learning_rate": 1.164237833941506e-06, + "loss": 0.2168, + "num_input_tokens_seen": 55135712, + "step": 36100 + }, + { + "epoch": 111.43585780525503, + "grad_norm": 0.431573748588562, + "learning_rate": 1.1612785947415022e-06, + "loss": 0.2879, + "num_input_tokens_seen": 55142912, + "step": 36105 + }, + { + "epoch": 111.45131375579598, + "grad_norm": 0.384122371673584, + "learning_rate": 1.1583230317719185e-06, + "loss": 0.2119, + "num_input_tokens_seen": 55150944, + "step": 36110 + }, + { + "epoch": 111.46676970633693, + "grad_norm": 0.45070239901542664, + "learning_rate": 1.1553711454885318e-06, + "loss": 0.2559, + "num_input_tokens_seen": 55158688, + "step": 36115 + }, + { + "epoch": 111.4822256568779, + "grad_norm": 0.5309523344039917, + "learning_rate": 1.152422936346567e-06, + "loss": 0.2603, + "num_input_tokens_seen": 55166496, + "step": 36120 + }, + { + "epoch": 111.49768160741885, + "grad_norm": 0.4866388440132141, + "learning_rate": 1.1494784048006718e-06, + "loss": 0.2536, + "num_input_tokens_seen": 55174272, + "step": 36125 + }, + { + "epoch": 111.51313755795981, + "grad_norm": 0.3878767192363739, + "learning_rate": 1.1465375513049326e-06, + "loss": 0.2385, + "num_input_tokens_seen": 55182112, + "step": 36130 + }, + { + "epoch": 111.52859350850078, + "grad_norm": 0.6904415488243103, + "learning_rate": 1.1436003763128616e-06, + "loss": 0.2563, + "num_input_tokens_seen": 55190400, + "step": 36135 + }, + { + "epoch": 111.54404945904173, + "grad_norm": 0.658128023147583, + "learning_rate": 1.1406668802774106e-06, + "loss": 0.2791, + "num_input_tokens_seen": 55197696, + "step": 36140 + }, + { + "epoch": 111.55950540958268, + "grad_norm": 1.1741101741790771, + "learning_rate": 1.137737063650965e-06, + "loss": 0.272, + "num_input_tokens_seen": 55205184, + "step": 36145 + }, + { + "epoch": 111.57496136012365, + "grad_norm": 0.970321774482727, + "learning_rate": 1.1348109268853323e-06, + "loss": 0.2722, + "num_input_tokens_seen": 55212192, + "step": 36150 + }, + { + "epoch": 111.5904173106646, + "grad_norm": 0.49238282442092896, + "learning_rate": 1.1318884704317634e-06, + "loss": 0.2424, + "num_input_tokens_seen": 55220000, + "step": 36155 + }, + { + "epoch": 111.60587326120556, + "grad_norm": 0.8261064291000366, + "learning_rate": 1.1289696947409417e-06, + "loss": 0.2505, + "num_input_tokens_seen": 55228000, + "step": 36160 + }, + { + "epoch": 111.62132921174653, + "grad_norm": 0.30349862575531006, + "learning_rate": 1.126054600262974e-06, + "loss": 0.2393, + "num_input_tokens_seen": 55235904, + "step": 36165 + }, + { + "epoch": 111.63678516228748, + "grad_norm": 0.6516168117523193, + "learning_rate": 1.1231431874474064e-06, + "loss": 0.2687, + "num_input_tokens_seen": 55243616, + "step": 36170 + }, + { + "epoch": 111.65224111282843, + "grad_norm": 0.5401999354362488, + "learning_rate": 1.12023545674321e-06, + "loss": 0.2118, + "num_input_tokens_seen": 55250560, + "step": 36175 + }, + { + "epoch": 111.6676970633694, + "grad_norm": 0.5246886014938354, + "learning_rate": 1.117331408598804e-06, + "loss": 0.2802, + "num_input_tokens_seen": 55257984, + "step": 36180 + }, + { + "epoch": 111.68315301391036, + "grad_norm": 0.563715398311615, + "learning_rate": 1.1144310434620191e-06, + "loss": 0.2331, + "num_input_tokens_seen": 55266016, + "step": 36185 + }, + { + "epoch": 111.69860896445131, + "grad_norm": 0.823227047920227, + "learning_rate": 1.1115343617801365e-06, + "loss": 0.2578, + "num_input_tokens_seen": 55273984, + "step": 36190 + }, + { + "epoch": 111.71406491499228, + "grad_norm": 0.9335619807243347, + "learning_rate": 1.1086413639998515e-06, + "loss": 0.2993, + "num_input_tokens_seen": 55281344, + "step": 36195 + }, + { + "epoch": 111.72952086553323, + "grad_norm": 0.632819652557373, + "learning_rate": 1.1057520505673103e-06, + "loss": 0.2401, + "num_input_tokens_seen": 55289472, + "step": 36200 + }, + { + "epoch": 111.72952086553323, + "eval_loss": 0.3015429675579071, + "eval_runtime": 6.2648, + "eval_samples_per_second": 91.783, + "eval_steps_per_second": 22.986, + "num_input_tokens_seen": 55289472, + "step": 36200 + }, + { + "epoch": 111.74497681607419, + "grad_norm": 0.4494083821773529, + "learning_rate": 1.1028664219280727e-06, + "loss": 0.2687, + "num_input_tokens_seen": 55296992, + "step": 36205 + }, + { + "epoch": 111.76043276661515, + "grad_norm": 0.4743505120277405, + "learning_rate": 1.0999844785271468e-06, + "loss": 0.2598, + "num_input_tokens_seen": 55304064, + "step": 36210 + }, + { + "epoch": 111.7758887171561, + "grad_norm": 0.46818774938583374, + "learning_rate": 1.097106220808955e-06, + "loss": 0.2312, + "num_input_tokens_seen": 55311360, + "step": 36215 + }, + { + "epoch": 111.79134466769706, + "grad_norm": 0.3981790840625763, + "learning_rate": 1.0942316492173698e-06, + "loss": 0.2678, + "num_input_tokens_seen": 55319040, + "step": 36220 + }, + { + "epoch": 111.80680061823801, + "grad_norm": 0.5753517746925354, + "learning_rate": 1.0913607641956841e-06, + "loss": 0.2286, + "num_input_tokens_seen": 55326496, + "step": 36225 + }, + { + "epoch": 111.82225656877898, + "grad_norm": 0.5661334991455078, + "learning_rate": 1.0884935661866213e-06, + "loss": 0.2845, + "num_input_tokens_seen": 55333728, + "step": 36230 + }, + { + "epoch": 111.83771251931994, + "grad_norm": 0.4828256666660309, + "learning_rate": 1.0856300556323418e-06, + "loss": 0.2339, + "num_input_tokens_seen": 55342080, + "step": 36235 + }, + { + "epoch": 111.85316846986089, + "grad_norm": 0.5279965996742249, + "learning_rate": 1.0827702329744365e-06, + "loss": 0.2256, + "num_input_tokens_seen": 55350176, + "step": 36240 + }, + { + "epoch": 111.86862442040186, + "grad_norm": 0.7345801591873169, + "learning_rate": 1.0799140986539197e-06, + "loss": 0.2503, + "num_input_tokens_seen": 55357984, + "step": 36245 + }, + { + "epoch": 111.88408037094281, + "grad_norm": 0.4901556968688965, + "learning_rate": 1.0770616531112526e-06, + "loss": 0.2002, + "num_input_tokens_seen": 55365664, + "step": 36250 + }, + { + "epoch": 111.89953632148377, + "grad_norm": 0.4870505928993225, + "learning_rate": 1.0742128967863085e-06, + "loss": 0.213, + "num_input_tokens_seen": 55373824, + "step": 36255 + }, + { + "epoch": 111.91499227202473, + "grad_norm": 0.6340245008468628, + "learning_rate": 1.071367830118411e-06, + "loss": 0.2987, + "num_input_tokens_seen": 55381120, + "step": 36260 + }, + { + "epoch": 111.93044822256569, + "grad_norm": 0.31731513142585754, + "learning_rate": 1.068526453546298e-06, + "loss": 0.275, + "num_input_tokens_seen": 55388544, + "step": 36265 + }, + { + "epoch": 111.94590417310664, + "grad_norm": 0.5639902353286743, + "learning_rate": 1.0656887675081467e-06, + "loss": 0.2108, + "num_input_tokens_seen": 55396032, + "step": 36270 + }, + { + "epoch": 111.96136012364761, + "grad_norm": 0.6303313970565796, + "learning_rate": 1.0628547724415628e-06, + "loss": 0.2405, + "num_input_tokens_seen": 55403552, + "step": 36275 + }, + { + "epoch": 111.97681607418856, + "grad_norm": 0.37413373589515686, + "learning_rate": 1.0600244687835881e-06, + "loss": 0.234, + "num_input_tokens_seen": 55411456, + "step": 36280 + }, + { + "epoch": 111.99227202472952, + "grad_norm": 0.39698830246925354, + "learning_rate": 1.0571978569706876e-06, + "loss": 0.2164, + "num_input_tokens_seen": 55419104, + "step": 36285 + }, + { + "epoch": 112.00618238021639, + "grad_norm": 0.6187307238578796, + "learning_rate": 1.0543749374387652e-06, + "loss": 0.2213, + "num_input_tokens_seen": 55425520, + "step": 36290 + }, + { + "epoch": 112.02163833075734, + "grad_norm": 0.5955240726470947, + "learning_rate": 1.051555710623142e-06, + "loss": 0.3122, + "num_input_tokens_seen": 55433616, + "step": 36295 + }, + { + "epoch": 112.0370942812983, + "grad_norm": 0.38029423356056213, + "learning_rate": 1.0487401769585847e-06, + "loss": 0.2212, + "num_input_tokens_seen": 55440816, + "step": 36300 + }, + { + "epoch": 112.05255023183926, + "grad_norm": 0.7954638600349426, + "learning_rate": 1.0459283368792845e-06, + "loss": 0.2091, + "num_input_tokens_seen": 55448176, + "step": 36305 + }, + { + "epoch": 112.06800618238022, + "grad_norm": 0.5702295899391174, + "learning_rate": 1.043120190818858e-06, + "loss": 0.2483, + "num_input_tokens_seen": 55456048, + "step": 36310 + }, + { + "epoch": 112.08346213292117, + "grad_norm": 0.5991599559783936, + "learning_rate": 1.0403157392103596e-06, + "loss": 0.2513, + "num_input_tokens_seen": 55463344, + "step": 36315 + }, + { + "epoch": 112.09891808346214, + "grad_norm": 0.395486980676651, + "learning_rate": 1.0375149824862735e-06, + "loss": 0.2544, + "num_input_tokens_seen": 55470448, + "step": 36320 + }, + { + "epoch": 112.11437403400309, + "grad_norm": 0.47912561893463135, + "learning_rate": 1.034717921078507e-06, + "loss": 0.249, + "num_input_tokens_seen": 55477936, + "step": 36325 + }, + { + "epoch": 112.12982998454405, + "grad_norm": 0.42593735456466675, + "learning_rate": 1.0319245554184009e-06, + "loss": 0.2432, + "num_input_tokens_seen": 55485648, + "step": 36330 + }, + { + "epoch": 112.14528593508501, + "grad_norm": 0.3226417899131775, + "learning_rate": 1.0291348859367361e-06, + "loss": 0.2539, + "num_input_tokens_seen": 55493840, + "step": 36335 + }, + { + "epoch": 112.16074188562597, + "grad_norm": 0.755936324596405, + "learning_rate": 1.0263489130637016e-06, + "loss": 0.2217, + "num_input_tokens_seen": 55501328, + "step": 36340 + }, + { + "epoch": 112.17619783616692, + "grad_norm": 0.38765382766723633, + "learning_rate": 1.0235666372289427e-06, + "loss": 0.213, + "num_input_tokens_seen": 55509072, + "step": 36345 + }, + { + "epoch": 112.19165378670789, + "grad_norm": 0.5679465532302856, + "learning_rate": 1.0207880588615076e-06, + "loss": 0.289, + "num_input_tokens_seen": 55516400, + "step": 36350 + }, + { + "epoch": 112.20710973724884, + "grad_norm": 0.5733044147491455, + "learning_rate": 1.0180131783898984e-06, + "loss": 0.2633, + "num_input_tokens_seen": 55523536, + "step": 36355 + }, + { + "epoch": 112.2225656877898, + "grad_norm": 0.3493580222129822, + "learning_rate": 1.0152419962420362e-06, + "loss": 0.2431, + "num_input_tokens_seen": 55530768, + "step": 36360 + }, + { + "epoch": 112.23802163833076, + "grad_norm": 0.40252628922462463, + "learning_rate": 1.0124745128452685e-06, + "loss": 0.2411, + "num_input_tokens_seen": 55537808, + "step": 36365 + }, + { + "epoch": 112.25347758887172, + "grad_norm": 0.5140567421913147, + "learning_rate": 1.0097107286263758e-06, + "loss": 0.269, + "num_input_tokens_seen": 55546064, + "step": 36370 + }, + { + "epoch": 112.26893353941267, + "grad_norm": 0.35582464933395386, + "learning_rate": 1.00695064401157e-06, + "loss": 0.2358, + "num_input_tokens_seen": 55553904, + "step": 36375 + }, + { + "epoch": 112.28438948995363, + "grad_norm": 0.6268022656440735, + "learning_rate": 1.0041942594264886e-06, + "loss": 0.2979, + "num_input_tokens_seen": 55561424, + "step": 36380 + }, + { + "epoch": 112.2998454404946, + "grad_norm": 0.8181461095809937, + "learning_rate": 1.001441575296208e-06, + "loss": 0.2291, + "num_input_tokens_seen": 55568912, + "step": 36385 + }, + { + "epoch": 112.31530139103555, + "grad_norm": 0.35424551367759705, + "learning_rate": 9.986925920452139e-07, + "loss": 0.1969, + "num_input_tokens_seen": 55576976, + "step": 36390 + }, + { + "epoch": 112.3307573415765, + "grad_norm": 0.44182726740837097, + "learning_rate": 9.959473100974475e-07, + "loss": 0.2424, + "num_input_tokens_seen": 55584112, + "step": 36395 + }, + { + "epoch": 112.34621329211747, + "grad_norm": 0.6285998225212097, + "learning_rate": 9.932057298762564e-07, + "loss": 0.2103, + "num_input_tokens_seen": 55591632, + "step": 36400 + }, + { + "epoch": 112.34621329211747, + "eval_loss": 0.302053302526474, + "eval_runtime": 6.3161, + "eval_samples_per_second": 91.038, + "eval_steps_per_second": 22.799, + "num_input_tokens_seen": 55591632, + "step": 36400 + }, + { + "epoch": 112.36166924265842, + "grad_norm": 0.46435919404029846, + "learning_rate": 9.90467851804433e-07, + "loss": 0.2255, + "num_input_tokens_seen": 55599376, + "step": 36405 + }, + { + "epoch": 112.37712519319938, + "grad_norm": 0.4678661823272705, + "learning_rate": 9.877336763041895e-07, + "loss": 0.268, + "num_input_tokens_seen": 55606992, + "step": 36410 + }, + { + "epoch": 112.39258114374034, + "grad_norm": 0.32255491614341736, + "learning_rate": 9.850032037971662e-07, + "loss": 0.2217, + "num_input_tokens_seen": 55614864, + "step": 36415 + }, + { + "epoch": 112.4080370942813, + "grad_norm": 0.48518136143684387, + "learning_rate": 9.822764347044406e-07, + "loss": 0.2055, + "num_input_tokens_seen": 55622480, + "step": 36420 + }, + { + "epoch": 112.42349304482225, + "grad_norm": 0.4696531593799591, + "learning_rate": 9.795533694465175e-07, + "loss": 0.2517, + "num_input_tokens_seen": 55630576, + "step": 36425 + }, + { + "epoch": 112.43894899536322, + "grad_norm": 0.3317917287349701, + "learning_rate": 9.768340084433197e-07, + "loss": 0.2394, + "num_input_tokens_seen": 55638096, + "step": 36430 + }, + { + "epoch": 112.45440494590417, + "grad_norm": 0.42111051082611084, + "learning_rate": 9.741183521142143e-07, + "loss": 0.2177, + "num_input_tokens_seen": 55645712, + "step": 36435 + }, + { + "epoch": 112.46986089644513, + "grad_norm": 0.4706868827342987, + "learning_rate": 9.714064008779889e-07, + "loss": 0.25, + "num_input_tokens_seen": 55653008, + "step": 36440 + }, + { + "epoch": 112.4853168469861, + "grad_norm": 0.6806322336196899, + "learning_rate": 9.686981551528584e-07, + "loss": 0.2033, + "num_input_tokens_seen": 55660944, + "step": 36445 + }, + { + "epoch": 112.50077279752705, + "grad_norm": 0.3626304566860199, + "learning_rate": 9.65993615356467e-07, + "loss": 0.2124, + "num_input_tokens_seen": 55668400, + "step": 36450 + }, + { + "epoch": 112.516228748068, + "grad_norm": 0.5157350897789001, + "learning_rate": 9.632927819058917e-07, + "loss": 0.2726, + "num_input_tokens_seen": 55676656, + "step": 36455 + }, + { + "epoch": 112.53168469860897, + "grad_norm": 0.4168548285961151, + "learning_rate": 9.605956552176305e-07, + "loss": 0.2184, + "num_input_tokens_seen": 55684528, + "step": 36460 + }, + { + "epoch": 112.54714064914992, + "grad_norm": 0.28869810700416565, + "learning_rate": 9.579022357076223e-07, + "loss": 0.2212, + "num_input_tokens_seen": 55691920, + "step": 36465 + }, + { + "epoch": 112.56259659969088, + "grad_norm": 0.5958966016769409, + "learning_rate": 9.552125237912158e-07, + "loss": 0.2369, + "num_input_tokens_seen": 55699760, + "step": 36470 + }, + { + "epoch": 112.57805255023185, + "grad_norm": 0.311226487159729, + "learning_rate": 9.525265198832096e-07, + "loss": 0.2292, + "num_input_tokens_seen": 55707472, + "step": 36475 + }, + { + "epoch": 112.5935085007728, + "grad_norm": 0.7736281156539917, + "learning_rate": 9.498442243978112e-07, + "loss": 0.2987, + "num_input_tokens_seen": 55714800, + "step": 36480 + }, + { + "epoch": 112.60896445131375, + "grad_norm": 0.32466772198677063, + "learning_rate": 9.471656377486649e-07, + "loss": 0.2112, + "num_input_tokens_seen": 55722640, + "step": 36485 + }, + { + "epoch": 112.62442040185472, + "grad_norm": 0.43679848313331604, + "learning_rate": 9.444907603488456e-07, + "loss": 0.2805, + "num_input_tokens_seen": 55730032, + "step": 36490 + }, + { + "epoch": 112.63987635239567, + "grad_norm": 0.5016418099403381, + "learning_rate": 9.418195926108514e-07, + "loss": 0.2515, + "num_input_tokens_seen": 55737392, + "step": 36495 + }, + { + "epoch": 112.65533230293663, + "grad_norm": 0.5771806240081787, + "learning_rate": 9.391521349466053e-07, + "loss": 0.229, + "num_input_tokens_seen": 55745424, + "step": 36500 + }, + { + "epoch": 112.67078825347758, + "grad_norm": 0.44663113355636597, + "learning_rate": 9.364883877674758e-07, + "loss": 0.2705, + "num_input_tokens_seen": 55753680, + "step": 36505 + }, + { + "epoch": 112.68624420401855, + "grad_norm": 0.6811786890029907, + "learning_rate": 9.33828351484231e-07, + "loss": 0.2439, + "num_input_tokens_seen": 55761232, + "step": 36510 + }, + { + "epoch": 112.7017001545595, + "grad_norm": 0.3352031409740448, + "learning_rate": 9.311720265070906e-07, + "loss": 0.2363, + "num_input_tokens_seen": 55769200, + "step": 36515 + }, + { + "epoch": 112.71715610510046, + "grad_norm": 0.7244672179222107, + "learning_rate": 9.285194132456931e-07, + "loss": 0.304, + "num_input_tokens_seen": 55776304, + "step": 36520 + }, + { + "epoch": 112.73261205564143, + "grad_norm": 0.37550225853919983, + "learning_rate": 9.258705121091032e-07, + "loss": 0.2405, + "num_input_tokens_seen": 55783568, + "step": 36525 + }, + { + "epoch": 112.74806800618238, + "grad_norm": 0.48093074560165405, + "learning_rate": 9.232253235058136e-07, + "loss": 0.3222, + "num_input_tokens_seen": 55790864, + "step": 36530 + }, + { + "epoch": 112.76352395672333, + "grad_norm": 0.4333009719848633, + "learning_rate": 9.205838478437478e-07, + "loss": 0.2217, + "num_input_tokens_seen": 55799024, + "step": 36535 + }, + { + "epoch": 112.7789799072643, + "grad_norm": 0.4830370545387268, + "learning_rate": 9.179460855302524e-07, + "loss": 0.2608, + "num_input_tokens_seen": 55806544, + "step": 36540 + }, + { + "epoch": 112.79443585780525, + "grad_norm": 0.621720552444458, + "learning_rate": 9.153120369721046e-07, + "loss": 0.2289, + "num_input_tokens_seen": 55814032, + "step": 36545 + }, + { + "epoch": 112.80989180834621, + "grad_norm": 0.36171457171440125, + "learning_rate": 9.126817025755103e-07, + "loss": 0.2077, + "num_input_tokens_seen": 55822064, + "step": 36550 + }, + { + "epoch": 112.82534775888718, + "grad_norm": 0.6805824637413025, + "learning_rate": 9.100550827460947e-07, + "loss": 0.2925, + "num_input_tokens_seen": 55830288, + "step": 36555 + }, + { + "epoch": 112.84080370942813, + "grad_norm": 0.4165670573711395, + "learning_rate": 9.0743217788892e-07, + "loss": 0.2059, + "num_input_tokens_seen": 55837840, + "step": 36560 + }, + { + "epoch": 112.85625965996908, + "grad_norm": 0.49462589621543884, + "learning_rate": 9.048129884084683e-07, + "loss": 0.2112, + "num_input_tokens_seen": 55845136, + "step": 36565 + }, + { + "epoch": 112.87171561051005, + "grad_norm": 0.3310208320617676, + "learning_rate": 9.021975147086553e-07, + "loss": 0.2261, + "num_input_tokens_seen": 55852944, + "step": 36570 + }, + { + "epoch": 112.887171561051, + "grad_norm": 0.42178934812545776, + "learning_rate": 8.995857571928141e-07, + "loss": 0.2523, + "num_input_tokens_seen": 55860752, + "step": 36575 + }, + { + "epoch": 112.90262751159196, + "grad_norm": 0.6827133297920227, + "learning_rate": 8.969777162637139e-07, + "loss": 0.28, + "num_input_tokens_seen": 55868080, + "step": 36580 + }, + { + "epoch": 112.91808346213293, + "grad_norm": 0.64750736951828, + "learning_rate": 8.943733923235525e-07, + "loss": 0.2724, + "num_input_tokens_seen": 55875344, + "step": 36585 + }, + { + "epoch": 112.93353941267388, + "grad_norm": 0.2620856761932373, + "learning_rate": 8.917727857739394e-07, + "loss": 0.2574, + "num_input_tokens_seen": 55883216, + "step": 36590 + }, + { + "epoch": 112.94899536321483, + "grad_norm": 0.7398517727851868, + "learning_rate": 8.891758970159258e-07, + "loss": 0.366, + "num_input_tokens_seen": 55891024, + "step": 36595 + }, + { + "epoch": 112.9644513137558, + "grad_norm": 0.4569053649902344, + "learning_rate": 8.86582726449986e-07, + "loss": 0.2725, + "num_input_tokens_seen": 55898640, + "step": 36600 + }, + { + "epoch": 112.9644513137558, + "eval_loss": 0.3016294240951538, + "eval_runtime": 6.3044, + "eval_samples_per_second": 91.206, + "eval_steps_per_second": 22.841, + "num_input_tokens_seen": 55898640, + "step": 36600 + }, + { + "epoch": 112.97990726429676, + "grad_norm": 0.5590840578079224, + "learning_rate": 8.839932744760165e-07, + "loss": 0.2188, + "num_input_tokens_seen": 55906512, + "step": 36605 + }, + { + "epoch": 112.99536321483771, + "grad_norm": 0.5068679451942444, + "learning_rate": 8.814075414933482e-07, + "loss": 0.2595, + "num_input_tokens_seen": 55913904, + "step": 36610 + }, + { + "epoch": 113.00927357032458, + "grad_norm": 0.6424798369407654, + "learning_rate": 8.788255279007257e-07, + "loss": 0.2875, + "num_input_tokens_seen": 55920592, + "step": 36615 + }, + { + "epoch": 113.02472952086553, + "grad_norm": 0.6013886332511902, + "learning_rate": 8.762472340963362e-07, + "loss": 0.2546, + "num_input_tokens_seen": 55928656, + "step": 36620 + }, + { + "epoch": 113.04018547140649, + "grad_norm": 0.411657452583313, + "learning_rate": 8.736726604777811e-07, + "loss": 0.2113, + "num_input_tokens_seen": 55936592, + "step": 36625 + }, + { + "epoch": 113.05564142194746, + "grad_norm": 0.6767692565917969, + "learning_rate": 8.711018074420901e-07, + "loss": 0.2007, + "num_input_tokens_seen": 55943888, + "step": 36630 + }, + { + "epoch": 113.07109737248841, + "grad_norm": 0.7065767049789429, + "learning_rate": 8.685346753857209e-07, + "loss": 0.2589, + "num_input_tokens_seen": 55951184, + "step": 36635 + }, + { + "epoch": 113.08655332302936, + "grad_norm": 0.5387369394302368, + "learning_rate": 8.659712647045654e-07, + "loss": 0.259, + "num_input_tokens_seen": 55958800, + "step": 36640 + }, + { + "epoch": 113.10200927357033, + "grad_norm": 0.7354624271392822, + "learning_rate": 8.634115757939209e-07, + "loss": 0.2674, + "num_input_tokens_seen": 55966832, + "step": 36645 + }, + { + "epoch": 113.11746522411129, + "grad_norm": 0.4357966482639313, + "learning_rate": 8.608556090485387e-07, + "loss": 0.2393, + "num_input_tokens_seen": 55974448, + "step": 36650 + }, + { + "epoch": 113.13292117465224, + "grad_norm": 0.3650665581226349, + "learning_rate": 8.583033648625671e-07, + "loss": 0.2499, + "num_input_tokens_seen": 55981680, + "step": 36655 + }, + { + "epoch": 113.14837712519319, + "grad_norm": 0.45359599590301514, + "learning_rate": 8.557548436295998e-07, + "loss": 0.2085, + "num_input_tokens_seen": 55989392, + "step": 36660 + }, + { + "epoch": 113.16383307573416, + "grad_norm": 0.49696964025497437, + "learning_rate": 8.532100457426556e-07, + "loss": 0.2352, + "num_input_tokens_seen": 55996944, + "step": 36665 + }, + { + "epoch": 113.17928902627511, + "grad_norm": 0.47548526525497437, + "learning_rate": 8.506689715941679e-07, + "loss": 0.261, + "num_input_tokens_seen": 56004784, + "step": 36670 + }, + { + "epoch": 113.19474497681607, + "grad_norm": 0.7284438014030457, + "learning_rate": 8.481316215760011e-07, + "loss": 0.2558, + "num_input_tokens_seen": 56012848, + "step": 36675 + }, + { + "epoch": 113.21020092735704, + "grad_norm": 0.47526249289512634, + "learning_rate": 8.455979960794558e-07, + "loss": 0.2101, + "num_input_tokens_seen": 56020432, + "step": 36680 + }, + { + "epoch": 113.22565687789799, + "grad_norm": 0.6384721994400024, + "learning_rate": 8.430680954952364e-07, + "loss": 0.214, + "num_input_tokens_seen": 56027824, + "step": 36685 + }, + { + "epoch": 113.24111282843894, + "grad_norm": 0.4317852854728699, + "learning_rate": 8.405419202134974e-07, + "loss": 0.274, + "num_input_tokens_seen": 56035888, + "step": 36690 + }, + { + "epoch": 113.25656877897991, + "grad_norm": 0.4019440710544586, + "learning_rate": 8.380194706237993e-07, + "loss": 0.2474, + "num_input_tokens_seen": 56043824, + "step": 36695 + }, + { + "epoch": 113.27202472952087, + "grad_norm": 0.45286494493484497, + "learning_rate": 8.355007471151366e-07, + "loss": 0.2705, + "num_input_tokens_seen": 56051152, + "step": 36700 + }, + { + "epoch": 113.28748068006182, + "grad_norm": 0.25634685158729553, + "learning_rate": 8.329857500759292e-07, + "loss": 0.2642, + "num_input_tokens_seen": 56059152, + "step": 36705 + }, + { + "epoch": 113.30293663060279, + "grad_norm": 0.5122438073158264, + "learning_rate": 8.304744798940194e-07, + "loss": 0.2526, + "num_input_tokens_seen": 56066928, + "step": 36710 + }, + { + "epoch": 113.31839258114374, + "grad_norm": 0.6015135049819946, + "learning_rate": 8.279669369566756e-07, + "loss": 0.2302, + "num_input_tokens_seen": 56074000, + "step": 36715 + }, + { + "epoch": 113.3338485316847, + "grad_norm": 0.6415765285491943, + "learning_rate": 8.254631216505993e-07, + "loss": 0.2858, + "num_input_tokens_seen": 56081584, + "step": 36720 + }, + { + "epoch": 113.34930448222566, + "grad_norm": 0.5045265555381775, + "learning_rate": 8.229630343619038e-07, + "loss": 0.2334, + "num_input_tokens_seen": 56088880, + "step": 36725 + }, + { + "epoch": 113.36476043276662, + "grad_norm": 0.8257158398628235, + "learning_rate": 8.204666754761392e-07, + "loss": 0.285, + "num_input_tokens_seen": 56096816, + "step": 36730 + }, + { + "epoch": 113.38021638330757, + "grad_norm": 0.34249112010002136, + "learning_rate": 8.179740453782669e-07, + "loss": 0.2019, + "num_input_tokens_seen": 56104656, + "step": 36735 + }, + { + "epoch": 113.39567233384854, + "grad_norm": 0.890343189239502, + "learning_rate": 8.154851444526907e-07, + "loss": 0.2521, + "num_input_tokens_seen": 56112304, + "step": 36740 + }, + { + "epoch": 113.41112828438949, + "grad_norm": 0.3173169791698456, + "learning_rate": 8.129999730832283e-07, + "loss": 0.2169, + "num_input_tokens_seen": 56119696, + "step": 36745 + }, + { + "epoch": 113.42658423493044, + "grad_norm": 0.4742306172847748, + "learning_rate": 8.105185316531178e-07, + "loss": 0.233, + "num_input_tokens_seen": 56127120, + "step": 36750 + }, + { + "epoch": 113.44204018547141, + "grad_norm": 0.4170528054237366, + "learning_rate": 8.08040820545039e-07, + "loss": 0.2774, + "num_input_tokens_seen": 56134640, + "step": 36755 + }, + { + "epoch": 113.45749613601237, + "grad_norm": 0.5063856244087219, + "learning_rate": 8.055668401410782e-07, + "loss": 0.2004, + "num_input_tokens_seen": 56142448, + "step": 36760 + }, + { + "epoch": 113.47295208655332, + "grad_norm": 0.34640970826148987, + "learning_rate": 8.030965908227578e-07, + "loss": 0.2242, + "num_input_tokens_seen": 56149776, + "step": 36765 + }, + { + "epoch": 113.48840803709429, + "grad_norm": 0.6271787285804749, + "learning_rate": 8.006300729710203e-07, + "loss": 0.2766, + "num_input_tokens_seen": 56157552, + "step": 36770 + }, + { + "epoch": 113.50386398763524, + "grad_norm": 0.776226818561554, + "learning_rate": 7.981672869662337e-07, + "loss": 0.2134, + "num_input_tokens_seen": 56165040, + "step": 36775 + }, + { + "epoch": 113.5193199381762, + "grad_norm": 0.35839229822158813, + "learning_rate": 7.957082331881888e-07, + "loss": 0.3022, + "num_input_tokens_seen": 56172208, + "step": 36780 + }, + { + "epoch": 113.53477588871715, + "grad_norm": 0.6589884757995605, + "learning_rate": 7.932529120161069e-07, + "loss": 0.2249, + "num_input_tokens_seen": 56179600, + "step": 36785 + }, + { + "epoch": 113.55023183925812, + "grad_norm": 0.44825151562690735, + "learning_rate": 7.908013238286243e-07, + "loss": 0.2541, + "num_input_tokens_seen": 56187376, + "step": 36790 + }, + { + "epoch": 113.56568778979907, + "grad_norm": 0.5263135433197021, + "learning_rate": 7.883534690038136e-07, + "loss": 0.2972, + "num_input_tokens_seen": 56195056, + "step": 36795 + }, + { + "epoch": 113.58114374034002, + "grad_norm": 0.46969282627105713, + "learning_rate": 7.859093479191559e-07, + "loss": 0.257, + "num_input_tokens_seen": 56202288, + "step": 36800 + }, + { + "epoch": 113.58114374034002, + "eval_loss": 0.30247724056243896, + "eval_runtime": 6.2983, + "eval_samples_per_second": 91.294, + "eval_steps_per_second": 22.863, + "num_input_tokens_seen": 56202288, + "step": 36800 + }, + { + "epoch": 113.59659969088099, + "grad_norm": 0.9705783128738403, + "learning_rate": 7.834689609515722e-07, + "loss": 0.2189, + "num_input_tokens_seen": 56210320, + "step": 36805 + }, + { + "epoch": 113.61205564142195, + "grad_norm": 0.4855603277683258, + "learning_rate": 7.810323084774002e-07, + "loss": 0.2203, + "num_input_tokens_seen": 56217840, + "step": 36810 + }, + { + "epoch": 113.6275115919629, + "grad_norm": 0.370866596698761, + "learning_rate": 7.785993908723976e-07, + "loss": 0.2229, + "num_input_tokens_seen": 56225200, + "step": 36815 + }, + { + "epoch": 113.64296754250387, + "grad_norm": 0.6643619537353516, + "learning_rate": 7.761702085117534e-07, + "loss": 0.2997, + "num_input_tokens_seen": 56233136, + "step": 36820 + }, + { + "epoch": 113.65842349304482, + "grad_norm": 0.537625253200531, + "learning_rate": 7.737447617700844e-07, + "loss": 0.2303, + "num_input_tokens_seen": 56240592, + "step": 36825 + }, + { + "epoch": 113.67387944358578, + "grad_norm": 0.38152143359184265, + "learning_rate": 7.713230510214136e-07, + "loss": 0.2321, + "num_input_tokens_seen": 56247728, + "step": 36830 + }, + { + "epoch": 113.68933539412674, + "grad_norm": 0.42288559675216675, + "learning_rate": 7.689050766392092e-07, + "loss": 0.2931, + "num_input_tokens_seen": 56255632, + "step": 36835 + }, + { + "epoch": 113.7047913446677, + "grad_norm": 0.4697713553905487, + "learning_rate": 7.664908389963477e-07, + "loss": 0.2689, + "num_input_tokens_seen": 56263344, + "step": 36840 + }, + { + "epoch": 113.72024729520865, + "grad_norm": 0.5519371628761292, + "learning_rate": 7.64080338465134e-07, + "loss": 0.198, + "num_input_tokens_seen": 56271600, + "step": 36845 + }, + { + "epoch": 113.73570324574962, + "grad_norm": 0.42223647236824036, + "learning_rate": 7.616735754173043e-07, + "loss": 0.21, + "num_input_tokens_seen": 56279312, + "step": 36850 + }, + { + "epoch": 113.75115919629057, + "grad_norm": 0.7550703287124634, + "learning_rate": 7.592705502240005e-07, + "loss": 0.2795, + "num_input_tokens_seen": 56286544, + "step": 36855 + }, + { + "epoch": 113.76661514683153, + "grad_norm": 0.34448885917663574, + "learning_rate": 7.568712632558095e-07, + "loss": 0.2067, + "num_input_tokens_seen": 56294864, + "step": 36860 + }, + { + "epoch": 113.7820710973725, + "grad_norm": 0.6282783150672913, + "learning_rate": 7.544757148827297e-07, + "loss": 0.2429, + "num_input_tokens_seen": 56302928, + "step": 36865 + }, + { + "epoch": 113.79752704791345, + "grad_norm": 0.3367539048194885, + "learning_rate": 7.520839054741797e-07, + "loss": 0.2603, + "num_input_tokens_seen": 56310288, + "step": 36870 + }, + { + "epoch": 113.8129829984544, + "grad_norm": 0.3815407156944275, + "learning_rate": 7.496958353990113e-07, + "loss": 0.2884, + "num_input_tokens_seen": 56317776, + "step": 36875 + }, + { + "epoch": 113.82843894899537, + "grad_norm": 0.34249091148376465, + "learning_rate": 7.473115050254941e-07, + "loss": 0.2238, + "num_input_tokens_seen": 56325296, + "step": 36880 + }, + { + "epoch": 113.84389489953632, + "grad_norm": 0.4012536406517029, + "learning_rate": 7.449309147213173e-07, + "loss": 0.2176, + "num_input_tokens_seen": 56333136, + "step": 36885 + }, + { + "epoch": 113.85935085007728, + "grad_norm": 0.9505470395088196, + "learning_rate": 7.425540648536067e-07, + "loss": 0.2735, + "num_input_tokens_seen": 56340624, + "step": 36890 + }, + { + "epoch": 113.87480680061825, + "grad_norm": 0.44123178720474243, + "learning_rate": 7.40180955788894e-07, + "loss": 0.2027, + "num_input_tokens_seen": 56348048, + "step": 36895 + }, + { + "epoch": 113.8902627511592, + "grad_norm": 0.691315233707428, + "learning_rate": 7.378115878931474e-07, + "loss": 0.228, + "num_input_tokens_seen": 56355888, + "step": 36900 + }, + { + "epoch": 113.90571870170015, + "grad_norm": 0.4086824357509613, + "learning_rate": 7.354459615317527e-07, + "loss": 0.2037, + "num_input_tokens_seen": 56363536, + "step": 36905 + }, + { + "epoch": 113.9211746522411, + "grad_norm": 0.45180806517601013, + "learning_rate": 7.33084077069518e-07, + "loss": 0.2347, + "num_input_tokens_seen": 56371280, + "step": 36910 + }, + { + "epoch": 113.93663060278207, + "grad_norm": 0.8898910284042358, + "learning_rate": 7.307259348706768e-07, + "loss": 0.2646, + "num_input_tokens_seen": 56378992, + "step": 36915 + }, + { + "epoch": 113.95208655332303, + "grad_norm": 0.5296764969825745, + "learning_rate": 7.283715352988801e-07, + "loss": 0.2598, + "num_input_tokens_seen": 56386896, + "step": 36920 + }, + { + "epoch": 113.96754250386398, + "grad_norm": 0.6439787745475769, + "learning_rate": 7.260208787172068e-07, + "loss": 0.2719, + "num_input_tokens_seen": 56394576, + "step": 36925 + }, + { + "epoch": 113.98299845440495, + "grad_norm": 0.5206494331359863, + "learning_rate": 7.23673965488167e-07, + "loss": 0.2912, + "num_input_tokens_seen": 56402288, + "step": 36930 + }, + { + "epoch": 113.9984544049459, + "grad_norm": 0.4268236756324768, + "learning_rate": 7.213307959736709e-07, + "loss": 0.242, + "num_input_tokens_seen": 56410160, + "step": 36935 + }, + { + "epoch": 114.01236476043276, + "grad_norm": 0.6134166717529297, + "learning_rate": 7.189913705350715e-07, + "loss": 0.2259, + "num_input_tokens_seen": 56416656, + "step": 36940 + }, + { + "epoch": 114.02782071097373, + "grad_norm": 0.40818750858306885, + "learning_rate": 7.166556895331411e-07, + "loss": 0.3211, + "num_input_tokens_seen": 56424592, + "step": 36945 + }, + { + "epoch": 114.04327666151468, + "grad_norm": 0.6188277006149292, + "learning_rate": 7.143237533280639e-07, + "loss": 0.2691, + "num_input_tokens_seen": 56432880, + "step": 36950 + }, + { + "epoch": 114.05873261205564, + "grad_norm": 0.39263153076171875, + "learning_rate": 7.119955622794578e-07, + "loss": 0.2255, + "num_input_tokens_seen": 56440560, + "step": 36955 + }, + { + "epoch": 114.0741885625966, + "grad_norm": 0.3839220106601715, + "learning_rate": 7.096711167463577e-07, + "loss": 0.1926, + "num_input_tokens_seen": 56448048, + "step": 36960 + }, + { + "epoch": 114.08964451313756, + "grad_norm": 0.3376341164112091, + "learning_rate": 7.073504170872213e-07, + "loss": 0.2301, + "num_input_tokens_seen": 56456176, + "step": 36965 + }, + { + "epoch": 114.10510046367851, + "grad_norm": 0.7428810596466064, + "learning_rate": 7.05033463659932e-07, + "loss": 0.2498, + "num_input_tokens_seen": 56463728, + "step": 36970 + }, + { + "epoch": 114.12055641421948, + "grad_norm": 0.5082917809486389, + "learning_rate": 7.027202568217928e-07, + "loss": 0.2323, + "num_input_tokens_seen": 56471408, + "step": 36975 + }, + { + "epoch": 114.13601236476043, + "grad_norm": 0.8530029654502869, + "learning_rate": 7.004107969295293e-07, + "loss": 0.2433, + "num_input_tokens_seen": 56479280, + "step": 36980 + }, + { + "epoch": 114.15146831530139, + "grad_norm": 0.4670429527759552, + "learning_rate": 6.9810508433929e-07, + "loss": 0.2412, + "num_input_tokens_seen": 56486896, + "step": 36985 + }, + { + "epoch": 114.16692426584235, + "grad_norm": 0.6296567320823669, + "learning_rate": 6.958031194066406e-07, + "loss": 0.2496, + "num_input_tokens_seen": 56494448, + "step": 36990 + }, + { + "epoch": 114.18238021638331, + "grad_norm": 0.5881506204605103, + "learning_rate": 6.935049024865776e-07, + "loss": 0.2198, + "num_input_tokens_seen": 56502288, + "step": 36995 + }, + { + "epoch": 114.19783616692426, + "grad_norm": 0.3466840982437134, + "learning_rate": 6.912104339335118e-07, + "loss": 0.2189, + "num_input_tokens_seen": 56510384, + "step": 37000 + }, + { + "epoch": 114.19783616692426, + "eval_loss": 0.3018469214439392, + "eval_runtime": 6.3115, + "eval_samples_per_second": 91.103, + "eval_steps_per_second": 22.815, + "num_input_tokens_seen": 56510384, + "step": 37000 + }, + { + "epoch": 114.21329211746523, + "grad_norm": 0.539428174495697, + "learning_rate": 6.889197141012799e-07, + "loss": 0.2134, + "num_input_tokens_seen": 56518480, + "step": 37005 + }, + { + "epoch": 114.22874806800618, + "grad_norm": 0.4164327085018158, + "learning_rate": 6.866327433431435e-07, + "loss": 0.221, + "num_input_tokens_seen": 56525744, + "step": 37010 + }, + { + "epoch": 114.24420401854714, + "grad_norm": 0.5981317758560181, + "learning_rate": 6.843495220117735e-07, + "loss": 0.2349, + "num_input_tokens_seen": 56533168, + "step": 37015 + }, + { + "epoch": 114.2596599690881, + "grad_norm": 0.32481706142425537, + "learning_rate": 6.820700504592798e-07, + "loss": 0.2804, + "num_input_tokens_seen": 56540944, + "step": 37020 + }, + { + "epoch": 114.27511591962906, + "grad_norm": 0.401551216840744, + "learning_rate": 6.797943290371839e-07, + "loss": 0.2416, + "num_input_tokens_seen": 56548912, + "step": 37025 + }, + { + "epoch": 114.29057187017001, + "grad_norm": 0.4942931532859802, + "learning_rate": 6.775223580964274e-07, + "loss": 0.2848, + "num_input_tokens_seen": 56555984, + "step": 37030 + }, + { + "epoch": 114.30602782071098, + "grad_norm": 0.6566072702407837, + "learning_rate": 6.7525413798738e-07, + "loss": 0.2413, + "num_input_tokens_seen": 56563632, + "step": 37035 + }, + { + "epoch": 114.32148377125193, + "grad_norm": 0.3580455183982849, + "learning_rate": 6.729896690598259e-07, + "loss": 0.2139, + "num_input_tokens_seen": 56571152, + "step": 37040 + }, + { + "epoch": 114.33693972179289, + "grad_norm": 0.6109552383422852, + "learning_rate": 6.707289516629772e-07, + "loss": 0.2225, + "num_input_tokens_seen": 56578512, + "step": 37045 + }, + { + "epoch": 114.35239567233384, + "grad_norm": 0.7465476393699646, + "learning_rate": 6.684719861454692e-07, + "loss": 0.2235, + "num_input_tokens_seen": 56585872, + "step": 37050 + }, + { + "epoch": 114.36785162287481, + "grad_norm": 0.5213819146156311, + "learning_rate": 6.662187728553481e-07, + "loss": 0.2544, + "num_input_tokens_seen": 56593712, + "step": 37055 + }, + { + "epoch": 114.38330757341576, + "grad_norm": 0.450920969247818, + "learning_rate": 6.639693121400892e-07, + "loss": 0.2292, + "num_input_tokens_seen": 56601360, + "step": 37060 + }, + { + "epoch": 114.39876352395672, + "grad_norm": 0.6915980577468872, + "learning_rate": 6.617236043465868e-07, + "loss": 0.2666, + "num_input_tokens_seen": 56608976, + "step": 37065 + }, + { + "epoch": 114.41421947449768, + "grad_norm": 0.6488029956817627, + "learning_rate": 6.594816498211587e-07, + "loss": 0.2877, + "num_input_tokens_seen": 56616592, + "step": 37070 + }, + { + "epoch": 114.42967542503864, + "grad_norm": 0.5578147172927856, + "learning_rate": 6.572434489095447e-07, + "loss": 0.2374, + "num_input_tokens_seen": 56624240, + "step": 37075 + }, + { + "epoch": 114.44513137557959, + "grad_norm": 0.6608492732048035, + "learning_rate": 6.550090019568994e-07, + "loss": 0.2648, + "num_input_tokens_seen": 56632048, + "step": 37080 + }, + { + "epoch": 114.46058732612056, + "grad_norm": 0.765672504901886, + "learning_rate": 6.527783093078027e-07, + "loss": 0.2098, + "num_input_tokens_seen": 56639664, + "step": 37085 + }, + { + "epoch": 114.47604327666151, + "grad_norm": 0.6060658693313599, + "learning_rate": 6.5055137130626e-07, + "loss": 0.3175, + "num_input_tokens_seen": 56646992, + "step": 37090 + }, + { + "epoch": 114.49149922720247, + "grad_norm": 0.7825860381126404, + "learning_rate": 6.483281882956854e-07, + "loss": 0.2369, + "num_input_tokens_seen": 56654576, + "step": 37095 + }, + { + "epoch": 114.50695517774344, + "grad_norm": 0.658376932144165, + "learning_rate": 6.461087606189298e-07, + "loss": 0.2414, + "num_input_tokens_seen": 56662320, + "step": 37100 + }, + { + "epoch": 114.52241112828439, + "grad_norm": 0.4526894688606262, + "learning_rate": 6.438930886182554e-07, + "loss": 0.2634, + "num_input_tokens_seen": 56669840, + "step": 37105 + }, + { + "epoch": 114.53786707882534, + "grad_norm": 0.764064371585846, + "learning_rate": 6.416811726353417e-07, + "loss": 0.2937, + "num_input_tokens_seen": 56678160, + "step": 37110 + }, + { + "epoch": 114.55332302936631, + "grad_norm": 0.565226137638092, + "learning_rate": 6.394730130112991e-07, + "loss": 0.2323, + "num_input_tokens_seen": 56685616, + "step": 37115 + }, + { + "epoch": 114.56877897990726, + "grad_norm": 0.38851994276046753, + "learning_rate": 6.372686100866471e-07, + "loss": 0.2474, + "num_input_tokens_seen": 56693360, + "step": 37120 + }, + { + "epoch": 114.58423493044822, + "grad_norm": 0.5154475569725037, + "learning_rate": 6.350679642013413e-07, + "loss": 0.1882, + "num_input_tokens_seen": 56700592, + "step": 37125 + }, + { + "epoch": 114.59969088098919, + "grad_norm": 0.7108444571495056, + "learning_rate": 6.328710756947437e-07, + "loss": 0.2933, + "num_input_tokens_seen": 56709168, + "step": 37130 + }, + { + "epoch": 114.61514683153014, + "grad_norm": 0.522663414478302, + "learning_rate": 6.306779449056416e-07, + "loss": 0.2303, + "num_input_tokens_seen": 56716464, + "step": 37135 + }, + { + "epoch": 114.6306027820711, + "grad_norm": 0.4314585328102112, + "learning_rate": 6.284885721722422e-07, + "loss": 0.241, + "num_input_tokens_seen": 56723952, + "step": 37140 + }, + { + "epoch": 114.64605873261206, + "grad_norm": 0.3201563358306885, + "learning_rate": 6.26302957832181e-07, + "loss": 0.2403, + "num_input_tokens_seen": 56731920, + "step": 37145 + }, + { + "epoch": 114.66151468315302, + "grad_norm": 0.38984382152557373, + "learning_rate": 6.241211022224997e-07, + "loss": 0.1954, + "num_input_tokens_seen": 56739760, + "step": 37150 + }, + { + "epoch": 114.67697063369397, + "grad_norm": 0.2969643175601959, + "learning_rate": 6.219430056796732e-07, + "loss": 0.2783, + "num_input_tokens_seen": 56747280, + "step": 37155 + }, + { + "epoch": 114.69242658423494, + "grad_norm": 0.9239513874053955, + "learning_rate": 6.19768668539586e-07, + "loss": 0.2983, + "num_input_tokens_seen": 56754896, + "step": 37160 + }, + { + "epoch": 114.70788253477589, + "grad_norm": 0.6190066337585449, + "learning_rate": 6.175980911375528e-07, + "loss": 0.2803, + "num_input_tokens_seen": 56762768, + "step": 37165 + }, + { + "epoch": 114.72333848531684, + "grad_norm": 0.5704757571220398, + "learning_rate": 6.154312738083034e-07, + "loss": 0.2054, + "num_input_tokens_seen": 56770672, + "step": 37170 + }, + { + "epoch": 114.7387944358578, + "grad_norm": 0.466804563999176, + "learning_rate": 6.132682168859843e-07, + "loss": 0.1994, + "num_input_tokens_seen": 56778128, + "step": 37175 + }, + { + "epoch": 114.75425038639877, + "grad_norm": 0.36224862933158875, + "learning_rate": 6.111089207041704e-07, + "loss": 0.2663, + "num_input_tokens_seen": 56786000, + "step": 37180 + }, + { + "epoch": 114.76970633693972, + "grad_norm": 0.3639911115169525, + "learning_rate": 6.089533855958507e-07, + "loss": 0.2554, + "num_input_tokens_seen": 56793936, + "step": 37185 + }, + { + "epoch": 114.78516228748067, + "grad_norm": 0.7723461389541626, + "learning_rate": 6.068016118934372e-07, + "loss": 0.2693, + "num_input_tokens_seen": 56802224, + "step": 37190 + }, + { + "epoch": 114.80061823802164, + "grad_norm": 0.39619123935699463, + "learning_rate": 6.04653599928759e-07, + "loss": 0.2683, + "num_input_tokens_seen": 56809520, + "step": 37195 + }, + { + "epoch": 114.8160741885626, + "grad_norm": 0.6134465336799622, + "learning_rate": 6.025093500330675e-07, + "loss": 0.2521, + "num_input_tokens_seen": 56816880, + "step": 37200 + }, + { + "epoch": 114.8160741885626, + "eval_loss": 0.30109551548957825, + "eval_runtime": 6.2774, + "eval_samples_per_second": 91.598, + "eval_steps_per_second": 22.939, + "num_input_tokens_seen": 56816880, + "step": 37200 + }, + { + "epoch": 114.83153013910355, + "grad_norm": 0.5411117076873779, + "learning_rate": 6.003688625370291e-07, + "loss": 0.209, + "num_input_tokens_seen": 56824528, + "step": 37205 + }, + { + "epoch": 114.84698608964452, + "grad_norm": 0.4167042076587677, + "learning_rate": 5.982321377707406e-07, + "loss": 0.2825, + "num_input_tokens_seen": 56831920, + "step": 37210 + }, + { + "epoch": 114.86244204018547, + "grad_norm": 0.32103338837623596, + "learning_rate": 5.96099176063708e-07, + "loss": 0.2636, + "num_input_tokens_seen": 56839472, + "step": 37215 + }, + { + "epoch": 114.87789799072642, + "grad_norm": 0.7828036546707153, + "learning_rate": 5.93969977744857e-07, + "loss": 0.2009, + "num_input_tokens_seen": 56846832, + "step": 37220 + }, + { + "epoch": 114.89335394126739, + "grad_norm": 0.7840213179588318, + "learning_rate": 5.918445431425445e-07, + "loss": 0.2952, + "num_input_tokens_seen": 56854576, + "step": 37225 + }, + { + "epoch": 114.90880989180835, + "grad_norm": 0.3769976496696472, + "learning_rate": 5.897228725845333e-07, + "loss": 0.2265, + "num_input_tokens_seen": 56861744, + "step": 37230 + }, + { + "epoch": 114.9242658423493, + "grad_norm": 0.4419137239456177, + "learning_rate": 5.876049663980171e-07, + "loss": 0.2805, + "num_input_tokens_seen": 56869296, + "step": 37235 + }, + { + "epoch": 114.93972179289027, + "grad_norm": 0.36195555329322815, + "learning_rate": 5.854908249095959e-07, + "loss": 0.2417, + "num_input_tokens_seen": 56876432, + "step": 37240 + }, + { + "epoch": 114.95517774343122, + "grad_norm": 0.7243116497993469, + "learning_rate": 5.833804484453031e-07, + "loss": 0.2716, + "num_input_tokens_seen": 56883632, + "step": 37245 + }, + { + "epoch": 114.97063369397218, + "grad_norm": 0.550279974937439, + "learning_rate": 5.81273837330587e-07, + "loss": 0.276, + "num_input_tokens_seen": 56890736, + "step": 37250 + }, + { + "epoch": 114.98608964451314, + "grad_norm": 0.3922118544578552, + "learning_rate": 5.791709918903071e-07, + "loss": 0.2233, + "num_input_tokens_seen": 56898448, + "step": 37255 + }, + { + "epoch": 115.0, + "grad_norm": 0.5996485352516174, + "learning_rate": 5.770719124487483e-07, + "loss": 0.205, + "num_input_tokens_seen": 56904992, + "step": 37260 + }, + { + "epoch": 115.01545595054095, + "grad_norm": 0.4607347548007965, + "learning_rate": 5.749765993296241e-07, + "loss": 0.2499, + "num_input_tokens_seen": 56912704, + "step": 37265 + }, + { + "epoch": 115.03091190108192, + "grad_norm": 0.434541255235672, + "learning_rate": 5.728850528560509e-07, + "loss": 0.2315, + "num_input_tokens_seen": 56920192, + "step": 37270 + }, + { + "epoch": 115.04636785162288, + "grad_norm": 0.4030480980873108, + "learning_rate": 5.707972733505707e-07, + "loss": 0.2124, + "num_input_tokens_seen": 56927520, + "step": 37275 + }, + { + "epoch": 115.06182380216383, + "grad_norm": 0.7208239436149597, + "learning_rate": 5.687132611351509e-07, + "loss": 0.2257, + "num_input_tokens_seen": 56935040, + "step": 37280 + }, + { + "epoch": 115.0772797527048, + "grad_norm": 0.6315537095069885, + "learning_rate": 5.666330165311651e-07, + "loss": 0.2112, + "num_input_tokens_seen": 56942816, + "step": 37285 + }, + { + "epoch": 115.09273570324575, + "grad_norm": 0.4234476685523987, + "learning_rate": 5.645565398594204e-07, + "loss": 0.2309, + "num_input_tokens_seen": 56950240, + "step": 37290 + }, + { + "epoch": 115.1081916537867, + "grad_norm": 0.5307192802429199, + "learning_rate": 5.624838314401304e-07, + "loss": 0.2338, + "num_input_tokens_seen": 56958080, + "step": 37295 + }, + { + "epoch": 115.12364760432767, + "grad_norm": 0.5979578495025635, + "learning_rate": 5.604148915929336e-07, + "loss": 0.316, + "num_input_tokens_seen": 56965760, + "step": 37300 + }, + { + "epoch": 115.13910355486863, + "grad_norm": 0.4578627347946167, + "learning_rate": 5.583497206368887e-07, + "loss": 0.2845, + "num_input_tokens_seen": 56973440, + "step": 37305 + }, + { + "epoch": 115.15455950540958, + "grad_norm": 0.35144802927970886, + "learning_rate": 5.562883188904688e-07, + "loss": 0.2097, + "num_input_tokens_seen": 56981824, + "step": 37310 + }, + { + "epoch": 115.17001545595055, + "grad_norm": 0.6471918225288391, + "learning_rate": 5.542306866715724e-07, + "loss": 0.2725, + "num_input_tokens_seen": 56989376, + "step": 37315 + }, + { + "epoch": 115.1854714064915, + "grad_norm": 0.5035140514373779, + "learning_rate": 5.52176824297504e-07, + "loss": 0.2352, + "num_input_tokens_seen": 56996928, + "step": 37320 + }, + { + "epoch": 115.20092735703246, + "grad_norm": 0.708209216594696, + "learning_rate": 5.501267320850018e-07, + "loss": 0.2518, + "num_input_tokens_seen": 57004192, + "step": 37325 + }, + { + "epoch": 115.21638330757341, + "grad_norm": 0.5602384805679321, + "learning_rate": 5.480804103502157e-07, + "loss": 0.2191, + "num_input_tokens_seen": 57011584, + "step": 37330 + }, + { + "epoch": 115.23183925811438, + "grad_norm": 0.8885470628738403, + "learning_rate": 5.460378594087101e-07, + "loss": 0.2725, + "num_input_tokens_seen": 57019104, + "step": 37335 + }, + { + "epoch": 115.24729520865533, + "grad_norm": 0.7856735587120056, + "learning_rate": 5.439990795754773e-07, + "loss": 0.2536, + "num_input_tokens_seen": 57026272, + "step": 37340 + }, + { + "epoch": 115.26275115919628, + "grad_norm": 0.6375582814216614, + "learning_rate": 5.419640711649188e-07, + "loss": 0.2664, + "num_input_tokens_seen": 57034208, + "step": 37345 + }, + { + "epoch": 115.27820710973725, + "grad_norm": 0.6176665425300598, + "learning_rate": 5.399328344908583e-07, + "loss": 0.2466, + "num_input_tokens_seen": 57041536, + "step": 37350 + }, + { + "epoch": 115.2936630602782, + "grad_norm": 0.4313599169254303, + "learning_rate": 5.379053698665399e-07, + "loss": 0.2117, + "num_input_tokens_seen": 57049248, + "step": 37355 + }, + { + "epoch": 115.30911901081916, + "grad_norm": 0.48654037714004517, + "learning_rate": 5.358816776046216e-07, + "loss": 0.2403, + "num_input_tokens_seen": 57056928, + "step": 37360 + }, + { + "epoch": 115.32457496136013, + "grad_norm": 0.5763363242149353, + "learning_rate": 5.338617580171817e-07, + "loss": 0.2711, + "num_input_tokens_seen": 57064544, + "step": 37365 + }, + { + "epoch": 115.34003091190108, + "grad_norm": 0.42053815722465515, + "learning_rate": 5.318456114157239e-07, + "loss": 0.2136, + "num_input_tokens_seen": 57072288, + "step": 37370 + }, + { + "epoch": 115.35548686244204, + "grad_norm": 0.6358646750450134, + "learning_rate": 5.298332381111576e-07, + "loss": 0.2708, + "num_input_tokens_seen": 57080320, + "step": 37375 + }, + { + "epoch": 115.370942812983, + "grad_norm": 0.5779606103897095, + "learning_rate": 5.27824638413818e-07, + "loss": 0.2647, + "num_input_tokens_seen": 57088544, + "step": 37380 + }, + { + "epoch": 115.38639876352396, + "grad_norm": 0.4785417914390564, + "learning_rate": 5.258198126334546e-07, + "loss": 0.3174, + "num_input_tokens_seen": 57096256, + "step": 37385 + }, + { + "epoch": 115.40185471406491, + "grad_norm": 0.6783437132835388, + "learning_rate": 5.238187610792367e-07, + "loss": 0.2127, + "num_input_tokens_seen": 57104096, + "step": 37390 + }, + { + "epoch": 115.41731066460588, + "grad_norm": 0.4164530634880066, + "learning_rate": 5.218214840597563e-07, + "loss": 0.2275, + "num_input_tokens_seen": 57111296, + "step": 37395 + }, + { + "epoch": 115.43276661514683, + "grad_norm": 0.5226666927337646, + "learning_rate": 5.198279818830115e-07, + "loss": 0.2003, + "num_input_tokens_seen": 57119232, + "step": 37400 + }, + { + "epoch": 115.43276661514683, + "eval_loss": 0.3017193377017975, + "eval_runtime": 6.3234, + "eval_samples_per_second": 90.933, + "eval_steps_per_second": 22.773, + "num_input_tokens_seen": 57119232, + "step": 37400 + }, + { + "epoch": 115.44822256568779, + "grad_norm": 0.3711816072463989, + "learning_rate": 5.178382548564287e-07, + "loss": 0.2519, + "num_input_tokens_seen": 57127040, + "step": 37405 + }, + { + "epoch": 115.46367851622875, + "grad_norm": 0.6482442021369934, + "learning_rate": 5.15852303286854e-07, + "loss": 0.2253, + "num_input_tokens_seen": 57134176, + "step": 37410 + }, + { + "epoch": 115.47913446676971, + "grad_norm": 0.4476620852947235, + "learning_rate": 5.138701274805396e-07, + "loss": 0.2395, + "num_input_tokens_seen": 57142112, + "step": 37415 + }, + { + "epoch": 115.49459041731066, + "grad_norm": 0.48550617694854736, + "learning_rate": 5.118917277431606e-07, + "loss": 0.2484, + "num_input_tokens_seen": 57149248, + "step": 37420 + }, + { + "epoch": 115.51004636785163, + "grad_norm": 0.37324783205986023, + "learning_rate": 5.099171043798145e-07, + "loss": 0.2115, + "num_input_tokens_seen": 57156480, + "step": 37425 + }, + { + "epoch": 115.52550231839258, + "grad_norm": 0.4330366849899292, + "learning_rate": 5.079462576950133e-07, + "loss": 0.3043, + "num_input_tokens_seen": 57164000, + "step": 37430 + }, + { + "epoch": 115.54095826893354, + "grad_norm": 0.45277515053749084, + "learning_rate": 5.059791879926862e-07, + "loss": 0.2207, + "num_input_tokens_seen": 57171872, + "step": 37435 + }, + { + "epoch": 115.5564142194745, + "grad_norm": 0.3599475920200348, + "learning_rate": 5.040158955761793e-07, + "loss": 0.1931, + "num_input_tokens_seen": 57179584, + "step": 37440 + }, + { + "epoch": 115.57187017001546, + "grad_norm": 0.6389016509056091, + "learning_rate": 5.020563807482559e-07, + "loss": 0.2436, + "num_input_tokens_seen": 57186784, + "step": 37445 + }, + { + "epoch": 115.58732612055641, + "grad_norm": 0.6697651743888855, + "learning_rate": 5.001006438110995e-07, + "loss": 0.2774, + "num_input_tokens_seen": 57194112, + "step": 37450 + }, + { + "epoch": 115.60278207109737, + "grad_norm": 0.3746280074119568, + "learning_rate": 4.981486850663075e-07, + "loss": 0.2291, + "num_input_tokens_seen": 57201888, + "step": 37455 + }, + { + "epoch": 115.61823802163833, + "grad_norm": 0.31241393089294434, + "learning_rate": 4.962005048149005e-07, + "loss": 0.2069, + "num_input_tokens_seen": 57209760, + "step": 37460 + }, + { + "epoch": 115.63369397217929, + "grad_norm": 0.5412416458129883, + "learning_rate": 4.942561033573073e-07, + "loss": 0.2833, + "num_input_tokens_seen": 57217184, + "step": 37465 + }, + { + "epoch": 115.64914992272024, + "grad_norm": 0.6836436986923218, + "learning_rate": 4.923154809933827e-07, + "loss": 0.2344, + "num_input_tokens_seen": 57224672, + "step": 37470 + }, + { + "epoch": 115.66460587326121, + "grad_norm": 0.4468829035758972, + "learning_rate": 4.903786380223957e-07, + "loss": 0.1911, + "num_input_tokens_seen": 57232384, + "step": 37475 + }, + { + "epoch": 115.68006182380216, + "grad_norm": 0.40804624557495117, + "learning_rate": 4.884455747430266e-07, + "loss": 0.2264, + "num_input_tokens_seen": 57240128, + "step": 37480 + }, + { + "epoch": 115.69551777434312, + "grad_norm": 0.5134743452072144, + "learning_rate": 4.865162914533816e-07, + "loss": 0.2903, + "num_input_tokens_seen": 57247424, + "step": 37485 + }, + { + "epoch": 115.71097372488408, + "grad_norm": 0.4666871726512909, + "learning_rate": 4.845907884509809e-07, + "loss": 0.2153, + "num_input_tokens_seen": 57254976, + "step": 37490 + }, + { + "epoch": 115.72642967542504, + "grad_norm": 0.5054497718811035, + "learning_rate": 4.82669066032762e-07, + "loss": 0.2594, + "num_input_tokens_seen": 57262560, + "step": 37495 + }, + { + "epoch": 115.74188562596599, + "grad_norm": 0.4934511184692383, + "learning_rate": 4.807511244950768e-07, + "loss": 0.2223, + "num_input_tokens_seen": 57270688, + "step": 37500 + }, + { + "epoch": 115.75734157650696, + "grad_norm": 0.6510342955589294, + "learning_rate": 4.788369641336943e-07, + "loss": 0.2839, + "num_input_tokens_seen": 57278592, + "step": 37505 + }, + { + "epoch": 115.77279752704791, + "grad_norm": 0.6111792325973511, + "learning_rate": 4.769265852438032e-07, + "loss": 0.2433, + "num_input_tokens_seen": 57286464, + "step": 37510 + }, + { + "epoch": 115.78825347758887, + "grad_norm": 0.5850226283073425, + "learning_rate": 4.750199881200124e-07, + "loss": 0.3189, + "num_input_tokens_seen": 57294176, + "step": 37515 + }, + { + "epoch": 115.80370942812984, + "grad_norm": 0.4663666784763336, + "learning_rate": 4.7311717305633664e-07, + "loss": 0.2815, + "num_input_tokens_seen": 57301856, + "step": 37520 + }, + { + "epoch": 115.81916537867079, + "grad_norm": 0.5170837044715881, + "learning_rate": 4.7121814034621623e-07, + "loss": 0.2373, + "num_input_tokens_seen": 57309536, + "step": 37525 + }, + { + "epoch": 115.83462132921174, + "grad_norm": 0.31057921051979065, + "learning_rate": 4.693228902825114e-07, + "loss": 0.2473, + "num_input_tokens_seen": 57317120, + "step": 37530 + }, + { + "epoch": 115.85007727975271, + "grad_norm": 0.4052570164203644, + "learning_rate": 4.6743142315748277e-07, + "loss": 0.2251, + "num_input_tokens_seen": 57324896, + "step": 37535 + }, + { + "epoch": 115.86553323029366, + "grad_norm": 0.6378384232521057, + "learning_rate": 4.655437392628276e-07, + "loss": 0.2396, + "num_input_tokens_seen": 57332800, + "step": 37540 + }, + { + "epoch": 115.88098918083462, + "grad_norm": 0.6842080950737, + "learning_rate": 4.636598388896463e-07, + "loss": 0.3337, + "num_input_tokens_seen": 57340288, + "step": 37545 + }, + { + "epoch": 115.89644513137559, + "grad_norm": 0.5169817805290222, + "learning_rate": 4.6177972232845925e-07, + "loss": 0.2276, + "num_input_tokens_seen": 57348192, + "step": 37550 + }, + { + "epoch": 115.91190108191654, + "grad_norm": 0.6203616857528687, + "learning_rate": 4.5990338986920953e-07, + "loss": 0.2182, + "num_input_tokens_seen": 57355648, + "step": 37555 + }, + { + "epoch": 115.9273570324575, + "grad_norm": 0.5253178477287292, + "learning_rate": 4.5803084180124633e-07, + "loss": 0.2363, + "num_input_tokens_seen": 57363616, + "step": 37560 + }, + { + "epoch": 115.94281298299846, + "grad_norm": 0.7750519514083862, + "learning_rate": 4.561620784133386e-07, + "loss": 0.2447, + "num_input_tokens_seen": 57371424, + "step": 37565 + }, + { + "epoch": 115.95826893353942, + "grad_norm": 0.821246325969696, + "learning_rate": 4.5429709999367796e-07, + "loss": 0.2313, + "num_input_tokens_seen": 57379552, + "step": 37570 + }, + { + "epoch": 115.97372488408037, + "grad_norm": 0.4714328348636627, + "learning_rate": 4.5243590682986223e-07, + "loss": 0.2572, + "num_input_tokens_seen": 57387264, + "step": 37575 + }, + { + "epoch": 115.98918083462132, + "grad_norm": 0.4716985821723938, + "learning_rate": 4.5057849920891735e-07, + "loss": 0.3166, + "num_input_tokens_seen": 57394816, + "step": 37580 + }, + { + "epoch": 116.0030911901082, + "grad_norm": 0.3723185360431671, + "learning_rate": 4.487248774172698e-07, + "loss": 0.2352, + "num_input_tokens_seen": 57401472, + "step": 37585 + }, + { + "epoch": 116.01854714064915, + "grad_norm": 0.43208789825439453, + "learning_rate": 4.4687504174077965e-07, + "loss": 0.2229, + "num_input_tokens_seen": 57409152, + "step": 37590 + }, + { + "epoch": 116.03400309119012, + "grad_norm": 0.4240677058696747, + "learning_rate": 4.450289924647133e-07, + "loss": 0.3489, + "num_input_tokens_seen": 57416960, + "step": 37595 + }, + { + "epoch": 116.04945904173107, + "grad_norm": 0.4977492392063141, + "learning_rate": 4.431867298737513e-07, + "loss": 0.2821, + "num_input_tokens_seen": 57424224, + "step": 37600 + }, + { + "epoch": 116.04945904173107, + "eval_loss": 0.3020728826522827, + "eval_runtime": 6.3246, + "eval_samples_per_second": 90.915, + "eval_steps_per_second": 22.768, + "num_input_tokens_seen": 57424224, + "step": 37600 + }, + { + "epoch": 116.06491499227202, + "grad_norm": 0.5532984137535095, + "learning_rate": 4.41348254251997e-07, + "loss": 0.232, + "num_input_tokens_seen": 57431616, + "step": 37605 + }, + { + "epoch": 116.08037094281298, + "grad_norm": 0.7867820262908936, + "learning_rate": 4.395135658829652e-07, + "loss": 0.346, + "num_input_tokens_seen": 57439904, + "step": 37610 + }, + { + "epoch": 116.09582689335394, + "grad_norm": 0.33760198950767517, + "learning_rate": 4.376826650495852e-07, + "loss": 0.2201, + "num_input_tokens_seen": 57447584, + "step": 37615 + }, + { + "epoch": 116.1112828438949, + "grad_norm": 0.5991175174713135, + "learning_rate": 4.358555520342117e-07, + "loss": 0.2573, + "num_input_tokens_seen": 57455328, + "step": 37620 + }, + { + "epoch": 116.12673879443585, + "grad_norm": 0.49941927194595337, + "learning_rate": 4.3403222711860257e-07, + "loss": 0.1735, + "num_input_tokens_seen": 57462976, + "step": 37625 + }, + { + "epoch": 116.14219474497682, + "grad_norm": 0.5708136558532715, + "learning_rate": 4.3221269058394133e-07, + "loss": 0.2074, + "num_input_tokens_seen": 57471328, + "step": 37630 + }, + { + "epoch": 116.15765069551777, + "grad_norm": 0.41369855403900146, + "learning_rate": 4.303969427108173e-07, + "loss": 0.2655, + "num_input_tokens_seen": 57478624, + "step": 37635 + }, + { + "epoch": 116.17310664605873, + "grad_norm": 0.38501855731010437, + "learning_rate": 4.2858498377924825e-07, + "loss": 0.2114, + "num_input_tokens_seen": 57486304, + "step": 37640 + }, + { + "epoch": 116.1885625965997, + "grad_norm": 0.4179442822933197, + "learning_rate": 4.267768140686579e-07, + "loss": 0.2335, + "num_input_tokens_seen": 57494240, + "step": 37645 + }, + { + "epoch": 116.20401854714065, + "grad_norm": 0.5486367344856262, + "learning_rate": 4.2497243385788975e-07, + "loss": 0.2086, + "num_input_tokens_seen": 57501920, + "step": 37650 + }, + { + "epoch": 116.2194744976816, + "grad_norm": 0.6405705809593201, + "learning_rate": 4.231718434251991e-07, + "loss": 0.2168, + "num_input_tokens_seen": 57509088, + "step": 37655 + }, + { + "epoch": 116.23493044822257, + "grad_norm": 0.5561368465423584, + "learning_rate": 4.213750430482666e-07, + "loss": 0.2072, + "num_input_tokens_seen": 57516800, + "step": 37660 + }, + { + "epoch": 116.25038639876352, + "grad_norm": 0.388594388961792, + "learning_rate": 4.1958203300417054e-07, + "loss": 0.2048, + "num_input_tokens_seen": 57524192, + "step": 37665 + }, + { + "epoch": 116.26584234930448, + "grad_norm": 0.548378050327301, + "learning_rate": 4.177928135694259e-07, + "loss": 0.2249, + "num_input_tokens_seen": 57531680, + "step": 37670 + }, + { + "epoch": 116.28129829984545, + "grad_norm": 0.4539403021335602, + "learning_rate": 4.1600738501994807e-07, + "loss": 0.2588, + "num_input_tokens_seen": 57539488, + "step": 37675 + }, + { + "epoch": 116.2967542503864, + "grad_norm": 1.053584098815918, + "learning_rate": 4.1422574763107237e-07, + "loss": 0.2501, + "num_input_tokens_seen": 57547264, + "step": 37680 + }, + { + "epoch": 116.31221020092735, + "grad_norm": 0.5755655169487, + "learning_rate": 4.124479016775512e-07, + "loss": 0.2317, + "num_input_tokens_seen": 57554944, + "step": 37685 + }, + { + "epoch": 116.32766615146832, + "grad_norm": 0.5061222910881042, + "learning_rate": 4.106738474335514e-07, + "loss": 0.2373, + "num_input_tokens_seen": 57563168, + "step": 37690 + }, + { + "epoch": 116.34312210200927, + "grad_norm": 0.7077041864395142, + "learning_rate": 4.089035851726486e-07, + "loss": 0.293, + "num_input_tokens_seen": 57570752, + "step": 37695 + }, + { + "epoch": 116.35857805255023, + "grad_norm": 0.3673132359981537, + "learning_rate": 4.0713711516784937e-07, + "loss": 0.2476, + "num_input_tokens_seen": 57578624, + "step": 37700 + }, + { + "epoch": 116.3740340030912, + "grad_norm": 0.9663387537002563, + "learning_rate": 4.05374437691558e-07, + "loss": 0.2503, + "num_input_tokens_seen": 57586016, + "step": 37705 + }, + { + "epoch": 116.38948995363215, + "grad_norm": 0.6228790283203125, + "learning_rate": 4.036155530156044e-07, + "loss": 0.2369, + "num_input_tokens_seen": 57593248, + "step": 37710 + }, + { + "epoch": 116.4049459041731, + "grad_norm": 0.8305060267448425, + "learning_rate": 4.018604614112298e-07, + "loss": 0.267, + "num_input_tokens_seen": 57600960, + "step": 37715 + }, + { + "epoch": 116.42040185471407, + "grad_norm": 0.6419792175292969, + "learning_rate": 4.0010916314908996e-07, + "loss": 0.2902, + "num_input_tokens_seen": 57608416, + "step": 37720 + }, + { + "epoch": 116.43585780525503, + "grad_norm": 0.6383503079414368, + "learning_rate": 3.983616584992578e-07, + "loss": 0.3242, + "num_input_tokens_seen": 57616576, + "step": 37725 + }, + { + "epoch": 116.45131375579598, + "grad_norm": 0.4739862382411957, + "learning_rate": 3.9661794773122595e-07, + "loss": 0.2858, + "num_input_tokens_seen": 57625152, + "step": 37730 + }, + { + "epoch": 116.46676970633693, + "grad_norm": 0.31304946541786194, + "learning_rate": 3.9487803111388777e-07, + "loss": 0.2158, + "num_input_tokens_seen": 57632608, + "step": 37735 + }, + { + "epoch": 116.4822256568779, + "grad_norm": 0.5331106781959534, + "learning_rate": 3.9314190891556747e-07, + "loss": 0.2256, + "num_input_tokens_seen": 57640064, + "step": 37740 + }, + { + "epoch": 116.49768160741885, + "grad_norm": 0.527690589427948, + "learning_rate": 3.914095814039925e-07, + "loss": 0.1887, + "num_input_tokens_seen": 57647744, + "step": 37745 + }, + { + "epoch": 116.51313755795981, + "grad_norm": 0.5835441946983337, + "learning_rate": 3.896810488463104e-07, + "loss": 0.2757, + "num_input_tokens_seen": 57655328, + "step": 37750 + }, + { + "epoch": 116.52859350850078, + "grad_norm": 0.49853092432022095, + "learning_rate": 3.8795631150908565e-07, + "loss": 0.3359, + "num_input_tokens_seen": 57662656, + "step": 37755 + }, + { + "epoch": 116.54404945904173, + "grad_norm": 0.4910303056240082, + "learning_rate": 3.862353696582888e-07, + "loss": 0.2451, + "num_input_tokens_seen": 57670112, + "step": 37760 + }, + { + "epoch": 116.55950540958268, + "grad_norm": 0.4900546073913574, + "learning_rate": 3.8451822355931313e-07, + "loss": 0.2393, + "num_input_tokens_seen": 57677152, + "step": 37765 + }, + { + "epoch": 116.57496136012365, + "grad_norm": 0.6717840433120728, + "learning_rate": 3.82804873476969e-07, + "loss": 0.2952, + "num_input_tokens_seen": 57684064, + "step": 37770 + }, + { + "epoch": 116.5904173106646, + "grad_norm": 0.35630422830581665, + "learning_rate": 3.810953196754702e-07, + "loss": 0.2075, + "num_input_tokens_seen": 57691456, + "step": 37775 + }, + { + "epoch": 116.60587326120556, + "grad_norm": 0.4813779294490814, + "learning_rate": 3.793895624184529e-07, + "loss": 0.2072, + "num_input_tokens_seen": 57699296, + "step": 37780 + }, + { + "epoch": 116.62132921174653, + "grad_norm": 0.508958637714386, + "learning_rate": 3.776876019689679e-07, + "loss": 0.2237, + "num_input_tokens_seen": 57706976, + "step": 37785 + }, + { + "epoch": 116.63678516228748, + "grad_norm": 0.542761504650116, + "learning_rate": 3.7598943858947743e-07, + "loss": 0.32, + "num_input_tokens_seen": 57715040, + "step": 37790 + }, + { + "epoch": 116.65224111282843, + "grad_norm": 0.6571284532546997, + "learning_rate": 3.742950725418637e-07, + "loss": 0.3042, + "num_input_tokens_seen": 57722560, + "step": 37795 + }, + { + "epoch": 116.6676970633694, + "grad_norm": 0.725811779499054, + "learning_rate": 3.726045040874093e-07, + "loss": 0.2208, + "num_input_tokens_seen": 57729856, + "step": 37800 + }, + { + "epoch": 116.6676970633694, + "eval_loss": 0.3016337752342224, + "eval_runtime": 6.3259, + "eval_samples_per_second": 90.896, + "eval_steps_per_second": 22.764, + "num_input_tokens_seen": 57729856, + "step": 37800 + }, + { + "epoch": 116.68315301391036, + "grad_norm": 0.6133099794387817, + "learning_rate": 3.709177334868308e-07, + "loss": 0.2845, + "num_input_tokens_seen": 57737440, + "step": 37805 + }, + { + "epoch": 116.69860896445131, + "grad_norm": 0.3324422538280487, + "learning_rate": 3.692347610002478e-07, + "loss": 0.261, + "num_input_tokens_seen": 57745408, + "step": 37810 + }, + { + "epoch": 116.71406491499228, + "grad_norm": 0.3430519998073578, + "learning_rate": 3.675555868871916e-07, + "loss": 0.2339, + "num_input_tokens_seen": 57753408, + "step": 37815 + }, + { + "epoch": 116.72952086553323, + "grad_norm": 0.4077998101711273, + "learning_rate": 3.658802114066162e-07, + "loss": 0.2405, + "num_input_tokens_seen": 57761184, + "step": 37820 + }, + { + "epoch": 116.74497681607419, + "grad_norm": 0.5452770590782166, + "learning_rate": 3.6420863481688437e-07, + "loss": 0.2805, + "num_input_tokens_seen": 57768608, + "step": 37825 + }, + { + "epoch": 116.76043276661515, + "grad_norm": 0.4781465530395508, + "learning_rate": 3.625408573757705e-07, + "loss": 0.2643, + "num_input_tokens_seen": 57776640, + "step": 37830 + }, + { + "epoch": 116.7758887171561, + "grad_norm": 0.6740089058876038, + "learning_rate": 3.608768793404743e-07, + "loss": 0.2307, + "num_input_tokens_seen": 57784320, + "step": 37835 + }, + { + "epoch": 116.79134466769706, + "grad_norm": 0.43655285239219666, + "learning_rate": 3.592167009675934e-07, + "loss": 0.232, + "num_input_tokens_seen": 57791808, + "step": 37840 + }, + { + "epoch": 116.80680061823801, + "grad_norm": 0.637340784072876, + "learning_rate": 3.575603225131563e-07, + "loss": 0.1997, + "num_input_tokens_seen": 57799392, + "step": 37845 + }, + { + "epoch": 116.82225656877898, + "grad_norm": 0.5042158961296082, + "learning_rate": 3.55907744232592e-07, + "loss": 0.231, + "num_input_tokens_seen": 57806656, + "step": 37850 + }, + { + "epoch": 116.83771251931994, + "grad_norm": 0.6087610125541687, + "learning_rate": 3.5425896638075217e-07, + "loss": 0.223, + "num_input_tokens_seen": 57814400, + "step": 37855 + }, + { + "epoch": 116.85316846986089, + "grad_norm": 0.5893296003341675, + "learning_rate": 3.5261398921189736e-07, + "loss": 0.2674, + "num_input_tokens_seen": 57822080, + "step": 37860 + }, + { + "epoch": 116.86862442040186, + "grad_norm": 0.5519049763679504, + "learning_rate": 3.509728129797024e-07, + "loss": 0.2491, + "num_input_tokens_seen": 57829728, + "step": 37865 + }, + { + "epoch": 116.88408037094281, + "grad_norm": 0.6442683935165405, + "learning_rate": 3.4933543793725656e-07, + "loss": 0.2015, + "num_input_tokens_seen": 57837152, + "step": 37870 + }, + { + "epoch": 116.89953632148377, + "grad_norm": 0.6139608025550842, + "learning_rate": 3.4770186433707163e-07, + "loss": 0.2308, + "num_input_tokens_seen": 57844960, + "step": 37875 + }, + { + "epoch": 116.91499227202473, + "grad_norm": 0.4588017761707306, + "learning_rate": 3.4607209243105453e-07, + "loss": 0.22, + "num_input_tokens_seen": 57852352, + "step": 37880 + }, + { + "epoch": 116.93044822256569, + "grad_norm": 0.37847501039505005, + "learning_rate": 3.444461224705431e-07, + "loss": 0.2219, + "num_input_tokens_seen": 57860352, + "step": 37885 + }, + { + "epoch": 116.94590417310664, + "grad_norm": 0.4214146137237549, + "learning_rate": 3.4282395470628116e-07, + "loss": 0.2112, + "num_input_tokens_seen": 57868192, + "step": 37890 + }, + { + "epoch": 116.96136012364761, + "grad_norm": 0.36020010709762573, + "learning_rate": 3.4120558938842417e-07, + "loss": 0.242, + "num_input_tokens_seen": 57875520, + "step": 37895 + }, + { + "epoch": 116.97681607418856, + "grad_norm": 0.6290783286094666, + "learning_rate": 3.395910267665503e-07, + "loss": 0.2237, + "num_input_tokens_seen": 57883200, + "step": 37900 + }, + { + "epoch": 116.99227202472952, + "grad_norm": 0.6189897060394287, + "learning_rate": 3.3798026708964094e-07, + "loss": 0.2568, + "num_input_tokens_seen": 57891168, + "step": 37905 + }, + { + "epoch": 117.00618238021639, + "grad_norm": 0.47673720121383667, + "learning_rate": 3.3637331060609456e-07, + "loss": 0.254, + "num_input_tokens_seen": 57897232, + "step": 37910 + }, + { + "epoch": 117.02163833075734, + "grad_norm": 0.29908889532089233, + "learning_rate": 3.3477015756372966e-07, + "loss": 0.2095, + "num_input_tokens_seen": 57904880, + "step": 37915 + }, + { + "epoch": 117.0370942812983, + "grad_norm": 0.4342035949230194, + "learning_rate": 3.3317080820976785e-07, + "loss": 0.2343, + "num_input_tokens_seen": 57912016, + "step": 37920 + }, + { + "epoch": 117.05255023183926, + "grad_norm": 0.6767832636833191, + "learning_rate": 3.315752627908508e-07, + "loss": 0.2595, + "num_input_tokens_seen": 57919120, + "step": 37925 + }, + { + "epoch": 117.06800618238022, + "grad_norm": 0.47537022829055786, + "learning_rate": 3.299835215530317e-07, + "loss": 0.214, + "num_input_tokens_seen": 57926960, + "step": 37930 + }, + { + "epoch": 117.08346213292117, + "grad_norm": 0.45199304819107056, + "learning_rate": 3.2839558474177245e-07, + "loss": 0.2805, + "num_input_tokens_seen": 57934576, + "step": 37935 + }, + { + "epoch": 117.09891808346214, + "grad_norm": 0.45702996850013733, + "learning_rate": 3.2681145260196056e-07, + "loss": 0.2407, + "num_input_tokens_seen": 57941840, + "step": 37940 + }, + { + "epoch": 117.11437403400309, + "grad_norm": 0.40495893359184265, + "learning_rate": 3.252311253778839e-07, + "loss": 0.3055, + "num_input_tokens_seen": 57949840, + "step": 37945 + }, + { + "epoch": 117.12982998454405, + "grad_norm": 0.5257571935653687, + "learning_rate": 3.2365460331325034e-07, + "loss": 0.2807, + "num_input_tokens_seen": 57957488, + "step": 37950 + }, + { + "epoch": 117.14528593508501, + "grad_norm": 0.8213506937026978, + "learning_rate": 3.2208188665117934e-07, + "loss": 0.3337, + "num_input_tokens_seen": 57965840, + "step": 37955 + }, + { + "epoch": 117.16074188562597, + "grad_norm": 0.43591776490211487, + "learning_rate": 3.205129756342018e-07, + "loss": 0.2764, + "num_input_tokens_seen": 57973424, + "step": 37960 + }, + { + "epoch": 117.17619783616692, + "grad_norm": 0.6215133666992188, + "learning_rate": 3.189478705042659e-07, + "loss": 0.2596, + "num_input_tokens_seen": 57981008, + "step": 37965 + }, + { + "epoch": 117.19165378670789, + "grad_norm": 0.3587261140346527, + "learning_rate": 3.173865715027341e-07, + "loss": 0.2324, + "num_input_tokens_seen": 57989200, + "step": 37970 + }, + { + "epoch": 117.20710973724884, + "grad_norm": 0.5319666862487793, + "learning_rate": 3.158290788703694e-07, + "loss": 0.2372, + "num_input_tokens_seen": 57996944, + "step": 37975 + }, + { + "epoch": 117.2225656877898, + "grad_norm": 0.6012566685676575, + "learning_rate": 3.1427539284736297e-07, + "loss": 0.2369, + "num_input_tokens_seen": 58004464, + "step": 37980 + }, + { + "epoch": 117.23802163833076, + "grad_norm": 1.1100832223892212, + "learning_rate": 3.127255136733093e-07, + "loss": 0.2286, + "num_input_tokens_seen": 58011248, + "step": 37985 + }, + { + "epoch": 117.25347758887172, + "grad_norm": 0.5979735255241394, + "learning_rate": 3.1117944158722544e-07, + "loss": 0.2696, + "num_input_tokens_seen": 58018800, + "step": 37990 + }, + { + "epoch": 117.26893353941267, + "grad_norm": 0.4935362935066223, + "learning_rate": 3.0963717682752635e-07, + "loss": 0.23, + "num_input_tokens_seen": 58027216, + "step": 37995 + }, + { + "epoch": 117.28438948995363, + "grad_norm": 0.4251273572444916, + "learning_rate": 3.080987196320578e-07, + "loss": 0.215, + "num_input_tokens_seen": 58034352, + "step": 38000 + }, + { + "epoch": 117.28438948995363, + "eval_loss": 0.3014582395553589, + "eval_runtime": 6.3263, + "eval_samples_per_second": 90.89, + "eval_steps_per_second": 22.762, + "num_input_tokens_seen": 58034352, + "step": 38000 + }, + { + "epoch": 117.2998454404946, + "grad_norm": 0.6210606694221497, + "learning_rate": 3.065640702380607e-07, + "loss": 0.2509, + "num_input_tokens_seen": 58042064, + "step": 38005 + }, + { + "epoch": 117.31530139103555, + "grad_norm": 0.5362794399261475, + "learning_rate": 3.050332288822011e-07, + "loss": 0.2434, + "num_input_tokens_seen": 58049200, + "step": 38010 + }, + { + "epoch": 117.3307573415765, + "grad_norm": 0.5062545537948608, + "learning_rate": 3.035061958005542e-07, + "loss": 0.2587, + "num_input_tokens_seen": 58057296, + "step": 38015 + }, + { + "epoch": 117.34621329211747, + "grad_norm": 0.2976444661617279, + "learning_rate": 3.019829712286093e-07, + "loss": 0.248, + "num_input_tokens_seen": 58064816, + "step": 38020 + }, + { + "epoch": 117.36166924265842, + "grad_norm": 0.49048376083374023, + "learning_rate": 3.004635554012647e-07, + "loss": 0.2359, + "num_input_tokens_seen": 58072592, + "step": 38025 + }, + { + "epoch": 117.37712519319938, + "grad_norm": 0.6650909185409546, + "learning_rate": 2.9894794855283017e-07, + "loss": 0.2561, + "num_input_tokens_seen": 58080432, + "step": 38030 + }, + { + "epoch": 117.39258114374034, + "grad_norm": 0.47745609283447266, + "learning_rate": 2.9743615091703816e-07, + "loss": 0.2127, + "num_input_tokens_seen": 58087824, + "step": 38035 + }, + { + "epoch": 117.4080370942813, + "grad_norm": 0.4210575222969055, + "learning_rate": 2.959281627270216e-07, + "loss": 0.1971, + "num_input_tokens_seen": 58095632, + "step": 38040 + }, + { + "epoch": 117.42349304482225, + "grad_norm": 0.5074567198753357, + "learning_rate": 2.944239842153362e-07, + "loss": 0.2209, + "num_input_tokens_seen": 58103888, + "step": 38045 + }, + { + "epoch": 117.43894899536322, + "grad_norm": 0.7006087303161621, + "learning_rate": 2.929236156139381e-07, + "loss": 0.3129, + "num_input_tokens_seen": 58111824, + "step": 38050 + }, + { + "epoch": 117.45440494590417, + "grad_norm": 0.4799152910709381, + "learning_rate": 2.9142705715420883e-07, + "loss": 0.247, + "num_input_tokens_seen": 58119536, + "step": 38055 + }, + { + "epoch": 117.46986089644513, + "grad_norm": 0.44368788599967957, + "learning_rate": 2.8993430906693595e-07, + "loss": 0.2375, + "num_input_tokens_seen": 58126992, + "step": 38060 + }, + { + "epoch": 117.4853168469861, + "grad_norm": 0.5173541307449341, + "learning_rate": 2.88445371582316e-07, + "loss": 0.2647, + "num_input_tokens_seen": 58134512, + "step": 38065 + }, + { + "epoch": 117.50077279752705, + "grad_norm": 0.37212246656417847, + "learning_rate": 2.8696024492996796e-07, + "loss": 0.1975, + "num_input_tokens_seen": 58142320, + "step": 38070 + }, + { + "epoch": 117.516228748068, + "grad_norm": 0.5682932734489441, + "learning_rate": 2.854789293389115e-07, + "loss": 0.2481, + "num_input_tokens_seen": 58149648, + "step": 38075 + }, + { + "epoch": 117.53168469860897, + "grad_norm": 0.46399083733558655, + "learning_rate": 2.8400142503758606e-07, + "loss": 0.2873, + "num_input_tokens_seen": 58156784, + "step": 38080 + }, + { + "epoch": 117.54714064914992, + "grad_norm": 0.46413737535476685, + "learning_rate": 2.8252773225384276e-07, + "loss": 0.2162, + "num_input_tokens_seen": 58164272, + "step": 38085 + }, + { + "epoch": 117.56259659969088, + "grad_norm": 0.7709930539131165, + "learning_rate": 2.8105785121494143e-07, + "loss": 0.29, + "num_input_tokens_seen": 58171792, + "step": 38090 + }, + { + "epoch": 117.57805255023185, + "grad_norm": 0.38494470715522766, + "learning_rate": 2.795917821475563e-07, + "loss": 0.2181, + "num_input_tokens_seen": 58179440, + "step": 38095 + }, + { + "epoch": 117.5935085007728, + "grad_norm": 0.6423577666282654, + "learning_rate": 2.78129525277776e-07, + "loss": 0.2183, + "num_input_tokens_seen": 58187152, + "step": 38100 + }, + { + "epoch": 117.60896445131375, + "grad_norm": 0.4591844379901886, + "learning_rate": 2.766710808310952e-07, + "loss": 0.2321, + "num_input_tokens_seen": 58194608, + "step": 38105 + }, + { + "epoch": 117.62442040185472, + "grad_norm": 0.5240879058837891, + "learning_rate": 2.7521644903242827e-07, + "loss": 0.2243, + "num_input_tokens_seen": 58202352, + "step": 38110 + }, + { + "epoch": 117.63987635239567, + "grad_norm": 0.3733628988265991, + "learning_rate": 2.7376563010609593e-07, + "loss": 0.2407, + "num_input_tokens_seen": 58210288, + "step": 38115 + }, + { + "epoch": 117.65533230293663, + "grad_norm": 0.419281005859375, + "learning_rate": 2.72318624275833e-07, + "loss": 0.2249, + "num_input_tokens_seen": 58218320, + "step": 38120 + }, + { + "epoch": 117.67078825347758, + "grad_norm": 0.4630796015262604, + "learning_rate": 2.7087543176478324e-07, + "loss": 0.2317, + "num_input_tokens_seen": 58225648, + "step": 38125 + }, + { + "epoch": 117.68624420401855, + "grad_norm": 0.38213053345680237, + "learning_rate": 2.694360527955103e-07, + "loss": 0.236, + "num_input_tokens_seen": 58232944, + "step": 38130 + }, + { + "epoch": 117.7017001545595, + "grad_norm": 0.44798797369003296, + "learning_rate": 2.680004875899811e-07, + "loss": 0.2069, + "num_input_tokens_seen": 58240432, + "step": 38135 + }, + { + "epoch": 117.71715610510046, + "grad_norm": 0.5595986247062683, + "learning_rate": 2.665687363695768e-07, + "loss": 0.3001, + "num_input_tokens_seen": 58247664, + "step": 38140 + }, + { + "epoch": 117.73261205564143, + "grad_norm": 0.5906869173049927, + "learning_rate": 2.6514079935509584e-07, + "loss": 0.2939, + "num_input_tokens_seen": 58255920, + "step": 38145 + }, + { + "epoch": 117.74806800618238, + "grad_norm": 0.3770712912082672, + "learning_rate": 2.6371667676673983e-07, + "loss": 0.2446, + "num_input_tokens_seen": 58264144, + "step": 38150 + }, + { + "epoch": 117.76352395672333, + "grad_norm": 0.3175910413265228, + "learning_rate": 2.6229636882412755e-07, + "loss": 0.2297, + "num_input_tokens_seen": 58272208, + "step": 38155 + }, + { + "epoch": 117.7789799072643, + "grad_norm": 0.6673468947410583, + "learning_rate": 2.6087987574628935e-07, + "loss": 0.2422, + "num_input_tokens_seen": 58279440, + "step": 38160 + }, + { + "epoch": 117.79443585780525, + "grad_norm": 0.5878450870513916, + "learning_rate": 2.5946719775166437e-07, + "loss": 0.3039, + "num_input_tokens_seen": 58287280, + "step": 38165 + }, + { + "epoch": 117.80989180834621, + "grad_norm": 0.4100155234336853, + "learning_rate": 2.5805833505810616e-07, + "loss": 0.2078, + "num_input_tokens_seen": 58295600, + "step": 38170 + }, + { + "epoch": 117.82534775888718, + "grad_norm": 0.6423658728599548, + "learning_rate": 2.566532878828798e-07, + "loss": 0.2639, + "num_input_tokens_seen": 58303120, + "step": 38175 + }, + { + "epoch": 117.84080370942813, + "grad_norm": 0.49566197395324707, + "learning_rate": 2.552520564426619e-07, + "loss": 0.2158, + "num_input_tokens_seen": 58310928, + "step": 38180 + }, + { + "epoch": 117.85625965996908, + "grad_norm": 0.7890515327453613, + "learning_rate": 2.5385464095353803e-07, + "loss": 0.2561, + "num_input_tokens_seen": 58318768, + "step": 38185 + }, + { + "epoch": 117.87171561051005, + "grad_norm": 0.8872687816619873, + "learning_rate": 2.5246104163100804e-07, + "loss": 0.2404, + "num_input_tokens_seen": 58326704, + "step": 38190 + }, + { + "epoch": 117.887171561051, + "grad_norm": 0.4834049940109253, + "learning_rate": 2.510712586899833e-07, + "loss": 0.2505, + "num_input_tokens_seen": 58334672, + "step": 38195 + }, + { + "epoch": 117.90262751159196, + "grad_norm": 0.4775446653366089, + "learning_rate": 2.4968529234478124e-07, + "loss": 0.2325, + "num_input_tokens_seen": 58342576, + "step": 38200 + }, + { + "epoch": 117.90262751159196, + "eval_loss": 0.3019532263278961, + "eval_runtime": 6.2803, + "eval_samples_per_second": 91.556, + "eval_steps_per_second": 22.929, + "num_input_tokens_seen": 58342576, + "step": 38200 + }, + { + "epoch": 117.91808346213293, + "grad_norm": 0.508428692817688, + "learning_rate": 2.483031428091448e-07, + "loss": 0.2015, + "num_input_tokens_seen": 58350640, + "step": 38205 + }, + { + "epoch": 117.93353941267388, + "grad_norm": 0.45902198553085327, + "learning_rate": 2.469248102962091e-07, + "loss": 0.3007, + "num_input_tokens_seen": 58358480, + "step": 38210 + }, + { + "epoch": 117.94899536321483, + "grad_norm": 0.3629830777645111, + "learning_rate": 2.4555029501853455e-07, + "loss": 0.2236, + "num_input_tokens_seen": 58365680, + "step": 38215 + }, + { + "epoch": 117.9644513137558, + "grad_norm": 0.4636707007884979, + "learning_rate": 2.441795971880906e-07, + "loss": 0.2505, + "num_input_tokens_seen": 58373040, + "step": 38220 + }, + { + "epoch": 117.97990726429676, + "grad_norm": 0.4098789095878601, + "learning_rate": 2.4281271701625255e-07, + "loss": 0.2672, + "num_input_tokens_seen": 58380528, + "step": 38225 + }, + { + "epoch": 117.99536321483771, + "grad_norm": 0.6836889386177063, + "learning_rate": 2.4144965471381007e-07, + "loss": 0.2384, + "num_input_tokens_seen": 58387920, + "step": 38230 + }, + { + "epoch": 118.00927357032458, + "grad_norm": 0.9210195541381836, + "learning_rate": 2.400904104909674e-07, + "loss": 0.2226, + "num_input_tokens_seen": 58394624, + "step": 38235 + }, + { + "epoch": 118.02472952086553, + "grad_norm": 0.7807341814041138, + "learning_rate": 2.3873498455733725e-07, + "loss": 0.2346, + "num_input_tokens_seen": 58402944, + "step": 38240 + }, + { + "epoch": 118.04018547140649, + "grad_norm": 1.0088741779327393, + "learning_rate": 2.3738337712194137e-07, + "loss": 0.2645, + "num_input_tokens_seen": 58410720, + "step": 38245 + }, + { + "epoch": 118.05564142194746, + "grad_norm": 0.47566473484039307, + "learning_rate": 2.3603558839321305e-07, + "loss": 0.2537, + "num_input_tokens_seen": 58418112, + "step": 38250 + }, + { + "epoch": 118.07109737248841, + "grad_norm": 0.7879226207733154, + "learning_rate": 2.3469161857900267e-07, + "loss": 0.2086, + "num_input_tokens_seen": 58425440, + "step": 38255 + }, + { + "epoch": 118.08655332302936, + "grad_norm": 0.549639880657196, + "learning_rate": 2.3335146788656393e-07, + "loss": 0.2238, + "num_input_tokens_seen": 58433152, + "step": 38260 + }, + { + "epoch": 118.10200927357033, + "grad_norm": 0.5068522095680237, + "learning_rate": 2.3201513652256757e-07, + "loss": 0.2318, + "num_input_tokens_seen": 58440384, + "step": 38265 + }, + { + "epoch": 118.11746522411129, + "grad_norm": 0.7809277772903442, + "learning_rate": 2.3068262469308766e-07, + "loss": 0.2866, + "num_input_tokens_seen": 58448224, + "step": 38270 + }, + { + "epoch": 118.13292117465224, + "grad_norm": 0.599633514881134, + "learning_rate": 2.2935393260362093e-07, + "loss": 0.3608, + "num_input_tokens_seen": 58455680, + "step": 38275 + }, + { + "epoch": 118.14837712519319, + "grad_norm": 0.4238741397857666, + "learning_rate": 2.2802906045906458e-07, + "loss": 0.2902, + "num_input_tokens_seen": 58463232, + "step": 38280 + }, + { + "epoch": 118.16383307573416, + "grad_norm": 0.4176347255706787, + "learning_rate": 2.2670800846373018e-07, + "loss": 0.2292, + "num_input_tokens_seen": 58470720, + "step": 38285 + }, + { + "epoch": 118.17928902627511, + "grad_norm": 0.3899247944355011, + "learning_rate": 2.2539077682134367e-07, + "loss": 0.2338, + "num_input_tokens_seen": 58478400, + "step": 38290 + }, + { + "epoch": 118.19474497681607, + "grad_norm": 0.3111315369606018, + "learning_rate": 2.2407736573503423e-07, + "loss": 0.2018, + "num_input_tokens_seen": 58486016, + "step": 38295 + }, + { + "epoch": 118.21020092735704, + "grad_norm": 0.7323338985443115, + "learning_rate": 2.2276777540735093e-07, + "loss": 0.2487, + "num_input_tokens_seen": 58494048, + "step": 38300 + }, + { + "epoch": 118.22565687789799, + "grad_norm": 0.36561906337738037, + "learning_rate": 2.2146200604024613e-07, + "loss": 0.2058, + "num_input_tokens_seen": 58501440, + "step": 38305 + }, + { + "epoch": 118.24111282843894, + "grad_norm": 0.5254160165786743, + "learning_rate": 2.2016005783508375e-07, + "loss": 0.2431, + "num_input_tokens_seen": 58509152, + "step": 38310 + }, + { + "epoch": 118.25656877897991, + "grad_norm": 0.45033004879951477, + "learning_rate": 2.1886193099264763e-07, + "loss": 0.2515, + "num_input_tokens_seen": 58517248, + "step": 38315 + }, + { + "epoch": 118.27202472952087, + "grad_norm": 0.4724467098712921, + "learning_rate": 2.175676257131165e-07, + "loss": 0.2599, + "num_input_tokens_seen": 58524800, + "step": 38320 + }, + { + "epoch": 118.28748068006182, + "grad_norm": 0.5698208808898926, + "learning_rate": 2.162771421960974e-07, + "loss": 0.2273, + "num_input_tokens_seen": 58532512, + "step": 38325 + }, + { + "epoch": 118.30293663060279, + "grad_norm": 0.49961286783218384, + "learning_rate": 2.1499048064059224e-07, + "loss": 0.2748, + "num_input_tokens_seen": 58539456, + "step": 38330 + }, + { + "epoch": 118.31839258114374, + "grad_norm": 0.7898120284080505, + "learning_rate": 2.1370764124502285e-07, + "loss": 0.2563, + "num_input_tokens_seen": 58547360, + "step": 38335 + }, + { + "epoch": 118.3338485316847, + "grad_norm": 0.36110585927963257, + "learning_rate": 2.1242862420721988e-07, + "loss": 0.2031, + "num_input_tokens_seen": 58555008, + "step": 38340 + }, + { + "epoch": 118.34930448222566, + "grad_norm": 0.6462382674217224, + "learning_rate": 2.1115342972442276e-07, + "loss": 0.2519, + "num_input_tokens_seen": 58563008, + "step": 38345 + }, + { + "epoch": 118.36476043276662, + "grad_norm": 0.3982223868370056, + "learning_rate": 2.0988205799328252e-07, + "loss": 0.188, + "num_input_tokens_seen": 58570496, + "step": 38350 + }, + { + "epoch": 118.38021638330757, + "grad_norm": 0.5555592775344849, + "learning_rate": 2.0861450920986182e-07, + "loss": 0.2595, + "num_input_tokens_seen": 58577920, + "step": 38355 + }, + { + "epoch": 118.39567233384854, + "grad_norm": 0.5904123783111572, + "learning_rate": 2.07350783569632e-07, + "loss": 0.2112, + "num_input_tokens_seen": 58585280, + "step": 38360 + }, + { + "epoch": 118.41112828438949, + "grad_norm": 0.5477237701416016, + "learning_rate": 2.060908812674761e-07, + "loss": 0.2049, + "num_input_tokens_seen": 58593024, + "step": 38365 + }, + { + "epoch": 118.42658423493044, + "grad_norm": 0.46568986773490906, + "learning_rate": 2.0483480249768317e-07, + "loss": 0.2224, + "num_input_tokens_seen": 58601056, + "step": 38370 + }, + { + "epoch": 118.44204018547141, + "grad_norm": 0.6033481955528259, + "learning_rate": 2.035825474539621e-07, + "loss": 0.2623, + "num_input_tokens_seen": 58608800, + "step": 38375 + }, + { + "epoch": 118.45749613601237, + "grad_norm": 0.6521425843238831, + "learning_rate": 2.0233411632942235e-07, + "loss": 0.2528, + "num_input_tokens_seen": 58617088, + "step": 38380 + }, + { + "epoch": 118.47295208655332, + "grad_norm": 0.45839494466781616, + "learning_rate": 2.0108950931658764e-07, + "loss": 0.2528, + "num_input_tokens_seen": 58624608, + "step": 38385 + }, + { + "epoch": 118.48840803709429, + "grad_norm": 0.38001471757888794, + "learning_rate": 1.998487266073934e-07, + "loss": 0.2482, + "num_input_tokens_seen": 58632704, + "step": 38390 + }, + { + "epoch": 118.50386398763524, + "grad_norm": 0.7630200982093811, + "learning_rate": 1.986117683931865e-07, + "loss": 0.2509, + "num_input_tokens_seen": 58640416, + "step": 38395 + }, + { + "epoch": 118.5193199381762, + "grad_norm": 0.6040888428688049, + "learning_rate": 1.9737863486471442e-07, + "loss": 0.2558, + "num_input_tokens_seen": 58648384, + "step": 38400 + }, + { + "epoch": 118.5193199381762, + "eval_loss": 0.3030909299850464, + "eval_runtime": 6.325, + "eval_samples_per_second": 90.91, + "eval_steps_per_second": 22.767, + "num_input_tokens_seen": 58648384, + "step": 38400 + }, + { + "epoch": 118.53477588871715, + "grad_norm": 0.4039148986339569, + "learning_rate": 1.9614932621215e-07, + "loss": 0.2671, + "num_input_tokens_seen": 58656064, + "step": 38405 + }, + { + "epoch": 118.55023183925812, + "grad_norm": 0.6841861605644226, + "learning_rate": 1.9492384262506102e-07, + "loss": 0.2119, + "num_input_tokens_seen": 58663168, + "step": 38410 + }, + { + "epoch": 118.56568778979907, + "grad_norm": 0.49796977639198303, + "learning_rate": 1.9370218429243524e-07, + "loss": 0.254, + "num_input_tokens_seen": 58670816, + "step": 38415 + }, + { + "epoch": 118.58114374034002, + "grad_norm": 0.3181965947151184, + "learning_rate": 1.9248435140267197e-07, + "loss": 0.2292, + "num_input_tokens_seen": 58678304, + "step": 38420 + }, + { + "epoch": 118.59659969088099, + "grad_norm": 0.3939974308013916, + "learning_rate": 1.9127034414356814e-07, + "loss": 0.2018, + "num_input_tokens_seen": 58685984, + "step": 38425 + }, + { + "epoch": 118.61205564142195, + "grad_norm": 0.523317813873291, + "learning_rate": 1.9006016270234627e-07, + "loss": 0.3367, + "num_input_tokens_seen": 58693728, + "step": 38430 + }, + { + "epoch": 118.6275115919629, + "grad_norm": 0.5032210946083069, + "learning_rate": 1.888538072656293e-07, + "loss": 0.2299, + "num_input_tokens_seen": 58701568, + "step": 38435 + }, + { + "epoch": 118.64296754250387, + "grad_norm": 0.3813948929309845, + "learning_rate": 1.8765127801944893e-07, + "loss": 0.1922, + "num_input_tokens_seen": 58709152, + "step": 38440 + }, + { + "epoch": 118.65842349304482, + "grad_norm": 0.4683697819709778, + "learning_rate": 1.8645257514925406e-07, + "loss": 0.2326, + "num_input_tokens_seen": 58716576, + "step": 38445 + }, + { + "epoch": 118.67387944358578, + "grad_norm": 0.5189456343650818, + "learning_rate": 1.8525769883989685e-07, + "loss": 0.2543, + "num_input_tokens_seen": 58724000, + "step": 38450 + }, + { + "epoch": 118.68933539412674, + "grad_norm": 0.34231674671173096, + "learning_rate": 1.8406664927564654e-07, + "loss": 0.2107, + "num_input_tokens_seen": 58732096, + "step": 38455 + }, + { + "epoch": 118.7047913446677, + "grad_norm": 0.5345114469528198, + "learning_rate": 1.8287942664017566e-07, + "loss": 0.2266, + "num_input_tokens_seen": 58738912, + "step": 38460 + }, + { + "epoch": 118.72024729520865, + "grad_norm": 0.44480186700820923, + "learning_rate": 1.8169603111656552e-07, + "loss": 0.2297, + "num_input_tokens_seen": 58746912, + "step": 38465 + }, + { + "epoch": 118.73570324574962, + "grad_norm": 0.37692198157310486, + "learning_rate": 1.805164628873146e-07, + "loss": 0.2354, + "num_input_tokens_seen": 58754464, + "step": 38470 + }, + { + "epoch": 118.75115919629057, + "grad_norm": 0.7549195885658264, + "learning_rate": 1.793407221343274e-07, + "loss": 0.2258, + "num_input_tokens_seen": 58762240, + "step": 38475 + }, + { + "epoch": 118.76661514683153, + "grad_norm": 0.4589606821537018, + "learning_rate": 1.781688090389172e-07, + "loss": 0.271, + "num_input_tokens_seen": 58769504, + "step": 38480 + }, + { + "epoch": 118.7820710973725, + "grad_norm": 0.6565883755683899, + "learning_rate": 1.770007237818061e-07, + "loss": 0.2922, + "num_input_tokens_seen": 58776640, + "step": 38485 + }, + { + "epoch": 118.79752704791345, + "grad_norm": 0.712588369846344, + "learning_rate": 1.7583646654313059e-07, + "loss": 0.3079, + "num_input_tokens_seen": 58784384, + "step": 38490 + }, + { + "epoch": 118.8129829984544, + "grad_norm": 0.4208821952342987, + "learning_rate": 1.7467603750242757e-07, + "loss": 0.2123, + "num_input_tokens_seen": 58791680, + "step": 38495 + }, + { + "epoch": 118.82843894899537, + "grad_norm": 0.2838591933250427, + "learning_rate": 1.7351943683865944e-07, + "loss": 0.2464, + "num_input_tokens_seen": 58799712, + "step": 38500 + }, + { + "epoch": 118.84389489953632, + "grad_norm": 0.5699580311775208, + "learning_rate": 1.723666647301808e-07, + "loss": 0.2311, + "num_input_tokens_seen": 58807424, + "step": 38505 + }, + { + "epoch": 118.85935085007728, + "grad_norm": 0.4826103448867798, + "learning_rate": 1.712177213547661e-07, + "loss": 0.2811, + "num_input_tokens_seen": 58815264, + "step": 38510 + }, + { + "epoch": 118.87480680061825, + "grad_norm": 0.791705310344696, + "learning_rate": 1.7007260688959581e-07, + "loss": 0.227, + "num_input_tokens_seen": 58823264, + "step": 38515 + }, + { + "epoch": 118.8902627511592, + "grad_norm": 0.4401208460330963, + "learning_rate": 1.68931321511262e-07, + "loss": 0.2793, + "num_input_tokens_seen": 58830816, + "step": 38520 + }, + { + "epoch": 118.90571870170015, + "grad_norm": 0.46369385719299316, + "learning_rate": 1.6779386539576835e-07, + "loss": 0.2019, + "num_input_tokens_seen": 58838272, + "step": 38525 + }, + { + "epoch": 118.9211746522411, + "grad_norm": 0.3803224265575409, + "learning_rate": 1.666602387185162e-07, + "loss": 0.299, + "num_input_tokens_seen": 58846048, + "step": 38530 + }, + { + "epoch": 118.93663060278207, + "grad_norm": 0.58189457654953, + "learning_rate": 1.655304416543352e-07, + "loss": 0.3166, + "num_input_tokens_seen": 58853824, + "step": 38535 + }, + { + "epoch": 118.95208655332303, + "grad_norm": 0.5372113585472107, + "learning_rate": 1.6440447437744698e-07, + "loss": 0.2287, + "num_input_tokens_seen": 58861760, + "step": 38540 + }, + { + "epoch": 118.96754250386398, + "grad_norm": 0.5981135964393616, + "learning_rate": 1.6328233706149332e-07, + "loss": 0.2995, + "num_input_tokens_seen": 58869216, + "step": 38545 + }, + { + "epoch": 118.98299845440495, + "grad_norm": 0.41629576683044434, + "learning_rate": 1.6216402987951906e-07, + "loss": 0.2226, + "num_input_tokens_seen": 58876736, + "step": 38550 + }, + { + "epoch": 118.9984544049459, + "grad_norm": 0.5328736901283264, + "learning_rate": 1.6104955300398627e-07, + "loss": 0.2157, + "num_input_tokens_seen": 58884832, + "step": 38555 + }, + { + "epoch": 119.01236476043276, + "grad_norm": 0.29872480034828186, + "learning_rate": 1.5993890660675748e-07, + "loss": 0.231, + "num_input_tokens_seen": 58891648, + "step": 38560 + }, + { + "epoch": 119.02782071097373, + "grad_norm": 1.4431251287460327, + "learning_rate": 1.5883209085910678e-07, + "loss": 0.2308, + "num_input_tokens_seen": 58899232, + "step": 38565 + }, + { + "epoch": 119.04327666151468, + "grad_norm": 0.5688505172729492, + "learning_rate": 1.5772910593172264e-07, + "loss": 0.3293, + "num_input_tokens_seen": 58906560, + "step": 38570 + }, + { + "epoch": 119.05873261205564, + "grad_norm": 0.6645981669425964, + "learning_rate": 1.5662995199469954e-07, + "loss": 0.2211, + "num_input_tokens_seen": 58914304, + "step": 38575 + }, + { + "epoch": 119.0741885625966, + "grad_norm": 0.7580187320709229, + "learning_rate": 1.5553462921753802e-07, + "loss": 0.2185, + "num_input_tokens_seen": 58922304, + "step": 38580 + }, + { + "epoch": 119.08964451313756, + "grad_norm": 0.37320151925086975, + "learning_rate": 1.544431377691502e-07, + "loss": 0.2395, + "num_input_tokens_seen": 58929952, + "step": 38585 + }, + { + "epoch": 119.10510046367851, + "grad_norm": 0.45189040899276733, + "learning_rate": 1.5335547781785975e-07, + "loss": 0.2613, + "num_input_tokens_seen": 58937728, + "step": 38590 + }, + { + "epoch": 119.12055641421948, + "grad_norm": 0.7442104816436768, + "learning_rate": 1.5227164953139917e-07, + "loss": 0.251, + "num_input_tokens_seen": 58945632, + "step": 38595 + }, + { + "epoch": 119.13601236476043, + "grad_norm": 0.40053677558898926, + "learning_rate": 1.511916530769042e-07, + "loss": 0.2495, + "num_input_tokens_seen": 58953568, + "step": 38600 + }, + { + "epoch": 119.13601236476043, + "eval_loss": 0.3022671341896057, + "eval_runtime": 6.2743, + "eval_samples_per_second": 91.643, + "eval_steps_per_second": 22.951, + "num_input_tokens_seen": 58953568, + "step": 38600 + }, + { + "epoch": 119.15146831530139, + "grad_norm": 0.5268297791481018, + "learning_rate": 1.5011548862092773e-07, + "loss": 0.3053, + "num_input_tokens_seen": 58961600, + "step": 38605 + }, + { + "epoch": 119.16692426584235, + "grad_norm": 0.49516263604164124, + "learning_rate": 1.490431563294231e-07, + "loss": 0.2067, + "num_input_tokens_seen": 58969792, + "step": 38610 + }, + { + "epoch": 119.18238021638331, + "grad_norm": 0.4138004183769226, + "learning_rate": 1.4797465636776365e-07, + "loss": 0.2559, + "num_input_tokens_seen": 58976896, + "step": 38615 + }, + { + "epoch": 119.19783616692426, + "grad_norm": 0.4995238780975342, + "learning_rate": 1.4690998890072027e-07, + "loss": 0.1951, + "num_input_tokens_seen": 58984224, + "step": 38620 + }, + { + "epoch": 119.21329211746523, + "grad_norm": 0.4809305667877197, + "learning_rate": 1.4584915409248112e-07, + "loss": 0.2546, + "num_input_tokens_seen": 58991776, + "step": 38625 + }, + { + "epoch": 119.22874806800618, + "grad_norm": 0.48985302448272705, + "learning_rate": 1.4479215210663754e-07, + "loss": 0.287, + "num_input_tokens_seen": 58999424, + "step": 38630 + }, + { + "epoch": 119.24420401854714, + "grad_norm": 0.5989328026771545, + "learning_rate": 1.4373898310619528e-07, + "loss": 0.2988, + "num_input_tokens_seen": 59007008, + "step": 38635 + }, + { + "epoch": 119.2596599690881, + "grad_norm": 0.3873289227485657, + "learning_rate": 1.4268964725356604e-07, + "loss": 0.2572, + "num_input_tokens_seen": 59015200, + "step": 38640 + }, + { + "epoch": 119.27511591962906, + "grad_norm": 0.3657183349132538, + "learning_rate": 1.4164414471056764e-07, + "loss": 0.2213, + "num_input_tokens_seen": 59022432, + "step": 38645 + }, + { + "epoch": 119.29057187017001, + "grad_norm": 0.43302375078201294, + "learning_rate": 1.4060247563843497e-07, + "loss": 0.2162, + "num_input_tokens_seen": 59030016, + "step": 38650 + }, + { + "epoch": 119.30602782071098, + "grad_norm": 0.35661497712135315, + "learning_rate": 1.3956464019780068e-07, + "loss": 0.2482, + "num_input_tokens_seen": 59037568, + "step": 38655 + }, + { + "epoch": 119.32148377125193, + "grad_norm": 0.4901511073112488, + "learning_rate": 1.385306385487145e-07, + "loss": 0.2607, + "num_input_tokens_seen": 59044672, + "step": 38660 + }, + { + "epoch": 119.33693972179289, + "grad_norm": 0.7587239146232605, + "learning_rate": 1.3750047085063222e-07, + "loss": 0.2654, + "num_input_tokens_seen": 59051936, + "step": 38665 + }, + { + "epoch": 119.35239567233384, + "grad_norm": 0.6189292073249817, + "learning_rate": 1.3647413726242119e-07, + "loss": 0.2603, + "num_input_tokens_seen": 59059520, + "step": 38670 + }, + { + "epoch": 119.36785162287481, + "grad_norm": 0.6599475145339966, + "learning_rate": 1.3545163794235205e-07, + "loss": 0.2433, + "num_input_tokens_seen": 59066656, + "step": 38675 + }, + { + "epoch": 119.38330757341576, + "grad_norm": 0.37837013602256775, + "learning_rate": 1.3443297304810698e-07, + "loss": 0.3139, + "num_input_tokens_seen": 59073792, + "step": 38680 + }, + { + "epoch": 119.39876352395672, + "grad_norm": 0.3968062698841095, + "learning_rate": 1.3341814273677977e-07, + "loss": 0.309, + "num_input_tokens_seen": 59081376, + "step": 38685 + }, + { + "epoch": 119.41421947449768, + "grad_norm": 0.4954511225223541, + "learning_rate": 1.324071471648647e-07, + "loss": 0.2975, + "num_input_tokens_seen": 59089408, + "step": 38690 + }, + { + "epoch": 119.42967542503864, + "grad_norm": 0.5682471990585327, + "learning_rate": 1.3139998648827312e-07, + "loss": 0.2732, + "num_input_tokens_seen": 59097056, + "step": 38695 + }, + { + "epoch": 119.44513137557959, + "grad_norm": 0.6110686659812927, + "learning_rate": 1.3039666086232526e-07, + "loss": 0.2255, + "num_input_tokens_seen": 59103872, + "step": 38700 + }, + { + "epoch": 119.46058732612056, + "grad_norm": 0.6260729432106018, + "learning_rate": 1.2939717044174183e-07, + "loss": 0.2314, + "num_input_tokens_seen": 59111808, + "step": 38705 + }, + { + "epoch": 119.47604327666151, + "grad_norm": 0.4462418854236603, + "learning_rate": 1.284015153806578e-07, + "loss": 0.2483, + "num_input_tokens_seen": 59119104, + "step": 38710 + }, + { + "epoch": 119.49149922720247, + "grad_norm": 0.697205126285553, + "learning_rate": 1.274096958326171e-07, + "loss": 0.29, + "num_input_tokens_seen": 59126560, + "step": 38715 + }, + { + "epoch": 119.50695517774344, + "grad_norm": 0.5696557760238647, + "learning_rate": 1.2642171195056952e-07, + "loss": 0.2137, + "num_input_tokens_seen": 59134432, + "step": 38720 + }, + { + "epoch": 119.52241112828439, + "grad_norm": 0.5172268748283386, + "learning_rate": 1.2543756388687377e-07, + "loss": 0.3092, + "num_input_tokens_seen": 59142080, + "step": 38725 + }, + { + "epoch": 119.53786707882534, + "grad_norm": 0.42668256163597107, + "learning_rate": 1.2445725179330014e-07, + "loss": 0.2404, + "num_input_tokens_seen": 59149792, + "step": 38730 + }, + { + "epoch": 119.55332302936631, + "grad_norm": 0.4272518754005432, + "learning_rate": 1.2348077582102212e-07, + "loss": 0.2382, + "num_input_tokens_seen": 59157344, + "step": 38735 + }, + { + "epoch": 119.56877897990726, + "grad_norm": 0.469579815864563, + "learning_rate": 1.2250813612062762e-07, + "loss": 0.2017, + "num_input_tokens_seen": 59165120, + "step": 38740 + }, + { + "epoch": 119.58423493044822, + "grad_norm": 0.4414757490158081, + "learning_rate": 1.215393328421105e-07, + "loss": 0.1977, + "num_input_tokens_seen": 59172672, + "step": 38745 + }, + { + "epoch": 119.59969088098919, + "grad_norm": 0.7758395075798035, + "learning_rate": 1.2057436613486796e-07, + "loss": 0.2104, + "num_input_tokens_seen": 59180224, + "step": 38750 + }, + { + "epoch": 119.61514683153014, + "grad_norm": 0.43821871280670166, + "learning_rate": 1.1961323614771424e-07, + "loss": 0.259, + "num_input_tokens_seen": 59187616, + "step": 38755 + }, + { + "epoch": 119.6306027820711, + "grad_norm": 0.6547970771789551, + "learning_rate": 1.1865594302886418e-07, + "loss": 0.2477, + "num_input_tokens_seen": 59195552, + "step": 38760 + }, + { + "epoch": 119.64605873261206, + "grad_norm": 0.3957693874835968, + "learning_rate": 1.1770248692594687e-07, + "loss": 0.2006, + "num_input_tokens_seen": 59203008, + "step": 38765 + }, + { + "epoch": 119.66151468315302, + "grad_norm": 0.3307890295982361, + "learning_rate": 1.167528679859975e-07, + "loss": 0.2371, + "num_input_tokens_seen": 59210304, + "step": 38770 + }, + { + "epoch": 119.67697063369397, + "grad_norm": 0.5358704924583435, + "learning_rate": 1.1580708635545446e-07, + "loss": 0.2499, + "num_input_tokens_seen": 59218240, + "step": 38775 + }, + { + "epoch": 119.69242658423494, + "grad_norm": 0.6995629072189331, + "learning_rate": 1.1486514218017885e-07, + "loss": 0.219, + "num_input_tokens_seen": 59225952, + "step": 38780 + }, + { + "epoch": 119.70788253477589, + "grad_norm": 0.4355279505252838, + "learning_rate": 1.1392703560542117e-07, + "loss": 0.2319, + "num_input_tokens_seen": 59233888, + "step": 38785 + }, + { + "epoch": 119.72333848531684, + "grad_norm": 0.5377179980278015, + "learning_rate": 1.129927667758518e-07, + "loss": 0.2563, + "num_input_tokens_seen": 59241600, + "step": 38790 + }, + { + "epoch": 119.7387944358578, + "grad_norm": 0.63379967212677, + "learning_rate": 1.1206233583554992e-07, + "loss": 0.1969, + "num_input_tokens_seen": 59249312, + "step": 38795 + }, + { + "epoch": 119.75425038639877, + "grad_norm": 0.6614294052124023, + "learning_rate": 1.1113574292799523e-07, + "loss": 0.2401, + "num_input_tokens_seen": 59257088, + "step": 38800 + }, + { + "epoch": 119.75425038639877, + "eval_loss": 0.3019011318683624, + "eval_runtime": 6.3149, + "eval_samples_per_second": 91.054, + "eval_steps_per_second": 22.803, + "num_input_tokens_seen": 59257088, + "step": 38800 + }, + { + "epoch": 119.76970633693972, + "grad_norm": 0.7357609272003174, + "learning_rate": 1.1021298819608449e-07, + "loss": 0.2852, + "num_input_tokens_seen": 59264736, + "step": 38805 + }, + { + "epoch": 119.78516228748067, + "grad_norm": 0.6540694832801819, + "learning_rate": 1.0929407178211226e-07, + "loss": 0.23, + "num_input_tokens_seen": 59272288, + "step": 38810 + }, + { + "epoch": 119.80061823802164, + "grad_norm": 0.3469410836696625, + "learning_rate": 1.0837899382779293e-07, + "loss": 0.2099, + "num_input_tokens_seen": 59280128, + "step": 38815 + }, + { + "epoch": 119.8160741885626, + "grad_norm": 0.8448670506477356, + "learning_rate": 1.0746775447423862e-07, + "loss": 0.2291, + "num_input_tokens_seen": 59287904, + "step": 38820 + }, + { + "epoch": 119.83153013910355, + "grad_norm": 0.6362238526344299, + "learning_rate": 1.0656035386197583e-07, + "loss": 0.2838, + "num_input_tokens_seen": 59295648, + "step": 38825 + }, + { + "epoch": 119.84698608964452, + "grad_norm": 0.5682768821716309, + "learning_rate": 1.0565679213093982e-07, + "loss": 0.2637, + "num_input_tokens_seen": 59302784, + "step": 38830 + }, + { + "epoch": 119.86244204018547, + "grad_norm": 0.48914405703544617, + "learning_rate": 1.0475706942046638e-07, + "loss": 0.2266, + "num_input_tokens_seen": 59310304, + "step": 38835 + }, + { + "epoch": 119.87789799072642, + "grad_norm": 0.3551863729953766, + "learning_rate": 1.0386118586930282e-07, + "loss": 0.2104, + "num_input_tokens_seen": 59317792, + "step": 38840 + }, + { + "epoch": 119.89335394126739, + "grad_norm": 0.34093770384788513, + "learning_rate": 1.0296914161561367e-07, + "loss": 0.2269, + "num_input_tokens_seen": 59325760, + "step": 38845 + }, + { + "epoch": 119.90880989180835, + "grad_norm": 0.7462531924247742, + "learning_rate": 1.0208093679695552e-07, + "loss": 0.196, + "num_input_tokens_seen": 59333568, + "step": 38850 + }, + { + "epoch": 119.9242658423493, + "grad_norm": 0.5594061017036438, + "learning_rate": 1.0119657155030493e-07, + "loss": 0.2578, + "num_input_tokens_seen": 59341216, + "step": 38855 + }, + { + "epoch": 119.93972179289027, + "grad_norm": 0.42490822076797485, + "learning_rate": 1.003160460120417e-07, + "loss": 0.2603, + "num_input_tokens_seen": 59348896, + "step": 38860 + }, + { + "epoch": 119.95517774343122, + "grad_norm": 0.45945796370506287, + "learning_rate": 9.943936031795165e-08, + "loss": 0.2658, + "num_input_tokens_seen": 59356928, + "step": 38865 + }, + { + "epoch": 119.97063369397218, + "grad_norm": 0.40142032504081726, + "learning_rate": 9.856651460323219e-08, + "loss": 0.2163, + "num_input_tokens_seen": 59365120, + "step": 38870 + }, + { + "epoch": 119.98608964451314, + "grad_norm": 0.5160882472991943, + "learning_rate": 9.769750900248953e-08, + "loss": 0.3008, + "num_input_tokens_seen": 59373088, + "step": 38875 + }, + { + "epoch": 120.0, + "grad_norm": 0.9190237522125244, + "learning_rate": 9.683234364973038e-08, + "loss": 0.2387, + "num_input_tokens_seen": 59379872, + "step": 38880 + }, + { + "epoch": 120.01545595054095, + "grad_norm": 0.4506450295448303, + "learning_rate": 9.597101867837854e-08, + "loss": 0.2518, + "num_input_tokens_seen": 59387232, + "step": 38885 + }, + { + "epoch": 120.03091190108192, + "grad_norm": 0.4866096079349518, + "learning_rate": 9.511353422125835e-08, + "loss": 0.2567, + "num_input_tokens_seen": 59394432, + "step": 38890 + }, + { + "epoch": 120.04636785162288, + "grad_norm": 0.45974212884902954, + "learning_rate": 9.42598904106029e-08, + "loss": 0.2222, + "num_input_tokens_seen": 59402112, + "step": 38895 + }, + { + "epoch": 120.06182380216383, + "grad_norm": 0.37367501854896545, + "learning_rate": 9.341008737806245e-08, + "loss": 0.2129, + "num_input_tokens_seen": 59409536, + "step": 38900 + }, + { + "epoch": 120.0772797527048, + "grad_norm": 0.49267300963401794, + "learning_rate": 9.256412525467661e-08, + "loss": 0.2983, + "num_input_tokens_seen": 59417696, + "step": 38905 + }, + { + "epoch": 120.09273570324575, + "grad_norm": 0.7519434690475464, + "learning_rate": 9.172200417091326e-08, + "loss": 0.2281, + "num_input_tokens_seen": 59424928, + "step": 38910 + }, + { + "epoch": 120.1081916537867, + "grad_norm": 0.3127264976501465, + "learning_rate": 9.088372425663239e-08, + "loss": 0.2858, + "num_input_tokens_seen": 59433056, + "step": 38915 + }, + { + "epoch": 120.12364760432767, + "grad_norm": 0.34740662574768066, + "learning_rate": 9.004928564110837e-08, + "loss": 0.2542, + "num_input_tokens_seen": 59440896, + "step": 38920 + }, + { + "epoch": 120.13910355486863, + "grad_norm": 0.40764984488487244, + "learning_rate": 8.92186884530244e-08, + "loss": 0.1971, + "num_input_tokens_seen": 59448160, + "step": 38925 + }, + { + "epoch": 120.15455950540958, + "grad_norm": 0.5765553712844849, + "learning_rate": 8.83919328204641e-08, + "loss": 0.3178, + "num_input_tokens_seen": 59455936, + "step": 38930 + }, + { + "epoch": 120.17001545595055, + "grad_norm": 0.4773620367050171, + "learning_rate": 8.756901887093105e-08, + "loss": 0.2245, + "num_input_tokens_seen": 59463680, + "step": 38935 + }, + { + "epoch": 120.1854714064915, + "grad_norm": 0.4841006100177765, + "learning_rate": 8.674994673132098e-08, + "loss": 0.2092, + "num_input_tokens_seen": 59471360, + "step": 38940 + }, + { + "epoch": 120.20092735703246, + "grad_norm": 0.6352056264877319, + "learning_rate": 8.593471652794949e-08, + "loss": 0.2557, + "num_input_tokens_seen": 59478144, + "step": 38945 + }, + { + "epoch": 120.21638330757341, + "grad_norm": 0.3009229004383087, + "learning_rate": 8.512332838653548e-08, + "loss": 0.2555, + "num_input_tokens_seen": 59485344, + "step": 38950 + }, + { + "epoch": 120.23183925811438, + "grad_norm": 0.5446906685829163, + "learning_rate": 8.431578243220106e-08, + "loss": 0.2091, + "num_input_tokens_seen": 59492928, + "step": 38955 + }, + { + "epoch": 120.24729520865533, + "grad_norm": 0.3876579999923706, + "learning_rate": 8.351207878948552e-08, + "loss": 0.2362, + "num_input_tokens_seen": 59499968, + "step": 38960 + }, + { + "epoch": 120.26275115919628, + "grad_norm": 0.8221309781074524, + "learning_rate": 8.271221758232583e-08, + "loss": 0.2133, + "num_input_tokens_seen": 59507392, + "step": 38965 + }, + { + "epoch": 120.27820710973725, + "grad_norm": 0.39087995886802673, + "learning_rate": 8.191619893407332e-08, + "loss": 0.2006, + "num_input_tokens_seen": 59515296, + "step": 38970 + }, + { + "epoch": 120.2936630602782, + "grad_norm": 0.6624817848205566, + "learning_rate": 8.112402296748534e-08, + "loss": 0.245, + "num_input_tokens_seen": 59523456, + "step": 38975 + }, + { + "epoch": 120.30911901081916, + "grad_norm": 0.47909966111183167, + "learning_rate": 8.033568980471973e-08, + "loss": 0.3165, + "num_input_tokens_seen": 59531520, + "step": 38980 + }, + { + "epoch": 120.32457496136013, + "grad_norm": 0.5168655514717102, + "learning_rate": 7.955119956735146e-08, + "loss": 0.2174, + "num_input_tokens_seen": 59538336, + "step": 38985 + }, + { + "epoch": 120.34003091190108, + "grad_norm": 0.6671820282936096, + "learning_rate": 7.877055237636155e-08, + "loss": 0.2278, + "num_input_tokens_seen": 59546656, + "step": 38990 + }, + { + "epoch": 120.35548686244204, + "grad_norm": 0.472329705953598, + "learning_rate": 7.79937483521287e-08, + "loss": 0.2467, + "num_input_tokens_seen": 59554240, + "step": 38995 + }, + { + "epoch": 120.370942812983, + "grad_norm": 0.447256475687027, + "learning_rate": 7.722078761444873e-08, + "loss": 0.2171, + "num_input_tokens_seen": 59562208, + "step": 39000 + }, + { + "epoch": 120.370942812983, + "eval_loss": 0.3018070161342621, + "eval_runtime": 6.2971, + "eval_samples_per_second": 91.311, + "eval_steps_per_second": 22.868, + "num_input_tokens_seen": 59562208, + "step": 39000 + }, + { + "epoch": 120.38639876352396, + "grad_norm": 0.4492761194705963, + "learning_rate": 7.645167028252631e-08, + "loss": 0.1968, + "num_input_tokens_seen": 59569248, + "step": 39005 + }, + { + "epoch": 120.40185471406491, + "grad_norm": 0.9352847337722778, + "learning_rate": 7.568639647496379e-08, + "loss": 0.2722, + "num_input_tokens_seen": 59577088, + "step": 39010 + }, + { + "epoch": 120.41731066460588, + "grad_norm": 0.502873957157135, + "learning_rate": 7.492496630977508e-08, + "loss": 0.2429, + "num_input_tokens_seen": 59584416, + "step": 39015 + }, + { + "epoch": 120.43276661514683, + "grad_norm": 0.49107739329338074, + "learning_rate": 7.416737990438571e-08, + "loss": 0.2305, + "num_input_tokens_seen": 59592000, + "step": 39020 + }, + { + "epoch": 120.44822256568779, + "grad_norm": 0.6035904288291931, + "learning_rate": 7.341363737562445e-08, + "loss": 0.2274, + "num_input_tokens_seen": 59599104, + "step": 39025 + }, + { + "epoch": 120.46367851622875, + "grad_norm": 0.42905285954475403, + "learning_rate": 7.266373883972887e-08, + "loss": 0.226, + "num_input_tokens_seen": 59606400, + "step": 39030 + }, + { + "epoch": 120.47913446676971, + "grad_norm": 0.859806478023529, + "learning_rate": 7.191768441233981e-08, + "loss": 0.2459, + "num_input_tokens_seen": 59613856, + "step": 39035 + }, + { + "epoch": 120.49459041731066, + "grad_norm": 0.4493069052696228, + "learning_rate": 7.11754742085069e-08, + "loss": 0.2879, + "num_input_tokens_seen": 59621696, + "step": 39040 + }, + { + "epoch": 120.51004636785163, + "grad_norm": 0.5819570422172546, + "learning_rate": 7.043710834269413e-08, + "loss": 0.2808, + "num_input_tokens_seen": 59629760, + "step": 39045 + }, + { + "epoch": 120.52550231839258, + "grad_norm": 0.5581464767456055, + "learning_rate": 6.970258692876319e-08, + "loss": 0.2279, + "num_input_tokens_seen": 59637600, + "step": 39050 + }, + { + "epoch": 120.54095826893354, + "grad_norm": 0.5747694373130798, + "learning_rate": 6.897191007998738e-08, + "loss": 0.2752, + "num_input_tokens_seen": 59645408, + "step": 39055 + }, + { + "epoch": 120.5564142194745, + "grad_norm": 0.6208328604698181, + "learning_rate": 6.824507790904599e-08, + "loss": 0.2101, + "num_input_tokens_seen": 59653248, + "step": 39060 + }, + { + "epoch": 120.57187017001546, + "grad_norm": 0.7937318682670593, + "learning_rate": 6.752209052802439e-08, + "loss": 0.209, + "num_input_tokens_seen": 59660544, + "step": 39065 + }, + { + "epoch": 120.58732612055641, + "grad_norm": 0.5920931696891785, + "learning_rate": 6.680294804841946e-08, + "loss": 0.2195, + "num_input_tokens_seen": 59668896, + "step": 39070 + }, + { + "epoch": 120.60278207109737, + "grad_norm": 0.46642133593559265, + "learning_rate": 6.608765058112865e-08, + "loss": 0.2615, + "num_input_tokens_seen": 59676768, + "step": 39075 + }, + { + "epoch": 120.61823802163833, + "grad_norm": 0.6128515601158142, + "learning_rate": 6.537619823646368e-08, + "loss": 0.2273, + "num_input_tokens_seen": 59684544, + "step": 39080 + }, + { + "epoch": 120.63369397217929, + "grad_norm": 0.5858321785926819, + "learning_rate": 6.466859112413404e-08, + "loss": 0.28, + "num_input_tokens_seen": 59691840, + "step": 39085 + }, + { + "epoch": 120.64914992272024, + "grad_norm": 0.3633618950843811, + "learning_rate": 6.39648293532663e-08, + "loss": 0.2326, + "num_input_tokens_seen": 59699712, + "step": 39090 + }, + { + "epoch": 120.66460587326121, + "grad_norm": 0.4511694312095642, + "learning_rate": 6.32649130323848e-08, + "loss": 0.2403, + "num_input_tokens_seen": 59707136, + "step": 39095 + }, + { + "epoch": 120.68006182380216, + "grad_norm": 0.4801919162273407, + "learning_rate": 6.256884226943094e-08, + "loss": 0.2449, + "num_input_tokens_seen": 59714400, + "step": 39100 + }, + { + "epoch": 120.69551777434312, + "grad_norm": 0.3442040681838989, + "learning_rate": 6.187661717174386e-08, + "loss": 0.2297, + "num_input_tokens_seen": 59722496, + "step": 39105 + }, + { + "epoch": 120.71097372488408, + "grad_norm": 0.4575553834438324, + "learning_rate": 6.118823784607708e-08, + "loss": 0.2356, + "num_input_tokens_seen": 59730400, + "step": 39110 + }, + { + "epoch": 120.72642967542504, + "grad_norm": 0.6560827493667603, + "learning_rate": 6.050370439858178e-08, + "loss": 0.2687, + "num_input_tokens_seen": 59737600, + "step": 39115 + }, + { + "epoch": 120.74188562596599, + "grad_norm": 0.387778103351593, + "learning_rate": 5.98230169348235e-08, + "loss": 0.3081, + "num_input_tokens_seen": 59745280, + "step": 39120 + }, + { + "epoch": 120.75734157650696, + "grad_norm": 0.5270826816558838, + "learning_rate": 5.914617555977664e-08, + "loss": 0.3447, + "num_input_tokens_seen": 59752672, + "step": 39125 + }, + { + "epoch": 120.77279752704791, + "grad_norm": 0.3610554337501526, + "learning_rate": 5.8473180377816017e-08, + "loss": 0.1799, + "num_input_tokens_seen": 59760224, + "step": 39130 + }, + { + "epoch": 120.78825347758887, + "grad_norm": 0.3340883255004883, + "learning_rate": 5.780403149272251e-08, + "loss": 0.2301, + "num_input_tokens_seen": 59767520, + "step": 39135 + }, + { + "epoch": 120.80370942812984, + "grad_norm": 0.47615137696266174, + "learning_rate": 5.7138729007694126e-08, + "loss": 0.2787, + "num_input_tokens_seen": 59775264, + "step": 39140 + }, + { + "epoch": 120.81916537867079, + "grad_norm": 0.6754286289215088, + "learning_rate": 5.64772730253238e-08, + "loss": 0.2234, + "num_input_tokens_seen": 59782400, + "step": 39145 + }, + { + "epoch": 120.83462132921174, + "grad_norm": 0.5188019275665283, + "learning_rate": 5.5819663647618814e-08, + "loss": 0.2342, + "num_input_tokens_seen": 59789536, + "step": 39150 + }, + { + "epoch": 120.85007727975271, + "grad_norm": 0.3625258207321167, + "learning_rate": 5.5165900975989723e-08, + "loss": 0.2376, + "num_input_tokens_seen": 59797952, + "step": 39155 + }, + { + "epoch": 120.86553323029366, + "grad_norm": 0.8671061992645264, + "learning_rate": 5.451598511125311e-08, + "loss": 0.2394, + "num_input_tokens_seen": 59806208, + "step": 39160 + }, + { + "epoch": 120.88098918083462, + "grad_norm": 0.3347380459308624, + "learning_rate": 5.3869916153637124e-08, + "loss": 0.2454, + "num_input_tokens_seen": 59813568, + "step": 39165 + }, + { + "epoch": 120.89644513137559, + "grad_norm": 0.37471556663513184, + "learning_rate": 5.322769420277318e-08, + "loss": 0.2387, + "num_input_tokens_seen": 59821024, + "step": 39170 + }, + { + "epoch": 120.91190108191654, + "grad_norm": 0.31863003969192505, + "learning_rate": 5.258931935769873e-08, + "loss": 0.2767, + "num_input_tokens_seen": 59828736, + "step": 39175 + }, + { + "epoch": 120.9273570324575, + "grad_norm": 0.47755181789398193, + "learning_rate": 5.19547917168628e-08, + "loss": 0.2438, + "num_input_tokens_seen": 59836416, + "step": 39180 + }, + { + "epoch": 120.94281298299846, + "grad_norm": 0.3870830535888672, + "learning_rate": 5.13241113781121e-08, + "loss": 0.2652, + "num_input_tokens_seen": 59843840, + "step": 39185 + }, + { + "epoch": 120.95826893353942, + "grad_norm": 0.6458680629730225, + "learning_rate": 5.0697278438707755e-08, + "loss": 0.3016, + "num_input_tokens_seen": 59852352, + "step": 39190 + }, + { + "epoch": 120.97372488408037, + "grad_norm": 0.44219014048576355, + "learning_rate": 5.0074292995316854e-08, + "loss": 0.2566, + "num_input_tokens_seen": 59860064, + "step": 39195 + }, + { + "epoch": 120.98918083462132, + "grad_norm": 0.4939335882663727, + "learning_rate": 4.945515514400978e-08, + "loss": 0.2523, + "num_input_tokens_seen": 59867712, + "step": 39200 + }, + { + "epoch": 120.98918083462132, + "eval_loss": 0.3015228509902954, + "eval_runtime": 6.3003, + "eval_samples_per_second": 91.265, + "eval_steps_per_second": 22.856, + "num_input_tokens_seen": 59867712, + "step": 39200 + }, + { + "epoch": 121.0030911901082, + "grad_norm": 0.5378869771957397, + "learning_rate": 4.883986498026571e-08, + "loss": 0.2046, + "num_input_tokens_seen": 59874128, + "step": 39205 + }, + { + "epoch": 121.01854714064915, + "grad_norm": 0.42773908376693726, + "learning_rate": 4.822842259896987e-08, + "loss": 0.2473, + "num_input_tokens_seen": 59881552, + "step": 39210 + }, + { + "epoch": 121.03400309119012, + "grad_norm": 0.5065765380859375, + "learning_rate": 4.762082809441626e-08, + "loss": 0.2287, + "num_input_tokens_seen": 59888496, + "step": 39215 + }, + { + "epoch": 121.04945904173107, + "grad_norm": 0.6128968596458435, + "learning_rate": 4.7017081560302156e-08, + "loss": 0.2387, + "num_input_tokens_seen": 59896592, + "step": 39220 + }, + { + "epoch": 121.06491499227202, + "grad_norm": 0.4159579575061798, + "learning_rate": 4.6417183089730866e-08, + "loss": 0.2245, + "num_input_tokens_seen": 59904304, + "step": 39225 + }, + { + "epoch": 121.08037094281298, + "grad_norm": 0.5264980792999268, + "learning_rate": 4.5821132775217265e-08, + "loss": 0.2412, + "num_input_tokens_seen": 59912112, + "step": 39230 + }, + { + "epoch": 121.09582689335394, + "grad_norm": 0.35955268144607544, + "learning_rate": 4.5228930708679504e-08, + "loss": 0.26, + "num_input_tokens_seen": 59919888, + "step": 39235 + }, + { + "epoch": 121.1112828438949, + "grad_norm": 0.49580663442611694, + "learning_rate": 4.464057698144175e-08, + "loss": 0.3003, + "num_input_tokens_seen": 59927088, + "step": 39240 + }, + { + "epoch": 121.12673879443585, + "grad_norm": 0.43964919447898865, + "learning_rate": 4.4056071684236974e-08, + "loss": 0.2552, + "num_input_tokens_seen": 59935216, + "step": 39245 + }, + { + "epoch": 121.14219474497682, + "grad_norm": 0.38925060629844666, + "learning_rate": 4.347541490719864e-08, + "loss": 0.2923, + "num_input_tokens_seen": 59942832, + "step": 39250 + }, + { + "epoch": 121.15765069551777, + "grad_norm": 0.4813286066055298, + "learning_rate": 4.2898606739877336e-08, + "loss": 0.2802, + "num_input_tokens_seen": 59950352, + "step": 39255 + }, + { + "epoch": 121.17310664605873, + "grad_norm": 0.3562460243701935, + "learning_rate": 4.232564727122135e-08, + "loss": 0.2365, + "num_input_tokens_seen": 59957520, + "step": 39260 + }, + { + "epoch": 121.1885625965997, + "grad_norm": 0.5455992817878723, + "learning_rate": 4.1756536589585004e-08, + "loss": 0.2811, + "num_input_tokens_seen": 59964624, + "step": 39265 + }, + { + "epoch": 121.20401854714065, + "grad_norm": 0.7314848899841309, + "learning_rate": 4.119127478273976e-08, + "loss": 0.2604, + "num_input_tokens_seen": 59972304, + "step": 39270 + }, + { + "epoch": 121.2194744976816, + "grad_norm": 0.4001946747303009, + "learning_rate": 4.062986193784923e-08, + "loss": 0.2887, + "num_input_tokens_seen": 59980208, + "step": 39275 + }, + { + "epoch": 121.23493044822257, + "grad_norm": 0.4083077609539032, + "learning_rate": 4.007229814149416e-08, + "loss": 0.2225, + "num_input_tokens_seen": 59988144, + "step": 39280 + }, + { + "epoch": 121.25038639876352, + "grad_norm": 0.4722798764705658, + "learning_rate": 3.951858347965576e-08, + "loss": 0.2528, + "num_input_tokens_seen": 59995824, + "step": 39285 + }, + { + "epoch": 121.26584234930448, + "grad_norm": 0.3645178973674774, + "learning_rate": 3.896871803772684e-08, + "loss": 0.2543, + "num_input_tokens_seen": 60003504, + "step": 39290 + }, + { + "epoch": 121.28129829984545, + "grad_norm": 0.35406044125556946, + "learning_rate": 3.842270190050068e-08, + "loss": 0.3051, + "num_input_tokens_seen": 60011760, + "step": 39295 + }, + { + "epoch": 121.2967542503864, + "grad_norm": 0.31883808970451355, + "learning_rate": 3.7880535152179376e-08, + "loss": 0.2407, + "num_input_tokens_seen": 60019408, + "step": 39300 + }, + { + "epoch": 121.31221020092735, + "grad_norm": 0.47962358593940735, + "learning_rate": 3.734221787637382e-08, + "loss": 0.2257, + "num_input_tokens_seen": 60027376, + "step": 39305 + }, + { + "epoch": 121.32766615146832, + "grad_norm": 0.5843390226364136, + "learning_rate": 3.680775015609817e-08, + "loss": 0.2309, + "num_input_tokens_seen": 60035280, + "step": 39310 + }, + { + "epoch": 121.34312210200927, + "grad_norm": 0.6161026954650879, + "learning_rate": 3.627713207377537e-08, + "loss": 0.3208, + "num_input_tokens_seen": 60043248, + "step": 39315 + }, + { + "epoch": 121.35857805255023, + "grad_norm": 0.2814505398273468, + "learning_rate": 3.575036371123164e-08, + "loss": 0.2516, + "num_input_tokens_seen": 60050704, + "step": 39320 + }, + { + "epoch": 121.3740340030912, + "grad_norm": 0.41428810358047485, + "learning_rate": 3.5227445149704776e-08, + "loss": 0.2778, + "num_input_tokens_seen": 60058160, + "step": 39325 + }, + { + "epoch": 121.38948995363215, + "grad_norm": 0.43714427947998047, + "learning_rate": 3.470837646983027e-08, + "loss": 0.1884, + "num_input_tokens_seen": 60066128, + "step": 39330 + }, + { + "epoch": 121.4049459041731, + "grad_norm": 0.48224976658821106, + "learning_rate": 3.419315775165799e-08, + "loss": 0.2071, + "num_input_tokens_seen": 60073648, + "step": 39335 + }, + { + "epoch": 121.42040185471407, + "grad_norm": 0.47056153416633606, + "learning_rate": 3.368178907464103e-08, + "loss": 0.2536, + "num_input_tokens_seen": 60081296, + "step": 39340 + }, + { + "epoch": 121.43585780525503, + "grad_norm": 0.6946339011192322, + "learning_rate": 3.317427051763855e-08, + "loss": 0.2676, + "num_input_tokens_seen": 60089392, + "step": 39345 + }, + { + "epoch": 121.45131375579598, + "grad_norm": 0.7390259504318237, + "learning_rate": 3.267060215891571e-08, + "loss": 0.2145, + "num_input_tokens_seen": 60096912, + "step": 39350 + }, + { + "epoch": 121.46676970633693, + "grad_norm": 0.39511457085609436, + "learning_rate": 3.217078407614649e-08, + "loss": 0.2613, + "num_input_tokens_seen": 60104144, + "step": 39355 + }, + { + "epoch": 121.4822256568779, + "grad_norm": 0.4216708540916443, + "learning_rate": 3.1674816346405345e-08, + "loss": 0.2036, + "num_input_tokens_seen": 60111728, + "step": 39360 + }, + { + "epoch": 121.49768160741885, + "grad_norm": 0.3827250003814697, + "learning_rate": 3.11826990461811e-08, + "loss": 0.2159, + "num_input_tokens_seen": 60119408, + "step": 39365 + }, + { + "epoch": 121.51313755795981, + "grad_norm": 0.5740914940834045, + "learning_rate": 3.069443225136304e-08, + "loss": 0.2278, + "num_input_tokens_seen": 60126736, + "step": 39370 + }, + { + "epoch": 121.52859350850078, + "grad_norm": 0.4722764194011688, + "learning_rate": 3.021001603724372e-08, + "loss": 0.2334, + "num_input_tokens_seen": 60134416, + "step": 39375 + }, + { + "epoch": 121.54404945904173, + "grad_norm": 0.46134528517723083, + "learning_rate": 2.9729450478532818e-08, + "loss": 0.2259, + "num_input_tokens_seen": 60142320, + "step": 39380 + }, + { + "epoch": 121.55950540958268, + "grad_norm": 0.6129893064498901, + "learning_rate": 2.9252735649337726e-08, + "loss": 0.2675, + "num_input_tokens_seen": 60149904, + "step": 39385 + }, + { + "epoch": 121.57496136012365, + "grad_norm": 0.7673532962799072, + "learning_rate": 2.8779871623171863e-08, + "loss": 0.2298, + "num_input_tokens_seen": 60157136, + "step": 39390 + }, + { + "epoch": 121.5904173106646, + "grad_norm": 0.8218713998794556, + "learning_rate": 2.8310858472957448e-08, + "loss": 0.245, + "num_input_tokens_seen": 60165104, + "step": 39395 + }, + { + "epoch": 121.60587326120556, + "grad_norm": 0.3353762626647949, + "learning_rate": 2.784569627101996e-08, + "loss": 0.3036, + "num_input_tokens_seen": 60173616, + "step": 39400 + }, + { + "epoch": 121.60587326120556, + "eval_loss": 0.301624596118927, + "eval_runtime": 6.318, + "eval_samples_per_second": 91.01, + "eval_steps_per_second": 22.792, + "num_input_tokens_seen": 60173616, + "step": 39400 + }, + { + "epoch": 121.62132921174653, + "grad_norm": 0.9196118116378784, + "learning_rate": 2.738438508909924e-08, + "loss": 0.2162, + "num_input_tokens_seen": 60181200, + "step": 39405 + }, + { + "epoch": 121.63678516228748, + "grad_norm": 0.40611186623573303, + "learning_rate": 2.692692499833005e-08, + "loss": 0.2118, + "num_input_tokens_seen": 60188784, + "step": 39410 + }, + { + "epoch": 121.65224111282843, + "grad_norm": 0.45942261815071106, + "learning_rate": 2.647331606926151e-08, + "loss": 0.2599, + "num_input_tokens_seen": 60196048, + "step": 39415 + }, + { + "epoch": 121.6676970633694, + "grad_norm": 0.34708255529403687, + "learning_rate": 2.6023558371843225e-08, + "loss": 0.2147, + "num_input_tokens_seen": 60203504, + "step": 39420 + }, + { + "epoch": 121.68315301391036, + "grad_norm": 0.4902358055114746, + "learning_rate": 2.557765197543638e-08, + "loss": 0.2086, + "num_input_tokens_seen": 60210928, + "step": 39425 + }, + { + "epoch": 121.69860896445131, + "grad_norm": 0.6832306981086731, + "learning_rate": 2.513559694880263e-08, + "loss": 0.2329, + "num_input_tokens_seen": 60218448, + "step": 39430 + }, + { + "epoch": 121.71406491499228, + "grad_norm": 0.6398513317108154, + "learning_rate": 2.469739336011523e-08, + "loss": 0.2537, + "num_input_tokens_seen": 60226416, + "step": 39435 + }, + { + "epoch": 121.72952086553323, + "grad_norm": 0.8191109895706177, + "learning_rate": 2.4263041276947894e-08, + "loss": 0.2455, + "num_input_tokens_seen": 60234032, + "step": 39440 + }, + { + "epoch": 121.74497681607419, + "grad_norm": 0.39539408683776855, + "learning_rate": 2.3832540766283164e-08, + "loss": 0.2441, + "num_input_tokens_seen": 60242000, + "step": 39445 + }, + { + "epoch": 121.76043276661515, + "grad_norm": 0.6037710905075073, + "learning_rate": 2.3405891894512366e-08, + "loss": 0.2359, + "num_input_tokens_seen": 60249616, + "step": 39450 + }, + { + "epoch": 121.7758887171561, + "grad_norm": 0.8315793871879578, + "learning_rate": 2.29830947274301e-08, + "loss": 0.2879, + "num_input_tokens_seen": 60257904, + "step": 39455 + }, + { + "epoch": 121.79134466769706, + "grad_norm": 0.4680388867855072, + "learning_rate": 2.2564149330231432e-08, + "loss": 0.2468, + "num_input_tokens_seen": 60265008, + "step": 39460 + }, + { + "epoch": 121.80680061823801, + "grad_norm": 0.4141889810562134, + "learning_rate": 2.2149055767528572e-08, + "loss": 0.2337, + "num_input_tokens_seen": 60272976, + "step": 39465 + }, + { + "epoch": 121.82225656877898, + "grad_norm": 0.5013156533241272, + "learning_rate": 2.1737814103334197e-08, + "loss": 0.2106, + "num_input_tokens_seen": 60280208, + "step": 39470 + }, + { + "epoch": 121.83771251931994, + "grad_norm": 0.3163033127784729, + "learning_rate": 2.1330424401064253e-08, + "loss": 0.235, + "num_input_tokens_seen": 60287888, + "step": 39475 + }, + { + "epoch": 121.85316846986089, + "grad_norm": 0.37733137607574463, + "learning_rate": 2.092688672354348e-08, + "loss": 0.2539, + "num_input_tokens_seen": 60295568, + "step": 39480 + }, + { + "epoch": 121.86862442040186, + "grad_norm": 0.7330491542816162, + "learning_rate": 2.0527201133005435e-08, + "loss": 0.205, + "num_input_tokens_seen": 60303664, + "step": 39485 + }, + { + "epoch": 121.88408037094281, + "grad_norm": 0.503341019153595, + "learning_rate": 2.0131367691084148e-08, + "loss": 0.2192, + "num_input_tokens_seen": 60311184, + "step": 39490 + }, + { + "epoch": 121.89953632148377, + "grad_norm": 0.7132977843284607, + "learning_rate": 1.9739386458819675e-08, + "loss": 0.2269, + "num_input_tokens_seen": 60318928, + "step": 39495 + }, + { + "epoch": 121.91499227202473, + "grad_norm": 0.6654589176177979, + "learning_rate": 1.9351257496666442e-08, + "loss": 0.2385, + "num_input_tokens_seen": 60326608, + "step": 39500 + }, + { + "epoch": 121.93044822256569, + "grad_norm": 0.5642110109329224, + "learning_rate": 1.896698086447657e-08, + "loss": 0.2446, + "num_input_tokens_seen": 60334064, + "step": 39505 + }, + { + "epoch": 121.94590417310664, + "grad_norm": 0.5770296454429626, + "learning_rate": 1.8586556621505436e-08, + "loss": 0.3216, + "num_input_tokens_seen": 60341872, + "step": 39510 + }, + { + "epoch": 121.96136012364761, + "grad_norm": 0.6700767874717712, + "learning_rate": 1.820998482642833e-08, + "loss": 0.2425, + "num_input_tokens_seen": 60349104, + "step": 39515 + }, + { + "epoch": 121.97681607418856, + "grad_norm": 0.525616466999054, + "learning_rate": 1.7837265537309912e-08, + "loss": 0.2301, + "num_input_tokens_seen": 60356848, + "step": 39520 + }, + { + "epoch": 121.99227202472952, + "grad_norm": 0.4667595624923706, + "learning_rate": 1.7468398811629206e-08, + "loss": 0.217, + "num_input_tokens_seen": 60364208, + "step": 39525 + }, + { + "epoch": 122.00618238021639, + "grad_norm": 0.4104345142841339, + "learning_rate": 1.710338470627404e-08, + "loss": 0.2303, + "num_input_tokens_seen": 60370640, + "step": 39530 + }, + { + "epoch": 122.02163833075734, + "grad_norm": 0.604128897190094, + "learning_rate": 1.6742223277529945e-08, + "loss": 0.2521, + "num_input_tokens_seen": 60378128, + "step": 39535 + }, + { + "epoch": 122.0370942812983, + "grad_norm": 0.4450373649597168, + "learning_rate": 1.6384914581094036e-08, + "loss": 0.2272, + "num_input_tokens_seen": 60385456, + "step": 39540 + }, + { + "epoch": 122.05255023183926, + "grad_norm": 0.5351849794387817, + "learning_rate": 1.6031458672069455e-08, + "loss": 0.3037, + "num_input_tokens_seen": 60392816, + "step": 39545 + }, + { + "epoch": 122.06800618238022, + "grad_norm": 0.6127430200576782, + "learning_rate": 1.5681855604962602e-08, + "loss": 0.2593, + "num_input_tokens_seen": 60400400, + "step": 39550 + }, + { + "epoch": 122.08346213292117, + "grad_norm": 0.6177833676338196, + "learning_rate": 1.5336105433683135e-08, + "loss": 0.2487, + "num_input_tokens_seen": 60407824, + "step": 39555 + }, + { + "epoch": 122.09891808346214, + "grad_norm": 0.6216335892677307, + "learning_rate": 1.499420821155506e-08, + "loss": 0.2482, + "num_input_tokens_seen": 60415376, + "step": 39560 + }, + { + "epoch": 122.11437403400309, + "grad_norm": 0.6301831007003784, + "learning_rate": 1.4656163991302874e-08, + "loss": 0.2338, + "num_input_tokens_seen": 60422608, + "step": 39565 + }, + { + "epoch": 122.12982998454405, + "grad_norm": 0.6043234467506409, + "learning_rate": 1.4321972825051544e-08, + "loss": 0.2086, + "num_input_tokens_seen": 60430768, + "step": 39570 + }, + { + "epoch": 122.14528593508501, + "grad_norm": 0.7777138948440552, + "learning_rate": 1.3991634764345951e-08, + "loss": 0.2608, + "num_input_tokens_seen": 60437936, + "step": 39575 + }, + { + "epoch": 122.16074188562597, + "grad_norm": 0.575378954410553, + "learning_rate": 1.3665149860120352e-08, + "loss": 0.2356, + "num_input_tokens_seen": 60445552, + "step": 39580 + }, + { + "epoch": 122.17619783616692, + "grad_norm": 0.7903811931610107, + "learning_rate": 1.3342518162728912e-08, + "loss": 0.3307, + "num_input_tokens_seen": 60453840, + "step": 39585 + }, + { + "epoch": 122.19165378670789, + "grad_norm": 0.5525996685028076, + "learning_rate": 1.30237397219235e-08, + "loss": 0.2134, + "num_input_tokens_seen": 60461296, + "step": 39590 + }, + { + "epoch": 122.20710973724884, + "grad_norm": 0.4675174355506897, + "learning_rate": 1.2708814586862016e-08, + "loss": 0.2541, + "num_input_tokens_seen": 60468912, + "step": 39595 + }, + { + "epoch": 122.2225656877898, + "grad_norm": 0.6878390312194824, + "learning_rate": 1.2397742806111168e-08, + "loss": 0.2435, + "num_input_tokens_seen": 60476592, + "step": 39600 + }, + { + "epoch": 122.2225656877898, + "eval_loss": 0.30229419469833374, + "eval_runtime": 6.2973, + "eval_samples_per_second": 91.309, + "eval_steps_per_second": 22.867, + "num_input_tokens_seen": 60476592, + "step": 39600 + }, + { + "epoch": 122.23802163833076, + "grad_norm": 0.3920705318450928, + "learning_rate": 1.209052442764369e-08, + "loss": 0.2524, + "num_input_tokens_seen": 60484400, + "step": 39605 + }, + { + "epoch": 122.25347758887172, + "grad_norm": 0.7113028764724731, + "learning_rate": 1.17871594988328e-08, + "loss": 0.2646, + "num_input_tokens_seen": 60492176, + "step": 39610 + }, + { + "epoch": 122.26893353941267, + "grad_norm": 0.6047004461288452, + "learning_rate": 1.1487648066466072e-08, + "loss": 0.2331, + "num_input_tokens_seen": 60500272, + "step": 39615 + }, + { + "epoch": 122.28438948995363, + "grad_norm": 0.4390365183353424, + "learning_rate": 1.1191990176728784e-08, + "loss": 0.2342, + "num_input_tokens_seen": 60507888, + "step": 39620 + }, + { + "epoch": 122.2998454404946, + "grad_norm": 0.4122958779335022, + "learning_rate": 1.0900185875215018e-08, + "loss": 0.2652, + "num_input_tokens_seen": 60515920, + "step": 39625 + }, + { + "epoch": 122.31530139103555, + "grad_norm": 0.3695749342441559, + "learning_rate": 1.0612235206924891e-08, + "loss": 0.2278, + "num_input_tokens_seen": 60523056, + "step": 39630 + }, + { + "epoch": 122.3307573415765, + "grad_norm": 0.3478049337863922, + "learning_rate": 1.0328138216264549e-08, + "loss": 0.2032, + "num_input_tokens_seen": 60530672, + "step": 39635 + }, + { + "epoch": 122.34621329211747, + "grad_norm": 0.4422418773174286, + "learning_rate": 1.004789494704339e-08, + "loss": 0.2224, + "num_input_tokens_seen": 60537936, + "step": 39640 + }, + { + "epoch": 122.36166924265842, + "grad_norm": 0.9128666520118713, + "learning_rate": 9.771505442482397e-09, + "loss": 0.2214, + "num_input_tokens_seen": 60545552, + "step": 39645 + }, + { + "epoch": 122.37712519319938, + "grad_norm": 0.5909255743026733, + "learning_rate": 9.498969745200259e-09, + "loss": 0.2604, + "num_input_tokens_seen": 60553552, + "step": 39650 + }, + { + "epoch": 122.39258114374034, + "grad_norm": 0.6087014675140381, + "learning_rate": 9.230287897230017e-09, + "loss": 0.2725, + "num_input_tokens_seen": 60560944, + "step": 39655 + }, + { + "epoch": 122.4080370942813, + "grad_norm": 0.5950091481208801, + "learning_rate": 8.965459940002419e-09, + "loss": 0.2733, + "num_input_tokens_seen": 60568432, + "step": 39660 + }, + { + "epoch": 122.42349304482225, + "grad_norm": 0.5086413621902466, + "learning_rate": 8.704485914357019e-09, + "loss": 0.3062, + "num_input_tokens_seen": 60576592, + "step": 39665 + }, + { + "epoch": 122.43894899536322, + "grad_norm": 0.7843717336654663, + "learning_rate": 8.447365860539402e-09, + "loss": 0.2359, + "num_input_tokens_seen": 60584208, + "step": 39670 + }, + { + "epoch": 122.45440494590417, + "grad_norm": 0.6429862976074219, + "learning_rate": 8.194099818201184e-09, + "loss": 0.2562, + "num_input_tokens_seen": 60592144, + "step": 39675 + }, + { + "epoch": 122.46986089644513, + "grad_norm": 0.37585681676864624, + "learning_rate": 7.944687826400011e-09, + "loss": 0.2348, + "num_input_tokens_seen": 60600048, + "step": 39680 + }, + { + "epoch": 122.4853168469861, + "grad_norm": 0.6466646790504456, + "learning_rate": 7.699129923599557e-09, + "loss": 0.2535, + "num_input_tokens_seen": 60608016, + "step": 39685 + }, + { + "epoch": 122.50077279752705, + "grad_norm": 0.6359344720840454, + "learning_rate": 7.457426147663982e-09, + "loss": 0.2276, + "num_input_tokens_seen": 60615728, + "step": 39690 + }, + { + "epoch": 122.516228748068, + "grad_norm": 0.8152884244918823, + "learning_rate": 7.219576535871797e-09, + "loss": 0.2781, + "num_input_tokens_seen": 60623120, + "step": 39695 + }, + { + "epoch": 122.53168469860897, + "grad_norm": 0.6411160230636597, + "learning_rate": 6.985581124896445e-09, + "loss": 0.2577, + "num_input_tokens_seen": 60630800, + "step": 39700 + }, + { + "epoch": 122.54714064914992, + "grad_norm": 0.580336332321167, + "learning_rate": 6.755439950828501e-09, + "loss": 0.3062, + "num_input_tokens_seen": 60638800, + "step": 39705 + }, + { + "epoch": 122.56259659969088, + "grad_norm": 0.6146988868713379, + "learning_rate": 6.5291530491562444e-09, + "loss": 0.3027, + "num_input_tokens_seen": 60646352, + "step": 39710 + }, + { + "epoch": 122.57805255023185, + "grad_norm": 0.3322477638721466, + "learning_rate": 6.3067204547739845e-09, + "loss": 0.2477, + "num_input_tokens_seen": 60653712, + "step": 39715 + }, + { + "epoch": 122.5935085007728, + "grad_norm": 0.7709954977035522, + "learning_rate": 6.088142201987612e-09, + "loss": 0.2097, + "num_input_tokens_seen": 60661520, + "step": 39720 + }, + { + "epoch": 122.60896445131375, + "grad_norm": 0.29856929183006287, + "learning_rate": 5.873418324503499e-09, + "loss": 0.2134, + "num_input_tokens_seen": 60669680, + "step": 39725 + }, + { + "epoch": 122.62442040185472, + "grad_norm": 0.402868390083313, + "learning_rate": 5.6625488554340465e-09, + "loss": 0.2353, + "num_input_tokens_seen": 60677744, + "step": 39730 + }, + { + "epoch": 122.63987635239567, + "grad_norm": 0.4155671298503876, + "learning_rate": 5.455533827297688e-09, + "loss": 0.2457, + "num_input_tokens_seen": 60685264, + "step": 39735 + }, + { + "epoch": 122.65533230293663, + "grad_norm": 0.3832892179489136, + "learning_rate": 5.252373272018885e-09, + "loss": 0.2325, + "num_input_tokens_seen": 60692528, + "step": 39740 + }, + { + "epoch": 122.67078825347758, + "grad_norm": 0.3897162675857544, + "learning_rate": 5.053067220925356e-09, + "loss": 0.2308, + "num_input_tokens_seen": 60700048, + "step": 39745 + }, + { + "epoch": 122.68624420401855, + "grad_norm": 0.3422393500804901, + "learning_rate": 4.857615704759177e-09, + "loss": 0.238, + "num_input_tokens_seen": 60707888, + "step": 39750 + }, + { + "epoch": 122.7017001545595, + "grad_norm": 0.3750162124633789, + "learning_rate": 4.666018753654577e-09, + "loss": 0.2518, + "num_input_tokens_seen": 60715120, + "step": 39755 + }, + { + "epoch": 122.71715610510046, + "grad_norm": 0.3376654088497162, + "learning_rate": 4.478276397162917e-09, + "loss": 0.2023, + "num_input_tokens_seen": 60722416, + "step": 39760 + }, + { + "epoch": 122.73261205564143, + "grad_norm": 0.4223079979419708, + "learning_rate": 4.294388664233262e-09, + "loss": 0.2578, + "num_input_tokens_seen": 60730512, + "step": 39765 + }, + { + "epoch": 122.74806800618238, + "grad_norm": 1.1384764909744263, + "learning_rate": 4.114355583223484e-09, + "loss": 0.2401, + "num_input_tokens_seen": 60737488, + "step": 39770 + }, + { + "epoch": 122.76352395672333, + "grad_norm": 0.5637760758399963, + "learning_rate": 3.9381771818974845e-09, + "loss": 0.2318, + "num_input_tokens_seen": 60744784, + "step": 39775 + }, + { + "epoch": 122.7789799072643, + "grad_norm": 0.644205629825592, + "learning_rate": 3.765853487427973e-09, + "loss": 0.2661, + "num_input_tokens_seen": 60752112, + "step": 39780 + }, + { + "epoch": 122.79443585780525, + "grad_norm": 0.5608834624290466, + "learning_rate": 3.5973845263825857e-09, + "loss": 0.2463, + "num_input_tokens_seen": 60759664, + "step": 39785 + }, + { + "epoch": 122.80989180834621, + "grad_norm": 0.466185599565506, + "learning_rate": 3.4327703247488684e-09, + "loss": 0.2421, + "num_input_tokens_seen": 60767344, + "step": 39790 + }, + { + "epoch": 122.82534775888718, + "grad_norm": 0.6080759763717651, + "learning_rate": 3.2720109079037443e-09, + "loss": 0.2329, + "num_input_tokens_seen": 60774960, + "step": 39795 + }, + { + "epoch": 122.84080370942813, + "grad_norm": 0.36011552810668945, + "learning_rate": 3.1151063006468193e-09, + "loss": 0.2313, + "num_input_tokens_seen": 60782960, + "step": 39800 + }, + { + "epoch": 122.84080370942813, + "eval_loss": 0.3018482029438019, + "eval_runtime": 6.2986, + "eval_samples_per_second": 91.29, + "eval_steps_per_second": 22.862, + "num_input_tokens_seen": 60782960, + "step": 39800 + }, + { + "epoch": 122.85625965996908, + "grad_norm": 0.6933723092079163, + "learning_rate": 2.962056527169854e-09, + "loss": 0.2007, + "num_input_tokens_seen": 60790352, + "step": 39805 + }, + { + "epoch": 122.87171561051005, + "grad_norm": 0.36413460969924927, + "learning_rate": 2.8128616110761898e-09, + "loss": 0.2722, + "num_input_tokens_seen": 60797808, + "step": 39810 + }, + { + "epoch": 122.887171561051, + "grad_norm": 0.2877125144004822, + "learning_rate": 2.6675215753724223e-09, + "loss": 0.2925, + "num_input_tokens_seen": 60805808, + "step": 39815 + }, + { + "epoch": 122.90262751159196, + "grad_norm": 0.4598154127597809, + "learning_rate": 2.5260364424739557e-09, + "loss": 0.1946, + "num_input_tokens_seen": 60813552, + "step": 39820 + }, + { + "epoch": 122.91808346213293, + "grad_norm": 0.6296015381813049, + "learning_rate": 2.3884062341994475e-09, + "loss": 0.2406, + "num_input_tokens_seen": 60821200, + "step": 39825 + }, + { + "epoch": 122.93353941267388, + "grad_norm": 0.4627452790737152, + "learning_rate": 2.25463097177081e-09, + "loss": 0.2495, + "num_input_tokens_seen": 60829232, + "step": 39830 + }, + { + "epoch": 122.94899536321483, + "grad_norm": 0.37661653757095337, + "learning_rate": 2.1247106758215397e-09, + "loss": 0.2772, + "num_input_tokens_seen": 60836720, + "step": 39835 + }, + { + "epoch": 122.9644513137558, + "grad_norm": 0.6478320360183716, + "learning_rate": 1.998645366382834e-09, + "loss": 0.3358, + "num_input_tokens_seen": 60844656, + "step": 39840 + }, + { + "epoch": 122.97990726429676, + "grad_norm": 0.44542479515075684, + "learning_rate": 1.876435062897475e-09, + "loss": 0.1866, + "num_input_tokens_seen": 60852336, + "step": 39845 + }, + { + "epoch": 122.99536321483771, + "grad_norm": 0.5722594261169434, + "learning_rate": 1.758079784211497e-09, + "loss": 0.2243, + "num_input_tokens_seen": 60861072, + "step": 39850 + }, + { + "epoch": 123.00927357032458, + "grad_norm": 0.368943989276886, + "learning_rate": 1.6435795485797434e-09, + "loss": 0.229, + "num_input_tokens_seen": 60867632, + "step": 39855 + }, + { + "epoch": 123.02472952086553, + "grad_norm": 0.4931950569152832, + "learning_rate": 1.5329343736547596e-09, + "loss": 0.2225, + "num_input_tokens_seen": 60874832, + "step": 39860 + }, + { + "epoch": 123.04018547140649, + "grad_norm": 0.42966246604919434, + "learning_rate": 1.4261442765006739e-09, + "loss": 0.2594, + "num_input_tokens_seen": 60882512, + "step": 39865 + }, + { + "epoch": 123.05564142194746, + "grad_norm": 0.6536299586296082, + "learning_rate": 1.3232092735876445e-09, + "loss": 0.2863, + "num_input_tokens_seen": 60890000, + "step": 39870 + }, + { + "epoch": 123.07109737248841, + "grad_norm": 0.5967891216278076, + "learning_rate": 1.2241293807918607e-09, + "loss": 0.2733, + "num_input_tokens_seen": 60897680, + "step": 39875 + }, + { + "epoch": 123.08655332302936, + "grad_norm": 0.5632314682006836, + "learning_rate": 1.128904613387216e-09, + "loss": 0.2334, + "num_input_tokens_seen": 60905648, + "step": 39880 + }, + { + "epoch": 123.10200927357033, + "grad_norm": 0.3128339648246765, + "learning_rate": 1.0375349860591853e-09, + "loss": 0.2416, + "num_input_tokens_seen": 60913008, + "step": 39885 + }, + { + "epoch": 123.11746522411129, + "grad_norm": 0.7044672966003418, + "learning_rate": 9.5002051290205e-10, + "loss": 0.2936, + "num_input_tokens_seen": 60920432, + "step": 39890 + }, + { + "epoch": 123.13292117465224, + "grad_norm": 0.3901006281375885, + "learning_rate": 8.663612074077954e-10, + "loss": 0.2006, + "num_input_tokens_seen": 60928144, + "step": 39895 + }, + { + "epoch": 123.14837712519319, + "grad_norm": 0.8211728930473328, + "learning_rate": 7.865570824799884e-10, + "loss": 0.252, + "num_input_tokens_seen": 60936016, + "step": 39900 + }, + { + "epoch": 123.16383307573416, + "grad_norm": 0.4093096852302551, + "learning_rate": 7.106081504254514e-10, + "loss": 0.3027, + "num_input_tokens_seen": 60943632, + "step": 39905 + }, + { + "epoch": 123.17928902627511, + "grad_norm": 0.4531402289867401, + "learning_rate": 6.385144229570372e-10, + "loss": 0.2114, + "num_input_tokens_seen": 60951120, + "step": 39910 + }, + { + "epoch": 123.19474497681607, + "grad_norm": 0.5073720812797546, + "learning_rate": 5.70275911190854e-10, + "loss": 0.2343, + "num_input_tokens_seen": 60958576, + "step": 39915 + }, + { + "epoch": 123.21020092735704, + "grad_norm": 0.4082237482070923, + "learning_rate": 5.058926256490403e-10, + "loss": 0.212, + "num_input_tokens_seen": 60966224, + "step": 39920 + }, + { + "epoch": 123.22565687789799, + "grad_norm": 0.64656001329422, + "learning_rate": 4.4536457626254134e-10, + "loss": 0.2991, + "num_input_tokens_seen": 60973744, + "step": 39925 + }, + { + "epoch": 123.24111282843894, + "grad_norm": 0.7668834328651428, + "learning_rate": 3.88691772365557e-10, + "loss": 0.2839, + "num_input_tokens_seen": 60980944, + "step": 39930 + }, + { + "epoch": 123.25656877897991, + "grad_norm": 0.3583858013153076, + "learning_rate": 3.358742226955425e-10, + "loss": 0.2191, + "num_input_tokens_seen": 60988752, + "step": 39935 + }, + { + "epoch": 123.27202472952087, + "grad_norm": 0.6126299500465393, + "learning_rate": 2.8691193539875925e-10, + "loss": 0.2159, + "num_input_tokens_seen": 60996112, + "step": 39940 + }, + { + "epoch": 123.28748068006182, + "grad_norm": 0.3931877613067627, + "learning_rate": 2.418049180274995e-10, + "loss": 0.2474, + "num_input_tokens_seen": 61003312, + "step": 39945 + }, + { + "epoch": 123.30293663060279, + "grad_norm": 0.6987758874893188, + "learning_rate": 2.005531775373104e-10, + "loss": 0.2608, + "num_input_tokens_seen": 61010736, + "step": 39950 + }, + { + "epoch": 123.31839258114374, + "grad_norm": 0.7503628134727478, + "learning_rate": 1.6315672028699435e-10, + "loss": 0.2687, + "num_input_tokens_seen": 61018256, + "step": 39955 + }, + { + "epoch": 123.3338485316847, + "grad_norm": 0.5623623728752136, + "learning_rate": 1.2961555204693555e-10, + "loss": 0.2167, + "num_input_tokens_seen": 61025648, + "step": 39960 + }, + { + "epoch": 123.34930448222566, + "grad_norm": 0.4729248583316803, + "learning_rate": 9.992967798799768e-11, + "loss": 0.2394, + "num_input_tokens_seen": 61033136, + "step": 39965 + }, + { + "epoch": 123.36476043276662, + "grad_norm": 0.8349291086196899, + "learning_rate": 7.409910268707521e-11, + "loss": 0.2314, + "num_input_tokens_seen": 61041168, + "step": 39970 + }, + { + "epoch": 123.38021638330757, + "grad_norm": 0.3207522928714752, + "learning_rate": 5.212383012986877e-11, + "loss": 0.2089, + "num_input_tokens_seen": 61049584, + "step": 39975 + }, + { + "epoch": 123.39567233384854, + "grad_norm": 0.8182545304298401, + "learning_rate": 3.400386370533415e-11, + "loss": 0.3791, + "num_input_tokens_seen": 61057584, + "step": 39980 + }, + { + "epoch": 123.41112828438949, + "grad_norm": 0.5054162740707397, + "learning_rate": 1.9739206205682258e-11, + "loss": 0.2373, + "num_input_tokens_seen": 61066064, + "step": 39985 + }, + { + "epoch": 123.42658423493044, + "grad_norm": 0.7571989893913269, + "learning_rate": 9.329859829154685e-12, + "loss": 0.1968, + "num_input_tokens_seen": 61073936, + "step": 39990 + }, + { + "epoch": 123.44204018547141, + "grad_norm": 0.525150716304779, + "learning_rate": 2.7758261855748148e-12, + "loss": 0.2698, + "num_input_tokens_seen": 61081840, + "step": 39995 + }, + { + "epoch": 123.45749613601237, + "grad_norm": 0.45516717433929443, + "learning_rate": 7.710628524559838e-14, + "loss": 0.2238, + "num_input_tokens_seen": 61089232, + "step": 40000 + }, + { + "epoch": 123.45749613601237, + "eval_loss": 0.3018151819705963, + "eval_runtime": 6.2914, + "eval_samples_per_second": 91.394, + "eval_steps_per_second": 22.888, + "num_input_tokens_seen": 61089232, + "step": 40000 + }, + { + "epoch": 123.45749613601237, + "num_input_tokens_seen": 61089232, + "step": 40000, + "total_flos": 2.558479435145134e+17, + "train_loss": 0.34382759584337474, + "train_runtime": 26746.5084, + "train_samples_per_second": 23.928, + "train_steps_per_second": 1.496 + } + ], + "logging_steps": 5, + "max_steps": 40000, + "num_input_tokens_seen": 61089232, + "num_train_epochs": 124, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.558479435145134e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}